#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
#define EIGEN_PACKET_MATH_ALTIVEC_H

#include "../../InternalHeaderCheck.h"

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
typedef __vector float Packet4f;
typedef __vector int Packet4i;
typedef __vector unsigned int Packet4ui;
typedef __vector __bool int Packet4bi;
typedef __vector short int Packet8s;
typedef __vector unsigned short int Packet8us;
typedef __vector __bool short Packet8bi;
typedef __vector signed char Packet16c;
typedef __vector unsigned char Packet16uc;
typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}

#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)

#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}

#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}

#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}

#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)

#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))

#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))

#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
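
// Illustrative note (not part of the original header): __UNPACK_TYPE__(Packet) resolves to
// unpacket_traits<Packet>::type, i.e. the scalar element type of a packet, for example
//   __UNPACK_TYPE__(Packet4f)  -> float
//   __UNPACK_TYPE__(Packet8us) -> unsigned short int
// so the load/store helpers below can be written once per packet type.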
static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16);
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);
static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1);
static Packet4f p4f_MZERO = (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);

static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);

static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};

static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};

static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};

static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};

static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};

static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};

static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};

static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
static Packet16uc p16uc_PSET32_WODD =
    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);
static Packet16uc p16uc_PSET32_WEVEN =
    vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);
static Packet16uc p16uc_HALF64_0_16 =
    vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3), 8);
#else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_PSET32_WODD =
    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);
static Packet16uc p16uc_PSET32_WEVEN =
    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);
static Packet16uc p16uc_HALF64_0_16 =
    vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8);
#endif  // _BIG_ENDIAN

static Packet16uc p16uc_PSET64_HI =
    (Packet16uc)vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);
static Packet16uc p16uc_PSET64_LO =
    (Packet16uc)vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);
static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16;
static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16;

static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);

#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
#define EIGEN_PPC_PREFETCH(ADDR) asm(" dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
#endif

#if EIGEN_COMP_LLVM
#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
#else
#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
#endif
struct packet_traits<float> : default_packet_traits {
  typedef Packet4f type;
  typedef Packet4f half;
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
#ifdef EIGEN_VECTORIZE_VSX
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasErfc = EIGEN_FAST_MATH,

struct packet_traits<bfloat16> : default_packet_traits {
  typedef Packet8bf type;
  typedef Packet8bf half;
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
#ifdef EIGEN_VECTORIZE_VSX

struct packet_traits<int> : default_packet_traits {
  typedef Packet4i type;
  typedef Packet4i half;
#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))

struct packet_traits<short int> : default_packet_traits {
  typedef Packet8s type;
  typedef Packet8s half;

struct packet_traits<unsigned short int> : default_packet_traits {
  typedef Packet8us type;
  typedef Packet8us half;

struct packet_traits<signed char> : default_packet_traits {
  typedef Packet16c type;
  typedef Packet16c half;

struct packet_traits<unsigned char> : default_packet_traits {
  typedef Packet16uc type;
  typedef Packet16uc half;

struct unpacket_traits<Packet4f> {
  typedef Packet4f half;
  typedef Packet4i integer_packet;
    masked_load_available = false,
    masked_store_available = false

struct unpacket_traits<Packet4i> {
  typedef Packet4i half;
    masked_load_available = false,
    masked_store_available = false

struct unpacket_traits<Packet8s> {
  typedef short int type;
  typedef Packet8s half;
    masked_load_available = false,
    masked_store_available = false

struct unpacket_traits<Packet8us> {
  typedef unsigned short int type;
  typedef Packet8us half;
    masked_load_available = false,
    masked_store_available = false

struct unpacket_traits<Packet16c> {
  typedef signed char type;
  typedef Packet16c half;
    masked_load_available = false,
    masked_store_available = false

struct unpacket_traits<Packet16uc> {
  typedef unsigned char type;
  typedef Packet16uc half;
    masked_load_available = false,
    masked_store_available = false

struct unpacket_traits<Packet8bf> {
  typedef bfloat16 type;
  typedef Packet8bf half;
    masked_load_available = false,
    masked_store_available = false
template <typename Packet>
EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) {
  EIGEN_UNUSED_VARIABLE(from);
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef EIGEN_VECTORIZE_VSX
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
  return vec_ld(0, from);
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
  return pload_common<Packet4f>(from);
}

template <>
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
  return pload_common<Packet4i>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
  return pload_common<Packet8s>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
  return pload_common<Packet8us>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
  return pload_common<Packet16c>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
  return pload_common<Packet16uc>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
  return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}

template <typename Packet>
EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from) {
  EIGEN_UNUSED_VARIABLE(from);
  EIGEN_DEBUG_ALIGNED_LOAD
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#ifdef EIGEN_VECTORIZE_VSX
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
  return vec_ld(0, from);
#endif
#pragma GCC diagnostic pop
}

template <>
EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
  return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}

template <typename Packet>
EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
                                                const Index offset) {
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_UNUSED_VARIABLE(packet_size);
  EIGEN_DEBUG_ALIGNED_LOAD
  EIGEN_UNUSED_VARIABLE(from);
  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
  if (offset) {
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    load = Packet(vec_sro(Packet16uc(load), shift));
#else
    load = Packet(vec_slo(Packet16uc(load), shift));
#endif
  }
  return load;
#else
  if (n) {
    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
    unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
    unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
    Index n2 = n * size;
    if (16 <= n2) {
      pstoreu(load2, ploadu<Packet16uc>(from2));
    } else {
      memcpy((void*)load2, (void*)from2, n2);
    }
    return pload_ignore<Packet>(load);
  } else {
    return Packet(pset1<Packet16uc>(0));
  }
#endif
}
template <>
EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
  return pload_partial_common<Packet4f>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
  return pload_partial_common<Packet4i>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
  return pload_partial_common<Packet8s>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
                                                       const Index offset) {
  return pload_partial_common<Packet8us>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
  return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
  return pload_partial_common<Packet16c>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n,
                                                         const Index offset) {
  return pload_partial_common<Packet16uc>(from, n, offset);
}
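
// Usage sketch (illustrative only, not part of the original header): load three floats
// into the low lanes of a Packet4f; the remaining lane is not meaningful to the caller.
//   float buf[3] = {1.f, 2.f, 3.f};
//   Packet4f p = pload_partial<Packet4f>(buf, /*n=*/3, /*offset=*/0);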
template <typename Packet>
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
  EIGEN_UNUSED_VARIABLE(to);
  EIGEN_DEBUG_ALIGNED_STORE
#ifdef EIGEN_VECTORIZE_VSX
  vec_xst(from, 0, to);
#else
  vec_st(from, 0, to);
#endif
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
  pstore_common<Packet4f>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
  pstore_common<Packet4i>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
  pstore_common<Packet8s>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
  pstore_common<Packet8us>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
  pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
}

template <>
EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
  pstore_common<Packet16c>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
  pstore_common<Packet16uc>(to, from);
}
template <typename Packet>
EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
                                               const Index offset) {
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_UNUSED_VARIABLE(packet_size);
  EIGEN_UNUSED_VARIABLE(to);
  EIGEN_DEBUG_ALIGNED_STORE
  Packet store = from;
  if (offset) {
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    store = Packet(vec_slo(Packet16uc(store), shift));
#else
    store = Packet(vec_sro(Packet16uc(store), shift));
#endif
  }
  vec_xst_len(store, to, n * size);
#else
  if (n) {
    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
    pstore(store, from);
    unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
    unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
    Index n2 = n * size;
    if (16 <= n2) {
      pstore(to2, ploadu<Packet16uc>(store2));
    } else {
      memcpy((void*)to2, (void*)store2, n2);
    }
  }
#endif
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
  pstore_partial_common<Packet4f>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
  pstore_partial_common<Packet4i>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
                                                   const Index offset) {
  pstore_partial_common<Packet8s>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
                                                            const Index n, const Index offset) {
  pstore_partial_common<Packet8us>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
                                                  const Index offset) {
  pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
                                                     const Index offset) {
  pstore_partial_common<Packet16c>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
                                                       const Index offset) {
  pstore_partial_common<Packet16uc>(to, from, n, offset);
}
template <typename Packet>
EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
  Packet v = {from, from, from, from};
  return v;
}

template <typename Packet>
EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
  Packet v = {from, from, from, from, from, from, from, from};
  return v;
}

template <typename Packet>
EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
  Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
  return v;
}

template <>
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
  return pset1_size4<Packet4f>(from);
}

template <>
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
  return pset1_size4<Packet4i>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
  return pset1_size8<Packet8s>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
  return pset1_size8<Packet8us>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
  return pset1_size16<Packet16c>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
  return pset1_size16<Packet16uc>(from);
}

template <>
EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
}

template <>
EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
}

template <typename Packet>
EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
                                            Packet& a3) {
  a3 = pload<Packet>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}

template <>
EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
}

template <>
EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
                                                            const Index n = unpacket_traits<Packet>::size) {
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
  if (stride == 1) {
    if (n == unpacket_traits<Packet>::size) {
      return ploadu<Packet>(from);
    } else {
      return ploadu_partial<Packet>(from, n);
    }
  } else {
    for (Index i = 0; i < n; i++) {
      a[i] = from[i * stride];
    }
    return pload_ignore<Packet>(a);
  }
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
  return pgather_common<Packet4f>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
  return pgather_common<Packet4i>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
  return pgather_common<Packet8s>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
                                                                                       Index stride) {
  return pgather_common<Packet8us>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
  return pgather_common<Packet8bf>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
  return pgather_common<Packet16c>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
                                                                                    Index stride) {
  return pgather_common<Packet16uc>(from, stride);
}
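
// Usage sketch (illustrative only): gather one column of a row-major 4x4 float matrix m
// by striding over full rows; the stride is expressed in elements, not bytes.
//   Packet4f col0 = pgather<float, Packet4f>(&m[0][0], /*stride=*/4);
//   // col0 holds {m[0][0], m[1][0], m[2][0], m[3][0]}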
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
                                                                                const Index n) {
  return pgather_common<Packet4f>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
                                                                              const Index n) {
  return pgather_common<Packet4i>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from,
                                                                                    Index stride, const Index n) {
  return pgather_common<Packet8s>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
  return pgather_common<Packet8us>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from,
                                                                                     Index stride, const Index n) {
  return pgather_common<Packet8bf>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
                                                                                        Index stride, const Index n) {
  return pgather_common<Packet16c>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
                                                                                            Index stride,
                                                                                            const Index n) {
  return pgather_common<Packet16uc>(from, stride, n);
}

template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
                                                           Index stride,
                                                           const Index n = unpacket_traits<Packet>::size) {
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
  if (stride == 1) {
    if (n == unpacket_traits<Packet>::size) {
      return pstoreu(to, from);
    } else {
      return pstoreu_partial(to, from, n);
    }
  } else {
    pstore<__UNPACK_TYPE__(Packet)>(a, from);
    for (Index i = 0; i < n; i++) {
      to[i * stride] = a[i];
    }
  }
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
  pscatter_common<Packet4f>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
  pscatter_common<Packet4i>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
                                                                         Index stride) {
  pscatter_common<Packet8s>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
                                                                                   const Packet8us& from,
                                                                                   Index stride) {
  pscatter_common<Packet8us>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
                                                                         Index stride) {
  pscatter_common<Packet8bf>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
                                                                            Index stride) {
  pscatter_common<Packet16c>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
                                                                               const Packet16uc& from, Index stride) {
  pscatter_common<Packet16uc>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
                                                                             Index stride, const Index n) {
  pscatter_common<Packet4f>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
                                                                           const Index n) {
  pscatter_common<Packet4i>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
                                                                                 Index stride, const Index n) {
  pscatter_common<Packet8s>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
                                                                                           const Packet8us& from,
                                                                                           Index stride,
                                                                                           const Index n) {
  pscatter_common<Packet8us>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
                                                                                 Index stride, const Index n) {
  pscatter_common<Packet8bf>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
                                                                                    const Packet16c& from,
                                                                                    Index stride, const Index n) {
  pscatter_common<Packet16c>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
                                                                                       const Packet16uc& from,
                                                                                       Index stride, const Index n) {
  pscatter_common<Packet16uc>(to, from, stride, n);
}
template <>
EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
  return pset1<Packet4f>(a) + p4f_COUNTDOWN;
}

template <>
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
  return pset1<Packet4i>(a) + p4i_COUNTDOWN;
}

template <>
EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
  return pset1<Packet8s>(a) + p8s_COUNTDOWN;
}

template <>
EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
  return pset1<Packet8us>(a) + p8us_COUNTDOWN;
}

template <>
EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
  return pset1<Packet16c>(a) + p16c_COUNTDOWN;
}

template <>
EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
  return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
}
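
// Example (illustrative): plset<Packet4f>(10.0f) yields {10.0f, 11.0f, 12.0f, 13.0f},
// i.e. the broadcast value plus the COUNTDOWN ramp declared at the top of this file.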
template <>
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return a - b;
}

template <>
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return a - b;
}

template <>
EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return a - b;
}

template <>
EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return a - b;
}

template <>
EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return a - b;
}

template <>
EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return a - b;
}
EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return vec_xor(a, p4f_MZERO);
#endif
}

EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
#endif
}

EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
#endif
}

EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return p4i_ZERO - a;
#endif
}

EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
  return a;
}

EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_madd(a, b, p4f_MZERO);
}

template <>
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return a * b;
}

template <>
EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return vec_mul(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_mul(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return vec_mul(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return vec_mul(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifndef EIGEN_VECTORIZE_VSX
  Packet4f t, y_0, y_1;

  // Reciprocal estimate refined with one Newton-Raphson iteration.
  y_0 = vec_re(b);
  t = vec_nmsub(y_0, b, p4f_ONE);
  y_1 = vec_madd(y_0, t, y_0);

  return vec_madd(a, y_1, p4f_MZERO);
#else
  return vec_div(a, b);
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
  return vec_div(a, b);
#else
  EIGEN_UNUSED_VARIABLE(a);
  EIGEN_UNUSED_VARIABLE(b);
  eigen_assert(false && "packet integer division are not supported by AltiVec");
  return pset1<Packet4i>(0);
#endif
}
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return vec_madd(a, b, c);
}

EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
  return a * b + c;
}

EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
  return vec_madd(a, b, c);
}

EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
  return vec_madd(a, b, c);
}

#ifdef EIGEN_VECTORIZE_VSX
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return vec_msub(a, b, c);
}

EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return vec_nmsub(a, b, c);
}

EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return vec_nmadd(a, b, c);
}
#endif
template <>
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifdef EIGEN_VECTORIZE_VSX
  Packet4f ret;
  __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
#else
  return vec_min(a, b);
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_min(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return vec_min(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_min(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return vec_min(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return vec_min(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifdef EIGEN_VECTORIZE_VSX
  Packet4f ret;
  __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
#else
  return vec_max(a, b);
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_max(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return vec_max(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_max(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return vec_max(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return vec_max(a, b);
}
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
  return reinterpret_cast<Packet4f>(vec_cmple(a, b));
}

#ifdef EIGEN_VECTORIZE_VSX
EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
  return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
}
#endif

EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
  return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
}

EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
  return vec_nor(c, c);
}

#ifdef EIGEN_VECTORIZE_VSX
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
  return reinterpret_cast<Packet4i>(vec_cmple(a, b));
}
#endif

EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
  return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
}

EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
  return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
}

#ifdef EIGEN_VECTORIZE_VSX
EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
  return reinterpret_cast<Packet8s>(vec_cmple(a, b));
}
#endif

EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
  return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
}

EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
  return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
}

#ifdef EIGEN_VECTORIZE_VSX
EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
  return reinterpret_cast<Packet8us>(vec_cmple(a, b));
}
#endif

EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
  return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
}

EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
  return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
}

#ifdef EIGEN_VECTORIZE_VSX
EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
  return reinterpret_cast<Packet16c>(vec_cmple(a, b));
}
#endif

EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
  return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
}

EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
  return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
}

#ifdef EIGEN_VECTORIZE_VSX
EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
  return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
}
#endif

EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
  return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
}

EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
  return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_and(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_and(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return vec_and(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_and(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  return pand<Packet8us>(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_or(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_or(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return vec_or(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_or(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  return por<Packet8us>(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_xor(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_xor(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_xor(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  return pxor<Packet8us>(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_andc(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_andc(a, b);
}

EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
}
template <>
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
  Packet4f t = vec_add(
      reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
  Packet4f res;
#ifdef EIGEN_VECTORIZE_VSX
  __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
#else
  __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
#endif
  return res;
}

template <>
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
  return vec_ceil(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
  return vec_floor(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
  return vec_trunc(a);
}
#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
  Packet4f res;
  __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
  return res;
}
#endif
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
#if defined(EIGEN_VECTORIZE_VSX)
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
  Packet16uc MSQ = vec_ld(0, (unsigned char*)from);
  Packet16uc LSQ = vec_ld(15, (unsigned char*)from);
  Packet16uc mask = vec_lvsl(0, from);
  return (Packet)vec_perm(MSQ, LSQ, mask);
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  return ploadu_common<Packet4f>(from);
}

template <>
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
  return ploadu_common<Packet4i>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
  return ploadu_common<Packet8s>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
  return ploadu_common<Packet8us>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
  return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
  return ploadu_common<Packet16c>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
  return ploadu_common<Packet16uc>(from);
}
template <typename Packet>
EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
                                                 const Index offset) {
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_UNUSED_VARIABLE(packet_size);
  EIGEN_DEBUG_ALIGNED_LOAD
  EIGEN_DEBUG_UNALIGNED_LOAD
  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
  if (offset) {
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    load = Packet(vec_sro(Packet16uc(load), shift));
#else
    load = Packet(vec_slo(Packet16uc(load), shift));
#endif
  }
  return load;
#else
  if (n) {
    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
    unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
    unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
    Index n2 = n * size;
    if (16 <= n2) {
      pstoreu(load2, ploadu<Packet16uc>(from2));
    } else {
      memcpy((void*)load2, (void*)from2, n2);
    }
    return pload_ignore<Packet>(load);
  } else {
    return Packet(pset1<Packet16uc>(0));
  }
#endif
}

template <>
EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet4f>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet4i>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet8s>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
                                                        const Index offset) {
  return ploadu_partial_common<Packet8us>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet16c>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
                                                          const Index offset) {
  return ploadu_partial_common<Packet16uc>(from, n, offset);
}
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
  Packet p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet>(from);
  else
    p = ploadu<Packet>(from);
  return vec_mergeh(p, p);
}

template <>
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
  return ploaddup_common<Packet4f>(from);
}

template <>
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
  return ploaddup_common<Packet4i>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
  Packet8s p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet8s>(from);
  else
    p = ploadu<Packet8s>(from);
  return vec_mergeh(p, p);
}

template <>
EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
  Packet8us p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet8us>(from);
  else
    p = ploadu<Packet8us>(from);
  return vec_mergeh(p, p);
}

template <>
EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
  Packet8s p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet8s>(from);
  else
    p = ploadu<Packet8s>(from);
  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
}

template <>
EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
  Packet8us p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet8us>(from);
  else
    p = ploadu<Packet8us>(from);
  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
}

template <>
EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
  Packet16c p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet16c>(from);
  else
    p = ploadu<Packet16c>(from);
  return vec_mergeh(p, p);
}

template <>
EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
  Packet16uc p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet16uc>(from);
  else
    p = ploadu<Packet16uc>(from);
  return vec_mergeh(p, p);
}

template <>
EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const signed char* from) {
  Packet16c p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet16c>(from);
  else
    p = ploadu<Packet16c>(from);
  return vec_perm(p, p, p16uc_QUADRUPLICATE16);
}

template <>
EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from) {
  Packet16uc p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet16uc>(from);
  else
    p = ploadu<Packet16uc>(from);
  return vec_perm(p, p, p16uc_QUADRUPLICATE16);
}
template <typename Packet>
EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
#if defined(EIGEN_VECTORIZE_VSX)
  vec_xst(from, 0, to);
#else
  // Build the two aligned quadwords around `to`, merge in the unaligned data, and store both back.
  Packet16uc MSQ, LSQ, edges;
  Packet16uc edgeAlign, align;

  MSQ = vec_ld(0, (unsigned char*)to);
  LSQ = vec_ld(15, (unsigned char*)to);
  edgeAlign = vec_lvsl(0, to);
  edges = vec_perm(LSQ, MSQ, edgeAlign);
  align = vec_lvsr(0, to);
  MSQ = vec_perm(edges, (Packet16uc)from, align);
  LSQ = vec_perm((Packet16uc)from, edges, align);
  vec_st(LSQ, 15, (unsigned char*)to);
  vec_st(MSQ, 0, (unsigned char*)to);
#endif
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
  pstoreu_common<Packet4f>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
  pstoreu_common<Packet4i>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
  pstoreu_common<Packet8s>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
  pstoreu_common<Packet8us>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
  pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
  pstoreu_common<Packet16c>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
  pstoreu_common<Packet16uc>(to, from);
}
template <typename Packet>
EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
                                                const Index offset) {
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_UNUSED_VARIABLE(packet_size);
  EIGEN_DEBUG_UNALIGNED_STORE
  Packet store = from;
  if (offset) {
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    store = Packet(vec_slo(Packet16uc(store), shift));
#else
    store = Packet(vec_sro(Packet16uc(store), shift));
#endif
  }
  vec_xst_len(store, to, n * size);
#else
  if (n) {
    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
    pstore(store, from);
    unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
    unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
    Index n2 = n * size;
    if (16 <= n2) {
      pstoreu(to2, ploadu<Packet16uc>(store2));
    } else {
      memcpy((void*)to2, (void*)store2, n2);
    }
  }
#endif
}

template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
  pstoreu_partial_common<Packet4f>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
  pstoreu_partial_common<Packet4i>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
                                                    const Index offset) {
  pstoreu_partial_common<Packet8s>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
                                                             const Index n, const Index offset) {
  pstoreu_partial_common<Packet8us>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
                                                   const Index offset) {
  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
                                                      const Index offset) {
  pstoreu_partial_common<Packet16c>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
                                                        const Index offset) {
  pstoreu_partial_common<Packet16uc>(to, from, n, offset);
}
template <>
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
  EIGEN_PPC_PREFETCH(addr);
}

template <>
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
  EIGEN_PPC_PREFETCH(addr);
}
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  EIGEN_ALIGN16 float x;
  vec_ste(a, 0, &x);
  return x;
}

template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  EIGEN_ALIGN16 int x;
  vec_ste(a, 0, &x);
  return x;
}

template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
  vec_ste(a, 0, &x);
  return x;
}
template <>
EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
  return pfirst_common<Packet8s>(a);
}

template <>
EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
  return pfirst_common<Packet8us>(a);
}

template <>
EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
  return pfirst_common<Packet16c>(a);
}

template <>
EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
  return pfirst_common<Packet16uc>(a);
}
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
  return reinterpret_cast<Packet4f>(
      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}

EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  return reinterpret_cast<Packet4i>(
      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}

EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
  return reinterpret_cast<Packet8s>(
      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
}

EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
  return reinterpret_cast<Packet8us>(
      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
}

EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
  return vec_perm(a, a, p16uc_REVERSE8);
}

EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
  return vec_perm(a, a, p16uc_REVERSE8);
}

EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
  return preverse<Packet8us>(a);
}
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
  return vec_abs(a);
}

EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
  return vec_abs(a);
}

EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
  return vec_abs(a);
}

EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
  return a;
}

EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
  return vec_abs(a);
}

EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
  return a;
}

EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
  EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
  return pand<Packet8us>(p8us_abs_mask, a);
}
EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
  return vec_sra(a.m_val, vec_splat_u16(15));
}

EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
  return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
}

template <int N>
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
  return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
}

template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
  return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
}

template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
  return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
}

template <int N>
EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
  return reinterpret_cast<Packet4f>(r);
}

template <int N>
EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
  return reinterpret_cast<Packet4f>(r);
}

template <int N>
EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
  return vec_sr(a, p4ui_mask);
}

template <int N>
EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
  return vec_sl(a, p4ui_mask);
}

template <int N>
EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
  return vec_sl(a, p8us_mask);
}

template <int N>
EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
  return vec_sr(a, p8us_mask);
}
EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
  return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
}

EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
  return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
}

EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
#ifdef _BIG_ENDIAN
  return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
#else
  return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
#endif
}

EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
  return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
}

#ifndef __VEC_CLASS_FP_NAN
#define __VEC_CLASS_FP_NAN (1 << 6)
#endif

#if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)

#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
#endif
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
#ifdef _ARCH_PWR10
  return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
#else
  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
  Packet4ui lsb = plogical_shift_right<16>(input);
  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));

  EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
  input = padd<Packet4ui>(input, rounding_bias);

  const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
  Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
  input = vec_sel(input, p4ui_nan, nan_selector);

#ifdef SUPPORT_BF16_SUBNORMALS
  Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
#endif
#else
#ifdef SUPPORT_BF16_SUBNORMALS
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));

  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));

  Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));

  Packet4ui nan_selector =
      pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));

  Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));

  Packet4ui subnormal_selector =
      pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));

  input = vec_sel(input, p4ui_nan, nan_selector);
  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
#else
  Packet4bi nan_selector = vec_cmpeq(p4f, p4f);

  input = vec_sel(p4ui_nan, input, nan_selector);
#endif
#endif

  input = plogical_shift_right<16>(input);
  return reinterpret_cast<Packet8us>(input);
#endif
}
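
// Rounding sketch (illustrative, not from the original source): the "lsb + 0x7FFF" bias
// above implements round-to-nearest-even on the upper 16 bits. With float bits 0x3F808000
// (a tie, even upper half) lsb = 0, so 0x3F808000 + 0x7FFF = 0x3F80FFFF still truncates to
// 0x3F80; with 0x3F818000 (a tie, odd upper half) lsb = 1, so adding 0x8000 gives
// 0x3F820000, which truncates to 0x3F82, rounding up to the even value.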
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
  return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);

EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
  return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);

EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
  return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);

EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
  return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
template <bool lohi = true>
EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
  Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
  Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);

  Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
  EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
  lsb = padd<Packet8us>(lsb, p8us_BIAS);
  lsb = padd<Packet8us>(lsb, p4f2);

  Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
  Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));

#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
  Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
  Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
  Packet8us nan_selector =
      Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));

  input = vec_sel(input, p8us_BIAS, nan_selector);

#ifdef SUPPORT_BF16_SUBNORMALS
  Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
  Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
  Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
                                                   reinterpret_cast<Packet4f>(subnormal_selector_hi));

  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
#endif
#else
#ifdef SUPPORT_BF16_SUBNORMALS
  // Test NaN and subnormal values
  const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
  Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);

  const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
  Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);

  Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
  Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));

  Packet8us nan_selector =
      pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));

  Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));

  Packet8us subnormal_selector =
      pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));

  input = vec_sel(input, p8us_BIAS, nan_selector);
  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
#else
  // Test only NaN
  Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
  Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
  Packet8us nan_selector =
      Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));

  input = vec_sel(p8us_BIAS, input, nan_selector);
#endif
#endif

  return input;
}
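// F32ToBf16Two rounds directly on the packed 16-bit halves: p4f holds the upper
// halves, p4f2 the lower halves, and vec_cmplt(lsb, p4f2) detects the carry out
// of the 16-bit addition (the sum wrapped below the addend); subtracting that
// all-ones mask from p4f is the same as adding the carry into the upper half.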
EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
#ifdef _ARCH_PWR10
  Packet8bf fp16_0 = F32ToBf16(lo);
  Packet8bf fp16_1 = F32ToBf16(hi);
  return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
#else
  return F32ToBf16Two(lo, hi);
#endif
}

EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
#ifdef _ARCH_PWR10
  return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
#else
  return F32ToBf16Two<false>(even, odd);
#endif
}
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
  Packet4f a_even = Bf16ToF32Even(A);       \
  Packet4f a_odd = Bf16ToF32Odd(A);         \
  Packet4f op_even = OP(a_even);            \
  Packet4f op_odd = OP(a_odd);              \
  return F32ToBf16(op_even, op_odd);

#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
  Packet4f a_even = Bf16ToF32Even(A);           \
  Packet4f a_odd = Bf16ToF32Odd(A);             \
  Packet4f b_even = Bf16ToF32Even(B);           \
  Packet4f b_odd = Bf16ToF32Odd(B);             \
  Packet4f op_even = OP(a_even, b_even);        \
  Packet4f op_odd = OP(a_odd, b_odd);           \
  return F32ToBf16(op_even, op_odd);

#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
  Packet4f a_even = Bf16ToF32Even(A);                \
  Packet4f a_odd = Bf16ToF32Odd(A);                  \
  Packet4f b_even = Bf16ToF32Even(B);                \
  Packet4f b_odd = Bf16ToF32Odd(B);                  \
  Packet4f op_even = OP(a_even, b_even);             \
  Packet4f op_odd = OP(a_odd, b_odd);                \
  return F32ToBf16Bool(op_even, op_odd);
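// These wrappers implement every bfloat16 kernel by widening: the even and odd
// 16-bit lanes are expanded to two Packet4f values, OP runs at float precision
// on each, and F32ToBf16(even, odd) re-interleaves and rounds the results. The
// _BOOL variant goes through F32ToBf16Bool, which is meant to preserve the
// all-ones / all-zeroes comparison masks rather than round them.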
template <>
EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
  EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
  return pxor<Packet8us>(p8us_neg_mask, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
}

template <>
EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
  return pldexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
}

template <>
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
  return pfrexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
  Packet4f a_even = Bf16ToF32Even(a);
  Packet4f a_odd = Bf16ToF32Odd(a);
  Packet4f e_even, e_odd;
  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
  e = F32ToBf16(e_even, e_odd);
  return F32ToBf16(op_even, op_odd);
}

template <>
EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
}
#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
}
#endif
template <>
EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
  Packet4f a_even = Bf16ToF32Even(a);
  Packet4f a_odd = Bf16ToF32Odd(a);
  Packet4f b_even = Bf16ToF32Even(b);
  Packet4f b_odd = Bf16ToF32Odd(b);
  Packet4f c_even = Bf16ToF32Even(c);
  Packet4f c_odd = Bf16ToF32Odd(c);
  Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
  Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
  return F32ToBf16(pmadd_even, pmadd_odd);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
  Packet4f a_even = Bf16ToF32Even(a);
  Packet4f a_odd = Bf16ToF32Odd(a);
  Packet4f b_even = Bf16ToF32Even(b);
  Packet4f b_odd = Bf16ToF32Odd(b);
  Packet4f c_even = Bf16ToF32Even(c);
  Packet4f c_odd = Bf16ToF32Odd(c);
  Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
  Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
  return F32ToBf16(pmadd_even, pmadd_odd);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
  Packet4f a_even = Bf16ToF32Even(a);
  Packet4f a_odd = Bf16ToF32Odd(a);
  Packet4f b_even = Bf16ToF32Even(b);
  Packet4f b_odd = Bf16ToF32Odd(b);
  Packet4f c_even = Bf16ToF32Even(c);
  Packet4f c_odd = Bf16ToF32Odd(c);
  Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
  Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
  return F32ToBf16(pmadd_even, pmadd_odd);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
  Packet4f a_even = Bf16ToF32Even(a);
  Packet4f a_odd = Bf16ToF32Odd(a);
  Packet4f b_even = Bf16ToF32Even(b);
  Packet4f b_odd = Bf16ToF32Odd(b);
  Packet4f c_even = Bf16ToF32Even(c);
  Packet4f c_odd = Bf16ToF32Odd(c);
  Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
  Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
  return F32ToBf16(pmadd_even, pmadd_odd);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
}

template <>
EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
  bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
                           bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
  return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
}
template <>
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
  Packet4f b, sum;
  b = vec_sld(a, a, 8);
  sum = padd<Packet4f>(a, b);
  b = vec_sld(sum, sum, 4);
  sum = padd<Packet4f>(sum, b);
  return pfirst(sum);
}

template <>
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
  Packet4i b, sum;
  b = vec_sld(a, a, 8);
  sum = padd<Packet4i>(a, b);
  b = vec_sld(sum, sum, 4);
  sum = padd<Packet4i>(sum, b);
  return pfirst(sum);
}

template <>
EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
  float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
  float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
  float f32_result = redux_even + redux_odd;
  return bfloat16(f32_result);
}
template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
  union {
    Packet v;
    __UNPACK_TYPE__(Packet) n[8];
  } vt;
  vt.v = a;

  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
  Packet4i first_half = pload<Packet4i>(first_loader);
  Packet4i second_half = pload<Packet4i>(second_loader);

  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
}

template <>
EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
  return predux_size8<Packet8s>(a);
}

template <>
EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
  return predux_size8<Packet8us>(a);
}

template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
  union {
    Packet v;
    __UNPACK_TYPE__(Packet) n[16];
  } vt;
  vt.v = a;

  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
  EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
  EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};

  Packet4i first_quarter = pload<Packet4i>(first_loader);
  Packet4i second_quarter = pload<Packet4i>(second_loader);
  Packet4i third_quarter = pload<Packet4i>(third_loader);
  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);

  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) +
                                              predux(third_quarter) + predux(fourth_quarter));
}

template <>
EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
  return predux_size16<Packet16c>(a);
}

template <>
EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
  return predux_size16<Packet16uc>(a);
}
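// predux_size8 / predux_size16 widen through memory: the narrow lanes are copied
// into aligned int arrays via a union, summed with the existing Packet4i predux
// (so the accumulation runs at 32-bit precision), and the total is cast back to
// the packet's scalar type at the end.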
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
  Packet4f prod;
  prod = pmul(a, vec_sld(a, a, 8));
  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
}
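// Reduction idiom used throughout this file: vec_sld(v, v, k) rotates the vector
// by k bytes, so folding v with its 8-byte rotation combines the two halves, the
// 4-byte rotation combines pairs, and so on until lane 0 holds the result, which
// pfirst() then extracts.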
template <>
EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  return aux[0] * aux[1] * aux[2] * aux[3];
}

template <>
EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
  Packet8s pair, quad, octo;
  pair = vec_mul(a, vec_sld(a, a, 8));
  quad = vec_mul(pair, vec_sld(pair, pair, 4));
  octo = vec_mul(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
  Packet8us pair, quad, octo;
  pair = vec_mul(a, vec_sld(a, a, 8));
  quad = vec_mul(pair, vec_sld(pair, pair, 4));
  octo = vec_mul(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
  float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
  float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
  float f32_result = redux_even * redux_odd;
  return bfloat16(f32_result);
}

template <>
EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
  Packet16c pair, quad, octo, result;
  pair = vec_mul(a, vec_sld(a, a, 8));
  quad = vec_mul(pair, vec_sld(pair, pair, 4));
  octo = vec_mul(quad, vec_sld(quad, quad, 2));
  result = vec_mul(octo, vec_sld(octo, octo, 1));
  return pfirst(result);
}

template <>
EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
  Packet16uc pair, quad, octo, result;
  pair = vec_mul(a, vec_sld(a, a, 8));
  quad = vec_mul(pair, vec_sld(pair, pair, 4));
  octo = vec_mul(quad, vec_sld(quad, quad, 2));
  result = vec_mul(octo, vec_sld(octo, octo, 1));
  return pfirst(result);
}
template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
  Packet b, res;
  b = vec_min(a, vec_sld(a, a, 8));
  res = vec_min(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template <>
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
  return predux_min4<Packet4f>(a);
}

template <>
EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
  return predux_min4<Packet4i>(a);
}

template <>
EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
  float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
  float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
  float f32_result = (std::min)(redux_even, redux_odd);
  return bfloat16(f32_result);
}

template <>
EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
  Packet8s pair, quad, octo;
  pair = vec_min(a, vec_sld(a, a, 8));
  quad = vec_min(pair, vec_sld(pair, pair, 4));
  octo = vec_min(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
  Packet8us pair, quad, octo;
  pair = vec_min(a, vec_sld(a, a, 8));
  quad = vec_min(pair, vec_sld(pair, pair, 4));
  octo = vec_min(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
  Packet16c pair, quad, octo, result;
  pair = vec_min(a, vec_sld(a, a, 8));
  quad = vec_min(pair, vec_sld(pair, pair, 4));
  octo = vec_min(quad, vec_sld(quad, quad, 2));
  result = vec_min(octo, vec_sld(octo, octo, 1));
  return pfirst(result);
}

template <>
EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
  Packet16uc pair, quad, octo, result;
  pair = vec_min(a, vec_sld(a, a, 8));
  quad = vec_min(pair, vec_sld(pair, pair, 4));
  octo = vec_min(quad, vec_sld(quad, quad, 2));
  result = vec_min(octo, vec_sld(octo, octo, 1));
  return pfirst(result);
}
template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
  Packet b, res;
  b = vec_max(a, vec_sld(a, a, 8));
  res = vec_max(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template <>
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
  return predux_max4<Packet4f>(a);
}

template <>
EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
  return predux_max4<Packet4i>(a);
}

template <>
EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
  float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
  float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
  float f32_result = (std::max)(redux_even, redux_odd);
  return bfloat16(f32_result);
}

template <>
EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
  Packet8s pair, quad, octo;
  pair = vec_max(a, vec_sld(a, a, 8));
  quad = vec_max(pair, vec_sld(pair, pair, 4));
  octo = vec_max(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
  Packet8us pair, quad, octo;
  pair = vec_max(a, vec_sld(a, a, 8));
  quad = vec_max(pair, vec_sld(pair, pair, 4));
  octo = vec_max(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
  Packet16c pair, quad, octo, result;
  pair = vec_max(a, vec_sld(a, a, 8));
  quad = vec_max(pair, vec_sld(pair, pair, 4));
  octo = vec_max(quad, vec_sld(quad, quad, 2));
  result = vec_max(octo, vec_sld(octo, octo, 1));
  return pfirst(result);
}

template <>
EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
  Packet16uc pair, quad, octo, result;
  pair = vec_max(a, vec_sld(a, a, 8));
  quad = vec_max(pair, vec_sld(pair, pair, 4));
  octo = vec_max(quad, vec_sld(quad, quad, 2));
  result = vec_max(octo, vec_sld(octo, octo, 1));
  return pfirst(result);
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
  return vec_any_ne(x, pzero(x));
}
template <typename T>
EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
  T t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
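// The 4x4 transpose works in two interleave rounds: vec_mergeh/vec_mergel on
// rows 0&2 and 1&3 produce vectors holding column pairs, and a second merge pass
// over those intermediates yields the four columns. The wider kernels below use
// the same idea with three passes for 8 lanes and four passes for 16 lanes.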
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
  Packet8s t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
  Packet8us t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
  Packet8us t0, t1, t2, t3;

  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
  t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
  t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
  t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
  Packet16c t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
  Packet16uc t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
  Packet8s v[8], sum[8];

  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
  sum[0] = vec_mergeh(v[0], v[4]);
  sum[1] = vec_mergel(v[0], v[4]);
  sum[2] = vec_mergeh(v[1], v[5]);
  sum[3] = vec_mergel(v[1], v[5]);
  sum[4] = vec_mergeh(v[2], v[6]);
  sum[5] = vec_mergel(v[2], v[6]);
  sum[6] = vec_mergeh(v[3], v[7]);
  sum[7] = vec_mergel(v[3], v[7]);

  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
  Packet8us v[8], sum[8];

  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
  sum[0] = vec_mergeh(v[0], v[4]);
  sum[1] = vec_mergel(v[0], v[4]);
  sum[2] = vec_mergeh(v[1], v[5]);
  sum[3] = vec_mergel(v[1], v[5]);
  sum[4] = vec_mergeh(v[2], v[6]);
  sum[5] = vec_mergel(v[2], v[6]);
  sum[6] = vec_mergeh(v[3], v[7]);
  sum[7] = vec_mergel(v[3], v[7]);

  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
  Packet8bf v[8], sum[8];

  v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
  v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
  v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
  v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
  v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
  v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
  v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
  v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
  sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
  sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
  sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
  sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
  sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
  sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
  sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
  sum[7] = vec_mergel(v[3].m_val, v[7].m_val);

  kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
  kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
  kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
  kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
  kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
  kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
  kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
  kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
  Packet16c step1[16], step2[16], step3[16];

  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);

  step2[0] = vec_mergeh(step1[0], step1[8]);
  step2[1] = vec_mergel(step1[0], step1[8]);
  step2[2] = vec_mergeh(step1[1], step1[9]);
  step2[3] = vec_mergel(step1[1], step1[9]);
  step2[4] = vec_mergeh(step1[2], step1[10]);
  step2[5] = vec_mergel(step1[2], step1[10]);
  step2[6] = vec_mergeh(step1[3], step1[11]);
  step2[7] = vec_mergel(step1[3], step1[11]);
  step2[8] = vec_mergeh(step1[4], step1[12]);
  step2[9] = vec_mergel(step1[4], step1[12]);
  step2[10] = vec_mergeh(step1[5], step1[13]);
  step2[11] = vec_mergel(step1[5], step1[13]);
  step2[12] = vec_mergeh(step1[6], step1[14]);
  step2[13] = vec_mergel(step1[6], step1[14]);
  step2[14] = vec_mergeh(step1[7], step1[15]);
  step2[15] = vec_mergel(step1[7], step1[15]);

  step3[0] = vec_mergeh(step2[0], step2[8]);
  step3[1] = vec_mergel(step2[0], step2[8]);
  step3[2] = vec_mergeh(step2[1], step2[9]);
  step3[3] = vec_mergel(step2[1], step2[9]);
  step3[4] = vec_mergeh(step2[2], step2[10]);
  step3[5] = vec_mergel(step2[2], step2[10]);
  step3[6] = vec_mergeh(step2[3], step2[11]);
  step3[7] = vec_mergel(step2[3], step2[11]);
  step3[8] = vec_mergeh(step2[4], step2[12]);
  step3[9] = vec_mergel(step2[4], step2[12]);
  step3[10] = vec_mergeh(step2[5], step2[13]);
  step3[11] = vec_mergel(step2[5], step2[13]);
  step3[12] = vec_mergeh(step2[6], step2[14]);
  step3[13] = vec_mergel(step2[6], step2[14]);
  step3[14] = vec_mergeh(step2[7], step2[15]);
  step3[15] = vec_mergel(step2[7], step2[15]);

  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
  Packet16uc step1[16], step2[16], step3[16];

  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);

  step2[0] = vec_mergeh(step1[0], step1[8]);
  step2[1] = vec_mergel(step1[0], step1[8]);
  step2[2] = vec_mergeh(step1[1], step1[9]);
  step2[3] = vec_mergel(step1[1], step1[9]);
  step2[4] = vec_mergeh(step1[2], step1[10]);
  step2[5] = vec_mergel(step1[2], step1[10]);
  step2[6] = vec_mergeh(step1[3], step1[11]);
  step2[7] = vec_mergel(step1[3], step1[11]);
  step2[8] = vec_mergeh(step1[4], step1[12]);
  step2[9] = vec_mergel(step1[4], step1[12]);
  step2[10] = vec_mergeh(step1[5], step1[13]);
  step2[11] = vec_mergel(step1[5], step1[13]);
  step2[12] = vec_mergeh(step1[6], step1[14]);
  step2[13] = vec_mergel(step1[6], step1[14]);
  step2[14] = vec_mergeh(step1[7], step1[15]);
  step2[15] = vec_mergel(step1[7], step1[15]);

  step3[0] = vec_mergeh(step2[0], step2[8]);
  step3[1] = vec_mergel(step2[0], step2[8]);
  step3[2] = vec_mergeh(step2[1], step2[9]);
  step3[3] = vec_mergel(step2[1], step2[9]);
  step3[4] = vec_mergeh(step2[2], step2[10]);
  step3[5] = vec_mergel(step2[2], step2[10]);
  step3[6] = vec_mergeh(step2[3], step2[11]);
  step3[7] = vec_mergel(step2[3], step2[11]);
  step3[8] = vec_mergeh(step2[4], step2[12]);
  step3[9] = vec_mergel(step2[4], step2[12]);
  step3[10] = vec_mergeh(step2[5], step2[13]);
  step3[11] = vec_mergel(step2[5], step2[13]);
  step3[12] = vec_mergeh(step2[6], step2[14]);
  step3[13] = vec_mergel(step2[6], step2[14]);
  step3[14] = vec_mergeh(step2[7], step2[15]);
  step3[15] = vec_mergel(step2[7], step2[15]);

  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
}
template <typename Packet>
EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
  Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
  return vec_sel(elsePacket, thenPacket, mask);
}
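// Selector lanes are 0 or 1; negating them yields 0x00000000 / 0xFFFFFFFF lane
// masks, which is the form vec_sel needs to choose between elsePacket and
// thenPacket bit by bit. The wider pblend overloads below build their masks the
// same way.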
template <>
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
                                    const Packet4i& elsePacket) {
  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
}

template <>
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
                                    const Packet4f& elsePacket) {
  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
}

template <>
EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
                                    const Packet8s& elsePacket) {
  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
                      ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
  return result;
}

template <>
EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
                                     const Packet8us& elsePacket) {
  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
                      ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
  return vec_sel(elsePacket, thenPacket, mask);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
                                     const Packet8bf& elsePacket) {
  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
}

template <>
EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
                                     const Packet16c& elsePacket) {
  Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};

  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
  return vec_sel(elsePacket, thenPacket, mask);
}

template <>
EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
                                      const Packet16uc& elsePacket) {
  Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};

  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
  return vec_sel(elsePacket, thenPacket, mask);
}
#ifdef EIGEN_VECTORIZE_VSX
typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;
#if EIGEN_COMP_CLANG
typedef Packet2ul Packet2bl;
#else
typedef __vector __bool long Packet2bl;
#endif

static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
static Packet2d p2d_ONE = {1.0, 1.0};
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
                             numext::bit_cast<double>(0x8000000000000000ull)};
#ifdef _BIG_ENDIAN
static Packet2d p2d_COUNTDOWN =
    reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
#else
static Packet2d p2d_COUNTDOWN =
    reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
#endif

template <int index>
Packet2d vec_splat_dbl(Packet2d& a) {
  return vec_splat(a, index);
}
template <>
struct packet_traits<double> : default_packet_traits {
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasErfc = EIGEN_FAST_MATH,
#if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
#else
    HasRsqrt = 0,
#endif
  };
};
template <>
struct unpacket_traits<Packet2d> {
  typedef double type;
  typedef Packet2l integer_packet;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef Packet2d half;
};

template <>
struct unpacket_traits<Packet2l> {
  typedef int64_t type;
  typedef Packet2l half;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = false,
    masked_load_available = false,
    masked_store_available = false
  };
};
inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
  union { Packet2l v; int64_t n[2]; } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1];
  return s;
}

inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
  union { Packet2d v; double n[2]; } vt;
  vt.v = v;
  s << vt.n[0] << ", " << vt.n[1];
  return s;
}
template <>
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD
  return vec_xl(0, const_cast<double*>(from));
}

template <>
EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
  return pload_partial_common<Packet2d>(from, n, offset);
}

template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_ALIGNED_STORE
  vec_xst(from, 0, to);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n,
                                                const Index offset) {
  pstore_partial_common<Packet2d>(to, from, n, offset);
}

template <>
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
  Packet2d v = {from, from};
  return v;
}

template <>
EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
  Packet2l v = {from, from};
  return v;
}

template <>
EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
  return reinterpret_cast<Packet2d>(v);
}

template <>
EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
                                               Packet2d& a3) {
  a0 = pset1<Packet2d>(a[0]);
  a1 = pset1<Packet2d>(a[1]);
  a2 = pset1<Packet2d>(a[2]);
  a3 = pset1<Packet2d>(a[3]);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
  return pgather_common<Packet2d>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
                                                                                 Index n) {
  return pgather_common<Packet2d>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from,
                                                                      Index stride) {
  pscatter_common<Packet2d>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
                                                                              Index stride, Index n) {
  pscatter_common<Packet2d>(to, from, stride, n);
}

template <>
EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
  return pset1<Packet2d>(a) + p2d_COUNTDOWN;
}
template <>
EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return a - b;
}

template <>
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return vec_xor(a, p2d_MZERO);
#endif
}

template <>
EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_madd(a, b, p2d_MZERO);
}

template <>
EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_div(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return vec_madd(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return vec_msub(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return vec_nmsub(a, b, c);
}

template <>
EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return vec_nmadd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
  Packet2d ret;
  __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
}

template <>
EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
  Packet2d ret;
  __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
}

template <>
EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
  return reinterpret_cast<Packet2d>(vec_cmple(a, b));
}

template <>
EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
  return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
}

template <>
EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
  return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
}
#ifdef __POWER8_VECTOR__
template <>
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
  return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
}
#else
template <>
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
  Packet4i halves =
      reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
  Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
  return reinterpret_cast<Packet2l>(pand(halves, flipped));
}
#endif
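// Without POWER8's 64-bit vec_cmpeq, equality is emulated with 32-bit compares:
// a 64-bit lane is equal only if both of its 32-bit halves are, so the 32-bit
// result is ANDed with a copy whose halves are swapped within each 64-bit lane
// (the p16uc_COMPLEX32_REV permute).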
template <>
EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
  return vec_nor(c, c);
}

template <>
EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_and(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_or(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_xor(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_and(a, vec_nor(b, b));
}
template <>
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
  Packet2d t = vec_add(
      reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
  Packet2d res;

  __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));

  return res;
}

template <>
EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
  return vec_ceil(a);
}

template <>
EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
  return vec_floor(a);
}

template <>
EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
  return vec_trunc(a);
}

template <>
EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
  Packet2d res;

  __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));

  return res;
}
template <>
EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return vec_xl(0, const_cast<double*>(from));
}

template <>
EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet2d>(from, n, offset);
}

template <>
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
  Packet2d p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet2d>(from);
  else
    p = ploadu<Packet2d>(from);
  return vec_splat_dbl<0>(p);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  vec_xst(from, 0, to);
}

template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n,
                                                 const Index offset) {
  pstoreu_partial_common<Packet2d>(to, from, n, offset);
}

template <>
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
  EIGEN_PPC_PREFETCH(addr);
}

template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  EIGEN_ALIGN16 double x[2];
  pstore<double>(x, a);
  return x[0];
}
template <>
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
  return vec_sld(a, a, 8);
}

template <>
EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
  return vec_abs(a);
}
#ifdef __POWER8_VECTOR__
template <>
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
  return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
}
#else
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
#else
static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
#endif

template <>
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
  Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
  return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
}
#endif
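// psignbit produces an all-ones lane wherever the sign bit is set: on POWER8 a
// 64-bit arithmetic shift by 63 broadcasts the sign directly; the fallback
// arithmetic-shifts every byte by 7 and then replicates each double's sign byte
// across its lane with the p16uc_DUPSIGN permutation.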
template <>
inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);

template <>
inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
#ifdef __POWER8_VECTOR__

template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
  const Packet2ul shift = {N, N};
  return vec_sl(a, shift);
}

template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
  const Packet2ul shift = {N, N};
  return vec_sr(a, shift);
}

#else

// 64-bit shifts emulated with 32-bit operations (pre-POWER8).
EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
  static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
                                  0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
#ifdef _BIG_ENDIAN
  return vec_perm(p4i_ZERO, a, perm);
#else
  return vec_perm(a, p4i_ZERO, perm);
#endif
}

EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
  static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                  0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
#ifdef _BIG_ENDIAN
  return vec_perm(p4i_ZERO, a, perm);
#else
  return vec_perm(a, p4i_ZERO, perm);
#endif
}
template <int N, typename EnableIf = void>
struct plogical_shift_left_impl;

template <int N>
struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
    static const unsigned n = static_cast<unsigned>(N);
    const Packet4ui shift = {n, n, n, n};
    const Packet4i ai = reinterpret_cast<Packet4i>(a);
    static const unsigned m = static_cast<unsigned>(32 - N);
    const Packet4ui shift_right = {m, m, m, m};
    const Packet4i out_hi = vec_sl(ai, shift);
    const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
  }
};

template <int N>
struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
    static const unsigned m = static_cast<unsigned>(N - 32);
    const Packet4ui shift = {m, m, m, m};
    const Packet4i ai = reinterpret_cast<Packet4i>(a);
    return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
  }
};

template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
  return plogical_shift_left_impl<N>::run(a);
}
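// Rough sketch of the emulation above: each 64-bit lane is handled as two 32-bit
// words. For N < 32 the in-word bits come from a plain 32-bit shift, while the
// bits that cross the word boundary are recovered by shifting the other
// direction by (32 - N) and moved into the neighbouring word by the
// shift_even_left permutation; for N >= 32 only that permuted word survives.
// plogical_shift_right below mirrors the scheme with shift_odd_right.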
template <int N, typename EnableIf = void>
struct plogical_shift_right_impl;

template <int N>
struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
    static const unsigned n = static_cast<unsigned>(N);
    const Packet4ui shift = {n, n, n, n};
    const Packet4i ai = reinterpret_cast<Packet4i>(a);
    static const unsigned m = static_cast<unsigned>(32 - N);
    const Packet4ui shift_left = {m, m, m, m};
    const Packet4i out_lo = vec_sr(ai, shift);
    const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
  }
};

template <int N>
struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
    static const unsigned m = static_cast<unsigned>(N - 32);
    const Packet4ui shift = {m, m, m, m};
    const Packet4i ai = reinterpret_cast<Packet4i>(a);
    return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
  }
};

template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
  return plogical_shift_right_impl<N>::run(a);
}
#endif
template <>
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
  // Clamp exponent to [-2099, 2099]
  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));

  // Split 2^e into four factors and multiply
  const Packet2l bias = {1023, 1023};
  Packet2l b = plogical_shift_right<2>(e);
  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
  Packet2d out = pmul(pmul(pmul(a, c), c), c);
  b = psub(psub(psub(e, b), b), b);
  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
  out = pmul(out, c);
  return out;
}
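// pldexp scales by 2^exponent without overflowing any intermediate factor: with
// b roughly a quarter of the clamped exponent e, the result is computed as
// a * 2^b * 2^b * 2^b * 2^(e - 3b), and each factor is materialised by shifting
// (b + 1023) into the double's 11-bit exponent field via plogical_shift_left<52>.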
template <>
EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
}

template <>
EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
  return pfrexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
  Packet2d b, sum;
  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
  sum = a + b;
  return pfirst<Packet2d>(sum);
}

template <>
EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
  return pfirst(
      pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}

template <>
EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
  return pfirst(
      pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}

template <>
EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
  return pfirst(
      pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
  Packet2d t0, t1;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = t0;
  kernel.packet[1] = t1;
}
template <>
EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
                                    const Packet2d& elsePacket) {
  Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
  Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
  return vec_sel(elsePacket, thenPacket, mask);
}

#endif  // EIGEN_VECTORIZE_VSX

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_PACKET_MATH_ALTIVEC_H