10#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11#define EIGEN_PACKET_MATH_ALTIVEC_H
14#include "../../InternalHeaderCheck.h"
// Architecture defaults. Each macro is only defined when the includer has not
// already provided it (#ifndef guard).
// NOTE(review): the matching #endif lines are not present in this listing
// (numbers jump 21 -> 24, 25 -> 29, 30 -> 33) -- confirm against the real header.
20#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
21#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
// Defined without a value: acts as a presence flag only.
24#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
29#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
30#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
// Packet type aliases over the compiler's __vector types.
// Naming follows Packet<lane count><element suffix>: f = float, i = int,
// ui = unsigned int, s = short, us = unsigned short, c = signed char,
// uc = unsigned char, bi = __bool vector of the matching element width.
33typedef __vector
float Packet4f;
34typedef __vector
int Packet4i;
35typedef __vector
unsigned int Packet4ui;
36typedef __vector __bool
int Packet4bi;
37typedef __vector
short int Packet8s;
38typedef __vector
unsigned short int Packet8us;
39typedef __vector __bool
short Packet8bi;
40typedef __vector
signed char Packet16c;
41typedef __vector
unsigned char Packet16uc;
// bfloat16 packets are not a bare vector type: they wrap a
// __vector unsigned short int in eigen_packet_wrapper (second template argument 0).
42typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
// Helpers that expand to the declaration of a packet variable named
// p<kind>_<NAME> whose lanes all hold X. They expand to a declaration only, so
// the caller supplies any storage class (e.g. `static`) and the trailing `;`.
// The _FAST_ variants use a brace initializer repeating X once per lane,
// except Packet4i which calls vec_splat_s32(X).
46#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
48#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
50#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
52#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
54#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
55 Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
// The non-FAST variants initialise through pset1<PacketType>(X) instead.
// NOTE(review): Packet2d / Packet2l are not declared in the visible part of
// this file -- presumably provided elsewhere; confirm.
57#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
59#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
61#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
63#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
// Declares a const Packet4f whose lanes hold the integer X reinterpreted as
// float bits (pset1<Packet4i> followed by reinterpret_cast).
65#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
66 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
// Packs three fields into one word: size shifted left by 24, count shifted
// left by 16, stride in the low bits. Arguments are parenthesised but not
// range-checked.
69#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
// Scalar element type of PACKETNAME, looked up via unpacket_traits. Expands
// with a leading `typename`, so it is meant for dependent (template) contexts.
70#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
// File-scope splat constants declared through the EIGEN_DECLARE_CONST_FAST_*
// macros; resulting names are p4f_ZERO, p4i_ZERO, p4i_ONE, p4i_MINUS16,
// p4i_MINUS1, p4ui_SIGN, p4ui_PREV0DOT5 and p8us_ONE.
// These are non-const statics, so each translation unit gets its own mutable copy.
73static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
74static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
75static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);
76static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16);
77static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);
78static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
79static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
80static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1);
// Built by shifting the all-ones pattern (p4i_MINUS1) left by itself and
// reinterpreting as float.
// NOTE(review): presumably this leaves only the sign bit set in each lane
// (i.e. -0.0f, matching the MZERO name) -- confirm vec_sl shift-count semantics.
81static Packet4f p4f_MZERO =
82 (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
// Integer 1 converted to float with a scale argument of 0.
84static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
// Lane-index ramps. Despite the COUNTDOWN name the values ascend: lane i holds i.
// They are added to a broadcast value by the plset<> specializations in this file.
87static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
88static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
89static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
90static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
92static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
93static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
// Byte-index tables. NOTE(review): presumably used as vec_perm control
// vectors; no use site is visible in this part of the file.
// REVERSE32 / REVERSE16 / REVERSE8 list the byte indices that reverse the
// order of the 4-byte, 2-byte and 1-byte elements respectively.
95static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
96static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
97static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
// Bytes 0-3 twice, then bytes 4-7 twice: each of the first two 32-bit words duplicated.
100static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
// Each even-numbered (resp. odd-numbered) 16-bit element repeated twice.
// These two are the only tables in this group declared const.
102static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
103static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
// First two 16-bit elements, each repeated four times.
105static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
// First four bytes, each repeated four times.
106static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
// The MERGE* tables use indices 16..31 as well, i.e. they address bytes of a
// second 16-byte operand. MERGEE16 / MERGEO16 interleave the even / odd 16-bit
// elements of the two operands.
108static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
109static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
// MERGEH16 / MERGEL16 concatenate the even / odd 16-bit elements of the first
// operand followed by those of the second.
111static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
113static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
// Permutation masks derived at static-initialisation time from p16uc_FORWARD.
// NOTE(review): p16uc_FORWARD, p16uc_PSET32_WODD, p16uc_PSET32_WEVEN and
// p16uc_HALF64_0_16 each appear twice below (listing lines 119-125 and
// 128-135). Presumably the two groups sit in opposite branches of a
// preprocessor conditional (likely endianness) whose #if/#else/#endif lines
// are missing from this listing -- confirm against the real header.
// NOTE(review): every vec_sld(...) call in those two groups ends with a
// trailing comma; the final (shift) argument line is not in this listing.
//
// First group: FORWARD comes from vec_lvsl on a null float pointer.
119static Packet16uc p16uc_FORWARD = vec_lvsl(0, (
float*)0);
120static Packet16uc p16uc_PSET32_WODD =
121 vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
123static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
125static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
// Second group: FORWARD is simply the REVERSE32 table.
128static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
129static Packet16uc p16uc_PSET32_WODD =
130 vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
132static Packet16uc p16uc_PSET32_WEVEN =
133 vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
135static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
// Masks common to both groups: high / low merges of the WODD and WEVEN masks
// viewed as 32-bit lanes.
139static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
140 (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);
141static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
142 (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);
// TRANSPOSE64_* = PSET64_* plus the per-byte offsets held in HALF64_0_16.
143static Packet16uc p16uc_TRANSPOSE64_HI =
144 p16uc_PSET64_HI + p16uc_HALF64_0_16;
145static Packet16uc p16uc_TRANSPOSE64_LO =
146 p16uc_PSET64_LO + p16uc_HALF64_0_16;
// REVERSE32 rotated by 8 bytes (vec_sld of the table with itself).
148static Packet16uc p16uc_COMPLEX32_REV =
149 vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);
// Prefetch hint for ADDR: __builtin_prefetch when the compiler has it (or is
// GNU-compatible), otherwise an inline-asm dcbt instruction.
// Both expansions already end in ';', so `EIGEN_PPC_PREFETCH(p);` produces an
// extra empty statement.
// NOTE(review): the #else/#endif lines are missing from this listing
// (numbers jump 152 -> 154 -> 158).
151#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
152#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
154#define EIGEN_PPC_PREFETCH(ADDR) asm(" dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
// Loop-unroll pragma in two spellings ("unroll 16" and "GCC unroll(16)").
// NOTE(review): the conditional choosing between them is not in this listing --
// presumably a compiler check; confirm.
158#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
160#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
// packet_traits specializations: map each scalar type to its packet type.
// In every specialization shown, `half` is the same type as `type`.
// NOTE(review): most of each struct body (the `template <>` header, the enum
// with the remaining Has* flags, the #else/#endif lines and the closing
// braces) is missing from this listing; only the lines below are visible.
//
// float -> Packet4f. The visible flags (Sin, Cos, Tanh, Erf, Erfc) are set
// from EIGEN_FAST_MATH.
164struct packet_traits<float> : default_packet_traits {
165 typedef Packet4f type;
166 typedef Packet4f half;
179 HasSin = EIGEN_FAST_MATH,
180 HasCos = EIGEN_FAST_MATH,
189#ifdef EIGEN_VECTORIZE_VSX
199 HasTanh = EIGEN_FAST_MATH,
200 HasErf = EIGEN_FAST_MATH,
201 HasErfc = EIGEN_FAST_MATH,
// bfloat16 -> Packet8bf.
212struct packet_traits<bfloat16> : default_packet_traits {
213 typedef Packet8bf type;
214 typedef Packet8bf half;
227 HasSin = EIGEN_FAST_MATH,
228 HasCos = EIGEN_FAST_MATH,
231#ifdef EIGEN_VECTORIZE_VSX
// int -> Packet4i. The POWER10 / compiler-version test below guards something
// not visible here; pdiv<Packet4i> in this file uses the same condition.
249struct packet_traits<int> : default_packet_traits {
250 typedef Packet4i type;
251 typedef Packet4i half;
261#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
// short int -> Packet8s.
271struct packet_traits<short int> : default_packet_traits {
272 typedef Packet8s type;
273 typedef Packet8s half;
// unsigned short int -> Packet8us.
288struct packet_traits<unsigned short int> : default_packet_traits {
289 typedef Packet8us type;
290 typedef Packet8us half;
// signed char -> Packet16c.
305struct packet_traits<signed char> : default_packet_traits {
306 typedef Packet16c type;
307 typedef Packet16c half;
// unsigned char -> Packet16uc.
322struct packet_traits<unsigned char> : default_packet_traits {
323 typedef Packet16uc type;
324 typedef Packet16uc half;
// unpacket_traits specializations: the reverse mapping from packet type to
// scalar `type` and `half` packet. In every specialization shown, masked loads
// and masked stores are reported as unavailable.
// NOTE(review): the `template <>` headers, the enum opening with the other
// entries (size, alignment, ...), and the closing braces are missing from this
// listing. For Packet4f and Packet4i the `typedef ... type;` line is also absent.
339struct unpacket_traits<Packet4f> {
341 typedef Packet4f half;
342 typedef Packet4i integer_packet;
347 masked_load_available =
false,
348 masked_store_available =
false
352struct unpacket_traits<Packet4i> {
354 typedef Packet4i half;
359 masked_load_available =
false,
360 masked_store_available =
false
364struct unpacket_traits<Packet8s> {
365 typedef short int type;
366 typedef Packet8s half;
371 masked_load_available =
false,
372 masked_store_available =
false
376struct unpacket_traits<Packet8us> {
377 typedef unsigned short int type;
378 typedef Packet8us half;
383 masked_load_available =
false,
384 masked_store_available =
false
389struct unpacket_traits<Packet16c> {
390 typedef signed char type;
391 typedef Packet16c half;
396 masked_load_available =
false,
397 masked_store_available =
false
401struct unpacket_traits<Packet16uc> {
402 typedef unsigned char type;
403 typedef Packet16uc half;
408 masked_load_available =
false,
409 masked_store_available =
false
414struct unpacket_traits<Packet8bf> {
415 typedef bfloat16 type;
416 typedef Packet8bf half;
421 masked_load_available =
false,
422 masked_store_available =
false
// Shared implementation behind the pload<> specializations: loads one full
// packet from `from`.
// Two return statements are visible: vec_xl(0, ...) directly under
// `#ifdef EIGEN_VECTORIZE_VSX` (const is cast away to match the intrinsic's
// pointer parameter), and vec_ld(0, from).
// `from` is also passed to EIGEN_UNUSED_VARIABLE; EIGEN_DEBUG_ALIGNED_LOAD is
// a hook macro defined outside this view.
// NOTE(review): the #else/#endif lines and the closing brace are missing from
// this listing (numbers jump 433 -> 435 -> 441) -- presumably vec_ld is the
// non-VSX branch; confirm against the real header.
426template <
typename Packet>
427EIGEN_STRONG_INLINE Packet pload_common(
const __UNPACK_TYPE__(Packet) * from) {
430 EIGEN_UNUSED_VARIABLE(from);
431 EIGEN_DEBUG_ALIGNED_LOAD
432#ifdef EIGEN_VECTORIZE_VSX
433 return vec_xl(0,
const_cast<__UNPACK_TYPE__(Packet)*
>(from));
435 return vec_ld(0, from);
// pload<> specializations, one per packet type. Each simply forwards to
// pload_common<> for the same packet type.
// NOTE(review): the `template <>` line and closing brace of each
// specialization are missing from this listing.
441EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
442 return pload_common<Packet4f>(from);
446EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int* from) {
447 return pload_common<Packet4i>(from);
451EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(
const short int* from) {
452 return pload_common<Packet8s>(from);
456EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(
const unsigned short int* from) {
457 return pload_common<Packet8us>(from);
461EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(
const signed char* from) {
462 return pload_common<Packet16c>(from);
466EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(
const unsigned char* from) {
467 return pload_common<Packet16uc>(from);
// bfloat16: the pointer is reinterpreted as unsigned short and loaded as a
// Packet8us, which is then returned as Packet8bf.
471EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(
const bfloat16* from) {
472 return pload_common<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
// Same load as pload_common (vec_xl under EIGEN_VECTORIZE_VSX, vec_ld also
// visible), but wrapped in a GCC diagnostic push/pop that disables
// -Wmaybe-uninitialized around the load. Callers in this file pass stack
// buffers that are only partly written (see pgather_common / pload_partial_common).
// NOTE(review): the preprocessor lines guarding the pragmas, the #else/#endif
// around the two returns, and the closing brace are missing from this listing.
475template <
typename Packet>
476EIGEN_ALWAYS_INLINE Packet pload_ignore(
const __UNPACK_TYPE__(Packet) * from) {
479 EIGEN_UNUSED_VARIABLE(from);
480 EIGEN_DEBUG_ALIGNED_LOAD
483#pragma GCC diagnostic push
484#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
486#ifdef EIGEN_VECTORIZE_VSX
487 return vec_xl(0,
const_cast<__UNPACK_TYPE__(Packet)*
>(from));
489 return vec_ld(0, from);
492#pragma GCC diagnostic pop
// bfloat16 specialization: reinterpret the pointer as unsigned short and reuse
// the Packet8us instantiation.
497EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(
const bfloat16* from) {
498 return pload_ignore<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
// Shared implementation behind pload_partial<>: loads `n` elements from `from`
// into a packet, placed `offset` elements into the packet.
// Precondition (internal assert): n + offset <= packet size.
// `size` is the byte size of one scalar element.
//
// Two implementations are visible:
//  1. vec_xl_len loads n * size bytes, then the packet is shifted by
//     offset * 8 * size bits with vec_sro or vec_slo.
//  2. A 16-byte-aligned stack array is filled starting at element `offset`,
//     either with a full unaligned 16-byte copy (pstoreu/ploadu) or with
//     memcpy of n2 bytes, and then loaded with pload_ignore. A zero packet is
//     returned on the remaining path.
// NOTE(review): the preprocessor lines selecting between 1 and 2, the `if`
// conditions, the declaration of `n2`, and the closing braces are all missing
// from this listing -- the control flow above is inferred from statement
// order only; confirm against the real header.
501template <
typename Packet>
502EIGEN_ALWAYS_INLINE Packet pload_partial_common(
const __UNPACK_TYPE__(Packet) * from,
const Index n,
503 const Index offset) {
506 const Index packet_size = unpacket_traits<Packet>::size;
507 eigen_internal_assert(n + offset <= packet_size &&
"number of elements plus offset will read past end of packet");
508 const Index size =
sizeof(__UNPACK_TYPE__(Packet));
510 EIGEN_UNUSED_VARIABLE(packet_size);
511 EIGEN_DEBUG_ALIGNED_LOAD
512 EIGEN_UNUSED_VARIABLE(from);
513 Packet load = vec_xl_len(
const_cast<__UNPACK_TYPE__(Packet)*
>(from), n * size);
// Shift amount in bits, broadcast to every byte lane.
515 Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
517 load = Packet(vec_sro(Packet16uc(load), shift));
519 load = Packet(vec_slo(Packet16uc(load), shift));
// Fallback path: stage the data in an aligned scratch array.
525 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
526 unsigned char* load2 =
reinterpret_cast<unsigned char*
>(load + offset);
527 unsigned char* from2 =
reinterpret_cast<unsigned char*
>(
const_cast<__UNPACK_TYPE__(Packet)*
>(from));
530 pstoreu(load2, ploadu<Packet16uc>(from2));
532 memcpy((
void*)load2, (
void*)from2, n2);
// The scratch array may be only partly written, hence pload_ignore.
534 return pload_ignore<Packet>(load);
536 return Packet(pset1<Packet16uc>(0));
// pload_partial<> specializations, one per packet type. Each forwards
// (from, n, offset) unchanged to pload_partial_common<>.
// NOTE(review): the `template <>` line and closing brace of each
// specialization are missing from this listing.
542EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(
const float* from,
const Index n,
const Index offset) {
543 return pload_partial_common<Packet4f>(from, n, offset);
547EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(
const int* from,
const Index n,
const Index offset) {
548 return pload_partial_common<Packet4i>(from, n, offset);
552EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(
const short int* from,
const Index n,
const Index offset) {
553 return pload_partial_common<Packet8s>(from, n, offset);
557EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(
const unsigned short int* from,
const Index n,
558 const Index offset) {
559 return pload_partial_common<Packet8us>(from, n, offset);
// bfloat16: pointer reinterpreted as unsigned short, loaded as Packet8us.
563EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(
const bfloat16* from,
const Index n,
const Index offset) {
564 return pload_partial_common<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from), n, offset);
568EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(
const signed char* from,
const Index n,
const Index offset) {
569 return pload_partial_common<Packet16c>(from, n, offset);
573EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(
const unsigned char* from,
const Index n,
const Index offset) {
574 return pload_partial_common<Packet16uc>(from, n, offset);
// Shared implementation behind the pstore<> specializations: writes the whole
// packet `from` to `to`. Under EIGEN_VECTORIZE_VSX this is vec_xst(from, 0, to).
// `to` is also passed to EIGEN_UNUSED_VARIABLE; EIGEN_DEBUG_ALIGNED_STORE is a
// hook macro defined outside this view.
// NOTE(review): the non-VSX branch, #else/#endif and the closing brace are
// missing from this listing (numbers jump 584 -> 591).
577template <
typename Packet>
578EIGEN_STRONG_INLINE
void pstore_common(__UNPACK_TYPE__(Packet) * to,
const Packet& from) {
581 EIGEN_UNUSED_VARIABLE(to);
582 EIGEN_DEBUG_ALIGNED_STORE
583#ifdef EIGEN_VECTORIZE_VSX
584 vec_xst(from, 0, to);
// pstore<> specializations, one per scalar type. Each forwards to
// pstore_common<> for the corresponding packet type.
// NOTE(review): the `template <>` line and closing brace of each
// specialization are missing from this listing.
591EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
592 pstore_common<Packet4f>(to, from);
596EIGEN_STRONG_INLINE
void pstore<int>(
int* to,
const Packet4i& from) {
597 pstore_common<Packet4i>(to, from);
601EIGEN_STRONG_INLINE
void pstore<short int>(
short int* to,
const Packet8s& from) {
602 pstore_common<Packet8s>(to, from);
606EIGEN_STRONG_INLINE
void pstore<unsigned short int>(
unsigned short int* to,
const Packet8us& from) {
607 pstore_common<Packet8us>(to, from);
// bfloat16: stores the wrapped raw vector (from.m_val) through an
// unsigned short view of the destination.
611EIGEN_STRONG_INLINE
void pstore<bfloat16>(bfloat16* to,
const Packet8bf& from) {
612 pstore_common<Packet8us>(
reinterpret_cast<unsigned short int*
>(to), from.m_val);
616EIGEN_STRONG_INLINE
void pstore<signed char>(
signed char* to,
const Packet16c& from) {
617 pstore_common<Packet16c>(to, from);
621EIGEN_STRONG_INLINE
void pstore<unsigned char>(
unsigned char* to,
const Packet16uc& from) {
622 pstore_common<Packet16uc>(to, from);
// Shared implementation behind pstore_partial<>: writes `n` elements of the
// packet `from`, starting at packet element `offset`, to `to`.
// Precondition (internal assert): n + offset <= packet size.
// `size` is the byte size of one scalar element.
//
// Two implementations are visible:
//  1. The packet is shifted by offset * 8 * size bits (vec_slo or vec_sro) and
//     n * size bytes are written with vec_xst_len.
//  2. The packet is staged in a 16-byte-aligned stack array; bytes starting at
//     element `offset` are then copied to `to`, either as one full 16-byte
//     packet (pstore/ploadu) or with memcpy of n2 bytes.
// NOTE(review): in path 1 `store` is used as a Packet without a visible
// declaration, and in path 2 neither the write of `from` into the array nor
// the declaration of `n2` is visible. The preprocessor lines, `if` conditions
// and closing braces are also missing from this listing -- confirm against the
// real header.
625template <
typename Packet>
626EIGEN_ALWAYS_INLINE
void pstore_partial_common(__UNPACK_TYPE__(Packet) * to,
const Packet& from,
const Index n,
627 const Index offset) {
630 const Index packet_size = unpacket_traits<Packet>::size;
631 eigen_internal_assert(n + offset <= packet_size &&
"number of elements plus offset will write past end of packet");
632 const Index size =
sizeof(__UNPACK_TYPE__(Packet));
634 EIGEN_UNUSED_VARIABLE(packet_size);
635 EIGEN_UNUSED_VARIABLE(to);
636 EIGEN_DEBUG_ALIGNED_STORE
// Shift amount in bits, broadcast to every byte lane. The shift directions
// are the mirror image of those in pload_partial_common.
639 Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
641 store = Packet(vec_slo(Packet16uc(store), shift));
643 store = Packet(vec_sro(Packet16uc(store), shift));
646 vec_xst_len(store, to, n * size);
// Fallback path: stage the packet in an aligned scratch array.
649 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
651 unsigned char* store2 =
reinterpret_cast<unsigned char*
>(store + offset);
652 unsigned char* to2 =
reinterpret_cast<unsigned char*
>(to);
655 pstore(to2, ploadu<Packet16uc>(store2));
657 memcpy((
void*)to2, (
void*)store2, n2);
// pstore_partial<> specializations, one per scalar type. Each forwards
// (to, from, n, offset) to pstore_partial_common<>.
// NOTE(review): the `template <>` line and closing brace of each
// specialization are missing from this listing.
664EIGEN_ALWAYS_INLINE
void pstore_partial<float>(
float* to,
const Packet4f& from,
const Index n,
const Index offset) {
665 pstore_partial_common<Packet4f>(to, from, n, offset);
669EIGEN_ALWAYS_INLINE
void pstore_partial<int>(
int* to,
const Packet4i& from,
const Index n,
const Index offset) {
670 pstore_partial_common<Packet4i>(to, from, n, offset);
674EIGEN_ALWAYS_INLINE
void pstore_partial<short int>(
short int* to,
const Packet8s& from,
const Index n,
675 const Index offset) {
676 pstore_partial_common<Packet8s>(to, from, n, offset);
// NOTE(review): the parameter line declaring `n` and `offset` for this
// specialization is missing from the listing (numbers jump 680 -> 682).
680EIGEN_ALWAYS_INLINE
void pstore_partial<unsigned short int>(
unsigned short int* to,
const Packet8us& from,
682 pstore_partial_common<Packet8us>(to, from, n, offset);
// bfloat16: stores the wrapped raw vector (from.m_val) through an
// unsigned short view of the destination.
686EIGEN_ALWAYS_INLINE
void pstore_partial<bfloat16>(bfloat16* to,
const Packet8bf& from,
const Index n,
687 const Index offset) {
688 pstore_partial_common<Packet8us>(
reinterpret_cast<unsigned short int*
>(to), from.m_val, n, offset);
692EIGEN_ALWAYS_INLINE
void pstore_partial<signed char>(
signed char* to,
const Packet16c& from,
const Index n,
693 const Index offset) {
694 pstore_partial_common<Packet16c>(to, from, n, offset);
698EIGEN_ALWAYS_INLINE
void pstore_partial<unsigned char>(
unsigned char* to,
const Packet16uc& from,
const Index n,
699 const Index offset) {
700 pstore_partial_common<Packet16uc>(to, from, n, offset);
// Broadcast helpers: build a packet whose 4, 8 or 16 lanes all hold `from`,
// using a brace initializer that repeats the value once per lane.
// NOTE(review): the `return v;` statement and closing brace of each helper are
// missing from this listing (numbers jump 705 -> 709, 711 -> 715, 717 -> 722).
703template <
typename Packet>
704EIGEN_STRONG_INLINE Packet pset1_size4(
const __UNPACK_TYPE__(Packet) & from) {
705 Packet v = {from, from, from, from};
709template <
typename Packet>
710EIGEN_STRONG_INLINE Packet pset1_size8(
const __UNPACK_TYPE__(Packet) & from) {
711 Packet v = {from, from, from, from, from, from, from, from};
715template <
typename Packet>
716EIGEN_STRONG_INLINE Packet pset1_size16(
const __UNPACK_TYPE__(Packet) & from) {
717 Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
// pset1<> specializations: broadcast one scalar to every lane by forwarding to
// the pset1_size4 / pset1_size8 / pset1_size16 helper matching the lane count.
// NOTE(review): the `template <>` line and closing brace of each
// specialization are missing from this listing.
722EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
723 return pset1_size4<Packet4f>(from);
727EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int& from) {
728 return pset1_size4<Packet4i>(from);
732EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(
const short int& from) {
733 return pset1_size8<Packet8s>(from);
737EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(
const unsigned short int& from) {
738 return pset1_size8<Packet8us>(from);
742EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(
const signed char& from) {
743 return pset1_size16<Packet16c>(from);
747EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(
const unsigned char& from) {
748 return pset1_size16<Packet16uc>(from);
// Broadcasts a raw 32-bit pattern: the unsigned value is passed to
// pset1<Packet4i> (implicitly converted to int) and the resulting vector is
// reinterpreted as Packet4f.
752EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(
unsigned int from) {
753 return reinterpret_cast<Packet4f
>(pset1<Packet4i>(from));
// bfloat16: the scalar is viewed as an unsigned short through a reference
// cast and broadcast as a Packet8us.
757EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(
const bfloat16& from) {
758 return pset1_size8<Packet8us>(
reinterpret_cast<const unsigned short int&
>(from));
// Loads four consecutive scalars from `a` with pload (aligned load) and
// broadcasts element k to every lane of output a<k>.
// a3 is used as the temporary holding the loaded packet, so it must be
// splatted last.
// NOTE(review): the parameter line declaring `a3` and the closing brace are
// missing from this listing (numbers jump 762 -> 764 and 768 -> 772).
761template <
typename Packet>
762EIGEN_STRONG_INLINE
void pbroadcast4_common(
const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
764 a3 = pload<Packet>(a);
765 a0 = vec_splat(a3, 0);
766 a1 = vec_splat(a3, 1);
767 a2 = vec_splat(a3, 2);
768 a3 = vec_splat(a3, 3);
// float and int specializations forward to the shared helper.
772EIGEN_STRONG_INLINE
void pbroadcast4<Packet4f>(
const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
773 pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
776EIGEN_STRONG_INLINE
void pbroadcast4<Packet4i>(
const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
777 pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
// Shared implementation behind pgather<> / pgather_partial<>: reads `n`
// elements spaced `stride` elements apart starting at `from`.
// `n` defaults to the full packet size; an internal assert requires
// n <= packet size.
//
// Visible paths:
//  - when n equals the packet size, ploadu; otherwise ploadu_partial(from, n);
//  - a scalar loop copying from[i * stride] into an aligned scratch array,
//    which is then loaded with pload_ignore. Only the first n entries of the
//    scratch array are written.
// NOTE(review): the condition choosing between the contiguous path and the
// loop (presumably a check that stride is 1), the else lines, and the closing
// braces are missing from this listing -- confirm against the real header.
780template <
typename Packet>
781EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(
const __UNPACK_TYPE__(Packet) * from,
Index stride,
782 const Index n = unpacket_traits<Packet>::size) {
783 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
784 eigen_internal_assert(n <= unpacket_traits<Packet>::size &&
"number of elements will gather past end of packet");
786 if (n == unpacket_traits<Packet>::size) {
787 return ploadu<Packet>(from);
789 return ploadu_partial<Packet>(from, n);
793 for (
Index i = 0; i < n; i++) {
794 a[i] = from[i * stride];
797 return pload_ignore<Packet>(a);
// pgather<> specializations: full-packet strided gather. Each forwards
// (from, stride) to pgather_common<>, leaving `n` at its default of the full
// packet size.
// NOTE(review): the `template <>` line and closing brace of each
// specialization are missing from this listing.
802EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
803 return pgather_common<Packet4f>(from, stride);
807EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(
const int* from,
Index stride) {
808 return pgather_common<Packet4i>(from, stride);
812EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(
const short int* from,
Index stride) {
813 return pgather_common<Packet8s>(from, stride);
// NOTE(review): the `stride` parameter line is missing here (817 -> 819).
817EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(
const unsigned short int* from,
819 return pgather_common<Packet8us>(from, stride);
// bfloat16 instantiates the helper directly on Packet8bf (no pointer cast).
823EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(
const bfloat16* from,
Index stride) {
824 return pgather_common<Packet8bf>(from, stride);
828EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(
const signed char* from,
Index stride) {
829 return pgather_common<Packet16c>(from, stride);
// NOTE(review): the `stride` parameter line is missing here (833 -> 835).
833EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(
const unsigned char* from,
835 return pgather_common<Packet16uc>(from, stride);
// pgather_partial<> specializations: strided gather of the first `n`
// elements. Each forwards (from, stride, n) to pgather_common<>.
// NOTE(review): for most specializations the parameter line declaring `n`
// (and in two cases `stride`) is missing from this listing, as are the
// `template <>` lines and closing braces. Only the Packet8us one shows its
// full parameter list.
839EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(
const float* from,
Index stride,
841 return pgather_common<Packet4f>(from, stride, n);
845EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(
const int* from,
Index stride,
847 return pgather_common<Packet4i>(from, stride, n);
851EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(
const short int* from,
Index stride,
853 return pgather_common<Packet8s>(from, stride, n);
857EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
858pgather_partial<unsigned short int, Packet8us>(
const unsigned short int* from,
Index stride,
const Index n) {
859 return pgather_common<Packet8us>(from, stride, n);
863EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(
const bfloat16* from,
Index stride,
865 return pgather_common<Packet8bf>(from, stride, n);
869EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(
const signed char* from,
871 return pgather_common<Packet16c>(from, stride, n);
875EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(
const unsigned char* from,
878 return pgather_common<Packet16uc>(from, stride, n);
// Shared implementation behind pscatter<> / pscatter_partial<>: writes the
// first `n` elements of packet `from` to `to`, spaced `stride` elements apart.
// `n` defaults to the full packet size; an internal assert requires
// n <= packet size.
//
// Visible paths:
//  - when n equals the packet size, pstoreu; otherwise pstoreu_partial(to, from, n);
//  - the packet is stored (aligned pstore) into a scratch array, then a scalar
//    loop writes a[i] to to[i * stride].
// NOTE(review): the parameter line declaring `stride`, the condition choosing
// between the contiguous path and the loop (presumably a check that stride is
// 1), the else lines and the closing braces are missing from this listing --
// confirm against the real header.
881template <
typename Packet>
882EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_common(__UNPACK_TYPE__(Packet) * to,
const Packet& from,
884 const Index n = unpacket_traits<Packet>::size) {
885 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
886 eigen_internal_assert(n <= unpacket_traits<Packet>::size &&
"number of elements will scatter past end of packet");
888 if (n == unpacket_traits<Packet>::size) {
889 return pstoreu(to, from);
891 return pstoreu_partial(to, from, n);
894 pstore<__UNPACK_TYPE__(Packet)>(a, from);
896 for (
Index i = 0; i < n; i++) {
897 to[i * stride] = a[i];
// pscatter<> specializations: full-packet strided scatter. Each forwards
// (to, from, stride) to pscatter_common<>, leaving `n` at its default of the
// full packet size.
// NOTE(review): several specializations below are missing the parameter line
// declaring `stride`; the `template <>` lines and closing braces are missing
// throughout this listing.
903EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
904 pscatter_common<Packet4f>(to, from, stride);
908EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter<int, Packet4i>(
int* to,
const Packet4i& from,
Index stride) {
909 pscatter_common<Packet4i>(to, from, stride);
913EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter<short int, Packet8s>(
short int* to,
const Packet8s& from,
915 pscatter_common<Packet8s>(to, from, stride);
919EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter<unsigned short int, Packet8us>(
unsigned short int* to,
920 const Packet8us& from,
922 pscatter_common<Packet8us>(to, from, stride);
// bfloat16 instantiates the helper directly on Packet8bf (no pointer cast).
926EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter<bfloat16, Packet8bf>(bfloat16* to,
const Packet8bf& from,
928 pscatter_common<Packet8bf>(to, from, stride);
932EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter<signed char, Packet16c>(
signed char* to,
const Packet16c& from,
934 pscatter_common<Packet16c>(to, from, stride);
938EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter<unsigned char, Packet16uc>(
unsigned char* to,
939 const Packet16uc& from,
Index stride) {
940 pscatter_common<Packet16uc>(to, from, stride);
// pscatter_partial<> specializations: strided scatter of the first `n`
// elements. Each forwards (to, from, stride, n) to pscatter_common<>.
// NOTE(review): every specialization below is missing at least the parameter
// line declaring `n` (and often `stride`); the `template <>` lines and closing
// braces are missing from this listing as well.
944EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_partial<float, Packet4f>(
float* to,
const Packet4f& from,
946 pscatter_common<Packet4f>(to, from, stride, n);
950EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_partial<int, Packet4i>(
int* to,
const Packet4i& from,
Index stride,
952 pscatter_common<Packet4i>(to, from, stride, n);
956EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_partial<short int, Packet8s>(
short int* to,
const Packet8s& from,
958 pscatter_common<Packet8s>(to, from, stride, n);
962EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_partial<unsigned short int, Packet8us>(
unsigned short int* to,
963 const Packet8us& from,
966 pscatter_common<Packet8us>(to, from, stride, n);
// bfloat16 instantiates the helper directly on Packet8bf (no pointer cast).
970EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to,
const Packet8bf& from,
972 pscatter_common<Packet8bf>(to, from, stride, n);
976EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_partial<signed char, Packet16c>(
signed char* to,
977 const Packet16c& from,
Index stride,
979 pscatter_common<Packet16c>(to, from, stride, n);
983EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_partial<unsigned char, Packet16uc>(
unsigned char* to,
984 const Packet16uc& from,
986 pscatter_common<Packet16uc>(to, from, stride, n);
// plset<> specializations: return the ramp {a, a+1, a+2, ...} by adding the
// matching *_COUNTDOWN constant (which holds 0, 1, 2, ... per lane) to a
// broadcast of `a`.
// NOTE(review): the `template <>` line and closing brace of each
// specialization are missing from this listing.
990EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
991 return pset1<Packet4f>(a) + p4f_COUNTDOWN;
994EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int& a) {
995 return pset1<Packet4i>(a) + p4i_COUNTDOWN;
998EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(
const short int& a) {
999 return pset1<Packet8s>(a) + p8s_COUNTDOWN;
1002EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(
const unsigned short int& a) {
1003 return pset1<Packet8us>(a) + p8us_COUNTDOWN;
1006EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(
const signed char& a) {
1007 return pset1<Packet16c>(a) + p16c_COUNTDOWN;
1010EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(
const unsigned char& a) {
1011 return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
// padd<> specializations for each packet type (including Packet4ui).
// NOTE(review): only the signatures are present in this listing; the bodies
// and closing braces are missing (numbers jump by 4 between specializations).
// Presumably each returns the lane-wise sum of a and b -- confirm against the
// real header.
1015EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1019EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1023EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1027EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1031EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1035EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1039EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
// psub<> specializations for each packet type.
// NOTE(review): only the signatures are present in this listing; the bodies
// and closing braces are missing (numbers jump by 4 between specializations).
// Presumably each returns the lane-wise difference a - b -- confirm against
// the real header.
1044EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1048EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1052EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1056EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1060EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1064EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
// pnegate overloads. Each has a `#ifdef __POWER8_VECTOR__` branch whose body
// is not visible, followed by a generic form:
//  - float: XOR with p4f_MZERO;
//  - integer types: subtract `a` from a zero vector (p4i_ZERO reinterpreted to
//    the packet type where needed).
// NOTE(review): the POWER8 branch bodies, the #else/#endif lines and the
// closing braces are missing from this listing -- presumably the visible
// statements are the fallback; confirm against the real header.
1069EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
1070#ifdef __POWER8_VECTOR__
1073 return vec_xor(a, p4f_MZERO);
1077EIGEN_STRONG_INLINE Packet16c pnegate(
const Packet16c& a) {
1078#ifdef __POWER8_VECTOR__
1081 return reinterpret_cast<Packet16c
>(p4i_ZERO) - a;
1085EIGEN_STRONG_INLINE Packet8s pnegate(
const Packet8s& a) {
1086#ifdef __POWER8_VECTOR__
1089 return reinterpret_cast<Packet8s
>(p4i_ZERO) - a;
1093EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
1094#ifdef __POWER8_VECTOR__
1097 return p4i_ZERO - a;
// pconj overloads for the real-valued packets Packet4f and Packet4i.
// NOTE(review): only the signatures are present in this listing; the bodies
// are missing. Presumably each returns `a` unchanged -- confirm against the
// real header.
1102EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
1106EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
// pmul<> specializations.
// float: computed as a multiply-add with p4f_MZERO as the addend.
// NOTE(review): presumably -0.0 is used as the addend so that the sign of a
// zero product is preserved -- confirm.
1111EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1112 return vec_madd(a, b, p4f_MZERO);
// NOTE(review): the body of the Packet4i specialization is missing from this
// listing (numbers jump 1115 -> 1119).
1115EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
// 16-bit and 8-bit integer packets use vec_mul directly.
1119EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1120 return vec_mul(a, b);
1123EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1124 return vec_mul(a, b);
1127EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1128 return vec_mul(a, b);
1131EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1132 return vec_mul(a, b);
// pdiv<Packet4f>: two implementations are visible.
//  - A reciprocal-refinement form: with y_0 an initial estimate,
//    t = 1 - y_0*b (vec_nmsub), y_1 = y_0 + y_0*t (vec_madd), and the result is
//    a*y_1 (vec_madd with p4f_MZERO as addend).
//  - A direct vec_div(a, b).
// NOTE(review): the line that initialises y_0 and the preprocessor lines
// selecting between the two forms are missing from this listing (numbers jump
// 1138 -> 1144 and 1147 -> 1149) -- presumably y_0 is a reciprocal estimate of
// b and vec_div is the VSX branch; confirm against the real header.
1136EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1138 Packet4f t, y_0, y_1;
1144 t = vec_nmsub(y_0, b, p4f_ONE);
1145 y_1 = vec_madd(y_0, t, y_0);
1147 return vec_madd(a, y_1, p4f_MZERO);
1149 return vec_div(a, b);
// pdiv<Packet4i>: uses vec_div when targeting POWER10 with a sufficiently
// recent compiler (LLVM, or GCC at least 11.0.0 per the version macro).
// The remaining statements mark a and b unused, trip eigen_assert(false) with
// an "unsupported" message and return a zero packet.
// NOTE(review): the #else/#endif lines are missing from this listing --
// presumably those statements are the fallback branch; confirm.
1154EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1155#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
1156 return vec_div(a, b);
1158 EIGEN_UNUSED_VARIABLE(a);
1159 EIGEN_UNUSED_VARIABLE(b);
1160 eigen_assert(
false &&
"packet integer division are not supported by AltiVec");
1161 return pset1<Packet4i>(0);
// pmadd overloads: a * b + c, implemented with vec_madd for the float and
// 16-bit integer packets.
// NOTE(review): the body of the Packet4i overload is missing from this
// listing (numbers jump 1171 -> 1175); `template <>` lines and closing braces
// are missing throughout.
1167EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1168 return vec_madd(a, b, c);
1171EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
1175EIGEN_STRONG_INLINE Packet8s pmadd(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
1176 return vec_madd(a, b, c);
1179EIGEN_STRONG_INLINE Packet8us pmadd(
const Packet8us& a,
const Packet8us& b,
const Packet8us& c) {
1180 return vec_madd(a, b, c);
// Fused multiply variants for Packet4f, compiled only under
// EIGEN_VECTORIZE_VSX. pmsub maps to vec_msub.
// Note the crossed names below: pnmadd calls vec_nmsub and pnmsub calls
// vec_nmadd.
// NOTE(review): presumably intentional, because the two APIs name the negated
// forms differently -- confirm against the intrinsic reference before
// "fixing". The matching #endif is not in this listing.
1183#ifdef EIGEN_VECTORIZE_VSX
1185EIGEN_STRONG_INLINE Packet4f pmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1186 return vec_msub(a, b, c);
1189EIGEN_STRONG_INLINE Packet4f pnmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1190 return vec_nmsub(a, b, c);
1193EIGEN_STRONG_INLINE Packet4f pnmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1194 return vec_nmadd(a, b, c);
// pmin<> specializations.
// float: under EIGEN_VECTORIZE_VSX an inline-asm sequence is used -- a
// greater-or-equal compare (xvcmpgesp) of a against b written into the output
// register, followed by xxsel choosing between a and b with that result as
// the mask. The output uses an early-clobber constraint ("=&wa").
// A plain vec_min(a, b) is also visible.
// NOTE(review): the declaration and return of `ret`, the #else/#endif lines
// and the closing brace are missing from this listing -- presumably vec_min is
// the non-VSX branch; confirm.
1199EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1200#ifdef EIGEN_VECTORIZE_VSX
1203 __asm__(
"xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" :
"=&wa"(ret) :
"wa"(a),
"wa"(b));
1206 return vec_min(a, b);
// Integer packets use vec_min directly.
1210EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1211 return vec_min(a, b);
1214EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1215 return vec_min(a, b);
1218EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1219 return vec_min(a, b);
1222EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1223 return vec_min(a, b);
1226EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1227 return vec_min(a, b);
// pmax<> specializations.
// float: under EIGEN_VECTORIZE_VSX an inline-asm sequence is used -- a
// greater-than compare (xvcmpgtsp) with the operands given as (b, a), written
// into the output register, followed by xxsel choosing between a and b with
// that result as the mask. The output uses an early-clobber constraint ("=&wa").
// A plain vec_max(a, b) is also visible.
// NOTE(review): the declaration and return of `ret`, the #else/#endif lines
// and the closing brace are missing from this listing -- presumably vec_max is
// the non-VSX branch; confirm.
1231EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1232#ifdef EIGEN_VECTORIZE_VSX
1235 __asm__(
"xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" :
"=&wa"(ret) :
"wa"(a),
"wa"(b));
1238 return vec_max(a, b);
// Integer packets use vec_max directly.
1242EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1243 return vec_max(a, b);
1246EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1247 return vec_max(a, b);
1250EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1251 return vec_max(a, b);
1254EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1255 return vec_max(a, b);
1258EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1259 return vec_max(a, b);
// Packet4f comparisons. Each returns the vec_cmp* result reinterpreted as
// Packet4f, so the result is a per-lane bit mask rather than a numeric value.
// NOTE(review): `template <>` lines, closing braces and the #endif matching
// the #ifdef below are missing from this listing, so the exact extent of the
// VSX-only region cannot be determined from here.
1263EIGEN_STRONG_INLINE Packet4f pcmp_le(
const Packet4f& a,
const Packet4f& b) {
1264 return reinterpret_cast<Packet4f
>(vec_cmple(a, b));
1267#ifdef EIGEN_VECTORIZE_VSX
1269EIGEN_STRONG_INLINE Packet4f pcmp_lt(
const Packet4f& a,
const Packet4f& b) {
1270 return reinterpret_cast<Packet4f
>(vec_cmplt(a, b));
1274EIGEN_STRONG_INLINE Packet4f pcmp_eq(
const Packet4f& a,
const Packet4f& b) {
1275 return reinterpret_cast<Packet4f
>(vec_cmpeq(a, b));
// "a < b or unordered" is computed as the bitwise complement of (a >= b):
// vec_nor(c, c) inverts every bit of the >= mask.
1278EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(
const Packet4f& a,
const Packet4f& b) {
1279 Packet4f c =
reinterpret_cast<Packet4f
>(vec_cmpge(a, b));
1280 return vec_nor(c, c);
// Packet4i comparisons: vec_cmple / vec_cmplt / vec_cmpeq results
// reinterpreted as Packet4i (per-lane bit masks).
// The #ifdef opens directly before pcmp_le.
// NOTE(review): the matching #endif is not in this listing, so which of the
// three functions are VSX-only cannot be determined from here; `template <>`
// lines and closing braces are missing too.
1283#ifdef EIGEN_VECTORIZE_VSX
1285EIGEN_STRONG_INLINE Packet4i pcmp_le(
const Packet4i& a,
const Packet4i& b) {
1286 return reinterpret_cast<Packet4i
>(vec_cmple(a, b));
1290EIGEN_STRONG_INLINE Packet4i pcmp_lt(
const Packet4i& a,
const Packet4i& b) {
1291 return reinterpret_cast<Packet4i
>(vec_cmplt(a, b));
1294EIGEN_STRONG_INLINE Packet4i pcmp_eq(
const Packet4i& a,
const Packet4i& b) {
1295 return reinterpret_cast<Packet4i
>(vec_cmpeq(a, b));
// The definition that follows sits behind an EIGEN_VECTORIZE_VSX guard; the
// matching #endif is not visible in this excerpt.
1297#ifdef EIGEN_VECTORIZE_VSX
// pcmp_le for Packet8s: the vec_cmple(a, b) result reinterpreted as Packet8s.
1299EIGEN_STRONG_INLINE Packet8s pcmp_le(
const Packet8s& a,
const Packet8s& b) {
1300 return reinterpret_cast<Packet8s
>(vec_cmple(a, b));
// pcmp_lt for Packet8s: the vec_cmplt(a, b) result reinterpreted as Packet8s.
1304EIGEN_STRONG_INLINE Packet8s pcmp_lt(
const Packet8s& a,
const Packet8s& b) {
1305 return reinterpret_cast<Packet8s
>(vec_cmplt(a, b));
// pcmp_eq for Packet8s: the vec_cmpeq(a, b) result reinterpreted as Packet8s.
1308EIGEN_STRONG_INLINE Packet8s pcmp_eq(
const Packet8s& a,
const Packet8s& b) {
1309 return reinterpret_cast<Packet8s
>(vec_cmpeq(a, b));
// The definition that follows sits behind an EIGEN_VECTORIZE_VSX guard; the
// matching #endif is not visible in this excerpt.
1311#ifdef EIGEN_VECTORIZE_VSX
// pcmp_le for Packet8us: the vec_cmple(a, b) result reinterpreted as Packet8us.
1313EIGEN_STRONG_INLINE Packet8us pcmp_le(
const Packet8us& a,
const Packet8us& b) {
1314 return reinterpret_cast<Packet8us
>(vec_cmple(a, b));
// pcmp_lt for Packet8us: the vec_cmplt(a, b) result reinterpreted as Packet8us.
1318EIGEN_STRONG_INLINE Packet8us pcmp_lt(
const Packet8us& a,
const Packet8us& b) {
1319 return reinterpret_cast<Packet8us
>(vec_cmplt(a, b));
// pcmp_eq for Packet8us: the vec_cmpeq(a, b) result reinterpreted as Packet8us.
1322EIGEN_STRONG_INLINE Packet8us pcmp_eq(
const Packet8us& a,
const Packet8us& b) {
1323 return reinterpret_cast<Packet8us
>(vec_cmpeq(a, b));
// The definition that follows sits behind an EIGEN_VECTORIZE_VSX guard; the
// matching #endif is not visible in this excerpt.
1325#ifdef EIGEN_VECTORIZE_VSX
// pcmp_le for Packet16c: the vec_cmple(a, b) result reinterpreted as Packet16c.
1327EIGEN_STRONG_INLINE Packet16c pcmp_le(
const Packet16c& a,
const Packet16c& b) {
1328 return reinterpret_cast<Packet16c
>(vec_cmple(a, b));
// pcmp_lt for Packet16c: the vec_cmplt(a, b) result reinterpreted as Packet16c.
1332EIGEN_STRONG_INLINE Packet16c pcmp_lt(
const Packet16c& a,
const Packet16c& b) {
1333 return reinterpret_cast<Packet16c
>(vec_cmplt(a, b));
// pcmp_eq for Packet16c: the vec_cmpeq(a, b) result reinterpreted as Packet16c.
1336EIGEN_STRONG_INLINE Packet16c pcmp_eq(
const Packet16c& a,
const Packet16c& b) {
1337 return reinterpret_cast<Packet16c
>(vec_cmpeq(a, b));
// The definition that follows sits behind an EIGEN_VECTORIZE_VSX guard; the
// matching #endif is not visible in this excerpt.
1339#ifdef EIGEN_VECTORIZE_VSX
// pcmp_le for Packet16uc: the vec_cmple(a, b) result reinterpreted as Packet16uc.
1341EIGEN_STRONG_INLINE Packet16uc pcmp_le(
const Packet16uc& a,
const Packet16uc& b) {
1342 return reinterpret_cast<Packet16uc
>(vec_cmple(a, b));
// pcmp_lt for Packet16uc: the vec_cmplt(a, b) result reinterpreted as Packet16uc.
1346EIGEN_STRONG_INLINE Packet16uc pcmp_lt(
const Packet16uc& a,
const Packet16uc& b) {
1347 return reinterpret_cast<Packet16uc
>(vec_cmplt(a, b));
// pcmp_eq for Packet16uc: the vec_cmpeq(a, b) result reinterpreted as Packet16uc.
1350EIGEN_STRONG_INLINE Packet16uc pcmp_eq(
const Packet16uc& a,
const Packet16uc& b) {
1351 return reinterpret_cast<Packet16uc
>(vec_cmpeq(a, b));
// pand specialization for Packet4f: returns vec_and(a, b).
1355EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1356 return vec_and(a, b);
// pand specialization for Packet4i: returns vec_and(a, b).
1359EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1360 return vec_and(a, b);
// pand specialization for Packet4ui: returns vec_and(a, b).
1363EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1364 return vec_and(a, b);
// pand specialization for Packet8us: returns vec_and(a, b).
1367EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1368 return vec_and(a, b);
// pand specialization for Packet8bf: forwards to the Packet8us specialization,
// i.e. operates on the raw 16-bit lanes of the bfloat16 packet.
1371EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1372 return pand<Packet8us>(a, b);
// por specialization for Packet4f: returns vec_or(a, b).
1376EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1377 return vec_or(a, b);
// por specialization for Packet4i: returns vec_or(a, b).
1380EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1381 return vec_or(a, b);
// por specialization for Packet8s: returns vec_or(a, b).
1384EIGEN_STRONG_INLINE Packet8s por<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1385 return vec_or(a, b);
// por specialization for Packet8us: returns vec_or(a, b).
1388EIGEN_STRONG_INLINE Packet8us por<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1389 return vec_or(a, b);
// por specialization for Packet8bf: forwards to the Packet8us specialization.
1392EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1393 return por<Packet8us>(a, b);
// pxor specialization for Packet4f: returns vec_xor(a, b).
1397EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1398 return vec_xor(a, b);
// pxor specialization for Packet4i: returns vec_xor(a, b).
1401EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1402 return vec_xor(a, b);
// pxor specialization for Packet8us: returns vec_xor(a, b).
1405EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1406 return vec_xor(a, b);
// pxor specialization for Packet8bf: forwards to the Packet8us specialization.
1409EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
1410 return pxor<Packet8us>(a, b);
// pandnot specialization for Packet4f: returns vec_andc(a, b)
// (presumably a AND (NOT b) - confirm against the intrinsic's documentation).
1414EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1415 return vec_andc(a, b);
// pandnot specialization for Packet4i: returns vec_andc(a, b).
1418EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1419 return vec_andc(a, b);
// pselect for Packet4f: blends a and b under `mask` via vec_sel.
// Note the argument order: b is passed first and a second, with the mask
// reinterpreted as Packet4ui.
1423EIGEN_STRONG_INLINE Packet4f pselect(
const Packet4f& mask,
const Packet4f& a,
const Packet4f& b) {
1424 return vec_sel(b, a,
reinterpret_cast<Packet4ui
>(mask));
// pround specialization for Packet4f.
// First forms t = a + ((bits(a) & p4ui_SIGN) | p4ui_PREV0DOT5), i.e. adds a
// constant carrying a's sign bits to a. p4ui_SIGN and p4ui_PREV0DOT5 are
// defined outside this excerpt (by name, presumably the sign-bit mask and the
// largest float below 0.5 - confirm).
// t is then passed through a single instruction into `res`: xvrspiz under
// EIGEN_VECTORIZE_VSX, otherwise (presumably the #else branch) vrfiz.
// NOTE(review): the declaration of `res`, the #else/#endif lines and the final
// return are not visible in this excerpt.
1428EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a) {
1429 Packet4f t = vec_add(
1430 reinterpret_cast<Packet4f
>(vec_or(vec_and(
reinterpret_cast<Packet4ui
>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
1433#ifdef EIGEN_VECTORIZE_VSX
1434 __asm__(
"xvrspiz %x0, %x1\n\t" :
"=&wa"(res) :
"wa"(t));
1436 __asm__(
"vrfiz %0, %1\n\t" :
"=v"(res) :
"v"(t));
// pceil specialization for Packet4f.
// NOTE(review): the body is not visible in this excerpt.
1442EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
// pfloor specialization for Packet4f: returns vec_floor(a).
1446EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
1447 return vec_floor(a);
// ptrunc specialization for Packet4f: returns vec_trunc(a).
1450EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(
const Packet4f& a) {
1451 return vec_trunc(a);
// The definition that follows sits behind an EIGEN_VECTORIZE_VSX guard; the
// matching #endif is not visible in this excerpt.
1453#ifdef EIGEN_VECTORIZE_VSX
// print specialization for Packet4f: runs the xvrspic instruction on `a` and
// writes the result to `res`.
// NOTE(review): the declaration and return of `res` are not visible here.
1455EIGEN_STRONG_INLINE Packet4f print<Packet4f>(
const Packet4f& a) {
1458 __asm__(
"xvrspic %x0, %x1\n\t" :
"=&wa"(res) :
"wa"(a));
// Shared implementation of the unaligned packet load.
//   from: pointer to the first scalar to load; no alignment is required by the
//         visible code.
// With EIGEN_VECTORIZE_VSX defined this is a single vec_xl at byte offset 0.
// The remaining statements (presumably the #else branch; the directive is not
// visible in this excerpt) load the two quadwords at byte offsets 0 and 15 with
// vec_ld, build a permute mask from the address with vec_lvsl, and combine the
// two quadwords with vec_perm.
1464template <
typename Packet>
1465EIGEN_STRONG_INLINE Packet ploadu_common(
const __UNPACK_TYPE__(Packet) * from) {
1466 EIGEN_DEBUG_UNALIGNED_LOAD
1467#if defined(EIGEN_VECTORIZE_VSX)
1468 return vec_xl(0,
const_cast<__UNPACK_TYPE__(Packet)*
>(from));
1470 Packet16uc MSQ = vec_ld(0, (
unsigned char*)from);
1471 Packet16uc LSQ = vec_ld(15, (
unsigned char*)from);
1472 Packet16uc mask = vec_lvsl(0, from);
1474 return (Packet)vec_perm(MSQ, LSQ, mask);
// ploadu specialization for Packet4f: forwards to ploadu_common<Packet4f>.
1479EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
1480 return ploadu_common<Packet4f>(from);
// ploadu specialization for Packet4i: forwards to ploadu_common<Packet4i>.
1483EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int* from) {
1484 return ploadu_common<Packet4i>(from);
// ploadu specialization for Packet8s: forwards to ploadu_common<Packet8s>.
1487EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(
const short int* from) {
1488 return ploadu_common<Packet8s>(from);
// ploadu specialization for Packet8us: forwards to ploadu_common<Packet8us>.
1491EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(
const unsigned short int* from) {
1492 return ploadu_common<Packet8us>(from);
// ploadu specialization for Packet8bf: reinterprets the bfloat16 pointer as
// unsigned short and loads through ploadu_common<Packet8us>.
1495EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(
const bfloat16* from) {
1496 return ploadu_common<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
// ploadu specialization for Packet16c: forwards to ploadu_common<Packet16c>.
1499EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(
const signed char* from) {
1500 return ploadu_common<Packet16c>(from);
// ploadu specialization for Packet16uc: forwards to ploadu_common<Packet16uc>.
1503EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(
const unsigned char* from) {
1504 return ploadu_common<Packet16uc>(from);
// Shared implementation of the partial unaligned load.
//   from:   pointer to the first scalar to read.
//   n:      number of scalars to read.
//   offset: element position inside the packet where the data is placed;
//           n + offset must not exceed the packet size (asserted below).
// Two implementations are visible. NOTE(review): the preprocessor directives
// and `if` statements that choose between them (and between the individual
// statements inside each) are missing from this excerpt, so the selection
// conditions are not documented here - confirm against the full file.
1507template <
typename Packet>
1508EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(
const __UNPACK_TYPE__(Packet) * from,
const Index n,
1509 const Index offset) {
1510 const Index packet_size = unpacket_traits<Packet>::size;
1511 eigen_internal_assert(n + offset <= packet_size &&
"number of elements plus offset will read past end of packet");
// size is the width of one scalar in bytes.
1512 const Index size =
sizeof(__UNPACK_TYPE__(Packet));
// First implementation: length-limited load of n * size bytes with vec_xl_len,
// then a whole-vector octet shift by offset * 8 * size bits. Both vec_sro and
// vec_slo appear; presumably only one applies per build (endianness?).
1514 EIGEN_UNUSED_VARIABLE(packet_size);
1515 EIGEN_DEBUG_ALIGNED_LOAD
1516 EIGEN_DEBUG_UNALIGNED_LOAD
1517 Packet load = vec_xl_len(
const_cast<__UNPACK_TYPE__(Packet)*
>(from), n * size);
1519 Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1521 load = Packet(vec_sro(Packet16uc(load), shift));
1523 load = Packet(vec_slo(Packet16uc(load), shift));
// Second implementation: stage the data in an EIGEN_ALIGN16 stack array,
// writing at element `offset`, then load the whole array with pload_ignore.
1529 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
1530 unsigned char* load2 =
reinterpret_cast<unsigned char*
>(load + offset);
1531 unsigned char* from2 =
reinterpret_cast<unsigned char*
>(
const_cast<__UNPACK_TYPE__(Packet)*
>(from));
// n2 is the number of bytes to copy.
1532 Index n2 = n * size;
// Either a full 16-byte vector copy or a memcpy of n2 bytes fills the staging
// array (the condition between the two is not visible here).
1534 pstoreu(load2, ploadu<Packet16uc>(from2));
1536 memcpy((
void*)load2, (
void*)from2, n2);
1538 return pload_ignore<Packet>(load);
// All-zero packet return (its guarding condition is not visible here).
1540 return Packet(pset1<Packet16uc>(0));
// ploadu_partial specialization for Packet4f: forwards to
// ploadu_partial_common<Packet4f>.
1546EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(
const float* from,
const Index n,
const Index offset) {
1547 return ploadu_partial_common<Packet4f>(from, n, offset);
// ploadu_partial specialization for Packet4i: forwards to
// ploadu_partial_common<Packet4i>.
1550EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(
const int* from,
const Index n,
const Index offset) {
1551 return ploadu_partial_common<Packet4i>(from, n, offset);
// ploadu_partial specialization for Packet8s: forwards to
// ploadu_partial_common<Packet8s>.
1554EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(
const short int* from,
const Index n,
const Index offset) {
1555 return ploadu_partial_common<Packet8s>(from, n, offset);
// ploadu_partial specialization for Packet8us: forwards to
// ploadu_partial_common<Packet8us>.
1558EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(
const unsigned short int* from,
const Index n,
1559 const Index offset) {
1560 return ploadu_partial_common<Packet8us>(from, n, offset);
// ploadu_partial specialization for Packet8bf: reinterprets the bfloat16
// pointer as unsigned short and forwards to ploadu_partial_common<Packet8us>.
1563EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(
const bfloat16* from,
const Index n,
const Index offset) {
1564 return ploadu_partial_common<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from), n, offset);
// ploadu_partial specialization for Packet16c: forwards to
// ploadu_partial_common<Packet16c>.
1567EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(
const signed char* from,
const Index n,
const Index offset) {
1568 return ploadu_partial_common<Packet16c>(from, n, offset);
// ploadu_partial specialization for Packet16uc: forwards to
// ploadu_partial_common<Packet16uc>.
1571EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(
const unsigned char* from,
const Index n,
1572 const Index offset) {
1573 return ploadu_partial_common<Packet16uc>(from, n, offset);
// Shared implementation of ploaddup: loads a packet from `from` and returns
// vec_mergeh(p, p), which interleaves the packet with itself.
// The address is tested for 16-byte alignment: pload is used when aligned and
// ploadu otherwise (the `else` keyword and the declaration of `p` are not
// visible in this excerpt).
1576template <
typename Packet>
1577EIGEN_STRONG_INLINE Packet ploaddup_common(
const __UNPACK_TYPE__(Packet) * from) {
1579 if ((std::ptrdiff_t(from) % 16) == 0)
1580 p = pload<Packet>(from);
1582 p = ploadu<Packet>(from);
1583 return vec_mergeh(p, p);
// ploaddup specialization for Packet4f: forwards to ploaddup_common<Packet4f>.
1586EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
1587 return ploaddup_common<Packet4f>(from);
// ploaddup specialization for Packet4i: forwards to ploaddup_common<Packet4i>.
1590EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int* from) {
1591 return ploaddup_common<Packet4i>(from);
// ploaddup specialization for Packet8s: aligned (pload) or unaligned (ploadu)
// load depending on the 16-byte alignment of `from`, followed by
// vec_mergeh(p, p). The declaration of `p` and the `else` are not visible here.
1595EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(
const short int* from) {
1597 if ((std::ptrdiff_t(from) % 16) == 0)
1598 p = pload<Packet8s>(from);
1600 p = ploadu<Packet8s>(from);
1601 return vec_mergeh(p, p);
// ploaddup specialization for Packet8us: same structure as the Packet8s
// version - alignment-dependent load followed by vec_mergeh(p, p).
1605EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(
const unsigned short int* from) {
1607 if ((std::ptrdiff_t(from) % 16) == 0)
1608 p = pload<Packet8us>(from);
1610 p = ploadu<Packet8us>(from);
1611 return vec_mergeh(p, p);
// ploadquad specialization for Packet8s: alignment-dependent load (pload when
// `from` is 16-byte aligned, ploadu otherwise) followed by a vec_perm of the
// packet with itself using the p16uc_QUADRUPLICATE16_HI pattern (defined
// outside this excerpt). The declaration of `p` and the `else` are not
// visible here.
1615EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(
const short int* from) {
1617 if ((std::ptrdiff_t(from) % 16) == 0)
1618 p = pload<Packet8s>(from);
1620 p = ploadu<Packet8s>(from);
1621 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
// ploadquad specialization for Packet8us: same structure as the Packet8s
// version, permuting with p16uc_QUADRUPLICATE16_HI.
1625EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(
const unsigned short int* from) {
1627 if ((std::ptrdiff_t(from) % 16) == 0)
1628 p = pload<Packet8us>(from);
1630 p = ploadu<Packet8us>(from);
1631 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
// ploadquad specialization for Packet8bf: reinterprets the bfloat16 pointer as
// unsigned short and forwards to ploadquad<Packet8us>.
1635EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(
const bfloat16* from) {
1636 return ploadquad<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
// ploaddup specialization for Packet16c: alignment-dependent load (pload when
// `from` is 16-byte aligned, ploadu otherwise) followed by vec_mergeh(p, p).
// The declaration of `p` and the `else` are not visible in this excerpt.
1640EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(
const signed char* from) {
1642 if ((std::ptrdiff_t(from) % 16) == 0)
1643 p = pload<Packet16c>(from);
1645 p = ploadu<Packet16c>(from);
1646 return vec_mergeh(p, p);
// ploaddup specialization for Packet16uc: same structure as the Packet16c
// version.
1650EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(
const unsigned char* from) {
1652 if ((std::ptrdiff_t(from) % 16) == 0)
1653 p = pload<Packet16uc>(from);
1655 p = ploadu<Packet16uc>(from);
1656 return vec_mergeh(p, p);
// ploadquad specialization for Packet16c: alignment-dependent load (pload when
// `from` is 16-byte aligned, ploadu otherwise) followed by a vec_perm of the
// packet with itself using the p16uc_QUADRUPLICATE16 pattern (defined outside
// this excerpt). The declaration of `p` and the `else` are not visible here.
1660EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(
const signed char* from) {
1662 if ((std::ptrdiff_t(from) % 16) == 0)
1663 p = pload<Packet16c>(from);
1665 p = ploadu<Packet16c>(from);
1666 return vec_perm(p, p, p16uc_QUADRUPLICATE16);
// ploadquad specialization for Packet16uc: same structure as the Packet16c
// version.
1670EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(
const unsigned char* from) {
1672 if ((std::ptrdiff_t(from) % 16) == 0)
1673 p = pload<Packet16uc>(from);
1675 p = ploadu<Packet16uc>(from);
1676 return vec_perm(p, p, p16uc_QUADRUPLICATE16);
// Shared implementation of the unaligned packet store.
//   to:   destination pointer; no alignment is required by the visible code.
//   from: packet to write.
// With EIGEN_VECTORIZE_VSX defined this is a single vec_xst at byte offset 0.
// The remaining statements (presumably the #else branch; the directive is not
// visible in this excerpt) perform a read-modify-write of the two quadwords at
// byte offsets 0 and 15: the existing memory is loaded, merged with `from`
// through vec_perm, and both quadwords are written back.
// NOTE(review): because surrounding bytes are read and rewritten, concurrent
// writers to neighbouring memory could presumably interfere - confirm.
1679template <
typename Packet>
1680EIGEN_STRONG_INLINE
void pstoreu_common(__UNPACK_TYPE__(Packet) * to,
const Packet& from) {
1681 EIGEN_DEBUG_UNALIGNED_STORE
1682#if defined(EIGEN_VECTORIZE_VSX)
1683 vec_xst(from, 0, to);
1687 Packet16uc MSQ, LSQ, edges;
1688 Packet16uc edgeAlign, align;
// Load the two existing quadwords that the destination range touches.
1690 MSQ = vec_ld(0, (
unsigned char*)to);
1691 LSQ = vec_ld(15, (
unsigned char*)to);
// Build `edges` from the existing memory using the vec_lvsl permute map.
1692 edgeAlign = vec_lvsl(0, to);
1693 edges = vec_perm(LSQ, MSQ, edgeAlign);
// Combine `edges` with the packet data using the vec_lvsr permute map.
1694 align = vec_lvsr(0, to);
1695 MSQ = vec_perm(edges, (Packet16uc)from, align);
1696 LSQ = vec_perm((Packet16uc)from, edges, align);
// The quadword at offset 15 is stored before the one at offset 0.
1697 vec_st(LSQ, 15, (
unsigned char*)to);
1698 vec_st(MSQ, 0, (
unsigned char*)to);
// pstoreu specialization for float: forwards to pstoreu_common<Packet4f>.
1702EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
1703 pstoreu_common<Packet4f>(to, from);
// pstoreu specialization for int: forwards to pstoreu_common<Packet4i>.
1706EIGEN_STRONG_INLINE
void pstoreu<int>(
int* to,
const Packet4i& from) {
1707 pstoreu_common<Packet4i>(to, from);
// pstoreu specialization for short int: forwards to pstoreu_common<Packet8s>.
1710EIGEN_STRONG_INLINE
void pstoreu<short int>(
short int* to,
const Packet8s& from) {
1711 pstoreu_common<Packet8s>(to, from);
// pstoreu specialization for unsigned short int: forwards to
// pstoreu_common<Packet8us>.
1714EIGEN_STRONG_INLINE
void pstoreu<unsigned short int>(
unsigned short int* to,
const Packet8us& from) {
1715 pstoreu_common<Packet8us>(to, from);
// pstoreu specialization for bfloat16: reinterprets the destination as
// unsigned short and stores the wrapped vector (from.m_val) through
// pstoreu_common<Packet8us>.
1718EIGEN_STRONG_INLINE
void pstoreu<bfloat16>(bfloat16* to,
const Packet8bf& from) {
1719 pstoreu_common<Packet8us>(
reinterpret_cast<unsigned short int*
>(to), from.m_val);
// pstoreu specialization for signed char: forwards to
// pstoreu_common<Packet16c>.
1722EIGEN_STRONG_INLINE
void pstoreu<signed char>(
signed char* to,
const Packet16c& from) {
1723 pstoreu_common<Packet16c>(to, from);
// pstoreu specialization for unsigned char: forwards to
// pstoreu_common<Packet16uc>.
1726EIGEN_STRONG_INLINE
void pstoreu<unsigned char>(
unsigned char* to,
const Packet16uc& from) {
1727 pstoreu_common<Packet16uc>(to, from);
// Shared implementation of the partial unaligned store.
//   to:     destination pointer.
//   from:   packet supplying the data.
//   n:      number of scalars to write.
//   offset: element position inside the packet where the data starts;
//           n + offset must not exceed the packet size (asserted below).
// Two implementations are visible. NOTE(review): the preprocessor directives
// and `if` statements that choose between them (and between the individual
// statements inside each) are missing from this excerpt, so the selection
// conditions are not documented here - confirm against the full file.
1730template <
typename Packet>
1731EIGEN_ALWAYS_INLINE
void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to,
const Packet& from,
const Index n,
1732 const Index offset) {
1733 const Index packet_size = unpacket_traits<Packet>::size;
1734 eigen_internal_assert(n + offset <= packet_size &&
"number of elements plus offset will write past end of packet");
// size is the width of one scalar in bytes.
1735 const Index size =
sizeof(__UNPACK_TYPE__(Packet));
// First implementation: whole-vector octet shift of the packet by
// offset * 8 * size bits (vec_slo and vec_sro both appear; presumably only one
// applies per build), then a length-limited store of n * size bytes.
1737 EIGEN_UNUSED_VARIABLE(packet_size);
1738 EIGEN_DEBUG_UNALIGNED_STORE
1739 Packet store = from;
1741 Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1743 store = Packet(vec_slo(Packet16uc(store), shift));
1745 store = Packet(vec_sro(Packet16uc(store), shift));
1748 vec_xst_len(store, to, n * size);
// Second implementation: spill the packet to an EIGEN_ALIGN16 stack array and
// copy from element `offset` of that array to the destination.
1751 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
1752 pstore(store, from);
1753 unsigned char* store2 =
reinterpret_cast<unsigned char*
>(store + offset);
1754 unsigned char* to2 =
reinterpret_cast<unsigned char*
>(to);
// n2 is the number of bytes to copy.
1755 Index n2 = n * size;
// Either a full 16-byte vector copy or a memcpy of n2 bytes writes the
// destination (the condition between the two is not visible here).
1757 pstoreu(to2, ploadu<Packet16uc>(store2));
1759 memcpy((
void*)to2, (
void*)store2, n2);
// pstoreu_partial specialization for float: forwards to
// pstoreu_partial_common<Packet4f>.
1766EIGEN_ALWAYS_INLINE
void pstoreu_partial<float>(
float* to,
const Packet4f& from,
const Index n,
const Index offset) {
1767 pstoreu_partial_common<Packet4f>(to, from, n, offset);
// pstoreu_partial specialization for int: forwards to
// pstoreu_partial_common<Packet4i>.
1770EIGEN_ALWAYS_INLINE
void pstoreu_partial<int>(
int* to,
const Packet4i& from,
const Index n,
const Index offset) {
1771 pstoreu_partial_common<Packet4i>(to, from, n, offset);
// pstoreu_partial specialization for short int: forwards to
// pstoreu_partial_common<Packet8s>.
1774EIGEN_ALWAYS_INLINE
void pstoreu_partial<short int>(
short int* to,
const Packet8s& from,
const Index n,
1775 const Index offset) {
1776 pstoreu_partial_common<Packet8s>(to, from, n, offset);
// pstoreu_partial specialization for unsigned short int: forwards to
// pstoreu_partial_common<Packet8us>.
// NOTE(review): the tail of the parameter list (the n and offset parameters
// used in the call) is missing from this excerpt.
1779EIGEN_ALWAYS_INLINE
void pstoreu_partial<unsigned short int>(
unsigned short int* to,
const Packet8us& from,
1781 pstoreu_partial_common<Packet8us>(to, from, n, offset);
// pstoreu_partial specialization for bfloat16: reinterprets the destination as
// unsigned short and forwards to pstoreu_partial_common<Packet8us>.
1784EIGEN_ALWAYS_INLINE
void pstoreu_partial<bfloat16>(bfloat16* to,
const Packet8bf& from,
const Index n,
1785 const Index offset) {
1786 pstoreu_partial_common<Packet8us>(
reinterpret_cast<unsigned short int*
>(to), from, n, offset);
// pstoreu_partial specialization for signed char: forwards to
// pstoreu_partial_common<Packet16c>.
1789EIGEN_ALWAYS_INLINE
void pstoreu_partial<signed char>(
signed char* to,
const Packet16c& from,
const Index n,
1790 const Index offset) {
1791 pstoreu_partial_common<Packet16c>(to, from, n, offset);
// pstoreu_partial specialization for unsigned char: forwards to
// pstoreu_partial_common<Packet16uc>.
1794EIGEN_ALWAYS_INLINE
void pstoreu_partial<unsigned char>(
unsigned char* to,
const Packet16uc& from,
const Index n,
1795 const Index offset) {
1796 pstoreu_partial_common<Packet16uc>(to, from, n, offset);
// prefetch specialization for float: expands EIGEN_PPC_PREFETCH on addr
// (the macro is defined outside this excerpt).
1800EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
1801 EIGEN_PPC_PREFETCH(addr);
// prefetch specialization for int: expands EIGEN_PPC_PREFETCH on addr.
1804EIGEN_STRONG_INLINE
void prefetch<int>(
const int* addr) {
1805 EIGEN_PPC_PREFETCH(addr);
// pfirst specialization for Packet4f.
// NOTE(review): only the declaration of the EIGEN_ALIGN16 scalar `x` is
// visible; the statements that fill and return it are missing from this
// excerpt.
1809EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
1810 EIGEN_ALIGN16
float x;
// pfirst specialization for Packet4i.
// NOTE(review): only the declaration of the EIGEN_ALIGN16 scalar `x` is
// visible; the rest of the body is missing from this excerpt.
1815EIGEN_STRONG_INLINE
int pfirst<Packet4i>(
const Packet4i& a) {
1816 EIGEN_ALIGN16
int x;
// Shared pfirst helper, generic over the packet type; returns the packet's
// scalar type.
// NOTE(review): only the declaration of the EIGEN_ALIGN16 scalar `x` is
// visible; the rest of the body is missing from this excerpt.
1821template <
typename Packet>
1822EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(
const Packet& a) {
1823 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
// pfirst specialization for Packet8s: forwards to pfirst_common<Packet8s>.
1829EIGEN_STRONG_INLINE
short int pfirst<Packet8s>(
const Packet8s& a) {
1830 return pfirst_common<Packet8s>(a);
// pfirst specialization for Packet8us: forwards to pfirst_common<Packet8us>.
1834EIGEN_STRONG_INLINE
unsigned short int pfirst<Packet8us>(
const Packet8us& a) {
1835 return pfirst_common<Packet8us>(a);
// pfirst specialization for Packet16c: forwards to pfirst_common<Packet16c>.
1839EIGEN_STRONG_INLINE
signed char pfirst<Packet16c>(
const Packet16c& a) {
1840 return pfirst_common<Packet16c>(a);
// pfirst specialization for Packet16uc: forwards to pfirst_common<Packet16uc>.
1844EIGEN_STRONG_INLINE
unsigned char pfirst<Packet16uc>(
const Packet16uc& a) {
1845 return pfirst_common<Packet16uc>(a);
// preverse for Packet4f: byte-permutes the packet with itself using the
// p16uc_REVERSE32 pattern (defined outside this excerpt) and reinterprets the
// result as Packet4f.
1849EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
1850 return reinterpret_cast<Packet4f
>(
1851 vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE32));
// preverse for Packet4i: same permutation (p16uc_REVERSE32), reinterpreted as
// Packet4i.
1854EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
1855 return reinterpret_cast<Packet4i
>(
1856 vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE32));
// preverse for Packet8s: byte-permutes the packet with itself using the
// p16uc_REVERSE16 pattern and reinterprets the result as Packet8s.
1859EIGEN_STRONG_INLINE Packet8s preverse(
const Packet8s& a) {
1860 return reinterpret_cast<Packet8s
>(
1861 vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE16));
// preverse for Packet8us: same permutation (p16uc_REVERSE16), reinterpreted as
// Packet8us.
1864EIGEN_STRONG_INLINE Packet8us preverse(
const Packet8us& a) {
1865 return reinterpret_cast<Packet8us
>(
1866 vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE16));
// preverse for Packet16c: vec_perm of the packet with itself using the
// p16uc_REVERSE8 pattern.
1869EIGEN_STRONG_INLINE Packet16c preverse(
const Packet16c& a) {
1870 return vec_perm(a, a, p16uc_REVERSE8);
// preverse for Packet16uc: vec_perm of the packet with itself using the
// p16uc_REVERSE8 pattern.
1873EIGEN_STRONG_INLINE Packet16uc preverse(
const Packet16uc& a) {
1874 return vec_perm(a, a, p16uc_REVERSE8);
// preverse for Packet8bf: forwards to preverse<Packet8us>.
1877EIGEN_STRONG_INLINE Packet8bf preverse(
const Packet8bf& a) {
1878 return preverse<Packet8us>(a);
// pabs for Packet4f, Packet4i, Packet8s, Packet8us, Packet16c and Packet16uc.
// NOTE(review): only the signatures of these six overloads are visible; their
// bodies are missing from this excerpt.
1882EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
1886EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
1890EIGEN_STRONG_INLINE Packet8s pabs(
const Packet8s& a) {
1894EIGEN_STRONG_INLINE Packet8us pabs(
const Packet8us& a) {
1898EIGEN_STRONG_INLINE Packet16c pabs(
const Packet16c& a) {
1902EIGEN_STRONG_INLINE Packet16uc pabs(
const Packet16uc& a) {
// pabs for Packet8bf: ANDs every 16-bit lane with 0x7FFF, clearing bit 15 of
// each lane.
1906EIGEN_STRONG_INLINE Packet8bf pabs(
const Packet8bf& a) {
1907 EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
1908 return pand<Packet8us>(p8us_abs_mask, a);
// psignbit for Packet8bf: applies vec_sra with a shift count of 15 to each
// 16-bit lane of the wrapped vector.
// NOTE(review): presumably this replicates bit 15 across the lane - confirm
// vec_sra's behaviour for an unsigned element type.
1912EIGEN_STRONG_INLINE Packet8bf psignbit(
const Packet8bf& a) {
1913 return vec_sra(a.m_val, vec_splat_u16(15));
// psignbit for Packet4f: casts to Packet4i, applies vec_sra with a shift count
// of 31 to each 32-bit lane, and casts the result back to Packet4f.
1916EIGEN_STRONG_INLINE Packet4f psignbit(
const Packet4f& a) {
1917 return (Packet4f)vec_sra((Packet4i)a, vec_splats((
unsigned int)(31)));
// Shift helpers for Packet4i. N is not declared in the visible lines;
// presumably it is a template parameter whose `template <int N>` line is
// missing from this excerpt. The shift count is built by broadcasting N with
// pset1<Packet4i> and reinterpreting it as Packet4ui.

// parithmetic_shift_right for Packet4i: vec_sra(a, N).
1921EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(
const Packet4i& a) {
1922 return vec_sra(a,
reinterpret_cast<Packet4ui
>(pset1<Packet4i>(N)));
// plogical_shift_right for Packet4i: vec_sr(a, N).
1925EIGEN_STRONG_INLINE Packet4i plogical_shift_right(
const Packet4i& a) {
1926 return vec_sr(a,
reinterpret_cast<Packet4ui
>(pset1<Packet4i>(N)));
// plogical_shift_left for Packet4i: vec_sl(a, N).
1929EIGEN_STRONG_INLINE Packet4i plogical_shift_left(
const Packet4i& a) {
1930 return vec_sl(a,
reinterpret_cast<Packet4ui
>(pset1<Packet4i>(N)));
// plogical_shift_left for Packet4f: reinterprets the lanes as Packet4ui,
// shifts each left by N with vec_sl, and reinterprets the result as Packet4f.
// N is presumably a template parameter whose declaration is missing from this
// excerpt. Despite its name, p4ui_mask holds the shift count N in every lane.
1933EIGEN_STRONG_INLINE Packet4f plogical_shift_left(
const Packet4f& a) {
1934 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1935 Packet4ui r = vec_sl(
reinterpret_cast<Packet4ui
>(a), p4ui_mask);
1936 return reinterpret_cast<Packet4f
>(r);
// plogical_shift_right for Packet4f: same structure, shifting right with
// vec_sr.
1940EIGEN_STRONG_INLINE Packet4f plogical_shift_right(
const Packet4f& a) {
1941 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1942 Packet4ui r = vec_sr(
reinterpret_cast<Packet4ui
>(a), p4ui_mask);
1943 return reinterpret_cast<Packet4f
>(r);
// plogical_shift_right for Packet4ui: vec_sr(a, N) with N broadcast to every
// lane (p4ui_mask holds the shift count). N is presumably a template parameter
// whose declaration is missing from this excerpt.
1947EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(
const Packet4ui& a) {
1948 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1949 return vec_sr(a, p4ui_mask);
// plogical_shift_left for Packet4ui: vec_sl(a, N) with N broadcast to every
// lane.
1953EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(
const Packet4ui& a) {
1954 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1955 return vec_sl(a, p4ui_mask);
// plogical_shift_left for Packet8us: vec_sl(a, N) with N broadcast to every
// 16-bit lane (p8us_mask holds the shift count). N is presumably a template
// parameter whose declaration is missing from this excerpt.
1959EIGEN_STRONG_INLINE Packet8us plogical_shift_left(
const Packet8us& a) {
1960 const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1961 return vec_sl(a, p8us_mask);
// plogical_shift_right for Packet8us: vec_sr(a, N) with N broadcast to every
// 16-bit lane.
1964EIGEN_STRONG_INLINE Packet8us plogical_shift_right(
const Packet8us& a) {
1965 const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1966 return vec_sr(a, p8us_mask);
// Bf16ToF32Even: views the bfloat16 packet as four 32-bit lanes and shifts
// each lane left by 16 bits, moving the low 16 bits of every lane into the
// upper half and zero-filling the lower half.
1969EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(
const Packet8bf& bf) {
1970 return plogical_shift_left<16>(
reinterpret_cast<Packet4f
>(bf.m_val));
// Bf16ToF32Odd: views the bfloat16 packet as four 32-bit lanes and ANDs each
// lane with 0xFFFF0000, keeping the upper 16 bits and clearing the lower 16.
1973EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(
const Packet8bf& bf) {
1974 const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1975 return pand<Packet4f>(
reinterpret_cast<Packet4f
>(bf.m_val),
reinterpret_cast<Packet4f
>(p4ui_high_mask));
// pmerge: combines the `even` and `odd` vectors into a single Packet8us with
// one vec_perm.
// Two alternative returns are visible: one permutes (odd, even) with
// p16uc_MERGEO16, the other permutes (even, odd) with p16uc_MERGEE16.
// NOTE(review): the preprocessor condition selecting between them is missing
// from this excerpt (presumably an endianness check - confirm).
1978EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
1980 return vec_perm(
reinterpret_cast<Packet8us
>(odd),
reinterpret_cast<Packet8us
>(even), p16uc_MERGEO16);
1982 return vec_perm(
reinterpret_cast<Packet8us
>(even),
reinterpret_cast<Packet8us
>(odd), p16uc_MERGEE16);
// F32ToBf16Bool: reinterprets the two float packets as Packet4ui and merges
// them with pmerge. No rounding or NaN handling is applied here; the macro
// BF16_TO_F32_BINARY_OP_WRAPPER_BOOL in this file feeds it comparison results.
1988EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1989 return pmerge(
reinterpret_cast<Packet4ui
>(even),
reinterpret_cast<Packet4ui
>(odd));
// Fallback definitions of the data-class masks passed to vec_test_data_class
// below, supplied only when the toolchain has not already defined them.
// NOTE(review): the matching #endif lines are not visible in this excerpt.
1994#ifndef __VEC_CLASS_FP_NAN
1995#define __VEC_CLASS_FP_NAN (1 << 6)
// Subnormal masks are only needed when SUPPORT_BF16_SUBNORMALS is enabled.
1998#if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
1999#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
2000#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
2002#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
// F32ToBf16 (single packet): converts four floats to bfloat16 bit patterns;
// each result occupies the low 16 bits of its 32-bit lane.
// NOTE(review): several preprocessor lines (#else/#endif and the guard around
// the first return) are missing from this excerpt; the structure described
// below is inferred from the visible #if/#ifdef lines - confirm against the
// full file.
2005EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
// Builtin path: conversion done by __builtin_vsx_xvcvspbf16 (its guarding
// directive is not visible here).
2007 return reinterpret_cast<Packet8us
>(__builtin_vsx_xvcvspbf16(
reinterpret_cast<Packet16uc
>(p4f)));
// Software path: add 0x7FFF plus bit 16 of the input to the raw bits, so that
// the later shift by 16 rounds instead of truncating.
2009 Packet4ui input =
reinterpret_cast<Packet4ui
>(p4f);
2010 Packet4ui lsb = plogical_shift_right<16>(input);
2011 lsb = pand<Packet4ui>(lsb,
reinterpret_cast<Packet4ui
>(p4i_ONE));
2013 EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
2014 Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
2015 input = padd<Packet4ui>(input, rounding_bias);
// NaN lanes are replaced with the fixed pattern 0x7FC00000.
2017 const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
// Power9 + VSX: classify lanes with vec_test_data_class.
2018#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2019 Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
2020 input = vec_sel(input, p4ui_nan, nan_selector);
// With subnormal support, subnormal lanes keep their original (unrounded) bits.
2022#ifdef SUPPORT_BF16_SUBNORMALS
2023 Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
2024 input = vec_sel(input,
reinterpret_cast<Packet4ui
>(p4f), subnormal_selector);
// Otherwise (presumably the #else branch): classify lanes by inspecting the
// exponent and mantissa bit fields directly.
2027#ifdef SUPPORT_BF16_SUBNORMALS
2029 const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
2030 Packet4ui
exp = pand<Packet4ui>(p4ui_exp_mask,
reinterpret_cast<Packet4ui
>(p4f));
2032 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
2033 Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask,
reinterpret_cast<Packet4ui
>(p4f));
2035 Packet4bi is_max_exp = vec_cmpeq(
exp, p4ui_exp_mask);
2036 Packet4bi is_mant_zero = vec_cmpeq(mantissa,
reinterpret_cast<Packet4ui
>(p4i_ZERO));
// NaN: all exponent bits set and a non-zero mantissa.
2038 Packet4ui nan_selector =
2039 pandnot<Packet4ui>(
reinterpret_cast<Packet4ui
>(is_max_exp),
reinterpret_cast<Packet4ui
>(is_mant_zero));
2041 Packet4bi is_zero_exp = vec_cmpeq(
exp,
reinterpret_cast<Packet4ui
>(p4i_ZERO));
// Subnormal: zero exponent and a non-zero mantissa.
2043 Packet4ui subnormal_selector =
2044 pandnot<Packet4ui>(
reinterpret_cast<Packet4ui
>(is_zero_exp),
reinterpret_cast<Packet4ui
>(is_mant_zero));
2046 input = vec_sel(input, p4ui_nan, nan_selector);
2047 input = vec_sel(input,
reinterpret_cast<Packet4ui
>(p4f), subnormal_selector);
// NaN-only variant: despite its name, nan_selector here is the p4f == p4f
// mask (set for lanes that compare equal to themselves), so the vec_sel
// operands are swapped to keep `input` for those lanes and use the NaN
// pattern for the rest.
2050 Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
2052 input = vec_sel(p4ui_nan, input, nan_selector);
// Move the upper 16 bits of every lane into the lower half.
2056 input = plogical_shift_right<16>(input);
2057 return reinterpret_cast<Packet8us
>(input);
// Bf16PackHigh / Bf16PackLow: gather 16-bit halves of two float packets into a
// single Packet8bf using vec_perm or vec_pack.
// Each function appears twice, with the parameter order (lo, hi) in the first
// pair and (hi, lo) in the second, and each body shows two alternative
// returns. F32ToBf16Two in this file calls them as Bf16PackHigh<lohi> and
// Bf16PackLow<lohi>.
// NOTE(review): the template headers, the condition choosing between the two
// returns in each body, and the preprocessor lines choosing between the two
// pairs of definitions are all missing from this excerpt - confirm against the
// full file.
2068EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
2070 return vec_perm(
reinterpret_cast<Packet8us
>(lo),
reinterpret_cast<Packet8us
>(hi), p16uc_MERGEH16);
2072 return vec_perm(
reinterpret_cast<Packet8us
>(hi),
reinterpret_cast<Packet8us
>(lo), p16uc_MERGEE16);
// First Bf16PackLow variant, parameters (lo, hi).
2082EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
2084 return vec_pack(
reinterpret_cast<Packet4ui
>(lo),
reinterpret_cast<Packet4ui
>(hi));
2086 return vec_perm(
reinterpret_cast<Packet8us
>(hi),
reinterpret_cast<Packet8us
>(lo), p16uc_MERGEO16);
// Second Bf16PackLow variant, parameters (hi, lo).
2091EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
2093 return vec_pack(
reinterpret_cast<Packet4ui
>(hi),
reinterpret_cast<Packet4ui
>(lo));
2095 return vec_perm(
reinterpret_cast<Packet8us
>(hi),
reinterpret_cast<Packet8us
>(lo), p16uc_MERGEE16);
// Second Bf16PackHigh variant, parameters (hi, lo).
2100EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
2102 return vec_perm(
reinterpret_cast<Packet8us
>(hi),
reinterpret_cast<Packet8us
>(lo), p16uc_MERGEL16);
2104 return vec_perm(
reinterpret_cast<Packet8us
>(hi),
reinterpret_cast<Packet8us
>(lo), p16uc_MERGEO16);
// F32ToBf16Two: converts two float packets to one packet of eight bfloat16
// values, working on 16-bit lanes.
//   lohi: forwarded to Bf16PackHigh/Bf16PackLow; defaults to true.
// NOTE(review): the #else/#endif lines and the final return are missing from
// this excerpt; the structure described below is inferred from the visible
// #if/#ifdef lines - confirm against the full file.
2114template <
bool lohi = true>
2115EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
// p4f / p4f2 hold the halves gathered by Bf16PackHigh / Bf16PackLow
// (by name, the upper and lower 16 bits of every float - confirm).
2116 Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
2117 Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
// lsb = (p4f & 1) + 0x7FFF + p4f2, computed in 16-bit lanes.
2119 Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
2120 EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
2121 lsb = padd<Packet8us>(lsb, p8us_BIAS);
2122 lsb = padd<Packet8us>(lsb, p4f2);
// A sum smaller than p4f2 means the 16-bit addition wrapped around; the
// comparison mask for those lanes is then subtracted from p4f (presumably an
// all-ones mask, so the subtraction adds one - confirm).
2124 Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
2125 Packet8us input = psub<Packet8us>(p4f,
reinterpret_cast<Packet8us
>(rounding_bias));
// Power9 + VSX: classify lanes with vec_test_data_class on the original
// floats, pack the 32-bit masks down to 16-bit lanes, and replace NaN lanes
// with p8us_BIAS (0x7FFF).
2127#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2128 Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
2129 Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
2130 Packet8us nan_selector =
2131 Bf16PackLow<lohi>(
reinterpret_cast<Packet4f
>(nan_selector_lo),
reinterpret_cast<Packet4f
>(nan_selector_hi));
2133 input = vec_sel(input, p8us_BIAS, nan_selector);
// With subnormal support, subnormal lanes keep the unrounded upper halves.
2135#ifdef SUPPORT_BF16_SUBNORMALS
2136 Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
2137 Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
2138 Packet8us subnormal_selector = Bf16PackLow<lohi>(
reinterpret_cast<Packet4f
>(subnormal_selector_lo),
2139 reinterpret_cast<Packet4f
>(subnormal_selector_hi));
2141 input = vec_sel(input,
reinterpret_cast<Packet8us
>(p4f), subnormal_selector);
// Otherwise (presumably the #else branch): classify lanes from the exponent
// (0x7F80) and mantissa (0x7F) fields of the 16-bit values in p4f.
2144#ifdef SUPPORT_BF16_SUBNORMALS
2146 const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
2147 Packet8us
exp = pand<Packet8us>(p8us_exp_mask, p4f);
2149 const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
2150 Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);
2152 Packet8bi is_max_exp = vec_cmpeq(
exp, p8us_exp_mask);
2153 Packet8bi is_mant_zero = vec_cmpeq(mantissa,
reinterpret_cast<Packet8us
>(p4i_ZERO));
// NaN: all exponent bits set and a non-zero mantissa field.
2155 Packet8us nan_selector =
2156 pandnot<Packet8us>(
reinterpret_cast<Packet8us
>(is_max_exp),
reinterpret_cast<Packet8us
>(is_mant_zero));
2158 Packet8bi is_zero_exp = vec_cmpeq(
exp,
reinterpret_cast<Packet8us
>(p4i_ZERO));
// Subnormal: zero exponent and a non-zero mantissa field.
2160 Packet8us subnormal_selector =
2161 pandnot<Packet8us>(
reinterpret_cast<Packet8us
>(is_zero_exp),
reinterpret_cast<Packet8us
>(is_mant_zero));
2164 input = vec_sel(input, p8us_BIAS, nan_selector);
2165 input = vec_sel(input,
reinterpret_cast<Packet8us
>(p4f), subnormal_selector);
// NaN-only variant: despite their names, the selectors here are the x == x
// masks (set for lanes that compare equal to themselves), so the vec_sel
// operands are swapped to keep `input` for those lanes and use p8us_BIAS for
// the rest.
2168 Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
2169 Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
2170 Packet8us nan_selector =
2171 Bf16PackLow<lohi>(
reinterpret_cast<Packet4f
>(nan_selector_lo),
reinterpret_cast<Packet4f
>(nan_selector_hi));
2173 input = vec_sel(p8us_BIAS, input, nan_selector);
// F32ToBf16Both: converts two float packets (lo, hi) to one bfloat16 packet.
// Two implementations are visible: converting each packet with the
// single-packet F32ToBf16 and combining the results with vec_pack, or
// delegating to F32ToBf16Two with its default template argument.
// NOTE(review): the preprocessor condition choosing between them is missing
// from this excerpt.
2183EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
2185 Packet8bf fp16_0 = F32ToBf16(lo);
2186 Packet8bf fp16_1 = F32ToBf16(hi);
2187 return vec_pack(
reinterpret_cast<Packet4ui
>(fp16_0.m_val),
reinterpret_cast<Packet4ui
>(fp16_1.m_val));
2189 return F32ToBf16Two(lo, hi);
// F32ToBf16 (even/odd overload): converts the `even` and `odd` float packets
// to one bfloat16 packet.
// Two implementations are visible: converting each packet with the
// single-packet F32ToBf16 and combining them with pmerge, or delegating to
// F32ToBf16Two<false>.
// NOTE(review): the preprocessor condition choosing between them is missing
// from this excerpt.
2196EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
2198 return pmerge(
reinterpret_cast<Packet4ui
>(F32ToBf16(even).m_val),
reinterpret_cast<Packet4ui
>(F32ToBf16(odd).m_val));
2200 return F32ToBf16Two<false>(even, odd);
// Function-body macro for unary bfloat16 operations: widens A into its even
// and odd float packets, applies OP to each, and returns the pair converted
// back with F32ToBf16. Expands to statements ending in a `return`, so it must
// be the whole body of a function returning Packet8bf.
2203#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
2204 Packet4f a_even = Bf16ToF32Even(A); \
2205 Packet4f a_odd = Bf16ToF32Odd(A); \
2206 Packet4f op_even = OP(a_even); \
2207 Packet4f op_odd = OP(a_odd); \
2208 return F32ToBf16(op_even, op_odd);
// Binary counterpart: widens A and B, applies OP lane-group by lane-group, and
// returns the result converted back with F32ToBf16.
2210#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
2211 Packet4f a_even = Bf16ToF32Even(A); \
2212 Packet4f a_odd = Bf16ToF32Odd(A); \
2213 Packet4f b_even = Bf16ToF32Even(B); \
2214 Packet4f b_odd = Bf16ToF32Odd(B); \
2215 Packet4f op_even = OP(a_even, b_even); \
2216 Packet4f op_odd = OP(a_odd, b_odd); \
2217 return F32ToBf16(op_even, op_odd);
// Same as the binary wrapper, but the results are combined with F32ToBf16Bool,
// which merges the raw bits without numeric conversion.
2219#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
2220 Packet4f a_even = Bf16ToF32Even(A); \
2221 Packet4f a_odd = Bf16ToF32Odd(A); \
2222 Packet4f b_even = Bf16ToF32Even(B); \
2223 Packet4f b_odd = Bf16ToF32Odd(B); \
2224 Packet4f op_even = OP(a_even, b_even); \
2225 Packet4f op_odd = OP(a_odd, b_odd); \
2226 return F32ToBf16Bool(op_even, op_odd);
// padd specialization for Packet8bf: computed in float via
// BF16_TO_F32_BINARY_OP_WRAPPER with padd<Packet4f>.
2229EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
2230 BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
// pmul specialization for Packet8bf: computed in float with pmul<Packet4f>.
2234EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
2235 BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
// pdiv specialization for Packet8bf: computed in float with pdiv<Packet4f>.
2239EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
2240 BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
// pnegate specialization for Packet8bf: XORs every 16-bit lane with 0x8000,
// flipping bit 15 of each lane without converting to float.
2244EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(
const Packet8bf& a) {
2245 EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
2246 return pxor<Packet8us>(p8us_neg_mask, a);
// psub specialization for Packet8bf: computed in float with psub<Packet4f>.
2250EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
2251 BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
// pexp specialization for Packet8bf: computed in float via
// BF16_TO_F32_UNARY_OP_WRAPPER with pexp_float (defined outside this excerpt).
2255EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(
const Packet8bf& a) {
2256 BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
// pexp2 specialization for Packet8bf: computed in float with generic_exp2
// (defined outside this excerpt).
2260EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(
const Packet8bf& a) {
2261 BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
// pldexp specialization for Packet4f: forwards to pldexp_generic.
2265EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(
const Packet4f& a,
const Packet4f& exponent) {
2266 return pldexp_generic(a, exponent);
// pldexp specialization for Packet8bf: computed in float via
// BF16_TO_F32_BINARY_OP_WRAPPER with pldexp<Packet4f>.
2269EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(
const Packet8bf& a,
const Packet8bf& exponent) {
2270 BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
// pfrexp specialization for Packet4f: forwards to pfrexp_generic, which writes
// `exponent` and returns the other part.
2274EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(
const Packet4f& a, Packet4f& exponent) {
2275 return pfrexp_generic(a, exponent);
// pfrexp specialization for Packet8bf: widens `a` into even/odd float packets,
// runs pfrexp<Packet4f> on each, then converts both the returned values and
// the exponents back to bfloat16. The exponent packet is written to `e`.
// NOTE(review): the declarations of e_even and e_odd are missing from this
// excerpt.
2278EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(
const Packet8bf& a, Packet8bf& e) {
2279 Packet4f a_even = Bf16ToF32Even(a);
2280 Packet4f a_odd = Bf16ToF32Odd(a);
2283 Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
2284 Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
2285 e = F32ToBf16(e_even, e_odd);
2286 return F32ToBf16(op_even, op_odd);
// Unary Packet8bf operations below all use BF16_TO_F32_UNARY_OP_WRAPPER: the
// input is widened to two float packets, the named float routine is applied,
// and the results are converted back to bfloat16.

// psin specialization for Packet8bf: uses psin_float.
2290EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(
const Packet8bf& a) {
2291 BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
// pcos specialization for Packet8bf: uses pcos_float.
2294EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(
const Packet8bf& a) {
2295 BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
// plog specialization for Packet8bf: uses plog_float.
2298EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(
const Packet8bf& a) {
2299 BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
// pfloor specialization for Packet8bf: uses pfloor<Packet4f>.
2302EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(
const Packet8bf& a) {
2303 BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
// pceil specialization for Packet8bf: uses pceil<Packet4f>.
2306EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(
const Packet8bf& a) {
2307 BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
// pround specialization for Packet8bf: uses pround<Packet4f>.
2310EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(
const Packet8bf& a) {
2311 BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
// ptrunc specialization for Packet8bf: uses ptrunc<Packet4f>.
2314EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(
const Packet8bf& a) {
2315 BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
// The definition that follows sits behind an EIGEN_VECTORIZE_VSX guard; the
// matching #endif is not visible in this excerpt.
2317#ifdef EIGEN_VECTORIZE_VSX
// print specialization for Packet8bf: uses print<Packet4f>.
2319EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(
const Packet8bf& a) {
2320 BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
2324EIGEN_STRONG_INLINE Packet8bf pmadd(
const Packet8bf& a,
const Packet8bf& b,
const Packet8bf& c) {
2325 Packet4f a_even = Bf16ToF32Even(a);
2326 Packet4f a_odd = Bf16ToF32Odd(a);
2327 Packet4f b_even = Bf16ToF32Even(b);
2328 Packet4f b_odd = Bf16ToF32Odd(b);
2329 Packet4f c_even = Bf16ToF32Even(c);
2330 Packet4f c_odd = Bf16ToF32Odd(c);
2331 Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
2332 Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
2333 return F32ToBf16(pmadd_even, pmadd_odd);
// pmsub for Packet8bf, evaluated in float: same even/odd split as pmadd above,
// but pmsub<Packet4f> is applied to each half.
2337EIGEN_STRONG_INLINE Packet8bf pmsub(
const Packet8bf& a,
const Packet8bf& b,
const Packet8bf& c) {
2338 Packet4f a_even = Bf16ToF32Even(a);
2339 Packet4f a_odd = Bf16ToF32Odd(a);
2340 Packet4f b_even = Bf16ToF32Even(b);
2341 Packet4f b_odd = Bf16ToF32Odd(b);
2342 Packet4f c_even = Bf16ToF32Even(c);
2343 Packet4f c_odd = Bf16ToF32Odd(c);
// The locals keep the "pmadd_" prefix but hold pmsub results.
2344 Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
2345 Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
2346 return F32ToBf16(pmadd_even, pmadd_odd);
// pnmadd for Packet8bf, evaluated in float: even/odd halves of a, b and c are
// passed to pnmadd<Packet4f> and the results are recombined with F32ToBf16.
2349EIGEN_STRONG_INLINE Packet8bf pnmadd(
const Packet8bf& a,
const Packet8bf& b,
const Packet8bf& c) {
2350 Packet4f a_even = Bf16ToF32Even(a);
2351 Packet4f a_odd = Bf16ToF32Odd(a);
2352 Packet4f b_even = Bf16ToF32Even(b);
2353 Packet4f b_odd = Bf16ToF32Odd(b);
2354 Packet4f c_even = Bf16ToF32Even(c);
2355 Packet4f c_odd = Bf16ToF32Odd(c);
// The locals keep the "pmadd_" prefix but hold pnmadd results.
2356 Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
2357 Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
2358 return F32ToBf16(pmadd_even, pmadd_odd);
// pnmsub for Packet8bf, evaluated in float: even/odd halves of a, b and c are
// passed to pnmsub<Packet4f> and the results are recombined with F32ToBf16.
2362EIGEN_STRONG_INLINE Packet8bf pnmsub(
const Packet8bf& a,
const Packet8bf& b,
const Packet8bf& c) {
2363 Packet4f a_even = Bf16ToF32Even(a);
2364 Packet4f a_odd = Bf16ToF32Odd(a);
2365 Packet4f b_even = Bf16ToF32Even(b);
2366 Packet4f b_odd = Bf16ToF32Odd(b);
2367 Packet4f c_even = Bf16ToF32Even(c);
2368 Packet4f c_odd = Bf16ToF32Odd(c);
// The locals keep the "pmadd_" prefix but hold pnmsub results.
2369 Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
2370 Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
2371 return F32ToBf16(pmadd_even, pmadd_odd);
// pmin for Packet8bf: delegates to BF16_TO_F32_BINARY_OP_WRAPPER with the
// Packet4f implementation of pmin (macro defined outside this view).
2375EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
2376 BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
// pmax for Packet8bf: delegates to BF16_TO_F32_BINARY_OP_WRAPPER with the
// Packet4f implementation of pmax (macro defined outside this view).
2380EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(
const Packet8bf& a,
const Packet8bf& b) {
2381 BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
// pcmp_lt for Packet8bf: delegates to BF16_TO_F32_BINARY_OP_WRAPPER_BOOL with
// the Packet4f comparison. The _BOOL macro variant is defined outside this
// view; presumably it packs the float masks back to 16-bit lanes - confirm.
2385EIGEN_STRONG_INLINE Packet8bf pcmp_lt(
const Packet8bf& a,
const Packet8bf& b) {
2386 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
// pcmp_lt_or_nan for Packet8bf: delegates to BF16_TO_F32_BINARY_OP_WRAPPER_BOOL
// with the Packet4f comparison (macro defined outside this view).
2389EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(
const Packet8bf& a,
const Packet8bf& b) {
2390 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
// pcmp_le for Packet8bf: delegates to BF16_TO_F32_BINARY_OP_WRAPPER_BOOL with
// the Packet4f comparison (macro defined outside this view).
2393EIGEN_STRONG_INLINE Packet8bf pcmp_le(
const Packet8bf& a,
const Packet8bf& b) {
2394 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
// pcmp_eq for Packet8bf: delegates to BF16_TO_F32_BINARY_OP_WRAPPER_BOOL with
// the Packet4f comparison (macro defined outside this view).
2397EIGEN_STRONG_INLINE Packet8bf pcmp_eq(
const Packet8bf& a,
const Packet8bf& b) {
2398 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
// pfirst for Packet8bf: takes the first lane through pfirst<Packet8us> and
// turns the raw 16-bit value into a bfloat16 via raw_uint16_to_bfloat16.
2402EIGEN_STRONG_INLINE bfloat16 pfirst(
const Packet8bf& a) {
2403 return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
// ploaddup for Packet8bf: reuses the Packet8us implementation by viewing the
// bfloat16 pointer as a pointer to unsigned short int.
2407EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(
const bfloat16* from) {
2408 return ploaddup<Packet8us>(
reinterpret_cast<const unsigned short int*
>(from));
// plset for Packet8bf: returns {a+0, a+1, ..., a+7}, built by adding a
// broadcast of `a` to a constant ramp loaded from a local array.
// The ramp is marked EIGEN_ALIGN16 because it is read with pload, matching
// how predux_size8/predux_size16 in this file align their stack arrays before
// calling pload; a bare bfloat16 array carries no 16-byte alignment guarantee.
2412EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(
const bfloat16& a) {
2413 EIGEN_ALIGN16 bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
2414 bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
2415 return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
// predux for Packet4f: reduces the four float lanes to a single float.
// Only the two vec_sld rotations (by 8 bytes, then by 4 bytes) are visible
// here; the declarations of b/sum, the accumulation steps and the return
// statement are missing from this extraction.
2419EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
2421 b = vec_sld(a, a, 8);
2423 b = vec_sld(sum, sum, 4);
// predux for Packet4i: reduces the four int lanes to a single int.
// Only the two vec_sld rotations (by 8 bytes, then by 4 bytes) are visible
// here; the declarations of b/sum, the accumulation steps and the return
// statement are missing from this extraction.
2429EIGEN_STRONG_INLINE
int predux<Packet4i>(
const Packet4i& a) {
2431 b = vec_sld(a, a, 8);
2433 b = vec_sld(sum, sum, 4);
// predux for Packet8bf: reduces the even and odd halves separately with
// predux<Packet4f>, adds the two partial results in float, and converts the
// total to bfloat16 once at the end.
2439EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(
const Packet8bf& a) {
2440 float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
2441 float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
2442 float f32_result = redux_even + redux_odd;
2443 return bfloat16(f32_result);
// Generic reduction for 8-lane integer packets: the eight lanes are copied
// into two 16-byte-aligned int[4] arrays, each is loaded as a Packet4i and
// reduced with predux, and the two partial results are added and cast back to
// the packet's scalar type.
// NOTE(review): `vt` (read below as vt.n[i]) is declared on lines missing from
// this extraction; presumably a union overlaying the packet with n[8] - confirm.
2445template <
typename Packet>
2446EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(
const Packet& a) {
2449 __UNPACK_TYPE__(Packet) n[8];
// The lanes are copied into int arrays, so the Packet4i reduction runs at int width.
2453 EIGEN_ALIGN16
int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2454 EIGEN_ALIGN16
int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2455 Packet4i first_half = pload<Packet4i>(first_loader);
2456 Packet4i second_half = pload<Packet4i>(second_loader);
2458 return static_cast<__UNPACK_TYPE__(Packet)
>(predux(first_half) + predux(second_half));
// predux for Packet8s: forwards to the generic 8-lane helper predux_size8.
2462EIGEN_STRONG_INLINE
short int predux<Packet8s>(
const Packet8s& a) {
2463 return predux_size8<Packet8s>(a);
// predux for Packet8us: forwards to the generic 8-lane helper predux_size8.
2467EIGEN_STRONG_INLINE
unsigned short int predux<Packet8us>(
const Packet8us& a) {
2468 return predux_size8<Packet8us>(a);
// Generic reduction for 16-lane integer packets: the sixteen lanes are copied
// into four 16-byte-aligned int[4] arrays, each is loaded as a Packet4i and
// reduced with predux, and the four partial results are added and cast back to
// the packet's scalar type.
// NOTE(review): `vt` (read below as vt.n[i]) is declared on lines missing from
// this extraction; presumably a union overlaying the packet with n[16] - confirm.
2471template <
typename Packet>
2472EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(
const Packet& a) {
2475 __UNPACK_TYPE__(Packet) n[16];
2479 EIGEN_ALIGN16
int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2480 EIGEN_ALIGN16
int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2481 EIGEN_ALIGN16
int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
2482 EIGEN_ALIGN16
int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
2484 Packet4i first_quarter = pload<Packet4i>(first_loader);
2485 Packet4i second_quarter = pload<Packet4i>(second_loader);
2486 Packet4i third_quarter = pload<Packet4i>(third_loader);
2487 Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
2489 return static_cast<__UNPACK_TYPE__(Packet)
>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
2490 predux(fourth_quarter));
// predux for Packet16c: forwards to the generic 16-lane helper predux_size16.
2494EIGEN_STRONG_INLINE
signed char predux<Packet16c>(
const Packet16c& a) {
2495 return predux_size16<Packet16c>(a);
// predux for Packet16uc: forwards to the generic 16-lane helper predux_size16.
2499EIGEN_STRONG_INLINE
unsigned char predux<Packet16uc>(
const Packet16uc& a) {
2500 return predux_size16<Packet16uc>(a);
// predux_mul for Packet4f: multiplies `a` by itself rotated by 8 bytes, then
// multiplies that by itself rotated by 4 bytes, and returns the first lane.
// The declaration of `prod` is on a line missing from this extraction.
2506EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
2508 prod = pmul(a, vec_sld(a, a, 8));
2509 return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
// predux_mul for Packet4i: returns the scalar product of four ints read from
// the aligned buffer `aux`.
// NOTE(review): the statement that fills `aux` from `a` is missing from this
// extraction - presumably an aligned store of the packet; confirm.
2513EIGEN_STRONG_INLINE
int predux_mul<Packet4i>(
const Packet4i& a) {
2514 EIGEN_ALIGN16
int aux[4];
2516 return aux[0] * aux[1] * aux[2] * aux[3];
// predux_mul for Packet8s: three vec_mul steps, each multiplying the running
// packet by itself rotated by 8, 4 and then 2 bytes; the first lane of the
// last step is returned.
2520EIGEN_STRONG_INLINE
short int predux_mul<Packet8s>(
const Packet8s& a) {
2521 Packet8s pair, quad, octo;
2523 pair = vec_mul(a, vec_sld(a, a, 8));
2524 quad = vec_mul(pair, vec_sld(pair, pair, 4));
2525 octo = vec_mul(quad, vec_sld(quad, quad, 2));
2527 return pfirst(octo);
// predux_mul for Packet8us: three vec_mul steps, each multiplying the running
// packet by itself rotated by 8, 4 and then 2 bytes; the first lane of the
// last step is returned.
2531EIGEN_STRONG_INLINE
unsigned short int predux_mul<Packet8us>(
const Packet8us& a) {
2532 Packet8us pair, quad, octo;
2534 pair = vec_mul(a, vec_sld(a, a, 8));
2535 quad = vec_mul(pair, vec_sld(pair, pair, 4));
2536 octo = vec_mul(quad, vec_sld(quad, quad, 2));
2538 return pfirst(octo);
// predux_mul for Packet8bf: reduces the even and odd halves separately with
// predux_mul<Packet4f>, multiplies the two partial products in float, and
// converts the result to bfloat16 once at the end.
2542EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(
const Packet8bf& a) {
2543 float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
2544 float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
2545 float f32_result = redux_even * redux_odd;
2546 return bfloat16(f32_result);
// predux_mul for Packet16c: four vec_mul steps, each multiplying the running
// packet by itself rotated by 8, 4, 2 and then 1 byte; the first lane of the
// last step is returned.
2550EIGEN_STRONG_INLINE
signed char predux_mul<Packet16c>(
const Packet16c& a) {
2551 Packet16c pair, quad, octo, result;
2553 pair = vec_mul(a, vec_sld(a, a, 8));
2554 quad = vec_mul(pair, vec_sld(pair, pair, 4));
2555 octo = vec_mul(quad, vec_sld(quad, quad, 2));
2556 result = vec_mul(octo, vec_sld(octo, octo, 1));
2558 return pfirst(result);
// predux_mul for Packet16uc: four vec_mul steps, each multiplying the running
// packet by itself rotated by 8, 4, 2 and then 1 byte; the first lane of the
// last step is returned.
2562EIGEN_STRONG_INLINE
unsigned char predux_mul<Packet16uc>(
const Packet16uc& a) {
2563 Packet16uc pair, quad, octo, result;
2565 pair = vec_mul(a, vec_sld(a, a, 8));
2566 quad = vec_mul(pair, vec_sld(pair, pair, 4));
2567 octo = vec_mul(quad, vec_sld(quad, quad, 2));
2568 result = vec_mul(octo, vec_sld(octo, octo, 1));
2570 return pfirst(result);
// Shared minimum reduction for 4-lane packets: vec_min of `a` with itself
// rotated by 8 bytes, then vec_min of that with itself rotated by 4 bytes.
// The declarations of b/res and the return statement are on lines missing
// from this extraction.
2574template <
typename Packet>
2575EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(
const Packet& a) {
2577 b = vec_min(a, vec_sld(a, a, 8));
2578 res = vec_min(b, vec_sld(b, b, 4));
// predux_min for Packet4f: forwards to the shared 4-lane helper predux_min4.
2583EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
2584 return predux_min4<Packet4f>(a);
// predux_min for Packet4i: forwards to the shared 4-lane helper predux_min4.
2588EIGEN_STRONG_INLINE
int predux_min<Packet4i>(
const Packet4i& a) {
2589 return predux_min4<Packet4i>(a);
// predux_min for Packet8bf: takes the minimum of the even and odd halves
// separately with predux_min<Packet4f>, combines them with std::min in float,
// and converts the result to bfloat16.
2593EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(
const Packet8bf& a) {
2594 float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
2595 float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
// The parentheses around std::min keep a function-like `min` macro from expanding.
2596 float f32_result = (std::min)(redux_even, redux_odd);
2597 return bfloat16(f32_result);
// predux_min for Packet8s: three vec_min steps against the running packet
// rotated by 8, 4 and then 2 bytes; the first lane of the last step is
// returned.
2601EIGEN_STRONG_INLINE
short int predux_min<Packet8s>(
const Packet8s& a) {
2602 Packet8s pair, quad, octo;
2605 pair = vec_min(a, vec_sld(a, a, 8));
2608 quad = vec_min(pair, vec_sld(pair, pair, 4));
2611 octo = vec_min(quad, vec_sld(quad, quad, 2));
2612 return pfirst(octo);
// predux_min for Packet8us: three vec_min steps against the running packet
// rotated by 8, 4 and then 2 bytes; the first lane of the last step is
// returned.
2616EIGEN_STRONG_INLINE
unsigned short int predux_min<Packet8us>(
const Packet8us& a) {
2617 Packet8us pair, quad, octo;
2620 pair = vec_min(a, vec_sld(a, a, 8));
2623 quad = vec_min(pair, vec_sld(pair, pair, 4));
2626 octo = vec_min(quad, vec_sld(quad, quad, 2));
2627 return pfirst(octo);
// predux_min for Packet16c: four vec_min steps against the running packet
// rotated by 8, 4, 2 and then 1 byte; the first lane of the last step is
// returned.
2631EIGEN_STRONG_INLINE
signed char predux_min<Packet16c>(
const Packet16c& a) {
2632 Packet16c pair, quad, octo, result;
2634 pair = vec_min(a, vec_sld(a, a, 8));
2635 quad = vec_min(pair, vec_sld(pair, pair, 4));
2636 octo = vec_min(quad, vec_sld(quad, quad, 2));
2637 result = vec_min(octo, vec_sld(octo, octo, 1));
2639 return pfirst(result);
// predux_min for Packet16uc: four vec_min steps against the running packet
// rotated by 8, 4, 2 and then 1 byte; the first lane of the last step is
// returned.
2643EIGEN_STRONG_INLINE
unsigned char predux_min<Packet16uc>(
const Packet16uc& a) {
2644 Packet16uc pair, quad, octo, result;
2646 pair = vec_min(a, vec_sld(a, a, 8));
2647 quad = vec_min(pair, vec_sld(pair, pair, 4));
2648 octo = vec_min(quad, vec_sld(quad, quad, 2));
2649 result = vec_min(octo, vec_sld(octo, octo, 1));
2651 return pfirst(result);
// Shared maximum reduction for 4-lane packets: vec_max of `a` with itself
// rotated by 8 bytes, then vec_max of that with itself rotated by 4 bytes.
// The declarations of b/res and the return statement are on lines missing
// from this extraction.
2654template <
typename Packet>
2655EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(
const Packet& a) {
2657 b = vec_max(a, vec_sld(a, a, 8));
2658 res = vec_max(b, vec_sld(b, b, 4));
// predux_max for Packet4f: forwards to the shared 4-lane helper predux_max4.
2663EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
2664 return predux_max4<Packet4f>(a);
// predux_max for Packet4i: forwards to the shared 4-lane helper predux_max4.
2668EIGEN_STRONG_INLINE
int predux_max<Packet4i>(
const Packet4i& a) {
2669 return predux_max4<Packet4i>(a);
// predux_max for Packet8bf: takes the maximum of the even and odd halves
// separately with predux_max<Packet4f>, combines them with std::max in float,
// and converts the result to bfloat16.
2673EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(
const Packet8bf& a) {
2674 float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
2675 float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
// The parentheses around std::max keep a function-like `max` macro from expanding.
2676 float f32_result = (std::max)(redux_even, redux_odd);
2677 return bfloat16(f32_result);
// predux_max for Packet8s: three vec_max steps against the running packet
// rotated by 8, 4 and then 2 bytes; the first lane of the last step is
// returned.
2681EIGEN_STRONG_INLINE
short int predux_max<Packet8s>(
const Packet8s& a) {
2682 Packet8s pair, quad, octo;
2685 pair = vec_max(a, vec_sld(a, a, 8));
2688 quad = vec_max(pair, vec_sld(pair, pair, 4));
2691 octo = vec_max(quad, vec_sld(quad, quad, 2));
2692 return pfirst(octo);
// predux_max for Packet8us: three vec_max steps against the running packet
// rotated by 8, 4 and then 2 bytes; the first lane of the last step is
// returned.
2696EIGEN_STRONG_INLINE
unsigned short int predux_max<Packet8us>(
const Packet8us& a) {
2697 Packet8us pair, quad, octo;
2700 pair = vec_max(a, vec_sld(a, a, 8));
2703 quad = vec_max(pair, vec_sld(pair, pair, 4));
2706 octo = vec_max(quad, vec_sld(quad, quad, 2));
2707 return pfirst(octo);
// predux_max for Packet16c: four vec_max steps against the running packet
// rotated by 8, 4, 2 and then 1 byte; the first lane of the last step is
// returned.
2711EIGEN_STRONG_INLINE
signed char predux_max<Packet16c>(
const Packet16c& a) {
2712 Packet16c pair, quad, octo, result;
2714 pair = vec_max(a, vec_sld(a, a, 8));
2715 quad = vec_max(pair, vec_sld(pair, pair, 4));
2716 octo = vec_max(quad, vec_sld(quad, quad, 2));
2717 result = vec_max(octo, vec_sld(octo, octo, 1));
2719 return pfirst(result);
// predux_max for Packet16uc: four vec_max steps against the running packet
// rotated by 8, 4, 2 and then 1 byte; the first lane of the last step is
// returned.
2723EIGEN_STRONG_INLINE
unsigned char predux_max<Packet16uc>(
const Packet16uc& a) {
2724 Packet16uc pair, quad, octo, result;
2726 pair = vec_max(a, vec_sld(a, a, 8));
2727 quad = vec_max(pair, vec_sld(pair, pair, 4));
2728 octo = vec_max(quad, vec_sld(quad, quad, 2));
2729 result = vec_max(octo, vec_sld(octo, octo, 1));
2731 return pfirst(result);
// predux_any for Packet4f: true when vec_any_ne reports at least one lane of
// `x` that differs from the zero packet produced by pzero(x).
2735EIGEN_STRONG_INLINE
bool predux_any(
const Packet4f& x) {
2736 return vec_any_ne(x, pzero(x));
// Shared in-place transpose of a 4-packet block, done as two rounds of
// vec_mergeh/vec_mergel: rows 0/2 and 1/3 are interleaved first, then the
// intermediates are interleaved again and written back to kernel.packet[0..3].
// The name is spelled "ptranpose" (sic); the ptranspose overloads for Packet4f
// and Packet4i call it under this spelling.
// The declarations of t0..t3 are on a line missing from this extraction.
2739template <
typename T>
2740EIGEN_DEVICE_FUNC
inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
2742 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2743 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2744 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2745 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2746 kernel.packet[0] = vec_mergeh(t0, t2);
2747 kernel.packet[1] = vec_mergel(t0, t2);
2748 kernel.packet[2] = vec_mergeh(t1, t3);
2749 kernel.packet[3] = vec_mergel(t1, t3);
// ptranspose for a block of four Packet4f: forwards to ptranpose_common.
2752EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
// ptranspose for a block of four Packet4i: forwards to ptranpose_common.
2754EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
// ptranspose for a block of four Packet8s: in-place two-round
// vec_mergeh/vec_mergel network (rows 0/2 and 1/3 first, then the
// intermediates), with results written back to kernel.packet[0..3].
2756EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2757 Packet8s t0, t1, t2, t3;
2758 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2759 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2760 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2761 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2762 kernel.packet[0] = vec_mergeh(t0, t2);
2763 kernel.packet[1] = vec_mergel(t0, t2);
2764 kernel.packet[2] = vec_mergeh(t1, t3);
2765 kernel.packet[3] = vec_mergel(t1, t3);
// ptranspose for a block of four Packet8us: in-place two-round
// vec_mergeh/vec_mergel network (rows 0/2 and 1/3 first, then the
// intermediates), with results written back to kernel.packet[0..3].
2768EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2769 Packet8us t0, t1, t2, t3;
2770 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2771 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2772 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2773 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2774 kernel.packet[0] = vec_mergeh(t0, t2);
2775 kernel.packet[1] = vec_mergel(t0, t2);
2776 kernel.packet[2] = vec_mergeh(t1, t3);
2777 kernel.packet[3] = vec_mergel(t1, t3);
// ptranspose for a block of four Packet8bf: same two-round merge network as
// the Packet8us overload, run on the raw vectors held in each wrapper's m_val.
// The Packet8us results are assigned straight back into the Packet8bf slots.
2780EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
2781 Packet8us t0, t1, t2, t3;
2783 t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
2784 t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
2785 t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
2786 t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
2787 kernel.packet[0] = vec_mergeh(t0, t2);
2788 kernel.packet[1] = vec_mergel(t0, t2);
2789 kernel.packet[2] = vec_mergeh(t1, t3);
2790 kernel.packet[3] = vec_mergel(t1, t3);
// ptranspose for a block of four Packet16c: in-place two-round
// vec_mergeh/vec_mergel network (rows 0/2 and 1/3 first, then the
// intermediates), with results written back to kernel.packet[0..3].
2793EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2794 Packet16c t0, t1, t2, t3;
2795 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2796 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2797 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2798 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2799 kernel.packet[0] = vec_mergeh(t0, t2);
2800 kernel.packet[1] = vec_mergel(t0, t2);
2801 kernel.packet[2] = vec_mergeh(t1, t3);
2802 kernel.packet[3] = vec_mergel(t1, t3);
// ptranspose for a block of four Packet16uc: in-place two-round
// vec_mergeh/vec_mergel network (rows 0/2 and 1/3 first, then the
// intermediates), with results written back to kernel.packet[0..3].
2805EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2806 Packet16uc t0, t1, t2, t3;
2807 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2808 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2809 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2810 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2811 kernel.packet[0] = vec_mergeh(t0, t2);
2812 kernel.packet[1] = vec_mergel(t0, t2);
2813 kernel.packet[2] = vec_mergeh(t1, t3);
2814 kernel.packet[3] = vec_mergel(t1, t3);
// ptranspose for a block of eight Packet8s: in-place three-round
// vec_mergeh/vec_mergel network. Round 1 pairs row i with row i+4 into v[],
// round 2 pairs v[i] with v[i+4] into sum[], round 3 pairs sum[i] with
// sum[i+4] and writes kernel.packet[0..7].
// Despite its name, `sum` only holds merge intermediates; nothing is added.
2817EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2818 Packet8s v[8], sum[8];
2820 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2821 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2822 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2823 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2824 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2825 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2826 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2827 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2828 sum[0] = vec_mergeh(v[0], v[4]);
2829 sum[1] = vec_mergel(v[0], v[4]);
2830 sum[2] = vec_mergeh(v[1], v[5]);
2831 sum[3] = vec_mergel(v[1], v[5]);
2832 sum[4] = vec_mergeh(v[2], v[6]);
2833 sum[5] = vec_mergel(v[2], v[6]);
2834 sum[6] = vec_mergeh(v[3], v[7]);
2835 sum[7] = vec_mergel(v[3], v[7]);
2837 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2838 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2839 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2840 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2841 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2842 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2843 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2844 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
// ptranspose for a block of eight Packet8us: in-place three-round
// vec_mergeh/vec_mergel network. Round 1 pairs row i with row i+4 into v[],
// round 2 pairs v[i] with v[i+4] into sum[], round 3 pairs sum[i] with
// sum[i+4] and writes kernel.packet[0..7].
// Despite its name, `sum` only holds merge intermediates; nothing is added.
2847EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2848 Packet8us v[8], sum[8];
2850 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2851 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2852 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2853 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2854 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2855 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2856 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2857 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2858 sum[0] = vec_mergeh(v[0], v[4]);
2859 sum[1] = vec_mergel(v[0], v[4]);
2860 sum[2] = vec_mergeh(v[1], v[5]);
2861 sum[3] = vec_mergel(v[1], v[5]);
2862 sum[4] = vec_mergeh(v[2], v[6]);
2863 sum[5] = vec_mergel(v[2], v[6]);
2864 sum[6] = vec_mergeh(v[3], v[7]);
2865 sum[7] = vec_mergel(v[3], v[7]);
2867 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2868 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2869 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2870 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2871 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2872 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2873 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2874 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
// ptranspose for a block of eight Packet8bf: same three-round merge network
// as the Packet8us overload, run on the raw vectors held in each wrapper's
// m_val. The intermediates v[] and sum[] are Packet8bf, so every stage reads
// through .m_val and assigns the merged vector back into a wrapper.
// Despite its name, `sum` only holds merge intermediates; nothing is added.
2877EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
2878 Packet8bf v[8], sum[8];
2880 v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
2881 v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
2882 v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
2883 v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
2884 v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
2885 v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
2886 v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
2887 v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
2888 sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
2889 sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
2890 sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
2891 sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
2892 sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
2893 sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
2894 sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
2895 sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
2897 kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
2898 kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
2899 kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
2900 kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
2901 kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
2902 kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
2903 kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
2904 kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
// ptranspose for a block of sixteen Packet16c: in-place four-round
// vec_mergeh/vec_mergel network. Each round pairs entry i with entry i+8 of
// the previous stage (kernel.packet -> step1 -> step2 -> step3), and the last
// round writes kernel.packet[0..15].
2907EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2908 Packet16c step1[16], step2[16], step3[16];
// Round 1: input rows i and i+8.
2910 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2911 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2912 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2913 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2914 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2915 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2916 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2917 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2918 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2919 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2920 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2921 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2922 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2923 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2924 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2925 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
// Round 2: step1 entries i and i+8.
2927 step2[0] = vec_mergeh(step1[0], step1[8]);
2928 step2[1] = vec_mergel(step1[0], step1[8]);
2929 step2[2] = vec_mergeh(step1[1], step1[9]);
2930 step2[3] = vec_mergel(step1[1], step1[9]);
2931 step2[4] = vec_mergeh(step1[2], step1[10]);
2932 step2[5] = vec_mergel(step1[2], step1[10]);
2933 step2[6] = vec_mergeh(step1[3], step1[11]);
2934 step2[7] = vec_mergel(step1[3], step1[11]);
2935 step2[8] = vec_mergeh(step1[4], step1[12]);
2936 step2[9] = vec_mergel(step1[4], step1[12]);
2937 step2[10] = vec_mergeh(step1[5], step1[13]);
2938 step2[11] = vec_mergel(step1[5], step1[13]);
2939 step2[12] = vec_mergeh(step1[6], step1[14]);
2940 step2[13] = vec_mergel(step1[6], step1[14]);
2941 step2[14] = vec_mergeh(step1[7], step1[15]);
2942 step2[15] = vec_mergel(step1[7], step1[15]);
// Round 3: step2 entries i and i+8.
2944 step3[0] = vec_mergeh(step2[0], step2[8]);
2945 step3[1] = vec_mergel(step2[0], step2[8]);
2946 step3[2] = vec_mergeh(step2[1], step2[9]);
2947 step3[3] = vec_mergel(step2[1], step2[9]);
2948 step3[4] = vec_mergeh(step2[2], step2[10]);
2949 step3[5] = vec_mergel(step2[2], step2[10]);
2950 step3[6] = vec_mergeh(step2[3], step2[11]);
2951 step3[7] = vec_mergel(step2[3], step2[11]);
2952 step3[8] = vec_mergeh(step2[4], step2[12]);
2953 step3[9] = vec_mergel(step2[4], step2[12]);
2954 step3[10] = vec_mergeh(step2[5], step2[13]);
2955 step3[11] = vec_mergel(step2[5], step2[13]);
2956 step3[12] = vec_mergeh(step2[6], step2[14]);
2957 step3[13] = vec_mergel(step2[6], step2[14]);
2958 step3[14] = vec_mergeh(step2[7], step2[15]);
2959 step3[15] = vec_mergel(step2[7], step2[15]);
// Round 4: step3 entries i and i+8, written back into the block.
2961 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2962 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2963 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2964 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2965 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2966 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2967 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2968 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2969 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2970 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2971 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2972 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2973 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2974 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2975 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2976 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
// ptranspose for a block of sixteen Packet16uc: in-place four-round
// vec_mergeh/vec_mergel network. Each round pairs entry i with entry i+8 of
// the previous stage (kernel.packet -> step1 -> step2 -> step3), and the last
// round writes kernel.packet[0..15].
2979EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2980 Packet16uc step1[16], step2[16], step3[16];
// Round 1: input rows i and i+8.
2982 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2983 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2984 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2985 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2986 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2987 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2988 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2989 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2990 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2991 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2992 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2993 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2994 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2995 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2996 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2997 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
// Round 2: step1 entries i and i+8.
2999 step2[0] = vec_mergeh(step1[0], step1[8]);
3000 step2[1] = vec_mergel(step1[0], step1[8]);
3001 step2[2] = vec_mergeh(step1[1], step1[9]);
3002 step2[3] = vec_mergel(step1[1], step1[9]);
3003 step2[4] = vec_mergeh(step1[2], step1[10]);
3004 step2[5] = vec_mergel(step1[2], step1[10]);
3005 step2[6] = vec_mergeh(step1[3], step1[11]);
3006 step2[7] = vec_mergel(step1[3], step1[11]);
3007 step2[8] = vec_mergeh(step1[4], step1[12]);
3008 step2[9] = vec_mergel(step1[4], step1[12]);
3009 step2[10] = vec_mergeh(step1[5], step1[13]);
3010 step2[11] = vec_mergel(step1[5], step1[13]);
3011 step2[12] = vec_mergeh(step1[6], step1[14]);
3012 step2[13] = vec_mergel(step1[6], step1[14]);
3013 step2[14] = vec_mergeh(step1[7], step1[15]);
3014 step2[15] = vec_mergel(step1[7], step1[15]);
// Round 3: step2 entries i and i+8.
3016 step3[0] = vec_mergeh(step2[0], step2[8]);
3017 step3[1] = vec_mergel(step2[0], step2[8]);
3018 step3[2] = vec_mergeh(step2[1], step2[9]);
3019 step3[3] = vec_mergel(step2[1], step2[9]);
3020 step3[4] = vec_mergeh(step2[2], step2[10]);
3021 step3[5] = vec_mergel(step2[2], step2[10]);
3022 step3[6] = vec_mergeh(step2[3], step2[11]);
3023 step3[7] = vec_mergel(step2[3], step2[11]);
3024 step3[8] = vec_mergeh(step2[4], step2[12]);
3025 step3[9] = vec_mergel(step2[4], step2[12]);
3026 step3[10] = vec_mergeh(step2[5], step2[13]);
3027 step3[11] = vec_mergel(step2[5], step2[13]);
3028 step3[12] = vec_mergeh(step2[6], step2[14]);
3029 step3[13] = vec_mergel(step2[6], step2[14]);
3030 step3[14] = vec_mergeh(step2[7], step2[15]);
3031 step3[15] = vec_mergel(step2[7], step2[15]);
// Round 4: step3 entries i and i+8, written back into the block.
3033 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
3034 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
3035 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
3036 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
3037 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
3038 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
3039 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
3040 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
3041 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
3042 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
3043 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
3044 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
3045 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
3046 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
3047 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
3048 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
3052#ifdef EIGEN_VECTORIZE_VSX
// 64-bit-lane packet types: two doubles, two unsigned long long, two long long.
3053typedef __vector
double Packet2d;
3054typedef __vector
unsigned long long Packet2ul;
3055typedef __vector
long long Packet2l;
// Packet2bl is defined twice below (as Packet2ul and as __vector __bool long).
// NOTE(review): presumably the two are alternatives of a preprocessor
// conditional whose directives are missing from this extraction - confirm.
3057typedef Packet2ul Packet2bl;
3059typedef __vector __bool
long Packet2bl;
// File-scope constants for the 64-bit-lane packets.
// p2l_ZERO and p2d_ZERO reuse the bits of p4i_ZERO / p4f_ZERO.
3062static Packet2l p2l_ZERO =
reinterpret_cast<Packet2l
>(p4i_ZERO);
// Sign-bit mask for each 64-bit lane.
3063static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
// Bit pattern of the largest double strictly below 0.5 (0.5 is 0x3FE0000000000000).
3064static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
3065static Packet2d p2d_ONE = {1.0, 1.0};
3066static Packet2d p2d_ZERO =
reinterpret_cast<Packet2d
>(p4f_ZERO);
// Negative zero in both lanes (only the sign bit set).
3067static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
3068 numext::bit_cast<double>(0x8000000000000000ull)};
// p2d_COUNTDOWN holds 0.0 in one lane and 1.0 in the other, built by an
// 8-byte vec_sld across p2d_ZERO and p2d_ONE. It is defined twice with the
// operands swapped.
// NOTE(review): presumably the two definitions are the branches of an
// endianness conditional whose directives are missing here - confirm.
3071static Packet2d p2d_COUNTDOWN =
3072 reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4f
>(p2d_ZERO),
reinterpret_cast<Packet4f
>(p2d_ONE), 8));
3074static Packet2d p2d_COUNTDOWN =
3075 reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4f
>(p2d_ONE),
reinterpret_cast<Packet4f
>(p2d_ZERO), 8));
// Broadcasts lane `index` of `a` to both lanes via vec_splat.
// NOTE(review): `index` is not declared in the visible lines; presumably a
// template parameter declared on a line missing from this extraction - confirm.
3079Packet2d vec_splat_dbl(Packet2d& a) {
3080 return vec_splat(a, index);
// packet_traits for double: both the full and the half packet type are
// Packet2d. Only part of the trait list is visible in this extraction; the
// remaining enumerators and the closing lines are missing.
3084struct packet_traits<double> : default_packet_traits {
3085 typedef Packet2d type;
3086 typedef Packet2d half;
3089 AlignedOnScalar = 1,
// These five math traits follow the EIGEN_FAST_MATH setting.
3099 HasSin = EIGEN_FAST_MATH,
3100 HasCos = EIGEN_FAST_MATH,
3101 HasTanh = EIGEN_FAST_MATH,
3102 HasErf = EIGEN_FAST_MATH,
3103 HasErfc = EIGEN_FAST_MATH,
// NOTE(review): the traits guarded by this non-Clang conditional are not visible here.
3113#if !EIGEN_COMP_CLANG
// unpacket_traits for Packet2d: scalar type double, integer companion packet
// Packet2l, half packet Packet2d; vectorizable, with no masked load/store.
// Other enumerators of the trait enum are on lines missing from this extraction.
3123struct unpacket_traits<Packet2d> {
3124 typedef double type;
3125 typedef Packet2l integer_packet;
3129 vectorizable =
true,
3130 masked_load_available =
false,
3131 masked_store_available =
false
3133 typedef Packet2d half;
// unpacket_traits for Packet2l: scalar type int64_t, half packet Packet2l.
// Unlike Packet2d, this packet is marked as not vectorizable; masked
// load/store are unavailable.
// Other enumerators of the trait enum are on lines missing from this extraction.
3136struct unpacket_traits<Packet2l> {
3137 typedef int64_t type;
3138 typedef Packet2l half;
3142 vectorizable =
false,
3143 masked_load_available =
false,
3144 masked_store_available =
false
// Stream output for Packet2l: writes the two lanes separated by ", ".
// NOTE(review): `vt` is declared on lines missing from this extraction;
// presumably a union overlaying the packet with a two-element array - confirm.
3148inline std::ostream& operator<<(std::ostream& s,
const Packet2l& v) {
3154 s << vt.n[0] <<
", " << vt.n[1];
// Stream output for Packet2d: writes the two lanes separated by ", ".
// NOTE(review): `vt` is declared on lines missing from this extraction;
// presumably a union overlaying the packet with a two-element array - confirm.
3158inline std::ostream& operator<<(std::ostream& s,
const Packet2d& v) {
3164 s << vt.n[0] <<
", " << vt.n[1];
// pload for Packet2d: loads two doubles from `from` with vec_xl at offset 0.
// EIGEN_DEBUG_ALIGNED_LOAD marks this as the aligned-load entry point.
// const is cast away only to match the pointer type passed to vec_xl.
3170EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
3171 EIGEN_DEBUG_ALIGNED_LOAD
3172 return vec_xl(0,
const_cast<double*
>(from));
// pload_partial for Packet2d: forwards from, n and offset unchanged to the
// shared helper pload_partial_common (defined outside this view).
3176EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(
const double* from,
const Index n,
const Index offset) {
3177 return pload_partial_common<Packet2d>(from, n, offset);
// pstore for double: stores both lanes of `from` to `to` with vec_xst at
// offset 0. EIGEN_DEBUG_ALIGNED_STORE marks this as the aligned-store entry
// point.
3181EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
3182 EIGEN_DEBUG_ALIGNED_STORE
3183 vec_xst(from, 0, to);
// pstore_partial for double: forwards to, from, n and offset unchanged to the
// shared helper pstore_partial_common (defined outside this view).
3187EIGEN_ALWAYS_INLINE
void pstore_partial<double>(
double* to,
const Packet2d& from,
const Index n,
const Index offset) {
3188 pstore_partial_common<Packet2d>(to, from, n, offset);
// pset1 for Packet2d: builds a packet with `from` in both lanes.
// The return statement is on a line missing from this extraction.
3192EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
3193 Packet2d v = {from, from};
// pset1 for Packet2l: builds a packet with `from` in both lanes.
// The return statement is on a line missing from this extraction.
3197EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(
const int64_t& from) {
3198 Packet2l v = {from, from};
// pset1frombits for Packet2d: puts the integer value `from` in both 64-bit
// lanes of a Packet2l, then reinterprets those bits as a Packet2d.
3203EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(
unsigned long from) {
3204 Packet2l v = {
static_cast<long long>(from),
static_cast<long long>(from)};
3205 return reinterpret_cast<Packet2d
>(v);
// pbroadcast4 for Packet2d: reads a[0..3] and writes each value, replicated
// across both lanes with pset1, to the outputs a0..a3.
// The end of the parameter list (declaring a3) is on a line missing from this
// extraction.
3209EIGEN_STRONG_INLINE
void pbroadcast4<Packet2d>(
const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
3212 a0 = pset1<Packet2d>(a[0]);
3213 a1 = pset1<Packet2d>(a[1]);
3214 a2 = pset1<Packet2d>(a[2]);
3215 a3 = pset1<Packet2d>(a[3]);
// pgather for Packet2d: forwards from and stride to the shared helper
// pgather_common (defined outside this view).
3219EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
3220 return pgather_common<Packet2d>(from, stride);
// pgather_partial for Packet2d: forwards to pgather_common with the extra
// element count `n`.
// The declaration of `n` in the parameter list is on a line missing from this
// extraction.
3223EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(
const double* from,
Index stride,
3225 return pgather_common<Packet2d>(from, stride, n);
// pscatter for Packet2d: forwards to, from and stride to the shared helper
// pscatter_common (defined outside this view).
3228EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride) {
3229 pscatter_common<Packet2d>(to, from, stride);
// pscatter_partial for Packet2d: forwards to pscatter_common with the extra
// element count `n`.
// The declarations of `stride` and `n` in the parameter list are on a line
// missing from this extraction.
3232EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void pscatter_partial<double, Packet2d>(
double* to,
const Packet2d& from,
3234 pscatter_common<Packet2d>(to, from, stride, n);
// plset for Packet2d: broadcasts `a` to both lanes and adds the file-scope
// constant p2d_COUNTDOWN (0.0 in one lane, 1.0 in the other).
3238EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
3239 return pset1<Packet2d>(a) + p2d_COUNTDOWN;
// padd for Packet2d. Only the signature is visible; the body is on lines
// missing from this extraction.
3243EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
// psub for Packet2d. Only the signature is visible; the body is on lines
// missing from this extraction.
3248EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
// pnegate for Packet2d. The visible return flips the sign bit of each lane by
// XOR-ing with p2d_MZERO (negative zero).
// NOTE(review): the statements and #else/#endif of the __POWER8_VECTOR__
// conditional are missing from this extraction, so it is not visible which
// branch the XOR belongs to - confirm.
3253EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
3254#ifdef __POWER8_VECTOR__
3257 return vec_xor(a, p2d_MZERO);
// pconj for Packet2d. Only the signature is visible; the body is on lines
// missing from this extraction.
3262EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
// pmul for Packet2d: computes a * b as a multiply-add with p2d_MZERO as the
// addend. Adding negative zero leaves the product unchanged, including the
// sign of a zero product.
3267EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
3268 return vec_madd(a, b, p2d_MZERO);
// pdiv for Packet2d: lane-wise a / b via vec_div.
3271EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
3272 return vec_div(a, b);
// pmadd for Packet2d: a * b + c via vec_madd.
3277EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
3278 return vec_madd(a, b, c);
// pmsub for Packet2d: a * b - c via vec_msub.
3281EIGEN_STRONG_INLINE Packet2d pmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
3282 return vec_msub(a, b, c);
// pnmadd for Packet2d, implemented with vec_nmsub.
// The name swap is intentional: vec_nmsub yields -(a * b - c), i.e. c - a * b.
3285EIGEN_STRONG_INLINE Packet2d pnmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
3286 return vec_nmsub(a, b, c);
// pnmsub for Packet2d, implemented with vec_nmadd.
// The name swap is intentional: vec_nmadd yields -(a * b + c).
3289EIGEN_STRONG_INLINE Packet2d pnmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
3290 return vec_nmadd(a, b, c);
// pmin for Packet2d via inline VSX assembly: xvcmpgedp builds a mask of lanes
// where a >= b, then xxsel picks b in those lanes and a elsewhere. Lanes where
// the comparison is false (including unordered operands) therefore keep a.
// The output uses the early-clobber constraint "=&wa" because it is written
// by the first instruction while both inputs are still needed by the second.
// The declaration of `ret` and the return are on lines missing from this
// extraction.
3294EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
3297 __asm__(
"xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" :
"=&wa"(ret) :
"wa"(a),
"wa"(b));
// pmax for Packet2d via inline VSX assembly: xvcmpgtdp builds a mask of lanes
// where b > a (operands are passed swapped), then xxsel picks b in those lanes
// and a elsewhere. Lanes where the comparison is false (including unordered
// operands) therefore keep a.
// The output uses the early-clobber constraint "=&wa" because it is written
// by the first instruction while both inputs are still needed by the second.
// The declaration of `ret` and the return are on lines missing from this
// extraction.
3302EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
3305 __asm__(
"xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" :
"=&wa"(ret) :
"wa"(a),
"wa"(b));
// pcmp_le for Packet2d: lane-wise a <= b; the vec_cmple mask is reinterpreted
// as a Packet2d.
3310EIGEN_STRONG_INLINE Packet2d pcmp_le(
const Packet2d& a,
const Packet2d& b) {
3311 return reinterpret_cast<Packet2d
>(vec_cmple(a, b));
// pcmp_lt for Packet2d: returns the result of vec_cmplt(a, b) reinterpreted
// as a Packet2d.
3314EIGEN_STRONG_INLINE Packet2d pcmp_lt(
const Packet2d& a,
const Packet2d& b) {
3315 return reinterpret_cast<Packet2d
>(vec_cmplt(a, b));
// pcmp_eq for Packet2d: returns the result of vec_cmpeq(a, b) reinterpreted
// as a Packet2d.
3318EIGEN_STRONG_INLINE Packet2d pcmp_eq(
const Packet2d& a,
const Packet2d& b) {
3319 return reinterpret_cast<Packet2d
>(vec_cmpeq(a, b));
// pcmp_eq for Packet2l when __POWER8_VECTOR__ is defined: compares the
// operands directly with vec_cmpeq and reinterprets the result as Packet2l.
3322#ifdef __POWER8_VECTOR__
3323EIGEN_STRONG_INLINE Packet2l pcmp_eq(
const Packet2l& a,
const Packet2l& b) {
3324 return reinterpret_cast<Packet2l
>(vec_cmpeq(a, b));
// Second pcmp_eq definition for Packet2l, built from 32-bit operations.
// NOTE(review): presumably this is the fallback used when
// __POWER8_VECTOR__ is not defined; the #else/#endif lines are not visible.
//
// Steps:
//   1. `halves`  : compare a and b as Packet4i (four 32-bit lanes).
//   2. `flipped` : permute `halves` with p16uc_COMPLEX32_REV.
//   3. result    : AND of `halves` and `flipped`, reinterpreted as Packet2l.
// NOTE(review): this is correct if p16uc_COMPLEX32_REV swaps the two 32-bit
// words inside each 64-bit lane, so that a 64-bit lane is set only when both
// of its halves compared equal -- confirm where the constant is defined.
3327EIGEN_STRONG_INLINE Packet2l pcmp_eq(
const Packet2l& a,
const Packet2l& b) {
3328 Packet4i halves =
reinterpret_cast<Packet4i
>(vec_cmpeq(
reinterpret_cast<Packet4i
>(a),
reinterpret_cast<Packet4i
>(b)));
3329 Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
3330 return reinterpret_cast<Packet2l
>(pand(halves, flipped));
// pcmp_lt_or_nan for Packet2d.
// Computes c = (a >= b) with vec_cmpge and returns vec_nor(c, c), i.e. the
// bitwise complement of that comparison result.
3334EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(
const Packet2d& a,
const Packet2d& b) {
3335 Packet2d c =
reinterpret_cast<Packet2d
>(vec_cmpge(a, b));
3336 return vec_nor(c, c);
// pand specialization for Packet2d: returns vec_and(a, b).
3340EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
3341 return vec_and(a, b);
// por specialization for Packet2d: returns vec_or(a, b).
3345EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
3346 return vec_or(a, b);
// pxor specialization for Packet2d: returns vec_xor(a, b).
3350EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
3351 return vec_xor(a, b);
// pandnot specialization for Packet2d.
// Returns a AND (NOT b); the complement of b is formed as vec_nor(b, b).
3355EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
3356 return vec_and(a, vec_nor(b, b));
// pround specialization for Packet2d.
// Step 1: build an offset from a's bit pattern: (bits(a) AND p2ul_SIGN) OR
//         p2ul_PREV0DOT5, reinterpret it as Packet2d and add it to a -> t.
// Step 2: apply the VSX instruction xvrdpiz to t, writing `res`.
// NOTE(review): presumably p2ul_SIGN isolates the sign bit and
// p2ul_PREV0DOT5 is the largest double below 0.5, so that t = a +/- ~0.5 and
// xvrdpiz truncates toward zero -- confirm where the constants are defined.
// The declaration of `res` and the return statement are not visible in this
// listing.
3360EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(
const Packet2d& a) {
3361 Packet2d t = vec_add(
3362 reinterpret_cast<Packet2d
>(vec_or(vec_and(
reinterpret_cast<Packet2ul
>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
3365 __asm__(
"xvrdpiz %x0, %x1\n\t" :
"=&wa"(res) :
"wa"(t));
// pceil specialization for Packet2d, taking a single operand `a`.
// NOTE(review): the function body is not visible in this listing.
3370EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a) {
// pfloor specialization for Packet2d: returns vec_floor(a).
3374EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a) {
3375 return vec_floor(a);
// ptrunc specialization for Packet2d: returns vec_trunc(a).
3378EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(
const Packet2d& a) {
3379 return vec_trunc(a);
// print specialization for Packet2d (Eigen packet op, not an output routine).
// Applies the VSX instruction xvrdpic to `a`, writing `res`.
// NOTE(review): presumably xvrdpic rounds to integral using the current
// rounding mode -- confirm against the Power ISA. The declaration of `res`
// and the return statement are not visible in this listing.
3382EIGEN_STRONG_INLINE Packet2d print<Packet2d>(
const Packet2d& a) {
3385 __asm__(
"xvrdpic %x0, %x1\n\t" :
"=&wa"(res) :
"wa"(a));
// ploadu specialization for Packet2d.
// Invokes the EIGEN_DEBUG_UNALIGNED_LOAD hook, then loads with vec_xl at byte
// offset 0 from `from`. The const_cast drops the const qualifier because the
// pointer is passed to vec_xl as double*.
3391EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
3392 EIGEN_DEBUG_UNALIGNED_LOAD
3393 return vec_xl(0,
const_cast<double*
>(from));
// ploadu_partial specialization for Packet2d.
// Forwards `from`, `n` and `offset` unchanged to
// ploadu_partial_common<Packet2d>.
3397EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(
const double* from,
const Index n,
const Index offset) {
3398 return ploadu_partial_common<Packet2d>(from, n, offset);
// ploaddup specialization for Packet2d.
// Loads a packet from `from` -- with pload when the address is a multiple of
// 16, with ploadu in the other visible assignment -- and returns
// vec_splat_dbl<0>(p), i.e. element 0 replicated.
// NOTE(review): the declaration of `p` and the `else` keyword are not visible
// in this listing.
3402EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
3404 if ((std::ptrdiff_t(from) % 16) == 0)
3405 p = pload<Packet2d>(from);
3407 p = ploadu<Packet2d>(from);
3408 return vec_splat_dbl<0>(p);
// pstoreu specialization for double.
// Invokes the EIGEN_DEBUG_UNALIGNED_STORE hook, then stores `from` with
// vec_xst at byte offset 0 from `to`.
3412EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
3413 EIGEN_DEBUG_UNALIGNED_STORE
3414 vec_xst(from, 0, to);
// pstoreu_partial specialization for double.
// Forwards `to`, `from`, `n` and `offset` unchanged to
// pstoreu_partial_common<Packet2d>.
3418EIGEN_ALWAYS_INLINE
void pstoreu_partial<double>(
double* to,
const Packet2d& from,
const Index n,
const Index offset) {
3419 pstoreu_partial_common<Packet2d>(to, from, n, offset);
// prefetch specialization for double: expands EIGEN_PPC_PREFETCH on `addr`.
3423EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
3424 EIGEN_PPC_PREFETCH(addr);
// pfirst specialization for Packet2d.
// Stores the packet with pstore into a local two-element double array
// declared with EIGEN_ALIGN16.
// NOTE(review): the return statement is not visible in this listing;
// presumably it returns x[0].
3428EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
3429 EIGEN_ALIGN16
double x[2];
3430 pstore<double>(x, a);
// preverse for Packet2d: returns vec_sld(a, a, 8).
// Shifting `a` concatenated with itself by 8 bytes exchanges the two 8-byte
// halves of the 16-byte vector.
3435EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
3436 return vec_sld(a, a, 8);
// pabs for Packet2d, taking a single operand `a`.
// NOTE(review): the function body is not visible in this listing.
3439EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
// psignbit for Packet2d when __POWER8_VECTOR__ is defined.
// Views `a` as Packet2l and applies vec_sra with a shift count of 63 in each
// lane, then casts the result back to Packet2d.
// NOTE(review): vec_sra is presumably an arithmetic (sign-propagating) shift,
// which would fill each 64-bit lane with copies of its sign bit -- confirm.
3442#ifdef __POWER8_VECTOR__
3444EIGEN_STRONG_INLINE Packet2d psignbit(
const Packet2d& a) {
3445 return (Packet2d)vec_sra((Packet2l)a, vec_splats((
unsigned long long)(63)));
// Byte-permute table used by the psignbit definition that calls vec_perm:
// every byte of the first 8-byte half selects one source byte, and every byte
// of the second half selects another (0 and 8 in the first variant, 7 and 15
// in the second).
// NOTE(review): the two definitions are alternatives; the preprocessor lines
// that choose between them are not visible in this listing (presumably an
// endianness check) -- confirm.
3449static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3451static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
// Second psignbit definition for Packet2d, built from byte operations.
// NOTE(review): presumably the fallback used when __POWER8_VECTOR__ is not
// defined; the #else/#endif lines are not visible in this listing.
//
// Step 1: view `a` as 16 signed bytes and vec_sra each byte by 7 -> tmp.
// Step 2: permute tmp with p16uc_DUPSIGN, which replicates one byte of tmp
//         across each 8-byte half, and reinterpret the result as Packet2d.
3455EIGEN_STRONG_INLINE Packet2d psignbit(
const Packet2d& a) {
3456 Packet16c tmp = vec_sra(
reinterpret_cast<Packet16c
>(a), vec_splats((
unsigned char)(7)));
3457 return reinterpret_cast<Packet2d
>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
// Forward declarations of the Packet2d <-> Packet2l pcast specializations.
// pldexp and pfrexp_generic_get_biased_exponent later in this section call
// them; their definitions are not part of this excerpt.
3462inline Packet2l pcast<Packet2d, Packet2l>(
const Packet2d& x);
3465inline Packet2d pcast<Packet2l, Packet2d>(
const Packet2l& x);
// plogical_shift_left for Packet2l when __POWER8_VECTOR__ is defined.
// Builds a Packet2ul with N in both lanes and returns vec_sl(a, shift).
// NOTE(review): N is not declared in the visible lines; presumably it is a
// template parameter whose `template <int N>` header is not shown.
3473#ifdef __POWER8_VECTOR__
3476EIGEN_STRONG_INLINE Packet2l plogical_shift_left(
const Packet2l& a) {
3477 const Packet2ul shift = {N, N};
3478 return vec_sl(a, shift);
// plogical_shift_right for Packet2l (follows the __POWER8_VECTOR__ variant of
// plogical_shift_left).
// Builds a Packet2ul with N in both lanes and returns vec_sr(a, shift).
// NOTE(review): N is not declared in the visible lines; presumably it is a
// template parameter whose `template <int N>` header is not shown.
3482EIGEN_STRONG_INLINE Packet2l plogical_shift_right(
const Packet2l& a) {
3483 const Packet2ul shift = {N, N};
3484 return vec_sr(a, shift);
// Helper that relocates 32-bit words of `a` and zero-fills the rest, using a
// fixed byte permutation against p4i_ZERO.
// In the perm table, indices 0x00-0x0f pick bytes from the first vec_perm
// operand and 0x10-0x1f from the second. Word by word the table reads:
//   result word 0 <- second operand word 1
//   result word 1 <- first  operand word 0
//   result word 2 <- second operand word 3
//   result word 3 <- first  operand word 2
// The two return statements differ only in which operand is `a` and which is
// p4i_ZERO.
// NOTE(review): the preprocessor lines selecting between the two returns are
// not visible in this listing (presumably an endianness check) -- confirm.
3491EIGEN_ALWAYS_INLINE Packet4i shift_even_left(
const Packet4i& a) {
3492 static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
3493 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
3495 return vec_perm(p4i_ZERO, a, perm);
3497 return vec_perm(a, p4i_ZERO, perm);
// Helper that relocates 32-bit words of `a` and zero-fills the rest, using a
// fixed byte permutation against p4i_ZERO (counterpart of shift_even_left).
// In the perm table, indices 0x00-0x0f pick bytes from the first vec_perm
// operand and 0x10-0x1f from the second. Word by word the table reads:
//   result word 0 <- first  operand word 1
//   result word 1 <- second operand word 0
//   result word 2 <- first  operand word 3
//   result word 3 <- second operand word 2
// The two return statements differ only in which operand is `a` and which is
// p4i_ZERO.
// NOTE(review): the preprocessor lines selecting between the two returns are
// not visible in this listing (presumably an endianness check) -- confirm.
3503EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(
const Packet4i& a) {
3504 static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
3505 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
3507 return vec_perm(p4i_ZERO, a, perm);
3509 return vec_perm(a, p4i_ZERO, perm);
// Primary template (declaration only) for the 64-bit logical left shift
// built from 32-bit operations. N is the shift count; EnableIf is the
// enable_if slot used by the partial specializations to split on N.
3513template <
int N,
typename EnableIf =
void>
3514struct plogical_shift_left_impl;
// Specialization for 0 <= N < 32: 64-bit left shift of each Packet2l lane
// assembled from 32-bit pieces.
//   out_hi : every 32-bit word of `a` shifted left by N.
//   out_lo : every 32-bit word shifted right by (32 - N) -- the bits that
//            leave the word -- then relocated by shift_even_left.
//   result : out_hi OR out_lo, reinterpreted as Packet2l.
// NOTE(review): for N == 0 the complementary count m is 32; verify that
// vec_sr with a count of 32 yields the intended (empty) carry.
// NOTE(review): the `template <int N>` header line is not visible in this
// listing.
3517struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
3518 static EIGEN_STRONG_INLINE Packet2l run(
const Packet2l& a) {
3519 static const unsigned n =
static_cast<unsigned>(N);
3520 const Packet4ui shift = {n, n, n, n};
3521 const Packet4i ai =
reinterpret_cast<Packet4i
>(a);
3522 static const unsigned m =
static_cast<unsigned>(32 - N);
3523 const Packet4ui shift_right = {m, m, m, m};
3524 const Packet4i out_hi = vec_sl(ai, shift);
3525 const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
3526 return reinterpret_cast<Packet2l
>(por<Packet4i>(out_hi, out_lo));
// Specialization for N >= 32: shifts every 32-bit word of `a` left by
// (N - 32) and then relocates the words with shift_even_left, which also
// zero-fills the remaining words.
// NOTE(review): the `template <int N>` header line is not visible in this
// listing.
3531struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
3532 static EIGEN_STRONG_INLINE Packet2l run(
const Packet2l& a) {
3533 static const unsigned m =
static_cast<unsigned>(N - 32);
3534 const Packet4ui shift = {m, m, m, m};
3535 const Packet4i ai =
reinterpret_cast<Packet4i
>(a);
3536 return reinterpret_cast<Packet2l
>(shift_even_left(vec_sl(ai, shift)));
// plogical_shift_left for Packet2l implemented by dispatching to
// plogical_shift_left_impl<N>::run.
// NOTE(review): the `template <int N>` header line is not visible in this
// listing.
3541EIGEN_STRONG_INLINE Packet2l plogical_shift_left(
const Packet2l& a) {
3542 return plogical_shift_left_impl<N>::run(a);
// Primary template (declaration only) for the 64-bit logical right shift
// built from 32-bit operations. N is the shift count; EnableIf is the
// enable_if slot used by the partial specializations to split on N.
3545template <
int N,
typename EnableIf =
void>
3546struct plogical_shift_right_impl;
// Specialization for 0 <= N < 32: 64-bit right shift of each Packet2l lane
// assembled from 32-bit pieces.
//   out_lo : every 32-bit word of `a` shifted right by N.
//   out_hi : every 32-bit word shifted left by (32 - N) -- the bits that
//            leave the word -- then relocated by shift_odd_right.
//   result : out_hi OR out_lo, reinterpreted as Packet2l.
// NOTE(review): for N == 0 the complementary count m is 32; verify that
// vec_sl with a count of 32 yields the intended (empty) carry.
// NOTE(review): the `template <int N>` header line is not visible in this
// listing.
3549struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
3550 static EIGEN_STRONG_INLINE Packet2l run(
const Packet2l& a) {
3551 static const unsigned n =
static_cast<unsigned>(N);
3552 const Packet4ui shift = {n, n, n, n};
3553 const Packet4i ai =
reinterpret_cast<Packet4i
>(a);
3554 static const unsigned m =
static_cast<unsigned>(32 - N);
3555 const Packet4ui shift_left = {m, m, m, m};
3556 const Packet4i out_lo = vec_sr(ai, shift);
3557 const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
3558 return reinterpret_cast<Packet2l
>(por<Packet4i>(out_hi, out_lo));
// Specialization for N >= 32: shifts every 32-bit word of `a` right by
// (N - 32) and then relocates the words with shift_odd_right, which also
// zero-fills the remaining words.
// NOTE(review): the `template <int N>` header line is not visible in this
// listing.
3563struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
3564 static EIGEN_STRONG_INLINE Packet2l run(
const Packet2l& a) {
3565 static const unsigned m =
static_cast<unsigned>(N - 32);
3566 const Packet4ui shift = {m, m, m, m};
3567 const Packet4i ai =
reinterpret_cast<Packet4i
>(a);
3568 return reinterpret_cast<Packet2l
>(shift_odd_right(vec_sr(ai, shift)));
// plogical_shift_right for Packet2l implemented by dispatching to
// plogical_shift_right_impl<N>::run.
// NOTE(review): the `template <int N>` header line is not visible in this
// listing.
3573EIGEN_STRONG_INLINE Packet2l plogical_shift_right(
const Packet2l& a) {
3574 return plogical_shift_right_impl<N>::run(a);
// pldexp specialization for Packet2d.
// Visible steps:
//   1. Clamp `exponent` to [-2099, 2099] and convert it to Packet2l -> e.
//   2. b = e logically shifted right by 2.
//   3. c = the bit pattern (b + 1023) << 52 reinterpreted as double, i.e.
//      (b + bias) placed at bit 52 and above.
//   4. out = a * c * c * c.
//   5. b = e - 3 * (previous b), the remaining part of the exponent, and c is
//      rebuilt from it in the same way.
// NOTE(review): the final multiplication by the second c and the return
// statement are not visible in this listing.
3579EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(
const Packet2d& a,
const Packet2d& exponent) {
3581 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
3582 const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
3585 const Packet2l bias = {1023, 1023};
3586 Packet2l b = plogical_shift_right<2>(e);
3587 Packet2d c =
reinterpret_cast<Packet2d
>(plogical_shift_left<52>(b + bias));
3588 Packet2d out = pmul(pmul(pmul(a, c), c), c);
3589 b = psub(psub(psub(e, b), b), b);
3590 c =
reinterpret_cast<Packet2d
>(plogical_shift_left<52>(b + bias));
// Takes pabs(a), reinterprets its bits as Packet2l, logically shifts each
// lane right by 52 and converts the resulting integers to Packet2d with
// pcast.
3597EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(
const Packet2d& a) {
3598 return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(
reinterpret_cast<Packet2l
>(pabs(a))));
// pfrexp specialization for Packet2d: forwards `a` and the `exponent`
// output reference to pfrexp_generic and returns its result.
3602EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(
const Packet2d& a, Packet2d& exponent) {
3603 return pfrexp_generic(a, exponent);
// predux specialization for Packet2d.
// b is `a` rotated by 8 bytes (vec_sld of a with itself, done through a
// Packet4f view), so its two 8-byte halves are exchanged relative to a.
// The function returns the first element of `sum`.
// NOTE(review): the declarations of `b` and `sum` and the statement that
// computes `sum` are not visible in this listing; presumably sum = a + b.
3607EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
3609 b =
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4f
>(a),
reinterpret_cast<Packet4f
>(a), 8));
3611 return pfirst<Packet2d>(sum);
// predux_mul specialization for Packet2d.
// Multiplies `a` by a copy of itself rotated by 8 bytes (vec_sld through a
// Packet4ui view), so each lane holds the product of both elements.
// NOTE(review): the enclosing expression that opens before pmul( -- the
// trailing extra `)` shows there is one -- is not visible in this listing;
// presumably `return pfirst(`.
3617EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
3619 pmul(a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4ui
>(a),
reinterpret_cast<Packet4ui
>(a), 8))));
// predux_min specialization for Packet2d.
// Applies pmin to `a` and a copy of itself rotated by 8 bytes (vec_sld
// through a Packet4ui view), so each lane holds the minimum of both elements.
// NOTE(review): the enclosing expression that opens before pmin( -- the
// trailing extra `)` shows there is one -- is not visible in this listing;
// presumably `return pfirst(`.
3624EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
3626 pmin(a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4ui
>(a),
reinterpret_cast<Packet4ui
>(a), 8))));
// predux_max specialization for Packet2d.
// Applies pmax to `a` and a copy of itself rotated by 8 bytes (vec_sld
// through a Packet4ui view), so each lane holds the maximum of both elements.
// NOTE(review): the enclosing expression that opens before pmax( -- the
// trailing extra `)` shows there is one -- is not visible in this listing;
// presumably `return pfirst(`.
3631EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
3633 pmax(a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4ui
>(a),
reinterpret_cast<Packet4ui
>(a), 8))));
// In-place transpose of a 2x2 block held in two Packet2d values.
// t0 = vec_mergeh(packet[0], packet[1]) and t1 = vec_mergel(packet[0],
// packet[1]) are both computed from the original packets before either one
// is overwritten, then written back as packet[0] and packet[1].
// NOTE(review): the declarations of t0 and t1 are not visible in this
// listing.
3636EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
3638 t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
3639 t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
3640 kernel.packet[0] = t0;
3641 kernel.packet[1] = t1;
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_exp_op< typename Derived::Scalar >, const Derived > exp(const Eigen::ArrayBase< Derived > &x)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82