10#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
11#define EIGEN_PACKET_MATH_ZVECTOR_H
14#include "../../InternalHeaderCheck.h"
// Backend default macros. Each one is defined only if it is not already defined
// (the matching #endif lines are not visible in this listing).
// Default value 16 for EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD.
20#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
21#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
// Value-less flag macro; by its name it advertises a single-instruction multiply-add.
24#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
// Default register count of 32.
28#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
29#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
// Packet type aliases: each one names a compiler `__vector` of the given element type.
// Packet4i: vector of int.
32typedef __vector
int Packet4i;
// Packet4ui: vector of unsigned int.
33typedef __vector
unsigned int Packet4ui;
// Packet4bi: vector of __bool int.
34typedef __vector __bool
int Packet4bi;
// Packet8i: vector of short int.
35typedef __vector
short int Packet8i;
// Packet16uc: vector of unsigned char (used below for byte permute masks).
36typedef __vector
unsigned char Packet16uc;
// Packet2d: vector of double.
37typedef __vector
double Packet2d;
// Packet2ul: vector of unsigned long long.
38typedef __vector
unsigned long long Packet2ul;
// Packet2l: vector of long long.
39typedef __vector
long long Packet2l;
// Packet4f is a native vector of float when __ARCH__ is undefined or >= 12.
// The alternative branch of this #if is not visible in this listing.
42#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
43typedef __vector
float Packet4f;
52 numext::uint32_t ui[4];
54 numext::uint64_t ul[2];
62#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
// Helper macros that declare a packet constant named p4i_/p2d_/p2l_ followed by NAME.
// The *_FAST_* variants build the value with vec_splat_s32 / vec_splat_s64 and
// reinterpret the result as the target packet type.
70#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
72#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME, X) Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
74#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME, X) Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
// The non-FAST variants broadcast X through pset1<PacketType>.
76#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
78#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
80#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
// File-static packet constants built with the macros above.
// p4i_ZERO / p4i_ONE: integer splats of 0 and 1.
83static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
84static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);
// p2d_ZERO: 64-bit integer splat of 0 reinterpreted as Packet2d.
86static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
// p2l_ZERO / p2l_ONE: 64-bit integer splats of 0 and 1.
87static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
88static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
// p2d_ONE: both lanes 1.0.
90static Packet2d p2d_ONE = {1.0, 1.0};
// p2d_ZERO_: both lanes carry the bit pattern 0x8000000000000000 (only the top bit set),
// produced with bit_cast so the bits are not value-converted.
91static Packet2d p2d_ZERO_ = {numext::bit_cast<double>(0x8000000000000000ull),
92 numext::bit_cast<double>(0x8000000000000000ull)};
// Float constant helpers, compiled when __ARCH__ is undefined or >= 12
// (the other branch and the closing #endif are not visible in this listing).
94#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
// Declares p4f_NAME from a 32-bit integer splat reinterpreted as Packet4f.
95#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
// Declares p4f_NAME by broadcasting X through pset1<Packet4f>.
97#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
// Declares a const p4f_NAME whose bits come from broadcasting the integer X
// (pset1<Packet4i>) and reinterpreting the packet as Packet4f.
99#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
100 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
// p4f_ZERO: integer splat of 0 viewed as floats.
102static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
// p4i_MINUS1: integer splat of -1.
103static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);
104static Packet4f p4f_MZERO = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
// Lane-index ramps {0, 1, 2, ...}; the plset specializations add them to a broadcast start value.
107static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
108static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
// Built by viewing p2d_ZERO and p2d_ONE as bytes and combining them with vec_sld
// at a byte offset of 8, then reinterpreting the result as Packet2d.
109static Packet2d p2d_COUNTDOWN =
reinterpret_cast<Packet2d
>(
110 vec_sld(
reinterpret_cast<Packet16uc
>(p2d_ZERO),
reinterpret_cast<Packet16uc
>(p2d_ONE), 8));
// Byte-index masks used as the third argument of vec_perm.
// PSET64_HI repeats byte indices 0..7 twice (used by ploaddup<Packet2d>).
112static Packet16uc p16uc_PSET64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
// DUPLICATE32_HI repeats bytes 0..3 twice, then bytes 4..7 twice (used by ploaddup for 32-bit lanes).
113static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
// Mask that clears the low four bits of a 64-bit value.
116#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
// Casts x to std::ptrdiff_t and applies the mask, i.e. rounds the address down to a multiple of 16.
118#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
// FORWARD: byte indices 0..15 in order.
123static Packet16uc p16uc_FORWARD = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
// REVERSE32: the four 4-byte groups listed in reverse order (used by preverse on 32-bit lanes).
124static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
// REVERSE64: the two 8-byte groups swapped (used by preverse<Packet2d>).
125static Packet16uc p16uc_REVERSE64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
// Permute masks derived from p16uc_FORWARD / p16uc_DUPLICATE32_HI with vec_splat and vec_sld.
// NOTE(review): the final vec_sld argument and closing of the next two initializers
// are on lines missing from this listing, so the shift amounts cannot be confirmed here.
127static Packet16uc p16uc_PSET32_WODD =
128 vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
130static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
// PSET64_LO: vec_mergel of the two masks above, viewed as 32-bit lanes.
137static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
138 (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);
// Two-input permute masks used by ptranspose(PacketBlock<Packet2d, 2>&).
// TRANSPOSE64_HI selects byte indices 0..7 and 16..23.
142static Packet16uc p16uc_TRANSPOSE64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
// TRANSPOSE64_LO selects byte indices 8..15 and 24..31.
143static Packet16uc p16uc_TRANSPOSE64_LO = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
// COMPLEX32_REV: p16uc_REVERSE32 combined with itself through vec_sld at byte offset 8.
145static Packet16uc p16uc_COMPLEX32_REV =
146 vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);
// COMPLEX32_REV2: p16uc_FORWARD combined with itself through vec_sld at byte offset 8.
148static Packet16uc p16uc_COMPLEX32_REV2 =
149 vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);
// Prefetch helper: uses __builtin_prefetch when the builtin is available or the compiler
// is GNU-compatible; the second definition is an inline-asm `pfd` fallback
// (the #else/#endif between the two definitions is not visible in this listing).
// Both expansions already end with ';'.
151#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
152#define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
154#define EIGEN_ZVECTOR_PREFETCH(ADDR) asm(" pfd [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
// packet_traits specializations map a scalar type to its packet type.
// Only the `type` / `half` typedefs are visible in this listing; the capability
// enums and closing braces are on missing lines.
// int -> Packet4i; the half packet is the same type.
158struct packet_traits<int> : default_packet_traits {
159 typedef Packet4i type;
160 typedef Packet4i half;
// float -> Packet4f; the half packet is the same type.
174struct packet_traits<float> : default_packet_traits {
175 typedef Packet4f type;
176 typedef Packet4f half;
// double -> Packet2d; the half packet is the same type.
203struct packet_traits<double> : default_packet_traits {
204 typedef Packet2d type;
205 typedef Packet2d half;
// unpacket_traits specializations describe each packet type. In the visible part,
// masked loads and stores are reported as unavailable for all three packet types.
// The remaining enum entries and closing braces are on lines missing from this listing.
229struct unpacket_traits<Packet4i> {
235 masked_load_available =
false,
236 masked_store_available =
false
238 typedef Packet4i half;
// Packet4f: integer_packet is Packet4i.
241struct unpacket_traits<Packet4f> {
247 masked_load_available =
false,
248 masked_store_available =
false
250 typedef Packet4f half;
251 typedef Packet4i integer_packet;
// Packet2d: integer_packet is Packet2l.
254struct unpacket_traits<Packet2d> {
260 masked_load_available =
false,
261 masked_store_available =
false
263 typedef Packet2d half;
264 typedef Packet2l integer_packet;
// Forward declaration of the in-place transpose of a 4-packet Packet4f block;
// definitions appear later in this file, one per Packet4f representation.
268EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4f, 4>& kernel);
// Stream-insertion operators that print a packet's lanes separated by ", ".
// `vt` is a local declared and filled on lines missing from this listing
// (presumably a union view of the packet — confirm); the return statements are also missing.
// Packet4i: prints the four int lanes vt.i[0..3].
270inline std::ostream& operator<<(std::ostream& s,
const Packet4i& v) {
273 s << vt.i[0] <<
", " << vt.i[1] <<
", " << vt.i[2] <<
", " << vt.i[3];
// Packet4ui: prints the four unsigned lanes vt.ui[0..3].
277inline std::ostream& operator<<(std::ostream& s,
const Packet4ui& v) {
280 s << vt.ui[0] <<
", " << vt.ui[1] <<
", " << vt.ui[2] <<
", " << vt.ui[3];
// Packet2l: prints the two lanes vt.l[0..1].
284inline std::ostream& operator<<(std::ostream& s,
const Packet2l& v) {
287 s << vt.l[0] <<
", " << vt.l[1];
// Packet2ul: prints the two lanes vt.ul[0..1].
291inline std::ostream& operator<<(std::ostream& s,
const Packet2ul& v) {
294 s << vt.ul[0] <<
", " << vt.ul[1];
// Packet2d: prints the two lanes vt.d[0..1].
298inline std::ostream& operator<<(std::ostream& s,
const Packet2d& v) {
301 s << vt.d[0] <<
", " << vt.d[1];
// Packet4f stream insertion, compiled only when __ARCH__ is undefined or >= 12.
// Prints the four float lanes vt.f[0..3] separated by ", "; the declaration of `vt`
// and the return statement are on lines missing from this listing.
305#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
306inline std::ostream& operator<<(std::ostream& s,
const Packet4f& v) {
309 s << vt.f[0] <<
", " << vt.f[1] <<
", " << vt.f[2] <<
", " << vt.f[3];
// pload<Packet4i>: loads one packet from `from` with vec_xl at offset 0.
// EIGEN_DEBUG_ALIGNED_LOAD is a macro defined elsewhere (presumably debug instrumentation — confirm).
315EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int* from) {
316 EIGEN_DEBUG_ALIGNED_LOAD
317 return vec_xl(0, from);
// pload<Packet2d>: same load for a packet of doubles.
321EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
322 EIGEN_DEBUG_ALIGNED_LOAD
323 return vec_xl(0, from);
// pstore<int>: writes packet `from` to `to` with vec_xst at offset 0.
// EIGEN_DEBUG_ALIGNED_STORE is a macro defined elsewhere (presumably debug instrumentation — confirm).
327EIGEN_STRONG_INLINE
void pstore<int>(
int* to,
const Packet4i& from) {
328 EIGEN_DEBUG_ALIGNED_STORE
329 vec_xst(from, 0, to);
// pstore<double>: same store for a packet of doubles.
333EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
334 EIGEN_DEBUG_ALIGNED_STORE
335 vec_xst(from, 0, to);
// pfrexp specializations: both forward to pfrexp_generic, which writes the
// exponent part into `exponent` and returns the remaining factor.
339EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(
const Packet4f& a, Packet4f& exponent) {
340 return pfrexp_generic(a, exponent);
344EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(
const Packet2d& a, Packet2d& exponent) {
345 return pfrexp_generic(a, exponent);
// pset1: broadcasts the scalar `from` into every lane using vec_splats.
349EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int& from) {
350 return vec_splats(from);
353EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
354 return vec_splats(from);
// pbroadcast4<Packet4i>: loads four ints from `a` and writes a0..a3, where a_k holds
// lane k of the loaded packet splatted across the packet.
// a3 doubles as the temporary for the load, so it is overwritten last.
358EIGEN_STRONG_INLINE
void pbroadcast4<Packet4i>(
const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
359 a3 = pload<Packet4i>(a);
360 a0 = vec_splat(a3, 0);
361 a1 = vec_splat(a3, 1);
362 a2 = vec_splat(a3, 2);
363 a3 = vec_splat(a3, 3);
// pbroadcast4<Packet2d>: two packet loads (a[0..1] and a[2..3]); each output receives one
// lane of a load splatted across the packet. a1 and a3 double as temporaries for the loads.
// The end of the parameter list (the a3 parameter) is on a line missing from this listing.
367EIGEN_STRONG_INLINE
void pbroadcast4<Packet2d>(
const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
369 a1 = pload<Packet2d>(a);
370 a0 = vec_splat(a1, 0);
371 a1 = vec_splat(a1, 1);
372 a3 = pload<Packet2d>(a + 2);
373 a2 = vec_splat(a3, 0);
374 a3 = vec_splat(a3, 1);
// pgather<int, Packet4i>: reads from[0], from[stride], from[2*stride], from[3*stride]
// into a local EIGEN_ALIGN16 array, then loads that array as one packet.
378EIGEN_DEVICE_FUNC
inline Packet4i pgather<int, Packet4i>(
const int* from,
Index stride) {
379 EIGEN_ALIGN16
int ai[4];
380 ai[0] = from[0 * stride];
381 ai[1] = from[1 * stride];
382 ai[2] = from[2 * stride];
383 ai[3] = from[3 * stride];
384 return pload<Packet4i>(ai);
// pgather<double, Packet2d>: same scheme with two strided doubles.
388EIGEN_DEVICE_FUNC
inline Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
389 EIGEN_ALIGN16
double af[2];
390 af[0] = from[0 * stride];
391 af[1] = from[1 * stride];
392 return pload<Packet2d>(af);
// pscatter<int, Packet4i>: stores the packet into a local EIGEN_ALIGN16 array, then
// writes lane k to to[k * stride] for k = 0..3.
396EIGEN_DEVICE_FUNC
inline void pscatter<int, Packet4i>(
int* to,
const Packet4i& from,
Index stride) {
397 EIGEN_ALIGN16
int ai[4];
398 pstore<int>((
int*)ai, from);
399 to[0 * stride] = ai[0];
400 to[1 * stride] = ai[1];
401 to[2 * stride] = ai[2];
402 to[3 * stride] = ai[3];
// pscatter<double, Packet2d>: same scheme with two lanes.
406EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride) {
407 EIGEN_ALIGN16
double af[2];
408 pstore<double>(af, from);
409 to[0 * stride] = af[0];
410 to[1 * stride] = af[1];
// Arithmetic specializations (padd, psub, pmul, pdiv) for Packet4i and Packet2d.
// NOTE(review): only the signatures survive in this listing; each body is on a missing
// line, so the implementation cannot be documented from here.
414EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
418EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
423EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
427EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
432EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
436EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
441EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
445EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
// pnegate / pconj for Packet4i and Packet2d.
// NOTE(review): the bodies are on lines missing from this listing.
450EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
454EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
459EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
463EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
// pmadd (a * b + c). The integer version is composed from pmul followed by padd.
468EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
469 return padd<Packet4i>(pmul<Packet4i>(a, b), c);
// The double version maps directly to vec_madd.
472EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
473 return vec_madd(a, b, c);
// plset: broadcasts `a` and adds the matching COUNTDOWN constant.
// For Packet4i that constant is {0, 1, 2, 3}, giving {a, a+1, a+2, a+3}.
477EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int& a) {
478 return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN);
481EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
482 return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN);
// pmin / pmax: lane-wise minimum and maximum, forwarded to vec_min / vec_max.
486EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
487 return vec_min(a, b);
490EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
491 return vec_min(a, b);
495EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
496 return vec_max(a, b);
499EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
500 return vec_max(a, b);
// Bitwise operations. pand and pxor forward to vec_and / vec_xor.
504EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
505 return vec_and(a, b);
508EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
509 return vec_and(a, b);
// NOTE(review): the bodies of the two por specializations are on lines missing from this listing.
513EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
517EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
522EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
523 return vec_xor(a, b);
526EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
527 return vec_xor(a, b);
// pandnot: a AND (NOT b). The complement of b is formed as vec_nor(b, b).
531EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
532 return pand<Packet4i>(a, vec_nor(b, b));
535EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
536 return vec_and(a, vec_nor(b, b));
// pround<Packet2d>: calls the __builtin_s390_vfidb builtin with arguments (a, 0, 1).
// NOTE(review): the meaning of the two immediates (mask / rounding mode) is not shown in
// this file — confirm against the compiler's s390 builtin documentation.
540EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(
const Packet2d& a) {
542 return __builtin_s390_vfidb(a, 0, 1);
// pceil / pfloor for Packet2d: bodies are on lines missing from this listing.
545EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a) {
549EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a) {
// ploadu: the unaligned-load entry points simply forward to pload on this backend.
554EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int* from) {
555 return pload<Packet4i>(from);
558EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
559 return pload<Packet2d>(from);
// ploaddup<Packet4i>: loads a full packet, then permutes it with p16uc_DUPLICATE32_HI,
// whose byte indices repeat bytes 0..3 and 4..7 twice each (lanes {0, 0, 1, 1}).
563EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int* from) {
564 Packet4i p = pload<Packet4i>(from);
565 return vec_perm(p, p, p16uc_DUPLICATE32_HI);
// ploaddup<Packet2d>: permutes with p16uc_PSET64_HI, which repeats bytes 0..7 (lanes {0, 0}).
569EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
570 Packet2d p = pload<Packet2d>(from);
571 return vec_perm(p, p, p16uc_PSET64_HI);
// pstoreu: the unaligned-store entry points simply forward to pstore on this backend.
575EIGEN_STRONG_INLINE
void pstoreu<int>(
int* to,
const Packet4i& from) {
576 pstore<int>(to, from);
579EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
580 pstore<double>(to, from);
// prefetch: issues EIGEN_ZVECTOR_PREFETCH on the given address.
// The macro expansion already ends with ';', so the trailing ';' here is an empty statement.
584EIGEN_STRONG_INLINE
void prefetch<int>(
const int* addr) {
585 EIGEN_ZVECTOR_PREFETCH(addr);
588EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
589 EIGEN_ZVECTOR_PREFETCH(addr);
// Packet shift helpers. Each one applies the same-named scalar overload with the
// compile-time shift count N to every lane and rebuilds the packet from the results.
// The `template <int N>` headers are on lines missing from this listing.
593EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(
const Packet2l& a) {
594 return Packet2l{parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1])};
597EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(
const Packet4i& a) {
598 return Packet4i{parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1]), parithmetic_shift_right<N>(a[2]),
599 parithmetic_shift_right<N>(a[3])};
603EIGEN_STRONG_INLINE Packet2l plogical_shift_right(
const Packet2l& a) {
604 return Packet2l{plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1])};
607EIGEN_STRONG_INLINE Packet4i plogical_shift_right(
const Packet4i& a) {
608 return Packet4i{plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1]), plogical_shift_right<N>(a[2]),
609 plogical_shift_right<N>(a[3])};
613EIGEN_STRONG_INLINE Packet2l plogical_shift_left(
const Packet2l& a) {
614 return Packet2l{plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1])};
617EIGEN_STRONG_INLINE Packet4i plogical_shift_left(
const Packet4i& a) {
618 return Packet4i{plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1]), plogical_shift_left<N>(a[2]),
619 plogical_shift_left<N>(a[3])};
// pfirst: declares a local EIGEN_ALIGN16 scratch array sized for one packet.
// NOTE(review): the store into `x` and the return statement are on lines missing from
// this listing; by its name the function returns the first lane.
623EIGEN_STRONG_INLINE
int pfirst<Packet4i>(
const Packet4i& a) {
624 EIGEN_ALIGN16
int x[4];
629EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
630 EIGEN_ALIGN16
double x[2];
// preverse<Packet4i>: views `a` as bytes and permutes it with p16uc_REVERSE32, which
// lists the four 4-byte groups in reverse order, then reinterprets back to Packet4i.
636EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
637 return reinterpret_cast<Packet4i
>(
638 vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE32));
// preverse<Packet2d>: same scheme with p16uc_REVERSE64, which swaps the two 8-byte groups.
642EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
643 return reinterpret_cast<Packet2d
>(
644 vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE64));
// pabs for Packet4i and Packet2d.
// NOTE(review): the bodies are on lines missing from this listing.
648EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(
const Packet4i& a) {
652EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(
const Packet2d& a) {
// predux<Packet4i>: two add steps. The packet is combined with itself through vec_sld at
// byte offset 8, then the partial sum is combined with itself at byte offset 4.
// The declarations of `b`/`sum` and the return statement are on lines missing from this listing.
657EIGEN_STRONG_INLINE
int predux<Packet4i>(
const Packet4i& a) {
659 b = vec_sld(a, a, 8);
660 sum = padd<Packet4i>(a, b);
661 b = vec_sld(sum, sum, 4);
662 sum = padd<Packet4i>(sum, b);
// predux<Packet2d>: a single add step using vec_sld at byte offset 8 (done on a Packet4i view).
// The declarations and the return statement are on lines missing from this listing.
667EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
669 b =
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4i
>(a),
reinterpret_cast<Packet4i
>(a), 8));
670 sum = padd<Packet2d>(a, b);
// predux_mul<Packet4i>: returns the product of the four entries of the scratch array `aux`.
// The line that fills `aux` from `a` is missing from this listing.
677EIGEN_STRONG_INLINE
int predux_mul<Packet4i>(
const Packet4i& a) {
678 EIGEN_ALIGN16
int aux[4];
680 return aux[0] * aux[1] * aux[2] * aux[3];
// predux_mul<Packet2d>: multiplies `a` by itself combined through vec_sld at byte offset 8.
// The start of the statement (presumably `return pfirst(` — confirm) is on a missing line.
684EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
686 pmul(a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4i
>(a),
reinterpret_cast<Packet4i
>(a), 8))));
// predux_min<Packet4i>: two pmin steps against vec_sld results at byte offsets 8 and 4.
// The declarations of `b`/`res` and the return statement are on lines missing from this listing.
691EIGEN_STRONG_INLINE
int predux_min<Packet4i>(
const Packet4i& a) {
693 b = pmin<Packet4i>(a, vec_sld(a, a, 8));
694 res = pmin<Packet4i>(b, vec_sld(b, b, 4));
// predux_min<Packet2d>: one pmin against the vec_sld result at byte offset 8;
// the first lane of the result is returned via pfirst.
699EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
700 return pfirst(pmin<Packet2d>(
701 a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4i
>(a),
reinterpret_cast<Packet4i
>(a), 8))));
// predux_max<Packet4i>: two pmax steps against vec_sld results at byte offsets 8 and 4.
// The declarations of `b`/`res` and the return statement are on lines missing from this listing.
706EIGEN_STRONG_INLINE
int predux_max<Packet4i>(
const Packet4i& a) {
708 b = pmax<Packet4i>(a, vec_sld(a, a, 8));
709 res = pmax<Packet4i>(b, vec_sld(b, b, 4));
// predux_max<Packet2d>: one pmax against the vec_sld result at byte offset 8;
// the first lane of the result is returned via pfirst.
715EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
716 return pfirst(pmax<Packet2d>(
717 a,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4i
>(a),
reinterpret_cast<Packet4i
>(a), 8))));
// In-place transpose of a block of four Packet4i, done in two merge rounds:
// first packets 0/2 and 1/3 are merged (high and low halves), then the intermediate
// results t0/t2 and t1/t3 are merged again and written back to the block.
720EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
721 Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
722 Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
723 Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
724 Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
725 kernel.packet[0] = vec_mergeh(t0, t2);
726 kernel.packet[1] = vec_mergel(t0, t2);
727 kernel.packet[2] = vec_mergeh(t1, t3);
728 kernel.packet[3] = vec_mergel(t1, t3);
// In-place transpose of a block of two Packet2d using two-input byte permutes:
// t0 takes byte indices 0..7 and 16..23 (p16uc_TRANSPOSE64_HI), t1 takes 8..15 and 24..31
// (p16uc_TRANSPOSE64_LO). Both are computed before either packet is overwritten.
731EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
732 Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
733 Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
734 kernel.packet[0] = t0;
735 kernel.packet[1] = t1;
741#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
// vec_splat_packet4f<element>: splat helper for the emulated Packet4f, which is accessed
// as two halves v4f[0] / v4f[1]. Four cases are visible: lane 0 or 1 of half 0, and lane 0
// or 1 of half 1; in each case the splatted half is copied into both halves of the result.
// NOTE(review): the declaration of `splat`, the dispatch on `element` and the return
// statement are on lines missing from this listing.
744template <
int element>
745EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(
const Packet4f& from) {
749 splat.v4f[0] = vec_splat(from.v4f[0], 0);
750 splat.v4f[1] = splat.v4f[0];
753 splat.v4f[0] = vec_splat(from.v4f[0], 1);
754 splat.v4f[1] = splat.v4f[0];
757 splat.v4f[0] = vec_splat(from.v4f[1], 0);
758 splat.v4f[1] = splat.v4f[0];
761 splat.v4f[0] = vec_splat(from.v4f[1], 1);
762 splat.v4f[1] = splat.v4f[0];
// Emulated pload<Packet4f>: fills the two halves with vec_ld2f from from[0] and from[2].
// The declaration of `vfrom` and the return statement are on lines missing from this listing.
769EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
771 EIGEN_DEBUG_ALIGNED_LOAD
773 vfrom.v4f[0] = vec_ld2f(&from[0]);
774 vfrom.v4f[1] = vec_ld2f(&from[2]);
// Emulated pstore<float>: writes the two halves with vec_st2f to to[0] and to[2].
779EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
781 EIGEN_DEBUG_ALIGNED_STORE
782 vec_st2f(from.v4f[0], &to[0]);
783 vec_st2f(from.v4f[1], &to[2]);
// Emulated pset1<Packet4f>: converts the float to double, broadcasts it with
// pset1<Packet2d> into half 0 and copies that half into half 1.
// The declaration of `to` and the return statement are on lines missing from this listing.
787EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
789 to.v4f[0] = pset1<Packet2d>(
static_cast<const double&
>(from));
790 to.v4f[1] = to.v4f[0];
// Emulated pbroadcast4<Packet4f>: loads four floats, then a_k receives lane k splatted
// through vec_splat_packet4f<k>. a3 doubles as the load temporary and is overwritten last.
795EIGEN_STRONG_INLINE
void pbroadcast4<Packet4f>(
const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
796 a3 = pload<Packet4f>(a);
797 a0 = vec_splat_packet4f<0>(a3);
798 a1 = vec_splat_packet4f<1>(a3);
799 a2 = vec_splat_packet4f<2>(a3);
800 a3 = vec_splat_packet4f<3>(a3);
// Emulated pgather<float, Packet4f>: copies four strided floats into a local
// EIGEN_ALIGN16 array and loads that array as one packet.
804EIGEN_DEVICE_FUNC
inline Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
805 EIGEN_ALIGN16
float ai[4];
806 ai[0] = from[0 * stride];
807 ai[1] = from[1 * stride];
808 ai[2] = from[2 * stride];
809 ai[3] = from[3 * stride];
810 return pload<Packet4f>(ai);
// Emulated pscatter<float, Packet4f>: stores the packet to a local array, then writes
// lane k to to[k * stride] for k = 0..3.
814EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
815 EIGEN_ALIGN16
float ai[4];
816 pstore<float>((
float*)ai, from);
817 to[0 * stride] = ai[0];
818 to[1 * stride] = ai[1];
819 to[2 * stride] = ai[2];
820 to[3 * stride] = ai[3];
// Emulated Packet4f arithmetic: each operation is applied independently to the two halves
// v4f[0] and v4f[1]. The declaration of the result `c` and the return statements are on
// lines missing from this listing.
// padd: half-wise addition.
824EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
826 c.v4f[0] = a.v4f[0] + b.v4f[0];
827 c.v4f[1] = a.v4f[1] + b.v4f[1];
// psub: half-wise subtraction.
832EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
834 c.v4f[0] = a.v4f[0] - b.v4f[0];
835 c.v4f[1] = a.v4f[1] - b.v4f[1];
// pmul: half-wise multiplication.
840EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
842 c.v4f[0] = a.v4f[0] * b.v4f[0];
843 c.v4f[1] = a.v4f[1] * b.v4f[1];
// pdiv: half-wise division.
848EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
850 c.v4f[0] = a.v4f[0] / b.v4f[0];
851 c.v4f[1] = a.v4f[1] / b.v4f[1];
// pnegate: half-wise unary minus.
856EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
858 c.v4f[0] = -a.v4f[0];
859 c.v4f[1] = -a.v4f[1];
// Emulated pmadd: vec_madd applied to each half separately.
// The declaration of `res` and the return statement are on lines missing from this listing.
864EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
866 res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
867 res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
// Emulated pmin / pmax: delegate to the Packet2d overloads on each half.
// The declaration of `res` and the return statements are on lines missing from this listing.
872EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
874 res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
875 res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
880EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
882 res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
883 res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
// Emulated bitwise operations (pand, por, pxor, pandnot): delegate to the Packet2d
// overloads on each half. The declaration of `res` and the return statements are on
// lines missing from this listing.
888EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
890 res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
891 res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
896EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
898 res.v4f[0] = por(a.v4f[0], b.v4f[0]);
899 res.v4f[1] = por(a.v4f[1], b.v4f[1]);
904EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
906 res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
907 res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
912EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
914 res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
915 res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
// Emulated rounding: pround uses generic_round, pceil uses vec_ceil and pfloor uses
// vec_floor, each applied to both halves. The declaration of `res` and the return
// statements are on lines missing from this listing.
920EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a) {
922 res.v4f[0] = generic_round(a.v4f[0]);
923 res.v4f[1] = generic_round(a.v4f[1]);
928EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
930 res.v4f[0] = vec_ceil(a.v4f[0]);
931 res.v4f[1] = vec_ceil(a.v4f[1]);
936EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
938 res.v4f[0] = vec_floor(a.v4f[0]);
939 res.v4f[1] = vec_floor(a.v4f[1]);
// Emulated ploaddup<Packet4f>: after a full load, half 1 becomes lane 1 of half 0 splatted
// and half 0 becomes its own lane 0 splatted. Half 1 is written first because it reads
// half 0 before half 0 is overwritten. The return statement is on a missing line.
944EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
945 Packet4f p = pload<Packet4f>(from);
946 p.v4f[1] = vec_splat(p.v4f[0], 1);
947 p.v4f[0] = vec_splat(p.v4f[0], 0);
// Emulated pfirst<Packet4f>: stores half 0 into a two-float scratch array with vec_st2f.
// The return statement is on a line missing from this listing.
952EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
953 EIGEN_ALIGN16
float x[2];
954 vec_st2f(a.v4f[0], &x[0]);
// Emulated preverse: swaps the two halves and reverses each one with preverse<Packet2d>.
// The declaration of `rev` and the return statement are on lines missing from this listing.
959EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
961 rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
962 rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
// Emulated pabs: delegates to the Packet2d pabs on each half.
967EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(
const Packet4f& a) {
969 res.v4f[0] = pabs(a.v4f[0]);
970 res.v4f[1] = pabs(a.v4f[1]);
// Emulated predux<Packet4f>: adds the two halves, reduces the resulting Packet2d with
// predux<Packet2d>, and narrows the double result to float.
// The declaration of `sum` is on a line missing from this listing.
975EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
977 sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
978 double first = predux<Packet2d>(sum);
979 return static_cast<float>(first);
// Emulated predux_mul<Packet4f>: multiplies the two halves, reduces with predux_mul,
// passes the result through pfirst, and narrows it to float.
983EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
985 return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
// Emulated predux_min<Packet4f>: pmin of the two halves, then pmin of that result against
// itself combined through vec_sld at byte offset 8; the first lane is narrowed to float.
// The declarations of `b`/`res` are on a line missing from this listing.
989EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
991 b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
992 res = pmin<Packet2d>(
993 b,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4i
>(b),
reinterpret_cast<Packet4i
>(b), 8)));
994 return static_cast<float>(pfirst(res));
// Emulated predux_max<Packet4f>: same scheme with pmax.
998EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
1000 b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
1001 res = pmax<Packet2d>(
1002 b,
reinterpret_cast<Packet2d
>(vec_sld(
reinterpret_cast<Packet4i
>(b),
reinterpret_cast<Packet4i
>(b), 8)));
1003 return static_cast<float>(pfirst(res));
// Emulated transpose of a block of four Packet4f. The halves of the four packets are
// regrouped into four 2x2 Packet2d blocks: t0 = half 0 of packets 0/1, t1 = half 1 of
// packets 0/1, t2 = half 0 of packets 2/3, t3 = half 1 of packets 2/3. The blocks are
// then written back with t1 and t2 exchanging positions.
// NOTE(review): the statements between the two groups (presumably the per-block
// ptranspose calls — confirm) are on lines missing from this listing.
1008EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1009 PacketBlock<Packet2d, 2> t0, t1, t2, t3;
1011 t0.packet[0] = kernel.packet[0].v4f[0];
1012 t0.packet[1] = kernel.packet[1].v4f[0];
1015 t1.packet[0] = kernel.packet[0].v4f[1];
1016 t1.packet[1] = kernel.packet[1].v4f[1];
1019 t2.packet[0] = kernel.packet[2].v4f[0];
1020 t2.packet[1] = kernel.packet[3].v4f[0];
1023 t3.packet[0] = kernel.packet[2].v4f[1];
1024 t3.packet[1] = kernel.packet[3].v4f[1];
1033 kernel.packet[0].v4f[0] = t0.packet[0];
1034 kernel.packet[0].v4f[1] = t2.packet[0];
1035 kernel.packet[1].v4f[0] = t0.packet[1];
1036 kernel.packet[1].v4f[1] = t2.packet[1];
1037 kernel.packet[2].v4f[0] = t1.packet[0];
1038 kernel.packet[2].v4f[1] = t3.packet[0];
1039 kernel.packet[3].v4f[0] = t1.packet[1];
1040 kernel.packet[3].v4f[1] = t3.packet[1];
// Emulated comparisons (pcmp_le, pcmp_lt, pcmp_eq): delegate to the Packet2d overloads on
// each half. The declaration of `res` and the return statements are on lines missing
// from this listing.
1044Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1046 res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
1047 res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
1052Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1054 res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
1055 res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
1060Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1062 res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
1063 res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
// Native Packet4f versions follow (Packet4f is used directly as a vector here).
// pload<Packet4f>: loads one packet from `from` with vec_xl at offset 0.
1069EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
1070 EIGEN_DEBUG_ALIGNED_LOAD
1071 return vec_xl(0, from);
// pstore<float>: writes the packet to `to` with vec_xst at offset 0.
1075EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
1076 EIGEN_DEBUG_ALIGNED_STORE
1077 vec_xst(from, 0, to);
// pset1<Packet4f>: broadcasts `from` into every lane with vec_splats.
1081EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
1082 return vec_splats(from);
// Native pbroadcast4<Packet4f>: loads four floats, then a_k receives lane k splatted with
// vec_splat. a3 doubles as the load temporary and is overwritten last.
1086EIGEN_STRONG_INLINE
void pbroadcast4<Packet4f>(
const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
1087 a3 = pload<Packet4f>(a);
1088 a0 = vec_splat(a3, 0);
1089 a1 = vec_splat(a3, 1);
1090 a2 = vec_splat(a3, 2);
1091 a3 = vec_splat(a3, 3);
// Native pgather<float, Packet4f>: copies four strided floats into a local
// EIGEN_ALIGN16 array and loads that array as one packet.
1095EIGEN_DEVICE_FUNC
inline Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
1096 EIGEN_ALIGN16
float af[4];
1097 af[0] = from[0 * stride];
1098 af[1] = from[1 * stride];
1099 af[2] = from[2 * stride];
1100 af[3] = from[3 * stride];
1101 return pload<Packet4f>(af);
// Native pscatter<float, Packet4f>: stores the packet to a local array, then writes
// lane k to to[k * stride] for k = 0..3.
1105EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
1106 EIGEN_ALIGN16
float af[4];
1107 pstore<float>((
float*)af, from);
1108 to[0 * stride] = af[0];
1109 to[1 * stride] = af[1];
1110 to[2 * stride] = af[2];
1111 to[3 * stride] = af[3];
// Native Packet4f arithmetic specializations (padd, psub, pmul, pdiv, pnegate, pconj).
// NOTE(review): only the signatures survive in this listing; each body is on a missing line.
1115EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1119EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1123EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1127EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1131EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(
const Packet4f& a) {
1135EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(
const Packet4f& a) {
// Native pmadd<Packet4f>: maps directly to vec_madd(a, b, c).
1139EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
1140 return vec_madd(a, b, c);
// Native pmin / pmax for Packet4f: forwarded to vec_min / vec_max.
1143EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1144 return vec_min(a, b);
1147EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1148 return vec_max(a, b);
// Native bitwise operations for Packet4f: pand, por and pxor forward to vec_and / vec_or /
// vec_xor. pandnot computes a AND (NOT b), forming the complement as vec_nor(b, b).
1151EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1152 return vec_and(a, b);
1155EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1156 return vec_or(a, b);
1159EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1160 return vec_xor(a, b);
1163EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1164 return vec_and(a, vec_nor(b, b));
// Native pround<Packet4f>: calls the __builtin_s390_vfisb builtin with arguments (a, 0, 1).
// NOTE(review): the meaning of the two immediates is not shown in this file — confirm
// against the compiler's s390 builtin documentation.
1167EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a) {
1169 return __builtin_s390_vfisb(a, 0, 1);
// pceil<Packet4f>: body is on a line missing from this listing.
1172EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
// pfloor<Packet4f>: forwards to vec_floor.
1176EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
1177 return vec_floor(a);
// pabs<Packet4f>: body is on a line missing from this listing.
1180EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(
const Packet4f& a) {
// Native pfirst<Packet4f>: declares a four-float EIGEN_ALIGN16 scratch array.
// NOTE(review): the store into `x` and the return statement are on lines missing from
// this listing.
1184EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
1185 EIGEN_ALIGN16
float x[4];
// Native ploaddup<Packet4f>: loads a full packet and permutes it with p16uc_DUPLICATE32_HI,
// whose byte indices repeat bytes 0..3 and 4..7 twice each (lanes {0, 0, 1, 1}).
1191EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
1192 Packet4f p = pload<Packet4f>(from);
1193 return vec_perm(p, p, p16uc_DUPLICATE32_HI);
// Native preverse<Packet4f>: views `a` as bytes and permutes it with p16uc_REVERSE32, which
// lists the four 4-byte groups in reverse order, then reinterprets back to Packet4f.
1197EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
1198 return reinterpret_cast<Packet4f
>(
1199 vec_perm(
reinterpret_cast<Packet16uc
>(a),
reinterpret_cast<Packet16uc
>(a), p16uc_REVERSE32));
// Native Packet4f reductions. All four use the same two-step scheme: combine the packet
// with itself through vec_sld at byte offset 8, then combine the partial result with
// itself at byte offset 4. Declarations of the temporaries and most return statements
// are on lines missing from this listing.
// predux: sum of the lanes.
1203EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
1205 b = vec_sld(a, a, 8);
1206 sum = padd<Packet4f>(a, b);
1207 b = vec_sld(sum, sum, 4);
1208 sum = padd<Packet4f>(sum, b);
// predux_mul: product of the lanes; the first lane of the final packet is returned.
1215EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
1217 prod = pmul(a, vec_sld(a, a, 8));
1218 return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
// predux_min: minimum of the lanes.
1223EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
1225 b = pmin<Packet4f>(a, vec_sld(a, a, 8));
1226 res = pmin<Packet4f>(b, vec_sld(b, b, 4));
// predux_max: maximum of the lanes.
1232EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
1234 b = pmax<Packet4f>(a, vec_sld(a, a, 8));
1235 res = pmax<Packet4f>(b, vec_sld(b, b, 4));
// Native in-place transpose of a block of four Packet4f, done in two merge rounds:
// first packets 0/2 and 1/3 are merged (high and low halves), then the intermediate
// results t0/t2 and t1/t3 are merged again and written back to the block.
1239EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1240 Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1241 Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1242 Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1243 Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1244 kernel.packet[0] = vec_mergeh(t0, t2);
1245 kernel.packet[1] = vec_mergel(t0, t2);
1246 kernel.packet[2] = vec_mergeh(t1, t3);
1247 kernel.packet[3] = vec_mergel(t1, t3);
// pldexp<Packet4f>: forwards to the generic implementation pldexp_generic.
1253EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(
const Packet4f& a,
const Packet4f& exponent) {
1254 return pldexp_generic(a, exponent);
// pldexp<Packet2d>: hand-written implementation. Visible steps:
//  - clamp `exponent` to [-2099, 2099] and convert it to integer lanes `e`;
//  - b = plogical_shift_right<2>(e); c = (b + 1023) shifted left by 52 bits and viewed as
//    double; out = a * c * c * c;
//  - b = e - 3 * b (three subtractions); c is rebuilt from the new b the same way.
// NOTE(review): the final multiplication by the last factor and the return statement are
// on lines missing from this listing.
1258EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(
const Packet2d& a,
const Packet2d& exponent) {
1260 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
1261 const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
1264 const Packet2l bias = {1023, 1023};
1265 Packet2l b = plogical_shift_right<2>(e);
1266 Packet2d c =
reinterpret_cast<Packet2d
>(plogical_shift_left<52>(b + bias));
1267 Packet2d out = pmul(pmul(pmul(a, c), c), c);
1268 b = psub(psub(psub(e, b), b), b);
1269 c =
reinterpret_cast<Packet2d
>(plogical_shift_left<52>(b + bias));
// prefetch<float>: issues EIGEN_ZVECTOR_PREFETCH on the given address.
1275EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
1276 EIGEN_ZVECTOR_PREFETCH(addr);
// ploadu<Packet4f>: forwards to pload.
1279EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
1280 return pload<Packet4f>(from);
// pstoreu<float>: forwards to pstore.
1283EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
1284 pstore<float>(to, from);
// plset<Packet4f>: broadcasts `a` and adds p4f_COUNTDOWN.
1287EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
1288 return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
// Simulated packet conversions, selected when vec_float is not defined as a macro, or
// __ARCH__ is undefined, or __ARCH__ < 13. A compiler warning announces the fallback.
// Each cast converts lane by lane with a scalar conversion and rebuilds the packet.
// The `template <>` headers and closing braces are on lines missing from this listing.
1291#if !defined(vec_float) || !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 13)
1292#pragma GCC warning "float->int and int->float conversion is simulated. compile for z15 for improved performance"
// int lanes -> float lanes.
1294struct cast_impl<Packet4i, Packet4f> {
1295 EIGEN_DEVICE_FUNC
static inline Packet4f run(
const Packet4i& a) {
1296 return Packet4f{float(a[0]), float(a[1]), float(a[2]), float(a[3])};
// float lanes -> int lanes (scalar float-to-int conversion).
1301struct cast_impl<Packet4f, Packet4i> {
1302 EIGEN_DEVICE_FUNC
static inline Packet4i run(
const Packet4f& a) {
1303 return Packet4i{int(a[0]), int(a[1]), int(a[2]), int(a[3])};
// long long lanes -> double lanes.
1308struct cast_impl<Packet2l, Packet2d> {
1309 EIGEN_DEVICE_FUNC
static inline Packet2d run(
const Packet2l& a) {
return Packet2d{double(a[0]), double(a[1])}; }
// double lanes -> long long lanes (scalar double-to-integer conversion).
1313struct cast_impl<Packet2d, Packet2l> {
1314 EIGEN_DEVICE_FUNC
static inline Packet2l run(
const Packet2d& a) {
1315 return Packet2l{(
long long)(a[0]), (
long long)(a[1])};
// Packet conversions using the vector conversion builtins vec_float, vec_double and
// vec_signed (presumably the #else branch of the simulated casts — the directive itself
// is on a line missing from this listing).
// int lanes -> float lanes.
1320struct cast_impl<Packet4i, Packet4f> {
1321 EIGEN_DEVICE_FUNC
static inline Packet4f run(
const Packet4i& a) {
return vec_float(a); }
// float lanes -> int lanes.
1325struct cast_impl<Packet4f, Packet4i> {
1326 EIGEN_DEVICE_FUNC
static inline Packet4i run(
const Packet4f& a) {
return vec_signed(a); }
// long long lanes -> double lanes.
1330struct cast_impl<Packet2l, Packet2d> {
1331 EIGEN_DEVICE_FUNC
static inline Packet2d run(
const Packet2l& a) {
return vec_double(a); }
// double lanes -> long long lanes.
1335struct cast_impl<Packet2d, Packet2l> {
1336 EIGEN_DEVICE_FUNC
static inline Packet2l run(
const Packet2d& a) {
return vec_signed(a); }
// pset1frombits: reinterprets the integer bit pattern as a floating-point scalar with
// bit_cast (no value conversion) and broadcasts it with pset1.
1341EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
1342 return pset1<Packet4f>(Eigen::numext::bit_cast<float>(from));
1345EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
1346 return pset1<Packet2d>(Eigen::numext::bit_cast<double>(from));
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82