13#ifndef EIGEN_PACKET_MATH_MSA_H
14#define EIGEN_PACKET_MATH_MSA_H
20#include "../../InternalHeaderCheck.h"
// Architecture defaults. Each macro is only defined when the user has not already provided it.
// NOTE(review): the matching #endif lines are not visible in this excerpt.
// Defaults EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD to 8.
26#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
27#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
// Defines EIGEN_HAS_SINGLE_INSTRUCTION_MADD as a value-less flag; the float/double pmadd
// specializations in this file each map to a single fmadd builtin.
30#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
31#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
// Defaults EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS to 32.
34#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
35#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
// EIGEN_MSA_DEBUG: tracing hook. The first definition declares a function-local `firstTime`
// flag and prints file:line:function to std::cout; the second definition is empty.
// NOTE(review): the preprocessor conditional selecting between the two definitions, and the
// remaining lines of the tracing variant, are not visible in this excerpt.
39#define EIGEN_MSA_DEBUG \
40 static bool firstTime = true; \
43 std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
48#define EIGEN_MSA_DEBUG
// Packs four 2-bit lane selectors into the 8-bit immediate used with __builtin_msa_shf_w:
// `a` occupies bits 1:0, `b` bits 3:2, `c` bits 5:4 and `d` bits 7:6.
51#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
// 128-bit packet aliases over the MSA vector types: 4 x float, 4 x int32, 4 x uint32.
53typedef v4f32 Packet4f;
54typedef v4i32 Packet4i;
55typedef v4u32 Packet4ui;
// Helpers declaring a const packet named p4f_/p4i_/p4ui_<NAME> with all four lanes set to X.
57#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = {X, X, X, X}
58#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = {X, X, X, X}
59#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = {X, X, X, X}
// Streams the four float lanes as "[ v0, v1, v2, v3 ]".
// NOTE(review): the return statement and closing brace are not visible in this excerpt.
61inline std::ostream& operator<<(std::ostream& os,
const Packet4f& value) {
62 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// Streams the four int32 lanes as "[ v0, v1, v2, v3 ]".
// NOTE(review): the return statement and closing brace are not visible in this excerpt.
66inline std::ostream& operator<<(std::ostream& os,
const Packet4i& value) {
67 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// Streams the four uint32 lanes as "[ v0, v1, v2, v3 ]".
// NOTE(review): the return statement and closing brace are not visible in this excerpt.
71inline std::ostream& operator<<(std::ostream& os,
const Packet4ui& value) {
72 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// Packet traits for float: the full and the half packet type are both Packet4f.
// HasSin/HasCos/HasTanh/HasErf take the value of EIGEN_FAST_MATH.
// NOTE(review): the other enumerators of this struct are not visible in this excerpt.
77struct packet_traits<float> : default_packet_traits {
78 typedef Packet4f type;
79 typedef Packet4f half;
86 HasSin = EIGEN_FAST_MATH,
87 HasCos = EIGEN_FAST_MATH,
88 HasTanh = EIGEN_FAST_MATH,
89 HasErf = EIGEN_FAST_MATH,
// Packet traits for int32_t: the full and the half packet type are both Packet4i.
// NOTE(review): the enumerators of this struct are not visible in this excerpt.
98struct packet_traits<int32_t> : default_packet_traits {
99 typedef Packet4i type;
100 typedef Packet4i half;
// Reverse traits for Packet4f: masked loads and masked stores are both reported as
// unavailable, and the half packet type is Packet4f itself.
// NOTE(review): the scalar typedef and remaining enumerators are not visible in this excerpt.
111struct unpacket_traits<Packet4f> {
117 masked_load_available =
false,
118 masked_store_available =
false
120 typedef Packet4f half;
// Reverse traits for Packet4i: scalar type is int32_t, masked loads and masked stores are
// both reported as unavailable, and the half packet type is Packet4i itself.
// NOTE(review): the remaining enumerators are not visible in this excerpt.
124struct unpacket_traits<Packet4i> {
125 typedef int32_t type;
130 masked_load_available =
false,
131 masked_store_available =
false
133 typedef Packet4i half;
// Broadcast: builds a packet whose four lanes all hold `from`.
// NOTE(review): the return of `v` is not visible in this excerpt.
137EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
140 Packet4f v = {from, from, from, from};
// Broadcast: replicates the scalar `from` into all four int32 lanes with the MSA fill.w builtin.
145EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
148 return __builtin_msa_fill_w(from);
// Builds a packet whose four lanes all hold `f`.
// NOTE(review): the definition of `f` (presumably *from — confirm) and the return are not
// visible in this excerpt.
152EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(
const float* from) {
156 Packet4f v = {f, f, f, f};
// Reads one int32 from `from` and replicates it into all four lanes (fill.w).
161EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(
const int32_t* from) {
164 return __builtin_msa_fill_w(*from);
// Lane-wise float addition a + b (fadd.w).
168EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
171 return __builtin_msa_fadd_w(a, b);
// Lane-wise int32 addition a + b (addv.w).
175EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
178 return __builtin_msa_addv_w(a, b);
// Linear sequence: returns {a, a+1, a+2, a+3} by adding the constant {0, 1, 2, 3}
// (named `countdown`, although its values ascend) to a broadcast of `a`.
182EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
185 static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
186 return padd(pset1<Packet4f>(a), countdown);
// Linear sequence: returns {a, a+1, a+2, a+3} by adding the constant {0, 1, 2, 3}
// (named `countdown`, although its values ascend) to a broadcast of `a`.
190EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a) {
193 static const Packet4i countdown = {0, 1, 2, 3};
194 return padd(pset1<Packet4i>(a), countdown);
// Lane-wise float subtraction a - b (fsub.w).
198EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
201 return __builtin_msa_fsub_w(a, b);
// Lane-wise int32 subtraction a - b (subv.w).
205EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
208 return __builtin_msa_subv_w(a, b);
// Negates each float lane by flipping bit 31 (the IEEE-754 sign bit) of every 32-bit word.
212EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
215 return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
// Two's-complement negation: NOR of every byte with 0 gives ~a, then 1 is added to each word.
219EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
222 return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
226EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
233EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
// Lane-wise float multiplication a * b (fmul.w).
240EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
243 return __builtin_msa_fmul_w(a, b);
// Lane-wise int32 multiplication a * b (mulv.w).
247EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
250 return __builtin_msa_mulv_w(a, b);
// Lane-wise float division a / b (fdiv.w).
254EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
257 return __builtin_msa_fdiv_w(a, b);
// Lane-wise signed int32 division a / b (div_s.w).
261EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
264 return __builtin_msa_div_s_w(a, b);
// Multiply-add a * b + c. The builtin takes the accumulator first, hence the (c, a, b) order.
268EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
271 return __builtin_msa_fmadd_w(c, a, b);
// Integer multiply-add implemented with inline assembly: `maddv.w` accumulates a * b into
// `value`, which is bound as a read-write ("+f") operand; `a` and `b` are read-only inputs.
// NOTE(review): the declaration/initialisation of `value` (presumably from `c` — confirm) and
// the return are not visible in this excerpt.
275EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
280 __asm__(
"maddv.w %w[value], %w[a], %w[b]\n"
282 : [value]
"+f"(value)
284 : [a]
"f"(a), [b]
"f"(b));
// Bitwise AND of the raw 128-bit contents of a and b (operands reinterpreted as bytes).
289EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
292 return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// Bitwise AND of a and b (operands reinterpreted as bytes for and.v).
296EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
299 return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// Bitwise OR of the raw 128-bit contents of a and b (operands reinterpreted as bytes).
303EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
306 return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// Bitwise OR of a and b (operands reinterpreted as bytes for or.v).
310EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
313 return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// Bitwise XOR of the raw 128-bit contents of a and b (operands reinterpreted as bytes).
317EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
320 return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// Bitwise XOR of a and b (operands reinterpreted as bytes for xor.v).
324EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
327 return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// Computes a & ~b: every byte of b is XORed with 255 (bitwise NOT) before the AND.
331EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
334 return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
// Computes a & ~b: every byte of b is XORed with 255 (bitwise NOT) before the AND.
338EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
341 return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
// Lane-wise float minimum. Two alternative implementations are visible.
// NOTE(review): the preprocessor conditional that selects between them is not visible in
// this excerpt.
345EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
// Variant 1: hardware fmin.w.
350 return __builtin_msa_fmin_w(a, b);
// Variant 2: build a mask that is set where a is NaN or a < b, then pick `a` where the
// mask is set and `b` elsewhere, so a NaN in either operand reaches the result.
353 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
354 Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
355 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
// Lane-wise signed int32 minimum (min_s.w).
360EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
363 return __builtin_msa_min_s_w(a, b);
// Lane-wise float maximum. Two alternative implementations are visible.
// NOTE(review): the preprocessor conditional that selects between them is not visible in
// this excerpt.
367EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
// Variant 1: hardware fmax.w.
372 return __builtin_msa_fmax_w(a, b);
// Variant 2: build a mask that is set where a is NaN or b < a, then pick `a` where the
// mask is set and `b` elsewhere, so a NaN in either operand reaches the result.
375 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
376 Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
377 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
// Lane-wise signed int32 maximum (max_s.w).
382EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
385 return __builtin_msa_max_s_w(a, b);
// Aligned load of four floats starting at `from` (ld.w at byte offset 0). The
// EIGEN_DEBUG_ALIGNED_LOAD hook macro is expanded immediately before the return. The
// const_cast only satisfies the builtin's non-const pointer parameter.
389EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
392 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet4f)__builtin_msa_ld_w(
const_cast<float*
>(from), 0);
// Aligned load of four int32 values starting at `from` (ld.w at byte offset 0). The
// EIGEN_DEBUG_ALIGNED_LOAD hook macro is expanded immediately before the return.
396EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from) {
399 EIGEN_DEBUG_ALIGNED_LOAD
return __builtin_msa_ld_w(
const_cast<int32_t*
>(from), 0);
// Unaligned load of four floats. Uses the same ld.w builtin as the aligned pload; only the
// debug hook macro (EIGEN_DEBUG_UNALIGNED_LOAD) differs.
403EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
406 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4f)__builtin_msa_ld_w(
const_cast<float*
>(from), 0);
// Unaligned load of four int32 values. Uses the same ld.w builtin as the aligned pload; only
// the debug hook macro (EIGEN_DEBUG_UNALIGNED_LOAD) differs.
410EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from) {
413 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4i)__builtin_msa_ld_w(
const_cast<int32_t*
>(from), 0);
// Loads two floats and duplicates each: result is {from[0], from[0], from[1], from[1]}.
// Each scalar is first broadcast to a full packet; ilvr.d then combines the low 64-bit half
// of v0 (result low half) with the low 64-bit half of v1 (result high half).
417EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
420 float f0 = from[0], f1 = from[1];
421 Packet4f v0 = {f0, f0, f0, f0};
422 Packet4f v1 = {f1, f1, f1, f1};
423 return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
// Loads two int32 values and duplicates each: result is {from[0], from[0], from[1], from[1]}.
// Each scalar is first broadcast to a full packet; ilvr.d then combines the low 64-bit half
// of v0 (result low half) with the low 64-bit half of v1 (result high half).
427EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from) {
430 int32_t i0 = from[0], i1 = from[1];
431 Packet4i v0 = {i0, i0, i0, i0};
432 Packet4i v1 = {i1, i1, i1, i1};
433 return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
// Aligned store of four floats to `to` (st.w at byte offset 0). The packet is reinterpreted
// as Packet4i because the builtin takes an integer vector. EIGEN_DEBUG_ALIGNED_STORE is
// expanded immediately before the store.
437EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
440 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
// Aligned store of four int32 values to `to` (st.w at byte offset 0). EIGEN_DEBUG_ALIGNED_STORE
// is expanded immediately before the store.
444EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from) {
447 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
// Unaligned store of four floats to `to`. Uses the same st.w builtin as the aligned pstore;
// only the debug hook macro (EIGEN_DEBUG_UNALIGNED_STORE) differs.
451EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
454 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
// Unaligned store of four int32 values to `to`. Uses the same st.w builtin as the aligned
// pstore; only the debug hook macro (EIGEN_DEBUG_UNALIGNED_STORE) differs.
458EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from) {
461 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
// Strided gather of four floats: lanes 2 and 3 are read from from[2 * stride] and
// from[3 * stride]; the packet starts out with every lane equal to `f`.
// NOTE(review): the definition of `f` (presumably *from — confirm), the lane-1 assignment
// and the return are not visible in this excerpt.
465EIGEN_DEVICE_FUNC
inline Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
469 Packet4f v = {f, f, f, f};
471 v[2] = from[2 * stride];
472 v[3] = from[3 * stride];
// Strided gather of four int32 values: lanes 2 and 3 are read from from[2 * stride] and
// from[3 * stride]; the packet starts out with every lane equal to `i`.
// NOTE(review): the definition of `i` (presumably *from — confirm), the lane-1 assignment
// and the return are not visible in this excerpt.
477EIGEN_DEVICE_FUNC
inline Packet4i pgather<int32_t, Packet4i>(
const int32_t* from,
Index stride) {
481 Packet4i v = {i, i, i, i};
483 v[2] = from[2 * stride];
484 v[3] = from[3 * stride];
489EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
502EIGEN_DEVICE_FUNC
inline void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from,
Index stride) {
// Issues a prefetch hint for the memory at `addr` via the compiler builtin.
515EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
518 __builtin_prefetch(addr);
// Issues a prefetch hint for the memory at `addr` via the compiler builtin.
522EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) {
525 __builtin_prefetch(addr);
529EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
536EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) {
// Reverses lane order: the shuffle selectors (3, 2, 1, 0) make result lane i read source
// lane 3 - i.
543EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
546 return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
// Reverses lane order: the shuffle selectors (3, 2, 1, 0) make result lane i read source
// lane 3 - i.
550EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
553 return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
// Absolute value of each float lane, obtained by clearing bit 31 (the sign bit) of every word.
557EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
560 return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
// Absolute value of each int32 lane: add_a.w adds the absolute values of its operands, so
// |0| + |a| yields |a|.
564EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
567 Packet4i zero = __builtin_msa_ldi_w(0);
568 return __builtin_msa_add_a_w(zero, a);
// Horizontal sum of the four float lanes in two shuffle+add steps: first each lane is added
// to the lane two positions away (halves swapped), then to its neighbour (pairs swapped),
// leaving the total in every lane.
// NOTE(review): the extraction/return of the scalar result is not visible in this excerpt.
572EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
575 Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
576 s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal sum of the four int32 lanes in two shuffle+add steps: first each lane is added
// to the lane two positions away (halves swapped), then to its neighbour (pairs swapped),
// leaving the total in every lane.
// NOTE(review): the extraction/return of the scalar result is not visible in this excerpt.
581EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a) {
584 Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
585 s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal product of the four float lanes in two shuffle+multiply steps (halves swapped,
// then pairs swapped), leaving the product in every lane.
// NOTE(review): the extraction/return of the scalar result is not visible in this excerpt.
592EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
595 Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
596 p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal product of the four int32 lanes in two shuffle+multiply steps (halves swapped,
// then pairs swapped), leaving the product in every lane.
// NOTE(review): the extraction/return of the scalar result is not visible in this excerpt.
601EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a) {
604 Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
605 p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal minimum of the four float lanes, with explicit NaN handling.
// NOTE(review): any preprocessor conditionals around the NaN-handling lines and the final
// return are not visible in this excerpt.
611EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
// Copy of `a` with its two 64-bit halves exchanged.
615 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// Per-lane "unordered" flags: set where a lane or its counterpart in the other half is NaN.
619 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
// Turn the flags into a select mask: a 64-bit half becomes all-ones only if none of its
// flags was set (i.e. no NaN was seen).
621 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
// Two fmin.w steps (halves swapped, then pairs swapped) reduce the four lanes.
624 Packet4f v = __builtin_msa_fmin_w(a, swapped);
625 v = __builtin_msa_fmin_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Where the mask is clear (a NaN was seen) substitute the quiet-NaN pattern 0x7FC00000;
// elsewhere keep the computed minimum.
628 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
629 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
// Horizontal minimum of the four int32 lanes in two shuffle+pmin steps (halves swapped, then
// pairs swapped), leaving the minimum in every lane.
// NOTE(review): the extraction/return of the scalar result is not visible in this excerpt.
635EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a) {
638 Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
639 m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal maximum of the four float lanes, with explicit NaN handling.
// NOTE(review): any preprocessor conditionals around the NaN-handling lines and the final
// return are not visible in this excerpt.
645EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
// Copy of `a` with its two 64-bit halves exchanged.
649 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// Per-lane "unordered" flags: set where a lane or its counterpart in the other half is NaN.
653 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
// Turn the flags into a select mask: a 64-bit half becomes all-ones only if none of its
// flags was set (i.e. no NaN was seen).
655 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
// Two fmax.w steps (halves swapped, then pairs swapped) reduce the four lanes.
658 Packet4f v = __builtin_msa_fmax_w(a, swapped);
659 v = __builtin_msa_fmax_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Where the mask is clear (a NaN was seen) substitute the quiet-NaN pattern 0x7FC00000;
// elsewhere keep the computed maximum.
662 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
663 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
// Horizontal maximum of the four int32 lanes in two shuffle+pmax steps (halves swapped, then
// pairs swapped), leaving the maximum in every lane.
// NOTE(review): the extraction/return of the scalar result is not visible in this excerpt.
669EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a) {
672 Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
673 m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Streams a block of four Packet4f values, one packet per line, wrapped in "[ ... ]".
// Rows are separated with std::endl, so the stream is flushed after each of the first three.
// NOTE(review): the return statement is not visible in this excerpt.
677inline std::ostream& operator<<(std::ostream& os,
const PacketBlock<Packet4f, 4>& value) {
678 os <<
"[ " << value.packet[0] <<
"," << std::endl
679 <<
" " << value.packet[1] <<
"," << std::endl
680 <<
" " << value.packet[2] <<
"," << std::endl
681 <<
" " << value.packet[3] <<
" ]";
// In-place transpose of a 4x4 float block held as four packets.
685EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
688 v4i32 tmp1, tmp2, tmp3, tmp4;
// Step 1: interleave 32-bit lanes of row pairs (0,1) and (2,3); ilvr takes the low halves,
// ilvl the high halves.
690 tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
691 tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
692 tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
693 tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
// Step 2: combine 64-bit halves of the intermediates (low halves via ilvr.d, high halves
// via ilvod.d) to form the four transposed rows.
695 kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
696 kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
697 kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
698 kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
// Streams a block of four Packet4i values, one packet per line, wrapped in "[ ... ]".
// Rows are separated with std::endl, so the stream is flushed after each of the first three.
// NOTE(review): the return statement is not visible in this excerpt.
701inline std::ostream& operator<<(std::ostream& os,
const PacketBlock<Packet4i, 4>& value) {
702 os <<
"[ " << value.packet[0] <<
"," << std::endl
703 <<
" " << value.packet[1] <<
"," << std::endl
704 <<
" " << value.packet[2] <<
"," << std::endl
705 <<
" " << value.packet[3] <<
" ]";
// In-place transpose of a 4x4 int32 block held as four packets.
709EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
712 v4i32 tmp1, tmp2, tmp3, tmp4;
// Step 1: interleave 32-bit lanes of row pairs (0,1) and (2,3); ilvr takes the low halves,
// ilvl the high halves.
714 tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
715 tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
716 tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
717 tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
// Step 2: combine 64-bit halves of the intermediates (low halves via ilvr.d, high halves
// via ilvod.d) to form the four transposed rows.
719 kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
720 kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
721 kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
722 kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
// Lane-wise float square root (fsqrt.w).
726EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
729 return __builtin_msa_fsqrt_w(a);
// Lane-wise reciprocal square root. Two alternative implementations are visible.
// NOTE(review): the preprocessor conditional that selects between them is not visible in
// this excerpt.
733EIGEN_STRONG_INLINE Packet4f prsqrt(
const Packet4f& a) {
// Variant 1: hardware frsqrt.w.
737 return __builtin_msa_frsqrt_w(a);
// Variant 2: 1.0f / sqrt(a); the packet of ones is the integer constant 1 converted to float.
739 Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
740 return pdiv(ones, psqrt(a));
// Lane-wise floor via inline assembly: reads MSA control register $1 into old_mode, sets its
// two lowest bits to 3 (per the MSA spec, rounding mode "toward minus infinity"), rounds
// with frint.w, then restores the saved control value.
// NOTE(review): the declaration of `v`, the rest of the asm operand list and the return are
// not visible in this excerpt.
745EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
747 int32_t old_mode, new_mode;
749 "cfcmsa %[old_mode], $1\n"
750 "ori %[new_mode], %[old_mode], 3\n"
751 "ctcmsa $1, %[new_mode]\n"
752 "frint.w %w[v], %w[v]\n"
753 "ctcmsa $1, %[old_mode]\n"
755 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Lane-wise ceil via inline assembly: reads MSA control register $1 into old_mode, forces
// its two lowest bits to 2 (ori 3 then xori 1; per the MSA spec, rounding mode "toward plus
// infinity"), rounds with frint.w, then restores the saved control value.
// NOTE(review): the declaration of `v`, the rest of the asm operand list and the return are
// not visible in this excerpt.
764EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
766 int32_t old_mode, new_mode;
768 "cfcmsa %[old_mode], $1\n"
769 "ori %[new_mode], %[old_mode], 3\n"
770 "xori %[new_mode], %[new_mode], 1\n"
771 "ctcmsa $1, %[new_mode]\n"
772 "frint.w %w[v], %w[v]\n"
773 "ctcmsa $1, %[old_mode]\n"
775 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Lane-wise rounding via inline assembly: reads MSA control register $1 into old_mode, clears
// its two lowest bits (ori 3 then xori 3; per the MSA spec, rounding mode "to nearest"),
// rounds with frint.w, then restores the saved control value.
// NOTE(review): round-to-nearest in the MSA spec resolves ties to even, which differs from
// std::round on exact halves — confirm this is intended. The declaration of `v`, the rest of
// the asm operand list and the return are not visible in this excerpt.
784EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a) {
786 int32_t old_mode, new_mode;
788 "cfcmsa %[old_mode], $1\n"
789 "ori %[new_mode], %[old_mode], 3\n"
790 "xori %[new_mode], %[new_mode], 3\n"
791 "ctcmsa $1, %[new_mode]\n"
792 "frint.w %w[v], %w[v]\n"
793 "ctcmsa $1, %[old_mode]\n"
795 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// 128-bit packet aliases over the MSA vector types: 2 x double, 2 x int64, 2 x uint64.
805typedef v2f64 Packet2d;
806typedef v2i64 Packet2l;
807typedef v2u64 Packet2ul;
// Helpers declaring a const packet named p2d_/p2l_/p2ul_<NAME> with both lanes set to X.
809#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = {X, X}
810#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = {X, X}
811#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = {X, X}
// Streams the two double lanes as "[ v0, v1 ]".
// NOTE(review): the return statement and closing brace are not visible in this excerpt.
813inline std::ostream& operator<<(std::ostream& os,
const Packet2d& value) {
814 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// Streams the two int64 lanes as "[ v0, v1 ]".
// NOTE(review): the return statement and closing brace are not visible in this excerpt.
818inline std::ostream& operator<<(std::ostream& os,
const Packet2l& value) {
819 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// Streams the two uint64 lanes as "[ v0, v1 ]".
// NOTE(review): the return statement and closing brace are not visible in this excerpt.
823inline std::ostream& operator<<(std::ostream& os,
const Packet2ul& value) {
824 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// Packet traits for double: the full and the half packet type are both Packet2d.
// NOTE(review): the enumerators of this struct are not visible in this excerpt.
829struct packet_traits<double> : default_packet_traits {
830 typedef Packet2d type;
831 typedef Packet2d half;
// Reverse traits for Packet2d: masked loads and masked stores are both reported as
// unavailable, and the half packet type is Packet2d itself.
// NOTE(review): the scalar typedef and remaining enumerators are not visible in this excerpt.
845struct unpacket_traits<Packet2d> {
851 masked_load_available =
false,
852 masked_store_available =
false
854 typedef Packet2d half;
// Broadcast: builds a packet whose two lanes both hold `from`.
// NOTE(review): the return of `value` is not visible in this excerpt.
858EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
861 Packet2d value = {from, from};
// Lane-wise double addition a + b (fadd.d).
866EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
869 return __builtin_msa_fadd_d(a, b);
// Linear sequence: returns {a, a+1} by adding the constant {0, 1} (named `countdown`,
// although its values ascend) to a broadcast of `a`.
873EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
876 static const Packet2d countdown = {0.0, 1.0};
877 return padd(pset1<Packet2d>(a), countdown);
// Lane-wise double subtraction a - b (fsub.d).
881EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
884 return __builtin_msa_fsub_d(a, b);
// Negates each double lane by flipping bit 63 (the IEEE-754 sign bit) of every 64-bit element.
888EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
891 return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
895EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
// Lane-wise double multiplication a * b (fmul.d).
902EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
905 return __builtin_msa_fmul_d(a, b);
// Lane-wise double division a / b (fdiv.d).
909EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
912 return __builtin_msa_fdiv_d(a, b);
// Multiply-add a * b + c. The builtin takes the accumulator first, hence the (c, a, b) order.
916EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
919 return __builtin_msa_fmadd_d(c, a, b);
// Bitwise AND of the raw 128-bit contents of a and b (operands reinterpreted as bytes).
925EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
928 return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// Bitwise OR of the raw 128-bit contents of a and b (operands reinterpreted as bytes).
932EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
935 return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// Bitwise XOR of the raw 128-bit contents of a and b (operands reinterpreted as bytes).
939EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
942 return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// Computes a & ~b: every byte of b is XORed with 255 (bitwise NOT) before the AND.
946EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
949 return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
// Aligned load of two doubles starting at `from` (ld.d at byte offset 0). The const_cast
// only satisfies the builtin's non-const pointer parameter.
// Fix: this is the aligned-load primitive, so it must expand EIGEN_DEBUG_ALIGNED_LOAD, as the
// Packet4f/Packet4i pload specializations do. It previously expanded
// EIGEN_DEBUG_UNALIGNED_LOAD, so builds that define these hooks attributed aligned double
// loads to the unaligned hook.
953EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
956 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet2d)__builtin_msa_ld_d(
const_cast<double*
>(from), 0);
// Lane-wise double minimum. Two alternative implementations are visible.
// NOTE(review): the preprocessor conditional that selects between them is not visible in
// this excerpt.
960EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
// Variant 1: hardware fmin.d.
965 return __builtin_msa_fmin_d(a, b);
// Variant 2: build a mask that is set where a is NaN or a < b, then pick `a` where the
// mask is set and `b` elsewhere, so a NaN in either operand reaches the result.
968 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
969 v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
970 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
// Lane-wise double maximum. Two alternative implementations are visible.
// NOTE(review): the preprocessor conditional that selects between them is not visible in
// this excerpt.
975EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
// Variant 1: hardware fmax.d.
980 return __builtin_msa_fmax_d(a, b);
// Variant 2: build a mask that is set where a is NaN or b < a, then pick `a` where the
// mask is set and `b` elsewhere, so a NaN in either operand reaches the result.
983 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
984 v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
985 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
// Unaligned load of two doubles starting at `from` (ld.d at byte offset 0). The
// EIGEN_DEBUG_UNALIGNED_LOAD hook macro is expanded immediately before the return.
990EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
993 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d)__builtin_msa_ld_d(
const_cast<double*
>(from), 0);
// Loads one double and duplicates it: both lanes are initialised from *from.
// NOTE(review): the return of `value` is not visible in this excerpt.
997EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
1000 Packet2d value = {*from, *from};
// Aligned store of two doubles to `to` (st.d at byte offset 0). The packet is reinterpreted
// as v2i64 because the builtin takes an integer vector. EIGEN_DEBUG_ALIGNED_STORE is
// expanded immediately before the store.
1005EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
1008 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
// Unaligned store of two doubles to `to`. Uses the same st.d builtin as the aligned pstore;
// only the debug hook macro (EIGEN_DEBUG_UNALIGNED_STORE) differs.
1012EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
1015 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1019EIGEN_DEVICE_FUNC
inline Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
1030EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride) {
// Issues a prefetch hint for the memory at `addr` via the compiler builtin.
1039EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
1042 __builtin_prefetch(addr);
1046EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
// Swaps the two double lanes. Done as a 32-bit word shuffle with selectors (2, 3, 0, 1),
// which exchanges the low and high 64-bit halves of the register.
1053EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
1056 return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// Absolute value of each double lane, obtained by clearing bit 63 (the sign bit).
1060EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
1063 return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
// Horizontal sum of the two double lanes: adding `a` to its lane-swapped copy leaves
// a[0] + a[1] in both lanes.
// NOTE(review): the extraction/return of the scalar result is not visible in this excerpt.
1067EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
1070 Packet2d s = padd(a, preverse(a));
// Horizontal product of the two double lanes: multiplying `a` by its lane-swapped copy leaves
// a[0] * a[1] in both lanes.
// NOTE(review): the extraction/return of the scalar result is not visible in this excerpt.
1077EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
1080 Packet2d p = pmul(a, preverse(a));
// Horizontal minimum of the two double lanes. Two alternative implementations are visible.
// NOTE(review): the preprocessor conditional selecting between them and the return of the
// vector variant are not visible in this excerpt.
1086EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
// Variant 1: fmin.d of `a` against its lane-swapped copy.
1090 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1091 Packet2d v = __builtin_msa_fmin_d(a, swapped);
// Variant 2: scalar comparison. Returns a0 if it is NaN or smaller than a1, otherwise a1
// (so a NaN in a1 is returned as well, since a0 < NaN is false).
1094 double a0 = a[0], a1 = a[1];
1095 return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
// Horizontal maximum of the two double lanes. Two alternative implementations are visible.
// NOTE(review): the preprocessor conditional selecting between them and the return of the
// vector variant are not visible in this excerpt.
1101EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
// Variant 1: fmax.d of `a` against its lane-swapped copy.
1105 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1106 Packet2d v = __builtin_msa_fmax_d(a, swapped);
// Variant 2: scalar comparison. Returns a0 if it is NaN or greater than a1, otherwise a1
// (so a NaN in a1 is returned as well, since a0 > NaN is false).
1109 double a0 = a[0], a1 = a[1];
1110 return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
// Lane-wise double square root (fsqrt.d).
1115EIGEN_STRONG_INLINE Packet2d psqrt(
const Packet2d& a) {
1118 return __builtin_msa_fsqrt_d(a);
// Lane-wise reciprocal square root. Two alternative implementations are visible.
// NOTE(review): the preprocessor conditional that selects between them is not visible in
// this excerpt.
1122EIGEN_STRONG_INLINE Packet2d prsqrt(
const Packet2d& a) {
// Variant 1: hardware frsqrt.d.
1126 return __builtin_msa_frsqrt_d(a);
// Variant 2: 1.0 / sqrt(a); the packet of ones is the integer constant 1 converted to double.
1128 Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
1129 return pdiv(ones, psqrt(a));
// Streams a block of two Packet2d values, one packet per line, wrapped in "[ ... ]".
// The rows are separated with std::endl, which also flushes the stream.
// NOTE(review): the return statement is not visible in this excerpt.
1133inline std::ostream& operator<<(std::ostream& os,
const PacketBlock<Packet2d, 2>& value) {
1134 os <<
"[ " << value.packet[0] <<
"," << std::endl <<
" " << value.packet[1] <<
" ]";
// In-place transpose of a 2x2 double block: ilvev.d gathers lane 0 of both rows into the new
// row 0, ilvod.d gathers lane 1 of both rows into the new row 1. Both results are computed
// into temporaries before either input packet is overwritten.
1138EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1141 Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1142 Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1143 kernel.packet[0] = trn1;
1144 kernel.packet[1] = trn2;
// Lane-wise floor via inline assembly: reads MSA control register $1 into old_mode, sets its
// two lowest bits to 3 (per the MSA spec, rounding mode "toward minus infinity"), rounds
// with frint.d, then restores the saved control value.
// NOTE(review): the declaration of `v`, the rest of the asm operand list and the return are
// not visible in this excerpt.
1148EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a) {
1150 int32_t old_mode, new_mode;
1152 "cfcmsa %[old_mode], $1\n"
1153 "ori %[new_mode], %[old_mode], 3\n"
1154 "ctcmsa $1, %[new_mode]\n"
1155 "frint.d %w[v], %w[v]\n"
1156 "ctcmsa $1, %[old_mode]\n"
1158 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Lane-wise ceil via inline assembly: reads MSA control register $1 into old_mode, forces
// its two lowest bits to 2 (ori 3 then xori 1; per the MSA spec, rounding mode "toward plus
// infinity"), rounds with frint.d, then restores the saved control value.
// NOTE(review): the declaration of `v`, the rest of the asm operand list and the return are
// not visible in this excerpt.
1167EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a) {
1169 int32_t old_mode, new_mode;
1171 "cfcmsa %[old_mode], $1\n"
1172 "ori %[new_mode], %[old_mode], 3\n"
1173 "xori %[new_mode], %[new_mode], 1\n"
1174 "ctcmsa $1, %[new_mode]\n"
1175 "frint.d %w[v], %w[v]\n"
1176 "ctcmsa $1, %[old_mode]\n"
1178 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Lane-wise rounding via inline assembly: reads MSA control register $1 into old_mode, clears
// its two lowest bits (ori 3 then xori 3; per the MSA spec, rounding mode "to nearest"),
// rounds with frint.d, then restores the saved control value.
// NOTE(review): round-to-nearest in the MSA spec resolves ties to even, which differs from
// std::round on exact halves — confirm this is intended. The declaration of `v`, the rest of
// the asm operand list and the return are not visible in this excerpt.
1187EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(
const Packet2d& a) {
1189 int32_t old_mode, new_mode;
1191 "cfcmsa %[old_mode], $1\n"
1192 "ori %[new_mode], %[old_mode], 3\n"
1193 "xori %[new_mode], %[new_mode], 3\n"
1194 "ctcmsa $1, %[new_mode]\n"
1195 "frint.d %w[v], %w[v]\n"
1196 "ctcmsa $1, %[old_mode]\n"
1198 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82