11#ifndef EIGEN_PACKET_MATH_LSX_H
12#define EIGEN_PACKET_MATH_LSX_H
15#include "../../InternalHeaderCheck.h"
21#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
22#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
25#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
26#if EIGEN_ARCH_LOONGARCH64
27#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
31#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
32#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
35typedef __m128 Packet4f;
36typedef __m128d Packet2d;
38typedef eigen_packet_wrapper<__m128i, 0> Packet16c;
39typedef eigen_packet_wrapper<__m128i, 1> Packet8s;
40typedef eigen_packet_wrapper<__m128i, 2> Packet4i;
41typedef eigen_packet_wrapper<__m128i, 3> Packet2l;
42typedef eigen_packet_wrapper<__m128i, 4> Packet16uc;
43typedef eigen_packet_wrapper<__m128i, 5> Packet8us;
44typedef eigen_packet_wrapper<__m128i, 6> Packet4ui;
45typedef eigen_packet_wrapper<__m128i, 7> Packet2ul;
48struct is_arithmetic<__m128> {
49 enum { value =
true };
52struct is_arithmetic<__m128i> {
53 enum { value =
true };
56struct is_arithmetic<__m128d> {
57 enum { value =
true };
60struct is_arithmetic<Packet16c> {
61 enum { value =
true };
64struct is_arithmetic<Packet8s> {
65 enum { value =
true };
68struct is_arithmetic<Packet4i> {
69 enum { value =
true };
72struct is_arithmetic<Packet2l> {
73 enum { value =
true };
76struct is_arithmetic<Packet16uc> {
77 enum { value =
false };
80struct is_arithmetic<Packet8us> {
81 enum { value =
false };
84struct is_arithmetic<Packet4ui> {
85 enum { value =
false };
88struct is_arithmetic<Packet2ul> {
89 enum { value =
false };
92EIGEN_ALWAYS_INLINE Packet4f make_packet4f(
float a,
float b,
float c,
float d) {
93 float from[4] = {a, b, c, d};
94 return (Packet4f)__lsx_vld(from, 0);
97EIGEN_STRONG_INLINE Packet4f shuffle1(
const Packet4f& m,
int mask) {
98 const float* a =
reinterpret_cast<const float*
>(&m);
100 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
104template <
bool interleave>
105EIGEN_STRONG_INLINE Packet4f shuffle2(
const Packet4f& m,
const Packet4f& n,
int mask) {
106 const float* a =
reinterpret_cast<const float*
>(&m);
107 const float* b =
reinterpret_cast<const float*
>(&n);
109 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
114EIGEN_STRONG_INLINE Packet4f shuffle2<true>(
const Packet4f& m,
const Packet4f& n,
int mask) {
115 const float* a =
reinterpret_cast<const float*
>(&m);
116 const float* b =
reinterpret_cast<const float*
>(&n);
118 make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
122EIGEN_STRONG_INLINE
static int eigen_lsx_shuffle_mask(
int p,
int q,
int r,
int s) {
123 return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
126EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(
const Packet4f& a,
int p,
int q,
int r,
int s) {
127 return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s));
129EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(
const Packet4f& a,
const Packet4f& b,
int p,
int q,
int r,
int s) {
130 return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(p, q, r, s));
132EIGEN_STRONG_INLINE Packet4f vec4f_movelh(
const Packet4f& a,
const Packet4f& b) {
133 return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1));
135EIGEN_STRONG_INLINE Packet4f vec4f_movehl(
const Packet4f& a,
const Packet4f& b) {
136 return shuffle2<false>(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3));
138EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(
const Packet4f& a,
const Packet4f& b) {
139 return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1));
141EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(
const Packet4f& a,
const Packet4f& b) {
142 return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3));
145EIGEN_ALWAYS_INLINE Packet2d make_packet2d(
double a,
double b) {
146 double from[2] = {a, b};
147 return (Packet2d)__lsx_vld(from, 0);
150EIGEN_STRONG_INLINE Packet2d shuffle(
const Packet2d& m,
const Packet2d& n,
int mask) {
151 const double* a =
reinterpret_cast<const double*
>(&m);
152 const double* b =
reinterpret_cast<const double*
>(&n);
153 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
157EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(
const Packet2d& a,
const Packet2d& b,
int mask) {
158 return shuffle(a, b, mask);
160EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(
const Packet2d& a,
const Packet2d& b) {
return shuffle(a, b, 0); }
161EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(
const Packet2d& a,
const Packet2d& b) {
return shuffle(a, b, 3); }
164struct packet_traits<int8_t> : default_packet_traits {
165 typedef Packet16c type;
166 typedef Packet16c half;
178struct packet_traits<int16_t> : default_packet_traits {
179 typedef Packet8s type;
180 typedef Packet8s half;
193struct packet_traits<int32_t> : default_packet_traits {
194 typedef Packet4i type;
195 typedef Packet4i half;
208struct packet_traits<int64_t> : default_packet_traits {
209 typedef Packet2l type;
210 typedef Packet2l half;
223struct packet_traits<uint8_t> : default_packet_traits {
224 typedef Packet16uc type;
225 typedef Packet16uc half;
238struct packet_traits<uint16_t> : default_packet_traits {
239 typedef Packet8us type;
240 typedef Packet8us half;
254struct packet_traits<uint32_t> : default_packet_traits {
255 typedef Packet4ui type;
256 typedef Packet4ui half;
270struct packet_traits<uint64_t> : default_packet_traits {
271 typedef Packet2ul type;
272 typedef Packet2ul half;
286struct packet_traits<float> : default_packet_traits {
287 typedef Packet4f type;
288 typedef Packet4f half;
305struct packet_traits<double> : default_packet_traits {
306 typedef Packet2d type;
307 typedef Packet2d half;
323struct unpacket_traits<Packet16c> {
325 typedef Packet16c half;
330 masked_load_available =
false,
331 masked_store_available =
false
335struct unpacket_traits<Packet8s> {
336 typedef int16_t type;
337 typedef Packet8s half;
342 masked_load_available =
false,
343 masked_store_available =
false
347struct unpacket_traits<Packet4i> {
348 typedef int32_t type;
349 typedef Packet4i half;
354 masked_load_available =
false,
355 masked_store_available =
false
359struct unpacket_traits<Packet2l> {
360 typedef int64_t type;
361 typedef Packet2l half;
366 masked_load_available =
false,
367 masked_store_available =
false
371struct unpacket_traits<Packet16uc> {
372 typedef uint8_t type;
373 typedef Packet16uc half;
378 masked_load_available =
false,
379 masked_store_available =
false
383struct unpacket_traits<Packet8us> {
384 typedef uint16_t type;
385 typedef Packet8us half;
390 masked_load_available =
false,
391 masked_store_available =
false
395struct unpacket_traits<Packet4ui> {
396 typedef uint32_t type;
397 typedef Packet4ui half;
402 masked_load_available =
false,
403 masked_store_available =
false
407struct unpacket_traits<Packet2ul> {
408 typedef uint64_t type;
409 typedef Packet2ul half;
414 masked_load_available =
false,
415 masked_store_available =
false
419struct unpacket_traits<Packet4f> {
421 typedef Packet4f half;
422 typedef Packet4i integer_packet;
427 masked_load_available =
false,
428 masked_store_available =
false
432struct unpacket_traits<Packet2d> {
434 typedef Packet2d half;
435 typedef Packet2l integer_packet;
440 masked_load_available =
false,
441 masked_store_available =
false
446EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(
const int8_t& from) {
447 return __lsx_vreplgr2vr_b(from);
450EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(
const int16_t& from) {
451 return __lsx_vreplgr2vr_h(from);
454EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
455 return __lsx_vreplgr2vr_w(from);
458EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(
const int64_t& from) {
459 return __lsx_vreplgr2vr_d(from);
462EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(
const uint8_t& from) {
463 return __lsx_vreplgr2vr_b(from);
466EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(
const uint16_t& from) {
467 return __lsx_vreplgr2vr_h(from);
470EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(
const uint32_t& from) {
471 return __lsx_vreplgr2vr_w(from);
474EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(
const uint64_t& from) {
475 return __lsx_vreplgr2vr_d(from);
478EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
479 Packet4f v = {from, from, from, from};
483EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
484 Packet2d v = {from, from};
489EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
490 return reinterpret_cast<__m128
>((__m128i)pset1<Packet4ui>(from));
493EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
494 return reinterpret_cast<__m128d
>((__m128i)pset1<Packet2ul>(from));
498EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(
const int8_t& a) {
499 const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
500 return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(countdown, 0));
503EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(
const int16_t& a) {
504 const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
505 return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(countdown, 0));
508EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a) {
509 const int32_t countdown[] = {0, 1, 2, 3};
510 return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(countdown, 0));
513EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(
const int64_t& a) {
514 const int64_t countdown[] = {0, 1};
515 return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(countdown, 0));
518EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(
const uint8_t& a) {
519 const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
520 return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(countdown, 0));
523EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(
const uint16_t& a) {
524 const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
525 return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(countdown, 0));
528EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(
const uint32_t& a) {
529 const uint32_t countdown[] = {0, 1, 2, 3};
530 return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(countdown, 0));
533EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(
const uint64_t& a) {
534 const uint64_t countdown[] = {0, 1};
535 return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(countdown, 0));
538EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
539 static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
540 return __lsx_vfadd_s(pset1<Packet4f>(a), countdown);
543EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
544 static const Packet2d countdown = {0.0f, 1.0f};
545 return __lsx_vfadd_d(pset1<Packet2d>(a), countdown);
549EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
550 return __lsx_vadd_b(a, b);
553EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
554 return __lsx_vadd_h(a, b);
557EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
558 return __lsx_vadd_w(a, b);
561EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
562 return __lsx_vadd_d(a, b);
565EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
566 return __lsx_vadd_b(a, b);
569EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
570 return __lsx_vadd_h(a, b);
573EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
574 return __lsx_vadd_w(a, b);
577EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
578 return __lsx_vadd_d(a, b);
581EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
582 return __lsx_vfadd_s(a, b);
585EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
586 return __lsx_vfadd_d(a, b);
590EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
591 return __lsx_vsub_b(a, b);
594EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
595 return __lsx_vsub_h(a, b);
598EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
599 return __lsx_vsub_w(a, b);
602EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
603 return __lsx_vsub_d(a, b);
606EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
607 return __lsx_vsub_b(a, b);
610EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
611 return __lsx_vsub_h(a, b);
614EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
615 return __lsx_vsub_w(a, b);
618EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
619 return __lsx_vsub_d(a, b);
622EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
623 return __lsx_vfsub_s(a, b);
626EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
627 return __lsx_vfsub_d(a, b);
631EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b);
633EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
634 const Packet4f mask =
635 make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
636 return padd(a, pxor(mask, b));
639EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b);
641EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
642 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
643 return padd(a, pxor(mask, b));
647EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
648 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000),
649 numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000));
650 return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
653EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
655 make_packet2d(numext::bit_cast<double>(0x8000000000000000), numext::bit_cast<double>(0x8000000000000000));
656 return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
659EIGEN_STRONG_INLINE Packet16c pnegate(
const Packet16c& a) {
660 return __lsx_vneg_b(a);
663EIGEN_STRONG_INLINE Packet8s pnegate(
const Packet8s& a) {
664 return __lsx_vneg_h(a);
667EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
668 return __lsx_vneg_w(a);
671EIGEN_STRONG_INLINE Packet2l pnegate(
const Packet2l& a) {
672 return __lsx_vneg_d(a);
676EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
680EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
684EIGEN_STRONG_INLINE Packet16c pconj(
const Packet16c& a) {
688EIGEN_STRONG_INLINE Packet8s pconj(
const Packet8s& a) {
692EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
696EIGEN_STRONG_INLINE Packet2l pconj(
const Packet2l& a) {
700EIGEN_STRONG_INLINE Packet16uc pconj(
const Packet16uc& a) {
704EIGEN_STRONG_INLINE Packet8us pconj(
const Packet8us& a) {
708EIGEN_STRONG_INLINE Packet4ui pconj(
const Packet4ui& a) {
712EIGEN_STRONG_INLINE Packet2ul pconj(
const Packet2ul& a) {
717EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
718 return __lsx_vfmul_s(a, b);
721EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
722 return __lsx_vfmul_d(a, b);
725EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
726 return __lsx_vmul_b(a, b);
729EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
730 return __lsx_vmul_h(a, b);
733EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
734 return __lsx_vmul_w(a, b);
737EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
738 return __lsx_vmul_d(a, b);
741EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
742 return __lsx_vmul_b(a, b);
745EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
746 return __lsx_vmul_h(a, b);
749EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
750 return __lsx_vmul_w(a, b);
753EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
754 return __lsx_vmul_d(a, b);
758EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
759 return __lsx_vfdiv_s(a, b);
762EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
763 return __lsx_vfdiv_d(a, b);
766EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
767 return __lsx_vdiv_h(a, b);
770EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
771 return __lsx_vdiv_w(a, b);
774EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
775 return __lsx_vdiv_d(a, b);
778EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
779 return __lsx_vdiv_hu(a, b);
782EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
783 return __lsx_vdiv_wu(a, b);
786EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
787 return __lsx_vdiv_du(a, b);
791EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
792 return __lsx_vfmadd_s(a, b, c);
795EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
796 return __lsx_vfmadd_d(a, b, c);
799EIGEN_STRONG_INLINE Packet4f pmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
800 return __lsx_vfmsub_s(a, b, c);
803EIGEN_STRONG_INLINE Packet2d pmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
804 return __lsx_vfmsub_d(a, b, c);
807EIGEN_STRONG_INLINE Packet4f pnmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
808 return __lsx_vfnmsub_s(a, b, c);
811EIGEN_STRONG_INLINE Packet2d pnmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
812 return __lsx_vfnmsub_d(a, b, c);
815EIGEN_STRONG_INLINE Packet4f pnmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
816 return __lsx_vfnmadd_s(a, b, c);
819EIGEN_STRONG_INLINE Packet2d pnmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
820 return __lsx_vfnmadd_d(a, b, c);
823EIGEN_STRONG_INLINE Packet16c pmadd(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
824 return __lsx_vmadd_b(c, a, b);
827EIGEN_STRONG_INLINE Packet8s pmadd(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
828 return __lsx_vmadd_h(c, a, b);
831EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
832 return __lsx_vmadd_w(c, a, b);
835EIGEN_STRONG_INLINE Packet2l pmadd(
const Packet2l& a,
const Packet2l& b,
const Packet2l& c) {
836 return __lsx_vmadd_d(c, a, b);
839EIGEN_STRONG_INLINE Packet16uc pmadd(
const Packet16uc& a,
const Packet16uc& b,
const Packet16uc& c) {
840 return __lsx_vmadd_b(c, a, b);
843EIGEN_STRONG_INLINE Packet8us pmadd(
const Packet8us& a,
const Packet8us& b,
const Packet8us& c) {
844 return __lsx_vmadd_h(c, a, b);
847EIGEN_STRONG_INLINE Packet4ui pmadd(
const Packet4ui& a,
const Packet4ui& b,
const Packet4ui& c) {
848 return __lsx_vmadd_w(c, a, b);
851EIGEN_STRONG_INLINE Packet2ul pmadd(
const Packet2ul& a,
const Packet2ul& b,
const Packet2ul& c) {
852 return __lsx_vmadd_d(c, a, b);
856EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
857 return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b);
860EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
861 return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b);
864EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
865 return __lsx_vand_v(a, b);
868EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
869 return __lsx_vand_v(a, b);
872EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
873 return __lsx_vand_v(a, b);
876EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
877 return __lsx_vand_v(a, b);
880EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
881 return __lsx_vand_v(a, b);
884EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
885 return __lsx_vand_v(a, b);
888EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
889 return __lsx_vand_v(a, b);
892EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
893 return __lsx_vand_v(a, b);
897EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
898 return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b);
901EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
902 return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b);
905EIGEN_STRONG_INLINE Packet16c por<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
906 return __lsx_vor_v(a, b);
909EIGEN_STRONG_INLINE Packet8s por<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
910 return __lsx_vor_v(a, b);
913EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
914 return __lsx_vor_v(a, b);
917EIGEN_STRONG_INLINE Packet2l por<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
918 return __lsx_vor_v(a, b);
921EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
922 return __lsx_vor_v(a, b);
925EIGEN_STRONG_INLINE Packet8us por<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
926 return __lsx_vor_v(a, b);
929EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
930 return __lsx_vor_v(a, b);
933EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
934 return __lsx_vor_v(a, b);
938EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
939 return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b);
942EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
943 return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b);
946EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
947 return __lsx_vxor_v(a, b);
950EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
951 return __lsx_vxor_v(a, b);
954EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
955 return __lsx_vxor_v(a, b);
958EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
959 return __lsx_vxor_v(a, b);
962EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
963 return __lsx_vxor_v(a, b);
966EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
967 return __lsx_vxor_v(a, b);
970EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
971 return __lsx_vxor_v(a, b);
974EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
975 return __lsx_vxor_v(a, b);
979EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
980 return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a);
983EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
984 return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a);
987EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
988 return __lsx_vandn_v(b, a);
991EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
992 return __lsx_vandn_v(b, a);
995EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
996 return __lsx_vandn_v(b, a);
999EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1000 return __lsx_vandn_v(b, a);
1003EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1004 return __lsx_vandn_v(b, a);
1007EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1008 return __lsx_vandn_v(b, a);
1011EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1012 return __lsx_vandn_v(b, a);
1015EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1016 return __lsx_vandn_v(b, a);
1020EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1021 return (Packet4f)__lsx_vfcmp_cle_s(a, b);
1024EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1025 return (Packet2d)__lsx_vfcmp_cle_d(a, b);
1028EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1029 return __lsx_vsle_b(a, b);
1032EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1033 return __lsx_vsle_h(a, b);
1036EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1037 return __lsx_vsle_w(a, b);
1040EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1041 return __lsx_vsle_d(a, b);
1044EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1045 return __lsx_vsle_bu(a, b);
1048EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1049 return __lsx_vsle_hu(a, b);
1052EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1053 return __lsx_vsle_wu(a, b);
1056EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1057 return __lsx_vsle_du(a, b);
1061EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1062 return (Packet4f)__lsx_vfcmp_clt_s(a, b);
1065EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1066 return (Packet2d)__lsx_vfcmp_clt_d(a, b);
1069EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1070 return __lsx_vslt_b(a, b);
1073EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1074 return __lsx_vslt_h(a, b);
1077EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1078 return __lsx_vslt_w(a, b);
1081EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1082 return __lsx_vslt_d(a, b);
1085EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1086 return __lsx_vslt_bu(a, b);
1089EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1090 return __lsx_vslt_hu(a, b);
1093EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1094 return __lsx_vslt_wu(a, b);
1097EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1098 return __lsx_vslt_du(a, b);
1102EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1103 return (Packet4f)__lsx_vfcmp_sult_s(a, b);
1106EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1107 return (Packet2d)__lsx_vfcmp_sult_d(a, b);
1111EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1112 return (Packet4f)__lsx_vfcmp_seq_s(a, b);
1115EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1116 return (Packet2d)__lsx_vfcmp_seq_d(a, b);
1119EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1120 return __lsx_vseq_b(a, b);
1123EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1124 return __lsx_vseq_h(a, b);
1127EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1128 return __lsx_vseq_w(a, b);
1131EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1132 return __lsx_vseq_d(a, b);
1135EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1136 return __lsx_vseq_b(a, b);
1139EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1140 return __lsx_vseq_h(a, b);
1143EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1144 return __lsx_vseq_w(a, b);
1147EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1148 return __lsx_vseq_d(a, b);
1152EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1153 return __lsx_vmin_b(a, b);
1156EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1157 return __lsx_vmin_h(a, b);
1160EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1161 return __lsx_vmin_w(a, b);
1164EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1165 return __lsx_vmin_d(a, b);
1168EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1169 return __lsx_vmin_bu(a, b);
1172EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1173 return __lsx_vmin_hu(a, b);
1176EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1177 return __lsx_vmin_wu(a, b);
1180EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1181 return __lsx_vmin_du(a, b);
1185EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1186 return __lsx_vmax_b(a, b);
1189EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1190 return __lsx_vmax_h(a, b);
1193EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1194 return __lsx_vmax_w(a, b);
1197EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1198 return __lsx_vmax_d(a, b);
1201EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1202 return __lsx_vmax_bu(a, b);
1205EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1206 return __lsx_vmax_hu(a, b);
1209EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1210 return __lsx_vmax_wu(a, b);
1213EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1214 return __lsx_vmax_du(a, b);
1218EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1219 Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1220 Packet4i aMinOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), aNaN);
1221 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1224EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1225 Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1226 Packet2l aMinOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), aNaN);
1227 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1230EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1231 Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1232 Packet4i aMaxOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), aNaN);
1233 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1236EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1237 Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1238 Packet2l aMaxOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), aNaN);
1239 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1243EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(
const Packet16c& a) {
1244 return __lsx_vsrai_b((__m128i)a, N);
1247EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(
const Packet8s& a) {
1248 return __lsx_vsrai_h((__m128i)a, N);
1251EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(
const Packet4i& a) {
1252 return __lsx_vsrai_w((__m128i)a, N);
1255EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(
const Packet2l& a) {
1256 return __lsx_vsrai_d((__m128i)a, N);
1259EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(
const Packet16uc& a) {
1260 return __lsx_vsrli_b((__m128i)a, N);
1263EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(
const Packet8us& a) {
1264 return __lsx_vsrli_h((__m128i)a, N);
1267EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(
const Packet4ui& a) {
1268 return __lsx_vsrli_w((__m128i)a, N);
1271EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(
const Packet2ul& a) {
1272 return __lsx_vsrli_d((__m128i)a, N);
1276EIGEN_STRONG_INLINE Packet16c plogical_shift_right(
const Packet16c& a) {
1277 return __lsx_vsrli_b((__m128i)a, N);
1280EIGEN_STRONG_INLINE Packet8s plogical_shift_right(
const Packet8s& a) {
1281 return __lsx_vsrli_h((__m128i)a, N);
1284EIGEN_STRONG_INLINE Packet4i plogical_shift_right(
const Packet4i& a) {
1285 return __lsx_vsrli_w((__m128i)a, N);
1288EIGEN_STRONG_INLINE Packet2l plogical_shift_right(
const Packet2l& a) {
1289 return __lsx_vsrli_d((__m128i)a, N);
1292EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(
const Packet16uc& a) {
1293 return __lsx_vsrli_b((__m128i)a, N);
1296EIGEN_STRONG_INLINE Packet8us plogical_shift_right(
const Packet8us& a) {
1297 return __lsx_vsrli_h((__m128i)a, N);
1300EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(
const Packet4ui& a) {
1301 return __lsx_vsrli_w((__m128i)a, N);
1304EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(
const Packet2ul& a) {
1305 return __lsx_vsrli_d((__m128i)a, N);
1309EIGEN_STRONG_INLINE Packet16c plogical_shift_left(
const Packet16c& a) {
1310 return __lsx_vslli_b((__m128i)a, N);
1313EIGEN_STRONG_INLINE Packet8s plogical_shift_left(
const Packet8s& a) {
1314 return __lsx_vslli_h((__m128i)a, N);
1317EIGEN_STRONG_INLINE Packet4i plogical_shift_left(
const Packet4i& a) {
1318 return __lsx_vslli_w((__m128i)a, N);
1321EIGEN_STRONG_INLINE Packet2l plogical_shift_left(
const Packet2l& a) {
1322 return __lsx_vslli_d((__m128i)a, N);
1325EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(
const Packet16uc& a) {
1326 return __lsx_vslli_b((__m128i)a, N);
1329EIGEN_STRONG_INLINE Packet8us plogical_shift_left(
const Packet8us& a) {
1330 return __lsx_vslli_h((__m128i)a, N);
1333EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(
const Packet4ui& a) {
1334 return __lsx_vslli_w((__m128i)a, N);
1337EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(
const Packet2ul& a) {
1338 return __lsx_vslli_d((__m128i)a, N);
1342EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
1343 return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31);
1346EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
1347 return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63);
1350EIGEN_STRONG_INLINE Packet16c pabs(
const Packet16c& a) {
1351 return __lsx_vabsd_b(a, pzero(a));
1354EIGEN_STRONG_INLINE Packet8s pabs(
const Packet8s& a) {
1355 return __lsx_vabsd_h(a, pzero(a));
1358EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
1359 return __lsx_vabsd_w(a, pzero(a));
1362EIGEN_STRONG_INLINE Packet2l pabs(
const Packet2l& a) {
1363 return __lsx_vabsd_d(a, pzero(a));
1366EIGEN_STRONG_INLINE Packet16uc pabs(
const Packet16uc& a) {
1370EIGEN_STRONG_INLINE Packet8us pabs(
const Packet8us& a) {
1374EIGEN_STRONG_INLINE Packet4ui pabs(
const Packet4ui& a) {
1378EIGEN_STRONG_INLINE Packet2ul pabs(
const Packet2ul& a) {
1383EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
1384 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet4f)__lsx_vld(from, 0);
1387EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
1388 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet2d)__lsx_vld(from, 0);
1391EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(
const int8_t* from) {
1392 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1395EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(
const int16_t* from) {
1396 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1399EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from) {
1400 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1403EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(
const int64_t* from) {
1404 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1407EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(
const uint8_t* from) {
1408 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1411EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(
const uint16_t* from) {
1412 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1415EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(
const uint32_t* from) {
1416 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1419EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(
const uint64_t* from) {
1420 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1424EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
1425 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4f)__lsx_vld(from, 0);
1428EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
1429 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d)__lsx_vld(from, 0);
1432EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(
const int8_t* from) {
1433 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1436EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(
const int16_t* from) {
1437 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1440EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from) {
1441 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1444EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(
const int64_t* from) {
1445 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1448EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(
const uint8_t* from) {
1449 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1452EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(
const uint16_t* from) {
1453 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1456EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(
const uint32_t* from) {
1457 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1460EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(
const uint64_t* from) {
1461 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1465EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
1466 float f0 = from[0], f1 = from[1];
1467 return make_packet4f(f0, f0, f1, f1);
1470EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
1471 return pset1<Packet2d>(from[0]);
1474EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(
const int8_t* from) {
1475 Packet16c tmp = pload<Packet16c>(from);
1476 return __lsx_vilvl_b(tmp, tmp);
1479EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(
const int16_t* from) {
1480 Packet8s tmp = pload<Packet8s>(from);
1481 return __lsx_vilvl_h(tmp, tmp);
1484EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from) {
1485 Packet4i tmp = pload<Packet4i>(from);
1486 return __lsx_vilvl_w(tmp, tmp);
1489EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(
const int64_t* from) {
1490 return pset1<Packet2l>(from[0]);
1493EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(
const uint8_t* from) {
1494 Packet16uc tmp = pload<Packet16uc>(from);
1495 return __lsx_vilvl_b(tmp, tmp);
1498EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(
const uint16_t* from) {
1499 Packet8us tmp = pload<Packet8us>(from);
1500 return __lsx_vilvl_h(tmp, tmp);
1503EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(
const uint32_t* from) {
1504 Packet4ui tmp = pload<Packet4ui>(from);
1505 return __lsx_vilvl_w(tmp, tmp);
1508EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(
const uint64_t* from) {
1509 return pset1<Packet2ul>(from[0]);
1513EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
1514 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1517EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
1518 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1521EIGEN_STRONG_INLINE
void pstore<int8_t>(int8_t* to,
const Packet16c& from) {
1522 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1525EIGEN_STRONG_INLINE
void pstore<int16_t>(int16_t* to,
const Packet8s& from) {
1526 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1529EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from) {
1530 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1533EIGEN_STRONG_INLINE
void pstore<int64_t>(int64_t* to,
const Packet2l& from) {
1534 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1537EIGEN_STRONG_INLINE
void pstore<uint8_t>(uint8_t* to,
const Packet16uc& from) {
1538 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1541EIGEN_STRONG_INLINE
void pstore<uint16_t>(uint16_t* to,
const Packet8us& from) {
1542 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1545EIGEN_STRONG_INLINE
void pstore<uint32_t>(uint32_t* to,
const Packet4ui& from) {
1546 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1549EIGEN_STRONG_INLINE
void pstore<uint64_t>(uint64_t* to,
const Packet2ul& from) {
1550 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1554EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
1555 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1558EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
1559 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1563EIGEN_STRONG_INLINE
void pstoreu<int8_t>(int8_t* to,
const Packet16c& from) {
1564 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1567EIGEN_STRONG_INLINE
void pstoreu<int16_t>(int16_t* to,
const Packet8s& from) {
1568 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1571EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from) {
1572 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1575EIGEN_STRONG_INLINE
void pstoreu<int64_t>(int64_t* to,
const Packet2l& from) {
1576 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1579EIGEN_STRONG_INLINE
void pstoreu<uint8_t>(uint8_t* to,
const Packet16uc& from) {
1580 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1583EIGEN_STRONG_INLINE
void pstoreu<uint16_t>(uint16_t* to,
const Packet8us& from) {
1584 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1587EIGEN_STRONG_INLINE
void pstoreu<uint32_t>(uint32_t* to,
const Packet4ui& from) {
1588 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1591EIGEN_STRONG_INLINE
void pstoreu<uint64_t>(uint64_t* to,
const Packet2ul& from) {
1592 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1596EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
1597 Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]};
1601EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
1602 Packet2d v = {from[0], from[stride]};
1606EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(
const int8_t* from,
Index stride) {
1607 int8_t v[16] __attribute__((aligned(16)));
1609 v[1] = from[stride];
1610 v[2] = from[2 * stride];
1611 v[3] = from[3 * stride];
1612 v[4] = from[4 * stride];
1613 v[5] = from[5 * stride];
1614 v[6] = from[6 * stride];
1615 v[7] = from[7 * stride];
1616 v[8] = from[8 * stride];
1617 v[9] = from[9 * stride];
1618 v[10] = from[10 * stride];
1619 v[11] = from[11 * stride];
1620 v[12] = from[12 * stride];
1621 v[13] = from[13 * stride];
1622 v[14] = from[14 * stride];
1623 v[15] = from[15 * stride];
1624 return __lsx_vld(v, 0);
1627EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(
const int16_t* from,
Index stride) {
1628 int16_t v[8] __attribute__((aligned(16)));
1630 v[1] = from[stride];
1631 v[2] = from[2 * stride];
1632 v[3] = from[3 * stride];
1633 v[4] = from[4 * stride];
1634 v[5] = from[5 * stride];
1635 v[6] = from[6 * stride];
1636 v[7] = from[7 * stride];
1637 return __lsx_vld(v, 0);
1640EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(
const int32_t* from,
Index stride) {
1641 int32_t v[4] __attribute__((aligned(16)));
1643 v[1] = from[stride];
1644 v[2] = from[2 * stride];
1645 v[3] = from[3 * stride];
1646 return __lsx_vld(v, 0);
1649EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(
const int64_t* from,
Index stride) {
1650 int64_t v[2] __attribute__((aligned(16)));
1652 v[1] = from[stride];
1653 return __lsx_vld(v, 0);
1656EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(
const uint8_t* from,
Index stride) {
1657 uint8_t v[16] __attribute__((aligned(16)));
1659 v[1] = from[stride];
1660 v[2] = from[2 * stride];
1661 v[3] = from[3 * stride];
1662 v[4] = from[4 * stride];
1663 v[5] = from[5 * stride];
1664 v[6] = from[6 * stride];
1665 v[7] = from[7 * stride];
1666 v[8] = from[8 * stride];
1667 v[9] = from[9 * stride];
1668 v[10] = from[10 * stride];
1669 v[11] = from[11 * stride];
1670 v[12] = from[12 * stride];
1671 v[13] = from[13 * stride];
1672 v[14] = from[14 * stride];
1673 v[15] = from[15 * stride];
1674 return __lsx_vld(v, 0);
1677EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(
const uint16_t* from,
Index stride) {
1678 uint16_t v[8] __attribute__((aligned(16)));
1680 v[1] = from[stride];
1681 v[2] = from[2 * stride];
1682 v[3] = from[3 * stride];
1683 v[4] = from[4 * stride];
1684 v[5] = from[5 * stride];
1685 v[6] = from[6 * stride];
1686 v[7] = from[7 * stride];
1687 return __lsx_vld(v, 0);
1690EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(
const uint32_t* from,
Index stride) {
1691 uint32_t v[4] __attribute__((aligned(16)));
1693 v[1] = from[stride];
1694 v[2] = from[2 * stride];
1695 v[3] = from[3 * stride];
1696 return __lsx_vld(v, 0);
1699EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(
const uint64_t* from,
Index stride) {
1700 uint64_t v[2] __attribute__((aligned(16)));
1702 v[1] = from[stride];
1703 return __lsx_vld(v, 0);
1707EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
1708 __lsx_vstelm_w(from, to, 0, 0);
1709 __lsx_vstelm_w(from, to + stride * 1, 0, 1);
1710 __lsx_vstelm_w(from, to + stride * 2, 0, 2);
1711 __lsx_vstelm_w(from, to + stride * 3, 0, 3);
1714EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride) {
1715 __lsx_vstelm_d(from, to, 0, 0);
1716 __lsx_vstelm_d(from, to + stride, 0, 1);
1719EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int8_t, Packet16c>(int8_t* to,
const Packet16c& from,
1721 __lsx_vstelm_b((__m128i)from, to, 0, 0);
1722 __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1723 __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1724 __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1725 __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1726 __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1727 __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1728 __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1729 __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1730 __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1731 __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1732 __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1733 __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1734 __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1735 __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1736 __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1739EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int16_t, Packet8s>(int16_t* to,
const Packet8s& from,
1741 __lsx_vstelm_h((__m128i)from, to, 0, 0);
1742 __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1743 __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1744 __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1745 __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1746 __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1747 __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1748 __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1751EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from,
1753 __lsx_vstelm_w((__m128i)from, to, 0, 0);
1754 __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1755 __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1756 __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1759EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int64_t, Packet2l>(int64_t* to,
const Packet2l& from,
1761 __lsx_vstelm_d((__m128i)from, to, 0, 0);
1762 __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1765EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint8_t, Packet16uc>(uint8_t* to,
const Packet16uc& from,
1767 __lsx_vstelm_b((__m128i)from, to, 0, 0);
1768 __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1769 __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1770 __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1771 __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1772 __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1773 __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1774 __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1775 __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1776 __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1777 __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1778 __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1779 __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1780 __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1781 __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1782 __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1785EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint16_t, Packet8us>(uint16_t* to,
const Packet8us& from,
1787 __lsx_vstelm_h((__m128i)from, to, 0, 0);
1788 __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1789 __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1790 __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1791 __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1792 __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1793 __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1794 __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1797EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint32_t, Packet4ui>(uint32_t* to,
const Packet4ui& from,
1799 __lsx_vstelm_w((__m128i)from, to, 0, 0);
1800 __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1801 __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1802 __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1805EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint64_t, Packet2ul>(uint64_t* to,
const Packet2ul& from,
1807 __lsx_vstelm_d((__m128i)from, to, 0, 0);
1808 __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1812EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
1813 __builtin_prefetch(addr);
1816EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
1817 __builtin_prefetch(addr);
1820EIGEN_STRONG_INLINE
void prefetch<int8_t>(
const int8_t* addr) {
1821 __builtin_prefetch(addr);
1824EIGEN_STRONG_INLINE
void prefetch<int16_t>(
const int16_t* addr) {
1825 __builtin_prefetch(addr);
1828EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) {
1829 __builtin_prefetch(addr);
1832EIGEN_STRONG_INLINE
void prefetch<int64_t>(
const int64_t* addr) {
1833 __builtin_prefetch(addr);
1836EIGEN_STRONG_INLINE
void prefetch<uint8_t>(
const uint8_t* addr) {
1837 __builtin_prefetch(addr);
1840EIGEN_STRONG_INLINE
void prefetch<uint16_t>(
const uint16_t* addr) {
1841 __builtin_prefetch(addr);
1844EIGEN_STRONG_INLINE
void prefetch<uint32_t>(
const uint32_t* addr) {
1845 __builtin_prefetch(addr);
1848EIGEN_STRONG_INLINE
void prefetch<uint64_t>(
const uint64_t* addr) {
1849 __builtin_prefetch(addr);
1853EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
1855 __lsx_vstelm_w(a, &v, 0, 0);
1859EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
1861 __lsx_vstelm_d(a, &v, 0, 0);
1866EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(
const Packet16c& a) {
1867 return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0);
1870EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(
const Packet8s& a) {
1871 return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0);
1874EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) {
1875 return __lsx_vpickve2gr_w((__m128i)a, 0);
1878EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(
const Packet2l& a) {
1879 return __lsx_vpickve2gr_d((__m128i)a, 0);
1882EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(
const Packet16uc& a) {
1883 return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0);
1886EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(
const Packet8us& a) {
1887 return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0);
1890EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(
const Packet4ui& a) {
1891 return __lsx_vpickve2gr_wu((__m128i)a, 0);
1894EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(
const Packet2ul& a) {
1895 return __lsx_vpickve2gr_du((__m128i)a, 0);
1899EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
1900 return (Packet4f)__lsx_vshuf4i_w(a, 0x1B);
1903EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
1904 return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1);
1907EIGEN_STRONG_INLINE Packet16c preverse(
const Packet16c& a) {
1908 return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1911EIGEN_STRONG_INLINE Packet8s preverse(
const Packet8s& a) {
1912 return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1915EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
1916 return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1919EIGEN_STRONG_INLINE Packet2l preverse(
const Packet2l& a) {
1920 return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1923EIGEN_STRONG_INLINE Packet16uc preverse(
const Packet16uc& a) {
1924 return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1927EIGEN_STRONG_INLINE Packet8us preverse(
const Packet8us& a) {
1928 return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1931EIGEN_STRONG_INLINE Packet4ui preverse(
const Packet4ui& a) {
1932 return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1935EIGEN_STRONG_INLINE Packet2ul preverse(
const Packet2ul& a) {
1936 return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1940EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
1941 Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
1942 return pfirst<Packet4f>(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
1945EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
1946 return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a)));
1949EIGEN_STRONG_INLINE int8_t predux<Packet16c>(
const Packet16c& a) {
1950 Packet8s tmp1 = __lsx_vhaddw_h_b(a, a);
1951 Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1);
1952 Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2);
1953 return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0);
1956EIGEN_STRONG_INLINE int16_t predux<Packet8s>(
const Packet8s& a) {
1957 Packet4i tmp1 = __lsx_vhaddw_w_h(a, a);
1958 Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1);
1959 return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0);
1962EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a) {
1963 Packet2l tmp = __lsx_vhaddw_d_w(a, a);
1964 return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0);
1967EIGEN_STRONG_INLINE int64_t predux<Packet2l>(
const Packet2l& a) {
1968 return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0);
1971EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(
const Packet16uc& a) {
1972 Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a);
1973 Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1);
1974 Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2);
1975 return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0);
1978EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(
const Packet8us& a) {
1979 Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a);
1980 Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1);
1981 return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0);
1984EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(
const Packet4ui& a) {
1985 Packet2ul tmp = __lsx_vhaddw_du_wu(a, a);
1986 return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
1989EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(
const Packet2ul& a) {
1990 return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0);
1994EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
1995 Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
1996 return pfirst<Packet4f>(__lsx_vfmul_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
1999EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
2000 return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a)));
2003EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(
const Packet16c& a) {
2004 Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a));
2005 Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2006 Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2007 return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2010EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(
const Packet8s& a) {
2011 Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a));
2012 Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2013 return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2016EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a) {
2017 Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a));
2018 return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2021EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(
const Packet2l& a) {
2022 return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0);
2025EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(
const Packet16uc& a) {
2026 Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a));
2027 Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2028 Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2029 return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2032EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(
const Packet8us& a) {
2033 Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a));
2034 Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2035 return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2038EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(
const Packet4ui& a) {
2039 Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a));
2040 return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2043EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(
const Packet2ul& a) {
2044 return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0);
2048EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
2049 Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2050 return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2053EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
2054 return pfirst(__lsx_vfmin_d(a, preverse(a)));
2057EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(
const Packet16c& a) {
2058 Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2059 Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2060 Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2061 return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2064EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(
const Packet8s& a) {
2065 Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2066 Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2067 return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2070EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a) {
2071 Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2072 return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2075EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(
const Packet2l& a) {
2076 return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a)));
2079EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(
const Packet16uc& a) {
2080 Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2081 Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2082 Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2083 return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2086EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(
const Packet8us& a) {
2087 Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2088 Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2089 return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2092EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(
const Packet4ui& a) {
2093 Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2094 return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2097EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(
const Packet2ul& a) {
2098 return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a)));
2102EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
2103 Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2104 return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2107EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
2108 return pfirst(__lsx_vfmax_d(a, preverse(a)));
2111EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(
const Packet16c& a) {
2112 Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2113 Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2114 Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2115 return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2118EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(
const Packet8s& a) {
2119 Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2120 Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2121 return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2124EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a) {
2125 Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2126 return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2129EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(
const Packet2l& a) {
2130 return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a)));
2133EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(
const Packet16uc& a) {
2134 Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2135 Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2136 Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2137 return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2140EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(
const Packet8us& a) {
2141 Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2142 Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2143 return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2146EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(
const Packet4ui& a) {
2147 Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2148 return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2151EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(
const Packet2ul& a) {
2152 return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a)));
2156EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
2157 return __lsx_vfsqrt_s(a);
2160EIGEN_STRONG_INLINE Packet2d psqrt(
const Packet2d& a) {
2161 return __lsx_vfsqrt_d(a);
2164EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
2165 Packet4f T0 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2166 Packet4f T1 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2167 Packet4f T2 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2168 Packet4f T3 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2170 kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)T2, (__m128i)T0);
2171 kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)T2, (__m128i)T0);
2172 kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)T3, (__m128i)T1);
2173 kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)T3, (__m128i)T1);
2175EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
2176 Packet2d tmp = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2177 kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2178 kernel.packet[1] = tmp;
2180EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2181 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2182 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2183 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2184 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2185 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2186 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2187 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2188 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2189 __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2190 __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2191 __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2192 __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2193 __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2194 __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2195 __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2196 __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
2198 __m128i s0 = __lsx_vilvl_h(t2, t0);
2199 __m128i s1 = __lsx_vilvh_h(t2, t0);
2200 __m128i s2 = __lsx_vilvl_h(t3, t1);
2201 __m128i s3 = __lsx_vilvh_h(t3, t1);
2202 __m128i s4 = __lsx_vilvl_h(t6, t4);
2203 __m128i s5 = __lsx_vilvh_h(t6, t4);
2204 __m128i s6 = __lsx_vilvl_h(t7, t5);
2205 __m128i s7 = __lsx_vilvh_h(t7, t5);
2206 __m128i s8 = __lsx_vilvl_h(ta, t8);
2207 __m128i s9 = __lsx_vilvh_h(ta, t8);
2208 __m128i sa = __lsx_vilvl_h(tb, t9);
2209 __m128i sb = __lsx_vilvh_h(tb, t9);
2210 __m128i sc = __lsx_vilvl_h(te, tc);
2211 __m128i sd = __lsx_vilvh_h(te, tc);
2212 __m128i se = __lsx_vilvl_h(tf, td);
2213 __m128i sf = __lsx_vilvh_h(tf, td);
2215 __m128i u0 = __lsx_vilvl_w(s4, s0);
2216 __m128i u1 = __lsx_vilvh_w(s4, s0);
2217 __m128i u2 = __lsx_vilvl_w(s5, s1);
2218 __m128i u3 = __lsx_vilvh_w(s5, s1);
2219 __m128i u4 = __lsx_vilvl_w(s6, s2);
2220 __m128i u5 = __lsx_vilvh_w(s6, s2);
2221 __m128i u6 = __lsx_vilvl_w(s7, s3);
2222 __m128i u7 = __lsx_vilvh_w(s7, s3);
2223 __m128i u8 = __lsx_vilvl_w(sc, s8);
2224 __m128i u9 = __lsx_vilvh_w(sc, s8);
2225 __m128i ua = __lsx_vilvl_w(sd, s9);
2226 __m128i ub = __lsx_vilvh_w(sd, s9);
2227 __m128i uc = __lsx_vilvl_w(se, sa);
2228 __m128i ud = __lsx_vilvh_w(se, sa);
2229 __m128i ue = __lsx_vilvl_w(sf, sb);
2230 __m128i uf = __lsx_vilvh_w(sf, sb);
2232 kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2233 kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2234 kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2235 kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2236 kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2237 kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2238 kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2239 kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2240 kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2241 kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2242 kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2243 kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2244 kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2245 kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2246 kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2247 kernel.packet[15] = __lsx_vilvh_d(uf, u7);
2249EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
2250 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2251 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2252 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2253 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2254 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2255 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2256 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2257 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2259 __m128i s0 = __lsx_vilvl_h(t2, t0);
2260 __m128i s1 = __lsx_vilvh_h(t2, t0);
2261 __m128i s2 = __lsx_vilvl_h(t3, t1);
2262 __m128i s3 = __lsx_vilvh_h(t3, t1);
2263 __m128i s4 = __lsx_vilvl_h(t6, t4);
2264 __m128i s5 = __lsx_vilvh_h(t6, t4);
2265 __m128i s6 = __lsx_vilvl_h(t7, t5);
2266 __m128i s7 = __lsx_vilvh_h(t7, t5);
2268 kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2269 kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2270 kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2271 kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2272 kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2273 kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2274 kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2275 kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2277EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2278 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2279 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2280 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2281 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2283 kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2284 kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2285 kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2286 kernel.packet[3] = __lsx_vilvh_h(t3, t1);
2288EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2289 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2290 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2291 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2292 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2293 __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2294 __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2295 __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2296 __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
2298 __m128i s0 = __lsx_vilvl_w(t2, t0);
2299 __m128i s1 = __lsx_vilvh_w(t2, t0);
2300 __m128i s2 = __lsx_vilvl_w(t3, t1);
2301 __m128i s3 = __lsx_vilvh_w(t3, t1);
2302 __m128i s4 = __lsx_vilvl_w(t6, t4);
2303 __m128i s5 = __lsx_vilvh_w(t6, t4);
2304 __m128i s6 = __lsx_vilvl_w(t7, t5);
2305 __m128i s7 = __lsx_vilvh_w(t7, t5);
2307 kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2308 kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2309 kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2310 kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2311 kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2312 kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2313 kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2314 kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2316EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2317 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2318 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2319 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2320 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2322 kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2323 kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2324 kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2325 kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2327EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
2328 __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2329 __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2330 __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2331 __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2333 kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2334 kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2335 kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2336 kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2338EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
2339 __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2340 kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2341 kernel.packet[1] = tmp;
2343EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2344 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2345 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2346 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2347 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2348 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2349 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2350 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2351 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2352 __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2353 __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2354 __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2355 __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2356 __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2357 __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2358 __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2359 __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
2361 __m128i s0 = __lsx_vilvl_h(t2, t0);
2362 __m128i s1 = __lsx_vilvh_h(t2, t0);
2363 __m128i s2 = __lsx_vilvl_h(t3, t1);
2364 __m128i s3 = __lsx_vilvh_h(t3, t1);
2365 __m128i s4 = __lsx_vilvl_h(t6, t4);
2366 __m128i s5 = __lsx_vilvh_h(t6, t4);
2367 __m128i s6 = __lsx_vilvl_h(t7, t5);
2368 __m128i s7 = __lsx_vilvh_h(t7, t5);
2369 __m128i s8 = __lsx_vilvl_h(ta, t8);
2370 __m128i s9 = __lsx_vilvh_h(ta, t8);
2371 __m128i sa = __lsx_vilvl_h(tb, t9);
2372 __m128i sb = __lsx_vilvh_h(tb, t9);
2373 __m128i sc = __lsx_vilvl_h(te, tc);
2374 __m128i sd = __lsx_vilvh_h(te, tc);
2375 __m128i se = __lsx_vilvl_h(tf, td);
2376 __m128i sf = __lsx_vilvh_h(tf, td);
2378 __m128i u0 = __lsx_vilvl_w(s4, s0);
2379 __m128i u1 = __lsx_vilvh_w(s4, s0);
2380 __m128i u2 = __lsx_vilvl_w(s5, s1);
2381 __m128i u3 = __lsx_vilvh_w(s5, s1);
2382 __m128i u4 = __lsx_vilvl_w(s6, s2);
2383 __m128i u5 = __lsx_vilvh_w(s6, s2);
2384 __m128i u6 = __lsx_vilvl_w(s7, s3);
2385 __m128i u7 = __lsx_vilvh_w(s7, s3);
2386 __m128i u8 = __lsx_vilvl_w(sc, s8);
2387 __m128i u9 = __lsx_vilvh_w(sc, s8);
2388 __m128i ua = __lsx_vilvl_w(sd, s9);
2389 __m128i ub = __lsx_vilvh_w(sd, s9);
2390 __m128i uc = __lsx_vilvl_w(se, sa);
2391 __m128i ud = __lsx_vilvh_w(se, sa);
2392 __m128i ue = __lsx_vilvl_w(sf, sb);
2393 __m128i uf = __lsx_vilvh_w(sf, sb);
2395 kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2396 kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2397 kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2398 kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2399 kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2400 kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2401 kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2402 kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2403 kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2404 kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2405 kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2406 kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2407 kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2408 kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2409 kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2410 kernel.packet[15] = __lsx_vilvh_d(uf, u7);
2412EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
2413 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2414 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2415 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2416 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2417 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2418 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2419 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2420 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2422 __m128i s0 = __lsx_vilvl_h(t2, t0);
2423 __m128i s1 = __lsx_vilvh_h(t2, t0);
2424 __m128i s2 = __lsx_vilvl_h(t3, t1);
2425 __m128i s3 = __lsx_vilvh_h(t3, t1);
2426 __m128i s4 = __lsx_vilvl_h(t6, t4);
2427 __m128i s5 = __lsx_vilvh_h(t6, t4);
2428 __m128i s6 = __lsx_vilvl_h(t7, t5);
2429 __m128i s7 = __lsx_vilvh_h(t7, t5);
2431 kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2432 kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2433 kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2434 kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2435 kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2436 kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2437 kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2438 kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2440EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2441 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2442 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2443 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2444 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2446 kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2447 kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2448 kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2449 kernel.packet[3] = __lsx_vilvh_h(t3, t1);
2451EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2452 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2453 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2454 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2455 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2456 __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2457 __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2458 __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2459 __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
2461 __m128i s0 = __lsx_vilvl_w(t2, t0);
2462 __m128i s1 = __lsx_vilvh_w(t2, t0);
2463 __m128i s2 = __lsx_vilvl_w(t3, t1);
2464 __m128i s3 = __lsx_vilvh_w(t3, t1);
2465 __m128i s4 = __lsx_vilvl_w(t6, t4);
2466 __m128i s5 = __lsx_vilvh_w(t6, t4);
2467 __m128i s6 = __lsx_vilvl_w(t7, t5);
2468 __m128i s7 = __lsx_vilvh_w(t7, t5);
2470 kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2471 kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2472 kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2473 kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2474 kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2475 kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2476 kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2477 kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2479EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2480 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2481 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2482 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2483 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2485 kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2486 kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2487 kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2488 kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2490EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
2491 __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2492 __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2493 __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2494 __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2496 kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2497 kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2498 kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2499 kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2501EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
2502 __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2503 kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2504 kernel.packet[1] = tmp;
2508EIGEN_STRONG_INLINE Packet4f prsqrt(
const Packet4f& a) {
2509 return __lsx_vfrsqrt_s(a);
2512EIGEN_STRONG_INLINE Packet2d prsqrt(
const Packet2d& a) {
2513 return __lsx_vfrsqrt_d(a);
2517EIGEN_STRONG_INLINE Packet4f pfloor(
const Packet4f& a) {
2518 return __lsx_vfrintrm_s(a);
2521EIGEN_STRONG_INLINE Packet2d pfloor(
const Packet2d& a) {
2522 return __lsx_vfrintrm_d(a);
2526EIGEN_STRONG_INLINE Packet4f pceil(
const Packet4f& a) {
2527 return __lsx_vfrintrp_s(a);
2530EIGEN_STRONG_INLINE Packet2d pceil(
const Packet2d& a) {
2531 return __lsx_vfrintrp_d(a);
2535EIGEN_STRONG_INLINE Packet4f pround(
const Packet4f& a) {
2536 const Packet4f mask = pset1frombits<Packet4f>(
static_cast<numext::uint32_t
>(0x80000000u));
2537 const Packet4f prev0dot5 = pset1frombits<Packet4f>(
static_cast<numext::uint32_t
>(0x3EFFFFFFu));
2538 return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a));
2541EIGEN_STRONG_INLINE Packet2d pround(
const Packet2d& a) {
2542 const Packet2d mask = pset1frombits<Packet2d>(
static_cast<numext::uint64_t
>(0x8000000000000000ull));
2543 const Packet2d prev0dot5 = pset1frombits<Packet2d>(
static_cast<numext::uint64_t
>(0x3FDFFFFFFFFFFFFFull));
2544 return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a));
2548EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(
const Packet4f& mask,
const Packet4f& a,
const Packet4f& b) {
2549 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2552EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(
const Packet16c& mask,
const Packet16c& a,
const Packet16c& b) {
2553 return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2557EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(
const int8_t* from) {
2558 int8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2559 *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2560 *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2561 return __lsx_vld(tmp, 0);
2564EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(
const uint8_t* from) {
2565 uint8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2566 *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2567 *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2568 return __lsx_vld(tmp, 0);
2571EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(
const int16_t* from) {
2572 int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2573 return __lsx_vld(tmp, 0);
2576EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(
const uint16_t* from) {
2577 uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2578 return __lsx_vld(tmp, 0);
2581EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(
const int32_t* from) {
2582 int32_t tmp[4] = {*from, *from, *from, *from};
2583 return __lsx_vld(tmp, 0);
2586EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(
const uint32_t* from) {
2587 uint32_t tmp[4] = {*from, *from, *from, *from};
2588 return __lsx_vld(tmp, 0);
2592EIGEN_STRONG_INLINE Packet16c pnmsub(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
2593 return __lsx_vmsub_b(pnegate(c), a, b);
2596EIGEN_STRONG_INLINE Packet8s pnmsub(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
2597 return __lsx_vmsub_h(pnegate(c), a, b);
2600EIGEN_STRONG_INLINE Packet4i pnmsub(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
2601 return __lsx_vmsub_w(pnegate(c), a, b);
2604EIGEN_STRONG_INLINE Packet2l pnmsub(
const Packet2l& a,
const Packet2l& b,
const Packet2l& c) {
2605 return __lsx_vmsub_d(pnegate(c), a, b);
2609EIGEN_STRONG_INLINE Packet16c pmsub(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
2610 return __lsx_vmadd_b(pnegate(c), a, b);
2613EIGEN_STRONG_INLINE Packet8s pmsub(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
2614 return __lsx_vmadd_h(pnegate(c), a, b);
2617EIGEN_STRONG_INLINE Packet4i pmsub(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
2618 return __lsx_vmadd_w(pnegate(c), a, b);
2621EIGEN_STRONG_INLINE Packet2l pmsub(
const Packet2l& a,
const Packet2l& b,
const Packet2l& c) {
2622 return __lsx_vmadd_d(pnegate(c), a, b);
2626EIGEN_STRONG_INLINE Packet16c pnmadd(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
2627 return __lsx_vmsub_b(c, a, b);
2630EIGEN_STRONG_INLINE Packet8s pnmadd(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
2631 return __lsx_vmsub_h(c, a, b);
2634EIGEN_STRONG_INLINE Packet4i pnmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
2635 return __lsx_vmsub_w(c, a, b);
2638EIGEN_STRONG_INLINE Packet2l pnmadd(
const Packet2l& a,
const Packet2l& b,
const Packet2l& c) {
2639 return __lsx_vmsub_d(c, a, b);
2643EIGEN_STRONG_INLINE Packet4f pexp(
const Packet4f& _x) {
2644 return pexp_float(_x);
2647EIGEN_STRONG_INLINE Packet2d pexp(
const Packet2d& _x) {
2648 return pexp_double(_x);
2652EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(
const Packet4f& a,
const Packet4f& exponent) {
2653 return pldexp_generic(a, exponent);
2657EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(
const Packet2d& a, Packet2d& exponent) {
2658 return pfrexp_generic(a, exponent);
2661EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(
const Packet4f& a, Packet4f& exponent) {
2662 return pfrexp_generic(a, exponent);
2665EIGEN_STRONG_INLINE Packet4f pzero(
const Packet4f& ) {
2666 Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f};
2670EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
2671 Packet4f v = psub(a, b);
2675EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
2676 return pmin<Packet4f>(a, b);
2679EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
2680 return pmax<Packet4f>(a, b);
2683EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(
const float* from) {
2684 return (__m128)__lsx_vldrepl_w(from, 0);
2687EIGEN_STRONG_INLINE Packet4f psignbit(
const Packet4f& a) {
2688 return (__m128)__lsx_vsrai_w((__m128i)a, 31);
2691EIGEN_STRONG_INLINE Packet4f print<Packet4f>(
const Packet4f& a) {
2692 return __lsx_vfrintrne_s(a);
2695EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(
const Packet4f& a) {
2696 return __lsx_vfrintrz_s(a);
2699EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(
const Packet4f& a) {
2700 return __lsx_vfrecip_s(a);
2704EIGEN_STRONG_INLINE Packet2d pzero(
const Packet2d& ) {
2705 Packet2d v = {0.0, 0.0};
2709EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
2710 return pmin<Packet2d>(a, b);
2713EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
2714 return pmax<Packet2d>(a, b);
2717EIGEN_STRONG_INLINE Packet2d psignbit(
const Packet2d& a) {
2718 return (__m128d)(__lsx_vsrai_d((__m128i)a, 63));
2721EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(
const Packet2d& mask,
const Packet2d& a,
const Packet2d& b) {
2722 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2725EIGEN_STRONG_INLINE Packet2d print<Packet2d>(
const Packet2d& a) {
2726 return __lsx_vfrintrne_d(a);
2729EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(
const Packet2d& a) {
2730 return __lsx_vfrintrz_d(a);
2733EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(
const Packet2d& a,
const Packet2d& exponent) {
2734 return pldexp_generic(a, exponent);
2738EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
2739 Packet16c v = psub(a, b);
2744EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
2745 Packet8s v = psub(a, b);
2749EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(
const Packet8s& mask,
const Packet8s& a,
const Packet8s& b) {
2750 return __lsx_vbitsel_v(b, a, mask);
2754EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
2755 Packet4i v = psub(a, b);
2759EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(
const Packet4i& mask,
const Packet4i& a,
const Packet4i& b) {
2760 return __lsx_vbitsel_v(b, a, mask);
2764EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(
const Packet2l& mask,
const Packet2l& a,
const Packet2l& b) {
2765 return __lsx_vbitsel_v(b, a, mask);
2769EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
2770 return __lsx_vdiv_bu(a, b);
2773EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
2774 Packet16uc v = psub(a, b);
2778EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(
const Packet16uc& mask,
const Packet16uc& a,
2779 const Packet16uc& b) {
2780 return __lsx_vbitsel_v(b, a, mask);
2783EIGEN_STRONG_INLINE Packet16uc psqrt(
const Packet16uc& a) {
2784 __m128i res = {0, 0};
2785 __m128i add = {0x0808080808080808, 0x0808080808080808};
2786 for (
int i = 0; i < 4; i++) {
2787 const __m128i temp = __lsx_vor_v(res, add);
2788 const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp));
2789 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a));
2790 add = __lsx_vsrli_b(add, 1);
2796EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
2797 Packet8us v = psub(a, b);
2801EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(
const Packet8us& mask,
const Packet8us& a,
const Packet8us& b) {
2802 return __lsx_vbitsel_v(b, a, mask);
2805EIGEN_STRONG_INLINE Packet8us psqrt(
const Packet8us& a) {
2806 __m128i res = {0, 0};
2807 __m128i add = {0x0080008000800080, 0x0080008000800080};
2808 for (
int i = 0; i < 4; i++) {
2809 const __m128i temp = __lsx_vor_v(res, add);
2810 const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp));
2811 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a));
2812 add = __lsx_vsrli_h(add, 1);
2818EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
2819 Packet4ui v = psub(a, b);
2823EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(
const Packet4ui& mask,
const Packet4ui& a,
const Packet4ui& b) {
2824 return __lsx_vbitsel_v(b, a, mask);
2827EIGEN_STRONG_INLINE Packet4ui psqrt(
const Packet4ui& a) {
2828 __m128i res = {0, 0};
2829 __m128i add = {0x0000800000008000, 0x0000800000008000};
2830 for (
int i = 0; i < 4; i++) {
2831 const __m128i temp = __lsx_vor_v(res, add);
2832 const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp));
2833 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a));
2834 add = __lsx_vsrli_w(add, 1);
2840EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(
const Packet2ul& mask,
const Packet2ul& a,
const Packet2ul& b) {
2841 return __lsx_vbitsel_v(b, a, mask);
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82