11#ifndef EIGEN_PACKET_MATH_LSX_H
12#define EIGEN_PACKET_MATH_LSX_H
15#include "../../InternalHeaderCheck.h"
21#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
22#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
25#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
26#if EIGEN_ARCH_LOONGARCH64
27#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
31#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
32#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
35typedef __m128 Packet4f;
36typedef __m128d Packet2d;
38typedef eigen_packet_wrapper<__m128i, 0> Packet16c;
39typedef eigen_packet_wrapper<__m128i, 1> Packet8s;
40typedef eigen_packet_wrapper<__m128i, 2> Packet4i;
41typedef eigen_packet_wrapper<__m128i, 3> Packet2l;
42typedef eigen_packet_wrapper<__m128i, 4> Packet16uc;
43typedef eigen_packet_wrapper<__m128i, 5> Packet8us;
44typedef eigen_packet_wrapper<__m128i, 6> Packet4ui;
45typedef eigen_packet_wrapper<__m128i, 7> Packet2ul;
48struct is_arithmetic<__m128> {
49 enum { value =
true };
52struct is_arithmetic<__m128i> {
53 enum { value =
true };
56struct is_arithmetic<__m128d> {
57 enum { value =
true };
60struct is_arithmetic<Packet16c> {
61 enum { value =
true };
64struct is_arithmetic<Packet8s> {
65 enum { value =
true };
68struct is_arithmetic<Packet4i> {
69 enum { value =
true };
72struct is_arithmetic<Packet2l> {
73 enum { value =
true };
76struct is_arithmetic<Packet16uc> {
77 enum { value =
false };
80struct is_arithmetic<Packet8us> {
81 enum { value =
false };
84struct is_arithmetic<Packet4ui> {
85 enum { value =
false };
88struct is_arithmetic<Packet2ul> {
89 enum { value =
false };
92EIGEN_ALWAYS_INLINE Packet4f make_packet4f(
float a,
float b,
float c,
float d) {
93 float from[4] = {a, b, c, d};
94 return (Packet4f)__lsx_vld(from, 0);
97EIGEN_STRONG_INLINE Packet4f shuffle1(
const Packet4f& m,
int mask) {
98 const float* a =
reinterpret_cast<const float*
>(&m);
100 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
104template <
bool interleave>
105EIGEN_STRONG_INLINE Packet4f shuffle2(
const Packet4f& m,
const Packet4f& n,
int mask) {
106 const float* a =
reinterpret_cast<const float*
>(&m);
107 const float* b =
reinterpret_cast<const float*
>(&n);
109 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
114EIGEN_STRONG_INLINE Packet4f shuffle2<true>(
const Packet4f& m,
const Packet4f& n,
int mask) {
115 const float* a =
reinterpret_cast<const float*
>(&m);
116 const float* b =
reinterpret_cast<const float*
>(&n);
118 make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
// Packs four lane selectors into one shuffle mask: p occupies bits 0-1, q bits 2-3,
// r bits 4-5 and s bits 6-7.
// Each selector is clamped to its 2-bit field so that an out-of-range value cannot
// spill into the neighbouring selector; results for selectors in 0..3 are unchanged.
EIGEN_STRONG_INLINE
static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) {
  return ((s & 3) << 6) | ((r & 3) << 4) | ((q & 3) << 2) | (p & 3);
}
126EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(
const Packet4f& a,
int p,
int q,
int r,
int s) {
127 return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s));
129EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(
const Packet4f& a,
const Packet4f& b,
int p,
int q,
int r,
int s) {
130 return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(p, q, r, s));
132EIGEN_STRONG_INLINE Packet4f vec4f_movelh(
const Packet4f& a,
const Packet4f& b) {
133 return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1));
135EIGEN_STRONG_INLINE Packet4f vec4f_movehl(
const Packet4f& a,
const Packet4f& b) {
136 return shuffle2<false>(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3));
138EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(
const Packet4f& a,
const Packet4f& b) {
139 return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1));
141EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(
const Packet4f& a,
const Packet4f& b) {
142 return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3));
145EIGEN_ALWAYS_INLINE Packet2d make_packet2d(
double a,
double b) {
146 double from[2] = {a, b};
147 return (Packet2d)__lsx_vld(from, 0);
150EIGEN_STRONG_INLINE Packet2d shuffle(
const Packet2d& m,
const Packet2d& n,
int mask) {
151 const double* a =
reinterpret_cast<const double*
>(&m);
152 const double* b =
reinterpret_cast<const double*
>(&n);
153 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
157EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(
const Packet2d& a,
const Packet2d& b,
int mask) {
158 return shuffle(a, b, mask);
160EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(
const Packet2d& a,
const Packet2d& b) {
return shuffle(a, b, 0); }
161EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(
const Packet2d& a,
const Packet2d& b) {
return shuffle(a, b, 3); }
// packet_traits specializations: each maps a scalar type to its LSX packet type and
// derives from default_packet_traits. In every specialization visible here `type` and
// `half` name the same packet. Only these two typedefs of each struct are visible in
// this excerpt; any further members are not shown.

// int8_t -> Packet16c
164struct packet_traits<int8_t> : default_packet_traits {
165 typedef Packet16c type;
166 typedef Packet16c half;
// int16_t -> Packet8s
180struct packet_traits<int16_t> : default_packet_traits {
181 typedef Packet8s type;
182 typedef Packet8s half;
// int32_t -> Packet4i
197struct packet_traits<int32_t> : default_packet_traits {
198 typedef Packet4i type;
199 typedef Packet4i half;
// int64_t -> Packet2l
214struct packet_traits<int64_t> : default_packet_traits {
215 typedef Packet2l type;
216 typedef Packet2l half;
// uint8_t -> Packet16uc
231struct packet_traits<uint8_t> : default_packet_traits {
232 typedef Packet16uc type;
233 typedef Packet16uc half;
// uint16_t -> Packet8us
248struct packet_traits<uint16_t> : default_packet_traits {
249 typedef Packet8us type;
250 typedef Packet8us half;
// uint32_t -> Packet4ui
266struct packet_traits<uint32_t> : default_packet_traits {
267 typedef Packet4ui type;
268 typedef Packet4ui half;
// uint64_t -> Packet2ul
284struct packet_traits<uint64_t> : default_packet_traits {
285 typedef Packet2ul type;
286 typedef Packet2ul half;
// float -> Packet4f
302struct packet_traits<float> : default_packet_traits {
303 typedef Packet4f type;
304 typedef Packet4f half;
// double -> Packet2d
323struct packet_traits<double> : default_packet_traits {
324 typedef Packet2d type;
325 typedef Packet2d half;
// unpacket_traits specializations: the reverse mapping from a packet type to its
// properties. In every specialization visible here `half` is the packet itself and
// both masked_load_available and masked_store_available are false. Other members
// (including the scalar `type` typedef of Packet16c, Packet4f and Packet2d) are not
// visible in this excerpt.

// Packet16c
343struct unpacket_traits<Packet16c> {
345 typedef Packet16c half;
350 masked_load_available =
false,
351 masked_store_available =
false
// Packet8s: scalar type int16_t
355struct unpacket_traits<Packet8s> {
356 typedef int16_t type;
357 typedef Packet8s half;
362 masked_load_available =
false,
363 masked_store_available =
false
// Packet4i: scalar type int32_t
367struct unpacket_traits<Packet4i> {
368 typedef int32_t type;
369 typedef Packet4i half;
374 masked_load_available =
false,
375 masked_store_available =
false
// Packet2l: scalar type int64_t
379struct unpacket_traits<Packet2l> {
380 typedef int64_t type;
381 typedef Packet2l half;
386 masked_load_available =
false,
387 masked_store_available =
false
// Packet16uc: scalar type uint8_t
391struct unpacket_traits<Packet16uc> {
392 typedef uint8_t type;
393 typedef Packet16uc half;
398 masked_load_available =
false,
399 masked_store_available =
false
// Packet8us: scalar type uint16_t
403struct unpacket_traits<Packet8us> {
404 typedef uint16_t type;
405 typedef Packet8us half;
410 masked_load_available =
false,
411 masked_store_available =
false
// Packet4ui: scalar type uint32_t
415struct unpacket_traits<Packet4ui> {
416 typedef uint32_t type;
417 typedef Packet4ui half;
422 masked_load_available =
false,
423 masked_store_available =
false
// Packet2ul: scalar type uint64_t
427struct unpacket_traits<Packet2ul> {
428 typedef uint64_t type;
429 typedef Packet2ul half;
434 masked_load_available =
false,
435 masked_store_available =
false
// Packet4f: its integer_packet is Packet4i
439struct unpacket_traits<Packet4f> {
441 typedef Packet4f half;
442 typedef Packet4i integer_packet;
447 masked_load_available =
false,
448 masked_store_available =
false
// Packet2d: its integer_packet is Packet2l
452struct unpacket_traits<Packet2d> {
454 typedef Packet2d half;
455 typedef Packet2l integer_packet;
460 masked_load_available =
false,
461 masked_store_available =
false
466EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(
const int8_t& from) {
467 return __lsx_vreplgr2vr_b(from);
470EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(
const int16_t& from) {
471 return __lsx_vreplgr2vr_h(from);
474EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
475 return __lsx_vreplgr2vr_w(from);
478EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(
const int64_t& from) {
479 return __lsx_vreplgr2vr_d(from);
482EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(
const uint8_t& from) {
483 return __lsx_vreplgr2vr_b(from);
486EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(
const uint16_t& from) {
487 return __lsx_vreplgr2vr_h(from);
490EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(
const uint32_t& from) {
491 return __lsx_vreplgr2vr_w(from);
494EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(
const uint64_t& from) {
495 return __lsx_vreplgr2vr_d(from);
498EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
499 Packet4f v = {from, from, from, from};
503EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
504 Packet2d v = {from, from};
509EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
510 return reinterpret_cast<__m128
>((__m128i)pset1<Packet4ui>(from));
513EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
514 return reinterpret_cast<__m128d
>((__m128i)pset1<Packet2ul>(from));
518EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(
const int8_t& a) {
519 const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
520 return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(countdown, 0));
523EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(
const int16_t& a) {
524 const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
525 return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(countdown, 0));
528EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a) {
529 const int32_t countdown[] = {0, 1, 2, 3};
530 return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(countdown, 0));
533EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(
const int64_t& a) {
534 const int64_t countdown[] = {0, 1};
535 return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(countdown, 0));
538EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(
const uint8_t& a) {
539 const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
540 return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(countdown, 0));
543EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(
const uint16_t& a) {
544 const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
545 return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(countdown, 0));
548EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(
const uint32_t& a) {
549 const uint32_t countdown[] = {0, 1, 2, 3};
550 return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(countdown, 0));
553EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(
const uint64_t& a) {
554 const uint64_t countdown[] = {0, 1};
555 return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(countdown, 0));
558EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
559 static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
560 return __lsx_vfadd_s(pset1<Packet4f>(a), countdown);
563EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
564 static const Packet2d countdown = {0.0f, 1.0f};
565 return __lsx_vfadd_d(pset1<Packet2d>(a), countdown);
569EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
570 return __lsx_vadd_b(a, b);
573EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
574 return __lsx_vadd_h(a, b);
577EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
578 return __lsx_vadd_w(a, b);
581EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
582 return __lsx_vadd_d(a, b);
585EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
586 return __lsx_vadd_b(a, b);
589EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
590 return __lsx_vadd_h(a, b);
593EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
594 return __lsx_vadd_w(a, b);
597EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
598 return __lsx_vadd_d(a, b);
601EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
602 return __lsx_vfadd_s(a, b);
605EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
606 return __lsx_vfadd_d(a, b);
610EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
611 return __lsx_vsub_b(a, b);
614EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
615 return __lsx_vsub_h(a, b);
618EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
619 return __lsx_vsub_w(a, b);
622EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
623 return __lsx_vsub_d(a, b);
626EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
627 return __lsx_vsub_b(a, b);
630EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
631 return __lsx_vsub_h(a, b);
634EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
635 return __lsx_vsub_w(a, b);
638EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
639 return __lsx_vsub_d(a, b);
642EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
643 return __lsx_vfsub_s(a, b);
646EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
647 return __lsx_vfsub_d(a, b);
651EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b);
653EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
654 const Packet4f mask =
655 make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
656 return padd(a, pxor(mask, b));
659EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b);
661EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
662 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
663 return padd(a, pxor(mask, b));
667EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
668 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000),
669 numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000));
670 return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
673EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
675 make_packet2d(numext::bit_cast<double>(0x8000000000000000), numext::bit_cast<double>(0x8000000000000000));
676 return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
679EIGEN_STRONG_INLINE Packet16c pnegate(
const Packet16c& a) {
680 return __lsx_vneg_b(a);
683EIGEN_STRONG_INLINE Packet8s pnegate(
const Packet8s& a) {
684 return __lsx_vneg_h(a);
687EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
688 return __lsx_vneg_w(a);
691EIGEN_STRONG_INLINE Packet2l pnegate(
const Packet2l& a) {
692 return __lsx_vneg_d(a);
696EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
700EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
704EIGEN_STRONG_INLINE Packet16c pconj(
const Packet16c& a) {
708EIGEN_STRONG_INLINE Packet8s pconj(
const Packet8s& a) {
712EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
716EIGEN_STRONG_INLINE Packet2l pconj(
const Packet2l& a) {
720EIGEN_STRONG_INLINE Packet16uc pconj(
const Packet16uc& a) {
724EIGEN_STRONG_INLINE Packet8us pconj(
const Packet8us& a) {
728EIGEN_STRONG_INLINE Packet4ui pconj(
const Packet4ui& a) {
732EIGEN_STRONG_INLINE Packet2ul pconj(
const Packet2ul& a) {
737EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
738 return __lsx_vfmul_s(a, b);
741EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
742 return __lsx_vfmul_d(a, b);
745EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
746 return __lsx_vmul_b(a, b);
749EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
750 return __lsx_vmul_h(a, b);
753EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
754 return __lsx_vmul_w(a, b);
757EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
758 return __lsx_vmul_d(a, b);
761EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
762 return __lsx_vmul_b(a, b);
765EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
766 return __lsx_vmul_h(a, b);
769EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
770 return __lsx_vmul_w(a, b);
773EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
774 return __lsx_vmul_d(a, b);
778EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
779 return __lsx_vfdiv_s(a, b);
782EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
783 return __lsx_vfdiv_d(a, b);
786EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
787 return __lsx_vdiv_h(a, b);
790EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
791 return __lsx_vdiv_w(a, b);
794EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
795 return __lsx_vdiv_d(a, b);
798EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
799 return __lsx_vdiv_hu(a, b);
802EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
803 return __lsx_vdiv_wu(a, b);
806EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
807 return __lsx_vdiv_du(a, b);
811EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
812 return __lsx_vfmadd_s(a, b, c);
815EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
816 return __lsx_vfmadd_d(a, b, c);
819EIGEN_STRONG_INLINE Packet4f pmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
820 return __lsx_vfmsub_s(a, b, c);
823EIGEN_STRONG_INLINE Packet2d pmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
824 return __lsx_vfmsub_d(a, b, c);
827EIGEN_STRONG_INLINE Packet4f pnmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
828 return __lsx_vfnmsub_s(a, b, c);
831EIGEN_STRONG_INLINE Packet2d pnmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
832 return __lsx_vfnmsub_d(a, b, c);
835EIGEN_STRONG_INLINE Packet4f pnmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
836 return __lsx_vfnmadd_s(a, b, c);
839EIGEN_STRONG_INLINE Packet2d pnmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
840 return __lsx_vfnmadd_d(a, b, c);
843EIGEN_STRONG_INLINE Packet16c pmadd(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
844 return __lsx_vmadd_b(c, a, b);
847EIGEN_STRONG_INLINE Packet8s pmadd(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
848 return __lsx_vmadd_h(c, a, b);
851EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
852 return __lsx_vmadd_w(c, a, b);
855EIGEN_STRONG_INLINE Packet2l pmadd(
const Packet2l& a,
const Packet2l& b,
const Packet2l& c) {
856 return __lsx_vmadd_d(c, a, b);
859EIGEN_STRONG_INLINE Packet16uc pmadd(
const Packet16uc& a,
const Packet16uc& b,
const Packet16uc& c) {
860 return __lsx_vmadd_b(c, a, b);
863EIGEN_STRONG_INLINE Packet8us pmadd(
const Packet8us& a,
const Packet8us& b,
const Packet8us& c) {
864 return __lsx_vmadd_h(c, a, b);
867EIGEN_STRONG_INLINE Packet4ui pmadd(
const Packet4ui& a,
const Packet4ui& b,
const Packet4ui& c) {
868 return __lsx_vmadd_w(c, a, b);
871EIGEN_STRONG_INLINE Packet2ul pmadd(
const Packet2ul& a,
const Packet2ul& b,
const Packet2ul& c) {
872 return __lsx_vmadd_d(c, a, b);
876EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
877 return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b);
880EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
881 return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b);
884EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
885 return __lsx_vand_v(a, b);
888EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
889 return __lsx_vand_v(a, b);
892EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
893 return __lsx_vand_v(a, b);
896EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
897 return __lsx_vand_v(a, b);
900EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
901 return __lsx_vand_v(a, b);
904EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
905 return __lsx_vand_v(a, b);
908EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
909 return __lsx_vand_v(a, b);
912EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
913 return __lsx_vand_v(a, b);
917EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
918 return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b);
921EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
922 return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b);
925EIGEN_STRONG_INLINE Packet16c por<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
926 return __lsx_vor_v(a, b);
929EIGEN_STRONG_INLINE Packet8s por<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
930 return __lsx_vor_v(a, b);
933EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
934 return __lsx_vor_v(a, b);
937EIGEN_STRONG_INLINE Packet2l por<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
938 return __lsx_vor_v(a, b);
941EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
942 return __lsx_vor_v(a, b);
945EIGEN_STRONG_INLINE Packet8us por<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
946 return __lsx_vor_v(a, b);
949EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
950 return __lsx_vor_v(a, b);
953EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
954 return __lsx_vor_v(a, b);
958EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
959 return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b);
962EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
963 return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b);
966EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
967 return __lsx_vxor_v(a, b);
970EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
971 return __lsx_vxor_v(a, b);
974EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
975 return __lsx_vxor_v(a, b);
978EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
979 return __lsx_vxor_v(a, b);
982EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
983 return __lsx_vxor_v(a, b);
986EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
987 return __lsx_vxor_v(a, b);
990EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
991 return __lsx_vxor_v(a, b);
994EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
995 return __lsx_vxor_v(a, b);
999EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1000 return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a);
1003EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1004 return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a);
1007EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1008 return __lsx_vandn_v(b, a);
1011EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1012 return __lsx_vandn_v(b, a);
1015EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1016 return __lsx_vandn_v(b, a);
1019EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1020 return __lsx_vandn_v(b, a);
1023EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1024 return __lsx_vandn_v(b, a);
1027EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1028 return __lsx_vandn_v(b, a);
1031EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1032 return __lsx_vandn_v(b, a);
1035EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1036 return __lsx_vandn_v(b, a);
1040EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1041 return (Packet4f)__lsx_vfcmp_cle_s(a, b);
1044EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1045 return (Packet2d)__lsx_vfcmp_cle_d(a, b);
1048EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1049 return __lsx_vsle_b(a, b);
1052EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1053 return __lsx_vsle_h(a, b);
1056EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1057 return __lsx_vsle_w(a, b);
1060EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1061 return __lsx_vsle_d(a, b);
1064EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1065 return __lsx_vsle_bu(a, b);
1068EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1069 return __lsx_vsle_hu(a, b);
1072EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1073 return __lsx_vsle_wu(a, b);
1076EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1077 return __lsx_vsle_du(a, b);
1081EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1082 return (Packet4f)__lsx_vfcmp_clt_s(a, b);
1085EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1086 return (Packet2d)__lsx_vfcmp_clt_d(a, b);
1089EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1090 return __lsx_vslt_b(a, b);
1093EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1094 return __lsx_vslt_h(a, b);
1097EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1098 return __lsx_vslt_w(a, b);
1101EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1102 return __lsx_vslt_d(a, b);
1105EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1106 return __lsx_vslt_bu(a, b);
1109EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1110 return __lsx_vslt_hu(a, b);
1113EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1114 return __lsx_vslt_wu(a, b);
1117EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1118 return __lsx_vslt_du(a, b);
1122EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1123 return (Packet4f)__lsx_vfcmp_sult_s(a, b);
1126EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1127 return (Packet2d)__lsx_vfcmp_sult_d(a, b);
1131EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1132 return (Packet4f)__lsx_vfcmp_seq_s(a, b);
1135EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1136 return (Packet2d)__lsx_vfcmp_seq_d(a, b);
1139EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1140 return __lsx_vseq_b(a, b);
1143EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1144 return __lsx_vseq_h(a, b);
1147EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1148 return __lsx_vseq_w(a, b);
1151EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1152 return __lsx_vseq_d(a, b);
1155EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1156 return __lsx_vseq_b(a, b);
1159EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1160 return __lsx_vseq_h(a, b);
1163EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1164 return __lsx_vseq_w(a, b);
1167EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1168 return __lsx_vseq_d(a, b);
1172EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1173 return __lsx_vmin_b(a, b);
1176EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1177 return __lsx_vmin_h(a, b);
1180EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1181 return __lsx_vmin_w(a, b);
1184EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1185 return __lsx_vmin_d(a, b);
1188EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1189 return __lsx_vmin_bu(a, b);
1192EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1193 return __lsx_vmin_hu(a, b);
1196EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1197 return __lsx_vmin_wu(a, b);
1200EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1201 return __lsx_vmin_du(a, b);
1205EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
1206 return __lsx_vmax_b(a, b);
1209EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
1210 return __lsx_vmax_h(a, b);
1213EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1214 return __lsx_vmax_w(a, b);
1217EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1218 return __lsx_vmax_d(a, b);
1221EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
1222 return __lsx_vmax_bu(a, b);
1225EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
1226 return __lsx_vmax_hu(a, b);
1229EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1230 return __lsx_vmax_wu(a, b);
1233EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(
const Packet2ul& a,
const Packet2ul& b) {
1234 return __lsx_vmax_du(a, b);
1238EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1239 Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1240 Packet4i aMinOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), aNaN);
1241 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1244EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1245 Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1246 Packet2l aMinOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), aNaN);
1247 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1250EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1251 Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1252 Packet4i aMaxOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), aNaN);
1253 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1256EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1257 Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1258 Packet2l aMaxOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), aNaN);
1259 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
// parithmetic_shift_right: shifts every lane of `a` right by N.
// Signed packets use the __lsx_vsrai_{b,h,w,d} intrinsics; unsigned packets use the
// __lsx_vsrli_{b,h,w,d} intrinsics (the same ones plogical_shift_right uses).
// NOTE(review): N is not declared in the visible text — presumably a template
// parameter (`template <int N>`) on a line missing from this excerpt; confirm.

// Signed 8-bit lanes.
1263EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(
const Packet16c& a) {
1264 return __lsx_vsrai_b((__m128i)a, N);
// Signed 16-bit lanes.
1267EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(
const Packet8s& a) {
1268 return __lsx_vsrai_h((__m128i)a, N);
// Signed 32-bit lanes.
1271EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(
const Packet4i& a) {
1272 return __lsx_vsrai_w((__m128i)a, N);
// Signed 64-bit lanes.
1275EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(
const Packet2l& a) {
1276 return __lsx_vsrai_d((__m128i)a, N);
// Unsigned 8-bit lanes: vsrli rather than vsrai.
1279EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(
const Packet16uc& a) {
1280 return __lsx_vsrli_b((__m128i)a, N);
// Unsigned 16-bit lanes.
1283EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(
const Packet8us& a) {
1284 return __lsx_vsrli_h((__m128i)a, N);
// Unsigned 32-bit lanes.
1287EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(
const Packet4ui& a) {
1288 return __lsx_vsrli_w((__m128i)a, N);
// Unsigned 64-bit lanes.
1291EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(
const Packet2ul& a) {
1292 return __lsx_vsrli_d((__m128i)a, N);
1296EIGEN_STRONG_INLINE Packet16c plogical_shift_right(
const Packet16c& a) {
1297 return __lsx_vsrli_b((__m128i)a, N);
1300EIGEN_STRONG_INLINE Packet8s plogical_shift_right(
const Packet8s& a) {
1301 return __lsx_vsrli_h((__m128i)a, N);
1304EIGEN_STRONG_INLINE Packet4i plogical_shift_right(
const Packet4i& a) {
1305 return __lsx_vsrli_w((__m128i)a, N);
1308EIGEN_STRONG_INLINE Packet2l plogical_shift_right(
const Packet2l& a) {
1309 return __lsx_vsrli_d((__m128i)a, N);
1312EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(
const Packet16uc& a) {
1313 return __lsx_vsrli_b((__m128i)a, N);
1316EIGEN_STRONG_INLINE Packet8us plogical_shift_right(
const Packet8us& a) {
1317 return __lsx_vsrli_h((__m128i)a, N);
1320EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(
const Packet4ui& a) {
1321 return __lsx_vsrli_w((__m128i)a, N);
1324EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(
const Packet2ul& a) {
1325 return __lsx_vsrli_d((__m128i)a, N);
1329EIGEN_STRONG_INLINE Packet16c plogical_shift_left(
const Packet16c& a) {
1330 return __lsx_vslli_b((__m128i)a, N);
1333EIGEN_STRONG_INLINE Packet8s plogical_shift_left(
const Packet8s& a) {
1334 return __lsx_vslli_h((__m128i)a, N);
1337EIGEN_STRONG_INLINE Packet4i plogical_shift_left(
const Packet4i& a) {
1338 return __lsx_vslli_w((__m128i)a, N);
1341EIGEN_STRONG_INLINE Packet2l plogical_shift_left(
const Packet2l& a) {
1342 return __lsx_vslli_d((__m128i)a, N);
1345EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(
const Packet16uc& a) {
1346 return __lsx_vslli_b((__m128i)a, N);
1349EIGEN_STRONG_INLINE Packet8us plogical_shift_left(
const Packet8us& a) {
1350 return __lsx_vslli_h((__m128i)a, N);
1353EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(
const Packet4ui& a) {
1354 return __lsx_vslli_w((__m128i)a, N);
1357EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(
const Packet2ul& a) {
1358 return __lsx_vslli_d((__m128i)a, N);
1362EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
1363 return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31);
1366EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
1367 return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63);
1370EIGEN_STRONG_INLINE Packet16c pabs(
const Packet16c& a) {
1371 return __lsx_vabsd_b(a, pzero(a));
1374EIGEN_STRONG_INLINE Packet8s pabs(
const Packet8s& a) {
1375 return __lsx_vabsd_h(a, pzero(a));
1378EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
1379 return __lsx_vabsd_w(a, pzero(a));
1382EIGEN_STRONG_INLINE Packet2l pabs(
const Packet2l& a) {
1383 return __lsx_vabsd_d(a, pzero(a));
1386EIGEN_STRONG_INLINE Packet16uc pabs(
const Packet16uc& a) {
1390EIGEN_STRONG_INLINE Packet8us pabs(
const Packet8us& a) {
1394EIGEN_STRONG_INLINE Packet4ui pabs(
const Packet4ui& a) {
1398EIGEN_STRONG_INLINE Packet2ul pabs(
const Packet2ul& a) {
1403EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
1404 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet4f)__lsx_vld(from, 0);
1407EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
1408 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet2d)__lsx_vld(from, 0);
1411EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(
const int8_t* from) {
1412 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1415EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(
const int16_t* from) {
1416 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1419EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from) {
1420 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1423EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(
const int64_t* from) {
1424 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1427EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(
const uint8_t* from) {
1428 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1431EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(
const uint16_t* from) {
1432 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1435EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(
const uint32_t* from) {
1436 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1439EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(
const uint64_t* from) {
1440 EIGEN_DEBUG_ALIGNED_LOAD
return __lsx_vld(from, 0);
1444EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
1445 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4f)__lsx_vld(from, 0);
1448EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
1449 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d)__lsx_vld(from, 0);
1452EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(
const int8_t* from) {
1453 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1456EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(
const int16_t* from) {
1457 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1460EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from) {
1461 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1464EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(
const int64_t* from) {
1465 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1468EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(
const uint8_t* from) {
1469 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1472EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(
const uint16_t* from) {
1473 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1476EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(
const uint32_t* from) {
1477 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1480EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(
const uint64_t* from) {
1481 EIGEN_DEBUG_UNALIGNED_LOAD
return __lsx_vld(from, 0);
1485EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
1486 float f0 = from[0], f1 = from[1];
1487 return make_packet4f(f0, f0, f1, f1);
1490EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
1491 return pset1<Packet2d>(from[0]);
1494EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(
const int8_t* from) {
1495 Packet16c tmp = pload<Packet16c>(from);
1496 return __lsx_vilvl_b(tmp, tmp);
1499EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(
const int16_t* from) {
1500 Packet8s tmp = pload<Packet8s>(from);
1501 return __lsx_vilvl_h(tmp, tmp);
1504EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from) {
1505 Packet4i tmp = pload<Packet4i>(from);
1506 return __lsx_vilvl_w(tmp, tmp);
1509EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(
const int64_t* from) {
1510 return pset1<Packet2l>(from[0]);
1513EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(
const uint8_t* from) {
1514 Packet16uc tmp = pload<Packet16uc>(from);
1515 return __lsx_vilvl_b(tmp, tmp);
1518EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(
const uint16_t* from) {
1519 Packet8us tmp = pload<Packet8us>(from);
1520 return __lsx_vilvl_h(tmp, tmp);
1523EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(
const uint32_t* from) {
1524 Packet4ui tmp = pload<Packet4ui>(from);
1525 return __lsx_vilvl_w(tmp, tmp);
1528EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(
const uint64_t* from) {
1529 return pset1<Packet2ul>(from[0]);
1533EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
1534 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1537EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
1538 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1541EIGEN_STRONG_INLINE
void pstore<int8_t>(int8_t* to,
const Packet16c& from) {
1542 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1545EIGEN_STRONG_INLINE
void pstore<int16_t>(int16_t* to,
const Packet8s& from) {
1546 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1549EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from) {
1550 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1553EIGEN_STRONG_INLINE
void pstore<int64_t>(int64_t* to,
const Packet2l& from) {
1554 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1557EIGEN_STRONG_INLINE
void pstore<uint8_t>(uint8_t* to,
const Packet16uc& from) {
1558 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1561EIGEN_STRONG_INLINE
void pstore<uint16_t>(uint16_t* to,
const Packet8us& from) {
1562 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1565EIGEN_STRONG_INLINE
void pstore<uint32_t>(uint32_t* to,
const Packet4ui& from) {
1566 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1569EIGEN_STRONG_INLINE
void pstore<uint64_t>(uint64_t* to,
const Packet2ul& from) {
1570 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1574EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
1575 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1578EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
1579 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1583EIGEN_STRONG_INLINE
void pstoreu<int8_t>(int8_t* to,
const Packet16c& from) {
1584 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1587EIGEN_STRONG_INLINE
void pstoreu<int16_t>(int16_t* to,
const Packet8s& from) {
1588 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1591EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from) {
1592 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1595EIGEN_STRONG_INLINE
void pstoreu<int64_t>(int64_t* to,
const Packet2l& from) {
1596 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1599EIGEN_STRONG_INLINE
void pstoreu<uint8_t>(uint8_t* to,
const Packet16uc& from) {
1600 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1603EIGEN_STRONG_INLINE
void pstoreu<uint16_t>(uint16_t* to,
const Packet8us& from) {
1604 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1607EIGEN_STRONG_INLINE
void pstoreu<uint32_t>(uint32_t* to,
const Packet4ui& from) {
1608 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1611EIGEN_STRONG_INLINE
void pstoreu<uint64_t>(uint64_t* to,
const Packet2ul& from) {
1612 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1616EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
1617 Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]};
1621EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
1622 Packet2d v = {from[0], from[stride]};
1626EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(
const int8_t* from,
Index stride) {
1627 int8_t v[16] __attribute__((aligned(16)));
1629 v[1] = from[stride];
1630 v[2] = from[2 * stride];
1631 v[3] = from[3 * stride];
1632 v[4] = from[4 * stride];
1633 v[5] = from[5 * stride];
1634 v[6] = from[6 * stride];
1635 v[7] = from[7 * stride];
1636 v[8] = from[8 * stride];
1637 v[9] = from[9 * stride];
1638 v[10] = from[10 * stride];
1639 v[11] = from[11 * stride];
1640 v[12] = from[12 * stride];
1641 v[13] = from[13 * stride];
1642 v[14] = from[14 * stride];
1643 v[15] = from[15 * stride];
1644 return __lsx_vld(v, 0);
1647EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(
const int16_t* from,
Index stride) {
1648 int16_t v[8] __attribute__((aligned(16)));
1650 v[1] = from[stride];
1651 v[2] = from[2 * stride];
1652 v[3] = from[3 * stride];
1653 v[4] = from[4 * stride];
1654 v[5] = from[5 * stride];
1655 v[6] = from[6 * stride];
1656 v[7] = from[7 * stride];
1657 return __lsx_vld(v, 0);
1660EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(
const int32_t* from,
Index stride) {
1661 int32_t v[4] __attribute__((aligned(16)));
1663 v[1] = from[stride];
1664 v[2] = from[2 * stride];
1665 v[3] = from[3 * stride];
1666 return __lsx_vld(v, 0);
1669EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(
const int64_t* from,
Index stride) {
1670 int64_t v[2] __attribute__((aligned(16)));
1672 v[1] = from[stride];
1673 return __lsx_vld(v, 0);
1676EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(
const uint8_t* from,
Index stride) {
1677 uint8_t v[16] __attribute__((aligned(16)));
1679 v[1] = from[stride];
1680 v[2] = from[2 * stride];
1681 v[3] = from[3 * stride];
1682 v[4] = from[4 * stride];
1683 v[5] = from[5 * stride];
1684 v[6] = from[6 * stride];
1685 v[7] = from[7 * stride];
1686 v[8] = from[8 * stride];
1687 v[9] = from[9 * stride];
1688 v[10] = from[10 * stride];
1689 v[11] = from[11 * stride];
1690 v[12] = from[12 * stride];
1691 v[13] = from[13 * stride];
1692 v[14] = from[14 * stride];
1693 v[15] = from[15 * stride];
1694 return __lsx_vld(v, 0);
1697EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(
const uint16_t* from,
Index stride) {
1698 uint16_t v[8] __attribute__((aligned(16)));
1700 v[1] = from[stride];
1701 v[2] = from[2 * stride];
1702 v[3] = from[3 * stride];
1703 v[4] = from[4 * stride];
1704 v[5] = from[5 * stride];
1705 v[6] = from[6 * stride];
1706 v[7] = from[7 * stride];
1707 return __lsx_vld(v, 0);
1710EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(
const uint32_t* from,
Index stride) {
1711 uint32_t v[4] __attribute__((aligned(16)));
1713 v[1] = from[stride];
1714 v[2] = from[2 * stride];
1715 v[3] = from[3 * stride];
1716 return __lsx_vld(v, 0);
1719EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(
const uint64_t* from,
Index stride) {
1720 uint64_t v[2] __attribute__((aligned(16)));
1722 v[1] = from[stride];
1723 return __lsx_vld(v, 0);
1727EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
1728 __lsx_vstelm_w(from, to, 0, 0);
1729 __lsx_vstelm_w(from, to + stride * 1, 0, 1);
1730 __lsx_vstelm_w(from, to + stride * 2, 0, 2);
1731 __lsx_vstelm_w(from, to + stride * 3, 0, 3);
1734EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride) {
1735 __lsx_vstelm_d(from, to, 0, 0);
1736 __lsx_vstelm_d(from, to + stride, 0, 1);
1739EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int8_t, Packet16c>(int8_t* to,
const Packet16c& from,
1741 __lsx_vstelm_b((__m128i)from, to, 0, 0);
1742 __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1743 __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1744 __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1745 __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1746 __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1747 __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1748 __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1749 __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1750 __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1751 __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1752 __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1753 __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1754 __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1755 __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1756 __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1759EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int16_t, Packet8s>(int16_t* to,
const Packet8s& from,
1761 __lsx_vstelm_h((__m128i)from, to, 0, 0);
1762 __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1763 __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1764 __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1765 __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1766 __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1767 __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1768 __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1771EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from,
1773 __lsx_vstelm_w((__m128i)from, to, 0, 0);
1774 __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1775 __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1776 __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1779EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<int64_t, Packet2l>(int64_t* to,
const Packet2l& from,
1781 __lsx_vstelm_d((__m128i)from, to, 0, 0);
1782 __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1785EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint8_t, Packet16uc>(uint8_t* to,
const Packet16uc& from,
1787 __lsx_vstelm_b((__m128i)from, to, 0, 0);
1788 __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1789 __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1790 __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1791 __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1792 __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1793 __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1794 __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1795 __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1796 __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1797 __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1798 __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1799 __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1800 __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1801 __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1802 __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1805EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint16_t, Packet8us>(uint16_t* to,
const Packet8us& from,
1807 __lsx_vstelm_h((__m128i)from, to, 0, 0);
1808 __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1809 __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1810 __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1811 __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1812 __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1813 __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1814 __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1817EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint32_t, Packet4ui>(uint32_t* to,
const Packet4ui& from,
1819 __lsx_vstelm_w((__m128i)from, to, 0, 0);
1820 __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1821 __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1822 __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1825EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<uint64_t, Packet2ul>(uint64_t* to,
const Packet2ul& from,
1827 __lsx_vstelm_d((__m128i)from, to, 0, 0);
1828 __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1832EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
1833 __builtin_prefetch(addr);
1836EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
1837 __builtin_prefetch(addr);
1840EIGEN_STRONG_INLINE
void prefetch<int8_t>(
const int8_t* addr) {
1841 __builtin_prefetch(addr);
1844EIGEN_STRONG_INLINE
void prefetch<int16_t>(
const int16_t* addr) {
1845 __builtin_prefetch(addr);
1848EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) {
1849 __builtin_prefetch(addr);
1852EIGEN_STRONG_INLINE
void prefetch<int64_t>(
const int64_t* addr) {
1853 __builtin_prefetch(addr);
1856EIGEN_STRONG_INLINE
void prefetch<uint8_t>(
const uint8_t* addr) {
1857 __builtin_prefetch(addr);
1860EIGEN_STRONG_INLINE
void prefetch<uint16_t>(
const uint16_t* addr) {
1861 __builtin_prefetch(addr);
1864EIGEN_STRONG_INLINE
void prefetch<uint32_t>(
const uint32_t* addr) {
1865 __builtin_prefetch(addr);
1868EIGEN_STRONG_INLINE
void prefetch<uint64_t>(
const uint64_t* addr) {
1869 __builtin_prefetch(addr);
1873EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
1875 __lsx_vstelm_w(a, &v, 0, 0);
1879EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
1881 __lsx_vstelm_d(a, &v, 0, 0);
1886EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(
const Packet16c& a) {
1887 return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0);
1890EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(
const Packet8s& a) {
1891 return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0);
1894EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) {
1895 return __lsx_vpickve2gr_w((__m128i)a, 0);
1898EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(
const Packet2l& a) {
1899 return __lsx_vpickve2gr_d((__m128i)a, 0);
1902EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(
const Packet16uc& a) {
1903 return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0);
1906EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(
const Packet8us& a) {
1907 return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0);
1910EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(
const Packet4ui& a) {
1911 return __lsx_vpickve2gr_wu((__m128i)a, 0);
1914EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(
const Packet2ul& a) {
1915 return __lsx_vpickve2gr_du((__m128i)a, 0);
1919EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
1920 return (Packet4f)__lsx_vshuf4i_w(a, 0x1B);
1923EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
1924 return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1);
1927EIGEN_STRONG_INLINE Packet16c preverse(
const Packet16c& a) {
1928 return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1931EIGEN_STRONG_INLINE Packet8s preverse(
const Packet8s& a) {
1932 return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1935EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
1936 return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1939EIGEN_STRONG_INLINE Packet2l preverse(
const Packet2l& a) {
1940 return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1943EIGEN_STRONG_INLINE Packet16uc preverse(
const Packet16uc& a) {
1944 return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1947EIGEN_STRONG_INLINE Packet8us preverse(
const Packet8us& a) {
1948 return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1951EIGEN_STRONG_INLINE Packet4ui preverse(
const Packet4ui& a) {
1952 return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1955EIGEN_STRONG_INLINE Packet2ul preverse(
const Packet2ul& a) {
1956 return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1960EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
1961 Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
1962 return pfirst<Packet4f>(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
1965EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
1966 return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a)));
1969EIGEN_STRONG_INLINE int8_t predux<Packet16c>(
const Packet16c& a) {
1970 Packet8s tmp1 = __lsx_vhaddw_h_b(a, a);
1971 Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1);
1972 Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2);
1973 return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0);
1976EIGEN_STRONG_INLINE int16_t predux<Packet8s>(
const Packet8s& a) {
1977 Packet4i tmp1 = __lsx_vhaddw_w_h(a, a);
1978 Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1);
1979 return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0);
1982EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a) {
1983 Packet2l tmp = __lsx_vhaddw_d_w(a, a);
1984 return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0);
1987EIGEN_STRONG_INLINE int64_t predux<Packet2l>(
const Packet2l& a) {
1988 return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0);
1991EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(
const Packet16uc& a) {
1992 Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a);
1993 Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1);
1994 Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2);
1995 return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0);
1998EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(
const Packet8us& a) {
1999 Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a);
2000 Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1);
2001 return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0);
2004EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(
const Packet4ui& a) {
2005 Packet2ul tmp = __lsx_vhaddw_du_wu(a, a);
2006 return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
2009EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(
const Packet2ul& a) {
2010 return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0);
2014EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
2015 Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
2016 return pfirst<Packet4f>(__lsx_vfmul_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
2019EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
2020 return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a)));
2023EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(
const Packet16c& a) {
2024 Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a));
2025 Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2026 Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2027 return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2030EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(
const Packet8s& a) {
2031 Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a));
2032 Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2033 return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2036EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a) {
2037 Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a));
2038 return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2041EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(
const Packet2l& a) {
2042 return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0);
2045EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(
const Packet16uc& a) {
2046 Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a));
2047 Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2048 Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2049 return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2052EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(
const Packet8us& a) {
2053 Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a));
2054 Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2055 return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2058EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(
const Packet4ui& a) {
2059 Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a));
2060 return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2063EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(
const Packet2ul& a) {
2064 return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0);
2068EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
2069 Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2070 return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2073EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
2074 return pfirst(__lsx_vfmin_d(a, preverse(a)));
2077EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(
const Packet16c& a) {
2078 Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2079 Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2080 Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2081 return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2084EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(
const Packet8s& a) {
2085 Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2086 Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2087 return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2090EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a) {
2091 Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2092 return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2095EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(
const Packet2l& a) {
2096 return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a)));
2099EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(
const Packet16uc& a) {
2100 Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2101 Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2102 Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2103 return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2106EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(
const Packet8us& a) {
2107 Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2108 Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2109 return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2112EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(
const Packet4ui& a) {
2113 Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2114 return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2117EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(
const Packet2ul& a) {
2118 return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a)));
2122EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
2123 Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2124 return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2127EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
2128 return pfirst(__lsx_vfmax_d(a, preverse(a)));
2131EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(
const Packet16c& a) {
2132 Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2133 Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2134 Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2135 return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2138EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(
const Packet8s& a) {
2139 Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2140 Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2141 return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2144EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a) {
2145 Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2146 return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2149EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(
const Packet2l& a) {
2150 return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a)));
2153EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(
const Packet16uc& a) {
2154 Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2155 Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2156 Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2157 return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2160EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(
const Packet8us& a) {
2161 Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2162 Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2163 return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2166EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(
const Packet4ui& a) {
2167 Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2168 return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2171EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(
const Packet2ul& a) {
2172 return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a)));
2176EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
2177 return __lsx_vfsqrt_s(a);
2180EIGEN_STRONG_INLINE Packet2d psqrt(
const Packet2d& a) {
2181 return __lsx_vfsqrt_d(a);
2184EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
2185 Packet4f T0 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2186 Packet4f T1 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2187 Packet4f T2 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2188 Packet4f T3 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2190 kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)T2, (__m128i)T0);
2191 kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)T2, (__m128i)T0);
2192 kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)T3, (__m128i)T1);
2193 kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)T3, (__m128i)T1);
2195EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
2196 Packet2d tmp = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2197 kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2198 kernel.packet[1] = tmp;
// In-place ptranspose of a 16-packet Packet16c block. The work is done in four
// rounds of paired low/high interleaves whose element width doubles each round
// (byte -> halfword -> word -> doubleword); the last round stores straight back
// into kernel.packet[0..15].
2200EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
// Round 1: byte interleave of each packet pair (2k, 2k+1).
2201 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2202 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2203 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2204 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2205 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2206 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2207 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2208 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2209 __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2210 __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2211 __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2212 __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2213 __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2214 __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2215 __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2216 __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
// Round 2: halfword interleave; within each group of four t-values the low
// results (t0,t2) and the high results (t1,t3) are paired separately.
2218 __m128i s0 = __lsx_vilvl_h(t2, t0);
2219 __m128i s1 = __lsx_vilvh_h(t2, t0);
2220 __m128i s2 = __lsx_vilvl_h(t3, t1);
2221 __m128i s3 = __lsx_vilvh_h(t3, t1);
2222 __m128i s4 = __lsx_vilvl_h(t6, t4);
2223 __m128i s5 = __lsx_vilvh_h(t6, t4);
2224 __m128i s6 = __lsx_vilvl_h(t7, t5);
2225 __m128i s7 = __lsx_vilvh_h(t7, t5);
2226 __m128i s8 = __lsx_vilvl_h(ta, t8);
2227 __m128i s9 = __lsx_vilvh_h(ta, t8);
2228 __m128i sa = __lsx_vilvl_h(tb, t9);
2229 __m128i sb = __lsx_vilvh_h(tb, t9);
2230 __m128i sc = __lsx_vilvl_h(te, tc);
2231 __m128i sd = __lsx_vilvh_h(te, tc);
2232 __m128i se = __lsx_vilvl_h(tf, td);
2233 __m128i sf = __lsx_vilvh_h(tf, td);
// Round 3: word interleave of s[k] with s[k+4] inside each group of eight.
2235 __m128i u0 = __lsx_vilvl_w(s4, s0);
2236 __m128i u1 = __lsx_vilvh_w(s4, s0);
2237 __m128i u2 = __lsx_vilvl_w(s5, s1);
2238 __m128i u3 = __lsx_vilvh_w(s5, s1);
2239 __m128i u4 = __lsx_vilvl_w(s6, s2);
2240 __m128i u5 = __lsx_vilvh_w(s6, s2);
2241 __m128i u6 = __lsx_vilvl_w(s7, s3);
2242 __m128i u7 = __lsx_vilvh_w(s7, s3);
2243 __m128i u8 = __lsx_vilvl_w(sc, s8);
2244 __m128i u9 = __lsx_vilvh_w(sc, s8);
2245 __m128i ua = __lsx_vilvl_w(sd, s9);
2246 __m128i ub = __lsx_vilvh_w(sd, s9);
2247 __m128i uc = __lsx_vilvl_w(se, sa);
2248 __m128i ud = __lsx_vilvh_w(se, sa);
2249 __m128i ue = __lsx_vilvl_w(sf, sb);
2250 __m128i uf = __lsx_vilvh_w(sf, sb);
// Round 4: doubleword interleave of u[k] with u[k+8], written to packets 2k and 2k+1.
2252 kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2253 kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2254 kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2255 kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2256 kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2257 kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2258 kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2259 kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2260 kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2261 kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2262 kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2263 kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2264 kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2265 kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2266 kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2267 kernel.packet[15] = __lsx_vilvh_d(uf, u7);
// In-place ptranspose of an 8-packet Packet16c block. Three rounds of paired
// low/high interleaves (byte, halfword, word); the word round writes back into
// kernel.packet[0..7].
2269EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
// Round 1: byte interleave of each packet pair (2k, 2k+1).
2270 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2271 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2272 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2273 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2274 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2275 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2276 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2277 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
// Round 2: halfword interleave of the round-1 results.
2279 __m128i s0 = __lsx_vilvl_h(t2, t0);
2280 __m128i s1 = __lsx_vilvh_h(t2, t0);
2281 __m128i s2 = __lsx_vilvl_h(t3, t1);
2282 __m128i s3 = __lsx_vilvh_h(t3, t1);
2283 __m128i s4 = __lsx_vilvl_h(t6, t4);
2284 __m128i s5 = __lsx_vilvh_h(t6, t4);
2285 __m128i s6 = __lsx_vilvl_h(t7, t5);
2286 __m128i s7 = __lsx_vilvh_h(t7, t5);
// Round 3: word interleave of s[k] with s[k+4], stored to packets 2k and 2k+1.
2288 kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2289 kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2290 kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2291 kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2292 kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2293 kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2294 kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2295 kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2297EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2298 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2299 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2300 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2301 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2303 kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2304 kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2305 kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2306 kernel.packet[3] = __lsx_vilvh_h(t3, t1);
// In-place ptranspose of an 8-packet Packet8s block. Three rounds of paired
// low/high interleaves (halfword, word, doubleword); the doubleword round
// writes back into kernel.packet[0..7].
2308EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
// Round 1: halfword interleave of each packet pair (2k, 2k+1).
2309 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2310 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2311 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2312 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2313 __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2314 __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2315 __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2316 __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
// Round 2: word interleave of the round-1 results.
2318 __m128i s0 = __lsx_vilvl_w(t2, t0);
2319 __m128i s1 = __lsx_vilvh_w(t2, t0);
2320 __m128i s2 = __lsx_vilvl_w(t3, t1);
2321 __m128i s3 = __lsx_vilvh_w(t3, t1);
2322 __m128i s4 = __lsx_vilvl_w(t6, t4);
2323 __m128i s5 = __lsx_vilvh_w(t6, t4);
2324 __m128i s6 = __lsx_vilvl_w(t7, t5);
2325 __m128i s7 = __lsx_vilvh_w(t7, t5);
// Round 3: doubleword interleave of s[k] with s[k+4], stored to packets 2k and 2k+1.
2327 kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2328 kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2329 kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2330 kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2331 kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2332 kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2333 kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2334 kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2336EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2337 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2338 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2339 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2340 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2342 kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2343 kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2344 kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2345 kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2347EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
2348 __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2349 __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2350 __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2351 __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2353 kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2354 kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2355 kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2356 kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2358EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
2359 __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2360 kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2361 kernel.packet[1] = tmp;
// In-place ptranspose of a 16-packet Packet16uc block. The work is done in four
// rounds of paired low/high interleaves whose element width doubles each round
// (byte -> halfword -> word -> doubleword); the last round stores straight back
// into kernel.packet[0..15].
2363EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
// Round 1: byte interleave of each packet pair (2k, 2k+1).
2364 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2365 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2366 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2367 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2368 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2369 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2370 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2371 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2372 __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2373 __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2374 __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2375 __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2376 __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2377 __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2378 __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2379 __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
// Round 2: halfword interleave; within each group of four t-values the low
// results (t0,t2) and the high results (t1,t3) are paired separately.
2381 __m128i s0 = __lsx_vilvl_h(t2, t0);
2382 __m128i s1 = __lsx_vilvh_h(t2, t0);
2383 __m128i s2 = __lsx_vilvl_h(t3, t1);
2384 __m128i s3 = __lsx_vilvh_h(t3, t1);
2385 __m128i s4 = __lsx_vilvl_h(t6, t4);
2386 __m128i s5 = __lsx_vilvh_h(t6, t4);
2387 __m128i s6 = __lsx_vilvl_h(t7, t5);
2388 __m128i s7 = __lsx_vilvh_h(t7, t5);
2389 __m128i s8 = __lsx_vilvl_h(ta, t8);
2390 __m128i s9 = __lsx_vilvh_h(ta, t8);
2391 __m128i sa = __lsx_vilvl_h(tb, t9);
2392 __m128i sb = __lsx_vilvh_h(tb, t9);
2393 __m128i sc = __lsx_vilvl_h(te, tc);
2394 __m128i sd = __lsx_vilvh_h(te, tc);
2395 __m128i se = __lsx_vilvl_h(tf, td);
2396 __m128i sf = __lsx_vilvh_h(tf, td);
// Round 3: word interleave of s[k] with s[k+4] inside each group of eight.
2398 __m128i u0 = __lsx_vilvl_w(s4, s0);
2399 __m128i u1 = __lsx_vilvh_w(s4, s0);
2400 __m128i u2 = __lsx_vilvl_w(s5, s1);
2401 __m128i u3 = __lsx_vilvh_w(s5, s1);
2402 __m128i u4 = __lsx_vilvl_w(s6, s2);
2403 __m128i u5 = __lsx_vilvh_w(s6, s2);
2404 __m128i u6 = __lsx_vilvl_w(s7, s3);
2405 __m128i u7 = __lsx_vilvh_w(s7, s3);
2406 __m128i u8 = __lsx_vilvl_w(sc, s8);
2407 __m128i u9 = __lsx_vilvh_w(sc, s8);
2408 __m128i ua = __lsx_vilvl_w(sd, s9);
2409 __m128i ub = __lsx_vilvh_w(sd, s9);
2410 __m128i uc = __lsx_vilvl_w(se, sa);
2411 __m128i ud = __lsx_vilvh_w(se, sa);
2412 __m128i ue = __lsx_vilvl_w(sf, sb);
2413 __m128i uf = __lsx_vilvh_w(sf, sb);
// Round 4: doubleword interleave of u[k] with u[k+8], written to packets 2k and 2k+1.
2415 kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2416 kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2417 kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2418 kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2419 kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2420 kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2421 kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2422 kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2423 kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2424 kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2425 kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2426 kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2427 kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2428 kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2429 kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2430 kernel.packet[15] = __lsx_vilvh_d(uf, u7);
// In-place ptranspose of an 8-packet Packet16uc block. Three rounds of paired
// low/high interleaves (byte, halfword, word); the word round writes back into
// kernel.packet[0..7].
2432EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
// Round 1: byte interleave of each packet pair (2k, 2k+1).
2433 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2434 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2435 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2436 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2437 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2438 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2439 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2440 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
// Round 2: halfword interleave of the round-1 results.
2442 __m128i s0 = __lsx_vilvl_h(t2, t0);
2443 __m128i s1 = __lsx_vilvh_h(t2, t0);
2444 __m128i s2 = __lsx_vilvl_h(t3, t1);
2445 __m128i s3 = __lsx_vilvh_h(t3, t1);
2446 __m128i s4 = __lsx_vilvl_h(t6, t4);
2447 __m128i s5 = __lsx_vilvh_h(t6, t4);
2448 __m128i s6 = __lsx_vilvl_h(t7, t5);
2449 __m128i s7 = __lsx_vilvh_h(t7, t5);
// Round 3: word interleave of s[k] with s[k+4], stored to packets 2k and 2k+1.
2451 kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2452 kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2453 kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2454 kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2455 kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2456 kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2457 kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2458 kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2460EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2461 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2462 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2463 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2464 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2466 kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2467 kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2468 kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2469 kernel.packet[3] = __lsx_vilvh_h(t3, t1);
// In-place ptranspose of an 8-packet Packet8us block. Three rounds of paired
// low/high interleaves (halfword, word, doubleword); the doubleword round
// writes back into kernel.packet[0..7].
2471EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
// Round 1: halfword interleave of each packet pair (2k, 2k+1).
2472 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2473 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2474 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2475 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2476 __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2477 __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2478 __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2479 __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
// Round 2: word interleave of the round-1 results.
2481 __m128i s0 = __lsx_vilvl_w(t2, t0);
2482 __m128i s1 = __lsx_vilvh_w(t2, t0);
2483 __m128i s2 = __lsx_vilvl_w(t3, t1);
2484 __m128i s3 = __lsx_vilvh_w(t3, t1);
2485 __m128i s4 = __lsx_vilvl_w(t6, t4);
2486 __m128i s5 = __lsx_vilvh_w(t6, t4);
2487 __m128i s6 = __lsx_vilvl_w(t7, t5);
2488 __m128i s7 = __lsx_vilvh_w(t7, t5);
// Round 3: doubleword interleave of s[k] with s[k+4], stored to packets 2k and 2k+1.
2490 kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2491 kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2492 kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2493 kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2494 kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2495 kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2496 kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2497 kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2499EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2500 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2501 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2502 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2503 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2505 kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2506 kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2507 kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2508 kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2510EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
2511 __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2512 __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2513 __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2514 __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2516 kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2517 kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2518 kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2519 kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2521EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
2522 __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2523 kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2524 kernel.packet[1] = tmp;
2528EIGEN_STRONG_INLINE Packet4f prsqrt(
const Packet4f& a) {
2529 return __lsx_vfrsqrt_s(a);
2532EIGEN_STRONG_INLINE Packet2d prsqrt(
const Packet2d& a) {
2533 return __lsx_vfrsqrt_d(a);
2537EIGEN_STRONG_INLINE Packet4f pfloor(
const Packet4f& a) {
2538 return __lsx_vfrintrm_s(a);
2541EIGEN_STRONG_INLINE Packet2d pfloor(
const Packet2d& a) {
2542 return __lsx_vfrintrm_d(a);
2546EIGEN_STRONG_INLINE Packet4f pceil(
const Packet4f& a) {
2547 return __lsx_vfrintrp_s(a);
2550EIGEN_STRONG_INLINE Packet2d pceil(
const Packet2d& a) {
2551 return __lsx_vfrintrp_d(a);
2555EIGEN_STRONG_INLINE Packet4f pround(
const Packet4f& a) {
2556 const Packet4f mask = pset1frombits<Packet4f>(
static_cast<numext::uint32_t
>(0x80000000u));
2557 const Packet4f prev0dot5 = pset1frombits<Packet4f>(
static_cast<numext::uint32_t
>(0x3EFFFFFFu));
2558 return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a));
2561EIGEN_STRONG_INLINE Packet2d pround(
const Packet2d& a) {
2562 const Packet2d mask = pset1frombits<Packet2d>(
static_cast<numext::uint64_t
>(0x8000000000000000ull));
2563 const Packet2d prev0dot5 = pset1frombits<Packet2d>(
static_cast<numext::uint64_t
>(0x3FDFFFFFFFFFFFFFull));
2564 return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a));
2568EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(
const Packet4f& mask,
const Packet4f& a,
const Packet4f& b) {
2569 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2572EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(
const Packet16c& mask,
const Packet16c& a,
const Packet16c& b) {
2573 return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2577EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(
const int8_t* from) {
2578 int8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2579 *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2580 *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2581 return __lsx_vld(tmp, 0);
2584EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(
const uint8_t* from) {
2585 uint8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2586 *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2587 *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2588 return __lsx_vld(tmp, 0);
2591EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(
const int16_t* from) {
2592 int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2593 return __lsx_vld(tmp, 0);
2596EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(
const uint16_t* from) {
2597 uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2598 return __lsx_vld(tmp, 0);
2601EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(
const int32_t* from) {
2602 int32_t tmp[4] = {*from, *from, *from, *from};
2603 return __lsx_vld(tmp, 0);
2606EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(
const uint32_t* from) {
2607 uint32_t tmp[4] = {*from, *from, *from, *from};
2608 return __lsx_vld(tmp, 0);
2612EIGEN_STRONG_INLINE Packet16c pnmsub(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
2613 return __lsx_vmsub_b(pnegate(c), a, b);
2616EIGEN_STRONG_INLINE Packet8s pnmsub(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
2617 return __lsx_vmsub_h(pnegate(c), a, b);
2620EIGEN_STRONG_INLINE Packet4i pnmsub(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
2621 return __lsx_vmsub_w(pnegate(c), a, b);
2624EIGEN_STRONG_INLINE Packet2l pnmsub(
const Packet2l& a,
const Packet2l& b,
const Packet2l& c) {
2625 return __lsx_vmsub_d(pnegate(c), a, b);
2629EIGEN_STRONG_INLINE Packet16c pmsub(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
2630 return __lsx_vmadd_b(pnegate(c), a, b);
2633EIGEN_STRONG_INLINE Packet8s pmsub(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
2634 return __lsx_vmadd_h(pnegate(c), a, b);
2637EIGEN_STRONG_INLINE Packet4i pmsub(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
2638 return __lsx_vmadd_w(pnegate(c), a, b);
2641EIGEN_STRONG_INLINE Packet2l pmsub(
const Packet2l& a,
const Packet2l& b,
const Packet2l& c) {
2642 return __lsx_vmadd_d(pnegate(c), a, b);
2646EIGEN_STRONG_INLINE Packet16c pnmadd(
const Packet16c& a,
const Packet16c& b,
const Packet16c& c) {
2647 return __lsx_vmsub_b(c, a, b);
2650EIGEN_STRONG_INLINE Packet8s pnmadd(
const Packet8s& a,
const Packet8s& b,
const Packet8s& c) {
2651 return __lsx_vmsub_h(c, a, b);
2654EIGEN_STRONG_INLINE Packet4i pnmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
2655 return __lsx_vmsub_w(c, a, b);
2658EIGEN_STRONG_INLINE Packet2l pnmadd(
const Packet2l& a,
const Packet2l& b,
const Packet2l& c) {
2659 return __lsx_vmsub_d(c, a, b);
2663EIGEN_STRONG_INLINE Packet4f pexp(
const Packet4f& _x) {
2664 return pexp_float(_x);
2667EIGEN_STRONG_INLINE Packet2d pexp(
const Packet2d& _x) {
2668 return pexp_double(_x);
2672EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(
const Packet4f& a,
const Packet4f& exponent) {
2673 return pldexp_generic(a, exponent);
2677EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(
const Packet2d& a, Packet2d& exponent) {
2678 return pfrexp_generic(a, exponent);
2681EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(
const Packet4f& a, Packet4f& exponent) {
2682 return pfrexp_generic(a, exponent);
// Packet4f zero: builds a local packet whose four lanes are 0.0f. The argument
// is unnamed and unused; it only selects the overload.
// NOTE(review): the statement that returns `v` and the closing brace are not
// present in this listing - confirm against the original header.
2685EIGEN_STRONG_INLINE Packet4f pzero(
const Packet4f& ) {
2686 Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f};
// Packet4f pabsdiff: starts from the lane-wise difference psub(a, b).
// NOTE(review): the remaining statements (presumably an absolute value of `v`,
// judging by the function name) are not present in this listing - confirm.
2690EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
2691 Packet4f v = psub(a, b);
2695EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
2696 return pmin<Packet4f>(a, b);
2699EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
2700 return pmax<Packet4f>(a, b);
2703EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(
const float* from) {
2704 return (__m128)__lsx_vldrepl_w(from, 0);
2707EIGEN_STRONG_INLINE Packet4f psignbit(
const Packet4f& a) {
2708 return (__m128)__lsx_vsrai_w((__m128i)a, 31);
2711EIGEN_STRONG_INLINE Packet4f print<Packet4f>(
const Packet4f& a) {
2712 return __lsx_vfrintrne_s(a);
2715EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(
const Packet4f& a) {
2716 return __lsx_vfrintrz_s(a);
2719EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(
const Packet4f& a) {
2720 return __lsx_vfrecip_s(a);
// Packet2d zero: builds a local packet whose two lanes are 0.0. The argument
// is unnamed and unused; it only selects the overload.
// NOTE(review): the statement that returns `v` and the closing brace are not
// present in this listing - confirm against the original header.
2724EIGEN_STRONG_INLINE Packet2d pzero(
const Packet2d& ) {
2725 Packet2d v = {0.0, 0.0};
2729EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
2730 return pmin<Packet2d>(a, b);
2733EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
2734 return pmax<Packet2d>(a, b);
2737EIGEN_STRONG_INLINE Packet2d psignbit(
const Packet2d& a) {
2738 return (__m128d)(__lsx_vsrai_d((__m128i)a, 63));
2741EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(
const Packet2d& mask,
const Packet2d& a,
const Packet2d& b) {
2742 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2745EIGEN_STRONG_INLINE Packet2d print<Packet2d>(
const Packet2d& a) {
2746 return __lsx_vfrintrne_d(a);
2749EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(
const Packet2d& a) {
2750 return __lsx_vfrintrz_d(a);
2753EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(
const Packet2d& a,
const Packet2d& exponent) {
2754 return pldexp_generic(a, exponent);
// Packet16c pabsdiff: starts from the lane-wise difference psub(a, b).
// NOTE(review): the remaining statements (presumably an absolute value of `v`,
// judging by the function name) are not present in this listing - confirm.
2758EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(
const Packet16c& a,
const Packet16c& b) {
2759 Packet16c v = psub(a, b);
// Packet8s pabsdiff: starts from the lane-wise difference psub(a, b).
// NOTE(review): the remaining statements (presumably an absolute value of `v`,
// judging by the function name) are not present in this listing - confirm.
2764EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(
const Packet8s& a,
const Packet8s& b) {
2765 Packet8s v = psub(a, b);
2769EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(
const Packet8s& mask,
const Packet8s& a,
const Packet8s& b) {
2770 return __lsx_vbitsel_v(b, a, mask);
// Packet4i pabsdiff: starts from the lane-wise difference psub(a, b).
// NOTE(review): the remaining statements (presumably an absolute value of `v`,
// judging by the function name) are not present in this listing - confirm.
2774EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
2775 Packet4i v = psub(a, b);
2779EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(
const Packet4i& mask,
const Packet4i& a,
const Packet4i& b) {
2780 return __lsx_vbitsel_v(b, a, mask);
2784EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(
const Packet2l& mask,
const Packet2l& a,
const Packet2l& b) {
2785 return __lsx_vbitsel_v(b, a, mask);
2789EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
2790 return __lsx_vdiv_bu(a, b);
// Packet16uc pabsdiff: starts from the lane-wise difference psub(a, b).
// NOTE(review): the remaining statements are not present in this listing, so
// how `v` is turned into the returned value cannot be confirmed from here.
2793EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(
const Packet16uc& a,
const Packet16uc& b) {
2794 Packet16uc v = psub(a, b);
2798EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(
const Packet16uc& mask,
const Packet16uc& a,
2799 const Packet16uc& b) {
2800 return __lsx_vbitsel_v(b, a, mask);
// Packet16uc integer square root, built one result bit per iteration.
// `add` starts with bit 3 (0x08) set in every byte and is shifted right once
// per pass, so the four passes try bits 3, 2, 1 and 0 in turn. For each pass
// the candidate `temp = res | add` is squared, and `res` takes the candidate in
// every lane where the square is <= the input lane.
// NOTE(review): the loop's closing brace and the statement returning `res` are
// not present in this listing - confirm against the original header.
2803EIGEN_STRONG_INLINE Packet16uc psqrt(
const Packet16uc& a) {
2804 __m128i res = {0, 0};
2805 __m128i add = {0x0808080808080808, 0x0808080808080808};
2806 for (
int i = 0; i < 4; i++) {
2807 const __m128i temp = __lsx_vor_v(res, add);
// Per-byte temp*temp: odd and even lanes are multiplied with widening
// intrinsics and packed back together with __lsx_vpackev_b.
2808 const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp));
2809 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a));
2810 add = __lsx_vsrli_b(add, 1);
// Packet8us pabsdiff: starts from the lane-wise difference psub(a, b).
// NOTE(review): the remaining statements are not present in this listing, so
// how `v` is turned into the returned value cannot be confirmed from here.
2816EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(
const Packet8us& a,
const Packet8us& b) {
2817 Packet8us v = psub(a, b);
2821EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(
const Packet8us& mask,
const Packet8us& a,
const Packet8us& b) {
2822 return __lsx_vbitsel_v(b, a, mask);
2825EIGEN_STRONG_INLINE Packet8us psqrt(
const Packet8us& a) {
2826 __m128i res = {0, 0};
2827 __m128i add = {0x0080008000800080, 0x0080008000800080};
2828 for (
int i = 0; i < 4; i++) {
2829 const __m128i temp = __lsx_vor_v(res, add);
2830 const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp));
2831 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a));
2832 add = __lsx_vsrli_h(add, 1);
// Packet4ui pabsdiff: starts from the lane-wise difference psub(a, b).
// NOTE(review): the remaining statements are not present in this listing, so
// how `v` is turned into the returned value cannot be confirmed from here.
2838EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
2839 Packet4ui v = psub(a, b);
2843EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(
const Packet4ui& mask,
const Packet4ui& a,
const Packet4ui& b) {
2844 return __lsx_vbitsel_v(b, a, mask);
2847EIGEN_STRONG_INLINE Packet4ui psqrt(
const Packet4ui& a) {
2848 __m128i res = {0, 0};
2849 __m128i add = {0x0000800000008000, 0x0000800000008000};
2850 for (
int i = 0; i < 4; i++) {
2851 const __m128i temp = __lsx_vor_v(res, add);
2852 const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp));
2853 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a));
2854 add = __lsx_vsrli_w(add, 1);
2860EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(
const Packet2ul& mask,
const Packet2ul& a,
const Packet2ul& b) {
2861 return __lsx_vbitsel_v(b, a, mask);
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82