Eigen  5.0.1-dev+7c7d8473
 
Loading...
Searching...
No Matches
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
5// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
6//
7// This Source Code Form is subject to the terms of the Mozilla
8// Public License v. 2.0. If a copy of the MPL was not distributed
9// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
11#ifndef EIGEN_PACKET_MATH_LSX_H
12#define EIGEN_PACKET_MATH_LSX_H
13
14// IWYU pragma: private
15#include "../../InternalHeaderCheck.h"
16
17namespace Eigen {
18
19namespace internal {
20
21#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
22#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
23#endif
24
25#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
26#if EIGEN_ARCH_LOONGARCH64
27#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
28#endif
29#endif
30
31#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
32#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
33#endif
34
35typedef __m128 Packet4f;
36typedef __m128d Packet2d;
37
38typedef eigen_packet_wrapper<__m128i, 0> Packet16c;
39typedef eigen_packet_wrapper<__m128i, 1> Packet8s;
40typedef eigen_packet_wrapper<__m128i, 2> Packet4i;
41typedef eigen_packet_wrapper<__m128i, 3> Packet2l;
42typedef eigen_packet_wrapper<__m128i, 4> Packet16uc;
43typedef eigen_packet_wrapper<__m128i, 5> Packet8us;
44typedef eigen_packet_wrapper<__m128i, 6> Packet4ui;
45typedef eigen_packet_wrapper<__m128i, 7> Packet2ul;
46
47template <>
48struct is_arithmetic<__m128> {
49 enum { value = true };
50};
51template <>
52struct is_arithmetic<__m128i> {
53 enum { value = true };
54};
55template <>
56struct is_arithmetic<__m128d> {
57 enum { value = true };
58};
59template <>
60struct is_arithmetic<Packet16c> {
61 enum { value = true };
62};
63template <>
64struct is_arithmetic<Packet8s> {
65 enum { value = true };
66};
67template <>
68struct is_arithmetic<Packet4i> {
69 enum { value = true };
70};
71template <>
72struct is_arithmetic<Packet2l> {
73 enum { value = true };
74};
75template <>
76struct is_arithmetic<Packet16uc> {
77 enum { value = false };
78};
79template <>
80struct is_arithmetic<Packet8us> {
81 enum { value = false };
82};
83template <>
84struct is_arithmetic<Packet4ui> {
85 enum { value = false };
86};
87template <>
88struct is_arithmetic<Packet2ul> {
89 enum { value = false };
90};
91
92EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
93 float from[4] = {a, b, c, d};
94 return (Packet4f)__lsx_vld(from, 0);
95}
96
97EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
98 const float* a = reinterpret_cast<const float*>(&m);
99 Packet4f res =
100 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
101 return res;
102}
103
104template <bool interleave>
105EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
106 const float* a = reinterpret_cast<const float*>(&m);
107 const float* b = reinterpret_cast<const float*>(&n);
108 Packet4f res =
109 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
110 return res;
111}
112
113template <>
114EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
115 const float* a = reinterpret_cast<const float*>(&m);
116 const float* b = reinterpret_cast<const float*>(&n);
117 Packet4f res =
118 make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
119 return res;
120}
121
122EIGEN_STRONG_INLINE static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) {
123 return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
124}
125
126EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
127 return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s));
128}
129EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
130 return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(p, q, r, s));
131}
132EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
133 return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1));
134}
135EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
136 return shuffle2<false>(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3));
137}
138EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
139 return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1));
140}
141EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
142 return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3));
143}
144
145EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
146 double from[2] = {a, b};
147 return (Packet2d)__lsx_vld(from, 0);
148}
149
150EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
151 const double* a = reinterpret_cast<const double*>(&m);
152 const double* b = reinterpret_cast<const double*>(&n);
153 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
154 return res;
155}
156
157EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
158 return shuffle(a, b, mask);
159}
160EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
161EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
162
163template <>
164struct packet_traits<int8_t> : default_packet_traits {
165 typedef Packet16c type;
166 typedef Packet16c half;
167 enum {
168 Vectorizable = 1,
169 AlignedOnScalar = 1,
170 size = 16,
171
172 HasSetLinear = 0,
173 HasCmp = 1,
174 };
175};
176
177template <>
178struct packet_traits<int16_t> : default_packet_traits {
179 typedef Packet8s type;
180 typedef Packet8s half;
181 enum {
182 Vectorizable = 1,
183 AlignedOnScalar = 1,
184 size = 8,
185
186 HasSetLinear = 0,
187 HasCmp = 1,
188 HasDiv = 1,
189 };
190};
191
192template <>
193struct packet_traits<int32_t> : default_packet_traits {
194 typedef Packet4i type;
195 typedef Packet4i half;
196 enum {
197 Vectorizable = 1,
198 AlignedOnScalar = 1,
199 size = 4,
200
201 HasSetLinear = 0,
202 HasCmp = 1,
203 HasDiv = 1,
204 };
205};
206
207template <>
208struct packet_traits<int64_t> : default_packet_traits {
209 typedef Packet2l type;
210 typedef Packet2l half;
211 enum {
212 Vectorizable = 1,
213 AlignedOnScalar = 1,
214 size = 2,
215
216 HasSetLinear = 0,
217 HasCmp = 1,
218 HasDiv = 1,
219 };
220};
221
222template <>
223struct packet_traits<uint8_t> : default_packet_traits {
224 typedef Packet16uc type;
225 typedef Packet16uc half;
226 enum {
227 Vectorizable = 1,
228 AlignedOnScalar = 1,
229 size = 16,
230
231 HasSetLinear = 0,
232 HasNegate = 0,
233 HasCmp = 1,
234 };
235};
236
237template <>
238struct packet_traits<uint16_t> : default_packet_traits {
239 typedef Packet8us type;
240 typedef Packet8us half;
241 enum {
242 Vectorizable = 1,
243 AlignedOnScalar = 1,
244 size = 8,
245
246 HasSetLinear = 0,
247 HasNegate = 0,
248 HasCmp = 1,
249 HasDiv = 1,
250 };
251};
252
253template <>
254struct packet_traits<uint32_t> : default_packet_traits {
255 typedef Packet4ui type;
256 typedef Packet4ui half;
257 enum {
258 Vectorizable = 1,
259 AlignedOnScalar = 1,
260 size = 4,
261
262 HasSetLinear = 0,
263 HasNegate = 0,
264 HasCmp = 1,
265 HasDiv = 1,
266 };
267};
268
269template <>
270struct packet_traits<uint64_t> : default_packet_traits {
271 typedef Packet2ul type;
272 typedef Packet2ul half;
273 enum {
274 Vectorizable = 1,
275 AlignedOnScalar = 1,
276 size = 2,
277
278 HasSetLinear = 0,
279 HasNegate = 0,
280 HasCmp = 1,
281 HasDiv = 1,
282 };
283};
284
285template <>
286struct packet_traits<float> : default_packet_traits {
287 typedef Packet4f type;
288 typedef Packet4f half;
289 enum {
290 Vectorizable = 1,
291 AlignedOnScalar = 1,
292 size = 4,
293
294 HasSetLinear = 0,
295 HasSign = 0,
296 HasDiv = 1,
297 HasExp = 1,
298 HasSqrt = 1,
299 HasLog = 1,
300 HasRsqrt = 1
301 };
302};
303
304template <>
305struct packet_traits<double> : default_packet_traits {
306 typedef Packet2d type;
307 typedef Packet2d half;
308 enum {
309 Vectorizable = 1,
310 AlignedOnScalar = 1,
311 size = 2,
312
313 HasSetLinear = 0,
314 HasSign = 0,
315 HasDiv = 1,
316 HasSqrt = 1,
317 HasLog = 1,
318 HasRsqrt = 1
319 };
320};
321
322template <>
323struct unpacket_traits<Packet16c> {
324 typedef int8_t type;
325 typedef Packet16c half;
326 enum {
327 size = 16,
328 alignment = Aligned16,
329 vectorizable = true,
330 masked_load_available = false,
331 masked_store_available = false
332 };
333};
334template <>
335struct unpacket_traits<Packet8s> {
336 typedef int16_t type;
337 typedef Packet8s half;
338 enum {
339 size = 8,
340 alignment = Aligned16,
341 vectorizable = true,
342 masked_load_available = false,
343 masked_store_available = false
344 };
345};
346template <>
347struct unpacket_traits<Packet4i> {
348 typedef int32_t type;
349 typedef Packet4i half;
350 enum {
351 size = 4,
352 alignment = Aligned16,
353 vectorizable = true,
354 masked_load_available = false,
355 masked_store_available = false
356 };
357};
358template <>
359struct unpacket_traits<Packet2l> {
360 typedef int64_t type;
361 typedef Packet2l half;
362 enum {
363 size = 2,
364 alignment = Aligned16,
365 vectorizable = true,
366 masked_load_available = false,
367 masked_store_available = false
368 };
369};
370template <>
371struct unpacket_traits<Packet16uc> {
372 typedef uint8_t type;
373 typedef Packet16uc half;
374 enum {
375 size = 16,
376 alignment = Aligned16,
377 vectorizable = true,
378 masked_load_available = false,
379 masked_store_available = false
380 };
381};
382template <>
383struct unpacket_traits<Packet8us> {
384 typedef uint16_t type;
385 typedef Packet8us half;
386 enum {
387 size = 8,
388 alignment = Aligned16,
389 vectorizable = true,
390 masked_load_available = false,
391 masked_store_available = false
392 };
393};
394template <>
395struct unpacket_traits<Packet4ui> {
396 typedef uint32_t type;
397 typedef Packet4ui half;
398 enum {
399 size = 4,
400 alignment = Aligned16,
401 vectorizable = true,
402 masked_load_available = false,
403 masked_store_available = false
404 };
405};
406template <>
407struct unpacket_traits<Packet2ul> {
408 typedef uint64_t type;
409 typedef Packet2ul half;
410 enum {
411 size = 2,
412 alignment = Aligned16,
413 vectorizable = true,
414 masked_load_available = false,
415 masked_store_available = false
416 };
417};
418template <>
419struct unpacket_traits<Packet4f> {
420 typedef float type;
421 typedef Packet4f half;
422 typedef Packet4i integer_packet;
423 enum {
424 size = 4,
425 alignment = Aligned16,
426 vectorizable = true,
427 masked_load_available = false,
428 masked_store_available = false
429 };
430};
431template <>
432struct unpacket_traits<Packet2d> {
433 typedef double type;
434 typedef Packet2d half;
435 typedef Packet2l integer_packet;
436 enum {
437 size = 2,
438 alignment = Aligned16,
439 vectorizable = true,
440 masked_load_available = false,
441 masked_store_available = false
442 };
443};
444
445template <>
446EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
447 return __lsx_vreplgr2vr_b(from);
448}
449template <>
450EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
451 return __lsx_vreplgr2vr_h(from);
452}
453template <>
454EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
455 return __lsx_vreplgr2vr_w(from);
456}
457template <>
458EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
459 return __lsx_vreplgr2vr_d(from);
460}
461template <>
462EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
463 return __lsx_vreplgr2vr_b(from);
464}
465template <>
466EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
467 return __lsx_vreplgr2vr_h(from);
468}
469template <>
470EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
471 return __lsx_vreplgr2vr_w(from);
472}
473template <>
474EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
475 return __lsx_vreplgr2vr_d(from);
476}
477template <>
478EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
479 Packet4f v = {from, from, from, from};
480 return v;
481}
482template <>
483EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
484 Packet2d v = {from, from};
485 return v;
486}
487
488template <>
489EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
490 return reinterpret_cast<__m128>((__m128i)pset1<Packet4ui>(from));
491}
492template <>
493EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
494 return reinterpret_cast<__m128d>((__m128i)pset1<Packet2ul>(from));
495}
496
497template <>
498EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
499 const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
500 return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(countdown, 0));
501}
502template <>
503EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
504 const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
505 return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(countdown, 0));
506}
507template <>
508EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
509 const int32_t countdown[] = {0, 1, 2, 3};
510 return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(countdown, 0));
511}
512template <>
513EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
514 const int64_t countdown[] = {0, 1};
515 return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(countdown, 0));
516}
517template <>
518EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
519 const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
520 return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(countdown, 0));
521}
522template <>
523EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
524 const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
525 return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(countdown, 0));
526}
527template <>
528EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
529 const uint32_t countdown[] = {0, 1, 2, 3};
530 return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(countdown, 0));
531}
532template <>
533EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
534 const uint64_t countdown[] = {0, 1};
535 return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(countdown, 0));
536}
537template <>
538EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
539 static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
540 return __lsx_vfadd_s(pset1<Packet4f>(a), countdown);
541}
542template <>
543EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
544 static const Packet2d countdown = {0.0f, 1.0f};
545 return __lsx_vfadd_d(pset1<Packet2d>(a), countdown);
546}
547
548template <>
549EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
550 return __lsx_vadd_b(a, b);
551}
552template <>
553EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
554 return __lsx_vadd_h(a, b);
555}
556template <>
557EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
558 return __lsx_vadd_w(a, b);
559}
560template <>
561EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
562 return __lsx_vadd_d(a, b);
563}
564template <>
565EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
566 return __lsx_vadd_b(a, b);
567}
568template <>
569EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
570 return __lsx_vadd_h(a, b);
571}
572template <>
573EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
574 return __lsx_vadd_w(a, b);
575}
576template <>
577EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
578 return __lsx_vadd_d(a, b);
579}
580template <>
581EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
582 return __lsx_vfadd_s(a, b);
583}
584template <>
585EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
586 return __lsx_vfadd_d(a, b);
587}
588
589template <>
590EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
591 return __lsx_vsub_b(a, b);
592}
593template <>
594EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
595 return __lsx_vsub_h(a, b);
596}
597template <>
598EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
599 return __lsx_vsub_w(a, b);
600}
601template <>
602EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
603 return __lsx_vsub_d(a, b);
604}
605template <>
606EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
607 return __lsx_vsub_b(a, b);
608}
609template <>
610EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
611 return __lsx_vsub_h(a, b);
612}
613template <>
614EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
615 return __lsx_vsub_w(a, b);
616}
617template <>
618EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
619 return __lsx_vsub_d(a, b);
620}
621template <>
622EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
623 return __lsx_vfsub_s(a, b);
624}
625template <>
626EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
627 return __lsx_vfsub_d(a, b);
628}
629
630template <>
631EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
632template <>
633EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
634 const Packet4f mask =
635 make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
636 return padd(a, pxor(mask, b));
637}
638template <>
639EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b);
640template <>
641EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
642 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
643 return padd(a, pxor(mask, b));
644}
645
646template <>
647EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
648 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000),
649 numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000));
650 return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
651}
652template <>
653EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
654 Packet2d mask =
655 make_packet2d(numext::bit_cast<double>(0x8000000000000000), numext::bit_cast<double>(0x8000000000000000));
656 return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
657}
658template <>
659EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
660 return __lsx_vneg_b(a);
661}
662template <>
663EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
664 return __lsx_vneg_h(a);
665}
666template <>
667EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
668 return __lsx_vneg_w(a);
669}
670template <>
671EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
672 return __lsx_vneg_d(a);
673}
674
675template <>
676EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
677 return a;
678}
679template <>
680EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
681 return a;
682}
683template <>
684EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
685 return a;
686}
687template <>
688EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
689 return a;
690}
691template <>
692EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
693 return a;
694}
695template <>
696EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
697 return a;
698}
699template <>
700EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
701 return a;
702}
703template <>
704EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
705 return a;
706}
707template <>
708EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
709 return a;
710}
711template <>
712EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
713 return a;
714}
715
716template <>
717EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
718 return __lsx_vfmul_s(a, b);
719}
720template <>
721EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
722 return __lsx_vfmul_d(a, b);
723}
724template <>
725EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
726 return __lsx_vmul_b(a, b);
727}
728template <>
729EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
730 return __lsx_vmul_h(a, b);
731}
732template <>
733EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
734 return __lsx_vmul_w(a, b);
735}
736template <>
737EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
738 return __lsx_vmul_d(a, b);
739}
740template <>
741EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
742 return __lsx_vmul_b(a, b);
743}
744template <>
745EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
746 return __lsx_vmul_h(a, b);
747}
748template <>
749EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
750 return __lsx_vmul_w(a, b);
751}
752template <>
753EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
754 return __lsx_vmul_d(a, b);
755}
756
757template <>
758EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
759 return __lsx_vfdiv_s(a, b);
760}
761template <>
762EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
763 return __lsx_vfdiv_d(a, b);
764}
765template <>
766EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& a, const Packet8s& b) {
767 return __lsx_vdiv_h(a, b);
768}
769template <>
770EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
771 return __lsx_vdiv_w(a, b);
772}
773template <>
774EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& a, const Packet2l& b) {
775 return __lsx_vdiv_d(a, b);
776}
777template <>
778EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& a, const Packet8us& b) {
779 return __lsx_vdiv_hu(a, b);
780}
781template <>
782EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
783 return __lsx_vdiv_wu(a, b);
784}
785template <>
786EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
787 return __lsx_vdiv_du(a, b);
788}
789
790template <>
791EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
792 return __lsx_vfmadd_s(a, b, c);
793}
794template <>
795EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
796 return __lsx_vfmadd_d(a, b, c);
797}
798template <>
799EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
800 return __lsx_vfmsub_s(a, b, c);
801}
802template <>
803EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
804 return __lsx_vfmsub_d(a, b, c);
805}
806template <>
807EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
808 return __lsx_vfnmsub_s(a, b, c);
809}
810template <>
811EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
812 return __lsx_vfnmsub_d(a, b, c);
813}
814template <>
815EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
816 return __lsx_vfnmadd_s(a, b, c);
817}
818template <>
819EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
820 return __lsx_vfnmadd_d(a, b, c);
821}
822template <>
823EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
824 return __lsx_vmadd_b(c, a, b);
825}
826template <>
827EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
828 return __lsx_vmadd_h(c, a, b);
829}
830template <>
831EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
832 return __lsx_vmadd_w(c, a, b);
833}
834template <>
835EIGEN_STRONG_INLINE Packet2l pmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
836 return __lsx_vmadd_d(c, a, b);
837}
838template <>
839EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
840 return __lsx_vmadd_b(c, a, b);
841}
842template <>
843EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
844 return __lsx_vmadd_h(c, a, b);
845}
846template <>
847EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
848 return __lsx_vmadd_w(c, a, b);
849}
850template <>
851EIGEN_STRONG_INLINE Packet2ul pmadd(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c) {
852 return __lsx_vmadd_d(c, a, b);
853}
854
855template <>
856EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
857 return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b);
858}
859template <>
860EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
861 return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b);
862}
863template <>
864EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
865 return __lsx_vand_v(a, b);
866}
867template <>
868EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
869 return __lsx_vand_v(a, b);
870}
871template <>
872EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
873 return __lsx_vand_v(a, b);
874}
875template <>
876EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
877 return __lsx_vand_v(a, b);
878}
879template <>
880EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
881 return __lsx_vand_v(a, b);
882}
883template <>
884EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
885 return __lsx_vand_v(a, b);
886}
887template <>
888EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
889 return __lsx_vand_v(a, b);
890}
891template <>
892EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
893 return __lsx_vand_v(a, b);
894}
895
896template <>
897EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
898 return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b);
899}
900template <>
901EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
902 return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b);
903}
904template <>
905EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
906 return __lsx_vor_v(a, b);
907}
908template <>
909EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
910 return __lsx_vor_v(a, b);
911}
912template <>
913EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
914 return __lsx_vor_v(a, b);
915}
916template <>
917EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
918 return __lsx_vor_v(a, b);
919}
920template <>
921EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
922 return __lsx_vor_v(a, b);
923}
924template <>
925EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
926 return __lsx_vor_v(a, b);
927}
928template <>
929EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
930 return __lsx_vor_v(a, b);
931}
932template <>
933EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
934 return __lsx_vor_v(a, b);
935}
936
937template <>
938EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
939 return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b);
940}
941template <>
942EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
943 return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b);
944}
945template <>
946EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
947 return __lsx_vxor_v(a, b);
948}
949template <>
950EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
951 return __lsx_vxor_v(a, b);
952}
953template <>
954EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
955 return __lsx_vxor_v(a, b);
956}
957template <>
958EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
959 return __lsx_vxor_v(a, b);
960}
961template <>
962EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
963 return __lsx_vxor_v(a, b);
964}
965template <>
966EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
967 return __lsx_vxor_v(a, b);
968}
969template <>
970EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
971 return __lsx_vxor_v(a, b);
972}
973template <>
974EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
975 return __lsx_vxor_v(a, b);
976}
977
978template <>
979EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
980 return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a);
981}
982template <>
983EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
984 return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a);
985}
986template <>
987EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
988 return __lsx_vandn_v(b, a);
989}
990template <>
991EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
992 return __lsx_vandn_v(b, a);
993}
994template <>
995EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
996 return __lsx_vandn_v(b, a);
997}
998template <>
999EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
1000 return __lsx_vandn_v(b, a);
1001}
1002template <>
1003EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1004 return __lsx_vandn_v(b, a);
1005}
1006template <>
1007EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
1008 return __lsx_vandn_v(b, a);
1009}
1010template <>
1011EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1012 return __lsx_vandn_v(b, a);
1013}
1014template <>
1015EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1016 return __lsx_vandn_v(b, a);
1017}
1018
1019template <>
1020EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
1021 return (Packet4f)__lsx_vfcmp_cle_s(a, b);
1022}
1023template <>
1024EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(const Packet2d& a, const Packet2d& b) {
1025 return (Packet2d)__lsx_vfcmp_cle_d(a, b);
1026}
1027template <>
1028EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
1029 return __lsx_vsle_b(a, b);
1030}
1031template <>
1032EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
1033 return __lsx_vsle_h(a, b);
1034}
1035template <>
1036EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
1037 return __lsx_vsle_w(a, b);
1038}
1039template <>
1040EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
1041 return __lsx_vsle_d(a, b);
1042}
1043template <>
1044EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1045 return __lsx_vsle_bu(a, b);
1046}
1047template <>
1048EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
1049 return __lsx_vsle_hu(a, b);
1050}
1051template <>
1052EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1053 return __lsx_vsle_wu(a, b);
1054}
1055template <>
1056EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1057 return __lsx_vsle_du(a, b);
1058}
1059
1060template <>
1061EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
1062 return (Packet4f)__lsx_vfcmp_clt_s(a, b);
1063}
1064template <>
1065EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(const Packet2d& a, const Packet2d& b) {
1066 return (Packet2d)__lsx_vfcmp_clt_d(a, b);
1067}
1068template <>
1069EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
1070 return __lsx_vslt_b(a, b);
1071}
1072template <>
1073EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
1074 return __lsx_vslt_h(a, b);
1075}
1076template <>
1077EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
1078 return __lsx_vslt_w(a, b);
1079}
1080template <>
1081EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
1082 return __lsx_vslt_d(a, b);
1083}
1084template <>
1085EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1086 return __lsx_vslt_bu(a, b);
1087}
1088template <>
1089EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
1090 return __lsx_vslt_hu(a, b);
1091}
1092template <>
1093EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1094 return __lsx_vslt_wu(a, b);
1095}
1096template <>
1097EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1098 return __lsx_vslt_du(a, b);
1099}
1100
1101template <>
1102EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
1103 return (Packet4f)__lsx_vfcmp_sult_s(a, b);
1104}
1105template <>
1106EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) {
1107 return (Packet2d)__lsx_vfcmp_sult_d(a, b);
1108}
1109
1110template <>
1111EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
1112 return (Packet4f)__lsx_vfcmp_seq_s(a, b);
1113}
1114template <>
1115EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(const Packet2d& a, const Packet2d& b) {
1116 return (Packet2d)__lsx_vfcmp_seq_d(a, b);
1117}
1118template <>
1119EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
1120 return __lsx_vseq_b(a, b);
1121}
1122template <>
1123EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
1124 return __lsx_vseq_h(a, b);
1125}
1126template <>
1127EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
1128 return __lsx_vseq_w(a, b);
1129}
1130template <>
1131EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
1132 return __lsx_vseq_d(a, b);
1133}
1134template <>
1135EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1136 return __lsx_vseq_b(a, b);
1137}
1138template <>
1139EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
1140 return __lsx_vseq_h(a, b);
1141}
1142template <>
1143EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1144 return __lsx_vseq_w(a, b);
1145}
1146template <>
1147EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1148 return __lsx_vseq_d(a, b);
1149}
1150
1151template <>
1152EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1153 return __lsx_vmin_b(a, b);
1154}
1155template <>
1156EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1157 return __lsx_vmin_h(a, b);
1158}
1159template <>
1160EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1161 return __lsx_vmin_w(a, b);
1162}
1163template <>
1164EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1165 return __lsx_vmin_d(a, b);
1166}
1167template <>
1168EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1169 return __lsx_vmin_bu(a, b);
1170}
1171template <>
1172EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1173 return __lsx_vmin_hu(a, b);
1174}
1175template <>
1176EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1177 return __lsx_vmin_wu(a, b);
1178}
1179template <>
1180EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1181 return __lsx_vmin_du(a, b);
1182}
1183
1184template <>
1185EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1186 return __lsx_vmax_b(a, b);
1187}
1188template <>
1189EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1190 return __lsx_vmax_h(a, b);
1191}
1192template <>
1193EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1194 return __lsx_vmax_w(a, b);
1195}
1196template <>
1197EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1198 return __lsx_vmax_d(a, b);
1199}
1200template <>
1201EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1202 return __lsx_vmax_bu(a, b);
1203}
1204template <>
1205EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1206 return __lsx_vmax_hu(a, b);
1207}
1208template <>
1209EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1210 return __lsx_vmax_wu(a, b);
1211}
1212template <>
1213EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1214 return __lsx_vmax_du(a, b);
1215}
1216
1217template <>
1218EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1219 Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1220 Packet4i aMinOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), aNaN);
1221 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1222}
1223template <>
1224EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
1225 Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1226 Packet2l aMinOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), aNaN);
1227 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1228}
1229template <>
1230EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1231 Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1232 Packet4i aMaxOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), aNaN);
1233 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1234}
1235template <>
1236EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
1237 Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1238 Packet2l aMaxOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), aNaN);
1239 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1240}
1241
1242template <int N>
1243EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(const Packet16c& a) {
1244 return __lsx_vsrai_b((__m128i)a, N);
1245}
1246template <int N>
1247EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(const Packet8s& a) {
1248 return __lsx_vsrai_h((__m128i)a, N);
1249}
1250template <int N>
1251EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1252 return __lsx_vsrai_w((__m128i)a, N);
1253}
1254template <int N>
1255EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
1256 return __lsx_vsrai_d((__m128i)a, N);
1257}
1258template <int N>
1259EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(const Packet16uc& a) {
1260 return __lsx_vsrli_b((__m128i)a, N);
1261}
1262template <int N>
1263EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(const Packet8us& a) {
1264 return __lsx_vsrli_h((__m128i)a, N);
1265}
1266template <int N>
1267EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
1268 return __lsx_vsrli_w((__m128i)a, N);
1269}
1270template <int N>
1271EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(const Packet2ul& a) {
1272 return __lsx_vsrli_d((__m128i)a, N);
1273}
1274
1275template <int N>
1276EIGEN_STRONG_INLINE Packet16c plogical_shift_right(const Packet16c& a) {
1277 return __lsx_vsrli_b((__m128i)a, N);
1278}
1279template <int N>
1280EIGEN_STRONG_INLINE Packet8s plogical_shift_right(const Packet8s& a) {
1281 return __lsx_vsrli_h((__m128i)a, N);
1282}
1283template <int N>
1284EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1285 return __lsx_vsrli_w((__m128i)a, N);
1286}
1287template <int N>
1288EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
1289 return __lsx_vsrli_d((__m128i)a, N);
1290}
1291template <int N>
1292EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(const Packet16uc& a) {
1293 return __lsx_vsrli_b((__m128i)a, N);
1294}
1295template <int N>
1296EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
1297 return __lsx_vsrli_h((__m128i)a, N);
1298}
1299template <int N>
1300EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1301 return __lsx_vsrli_w((__m128i)a, N);
1302}
1303template <int N>
1304EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(const Packet2ul& a) {
1305 return __lsx_vsrli_d((__m128i)a, N);
1306}
1307
1308template <int N>
1309EIGEN_STRONG_INLINE Packet16c plogical_shift_left(const Packet16c& a) {
1310 return __lsx_vslli_b((__m128i)a, N);
1311}
1312template <int N>
1313EIGEN_STRONG_INLINE Packet8s plogical_shift_left(const Packet8s& a) {
1314 return __lsx_vslli_h((__m128i)a, N);
1315}
1316template <int N>
1317EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1318 return __lsx_vslli_w((__m128i)a, N);
1319}
1320template <int N>
1321EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
1322 return __lsx_vslli_d((__m128i)a, N);
1323}
1324template <int N>
1325EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(const Packet16uc& a) {
1326 return __lsx_vslli_b((__m128i)a, N);
1327}
1328template <int N>
1329EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
1330 return __lsx_vslli_h((__m128i)a, N);
1331}
1332template <int N>
1333EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1334 return __lsx_vslli_w((__m128i)a, N);
1335}
1336template <int N>
1337EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(const Packet2ul& a) {
1338 return __lsx_vslli_d((__m128i)a, N);
1339}
1340
1341template <>
1342EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1343 return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31);
1344}
1345template <>
1346EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1347 return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63);
1348}
1349template <>
1350EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
1351 return __lsx_vabsd_b(a, pzero(a));
1352}
1353template <>
1354EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
1355 return __lsx_vabsd_h(a, pzero(a));
1356}
1357template <>
1358EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1359 return __lsx_vabsd_w(a, pzero(a));
1360}
1361template <>
1362EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
1363 return __lsx_vabsd_d(a, pzero(a));
1364}
1365template <>
1366EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
1367 return a;
1368}
1369template <>
1370EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
1371 return a;
1372}
1373template <>
1374EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
1375 return a;
1376}
1377template <>
1378EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
1379 return a;
1380}
1381
1382template <>
1383EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
1384 EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
1385}
1386template <>
1387EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
1388 EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
1389}
1390template <>
1391EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
1392 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1393}
1394template <>
1395EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
1396 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1397}
1398template <>
1399EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
1400 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1401}
1402template <>
1403EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
1404 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1405}
1406template <>
1407EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
1408 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1409}
1410template <>
1411EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
1412 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1413}
1414template <>
1415EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
1416 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1417}
1418template <>
1419EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
1420 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1421}
1422
1423template <>
1424EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1425 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
1426}
1427template <>
1428EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
1429 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
1430}
1431template <>
1432EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
1433 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1434}
1435template <>
1436EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
1437 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1438}
1439template <>
1440EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
1441 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1442}
1443template <>
1444EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
1445 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1446}
1447template <>
1448EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
1449 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1450}
1451template <>
1452EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
1453 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1454}
1455template <>
1456EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
1457 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1458}
1459template <>
1460EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
1461 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1462}
1463
1464template <>
1465EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1466 float f0 = from[0], f1 = from[1];
1467 return make_packet4f(f0, f0, f1, f1);
1468}
1469template <>
1470EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
1471 return pset1<Packet2d>(from[0]);
1472}
1473template <>
1474EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
1475 Packet16c tmp = pload<Packet16c>(from);
1476 return __lsx_vilvl_b(tmp, tmp);
1477}
1478template <>
1479EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
1480 Packet8s tmp = pload<Packet8s>(from);
1481 return __lsx_vilvl_h(tmp, tmp);
1482}
1483template <>
1484EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
1485 Packet4i tmp = pload<Packet4i>(from);
1486 return __lsx_vilvl_w(tmp, tmp);
1487}
1488template <>
1489EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
1490 return pset1<Packet2l>(from[0]);
1491}
1492template <>
1493EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
1494 Packet16uc tmp = pload<Packet16uc>(from);
1495 return __lsx_vilvl_b(tmp, tmp);
1496}
1497template <>
1498EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
1499 Packet8us tmp = pload<Packet8us>(from);
1500 return __lsx_vilvl_h(tmp, tmp);
1501}
1502template <>
1503EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
1504 Packet4ui tmp = pload<Packet4ui>(from);
1505 return __lsx_vilvl_w(tmp, tmp);
1506}
1507template <>
1508EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
1509 return pset1<Packet2ul>(from[0]);
1510}
1511
1512template <>
1513EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
1514 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1515}
1516template <>
1517EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1518 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1519}
1520template <>
1521EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
1522 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1523}
1524template <>
1525EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
1526 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1527}
1528template <>
1529EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
1530 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1531}
1532template <>
1533EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
1534 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1535}
1536template <>
1537EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
1538 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1539}
1540template <>
1541EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
1542 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1543}
1544template <>
1545EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
1546 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1547}
1548template <>
1549EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
1550 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1551}
1552
1553template <>
1554EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1555 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1556}
1557template <>
1558EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1559 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1560}
1561
1562template <>
1563EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
1564 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1565}
1566template <>
1567EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
1568 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1569}
1570template <>
1571EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
1572 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1573}
1574template <>
1575EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
1576 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1577}
1578template <>
1579EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
1580 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1581}
1582template <>
1583EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
1584 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1585}
1586template <>
1587EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
1588 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1589}
1590template <>
1591EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
1592 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1593}
1594
1595template <>
1596EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1597 Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]};
1598 return v;
1599}
1600template <>
1601EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1602 Packet2d v = {from[0], from[stride]};
1603 return v;
1604}
1605template <>
1606EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
1607 int8_t v[16] __attribute__((aligned(16)));
1608 v[0] = from[0];
1609 v[1] = from[stride];
1610 v[2] = from[2 * stride];
1611 v[3] = from[3 * stride];
1612 v[4] = from[4 * stride];
1613 v[5] = from[5 * stride];
1614 v[6] = from[6 * stride];
1615 v[7] = from[7 * stride];
1616 v[8] = from[8 * stride];
1617 v[9] = from[9 * stride];
1618 v[10] = from[10 * stride];
1619 v[11] = from[11 * stride];
1620 v[12] = from[12 * stride];
1621 v[13] = from[13 * stride];
1622 v[14] = from[14 * stride];
1623 v[15] = from[15 * stride];
1624 return __lsx_vld(v, 0);
1625}
1626template <>
1627EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
1628 int16_t v[8] __attribute__((aligned(16)));
1629 v[0] = from[0];
1630 v[1] = from[stride];
1631 v[2] = from[2 * stride];
1632 v[3] = from[3 * stride];
1633 v[4] = from[4 * stride];
1634 v[5] = from[5 * stride];
1635 v[6] = from[6 * stride];
1636 v[7] = from[7 * stride];
1637 return __lsx_vld(v, 0);
1638}
1639template <>
1640EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
1641 int32_t v[4] __attribute__((aligned(16)));
1642 v[0] = from[0];
1643 v[1] = from[stride];
1644 v[2] = from[2 * stride];
1645 v[3] = from[3 * stride];
1646 return __lsx_vld(v, 0);
1647}
1648template <>
1649EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
1650 int64_t v[2] __attribute__((aligned(16)));
1651 v[0] = from[0];
1652 v[1] = from[stride];
1653 return __lsx_vld(v, 0);
1654}
1655template <>
1656EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
1657 uint8_t v[16] __attribute__((aligned(16)));
1658 v[0] = from[0];
1659 v[1] = from[stride];
1660 v[2] = from[2 * stride];
1661 v[3] = from[3 * stride];
1662 v[4] = from[4 * stride];
1663 v[5] = from[5 * stride];
1664 v[6] = from[6 * stride];
1665 v[7] = from[7 * stride];
1666 v[8] = from[8 * stride];
1667 v[9] = from[9 * stride];
1668 v[10] = from[10 * stride];
1669 v[11] = from[11 * stride];
1670 v[12] = from[12 * stride];
1671 v[13] = from[13 * stride];
1672 v[14] = from[14 * stride];
1673 v[15] = from[15 * stride];
1674 return __lsx_vld(v, 0);
1675}
1676template <>
1677EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
1678 uint16_t v[8] __attribute__((aligned(16)));
1679 v[0] = from[0];
1680 v[1] = from[stride];
1681 v[2] = from[2 * stride];
1682 v[3] = from[3 * stride];
1683 v[4] = from[4 * stride];
1684 v[5] = from[5 * stride];
1685 v[6] = from[6 * stride];
1686 v[7] = from[7 * stride];
1687 return __lsx_vld(v, 0);
1688}
1689template <>
1690EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
1691 uint32_t v[4] __attribute__((aligned(16)));
1692 v[0] = from[0];
1693 v[1] = from[stride];
1694 v[2] = from[2 * stride];
1695 v[3] = from[3 * stride];
1696 return __lsx_vld(v, 0);
1697}
1698template <>
1699EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
1700 uint64_t v[2] __attribute__((aligned(16)));
1701 v[0] = from[0];
1702 v[1] = from[stride];
1703 return __lsx_vld(v, 0);
1704}
1705
1706template <>
1707EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1708 __lsx_vstelm_w(from, to, 0, 0);
1709 __lsx_vstelm_w(from, to + stride * 1, 0, 1);
1710 __lsx_vstelm_w(from, to + stride * 2, 0, 2);
1711 __lsx_vstelm_w(from, to + stride * 3, 0, 3);
1712}
1713template <>
1714EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
1715 __lsx_vstelm_d(from, to, 0, 0);
1716 __lsx_vstelm_d(from, to + stride, 0, 1);
1717}
1718template <>
1719EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
1720 Index stride) {
1721 __lsx_vstelm_b((__m128i)from, to, 0, 0);
1722 __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1723 __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1724 __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1725 __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1726 __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1727 __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1728 __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1729 __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1730 __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1731 __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1732 __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1733 __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1734 __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1735 __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1736 __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1737}
1738template <>
1739EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
1740 Index stride) {
1741 __lsx_vstelm_h((__m128i)from, to, 0, 0);
1742 __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1743 __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1744 __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1745 __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1746 __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1747 __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1748 __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1749}
1750template <>
1751EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
1752 Index stride) {
1753 __lsx_vstelm_w((__m128i)from, to, 0, 0);
1754 __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1755 __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1756 __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1757}
1758template <>
1759EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
1760 Index stride) {
1761 __lsx_vstelm_d((__m128i)from, to, 0, 0);
1762 __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1763}
1764template <>
1765EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
1766 Index stride) {
1767 __lsx_vstelm_b((__m128i)from, to, 0, 0);
1768 __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1769 __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1770 __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1771 __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1772 __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1773 __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1774 __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1775 __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1776 __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1777 __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1778 __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1779 __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1780 __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1781 __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1782 __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1783}
1784template <>
1785EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
1786 Index stride) {
1787 __lsx_vstelm_h((__m128i)from, to, 0, 0);
1788 __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1789 __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1790 __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1791 __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1792 __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1793 __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1794 __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1795}
1796template <>
1797EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
1798 Index stride) {
1799 __lsx_vstelm_w((__m128i)from, to, 0, 0);
1800 __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1801 __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1802 __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1803}
1804template <>
1805EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
1806 Index stride) {
1807 __lsx_vstelm_d((__m128i)from, to, 0, 0);
1808 __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1809}
1810
1811template <>
1812EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1813 __builtin_prefetch(addr);
1814}
1815template <>
1816EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1817 __builtin_prefetch(addr);
1818}
1819template <>
1820EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
1821 __builtin_prefetch(addr);
1822}
1823template <>
1824EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
1825 __builtin_prefetch(addr);
1826}
1827template <>
1828EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
1829 __builtin_prefetch(addr);
1830}
1831template <>
1832EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
1833 __builtin_prefetch(addr);
1834}
1835template <>
1836EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
1837 __builtin_prefetch(addr);
1838}
1839template <>
1840EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
1841 __builtin_prefetch(addr);
1842}
1843template <>
1844EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
1845 __builtin_prefetch(addr);
1846}
1847template <>
1848EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
1849 __builtin_prefetch(addr);
1850}
1851
1852template <>
1853EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1854 float v;
1855 __lsx_vstelm_w(a, &v, 0, 0);
1856 return v;
1857}
1858template <>
1859EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1860 double v;
1861 __lsx_vstelm_d(a, &v, 0, 0);
1862 return v;
1863}
1864
1865template <>
1866EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
1867 return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0);
1868}
1869template <>
1870EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
1871 return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0);
1872}
1873template <>
1874EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
1875 return __lsx_vpickve2gr_w((__m128i)a, 0);
1876}
1877template <>
1878EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1879 return __lsx_vpickve2gr_d((__m128i)a, 0);
1880}
1881template <>
1882EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
1883 return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0);
1884}
1885template <>
1886EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
1887 return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0);
1888}
1889template <>
1890EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1891 return __lsx_vpickve2gr_wu((__m128i)a, 0);
1892}
1893template <>
1894EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
1895 return __lsx_vpickve2gr_du((__m128i)a, 0);
1896}
1897
1898template <>
1899EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1900 return (Packet4f)__lsx_vshuf4i_w(a, 0x1B);
1901}
1902template <>
1903EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1904 return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1);
1905}
1906template <>
1907EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
1908 return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1909}
1910template <>
1911EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
1912 return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1913}
1914template <>
1915EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1916 return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1917}
1918template <>
1919EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
1920 return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1921}
1922template <>
1923EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
1924 return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1925}
1926template <>
1927EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
1928 return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1929}
1930template <>
1931EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
1932 return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1933}
1934template <>
1935EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
1936 return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1937}
1938
1939template <>
1940EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
1941 Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
1942 return pfirst<Packet4f>(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
1943}
1944template <>
1945EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
1946 return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a)));
1947}
1948template <>
1949EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
1950 Packet8s tmp1 = __lsx_vhaddw_h_b(a, a);
1951 Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1);
1952 Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2);
1953 return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0);
1954}
1955template <>
1956EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
1957 Packet4i tmp1 = __lsx_vhaddw_w_h(a, a);
1958 Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1);
1959 return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0);
1960}
1961template <>
1962EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
1963 Packet2l tmp = __lsx_vhaddw_d_w(a, a);
1964 return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0);
1965}
1966template <>
1967EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
1968 return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0);
1969}
1970template <>
1971EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
1972 Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a);
1973 Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1);
1974 Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2);
1975 return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0);
1976}
1977template <>
1978EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
1979 Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a);
1980 Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1);
1981 return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0);
1982}
1983template <>
1984EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
1985 Packet2ul tmp = __lsx_vhaddw_du_wu(a, a);
1986 return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
1987}
1988template <>
1989EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
1990 return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0);
1991}
1992
1993template <>
1994EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
1995 Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
1996 return pfirst<Packet4f>(__lsx_vfmul_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
1997}
1998template <>
1999EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
2000 return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a)));
2001}
2002template <>
2003EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
2004 Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a));
2005 Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2006 Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2007 return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2008}
2009template <>
2010EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
2011 Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a));
2012 Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2013 return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2014}
2015template <>
2016EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
2017 Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a));
2018 return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2019}
2020template <>
2021EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
2022 return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0);
2023}
2024template <>
2025EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
2026 Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a));
2027 Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2028 Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2029 return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2030}
2031template <>
2032EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
2033 Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a));
2034 Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2035 return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2036}
2037template <>
2038EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
2039 Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a));
2040 return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2041}
2042template <>
2043EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
2044 return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0);
2045}
2046
2047template <>
2048EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
2049 Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2050 return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2051}
2052template <>
2053EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
2054 return pfirst(__lsx_vfmin_d(a, preverse(a)));
2055}
2056template <>
2057EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
2058 Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2059 Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2060 Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2061 return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2062}
2063template <>
2064EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
2065 Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2066 Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2067 return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2068}
2069template <>
2070EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
2071 Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2072 return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2073}
2074template <>
2075EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
2076 return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a)));
2077}
2078template <>
2079EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
2080 Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2081 Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2082 Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2083 return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2084}
2085template <>
2086EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
2087 Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2088 Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2089 return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2090}
2091template <>
2092EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
2093 Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2094 return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2095}
2096template <>
2097EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
2098 return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a)));
2099}
2100
2101template <>
2102EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
2103 Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2104 return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2105}
2106template <>
2107EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
2108 return pfirst(__lsx_vfmax_d(a, preverse(a)));
2109}
2110template <>
2111EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
2112 Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2113 Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2114 Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2115 return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2116}
2117template <>
2118EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
2119 Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2120 Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2121 return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2122}
2123template <>
2124EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
2125 Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2126 return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2127}
2128template <>
2129EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
2130 return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a)));
2131}
2132template <>
2133EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
2134 Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2135 Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2136 Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2137 return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2138}
2139template <>
2140EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
2141 Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2142 Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2143 return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2144}
2145template <>
2146EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
2147 Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2148 return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2149}
2150template <>
2151EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
2152 return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a)));
2153}
2154
2155template <>
2156EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
2157 return __lsx_vfsqrt_s(a);
2158}
2159template <>
2160EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
2161 return __lsx_vfsqrt_d(a);
2162}
2163
2164EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
2165 Packet4f T0 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2166 Packet4f T1 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2167 Packet4f T2 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2168 Packet4f T3 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2169
2170 kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)T2, (__m128i)T0);
2171 kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)T2, (__m128i)T0);
2172 kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)T3, (__m128i)T1);
2173 kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)T3, (__m128i)T1);
2174}
2175EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
2176 Packet2d tmp = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2177 kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2178 kernel.packet[1] = tmp;
2179}
2180EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2181 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2182 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2183 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2184 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2185 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2186 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2187 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2188 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2189 __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2190 __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2191 __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2192 __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2193 __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2194 __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2195 __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2196 __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
2197
2198 __m128i s0 = __lsx_vilvl_h(t2, t0);
2199 __m128i s1 = __lsx_vilvh_h(t2, t0);
2200 __m128i s2 = __lsx_vilvl_h(t3, t1);
2201 __m128i s3 = __lsx_vilvh_h(t3, t1);
2202 __m128i s4 = __lsx_vilvl_h(t6, t4);
2203 __m128i s5 = __lsx_vilvh_h(t6, t4);
2204 __m128i s6 = __lsx_vilvl_h(t7, t5);
2205 __m128i s7 = __lsx_vilvh_h(t7, t5);
2206 __m128i s8 = __lsx_vilvl_h(ta, t8);
2207 __m128i s9 = __lsx_vilvh_h(ta, t8);
2208 __m128i sa = __lsx_vilvl_h(tb, t9);
2209 __m128i sb = __lsx_vilvh_h(tb, t9);
2210 __m128i sc = __lsx_vilvl_h(te, tc);
2211 __m128i sd = __lsx_vilvh_h(te, tc);
2212 __m128i se = __lsx_vilvl_h(tf, td);
2213 __m128i sf = __lsx_vilvh_h(tf, td);
2214
2215 __m128i u0 = __lsx_vilvl_w(s4, s0);
2216 __m128i u1 = __lsx_vilvh_w(s4, s0);
2217 __m128i u2 = __lsx_vilvl_w(s5, s1);
2218 __m128i u3 = __lsx_vilvh_w(s5, s1);
2219 __m128i u4 = __lsx_vilvl_w(s6, s2);
2220 __m128i u5 = __lsx_vilvh_w(s6, s2);
2221 __m128i u6 = __lsx_vilvl_w(s7, s3);
2222 __m128i u7 = __lsx_vilvh_w(s7, s3);
2223 __m128i u8 = __lsx_vilvl_w(sc, s8);
2224 __m128i u9 = __lsx_vilvh_w(sc, s8);
2225 __m128i ua = __lsx_vilvl_w(sd, s9);
2226 __m128i ub = __lsx_vilvh_w(sd, s9);
2227 __m128i uc = __lsx_vilvl_w(se, sa);
2228 __m128i ud = __lsx_vilvh_w(se, sa);
2229 __m128i ue = __lsx_vilvl_w(sf, sb);
2230 __m128i uf = __lsx_vilvh_w(sf, sb);
2231
2232 kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2233 kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2234 kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2235 kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2236 kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2237 kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2238 kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2239 kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2240 kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2241 kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2242 kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2243 kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2244 kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2245 kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2246 kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2247 kernel.packet[15] = __lsx_vilvh_d(uf, u7);
2248}
2249EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
2250 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2251 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2252 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2253 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2254 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2255 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2256 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2257 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2258
2259 __m128i s0 = __lsx_vilvl_h(t2, t0);
2260 __m128i s1 = __lsx_vilvh_h(t2, t0);
2261 __m128i s2 = __lsx_vilvl_h(t3, t1);
2262 __m128i s3 = __lsx_vilvh_h(t3, t1);
2263 __m128i s4 = __lsx_vilvl_h(t6, t4);
2264 __m128i s5 = __lsx_vilvh_h(t6, t4);
2265 __m128i s6 = __lsx_vilvl_h(t7, t5);
2266 __m128i s7 = __lsx_vilvh_h(t7, t5);
2267
2268 kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2269 kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2270 kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2271 kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2272 kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2273 kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2274 kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2275 kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2276}
2277EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2278 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2279 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2280 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2281 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2282
2283 kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2284 kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2285 kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2286 kernel.packet[3] = __lsx_vilvh_h(t3, t1);
2287}
2288EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2289 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2290 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2291 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2292 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2293 __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2294 __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2295 __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2296 __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
2297
2298 __m128i s0 = __lsx_vilvl_w(t2, t0);
2299 __m128i s1 = __lsx_vilvh_w(t2, t0);
2300 __m128i s2 = __lsx_vilvl_w(t3, t1);
2301 __m128i s3 = __lsx_vilvh_w(t3, t1);
2302 __m128i s4 = __lsx_vilvl_w(t6, t4);
2303 __m128i s5 = __lsx_vilvh_w(t6, t4);
2304 __m128i s6 = __lsx_vilvl_w(t7, t5);
2305 __m128i s7 = __lsx_vilvh_w(t7, t5);
2306
2307 kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2308 kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2309 kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2310 kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2311 kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2312 kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2313 kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2314 kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2315}
2316EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2317 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2318 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2319 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2320 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2321
2322 kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2323 kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2324 kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2325 kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2326}
2327EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
2328 __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2329 __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2330 __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2331 __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2332
2333 kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2334 kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2335 kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2336 kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2337}
2338EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
2339 __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2340 kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2341 kernel.packet[1] = tmp;
2342}
2343EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2344 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2345 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2346 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2347 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2348 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2349 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2350 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2351 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2352 __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2353 __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2354 __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2355 __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2356 __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2357 __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2358 __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2359 __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
2360
2361 __m128i s0 = __lsx_vilvl_h(t2, t0);
2362 __m128i s1 = __lsx_vilvh_h(t2, t0);
2363 __m128i s2 = __lsx_vilvl_h(t3, t1);
2364 __m128i s3 = __lsx_vilvh_h(t3, t1);
2365 __m128i s4 = __lsx_vilvl_h(t6, t4);
2366 __m128i s5 = __lsx_vilvh_h(t6, t4);
2367 __m128i s6 = __lsx_vilvl_h(t7, t5);
2368 __m128i s7 = __lsx_vilvh_h(t7, t5);
2369 __m128i s8 = __lsx_vilvl_h(ta, t8);
2370 __m128i s9 = __lsx_vilvh_h(ta, t8);
2371 __m128i sa = __lsx_vilvl_h(tb, t9);
2372 __m128i sb = __lsx_vilvh_h(tb, t9);
2373 __m128i sc = __lsx_vilvl_h(te, tc);
2374 __m128i sd = __lsx_vilvh_h(te, tc);
2375 __m128i se = __lsx_vilvl_h(tf, td);
2376 __m128i sf = __lsx_vilvh_h(tf, td);
2377
2378 __m128i u0 = __lsx_vilvl_w(s4, s0);
2379 __m128i u1 = __lsx_vilvh_w(s4, s0);
2380 __m128i u2 = __lsx_vilvl_w(s5, s1);
2381 __m128i u3 = __lsx_vilvh_w(s5, s1);
2382 __m128i u4 = __lsx_vilvl_w(s6, s2);
2383 __m128i u5 = __lsx_vilvh_w(s6, s2);
2384 __m128i u6 = __lsx_vilvl_w(s7, s3);
2385 __m128i u7 = __lsx_vilvh_w(s7, s3);
2386 __m128i u8 = __lsx_vilvl_w(sc, s8);
2387 __m128i u9 = __lsx_vilvh_w(sc, s8);
2388 __m128i ua = __lsx_vilvl_w(sd, s9);
2389 __m128i ub = __lsx_vilvh_w(sd, s9);
2390 __m128i uc = __lsx_vilvl_w(se, sa);
2391 __m128i ud = __lsx_vilvh_w(se, sa);
2392 __m128i ue = __lsx_vilvl_w(sf, sb);
2393 __m128i uf = __lsx_vilvh_w(sf, sb);
2394
2395 kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2396 kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2397 kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2398 kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2399 kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2400 kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2401 kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2402 kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2403 kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2404 kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2405 kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2406 kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2407 kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2408 kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2409 kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2410 kernel.packet[15] = __lsx_vilvh_d(uf, u7);
2411}
2412EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
2413 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2414 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2415 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2416 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2417 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2418 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2419 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2420 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2421
2422 __m128i s0 = __lsx_vilvl_h(t2, t0);
2423 __m128i s1 = __lsx_vilvh_h(t2, t0);
2424 __m128i s2 = __lsx_vilvl_h(t3, t1);
2425 __m128i s3 = __lsx_vilvh_h(t3, t1);
2426 __m128i s4 = __lsx_vilvl_h(t6, t4);
2427 __m128i s5 = __lsx_vilvh_h(t6, t4);
2428 __m128i s6 = __lsx_vilvl_h(t7, t5);
2429 __m128i s7 = __lsx_vilvh_h(t7, t5);
2430
2431 kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2432 kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2433 kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2434 kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2435 kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2436 kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2437 kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2438 kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2439}
2440EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2441 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2442 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2443 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2444 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2445
2446 kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2447 kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2448 kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2449 kernel.packet[3] = __lsx_vilvh_h(t3, t1);
2450}
2451EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2452 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2453 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2454 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2455 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2456 __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2457 __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2458 __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2459 __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
2460
2461 __m128i s0 = __lsx_vilvl_w(t2, t0);
2462 __m128i s1 = __lsx_vilvh_w(t2, t0);
2463 __m128i s2 = __lsx_vilvl_w(t3, t1);
2464 __m128i s3 = __lsx_vilvh_w(t3, t1);
2465 __m128i s4 = __lsx_vilvl_w(t6, t4);
2466 __m128i s5 = __lsx_vilvh_w(t6, t4);
2467 __m128i s6 = __lsx_vilvl_w(t7, t5);
2468 __m128i s7 = __lsx_vilvh_w(t7, t5);
2469
2470 kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2471 kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2472 kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2473 kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2474 kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2475 kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2476 kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2477 kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2478}
2479EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2480 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2481 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2482 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2483 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2484
2485 kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2486 kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2487 kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2488 kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2489}
2490EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
2491 __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2492 __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2493 __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2494 __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2495
2496 kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2497 kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2498 kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2499 kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2500}
2501EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
2502 __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2503 kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2504 kernel.packet[1] = tmp;
2505}
2506
2507template <>
2508EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
2509 return __lsx_vfrsqrt_s(a);
2510}
2511template <>
2512EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
2513 return __lsx_vfrsqrt_d(a);
2514}
2515
2516template <>
2517EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) {
2518 return __lsx_vfrintrm_s(a);
2519}
2520template <>
2521EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) {
2522 return __lsx_vfrintrm_d(a);
2523}
2524
2525template <>
2526EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) {
2527 return __lsx_vfrintrp_s(a);
2528}
2529template <>
2530EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) {
2531 return __lsx_vfrintrp_d(a);
2532}
2533
2534template <>
2535EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) {
2536 const Packet4f mask = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x80000000u));
2537 const Packet4f prev0dot5 = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
2538 return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a));
2539}
2540template <>
2541EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) {
2542 const Packet2d mask = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
2543 const Packet2d prev0dot5 = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
2544 return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a));
2545}
2546
2547template <>
2548EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
2549 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2550}
2551template <>
2552EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
2553 return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2554}
2555
2556template <>
2557EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
2558 int8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2559 *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2560 *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2561 return __lsx_vld(tmp, 0);
2562}
2563template <>
2564EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
2565 uint8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2566 *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2567 *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2568 return __lsx_vld(tmp, 0);
2569}
2570template <>
2571EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
2572 int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2573 return __lsx_vld(tmp, 0);
2574}
2575template <>
2576EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
2577 uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2578 return __lsx_vld(tmp, 0);
2579}
2580template <>
2581EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
2582 int32_t tmp[4] = {*from, *from, *from, *from};
2583 return __lsx_vld(tmp, 0);
2584}
2585template <>
2586EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
2587 uint32_t tmp[4] = {*from, *from, *from, *from};
2588 return __lsx_vld(tmp, 0);
2589}
2590
2591template <>
2592EIGEN_STRONG_INLINE Packet16c pnmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2593 return __lsx_vmsub_b(pnegate(c), a, b);
2594}
2595template <>
2596EIGEN_STRONG_INLINE Packet8s pnmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2597 return __lsx_vmsub_h(pnegate(c), a, b);
2598}
2599template <>
2600EIGEN_STRONG_INLINE Packet4i pnmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2601 return __lsx_vmsub_w(pnegate(c), a, b);
2602}
2603template <>
2604EIGEN_STRONG_INLINE Packet2l pnmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2605 return __lsx_vmsub_d(pnegate(c), a, b);
2606}
2607
2608template <>
2609EIGEN_STRONG_INLINE Packet16c pmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2610 return __lsx_vmadd_b(pnegate(c), a, b);
2611}
2612template <>
2613EIGEN_STRONG_INLINE Packet8s pmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2614 return __lsx_vmadd_h(pnegate(c), a, b);
2615}
2616template <>
2617EIGEN_STRONG_INLINE Packet4i pmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2618 return __lsx_vmadd_w(pnegate(c), a, b);
2619}
2620template <>
2621EIGEN_STRONG_INLINE Packet2l pmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2622 return __lsx_vmadd_d(pnegate(c), a, b);
2623}
2624
2625template <>
2626EIGEN_STRONG_INLINE Packet16c pnmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2627 return __lsx_vmsub_b(c, a, b);
2628}
2629template <>
2630EIGEN_STRONG_INLINE Packet8s pnmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2631 return __lsx_vmsub_h(c, a, b);
2632}
2633template <>
2634EIGEN_STRONG_INLINE Packet4i pnmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2635 return __lsx_vmsub_w(c, a, b);
2636}
2637template <>
2638EIGEN_STRONG_INLINE Packet2l pnmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2639 return __lsx_vmsub_d(c, a, b);
2640}
2641
2642template <>
2643EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f& _x) {
2644 return pexp_float(_x);
2645}
2646template <>
2647EIGEN_STRONG_INLINE Packet2d pexp(const Packet2d& _x) {
2648 return pexp_double(_x);
2649}
2650
2651template <>
2652EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
2653 return pldexp_generic(a, exponent);
2654}
2655
2656template <>
2657EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
2658 return pfrexp_generic(a, exponent);
2659}
2660template <>
2661EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
2662 return pfrexp_generic(a, exponent);
2663}
2664template <>
2665EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) {
2666 Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f};
2667 return v;
2668}
2669template <>
2670EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
2671 Packet4f v = psub(a, b);
2672 return pabs(v);
2673}
2674template <>
2675EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
2676 return pmin<Packet4f>(a, b);
2677}
2678template <>
2679EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
2680 return pmax<Packet4f>(a, b);
2681}
2682template <>
2683EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
2684 return (__m128)__lsx_vldrepl_w(from, 0);
2685}
2686template <>
2687EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
2688 return (__m128)__lsx_vsrai_w((__m128i)a, 31);
2689}
2690template <>
2691EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
2692 return __lsx_vfrintrne_s(a);
2693}
2694template <>
2695EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
2696 return __lsx_vfrintrz_s(a);
2697}
2698template <>
2699EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
2700 return __lsx_vfrecip_s(a);
2701}
2702
2703template <>
2704EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /* a */) {
2705 Packet2d v = {0.0, 0.0};
2706 return v;
2707}
2708template <>
2709EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
2710 return pmin<Packet2d>(a, b);
2711}
2712template <>
2713EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
2714 return pmax<Packet2d>(a, b);
2715}
2716template <>
2717EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
2718 return (__m128d)(__lsx_vsrai_d((__m128i)a, 63));
2719}
2720template <>
2721EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
2722 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2723}
2724template <>
2725EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
2726 return __lsx_vfrintrne_d(a);
2727}
2728template <>
2729EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
2730 return __lsx_vfrintrz_d(a);
2731}
2732template <>
2733EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
2734 return pldexp_generic(a, exponent);
2735}
2736
2737template <>
2738EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
2739 Packet16c v = psub(a, b);
2740 return pabs(v);
2741}
2742
2743template <>
2744EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
2745 Packet8s v = psub(a, b);
2746 return pabs(v);
2747}
2748template <>
2749EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
2750 return __lsx_vbitsel_v(b, a, mask);
2751}
2752
2753template <>
2754EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
2755 Packet4i v = psub(a, b);
2756 return pabs(v);
2757}
2758template <>
2759EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
2760 return __lsx_vbitsel_v(b, a, mask);
2761}
2762
2763template <>
2764EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
2765 return __lsx_vbitsel_v(b, a, mask);
2766}
2767
2768template <>
2769EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2770 return __lsx_vdiv_bu(a, b);
2771}
2772template <>
2773EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2774 Packet16uc v = psub(a, b);
2775 return pabs(v);
2776}
2777template <>
2778EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
2779 const Packet16uc& b) {
2780 return __lsx_vbitsel_v(b, a, mask);
2781}
2782template <>
2783EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
2784 __m128i res = {0, 0};
2785 __m128i add = {0x0808080808080808, 0x0808080808080808};
2786 for (int i = 0; i < 4; i++) {
2787 const __m128i temp = __lsx_vor_v(res, add);
2788 const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp));
2789 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a));
2790 add = __lsx_vsrli_b(add, 1);
2791 }
2792 return res;
2793}
2794
2795template <>
2796EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
2797 Packet8us v = psub(a, b);
2798 return pabs(v);
2799}
2800template <>
2801EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
2802 return __lsx_vbitsel_v(b, a, mask);
2803}
2804template <>
2805EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
2806 __m128i res = {0, 0};
2807 __m128i add = {0x0080008000800080, 0x0080008000800080};
2808 for (int i = 0; i < 4; i++) {
2809 const __m128i temp = __lsx_vor_v(res, add);
2810 const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp));
2811 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a));
2812 add = __lsx_vsrli_h(add, 1);
2813 }
2814 return res;
2815}
2816
2817template <>
2818EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
2819 Packet4ui v = psub(a, b);
2820 return pabs(v);
2821}
2822template <>
2823EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
2824 return __lsx_vbitsel_v(b, a, mask);
2825}
2826template <>
2827EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
2828 __m128i res = {0, 0};
2829 __m128i add = {0x0000800000008000, 0x0000800000008000};
2830 for (int i = 0; i < 4; i++) {
2831 const __m128i temp = __lsx_vor_v(res, add);
2832 const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp));
2833 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a));
2834 add = __lsx_vsrli_w(add, 1);
2835 }
2836 return res;
2837}
2838
2839template <>
2840EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
2841 return __lsx_vbitsel_v(b, a, mask);
2842}
2843
2844} // namespace internal
2845} // namespace Eigen
2846#endif
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82