// Eigen 5.0.1-dev+284dcc12 -- source listing of PacketMath.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
11#ifndef EIGEN_PACKET_MATH_LSX_H
12#define EIGEN_PACKET_MATH_LSX_H
13
14// IWYU pragma: private
15#include "../../InternalHeaderCheck.h"
16
17namespace Eigen {
18
19namespace internal {
20
// Backend tuning macros. Each one is only defined here when the including code has not already
// provided its own definition.

// Default value 8. NOTE(review): presumably consumed by Eigen's product kernels; the consumer is
// not visible in this file.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// A register count of 32 is supplied for 64-bit LoongArch only; on any other target this block
// leaves the macro undefined.
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#if EIGEN_ARCH_LOONGARCH64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
#endif

// Value-less flag. This backend implements the floating-point pmadd family with the LSX fused
// multiply-add intrinsics (__lsx_vfmadd_s / __lsx_vfmadd_d and friends).
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
34
35typedef __m128 Packet4f;
36typedef __m128d Packet2d;
37
38typedef eigen_packet_wrapper<__m128i, 0> Packet16c;
39typedef eigen_packet_wrapper<__m128i, 1> Packet8s;
40typedef eigen_packet_wrapper<__m128i, 2> Packet4i;
41typedef eigen_packet_wrapper<__m128i, 3> Packet2l;
42typedef eigen_packet_wrapper<__m128i, 4> Packet16uc;
43typedef eigen_packet_wrapper<__m128i, 5> Packet8us;
44typedef eigen_packet_wrapper<__m128i, 6> Packet4ui;
45typedef eigen_packet_wrapper<__m128i, 7> Packet2ul;
46
47template <>
48struct is_arithmetic<__m128> {
49 enum { value = true };
50};
51template <>
52struct is_arithmetic<__m128i> {
53 enum { value = true };
54};
55template <>
56struct is_arithmetic<__m128d> {
57 enum { value = true };
58};
59template <>
60struct is_arithmetic<Packet16c> {
61 enum { value = true };
62};
63template <>
64struct is_arithmetic<Packet8s> {
65 enum { value = true };
66};
67template <>
68struct is_arithmetic<Packet4i> {
69 enum { value = true };
70};
71template <>
72struct is_arithmetic<Packet2l> {
73 enum { value = true };
74};
75template <>
76struct is_arithmetic<Packet16uc> {
77 enum { value = false };
78};
79template <>
80struct is_arithmetic<Packet8us> {
81 enum { value = false };
82};
83template <>
84struct is_arithmetic<Packet4ui> {
85 enum { value = false };
86};
87template <>
88struct is_arithmetic<Packet2ul> {
89 enum { value = false };
90};
91
92EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
93 float from[4] = {a, b, c, d};
94 return (Packet4f)__lsx_vld(from, 0);
95}
96
97EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
98 const float* a = reinterpret_cast<const float*>(&m);
99 Packet4f res =
100 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
101 return res;
102}
103
104template <bool interleave>
105EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
106 const float* a = reinterpret_cast<const float*>(&m);
107 const float* b = reinterpret_cast<const float*>(&n);
108 Packet4f res =
109 make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
110 return res;
111}
112
113template <>
114EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
115 const float* a = reinterpret_cast<const float*>(&m);
116 const float* b = reinterpret_cast<const float*>(&n);
117 Packet4f res =
118 make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
119 return res;
120}
121
122EIGEN_STRONG_INLINE static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) {
123 return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
124}
125
126EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
127 return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s));
128}
129EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
130 return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(p, q, r, s));
131}
132EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
133 return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1));
134}
135EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
136 return shuffle2<false>(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3));
137}
138EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
139 return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1));
140}
141EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
142 return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3));
143}
144
145EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
146 double from[2] = {a, b};
147 return (Packet2d)__lsx_vld(from, 0);
148}
149
150EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
151 const double* a = reinterpret_cast<const double*>(&m);
152 const double* b = reinterpret_cast<const double*>(&n);
153 Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
154 return res;
155}
156
157EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
158 return shuffle(a, b, mask);
159}
160EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
161EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
162
163template <>
164struct packet_traits<int8_t> : default_packet_traits {
165 typedef Packet16c type;
166 typedef Packet16c half;
167 enum {
168 Vectorizable = 1,
169 AlignedOnScalar = 1,
170 size = 16,
171
172 HasAbs2 = 0,
173 HasSetLinear = 0,
174 HasCmp = 1,
175 HasBlend = 0
176 };
177};
178
179template <>
180struct packet_traits<int16_t> : default_packet_traits {
181 typedef Packet8s type;
182 typedef Packet8s half;
183 enum {
184 Vectorizable = 1,
185 AlignedOnScalar = 1,
186 size = 8,
187
188 HasAbs2 = 0,
189 HasSetLinear = 0,
190 HasCmp = 1,
191 HasDiv = 1,
192 HasBlend = 0
193 };
194};
195
196template <>
197struct packet_traits<int32_t> : default_packet_traits {
198 typedef Packet4i type;
199 typedef Packet4i half;
200 enum {
201 Vectorizable = 1,
202 AlignedOnScalar = 1,
203 size = 4,
204
205 HasAbs2 = 0,
206 HasSetLinear = 0,
207 HasCmp = 1,
208 HasDiv = 1,
209 HasBlend = 0
210 };
211};
212
213template <>
214struct packet_traits<int64_t> : default_packet_traits {
215 typedef Packet2l type;
216 typedef Packet2l half;
217 enum {
218 Vectorizable = 1,
219 AlignedOnScalar = 1,
220 size = 2,
221
222 HasAbs2 = 0,
223 HasSetLinear = 0,
224 HasCmp = 1,
225 HasDiv = 1,
226 HasBlend = 0
227 };
228};
229
230template <>
231struct packet_traits<uint8_t> : default_packet_traits {
232 typedef Packet16uc type;
233 typedef Packet16uc half;
234 enum {
235 Vectorizable = 1,
236 AlignedOnScalar = 1,
237 size = 16,
238
239 HasAbs2 = 0,
240 HasSetLinear = 0,
241 HasNegate = 0,
242 HasCmp = 1,
243 HasBlend = 0
244 };
245};
246
247template <>
248struct packet_traits<uint16_t> : default_packet_traits {
249 typedef Packet8us type;
250 typedef Packet8us half;
251 enum {
252 Vectorizable = 1,
253 AlignedOnScalar = 1,
254 size = 8,
255
256 HasAbs2 = 0,
257 HasSetLinear = 0,
258 HasNegate = 0,
259 HasCmp = 1,
260 HasDiv = 1,
261 HasBlend = 0
262 };
263};
264
265template <>
266struct packet_traits<uint32_t> : default_packet_traits {
267 typedef Packet4ui type;
268 typedef Packet4ui half;
269 enum {
270 Vectorizable = 1,
271 AlignedOnScalar = 1,
272 size = 4,
273
274 HasAbs2 = 0,
275 HasSetLinear = 0,
276 HasNegate = 0,
277 HasCmp = 1,
278 HasDiv = 1,
279 HasBlend = 0
280 };
281};
282
283template <>
284struct packet_traits<uint64_t> : default_packet_traits {
285 typedef Packet2ul type;
286 typedef Packet2ul half;
287 enum {
288 Vectorizable = 1,
289 AlignedOnScalar = 1,
290 size = 2,
291
292 HasAbs2 = 0,
293 HasSetLinear = 0,
294 HasNegate = 0,
295 HasCmp = 1,
296 HasDiv = 1,
297 HasBlend = 0
298 };
299};
300
301template <>
302struct packet_traits<float> : default_packet_traits {
303 typedef Packet4f type;
304 typedef Packet4f half;
305 enum {
306 Vectorizable = 1,
307 AlignedOnScalar = 1,
308 size = 4,
309
310 HasAbs2 = 0,
311 HasSetLinear = 0,
312 HasBlend = 0,
313 HasSign = 0,
314 HasDiv = 1,
315 HasExp = 1,
316 HasSqrt = 1,
317 HasLog = 1,
318 HasRsqrt = 1
319 };
320};
321
322template <>
323struct packet_traits<double> : default_packet_traits {
324 typedef Packet2d type;
325 typedef Packet2d half;
326 enum {
327 Vectorizable = 1,
328 AlignedOnScalar = 1,
329 size = 2,
330
331 HasAbs2 = 0,
332 HasSetLinear = 0,
333 HasBlend = 0,
334 HasSign = 0,
335 HasDiv = 1,
336 HasSqrt = 1,
337 HasLog = 1,
338 HasRsqrt = 1
339 };
340};
341
342template <>
343struct unpacket_traits<Packet16c> {
344 typedef int8_t type;
345 typedef Packet16c half;
346 enum {
347 size = 16,
348 alignment = Aligned16,
349 vectorizable = true,
350 masked_load_available = false,
351 masked_store_available = false
352 };
353};
354template <>
355struct unpacket_traits<Packet8s> {
356 typedef int16_t type;
357 typedef Packet8s half;
358 enum {
359 size = 8,
360 alignment = Aligned16,
361 vectorizable = true,
362 masked_load_available = false,
363 masked_store_available = false
364 };
365};
366template <>
367struct unpacket_traits<Packet4i> {
368 typedef int32_t type;
369 typedef Packet4i half;
370 enum {
371 size = 4,
372 alignment = Aligned16,
373 vectorizable = true,
374 masked_load_available = false,
375 masked_store_available = false
376 };
377};
378template <>
379struct unpacket_traits<Packet2l> {
380 typedef int64_t type;
381 typedef Packet2l half;
382 enum {
383 size = 2,
384 alignment = Aligned16,
385 vectorizable = true,
386 masked_load_available = false,
387 masked_store_available = false
388 };
389};
390template <>
391struct unpacket_traits<Packet16uc> {
392 typedef uint8_t type;
393 typedef Packet16uc half;
394 enum {
395 size = 16,
396 alignment = Aligned16,
397 vectorizable = true,
398 masked_load_available = false,
399 masked_store_available = false
400 };
401};
402template <>
403struct unpacket_traits<Packet8us> {
404 typedef uint16_t type;
405 typedef Packet8us half;
406 enum {
407 size = 8,
408 alignment = Aligned16,
409 vectorizable = true,
410 masked_load_available = false,
411 masked_store_available = false
412 };
413};
414template <>
415struct unpacket_traits<Packet4ui> {
416 typedef uint32_t type;
417 typedef Packet4ui half;
418 enum {
419 size = 4,
420 alignment = Aligned16,
421 vectorizable = true,
422 masked_load_available = false,
423 masked_store_available = false
424 };
425};
426template <>
427struct unpacket_traits<Packet2ul> {
428 typedef uint64_t type;
429 typedef Packet2ul half;
430 enum {
431 size = 2,
432 alignment = Aligned16,
433 vectorizable = true,
434 masked_load_available = false,
435 masked_store_available = false
436 };
437};
438template <>
439struct unpacket_traits<Packet4f> {
440 typedef float type;
441 typedef Packet4f half;
442 typedef Packet4i integer_packet;
443 enum {
444 size = 4,
445 alignment = Aligned16,
446 vectorizable = true,
447 masked_load_available = false,
448 masked_store_available = false
449 };
450};
451template <>
452struct unpacket_traits<Packet2d> {
453 typedef double type;
454 typedef Packet2d half;
455 typedef Packet2l integer_packet;
456 enum {
457 size = 2,
458 alignment = Aligned16,
459 vectorizable = true,
460 masked_load_available = false,
461 masked_store_available = false
462 };
463};
464
465template <>
466EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
467 return __lsx_vreplgr2vr_b(from);
468}
469template <>
470EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
471 return __lsx_vreplgr2vr_h(from);
472}
473template <>
474EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
475 return __lsx_vreplgr2vr_w(from);
476}
477template <>
478EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
479 return __lsx_vreplgr2vr_d(from);
480}
481template <>
482EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
483 return __lsx_vreplgr2vr_b(from);
484}
485template <>
486EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
487 return __lsx_vreplgr2vr_h(from);
488}
489template <>
490EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
491 return __lsx_vreplgr2vr_w(from);
492}
493template <>
494EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
495 return __lsx_vreplgr2vr_d(from);
496}
497template <>
498EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
499 Packet4f v = {from, from, from, from};
500 return v;
501}
502template <>
503EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
504 Packet2d v = {from, from};
505 return v;
506}
507
508template <>
509EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
510 return reinterpret_cast<__m128>((__m128i)pset1<Packet4ui>(from));
511}
512template <>
513EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
514 return reinterpret_cast<__m128d>((__m128i)pset1<Packet2ul>(from));
515}
516
517template <>
518EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
519 const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
520 return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(countdown, 0));
521}
522template <>
523EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
524 const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
525 return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(countdown, 0));
526}
527template <>
528EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
529 const int32_t countdown[] = {0, 1, 2, 3};
530 return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(countdown, 0));
531}
532template <>
533EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
534 const int64_t countdown[] = {0, 1};
535 return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(countdown, 0));
536}
537template <>
538EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
539 const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
540 return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(countdown, 0));
541}
542template <>
543EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
544 const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
545 return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(countdown, 0));
546}
547template <>
548EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
549 const uint32_t countdown[] = {0, 1, 2, 3};
550 return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(countdown, 0));
551}
552template <>
553EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
554 const uint64_t countdown[] = {0, 1};
555 return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(countdown, 0));
556}
557template <>
558EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
559 static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
560 return __lsx_vfadd_s(pset1<Packet4f>(a), countdown);
561}
562template <>
563EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
564 static const Packet2d countdown = {0.0f, 1.0f};
565 return __lsx_vfadd_d(pset1<Packet2d>(a), countdown);
566}
567
568template <>
569EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
570 return __lsx_vadd_b(a, b);
571}
572template <>
573EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
574 return __lsx_vadd_h(a, b);
575}
576template <>
577EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
578 return __lsx_vadd_w(a, b);
579}
580template <>
581EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
582 return __lsx_vadd_d(a, b);
583}
584template <>
585EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
586 return __lsx_vadd_b(a, b);
587}
588template <>
589EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
590 return __lsx_vadd_h(a, b);
591}
592template <>
593EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
594 return __lsx_vadd_w(a, b);
595}
596template <>
597EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
598 return __lsx_vadd_d(a, b);
599}
600template <>
601EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
602 return __lsx_vfadd_s(a, b);
603}
604template <>
605EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
606 return __lsx_vfadd_d(a, b);
607}
608
609template <>
610EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
611 return __lsx_vsub_b(a, b);
612}
613template <>
614EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
615 return __lsx_vsub_h(a, b);
616}
617template <>
618EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
619 return __lsx_vsub_w(a, b);
620}
621template <>
622EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
623 return __lsx_vsub_d(a, b);
624}
625template <>
626EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
627 return __lsx_vsub_b(a, b);
628}
629template <>
630EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
631 return __lsx_vsub_h(a, b);
632}
633template <>
634EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
635 return __lsx_vsub_w(a, b);
636}
637template <>
638EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
639 return __lsx_vsub_d(a, b);
640}
641template <>
642EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
643 return __lsx_vfsub_s(a, b);
644}
645template <>
646EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
647 return __lsx_vfsub_d(a, b);
648}
649
650template <>
651EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
652template <>
653EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
654 const Packet4f mask =
655 make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
656 return padd(a, pxor(mask, b));
657}
658template <>
659EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b);
660template <>
661EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
662 const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
663 return padd(a, pxor(mask, b));
664}
665
666template <>
667EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
668 Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000),
669 numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000));
670 return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
671}
672template <>
673EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
674 Packet2d mask =
675 make_packet2d(numext::bit_cast<double>(0x8000000000000000), numext::bit_cast<double>(0x8000000000000000));
676 return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
677}
678template <>
679EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
680 return __lsx_vneg_b(a);
681}
682template <>
683EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
684 return __lsx_vneg_h(a);
685}
686template <>
687EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
688 return __lsx_vneg_w(a);
689}
690template <>
691EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
692 return __lsx_vneg_d(a);
693}
694
695template <>
696EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
697 return a;
698}
699template <>
700EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
701 return a;
702}
703template <>
704EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
705 return a;
706}
707template <>
708EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
709 return a;
710}
711template <>
712EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
713 return a;
714}
715template <>
716EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
717 return a;
718}
719template <>
720EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
721 return a;
722}
723template <>
724EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
725 return a;
726}
727template <>
728EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
729 return a;
730}
731template <>
732EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
733 return a;
734}
735
736template <>
737EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
738 return __lsx_vfmul_s(a, b);
739}
740template <>
741EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
742 return __lsx_vfmul_d(a, b);
743}
744template <>
745EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
746 return __lsx_vmul_b(a, b);
747}
748template <>
749EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
750 return __lsx_vmul_h(a, b);
751}
752template <>
753EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
754 return __lsx_vmul_w(a, b);
755}
756template <>
757EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
758 return __lsx_vmul_d(a, b);
759}
760template <>
761EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
762 return __lsx_vmul_b(a, b);
763}
764template <>
765EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
766 return __lsx_vmul_h(a, b);
767}
768template <>
769EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
770 return __lsx_vmul_w(a, b);
771}
772template <>
773EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
774 return __lsx_vmul_d(a, b);
775}
776
777template <>
778EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
779 return __lsx_vfdiv_s(a, b);
780}
781template <>
782EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
783 return __lsx_vfdiv_d(a, b);
784}
785template <>
786EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& a, const Packet8s& b) {
787 return __lsx_vdiv_h(a, b);
788}
789template <>
790EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
791 return __lsx_vdiv_w(a, b);
792}
793template <>
794EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& a, const Packet2l& b) {
795 return __lsx_vdiv_d(a, b);
796}
797template <>
798EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& a, const Packet8us& b) {
799 return __lsx_vdiv_hu(a, b);
800}
801template <>
802EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
803 return __lsx_vdiv_wu(a, b);
804}
805template <>
806EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
807 return __lsx_vdiv_du(a, b);
808}
809
810template <>
811EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
812 return __lsx_vfmadd_s(a, b, c);
813}
814template <>
815EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
816 return __lsx_vfmadd_d(a, b, c);
817}
818template <>
819EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
820 return __lsx_vfmsub_s(a, b, c);
821}
822template <>
823EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
824 return __lsx_vfmsub_d(a, b, c);
825}
826template <>
827EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
828 return __lsx_vfnmsub_s(a, b, c);
829}
830template <>
831EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
832 return __lsx_vfnmsub_d(a, b, c);
833}
834template <>
835EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
836 return __lsx_vfnmadd_s(a, b, c);
837}
838template <>
839EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
840 return __lsx_vfnmadd_d(a, b, c);
841}
842template <>
843EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
844 return __lsx_vmadd_b(c, a, b);
845}
846template <>
847EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
848 return __lsx_vmadd_h(c, a, b);
849}
850template <>
851EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
852 return __lsx_vmadd_w(c, a, b);
853}
854template <>
855EIGEN_STRONG_INLINE Packet2l pmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
856 return __lsx_vmadd_d(c, a, b);
857}
858template <>
859EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
860 return __lsx_vmadd_b(c, a, b);
861}
862template <>
863EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
864 return __lsx_vmadd_h(c, a, b);
865}
866template <>
867EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
868 return __lsx_vmadd_w(c, a, b);
869}
870template <>
871EIGEN_STRONG_INLINE Packet2ul pmadd(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c) {
872 return __lsx_vmadd_d(c, a, b);
873}
874
875template <>
876EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
877 return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b);
878}
879template <>
880EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
881 return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b);
882}
883template <>
884EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
885 return __lsx_vand_v(a, b);
886}
887template <>
888EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
889 return __lsx_vand_v(a, b);
890}
891template <>
892EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
893 return __lsx_vand_v(a, b);
894}
895template <>
896EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
897 return __lsx_vand_v(a, b);
898}
899template <>
900EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
901 return __lsx_vand_v(a, b);
902}
903template <>
904EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
905 return __lsx_vand_v(a, b);
906}
907template <>
908EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
909 return __lsx_vand_v(a, b);
910}
911template <>
912EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
913 return __lsx_vand_v(a, b);
914}
915
916template <>
917EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
918 return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b);
919}
920template <>
921EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
922 return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b);
923}
924template <>
925EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
926 return __lsx_vor_v(a, b);
927}
928template <>
929EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
930 return __lsx_vor_v(a, b);
931}
932template <>
933EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
934 return __lsx_vor_v(a, b);
935}
936template <>
937EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
938 return __lsx_vor_v(a, b);
939}
940template <>
941EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
942 return __lsx_vor_v(a, b);
943}
944template <>
945EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
946 return __lsx_vor_v(a, b);
947}
948template <>
949EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
950 return __lsx_vor_v(a, b);
951}
952template <>
953EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
954 return __lsx_vor_v(a, b);
955}
956
957template <>
958EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
959 return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b);
960}
961template <>
962EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
963 return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b);
964}
965template <>
966EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
967 return __lsx_vxor_v(a, b);
968}
969template <>
970EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
971 return __lsx_vxor_v(a, b);
972}
973template <>
974EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
975 return __lsx_vxor_v(a, b);
976}
977template <>
978EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
979 return __lsx_vxor_v(a, b);
980}
981template <>
982EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
983 return __lsx_vxor_v(a, b);
984}
985template <>
986EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
987 return __lsx_vxor_v(a, b);
988}
989template <>
990EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
991 return __lsx_vxor_v(a, b);
992}
993template <>
994EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
995 return __lsx_vxor_v(a, b);
996}
997
998template <>
999EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1000 return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a);
1001}
1002template <>
1003EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
1004 return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a);
1005}
1006template <>
1007EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
1008 return __lsx_vandn_v(b, a);
1009}
1010template <>
1011EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
1012 return __lsx_vandn_v(b, a);
1013}
1014template <>
1015EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
1016 return __lsx_vandn_v(b, a);
1017}
1018template <>
1019EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
1020 return __lsx_vandn_v(b, a);
1021}
1022template <>
1023EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1024 return __lsx_vandn_v(b, a);
1025}
1026template <>
1027EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
1028 return __lsx_vandn_v(b, a);
1029}
1030template <>
1031EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1032 return __lsx_vandn_v(b, a);
1033}
1034template <>
1035EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1036 return __lsx_vandn_v(b, a);
1037}
1038
1039template <>
1040EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
1041 return (Packet4f)__lsx_vfcmp_cle_s(a, b);
1042}
1043template <>
1044EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(const Packet2d& a, const Packet2d& b) {
1045 return (Packet2d)__lsx_vfcmp_cle_d(a, b);
1046}
1047template <>
1048EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
1049 return __lsx_vsle_b(a, b);
1050}
1051template <>
1052EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
1053 return __lsx_vsle_h(a, b);
1054}
1055template <>
1056EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
1057 return __lsx_vsle_w(a, b);
1058}
1059template <>
1060EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
1061 return __lsx_vsle_d(a, b);
1062}
1063template <>
1064EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1065 return __lsx_vsle_bu(a, b);
1066}
1067template <>
1068EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
1069 return __lsx_vsle_hu(a, b);
1070}
1071template <>
1072EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1073 return __lsx_vsle_wu(a, b);
1074}
1075template <>
1076EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1077 return __lsx_vsle_du(a, b);
1078}
1079
1080template <>
1081EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
1082 return (Packet4f)__lsx_vfcmp_clt_s(a, b);
1083}
1084template <>
1085EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(const Packet2d& a, const Packet2d& b) {
1086 return (Packet2d)__lsx_vfcmp_clt_d(a, b);
1087}
1088template <>
1089EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
1090 return __lsx_vslt_b(a, b);
1091}
1092template <>
1093EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
1094 return __lsx_vslt_h(a, b);
1095}
1096template <>
1097EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
1098 return __lsx_vslt_w(a, b);
1099}
1100template <>
1101EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
1102 return __lsx_vslt_d(a, b);
1103}
1104template <>
1105EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1106 return __lsx_vslt_bu(a, b);
1107}
1108template <>
1109EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
1110 return __lsx_vslt_hu(a, b);
1111}
1112template <>
1113EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1114 return __lsx_vslt_wu(a, b);
1115}
1116template <>
1117EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1118 return __lsx_vslt_du(a, b);
1119}
1120
1121template <>
1122EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
1123 return (Packet4f)__lsx_vfcmp_sult_s(a, b);
1124}
1125template <>
1126EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) {
1127 return (Packet2d)__lsx_vfcmp_sult_d(a, b);
1128}
1129
1130template <>
1131EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
1132 return (Packet4f)__lsx_vfcmp_seq_s(a, b);
1133}
1134template <>
1135EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(const Packet2d& a, const Packet2d& b) {
1136 return (Packet2d)__lsx_vfcmp_seq_d(a, b);
1137}
1138template <>
1139EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
1140 return __lsx_vseq_b(a, b);
1141}
1142template <>
1143EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
1144 return __lsx_vseq_h(a, b);
1145}
1146template <>
1147EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
1148 return __lsx_vseq_w(a, b);
1149}
1150template <>
1151EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
1152 return __lsx_vseq_d(a, b);
1153}
1154template <>
1155EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1156 return __lsx_vseq_b(a, b);
1157}
1158template <>
1159EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
1160 return __lsx_vseq_h(a, b);
1161}
1162template <>
1163EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1164 return __lsx_vseq_w(a, b);
1165}
1166template <>
1167EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1168 return __lsx_vseq_d(a, b);
1169}
1170
1171template <>
1172EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1173 return __lsx_vmin_b(a, b);
1174}
1175template <>
1176EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1177 return __lsx_vmin_h(a, b);
1178}
1179template <>
1180EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1181 return __lsx_vmin_w(a, b);
1182}
1183template <>
1184EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1185 return __lsx_vmin_d(a, b);
1186}
1187template <>
1188EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1189 return __lsx_vmin_bu(a, b);
1190}
1191template <>
1192EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1193 return __lsx_vmin_hu(a, b);
1194}
1195template <>
1196EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1197 return __lsx_vmin_wu(a, b);
1198}
1199template <>
1200EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1201 return __lsx_vmin_du(a, b);
1202}
1203
1204template <>
1205EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1206 return __lsx_vmax_b(a, b);
1207}
1208template <>
1209EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1210 return __lsx_vmax_h(a, b);
1211}
1212template <>
1213EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1214 return __lsx_vmax_w(a, b);
1215}
1216template <>
1217EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1218 return __lsx_vmax_d(a, b);
1219}
1220template <>
1221EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1222 return __lsx_vmax_bu(a, b);
1223}
1224template <>
1225EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1226 return __lsx_vmax_hu(a, b);
1227}
1228template <>
1229EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1230 return __lsx_vmax_wu(a, b);
1231}
1232template <>
1233EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
1234 return __lsx_vmax_du(a, b);
1235}
1236
1237template <>
1238EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1239 Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1240 Packet4i aMinOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), aNaN);
1241 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1242}
1243template <>
1244EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
1245 Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1246 Packet2l aMinOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), aNaN);
1247 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
1248}
1249template <>
1250EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1251 Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
1252 Packet4i aMaxOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), aNaN);
1253 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1254}
1255template <>
1256EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
1257 Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
1258 Packet2l aMaxOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), aNaN);
1259 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
1260}
1261
1262template <int N>
1263EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(const Packet16c& a) {
1264 return __lsx_vsrai_b((__m128i)a, N);
1265}
1266template <int N>
1267EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(const Packet8s& a) {
1268 return __lsx_vsrai_h((__m128i)a, N);
1269}
1270template <int N>
1271EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1272 return __lsx_vsrai_w((__m128i)a, N);
1273}
1274template <int N>
1275EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
1276 return __lsx_vsrai_d((__m128i)a, N);
1277}
1278template <int N>
1279EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(const Packet16uc& a) {
1280 return __lsx_vsrli_b((__m128i)a, N);
1281}
1282template <int N>
1283EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(const Packet8us& a) {
1284 return __lsx_vsrli_h((__m128i)a, N);
1285}
1286template <int N>
1287EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
1288 return __lsx_vsrli_w((__m128i)a, N);
1289}
1290template <int N>
1291EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(const Packet2ul& a) {
1292 return __lsx_vsrli_d((__m128i)a, N);
1293}
1294
1295template <int N>
1296EIGEN_STRONG_INLINE Packet16c plogical_shift_right(const Packet16c& a) {
1297 return __lsx_vsrli_b((__m128i)a, N);
1298}
1299template <int N>
1300EIGEN_STRONG_INLINE Packet8s plogical_shift_right(const Packet8s& a) {
1301 return __lsx_vsrli_h((__m128i)a, N);
1302}
1303template <int N>
1304EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1305 return __lsx_vsrli_w((__m128i)a, N);
1306}
1307template <int N>
1308EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
1309 return __lsx_vsrli_d((__m128i)a, N);
1310}
1311template <int N>
1312EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(const Packet16uc& a) {
1313 return __lsx_vsrli_b((__m128i)a, N);
1314}
1315template <int N>
1316EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
1317 return __lsx_vsrli_h((__m128i)a, N);
1318}
1319template <int N>
1320EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1321 return __lsx_vsrli_w((__m128i)a, N);
1322}
1323template <int N>
1324EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(const Packet2ul& a) {
1325 return __lsx_vsrli_d((__m128i)a, N);
1326}
1327
1328template <int N>
1329EIGEN_STRONG_INLINE Packet16c plogical_shift_left(const Packet16c& a) {
1330 return __lsx_vslli_b((__m128i)a, N);
1331}
1332template <int N>
1333EIGEN_STRONG_INLINE Packet8s plogical_shift_left(const Packet8s& a) {
1334 return __lsx_vslli_h((__m128i)a, N);
1335}
1336template <int N>
1337EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1338 return __lsx_vslli_w((__m128i)a, N);
1339}
1340template <int N>
1341EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
1342 return __lsx_vslli_d((__m128i)a, N);
1343}
1344template <int N>
1345EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(const Packet16uc& a) {
1346 return __lsx_vslli_b((__m128i)a, N);
1347}
1348template <int N>
1349EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
1350 return __lsx_vslli_h((__m128i)a, N);
1351}
1352template <int N>
1353EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1354 return __lsx_vslli_w((__m128i)a, N);
1355}
1356template <int N>
1357EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(const Packet2ul& a) {
1358 return __lsx_vslli_d((__m128i)a, N);
1359}
1360
1361template <>
1362EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1363 return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31);
1364}
1365template <>
1366EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1367 return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63);
1368}
1369template <>
1370EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
1371 return __lsx_vabsd_b(a, pzero(a));
1372}
1373template <>
1374EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
1375 return __lsx_vabsd_h(a, pzero(a));
1376}
1377template <>
1378EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1379 return __lsx_vabsd_w(a, pzero(a));
1380}
1381template <>
1382EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
1383 return __lsx_vabsd_d(a, pzero(a));
1384}
1385template <>
1386EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
1387 return a;
1388}
1389template <>
1390EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
1391 return a;
1392}
1393template <>
1394EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
1395 return a;
1396}
1397template <>
1398EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
1399 return a;
1400}
1401
1402template <>
1403EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
1404 EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
1405}
1406template <>
1407EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
1408 EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
1409}
1410template <>
1411EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
1412 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1413}
1414template <>
1415EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
1416 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1417}
1418template <>
1419EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
1420 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1421}
1422template <>
1423EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
1424 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1425}
1426template <>
1427EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
1428 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1429}
1430template <>
1431EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
1432 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1433}
1434template <>
1435EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
1436 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1437}
1438template <>
1439EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
1440 EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
1441}
1442
1443template <>
1444EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1445 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
1446}
1447template <>
1448EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
1449 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
1450}
1451template <>
1452EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
1453 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1454}
1455template <>
1456EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
1457 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1458}
1459template <>
1460EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
1461 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1462}
1463template <>
1464EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
1465 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1466}
1467template <>
1468EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
1469 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1470}
1471template <>
1472EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
1473 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1474}
1475template <>
1476EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
1477 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1478}
1479template <>
1480EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
1481 EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
1482}
1483
1484template <>
1485EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1486 float f0 = from[0], f1 = from[1];
1487 return make_packet4f(f0, f0, f1, f1);
1488}
1489template <>
1490EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
1491 return pset1<Packet2d>(from[0]);
1492}
1493template <>
1494EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
1495 Packet16c tmp = pload<Packet16c>(from);
1496 return __lsx_vilvl_b(tmp, tmp);
1497}
1498template <>
1499EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
1500 Packet8s tmp = pload<Packet8s>(from);
1501 return __lsx_vilvl_h(tmp, tmp);
1502}
1503template <>
1504EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
1505 Packet4i tmp = pload<Packet4i>(from);
1506 return __lsx_vilvl_w(tmp, tmp);
1507}
1508template <>
1509EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
1510 return pset1<Packet2l>(from[0]);
1511}
1512template <>
1513EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
1514 Packet16uc tmp = pload<Packet16uc>(from);
1515 return __lsx_vilvl_b(tmp, tmp);
1516}
1517template <>
1518EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
1519 Packet8us tmp = pload<Packet8us>(from);
1520 return __lsx_vilvl_h(tmp, tmp);
1521}
1522template <>
1523EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
1524 Packet4ui tmp = pload<Packet4ui>(from);
1525 return __lsx_vilvl_w(tmp, tmp);
1526}
1527template <>
1528EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
1529 return pset1<Packet2ul>(from[0]);
1530}
1531
1532template <>
1533EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
1534 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1535}
1536template <>
1537EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1538 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
1539}
1540template <>
1541EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
1542 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1543}
1544template <>
1545EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
1546 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1547}
1548template <>
1549EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
1550 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1551}
1552template <>
1553EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
1554 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1555}
1556template <>
1557EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
1558 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1559}
1560template <>
1561EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
1562 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1563}
1564template <>
1565EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
1566 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1567}
1568template <>
1569EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
1570 EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1571}
1572
1573template <>
1574EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1575 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1576}
1577template <>
1578EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1579 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
1580}
1581
1582template <>
1583EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
1584 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1585}
1586template <>
1587EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
1588 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1589}
1590template <>
1591EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
1592 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1593}
1594template <>
1595EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
1596 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1597}
1598template <>
1599EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
1600 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1601}
1602template <>
1603EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
1604 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1605}
1606template <>
1607EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
1608 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1609}
1610template <>
1611EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
1612 EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
1613}
1614
1615template <>
1616EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1617 Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]};
1618 return v;
1619}
1620template <>
1621EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1622 Packet2d v = {from[0], from[stride]};
1623 return v;
1624}
1625template <>
1626EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
1627 int8_t v[16] __attribute__((aligned(16)));
1628 v[0] = from[0];
1629 v[1] = from[stride];
1630 v[2] = from[2 * stride];
1631 v[3] = from[3 * stride];
1632 v[4] = from[4 * stride];
1633 v[5] = from[5 * stride];
1634 v[6] = from[6 * stride];
1635 v[7] = from[7 * stride];
1636 v[8] = from[8 * stride];
1637 v[9] = from[9 * stride];
1638 v[10] = from[10 * stride];
1639 v[11] = from[11 * stride];
1640 v[12] = from[12 * stride];
1641 v[13] = from[13 * stride];
1642 v[14] = from[14 * stride];
1643 v[15] = from[15 * stride];
1644 return __lsx_vld(v, 0);
1645}
1646template <>
1647EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
1648 int16_t v[8] __attribute__((aligned(16)));
1649 v[0] = from[0];
1650 v[1] = from[stride];
1651 v[2] = from[2 * stride];
1652 v[3] = from[3 * stride];
1653 v[4] = from[4 * stride];
1654 v[5] = from[5 * stride];
1655 v[6] = from[6 * stride];
1656 v[7] = from[7 * stride];
1657 return __lsx_vld(v, 0);
1658}
1659template <>
1660EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
1661 int32_t v[4] __attribute__((aligned(16)));
1662 v[0] = from[0];
1663 v[1] = from[stride];
1664 v[2] = from[2 * stride];
1665 v[3] = from[3 * stride];
1666 return __lsx_vld(v, 0);
1667}
1668template <>
1669EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
1670 int64_t v[2] __attribute__((aligned(16)));
1671 v[0] = from[0];
1672 v[1] = from[stride];
1673 return __lsx_vld(v, 0);
1674}
1675template <>
1676EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
1677 uint8_t v[16] __attribute__((aligned(16)));
1678 v[0] = from[0];
1679 v[1] = from[stride];
1680 v[2] = from[2 * stride];
1681 v[3] = from[3 * stride];
1682 v[4] = from[4 * stride];
1683 v[5] = from[5 * stride];
1684 v[6] = from[6 * stride];
1685 v[7] = from[7 * stride];
1686 v[8] = from[8 * stride];
1687 v[9] = from[9 * stride];
1688 v[10] = from[10 * stride];
1689 v[11] = from[11 * stride];
1690 v[12] = from[12 * stride];
1691 v[13] = from[13 * stride];
1692 v[14] = from[14 * stride];
1693 v[15] = from[15 * stride];
1694 return __lsx_vld(v, 0);
1695}
1696template <>
1697EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
1698 uint16_t v[8] __attribute__((aligned(16)));
1699 v[0] = from[0];
1700 v[1] = from[stride];
1701 v[2] = from[2 * stride];
1702 v[3] = from[3 * stride];
1703 v[4] = from[4 * stride];
1704 v[5] = from[5 * stride];
1705 v[6] = from[6 * stride];
1706 v[7] = from[7 * stride];
1707 return __lsx_vld(v, 0);
1708}
1709template <>
1710EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
1711 uint32_t v[4] __attribute__((aligned(16)));
1712 v[0] = from[0];
1713 v[1] = from[stride];
1714 v[2] = from[2 * stride];
1715 v[3] = from[3 * stride];
1716 return __lsx_vld(v, 0);
1717}
1718template <>
1719EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
1720 uint64_t v[2] __attribute__((aligned(16)));
1721 v[0] = from[0];
1722 v[1] = from[stride];
1723 return __lsx_vld(v, 0);
1724}
1725
1726template <>
1727EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1728 __lsx_vstelm_w(from, to, 0, 0);
1729 __lsx_vstelm_w(from, to + stride * 1, 0, 1);
1730 __lsx_vstelm_w(from, to + stride * 2, 0, 2);
1731 __lsx_vstelm_w(from, to + stride * 3, 0, 3);
1732}
1733template <>
1734EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
1735 __lsx_vstelm_d(from, to, 0, 0);
1736 __lsx_vstelm_d(from, to + stride, 0, 1);
1737}
1738template <>
1739EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
1740 Index stride) {
1741 __lsx_vstelm_b((__m128i)from, to, 0, 0);
1742 __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1743 __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1744 __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1745 __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1746 __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1747 __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1748 __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1749 __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1750 __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1751 __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1752 __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1753 __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1754 __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1755 __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1756 __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1757}
1758template <>
1759EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
1760 Index stride) {
1761 __lsx_vstelm_h((__m128i)from, to, 0, 0);
1762 __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1763 __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1764 __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1765 __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1766 __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1767 __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1768 __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1769}
1770template <>
1771EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
1772 Index stride) {
1773 __lsx_vstelm_w((__m128i)from, to, 0, 0);
1774 __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1775 __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1776 __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1777}
1778template <>
1779EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
1780 Index stride) {
1781 __lsx_vstelm_d((__m128i)from, to, 0, 0);
1782 __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1783}
1784template <>
1785EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
1786 Index stride) {
1787 __lsx_vstelm_b((__m128i)from, to, 0, 0);
1788 __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
1789 __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
1790 __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
1791 __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
1792 __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
1793 __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
1794 __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
1795 __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
1796 __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
1797 __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
1798 __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
1799 __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
1800 __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
1801 __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
1802 __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
1803}
1804template <>
1805EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
1806 Index stride) {
1807 __lsx_vstelm_h((__m128i)from, to, 0, 0);
1808 __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
1809 __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
1810 __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
1811 __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
1812 __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
1813 __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
1814 __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
1815}
1816template <>
1817EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
1818 Index stride) {
1819 __lsx_vstelm_w((__m128i)from, to, 0, 0);
1820 __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
1821 __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
1822 __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
1823}
1824template <>
1825EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
1826 Index stride) {
1827 __lsx_vstelm_d((__m128i)from, to, 0, 0);
1828 __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
1829}
1830
1831template <>
1832EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1833 __builtin_prefetch(addr);
1834}
1835template <>
1836EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1837 __builtin_prefetch(addr);
1838}
1839template <>
1840EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
1841 __builtin_prefetch(addr);
1842}
1843template <>
1844EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
1845 __builtin_prefetch(addr);
1846}
1847template <>
1848EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
1849 __builtin_prefetch(addr);
1850}
1851template <>
1852EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
1853 __builtin_prefetch(addr);
1854}
1855template <>
1856EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
1857 __builtin_prefetch(addr);
1858}
1859template <>
1860EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
1861 __builtin_prefetch(addr);
1862}
1863template <>
1864EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
1865 __builtin_prefetch(addr);
1866}
1867template <>
1868EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
1869 __builtin_prefetch(addr);
1870}
1871
1872template <>
1873EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1874 float v;
1875 __lsx_vstelm_w(a, &v, 0, 0);
1876 return v;
1877}
1878template <>
1879EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1880 double v;
1881 __lsx_vstelm_d(a, &v, 0, 0);
1882 return v;
1883}
1884
1885template <>
1886EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
1887 return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0);
1888}
1889template <>
1890EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
1891 return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0);
1892}
1893template <>
1894EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
1895 return __lsx_vpickve2gr_w((__m128i)a, 0);
1896}
1897template <>
1898EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1899 return __lsx_vpickve2gr_d((__m128i)a, 0);
1900}
1901template <>
1902EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
1903 return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0);
1904}
1905template <>
1906EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
1907 return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0);
1908}
1909template <>
1910EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1911 return __lsx_vpickve2gr_wu((__m128i)a, 0);
1912}
1913template <>
1914EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
1915 return __lsx_vpickve2gr_du((__m128i)a, 0);
1916}
1917
1918template <>
1919EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1920 return (Packet4f)__lsx_vshuf4i_w(a, 0x1B);
1921}
1922template <>
1923EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1924 return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1);
1925}
1926template <>
1927EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
1928 return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1929}
1930template <>
1931EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
1932 return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1933}
1934template <>
1935EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1936 return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1937}
1938template <>
1939EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
1940 return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1941}
1942template <>
1943EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
1944 return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
1945}
1946template <>
1947EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
1948 return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
1949}
1950template <>
1951EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
1952 return __lsx_vshuf4i_w((__m128i)a, 0x1B);
1953}
1954template <>
1955EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
1956 return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
1957}
1958
1959template <>
1960EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
1961 Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
1962 return pfirst<Packet4f>(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
1963}
1964template <>
1965EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
1966 return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a)));
1967}
1968template <>
1969EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
1970 Packet8s tmp1 = __lsx_vhaddw_h_b(a, a);
1971 Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1);
1972 Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2);
1973 return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0);
1974}
1975template <>
1976EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
1977 Packet4i tmp1 = __lsx_vhaddw_w_h(a, a);
1978 Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1);
1979 return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0);
1980}
1981template <>
1982EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
1983 Packet2l tmp = __lsx_vhaddw_d_w(a, a);
1984 return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0);
1985}
1986template <>
1987EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
1988 return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0);
1989}
1990template <>
1991EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
1992 Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a);
1993 Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1);
1994 Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2);
1995 return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0);
1996}
1997template <>
1998EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
1999 Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a);
2000 Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1);
2001 return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0);
2002}
2003template <>
2004EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
2005 Packet2ul tmp = __lsx_vhaddw_du_wu(a, a);
2006 return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
2007}
2008template <>
2009EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
2010 return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0);
2011}
2012
2013template <>
2014EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
2015 Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
2016 return pfirst<Packet4f>(__lsx_vfmul_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
2017}
2018template <>
2019EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
2020 return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a)));
2021}
2022template <>
2023EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
2024 Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a));
2025 Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2026 Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2027 return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2028}
2029template <>
2030EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
2031 Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a));
2032 Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2033 return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2034}
2035template <>
2036EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
2037 Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a));
2038 return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2039}
2040template <>
2041EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
2042 return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0);
2043}
2044template <>
2045EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
2046 Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a));
2047 Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
2048 Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
2049 return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
2050}
2051template <>
2052EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
2053 Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a));
2054 Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
2055 return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
2056}
2057template <>
2058EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
2059 Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a));
2060 return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
2061}
2062template <>
2063EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
2064 return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0);
2065}
2066
2067template <>
2068EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
2069 Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2070 return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2071}
2072template <>
2073EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
2074 return pfirst(__lsx_vfmin_d(a, preverse(a)));
2075}
2076template <>
2077EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
2078 Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2079 Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2080 Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2081 return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2082}
2083template <>
2084EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
2085 Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2086 Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2087 return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2088}
2089template <>
2090EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
2091 Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2092 return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2093}
2094template <>
2095EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
2096 return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a)));
2097}
2098template <>
2099EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
2100 Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2101 Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2102 Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2103 return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2104}
2105template <>
2106EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
2107 Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2108 Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2109 return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2110}
2111template <>
2112EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
2113 Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2114 return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2115}
2116template <>
2117EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
2118 return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a)));
2119}
2120
2121template <>
2122EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
2123 Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
2124 return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
2125}
2126template <>
2127EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
2128 return pfirst(__lsx_vfmax_d(a, preverse(a)));
2129}
2130template <>
2131EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
2132 Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2133 Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2134 Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2135 return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2136}
2137template <>
2138EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
2139 Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2140 Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2141 return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2142}
2143template <>
2144EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
2145 Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2146 return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2147}
2148template <>
2149EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
2150 return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a)));
2151}
2152template <>
2153EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
2154 Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2155 Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2156 Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
2157 return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
2158}
2159template <>
2160EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
2161 Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2162 Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
2163 return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
2164}
2165template <>
2166EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
2167 Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
2168 return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
2169}
2170template <>
2171EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
2172 return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a)));
2173}
2174
2175template <>
2176EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
2177 return __lsx_vfsqrt_s(a);
2178}
2179template <>
2180EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
2181 return __lsx_vfsqrt_d(a);
2182}
2183
2184EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
2185 Packet4f T0 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2186 Packet4f T1 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2187 Packet4f T2 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2188 Packet4f T3 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
2189
2190 kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)T2, (__m128i)T0);
2191 kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)T2, (__m128i)T0);
2192 kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)T3, (__m128i)T1);
2193 kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)T3, (__m128i)T1);
2194}
2195EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
2196 Packet2d tmp = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2197 kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
2198 kernel.packet[1] = tmp;
2199}
2200EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2201 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2202 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2203 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2204 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2205 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2206 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2207 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2208 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2209 __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2210 __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2211 __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2212 __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2213 __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2214 __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2215 __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2216 __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
2217
2218 __m128i s0 = __lsx_vilvl_h(t2, t0);
2219 __m128i s1 = __lsx_vilvh_h(t2, t0);
2220 __m128i s2 = __lsx_vilvl_h(t3, t1);
2221 __m128i s3 = __lsx_vilvh_h(t3, t1);
2222 __m128i s4 = __lsx_vilvl_h(t6, t4);
2223 __m128i s5 = __lsx_vilvh_h(t6, t4);
2224 __m128i s6 = __lsx_vilvl_h(t7, t5);
2225 __m128i s7 = __lsx_vilvh_h(t7, t5);
2226 __m128i s8 = __lsx_vilvl_h(ta, t8);
2227 __m128i s9 = __lsx_vilvh_h(ta, t8);
2228 __m128i sa = __lsx_vilvl_h(tb, t9);
2229 __m128i sb = __lsx_vilvh_h(tb, t9);
2230 __m128i sc = __lsx_vilvl_h(te, tc);
2231 __m128i sd = __lsx_vilvh_h(te, tc);
2232 __m128i se = __lsx_vilvl_h(tf, td);
2233 __m128i sf = __lsx_vilvh_h(tf, td);
2234
2235 __m128i u0 = __lsx_vilvl_w(s4, s0);
2236 __m128i u1 = __lsx_vilvh_w(s4, s0);
2237 __m128i u2 = __lsx_vilvl_w(s5, s1);
2238 __m128i u3 = __lsx_vilvh_w(s5, s1);
2239 __m128i u4 = __lsx_vilvl_w(s6, s2);
2240 __m128i u5 = __lsx_vilvh_w(s6, s2);
2241 __m128i u6 = __lsx_vilvl_w(s7, s3);
2242 __m128i u7 = __lsx_vilvh_w(s7, s3);
2243 __m128i u8 = __lsx_vilvl_w(sc, s8);
2244 __m128i u9 = __lsx_vilvh_w(sc, s8);
2245 __m128i ua = __lsx_vilvl_w(sd, s9);
2246 __m128i ub = __lsx_vilvh_w(sd, s9);
2247 __m128i uc = __lsx_vilvl_w(se, sa);
2248 __m128i ud = __lsx_vilvh_w(se, sa);
2249 __m128i ue = __lsx_vilvl_w(sf, sb);
2250 __m128i uf = __lsx_vilvh_w(sf, sb);
2251
2252 kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2253 kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2254 kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2255 kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2256 kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2257 kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2258 kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2259 kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2260 kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2261 kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2262 kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2263 kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2264 kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2265 kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2266 kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2267 kernel.packet[15] = __lsx_vilvh_d(uf, u7);
2268}
2269EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
2270 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2271 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2272 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2273 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2274 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2275 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2276 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2277 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2278
2279 __m128i s0 = __lsx_vilvl_h(t2, t0);
2280 __m128i s1 = __lsx_vilvh_h(t2, t0);
2281 __m128i s2 = __lsx_vilvl_h(t3, t1);
2282 __m128i s3 = __lsx_vilvh_h(t3, t1);
2283 __m128i s4 = __lsx_vilvl_h(t6, t4);
2284 __m128i s5 = __lsx_vilvh_h(t6, t4);
2285 __m128i s6 = __lsx_vilvl_h(t7, t5);
2286 __m128i s7 = __lsx_vilvh_h(t7, t5);
2287
2288 kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2289 kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2290 kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2291 kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2292 kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2293 kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2294 kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2295 kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2296}
2297EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2298 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2299 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2300 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2301 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2302
2303 kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2304 kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2305 kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2306 kernel.packet[3] = __lsx_vilvh_h(t3, t1);
2307}
2308EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2309 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2310 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2311 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2312 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2313 __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2314 __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2315 __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2316 __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
2317
2318 __m128i s0 = __lsx_vilvl_w(t2, t0);
2319 __m128i s1 = __lsx_vilvh_w(t2, t0);
2320 __m128i s2 = __lsx_vilvl_w(t3, t1);
2321 __m128i s3 = __lsx_vilvh_w(t3, t1);
2322 __m128i s4 = __lsx_vilvl_w(t6, t4);
2323 __m128i s5 = __lsx_vilvh_w(t6, t4);
2324 __m128i s6 = __lsx_vilvl_w(t7, t5);
2325 __m128i s7 = __lsx_vilvh_w(t7, t5);
2326
2327 kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2328 kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2329 kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2330 kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2331 kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2332 kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2333 kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2334 kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2335}
2336EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2337 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2338 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2339 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2340 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2341
2342 kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2343 kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2344 kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2345 kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2346}
2347EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
2348 __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2349 __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2350 __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2351 __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2352
2353 kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2354 kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2355 kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2356 kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2357}
2358EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
2359 __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2360 kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2361 kernel.packet[1] = tmp;
2362}
2363EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2364 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2365 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2366 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2367 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2368 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2369 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2370 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2371 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2372 __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
2373 __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
2374 __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
2375 __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
2376 __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
2377 __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
2378 __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
2379 __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
2380
2381 __m128i s0 = __lsx_vilvl_h(t2, t0);
2382 __m128i s1 = __lsx_vilvh_h(t2, t0);
2383 __m128i s2 = __lsx_vilvl_h(t3, t1);
2384 __m128i s3 = __lsx_vilvh_h(t3, t1);
2385 __m128i s4 = __lsx_vilvl_h(t6, t4);
2386 __m128i s5 = __lsx_vilvh_h(t6, t4);
2387 __m128i s6 = __lsx_vilvl_h(t7, t5);
2388 __m128i s7 = __lsx_vilvh_h(t7, t5);
2389 __m128i s8 = __lsx_vilvl_h(ta, t8);
2390 __m128i s9 = __lsx_vilvh_h(ta, t8);
2391 __m128i sa = __lsx_vilvl_h(tb, t9);
2392 __m128i sb = __lsx_vilvh_h(tb, t9);
2393 __m128i sc = __lsx_vilvl_h(te, tc);
2394 __m128i sd = __lsx_vilvh_h(te, tc);
2395 __m128i se = __lsx_vilvl_h(tf, td);
2396 __m128i sf = __lsx_vilvh_h(tf, td);
2397
2398 __m128i u0 = __lsx_vilvl_w(s4, s0);
2399 __m128i u1 = __lsx_vilvh_w(s4, s0);
2400 __m128i u2 = __lsx_vilvl_w(s5, s1);
2401 __m128i u3 = __lsx_vilvh_w(s5, s1);
2402 __m128i u4 = __lsx_vilvl_w(s6, s2);
2403 __m128i u5 = __lsx_vilvh_w(s6, s2);
2404 __m128i u6 = __lsx_vilvl_w(s7, s3);
2405 __m128i u7 = __lsx_vilvh_w(s7, s3);
2406 __m128i u8 = __lsx_vilvl_w(sc, s8);
2407 __m128i u9 = __lsx_vilvh_w(sc, s8);
2408 __m128i ua = __lsx_vilvl_w(sd, s9);
2409 __m128i ub = __lsx_vilvh_w(sd, s9);
2410 __m128i uc = __lsx_vilvl_w(se, sa);
2411 __m128i ud = __lsx_vilvh_w(se, sa);
2412 __m128i ue = __lsx_vilvl_w(sf, sb);
2413 __m128i uf = __lsx_vilvh_w(sf, sb);
2414
2415 kernel.packet[0] = __lsx_vilvl_d(u8, u0);
2416 kernel.packet[1] = __lsx_vilvh_d(u8, u0);
2417 kernel.packet[2] = __lsx_vilvl_d(u9, u1);
2418 kernel.packet[3] = __lsx_vilvh_d(u9, u1);
2419 kernel.packet[4] = __lsx_vilvl_d(ua, u2);
2420 kernel.packet[5] = __lsx_vilvh_d(ua, u2);
2421 kernel.packet[6] = __lsx_vilvl_d(ub, u3);
2422 kernel.packet[7] = __lsx_vilvh_d(ub, u3);
2423 kernel.packet[8] = __lsx_vilvl_d(uc, u4);
2424 kernel.packet[9] = __lsx_vilvh_d(uc, u4);
2425 kernel.packet[10] = __lsx_vilvl_d(ud, u5);
2426 kernel.packet[11] = __lsx_vilvh_d(ud, u5);
2427 kernel.packet[12] = __lsx_vilvl_d(ue, u6);
2428 kernel.packet[13] = __lsx_vilvh_d(ue, u6);
2429 kernel.packet[14] = __lsx_vilvl_d(uf, u7);
2430 kernel.packet[15] = __lsx_vilvh_d(uf, u7);
2431}
2432EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
2433 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2434 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2435 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2436 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2437 __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
2438 __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
2439 __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
2440 __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
2441
2442 __m128i s0 = __lsx_vilvl_h(t2, t0);
2443 __m128i s1 = __lsx_vilvh_h(t2, t0);
2444 __m128i s2 = __lsx_vilvl_h(t3, t1);
2445 __m128i s3 = __lsx_vilvh_h(t3, t1);
2446 __m128i s4 = __lsx_vilvl_h(t6, t4);
2447 __m128i s5 = __lsx_vilvh_h(t6, t4);
2448 __m128i s6 = __lsx_vilvl_h(t7, t5);
2449 __m128i s7 = __lsx_vilvh_h(t7, t5);
2450
2451 kernel.packet[0] = __lsx_vilvl_w(s4, s0);
2452 kernel.packet[1] = __lsx_vilvh_w(s4, s0);
2453 kernel.packet[2] = __lsx_vilvl_w(s5, s1);
2454 kernel.packet[3] = __lsx_vilvh_w(s5, s1);
2455 kernel.packet[4] = __lsx_vilvl_w(s6, s2);
2456 kernel.packet[5] = __lsx_vilvh_w(s6, s2);
2457 kernel.packet[6] = __lsx_vilvl_w(s7, s3);
2458 kernel.packet[7] = __lsx_vilvh_w(s7, s3);
2459}
2460EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2461 __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
2462 __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
2463 __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
2464 __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
2465
2466 kernel.packet[0] = __lsx_vilvl_h(t2, t0);
2467 kernel.packet[1] = __lsx_vilvh_h(t2, t0);
2468 kernel.packet[2] = __lsx_vilvl_h(t3, t1);
2469 kernel.packet[3] = __lsx_vilvh_h(t3, t1);
2470}
2471EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2472 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2473 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2474 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2475 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2476 __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
2477 __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
2478 __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
2479 __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
2480
2481 __m128i s0 = __lsx_vilvl_w(t2, t0);
2482 __m128i s1 = __lsx_vilvh_w(t2, t0);
2483 __m128i s2 = __lsx_vilvl_w(t3, t1);
2484 __m128i s3 = __lsx_vilvh_w(t3, t1);
2485 __m128i s4 = __lsx_vilvl_w(t6, t4);
2486 __m128i s5 = __lsx_vilvh_w(t6, t4);
2487 __m128i s6 = __lsx_vilvl_w(t7, t5);
2488 __m128i s7 = __lsx_vilvh_w(t7, t5);
2489
2490 kernel.packet[0] = __lsx_vilvl_d(s4, s0);
2491 kernel.packet[1] = __lsx_vilvh_d(s4, s0);
2492 kernel.packet[2] = __lsx_vilvl_d(s5, s1);
2493 kernel.packet[3] = __lsx_vilvh_d(s5, s1);
2494 kernel.packet[4] = __lsx_vilvl_d(s6, s2);
2495 kernel.packet[5] = __lsx_vilvh_d(s6, s2);
2496 kernel.packet[6] = __lsx_vilvl_d(s7, s3);
2497 kernel.packet[7] = __lsx_vilvh_d(s7, s3);
2498}
2499EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2500 __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
2501 __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
2502 __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
2503 __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
2504
2505 kernel.packet[0] = __lsx_vilvl_w(t2, t0);
2506 kernel.packet[1] = __lsx_vilvh_w(t2, t0);
2507 kernel.packet[2] = __lsx_vilvl_w(t3, t1);
2508 kernel.packet[3] = __lsx_vilvh_w(t3, t1);
2509}
2510EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
2511 __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
2512 __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
2513 __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
2514 __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
2515
2516 kernel.packet[0] = __lsx_vilvl_d(T2, T0);
2517 kernel.packet[1] = __lsx_vilvh_d(T2, T0);
2518 kernel.packet[2] = __lsx_vilvl_d(T3, T1);
2519 kernel.packet[3] = __lsx_vilvh_d(T3, T1);
2520}
2521EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
2522 __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
2523 kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
2524 kernel.packet[1] = tmp;
2525}
2526
2527template <>
2528EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
2529 return __lsx_vfrsqrt_s(a);
2530}
2531template <>
2532EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
2533 return __lsx_vfrsqrt_d(a);
2534}
2535
2536template <>
2537EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) {
2538 return __lsx_vfrintrm_s(a);
2539}
2540template <>
2541EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) {
2542 return __lsx_vfrintrm_d(a);
2543}
2544
2545template <>
2546EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) {
2547 return __lsx_vfrintrp_s(a);
2548}
2549template <>
2550EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) {
2551 return __lsx_vfrintrp_d(a);
2552}
2553
2554template <>
2555EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) {
2556 const Packet4f mask = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x80000000u));
2557 const Packet4f prev0dot5 = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
2558 return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a));
2559}
2560template <>
2561EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) {
2562 const Packet2d mask = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
2563 const Packet2d prev0dot5 = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
2564 return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a));
2565}
2566
2567template <>
2568EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
2569 return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2570}
2571template <>
2572EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
2573 return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2574}
2575
2576template <>
2577EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
2578 int8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2579 *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2580 *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2581 return __lsx_vld(tmp, 0);
2582}
2583template <>
2584EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
2585 uint8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1),
2586 *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
2587 *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
2588 return __lsx_vld(tmp, 0);
2589}
2590template <>
2591EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
2592 int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2593 return __lsx_vld(tmp, 0);
2594}
2595template <>
2596EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
2597 uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
2598 return __lsx_vld(tmp, 0);
2599}
2600template <>
2601EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
2602 int32_t tmp[4] = {*from, *from, *from, *from};
2603 return __lsx_vld(tmp, 0);
2604}
2605template <>
2606EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
2607 uint32_t tmp[4] = {*from, *from, *from, *from};
2608 return __lsx_vld(tmp, 0);
2609}
2610
2611template <>
2612EIGEN_STRONG_INLINE Packet16c pnmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2613 return __lsx_vmsub_b(pnegate(c), a, b);
2614}
2615template <>
2616EIGEN_STRONG_INLINE Packet8s pnmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2617 return __lsx_vmsub_h(pnegate(c), a, b);
2618}
2619template <>
2620EIGEN_STRONG_INLINE Packet4i pnmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2621 return __lsx_vmsub_w(pnegate(c), a, b);
2622}
2623template <>
2624EIGEN_STRONG_INLINE Packet2l pnmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2625 return __lsx_vmsub_d(pnegate(c), a, b);
2626}
2627
2628template <>
2629EIGEN_STRONG_INLINE Packet16c pmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2630 return __lsx_vmadd_b(pnegate(c), a, b);
2631}
2632template <>
2633EIGEN_STRONG_INLINE Packet8s pmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2634 return __lsx_vmadd_h(pnegate(c), a, b);
2635}
2636template <>
2637EIGEN_STRONG_INLINE Packet4i pmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2638 return __lsx_vmadd_w(pnegate(c), a, b);
2639}
2640template <>
2641EIGEN_STRONG_INLINE Packet2l pmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2642 return __lsx_vmadd_d(pnegate(c), a, b);
2643}
2644
2645template <>
2646EIGEN_STRONG_INLINE Packet16c pnmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
2647 return __lsx_vmsub_b(c, a, b);
2648}
2649template <>
2650EIGEN_STRONG_INLINE Packet8s pnmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
2651 return __lsx_vmsub_h(c, a, b);
2652}
2653template <>
2654EIGEN_STRONG_INLINE Packet4i pnmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
2655 return __lsx_vmsub_w(c, a, b);
2656}
2657template <>
2658EIGEN_STRONG_INLINE Packet2l pnmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
2659 return __lsx_vmsub_d(c, a, b);
2660}
2661
2662template <>
2663EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f& _x) {
2664 return pexp_float(_x);
2665}
2666template <>
2667EIGEN_STRONG_INLINE Packet2d pexp(const Packet2d& _x) {
2668 return pexp_double(_x);
2669}
2670
2671template <>
2672EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
2673 return pldexp_generic(a, exponent);
2674}
2675
2676template <>
2677EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
2678 return pfrexp_generic(a, exponent);
2679}
2680template <>
2681EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
2682 return pfrexp_generic(a, exponent);
2683}
2684template <>
2685EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) {
2686 Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f};
2687 return v;
2688}
2689template <>
2690EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
2691 Packet4f v = psub(a, b);
2692 return pabs(v);
2693}
2694template <>
2695EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
2696 return pmin<Packet4f>(a, b);
2697}
2698template <>
2699EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
2700 return pmax<Packet4f>(a, b);
2701}
2702template <>
2703EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
2704 return (__m128)__lsx_vldrepl_w(from, 0);
2705}
2706template <>
2707EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
2708 return (__m128)__lsx_vsrai_w((__m128i)a, 31);
2709}
2710template <>
2711EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
2712 return __lsx_vfrintrne_s(a);
2713}
2714template <>
2715EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
2716 return __lsx_vfrintrz_s(a);
2717}
2718template <>
2719EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
2720 return __lsx_vfrecip_s(a);
2721}
2722
2723template <>
2724EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /* a */) {
2725 Packet2d v = {0.0, 0.0};
2726 return v;
2727}
2728template <>
2729EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
2730 return pmin<Packet2d>(a, b);
2731}
2732template <>
2733EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
2734 return pmax<Packet2d>(a, b);
2735}
2736template <>
2737EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
2738 return (__m128d)(__lsx_vsrai_d((__m128i)a, 63));
2739}
2740template <>
2741EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
2742 return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
2743}
2744template <>
2745EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
2746 return __lsx_vfrintrne_d(a);
2747}
2748template <>
2749EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
2750 return __lsx_vfrintrz_d(a);
2751}
2752template <>
2753EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
2754 return pldexp_generic(a, exponent);
2755}
2756
2757template <>
2758EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
2759 Packet16c v = psub(a, b);
2760 return pabs(v);
2761}
2762
2763template <>
2764EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
2765 Packet8s v = psub(a, b);
2766 return pabs(v);
2767}
2768template <>
2769EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
2770 return __lsx_vbitsel_v(b, a, mask);
2771}
2772
2773template <>
2774EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
2775 Packet4i v = psub(a, b);
2776 return pabs(v);
2777}
2778template <>
2779EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
2780 return __lsx_vbitsel_v(b, a, mask);
2781}
2782
2783template <>
2784EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
2785 return __lsx_vbitsel_v(b, a, mask);
2786}
2787
2788template <>
2789EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2790 return __lsx_vdiv_bu(a, b);
2791}
2792template <>
2793EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
2794 Packet16uc v = psub(a, b);
2795 return pabs(v);
2796}
2797template <>
2798EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
2799 const Packet16uc& b) {
2800 return __lsx_vbitsel_v(b, a, mask);
2801}
2802template <>
2803EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
2804 __m128i res = {0, 0};
2805 __m128i add = {0x0808080808080808, 0x0808080808080808};
2806 for (int i = 0; i < 4; i++) {
2807 const __m128i temp = __lsx_vor_v(res, add);
2808 const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp));
2809 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a));
2810 add = __lsx_vsrli_b(add, 1);
2811 }
2812 return res;
2813}
2814
2815template <>
2816EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
2817 Packet8us v = psub(a, b);
2818 return pabs(v);
2819}
2820template <>
2821EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
2822 return __lsx_vbitsel_v(b, a, mask);
2823}
2824template <>
2825EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
2826 __m128i res = {0, 0};
2827 __m128i add = {0x0080008000800080, 0x0080008000800080};
2828 for (int i = 0; i < 4; i++) {
2829 const __m128i temp = __lsx_vor_v(res, add);
2830 const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp));
2831 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a));
2832 add = __lsx_vsrli_h(add, 1);
2833 }
2834 return res;
2835}
2836
2837template <>
2838EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
2839 Packet4ui v = psub(a, b);
2840 return pabs(v);
2841}
2842template <>
2843EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
2844 return __lsx_vbitsel_v(b, a, mask);
2845}
2846template <>
2847EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
2848 __m128i res = {0, 0};
2849 __m128i add = {0x0000800000008000, 0x0000800000008000};
2850 for (int i = 0; i < 4; i++) {
2851 const __m128i temp = __lsx_vor_v(res, add);
2852 const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp));
2853 res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a));
2854 add = __lsx_vsrli_w(add, 1);
2855 }
2856 return res;
2857}
2858
2859template <>
2860EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
2861 return __lsx_vbitsel_v(b, a, mask);
2862}
2863
2864} // namespace internal
2865} // namespace Eigen
2866#endif
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82