Eigen  5.0.1-dev+7c7d8473
 
Loading...
Searching...
No Matches
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2018 Wave Computing, Inc.
5// Written by:
6// Chris Larsen
7// Alexey Frunze (afrunze@wavecomp.com)
8//
9// This Source Code Form is subject to the terms of the Mozilla
10// Public License v. 2.0. If a copy of the MPL was not distributed
11// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
12
13#ifndef EIGEN_PACKET_MATH_MSA_H
14#define EIGEN_PACKET_MATH_MSA_H
15
16#include <iostream>
17#include <string>
18
19// IWYU pragma: private
20#include "../../InternalHeaderCheck.h"
21
22namespace Eigen {
23
24namespace internal {
25
// Architecture defaults for MSA; each one is only set when the user has not
// already provided a definition.
#if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

#if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
// Tracing hook placed at the top of the MSA kernels. Flip the `#if 0` to 1 to
// make every kernel print its file/line/function the first time it runs;
// otherwise the macro expands to nothing.
#if 0
#define EIGEN_MSA_DEBUG                                                             \
  static bool firstTime = true;                                                     \
  do {                                                                              \
    if (firstTime) {                                                                \
      std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
      firstTime = false;                                                            \
    }                                                                               \
  } while (0)
#else
#define EIGEN_MSA_DEBUG
#endif
50
// Packs four 2-bit lane selectors into the 8-bit immediate used with the shf.w
// builtin: `a` lands in bits 1:0, `b` in 3:2, `c` in 5:4 and `d` in 7:6.
#define EIGEN_MSA_SHF_I8(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
52
53typedef v4f32 Packet4f;
54typedef v4i32 Packet4i;
55typedef v4u32 Packet4ui;
56
57#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = {X, X, X, X}
58#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = {X, X, X, X}
59#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = {X, X, X, X}
60
61inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
62 os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
63 return os;
64}
65
66inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
67 os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
68 return os;
69}
70
71inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
72 os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
73 return os;
74}
75
76template <>
77struct packet_traits<float> : default_packet_traits {
78 typedef Packet4f type;
79 typedef Packet4f half; // Packet2f intrinsics not implemented yet
80 enum {
81 Vectorizable = 1,
82 AlignedOnScalar = 1,
83 size = 4,
84 // FIXME check the Has*
85 HasDiv = 1,
86 HasSin = EIGEN_FAST_MATH,
87 HasCos = EIGEN_FAST_MATH,
88 HasTanh = EIGEN_FAST_MATH,
89 HasErf = EIGEN_FAST_MATH,
90 HasLog = 1,
91 HasExp = 1,
92 HasSqrt = 1,
93 HasRsqrt = 1,
94 };
95};
96
97template <>
98struct packet_traits<int32_t> : default_packet_traits {
99 typedef Packet4i type;
100 typedef Packet4i half; // Packet2i intrinsics not implemented yet
101 enum {
102 Vectorizable = 1,
103 AlignedOnScalar = 1,
104 size = 4,
105 // FIXME check the Has*
106 HasDiv = 1,
107 };
108};
109
110template <>
111struct unpacket_traits<Packet4f> {
112 typedef float type;
113 enum {
114 size = 4,
115 alignment = Aligned16,
116 vectorizable = true,
117 masked_load_available = false,
118 masked_store_available = false
119 };
120 typedef Packet4f half;
121};
122
123template <>
124struct unpacket_traits<Packet4i> {
125 typedef int32_t type;
126 enum {
127 size = 4,
128 alignment = Aligned16,
129 vectorizable = true,
130 masked_load_available = false,
131 masked_store_available = false
132 };
133 typedef Packet4i half;
134};
135
136template <>
137EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
138 EIGEN_MSA_DEBUG;
139
140 Packet4f v = {from, from, from, from};
141 return v;
142}
143
144template <>
145EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
146 EIGEN_MSA_DEBUG;
147
148 return __builtin_msa_fill_w(from);
149}
150
151template <>
152EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
153 EIGEN_MSA_DEBUG;
154
155 float f = *from;
156 Packet4f v = {f, f, f, f};
157 return v;
158}
159
160template <>
161EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
162 EIGEN_MSA_DEBUG;
163
164 return __builtin_msa_fill_w(*from);
165}
166
167template <>
168EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
169 EIGEN_MSA_DEBUG;
170
171 return __builtin_msa_fadd_w(a, b);
172}
173
174template <>
175EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
176 EIGEN_MSA_DEBUG;
177
178 return __builtin_msa_addv_w(a, b);
179}
180
181template <>
182EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
183 EIGEN_MSA_DEBUG;
184
185 static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
186 return padd(pset1<Packet4f>(a), countdown);
187}
188
189template <>
190EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
191 EIGEN_MSA_DEBUG;
192
193 static const Packet4i countdown = {0, 1, 2, 3};
194 return padd(pset1<Packet4i>(a), countdown);
195}
196
197template <>
198EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
199 EIGEN_MSA_DEBUG;
200
201 return __builtin_msa_fsub_w(a, b);
202}
203
204template <>
205EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
206 EIGEN_MSA_DEBUG;
207
208 return __builtin_msa_subv_w(a, b);
209}
210
211template <>
212EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
213 EIGEN_MSA_DEBUG;
214
215 return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
216}
217
218template <>
219EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
220 EIGEN_MSA_DEBUG;
221
222 return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
223}
224
225template <>
226EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
227 EIGEN_MSA_DEBUG;
228
229 return a;
230}
231
232template <>
233EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
234 EIGEN_MSA_DEBUG;
235
236 return a;
237}
238
239template <>
240EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
241 EIGEN_MSA_DEBUG;
242
243 return __builtin_msa_fmul_w(a, b);
244}
245
246template <>
247EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
248 EIGEN_MSA_DEBUG;
249
250 return __builtin_msa_mulv_w(a, b);
251}
252
253template <>
254EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
255 EIGEN_MSA_DEBUG;
256
257 return __builtin_msa_fdiv_w(a, b);
258}
259
260template <>
261EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
262 EIGEN_MSA_DEBUG;
263
264 return __builtin_msa_div_s_w(a, b);
265}
266
267template <>
268EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
269 EIGEN_MSA_DEBUG;
270
271 return __builtin_msa_fmadd_w(c, a, b);
272}
273
274template <>
275EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
276 EIGEN_MSA_DEBUG;
277
278 // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
279 Packet4i value = c;
280 __asm__("maddv.w %w[value], %w[a], %w[b]\n"
281 // Outputs
282 : [value] "+f"(value)
283 // Inputs
284 : [a] "f"(a), [b] "f"(b));
285 return value;
286}
287
288template <>
289EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
290 EIGEN_MSA_DEBUG;
291
292 return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
293}
294
295template <>
296EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
297 EIGEN_MSA_DEBUG;
298
299 return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
300}
301
302template <>
303EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
304 EIGEN_MSA_DEBUG;
305
306 return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
307}
308
309template <>
310EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
311 EIGEN_MSA_DEBUG;
312
313 return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
314}
315
316template <>
317EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
318 EIGEN_MSA_DEBUG;
319
320 return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
321}
322
323template <>
324EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
325 EIGEN_MSA_DEBUG;
326
327 return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
328}
329
330template <>
331EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
332 EIGEN_MSA_DEBUG;
333
334 return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
335}
336
337template <>
338EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
339 EIGEN_MSA_DEBUG;
340
341 return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
342}
343
344template <>
345EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
346 EIGEN_MSA_DEBUG;
347
348#if EIGEN_FAST_MATH
349 // This prefers numbers to NaNs.
350 return __builtin_msa_fmin_w(a, b);
351#else
352 // This prefers NaNs to numbers.
353 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
354 Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
355 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
356#endif
357}
358
359template <>
360EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
361 EIGEN_MSA_DEBUG;
362
363 return __builtin_msa_min_s_w(a, b);
364}
365
366template <>
367EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
368 EIGEN_MSA_DEBUG;
369
370#if EIGEN_FAST_MATH
371 // This prefers numbers to NaNs.
372 return __builtin_msa_fmax_w(a, b);
373#else
374 // This prefers NaNs to numbers.
375 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
376 Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
377 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
378#endif
379}
380
381template <>
382EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
383 EIGEN_MSA_DEBUG;
384
385 return __builtin_msa_max_s_w(a, b);
386}
387
388template <>
389EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
390 EIGEN_MSA_DEBUG;
391
392 EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
393}
394
395template <>
396EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
397 EIGEN_MSA_DEBUG;
398
399 EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
400}
401
402template <>
403EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
404 EIGEN_MSA_DEBUG;
405
406 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
407}
408
409template <>
410EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
411 EIGEN_MSA_DEBUG;
412
413 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
414}
415
416template <>
417EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
418 EIGEN_MSA_DEBUG;
419
420 float f0 = from[0], f1 = from[1];
421 Packet4f v0 = {f0, f0, f0, f0};
422 Packet4f v1 = {f1, f1, f1, f1};
423 return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
424}
425
426template <>
427EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
428 EIGEN_MSA_DEBUG;
429
430 int32_t i0 = from[0], i1 = from[1];
431 Packet4i v0 = {i0, i0, i0, i0};
432 Packet4i v1 = {i1, i1, i1, i1};
433 return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
434}
435
436template <>
437EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
438 EIGEN_MSA_DEBUG;
439
440 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
441}
442
443template <>
444EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
445 EIGEN_MSA_DEBUG;
446
447 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
448}
449
450template <>
451EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
452 EIGEN_MSA_DEBUG;
453
454 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
455}
456
457template <>
458EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
459 EIGEN_MSA_DEBUG;
460
461 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
462}
463
464template <>
465EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
466 EIGEN_MSA_DEBUG;
467
468 float f = *from;
469 Packet4f v = {f, f, f, f};
470 v[1] = from[stride];
471 v[2] = from[2 * stride];
472 v[3] = from[3 * stride];
473 return v;
474}
475
476template <>
477EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
478 EIGEN_MSA_DEBUG;
479
480 int32_t i = *from;
481 Packet4i v = {i, i, i, i};
482 v[1] = from[stride];
483 v[2] = from[2 * stride];
484 v[3] = from[3 * stride];
485 return v;
486}
487
488template <>
489EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
490 EIGEN_MSA_DEBUG;
491
492 *to = from[0];
493 to += stride;
494 *to = from[1];
495 to += stride;
496 *to = from[2];
497 to += stride;
498 *to = from[3];
499}
500
501template <>
502EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride) {
503 EIGEN_MSA_DEBUG;
504
505 *to = from[0];
506 to += stride;
507 *to = from[1];
508 to += stride;
509 *to = from[2];
510 to += stride;
511 *to = from[3];
512}
513
514template <>
515EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
516 EIGEN_MSA_DEBUG;
517
518 __builtin_prefetch(addr);
519}
520
521template <>
522EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
523 EIGEN_MSA_DEBUG;
524
525 __builtin_prefetch(addr);
526}
527
528template <>
529EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
530 EIGEN_MSA_DEBUG;
531
532 return a[0];
533}
534
535template <>
536EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
537 EIGEN_MSA_DEBUG;
538
539 return a[0];
540}
541
542template <>
543EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
544 EIGEN_MSA_DEBUG;
545
546 return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
547}
548
549template <>
550EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
551 EIGEN_MSA_DEBUG;
552
553 return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
554}
555
556template <>
557EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
558 EIGEN_MSA_DEBUG;
559
560 return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
561}
562
563template <>
564EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
565 EIGEN_MSA_DEBUG;
566
567 Packet4i zero = __builtin_msa_ldi_w(0);
568 return __builtin_msa_add_a_w(zero, a);
569}
570
571template <>
572EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
573 EIGEN_MSA_DEBUG;
574
575 Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
576 s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
577 return s[0];
578}
579
580template <>
581EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
582 EIGEN_MSA_DEBUG;
583
584 Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
585 s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
586 return s[0];
587}
588
589// Other reduction functions:
590// mul
591template <>
592EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
593 EIGEN_MSA_DEBUG;
594
595 Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
596 p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
597 return p[0];
598}
599
600template <>
601EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
602 EIGEN_MSA_DEBUG;
603
604 Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
605 p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
606 return p[0];
607}
608
609// min
610template <>
611EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
612 EIGEN_MSA_DEBUG;
613
614 // Swap 64-bit halves of a.
615 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
616#if !EIGEN_FAST_MATH
617 // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
618 // masks of all zeroes/ones in low 64 bits.
619 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
620 // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
621 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
622#endif
623 // Continue with min computation.
624 Packet4f v = __builtin_msa_fmin_w(a, swapped);
625 v = __builtin_msa_fmin_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
626#if !EIGEN_FAST_MATH
627 // Based on the mask select between v and 4 qNaNs.
628 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
629 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
630#endif
631 return v[0];
632}
633
634template <>
635EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
636 EIGEN_MSA_DEBUG;
637
638 Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
639 m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
640 return m[0];
641}
642
643// max
644template <>
645EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
646 EIGEN_MSA_DEBUG;
647
648 // Swap 64-bit halves of a.
649 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
650#if !EIGEN_FAST_MATH
651 // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
652 // masks of all zeroes/ones in low 64 bits.
653 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
654 // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
655 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
656#endif
657 // Continue with max computation.
658 Packet4f v = __builtin_msa_fmax_w(a, swapped);
659 v = __builtin_msa_fmax_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
660#if !EIGEN_FAST_MATH
661 // Based on the mask select between v and 4 qNaNs.
662 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
663 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
664#endif
665 return v[0];
666}
667
668template <>
669EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
670 EIGEN_MSA_DEBUG;
671
672 Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
673 m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
674 return m[0];
675}
676
677inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
678 os << "[ " << value.packet[0] << "," << std::endl
679 << " " << value.packet[1] << "," << std::endl
680 << " " << value.packet[2] << "," << std::endl
681 << " " << value.packet[3] << " ]";
682 return os;
683}
684
685EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
686 EIGEN_MSA_DEBUG;
687
688 v4i32 tmp1, tmp2, tmp3, tmp4;
689
690 tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
691 tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
692 tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
693 tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
694
695 kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
696 kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
697 kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
698 kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
699}
700
701inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
702 os << "[ " << value.packet[0] << "," << std::endl
703 << " " << value.packet[1] << "," << std::endl
704 << " " << value.packet[2] << "," << std::endl
705 << " " << value.packet[3] << " ]";
706 return os;
707}
708
709EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
710 EIGEN_MSA_DEBUG;
711
712 v4i32 tmp1, tmp2, tmp3, tmp4;
713
714 tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
715 tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
716 tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
717 tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
718
719 kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
720 kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
721 kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
722 kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
723}
724
725template <>
726EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
727 EIGEN_MSA_DEBUG;
728
729 return __builtin_msa_fsqrt_w(a);
730}
731
732template <>
733EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
734 EIGEN_MSA_DEBUG;
735
736#if EIGEN_FAST_MATH
737 return __builtin_msa_frsqrt_w(a);
738#else
739 Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
740 return pdiv(ones, psqrt(a));
741#endif
742}
743
744template <>
745EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
746 Packet4f v = a;
747 int32_t old_mode, new_mode;
748 asm volatile(
749 "cfcmsa %[old_mode], $1\n"
750 "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY.
751 "ctcmsa $1, %[new_mode]\n"
752 "frint.w %w[v], %w[v]\n"
753 "ctcmsa $1, %[old_mode]\n"
754 : // outputs
755 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
756 [v] "+f"(v)
757 : // inputs
758 : // clobbers
759 );
760 return v;
761}
762
763template <>
764EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
765 Packet4f v = a;
766 int32_t old_mode, new_mode;
767 asm volatile(
768 "cfcmsa %[old_mode], $1\n"
769 "ori %[new_mode], %[old_mode], 3\n"
770 "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY.
771 "ctcmsa $1, %[new_mode]\n"
772 "frint.w %w[v], %w[v]\n"
773 "ctcmsa $1, %[old_mode]\n"
774 : // outputs
775 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
776 [v] "+f"(v)
777 : // inputs
778 : // clobbers
779 );
780 return v;
781}
782
783template <>
784EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
785 Packet4f v = a;
786 int32_t old_mode, new_mode;
787 asm volatile(
788 "cfcmsa %[old_mode], $1\n"
789 "ori %[new_mode], %[old_mode], 3\n"
790 "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even.
791 "ctcmsa $1, %[new_mode]\n"
792 "frint.w %w[v], %w[v]\n"
793 "ctcmsa $1, %[old_mode]\n"
794 : // outputs
795 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
796 [v] "+f"(v)
797 : // inputs
798 : // clobbers
799 );
800 return v;
801}
802
803//---------- double ----------
804
805typedef v2f64 Packet2d;
806typedef v2i64 Packet2l;
807typedef v2u64 Packet2ul;
808
809#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = {X, X}
810#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = {X, X}
811#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = {X, X}
812
813inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
814 os << "[ " << value[0] << ", " << value[1] << " ]";
815 return os;
816}
817
818inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
819 os << "[ " << value[0] << ", " << value[1] << " ]";
820 return os;
821}
822
823inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
824 os << "[ " << value[0] << ", " << value[1] << " ]";
825 return os;
826}
827
828template <>
829struct packet_traits<double> : default_packet_traits {
830 typedef Packet2d type;
831 typedef Packet2d half;
832 enum {
833 Vectorizable = 1,
834 AlignedOnScalar = 1,
835 size = 2,
836 // FIXME check the Has*
837 HasDiv = 1,
838 HasExp = 1,
839 HasSqrt = 1,
840 HasRsqrt = 1,
841 };
842};
843
844template <>
845struct unpacket_traits<Packet2d> {
846 typedef double type;
847 enum {
848 size = 2,
849 alignment = Aligned16,
850 vectorizable = true,
851 masked_load_available = false,
852 masked_store_available = false
853 };
854 typedef Packet2d half;
855};
856
857template <>
858EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
859 EIGEN_MSA_DEBUG;
860
861 Packet2d value = {from, from};
862 return value;
863}
864
865template <>
866EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
867 EIGEN_MSA_DEBUG;
868
869 return __builtin_msa_fadd_d(a, b);
870}
871
872template <>
873EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
874 EIGEN_MSA_DEBUG;
875
876 static const Packet2d countdown = {0.0, 1.0};
877 return padd(pset1<Packet2d>(a), countdown);
878}
879
880template <>
881EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
882 EIGEN_MSA_DEBUG;
883
884 return __builtin_msa_fsub_d(a, b);
885}
886
887template <>
888EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
889 EIGEN_MSA_DEBUG;
890
891 return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
892}
893
894template <>
895EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
896 EIGEN_MSA_DEBUG;
897
898 return a;
899}
900
901template <>
902EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
903 EIGEN_MSA_DEBUG;
904
905 return __builtin_msa_fmul_d(a, b);
906}
907
908template <>
909EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
910 EIGEN_MSA_DEBUG;
911
912 return __builtin_msa_fdiv_d(a, b);
913}
914
915template <>
916EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
917 EIGEN_MSA_DEBUG;
918
919 return __builtin_msa_fmadd_d(c, a, b);
920}
921
// Bitwise logical operations are not defined for floating-point vectors, so the operands are
// reinterpreted as byte vectors (v16u8) before the MSA intrinsics are applied.
924template <>
925EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
926 EIGEN_MSA_DEBUG;
927
928 return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
929}
930
931template <>
932EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
933 EIGEN_MSA_DEBUG;
934
935 return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
936}
937
938template <>
939EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
940 EIGEN_MSA_DEBUG;
941
942 return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
943}
944
945template <>
946EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
947 EIGEN_MSA_DEBUG;
948
949 return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
950}
951
952template <>
953EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
954 EIGEN_MSA_DEBUG;
955
956 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
957}
958
959template <>
960EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
961 EIGEN_MSA_DEBUG;
962
963#if EIGEN_FAST_MATH
964 // This prefers numbers to NaNs.
965 return __builtin_msa_fmin_d(a, b);
966#else
967 // This prefers NaNs to numbers.
968 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
969 v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
970 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
971#endif
972}
973
974template <>
975EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
976 EIGEN_MSA_DEBUG;
977
978#if EIGEN_FAST_MATH
979 // This prefers numbers to NaNs.
980 return __builtin_msa_fmax_d(a, b);
981#else
982 // This prefers NaNs to numbers.
983 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
984 v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
985 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
986#endif
987}
988
989template <>
990EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
991 EIGEN_MSA_DEBUG;
992
993 EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
994}
995
996template <>
997EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
998 EIGEN_MSA_DEBUG;
999
1000 Packet2d value = {*from, *from};
1001 return value;
1002}
1003
1004template <>
1005EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1006 EIGEN_MSA_DEBUG;
1007
1008 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1009}
1010
1011template <>
1012EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1013 EIGEN_MSA_DEBUG;
1014
1015 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1016}
1017
1018template <>
1019EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1020 EIGEN_MSA_DEBUG;
1021
1022 Packet2d value;
1023 value[0] = *from;
1024 from += stride;
1025 value[1] = *from;
1026 return value;
1027}
1028
1029template <>
1030EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
1031 EIGEN_MSA_DEBUG;
1032
1033 *to = from[0];
1034 to += stride;
1035 *to = from[1];
1036}
1037
1038template <>
1039EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1040 EIGEN_MSA_DEBUG;
1041
1042 __builtin_prefetch(addr);
1043}
1044
1045template <>
1046EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1047 EIGEN_MSA_DEBUG;
1048
1049 return a[0];
1050}
1051
1052template <>
1053EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1054 EIGEN_MSA_DEBUG;
1055
1056 return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1057}
1058
1059template <>
1060EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1061 EIGEN_MSA_DEBUG;
1062
1063 return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
1064}
1065
1066template <>
1067EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
1068 EIGEN_MSA_DEBUG;
1069
1070 Packet2d s = padd(a, preverse(a));
1071 return s[0];
1072}
1073
1074// Other reduction functions:
1075// mul
1076template <>
1077EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
1078 EIGEN_MSA_DEBUG;
1079
1080 Packet2d p = pmul(a, preverse(a));
1081 return p[0];
1082}
1083
1084// min
1085template <>
1086EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
1087 EIGEN_MSA_DEBUG;
1088
1089#if EIGEN_FAST_MATH
1090 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1091 Packet2d v = __builtin_msa_fmin_d(a, swapped);
1092 return v[0];
1093#else
1094 double a0 = a[0], a1 = a[1];
1095 return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
1096#endif
1097}
1098
1099// max
1100template <>
1101EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
1102 EIGEN_MSA_DEBUG;
1103
1104#if EIGEN_FAST_MATH
1105 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1106 Packet2d v = __builtin_msa_fmax_d(a, swapped);
1107 return v[0];
1108#else
1109 double a0 = a[0], a1 = a[1];
1110 return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
1111#endif
1112}
1113
1114template <>
1115EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
1116 EIGEN_MSA_DEBUG;
1117
1118 return __builtin_msa_fsqrt_d(a);
1119}
1120
1121template <>
1122EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
1123 EIGEN_MSA_DEBUG;
1124
1125#if EIGEN_FAST_MATH
1126 return __builtin_msa_frsqrt_d(a);
1127#else
1128 Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
1129 return pdiv(ones, psqrt(a));
1130#endif
1131}
1132
1133inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
1134 os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]";
1135 return os;
1136}
1137
1138EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1139 EIGEN_MSA_DEBUG;
1140
1141 Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1142 Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1143 kernel.packet[0] = trn1;
1144 kernel.packet[1] = trn2;
1145}
1146
1147template <>
1148EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
1149 Packet2d v = a;
1150 int32_t old_mode, new_mode;
1151 asm volatile(
1152 "cfcmsa %[old_mode], $1\n"
1153 "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY.
1154 "ctcmsa $1, %[new_mode]\n"
1155 "frint.d %w[v], %w[v]\n"
1156 "ctcmsa $1, %[old_mode]\n"
1157 : // outputs
1158 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1159 [v] "+f"(v)
1160 : // inputs
1161 : // clobbers
1162 );
1163 return v;
1164}
1165
1166template <>
1167EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
1168 Packet2d v = a;
1169 int32_t old_mode, new_mode;
1170 asm volatile(
1171 "cfcmsa %[old_mode], $1\n"
1172 "ori %[new_mode], %[old_mode], 3\n"
1173 "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY.
1174 "ctcmsa $1, %[new_mode]\n"
1175 "frint.d %w[v], %w[v]\n"
1176 "ctcmsa $1, %[old_mode]\n"
1177 : // outputs
1178 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1179 [v] "+f"(v)
1180 : // inputs
1181 : // clobbers
1182 );
1183 return v;
1184}
1185
1186template <>
1187EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
1188 Packet2d v = a;
1189 int32_t old_mode, new_mode;
1190 asm volatile(
1191 "cfcmsa %[old_mode], $1\n"
1192 "ori %[new_mode], %[old_mode], 3\n"
1193 "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even.
1194 "ctcmsa $1, %[new_mode]\n"
1195 "frint.d %w[v], %w[v]\n"
1196 "ctcmsa $1, %[old_mode]\n"
1197 : // outputs
1198 [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1199 [v] "+f"(v)
1200 : // inputs
1201 : // clobbers
1202 );
1203 return v;
1204}
1205
1206} // end namespace internal
1207
1208} // end namespace Eigen
1209
1210#endif // EIGEN_PACKET_MATH_MSA_H
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82