10#ifndef EIGEN_PACKET_MATH_SSE_H
11#define EIGEN_PACKET_MATH_SSE_H
15#include "../../InternalHeaderCheck.h"
21#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
22#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
25#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
28#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
31#ifdef EIGEN_VECTORIZE_FMA
32#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
33#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
37#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
38 (__GXX_ABI_VERSION < 1004)) || \
// Packet typedefs for the SSE backend.  The wrapper variants (used when the
// enclosing #if branch is taken — the branch directives fall outside this
// excerpt) work around ABI/alignment issues on some compilers.
45typedef eigen_packet_wrapper<__m128> Packet4f;
46typedef eigen_packet_wrapper<__m128d> Packet2d;
// Plain intrinsic types are used otherwise (no wrapper overhead needed).
48typedef __m128 Packet4f;
49typedef __m128d Packet2d;
// Integer packets always use the wrapper: the second template argument is a
// tag that makes Packet4i / Packet16b / Packet4ui / Packet2l distinct C++
// types even though they all wrap the same __m128i register.
52typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
53typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
54typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
55typedef eigen_packet_wrapper<__m128i, 5> Packet2l;
58struct is_arithmetic<__m128> {
59 enum { value =
true };
62struct is_arithmetic<__m128i> {
63 enum { value =
true };
66struct is_arithmetic<__m128d> {
67 enum { value =
true };
70struct is_arithmetic<Packet4i> {
71 enum { value =
true };
74struct is_arithmetic<Packet2l> {
75 enum { value =
true };
81struct is_arithmetic<Packet4ui> {
82 enum { value =
false };
85struct is_arithmetic<Packet16b> {
86 enum { value =
true };
// Compile-time builder for the 8-bit immediate used by the SSE shuffle
// instructions: two bits per destination lane, lane 0 in the low bits.
template <int p, int q, int r, int s>
struct shuffle_mask {
  enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
};
// Lane-permutation helpers.  *_swizzle1 permutes the lanes of a single
// packet; *_swizzle2 picks lanes from two packets (p,q from `a`, r,s from
// `b` per _mm_shuffle_ps semantics).
#define vec4f_swizzle1(v, p, q, r, s) \
  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))

#define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))

#define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))

#define vec2d_swizzle1(v, p, q) \
  Packet2d(_mm_castsi128_pd(    \
      _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * p, 2 * p + 1, 2 * q, 2 * q + 1>::mask))))

#define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))

#define vec4i_swizzle2(a, b, p, q, r, s) \
  Packet4i(                              \
      _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))
// FIX: the result of an unsigned swizzle must be wrapped in Packet4ui, not
// Packet4i, to stay consistent with vec4ui_swizzle1 above (the wrapper tag
// is what distinguishes the two packet types).
#define vec4ui_swizzle2(a, b, p, q, r, s) Packet4ui(vec4i_swizzle2(a, b, p, q, r, s))
114EIGEN_STRONG_INLINE Packet4f vec4f_movelh(
const Packet4f& a,
const Packet4f& b) {
115 return Packet4f(_mm_movelh_ps(a, b));
117EIGEN_STRONG_INLINE Packet4f vec4f_movehl(
const Packet4f& a,
const Packet4f& b) {
118 return Packet4f(_mm_movehl_ps(a, b));
120EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(
const Packet4f& a,
const Packet4f& b) {
121 return Packet4f(_mm_unpacklo_ps(a, b));
123EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(
const Packet4f& a,
const Packet4f& b) {
124 return Packet4f(_mm_unpackhi_ps(a, b));
126#define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)
128#define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))
130EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(
const Packet2d& a,
const Packet2d& b) {
131 return Packet2d(_mm_unpacklo_pd(a, b));
133EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(
const Packet2d& a,
const Packet2d& b) {
134 return Packet2d(_mm_unpackhi_pd(a, b));
136#define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)
// Declare a local packet constant broadcast from a scalar (or, for the
// FROM_INT variant, from a raw bit pattern).
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)
#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
150EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(
const __m128i& a) {
return _mm_cvtsi128_si64(a); }
151#ifdef EIGEN_VECTORIZE_SSE4_1
152EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(
const __m128i& a) {
return _mm_extract_epi64(a, 1); }
154EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(
const __m128i& a) {
155 return _mm_cvtsi128_si64(_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
161EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(
const __m128i& a) {
162 return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_castsi128_pd(a)));
164EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(
const __m128i& a) {
165 return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
// Capability tables: which vectorized ops each scalar type supports in the
// SSE backend.  (These structs are heavily truncated in this excerpt — most
// enum entries and the closing braces fall on lines that are not visible.)
// Skipped when AVX is on, since the AVX backend provides wider packets.
171#ifndef EIGEN_VECTORIZE_AVX
173struct packet_traits<float> : default_packet_traits {
174 typedef Packet4f type;
175 typedef Packet4f half;
// Transcendentals are only vectorized under fast-math.
183 HasReciprocal = EIGEN_FAST_MATH,
184 HasSin = EIGEN_FAST_MATH,
185 HasCos = EIGEN_FAST_MATH,
200 HasTanh = EIGEN_FAST_MATH,
201 HasErf = EIGEN_FAST_MATH,
202 HasErfc = EIGEN_FAST_MATH,
208struct packet_traits<double> : default_packet_traits {
209 typedef Packet2d type;
210 typedef Packet2d half;
218 HasSin = EIGEN_FAST_MATH,
219 HasCos = EIGEN_FAST_MATH,
220 HasTanh = EIGEN_FAST_MATH,
222 HasErf = EIGEN_FAST_MATH,
223 HasErfc = EIGEN_FAST_MATH,
235struct packet_traits<int> : default_packet_traits {
236 typedef Packet4i type;
237 typedef Packet4i half;
250struct packet_traits<uint32_t> : default_packet_traits {
251 typedef Packet4ui type;
252 typedef Packet4ui half;
266struct packet_traits<int64_t> : default_packet_traits {
267 typedef Packet2l type;
268 typedef Packet2l half;
282struct packet_traits<bool> : default_packet_traits {
283 typedef Packet16b type;
284 typedef Packet16b half;
// Reverse mapping: packet type -> scalar type, half-packet, and companion
// integer packet.  None of the SSE packets support masked load/store.
// (Size/alignment enum entries and closing braces are outside this excerpt.)
304struct unpacket_traits<Packet4f> {
306 typedef Packet4f half;
307 typedef Packet4i integer_packet;
312 masked_load_available =
false,
313 masked_store_available =
false
317struct unpacket_traits<Packet2d> {
319 typedef Packet2d half;
320 typedef Packet2l integer_packet;
325 masked_load_available =
false,
326 masked_store_available =
false
330struct unpacket_traits<Packet2l> {
331 typedef int64_t type;
332 typedef Packet2l half;
337 masked_load_available =
false,
338 masked_store_available =
false
342struct unpacket_traits<Packet4i> {
344 typedef Packet4i half;
349 masked_load_available =
false,
350 masked_store_available =
false
354struct unpacket_traits<Packet4ui> {
355 typedef uint32_t type;
356 typedef Packet4ui half;
361 masked_load_available =
false,
362 masked_store_available =
false
366struct unpacket_traits<Packet16b> {
368 typedef Packet16b half;
373 masked_load_available =
false,
374 masked_store_available =
false
// Vectorized-division cost overrides for float/double (bodies truncated in
// this excerpt).  Only defined here when the AVX backend is not active.
378#ifndef EIGEN_VECTORIZE_AVX
380struct scalar_div_cost<float, true> {
384struct scalar_div_cost<double, true> {
390EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
391 return _mm_set_ps1(from);
394EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
395 return _mm_set1_pd(from);
398EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(
const int64_t& from) {
399 return _mm_set1_epi64x(from);
402EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int& from) {
403 return _mm_set1_epi32(from);
406EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(
const uint32_t& from) {
407 return _mm_set1_epi32(numext::bit_cast<int32_t>(from));
410EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(
const bool& from) {
411 return _mm_set1_epi8(
static_cast<char>(from));
415EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(
unsigned int from) {
416 return _mm_castsi128_ps(pset1<Packet4i>(from));
419EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
420 return _mm_castsi128_pd(_mm_set1_epi64x(from));
424EIGEN_STRONG_INLINE Packet4f peven_mask(
const Packet4f& ) {
425 return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
428EIGEN_STRONG_INLINE Packet2l peven_mask(
const Packet2l& ) {
429 return _mm_set_epi32(0, 0, -1, -1);
432EIGEN_STRONG_INLINE Packet4i peven_mask(
const Packet4i& ) {
433 return _mm_set_epi32(0, -1, 0, -1);
436EIGEN_STRONG_INLINE Packet4ui peven_mask(
const Packet4ui& ) {
437 return _mm_set_epi32(0, -1, 0, -1);
440EIGEN_STRONG_INLINE Packet2d peven_mask(
const Packet2d& ) {
441 return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1));
445EIGEN_STRONG_INLINE Packet4f pzero(
const Packet4f& ) {
446 return _mm_setzero_ps();
449EIGEN_STRONG_INLINE Packet2d pzero(
const Packet2d& ) {
450 return _mm_setzero_pd();
453EIGEN_STRONG_INLINE Packet2l pzero(
const Packet2l& ) {
454 return _mm_setzero_si128();
457EIGEN_STRONG_INLINE Packet4i pzero(
const Packet4i& ) {
458 return _mm_setzero_si128();
461EIGEN_STRONG_INLINE Packet4ui pzero(
const Packet4ui& ) {
462 return _mm_setzero_si128();
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
// NOTE(review): the guard suggests a strict-GCC, pre-AVX codegen workaround —
// broadcast via an explicit scalar load + lane swizzle instead of the default
// pload1 path.  Confirm the rationale against upstream history.
template <>
EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
  return vec4f_swizzle1(_mm_load_ss(from), 0, 0, 0, 0);
}
#endif
478EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
479 return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0));
482EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
483 return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
486EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(
const int64_t& a) {
487 return _mm_add_epi32(pset1<Packet2l>(a), _mm_set_epi64x(1, 0));
490EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int& a) {
491 return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
494EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(
const uint32_t& a) {
495 return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0));
499EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
500 return _mm_add_ps(a, b);
503EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
504 return _mm_add_pd(a, b);
507EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
508 return _mm_add_epi64(a, b);
511EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
512 return _mm_add_epi32(a, b);
515EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
516 return _mm_add_epi32(a, b);
520EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(
const Packet16b& a,
const Packet16b& b) {
521 return _mm_or_si128(a, b);
524template <
typename Packet>
525EIGEN_STRONG_INLINE Packet padds(
const Packet& a,
const Packet& b);
527EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
528 return _mm_add_ss(a, b);
531EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
532 return _mm_add_sd(a, b);
536EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
537 return _mm_sub_ps(a, b);
540EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
541 return _mm_sub_pd(a, b);
544EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
545 return _mm_sub_epi64(a, b);
548EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
549 return _mm_sub_epi32(a, b);
552EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
553 return _mm_sub_epi32(a, b);
556EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(
const Packet16b& a,
const Packet16b& b) {
557 return _mm_xor_si128(a, b);
561EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b);
563EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
564#ifdef EIGEN_VECTORIZE_SSE3
565 return _mm_addsub_ps(a, b);
567 const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
568 return padd(a, pxor(mask, b));
573EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d&,
const Packet2d&);
575EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
576#ifdef EIGEN_VECTORIZE_SSE3
577 return _mm_addsub_pd(a, b);
579 const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
580 return padd(a, pxor(mask, b));
585EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
586 const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
587 return _mm_xor_ps(a, mask);
590EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
591 const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
592 return _mm_xor_pd(a, mask);
595EIGEN_STRONG_INLINE Packet2l pnegate(
const Packet2l& a) {
596 return psub(pzero(a), a);
600EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
601 return psub(pzero(a), a);
605EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
609EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
613EIGEN_STRONG_INLINE Packet2l pconj(
const Packet2l& a) {
617EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
622EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
623 return _mm_mul_ps(a, b);
626EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
627 return _mm_mul_pd(a, b);
630EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
632 __m128i upper32_a = _mm_srli_epi64(a, 32);
633 __m128i upper32_b = _mm_srli_epi64(b, 32);
636 __m128i mul1 = _mm_mul_epu32(upper32_a, b);
637 __m128i mul2 = _mm_mul_epu32(upper32_b, a);
639 __m128i mul3 = _mm_mul_epu32(a, b);
641 __m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32);
642 return _mm_add_epi64(high, mul3);
645EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
646#ifdef EIGEN_VECTORIZE_SSE4_1
647 return _mm_mullo_epi32(a, b);
650 return vec4i_swizzle1(
651 vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
657EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
658#ifdef EIGEN_VECTORIZE_SSE4_1
659 return _mm_mullo_epi32(a, b);
662 return vec4ui_swizzle1(
663 vec4ui_swizzle2(_mm_mul_epu32(a, b),
664 _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
670EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(
const Packet16b& a,
const Packet16b& b) {
671 return _mm_and_si128(a, b);
675EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
676 return _mm_div_ps(a, b);
679EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
680 return _mm_div_pd(a, b);
684EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
685#ifdef EIGEN_VECTORIZE_AVX
686 return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
688 __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
689 __m128i q_hi = _mm_cvttpd_epi32(
690 _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
691 return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
695#ifdef EIGEN_VECTORIZE_FMA
697EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
698 return _mm_fmadd_ps(a, b, c);
701EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
702 return _mm_fmadd_pd(a, b, c);
705EIGEN_STRONG_INLINE Packet4f pmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
706 return _mm_fmsub_ps(a, b, c);
709EIGEN_STRONG_INLINE Packet2d pmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
710 return _mm_fmsub_pd(a, b, c);
713EIGEN_STRONG_INLINE Packet4f pnmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
714 return _mm_fnmadd_ps(a, b, c);
717EIGEN_STRONG_INLINE Packet2d pnmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
718 return _mm_fnmadd_pd(a, b, c);
721EIGEN_STRONG_INLINE Packet4f pnmsub(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
722 return _mm_fnmsub_ps(a, b, c);
725EIGEN_STRONG_INLINE Packet2d pnmsub(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
726 return _mm_fnmsub_pd(a, b, c);
729template <
typename Packet>
730EIGEN_STRONG_INLINE Packet pmadds(
const Packet& a,
const Packet& b,
const Packet& c);
732EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
733 return _mm_fmadd_ss(a, b, c);
736EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
737 return _mm_fmadd_sd(a, b, c);
741#ifdef EIGEN_VECTORIZE_SSE4_1
743EIGEN_STRONG_INLINE Packet4f pselect(
const Packet4f& mask,
const Packet4f& a,
const Packet4f& b) {
744 return _mm_blendv_ps(b, a, mask);
748EIGEN_STRONG_INLINE Packet2l pselect(
const Packet2l& mask,
const Packet2l& a,
const Packet2l& b) {
749 return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask)));
753EIGEN_STRONG_INLINE Packet4i pselect(
const Packet4i& mask,
const Packet4i& a,
const Packet4i& b) {
754 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
758EIGEN_STRONG_INLINE Packet4ui pselect(
const Packet4ui& mask,
const Packet4ui& a,
const Packet4ui& b) {
759 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
763EIGEN_STRONG_INLINE Packet2d pselect(
const Packet2d& mask,
const Packet2d& a,
const Packet2d& b) {
764 return _mm_blendv_pd(b, a, mask);
769EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(
const Packet2l& a) {
770 return _mm_cmpeq_epi32(a, a);
773EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(
const Packet4i& a) {
774 return _mm_cmpeq_epi32(a, a);
777EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(
const Packet16b& ) {
778 return pset1<Packet16b>(
true);
781EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(
const Packet4f& a) {
782 Packet4i b = _mm_castps_si128(a);
783 return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
786EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(
const Packet2d& a) {
787 Packet4i b = _mm_castpd_si128(a);
788 return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
792EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
793 return _mm_and_ps(a, b);
796EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
797 return _mm_and_pd(a, b);
800EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
801 return _mm_and_si128(a, b);
804EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
805 return _mm_and_si128(a, b);
808EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
809 return _mm_and_si128(a, b);
812EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(
const Packet16b& a,
const Packet16b& b) {
813 return _mm_and_si128(a, b);
817EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
818 return _mm_or_ps(a, b);
821EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
822 return _mm_or_pd(a, b);
825EIGEN_STRONG_INLINE Packet2l por<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
826 return _mm_or_si128(a, b);
829EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
830 return _mm_or_si128(a, b);
833EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
834 return _mm_or_si128(a, b);
837EIGEN_STRONG_INLINE Packet16b por<Packet16b>(
const Packet16b& a,
const Packet16b& b) {
838 return _mm_or_si128(a, b);
842EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
843 return _mm_xor_ps(a, b);
846EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
847 return _mm_xor_pd(a, b);
850EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
851 return _mm_xor_si128(a, b);
854EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
855 return _mm_xor_si128(a, b);
858EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
859 return _mm_xor_si128(a, b);
862EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(
const Packet16b& a,
const Packet16b& b) {
863 return _mm_xor_si128(a, b);
867EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
868 return _mm_andnot_ps(b, a);
871EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
872 return _mm_andnot_pd(b, a);
875EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
876 return _mm_andnot_si128(b, a);
879EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
880 return _mm_andnot_si128(b, a);
883EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
884 return _mm_andnot_si128(b, a);
887EIGEN_STRONG_INLINE Packet16b pandnot<Packet16b>(
const Packet16b& a,
const Packet16b& b) {
888 return _mm_andnot_si128(b, a);
891EIGEN_STRONG_INLINE Packet16b pcmp_lt(
const Packet16b& a,
const Packet16b& b) {
892 return _mm_andnot_si128(a, b);
895EIGEN_STRONG_INLINE Packet4f pcmp_le(
const Packet4f& a,
const Packet4f& b) {
896 return _mm_cmple_ps(a, b);
899EIGEN_STRONG_INLINE Packet4f pcmp_lt(
const Packet4f& a,
const Packet4f& b) {
900 return _mm_cmplt_ps(a, b);
903EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(
const Packet4f& a,
const Packet4f& b) {
904 return _mm_cmpnge_ps(a, b);
907EIGEN_STRONG_INLINE Packet4f pcmp_eq(
const Packet4f& a,
const Packet4f& b) {
908 return _mm_cmpeq_ps(a, b);
912EIGEN_STRONG_INLINE Packet2d pcmp_le(
const Packet2d& a,
const Packet2d& b) {
913 return _mm_cmple_pd(a, b);
916EIGEN_STRONG_INLINE Packet2d pcmp_lt(
const Packet2d& a,
const Packet2d& b) {
917 return _mm_cmplt_pd(a, b);
920EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(
const Packet2d& a,
const Packet2d& b) {
921 return _mm_cmpnge_pd(a, b);
924EIGEN_STRONG_INLINE Packet2d pcmp_eq(
const Packet2d& a,
const Packet2d& b) {
925 return _mm_cmpeq_pd(a, b);
928EIGEN_STRONG_INLINE Packet4i pcmp_lt(
const Packet4i& a,
const Packet4i& b) {
929 return _mm_cmplt_epi32(a, b);
932EIGEN_STRONG_INLINE Packet4i pcmp_eq(
const Packet4i& a,
const Packet4i& b) {
933 return _mm_cmpeq_epi32(a, b);
936EIGEN_STRONG_INLINE Packet4i pcmp_le(
const Packet4i& a,
const Packet4i& b) {
937#ifdef EIGEN_VECTORIZE_SSE4_1
938 return _mm_cmpeq_epi32(a, _mm_min_epi32(a, b));
940 return por(pcmp_lt(a, b), pcmp_eq(a, b));
944EIGEN_STRONG_INLINE Packet2l pcmp_lt(
const Packet2l& a,
const Packet2l& b) {
945#ifdef EIGEN_VECTORIZE_SSE4_2
946 return _mm_cmpgt_epi64(b, a);
948 Packet4i eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
949 Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
950 Packet4i lt = pcmp_lt<Packet4i>(Packet4i(a), Packet4i(b));
951 Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
952 Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
954 return por(hi_lt, pand(hi_eq, lo_lt));
958EIGEN_STRONG_INLINE Packet2l pcmp_eq(
const Packet2l& a,
const Packet2l& b) {
959#ifdef EIGEN_VECTORIZE_SSE4_1
960 return _mm_cmpeq_epi64(a, b);
962 Packet4i tmp = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
963 return Packet2l(pand<Packet4i>(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask))));
967EIGEN_STRONG_INLINE Packet2l pcmp_le(
const Packet2l& a,
const Packet2l& b) {
968 return por(pcmp_lt(a, b), pcmp_eq(a, b));
971EIGEN_STRONG_INLINE Packet16b pcmp_eq(
const Packet16b& a,
const Packet16b& b) {
973 const Packet16b kBoolMask = pset1<Packet16b>(
true);
974 return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
977EIGEN_STRONG_INLINE Packet4ui pcmp_eq(
const Packet4ui& a,
const Packet4ui& b) {
978 return _mm_cmpeq_epi32(a, b);
982EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
983#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
988#ifdef EIGEN_VECTORIZE_AVX
990 asm(
"vminps %[a], %[b], %[res]" : [res]
"=x"(res) : [a]
"x"(a), [b]
"x"(b));
993 asm(
"minps %[a], %[res]" : [res]
"+x"(res) : [a]
"x"(a));
998 return _mm_min_ps(b, a);
1002EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1003#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1008#ifdef EIGEN_VECTORIZE_AVX
1010 asm(
"vminpd %[a], %[b], %[res]" : [res]
"=x"(res) : [a]
"x"(a), [b]
"x"(b));
1013 asm(
"minpd %[a], %[res]" : [res]
"+x"(res) : [a]
"x"(a));
1018 return _mm_min_pd(b, a);
1022EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1023 Packet2l a_lt_mask = pcmp_lt(a, b);
1024 return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask));
1027EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1028#ifdef EIGEN_VECTORIZE_SSE4_1
1029 return _mm_min_epi32(a, b);
1032 Packet4i mask = _mm_cmplt_epi32(a, b);
1033 return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
1037EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1038#ifdef EIGEN_VECTORIZE_SSE4_1
1039 return _mm_min_epu32(a, b);
1041 return padd((Packet4ui)pmin((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1042 (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
1043 pset1<Packet4ui>(0x80000000UL));
1048EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1049#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1054#ifdef EIGEN_VECTORIZE_AVX
1056 asm(
"vmaxps %[a], %[b], %[res]" : [res]
"=x"(res) : [a]
"x"(a), [b]
"x"(b));
1059 asm(
"maxps %[a], %[res]" : [res]
"+x"(res) : [a]
"x"(a));
1064 return _mm_max_ps(b, a);
1068EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1069#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1074#ifdef EIGEN_VECTORIZE_AVX
1076 asm(
"vmaxpd %[a], %[b], %[res]" : [res]
"=x"(res) : [a]
"x"(a), [b]
"x"(b));
1079 asm(
"maxpd %[a], %[res]" : [res]
"+x"(res) : [a]
"x"(a));
1084 return _mm_max_pd(b, a);
1088EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(
const Packet2l& a,
const Packet2l& b) {
1089 Packet2l a_lt_mask = pcmp_lt(a, b);
1090 return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask));
1093EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
1094#ifdef EIGEN_VECTORIZE_SSE4_1
1095 return _mm_max_epi32(a, b);
1098 Packet4i mask = _mm_cmpgt_epi32(a, b);
1099 return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
1103EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(
const Packet4ui& a,
const Packet4ui& b) {
1104#ifdef EIGEN_VECTORIZE_SSE4_1
1105 return _mm_max_epu32(a, b);
1107 return padd((Packet4ui)pmax((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1108 (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
1109 pset1<Packet4ui>(0x80000000UL));
1114EIGEN_STRONG_INLINE Packet4ui pcmp_lt(
const Packet4ui& a,
const Packet4ui& b) {
1115#ifdef EIGEN_VECTORIZE_SSE4_1
1116 return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
1118 return (Packet4ui)pcmp_lt((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1119 (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
1123EIGEN_STRONG_INLINE Packet4ui pcmp_le(
const Packet4ui& a,
const Packet4ui& b) {
1124#ifdef EIGEN_VECTORIZE_SSE4_1
1125 return pcmp_eq(a, pmin(a, b));
1127 return (Packet4ui)pcmp_le((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1128 (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
1132template <
typename Packet,
typename Op>
1133EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(
const Packet& a,
const Packet& b, Op op) {
1136 Packet not_nan_mask_a = pcmp_eq(a, a);
1137 Packet m = op(a, b);
1138 return pselect<Packet>(not_nan_mask_a, m, b);
1141template <
typename Packet,
typename Op>
1142EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(
const Packet& a,
const Packet& b, Op op) {
1145 Packet not_nan_mask_a = pcmp_eq(a, a);
1146 Packet m = op(b, a);
1147 return pselect<Packet>(not_nan_mask_a, m, a);
1152EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1153 return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
1156EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1157 return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
1160EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1161 return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
1164EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1165 return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
1168EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1169 return pminmax_propagate_nan(a, b, pmin<Packet4f>);
1172EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1173 return pminmax_propagate_nan(a, b, pmin<Packet2d>);
1176EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(
const Packet4f& a,
const Packet4f& b) {
1177 return pminmax_propagate_nan(a, b, pmax<Packet4f>);
1180EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(
const Packet2d& a,
const Packet2d& b) {
1181 return pminmax_propagate_nan(a, b, pmax<Packet2d>);
1185EIGEN_STRONG_INLINE Packet4f psignbit(
const Packet4f& a) {
1186 return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
1189EIGEN_STRONG_INLINE Packet2d psignbit(
const Packet2d& a) {
1190 Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
1191#ifdef EIGEN_VECTORIZE_AVX
1192 return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1194 return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1198EIGEN_STRONG_INLINE Packet4i psignbit(
const Packet4i& a) {
1199 return _mm_srai_epi32(a, 31);
1202EIGEN_STRONG_INLINE Packet4ui psignbit(
const Packet4ui& a) {
1206EIGEN_STRONG_INLINE Packet2l psignbit(
const Packet2l& a) {
1207 Packet4i tmp = psignbit<Packet4i>(Packet4i(a));
1208 return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1212EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(
const Packet2l& a) {
1213 Packet2l signbit = psignbit(a);
1214 return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N));
1217EIGEN_STRONG_INLINE Packet2l plogical_shift_right(
const Packet2l& a) {
1218 return _mm_srli_epi64(a, N);
1221EIGEN_STRONG_INLINE Packet2l plogical_shift_left(
const Packet2l& a) {
1222 return _mm_slli_epi64(a, N);
1225EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(
const Packet4i& a) {
1226 return _mm_srai_epi32(a, N);
1229EIGEN_STRONG_INLINE Packet4i plogical_shift_right(
const Packet4i& a) {
1230 return _mm_srli_epi32(a, N);
1233EIGEN_STRONG_INLINE Packet4i plogical_shift_left(
const Packet4i& a) {
1234 return _mm_slli_epi32(a, N);
1237EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(
const Packet4ui& a) {
1238 return _mm_srli_epi32(a, N);
1241EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(
const Packet4ui& a) {
1242 return _mm_srli_epi32(a, N);
1245EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(
const Packet4ui& a) {
1246 return _mm_slli_epi32(a, N);
1250EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
1251 const __m128i mask = _mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF);
1252 return _mm_castsi128_ps(_mm_and_si128(mask, _mm_castps_si128(a)));
1255EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
1256 const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
1257 return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));
1260EIGEN_STRONG_INLINE Packet2l pabs(
const Packet2l& a) {
1261 Packet2l signbit = psignbit(a);
1262 return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit);
1265EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
1266#ifdef EIGEN_VECTORIZE_SSSE3
1267 return _mm_abs_epi32(a);
1269 Packet4i signbit = psignbit(a);
1270 return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
1274EIGEN_STRONG_INLINE Packet4ui pabs(
const Packet4ui& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
// Round to nearest with ties away from zero. _mm_round_ps has no such mode,
// so add +/-(0.5 - 1ulp) carrying the sign of `a`, then truncate toward zero.
template <>
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
template <>
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
// Round using the current FP rounding mode (rint semantics).
template <>
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
  return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
}
template <>
EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
  return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
}
template <>
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
  return _mm_ceil_ps(a);
}
template <>
EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
  return _mm_ceil_pd(a);
}
template <>
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
  return _mm_floor_ps(a);
}
template <>
EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
  return _mm_floor_pd(a);
}
template <>
EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
  return _mm_round_ps(a, _MM_FROUND_TRUNC);
}
template <>
EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
  return _mm_round_pd(a, _MM_FROUND_TRUNC);
}
#endif  // EIGEN_VECTORIZE_SSE4_1
1332EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
1333 EIGEN_DEBUG_ALIGNED_LOAD
return _mm_load_ps(from);
1336EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
1337 EIGEN_DEBUG_ALIGNED_LOAD
return _mm_load_pd(from);
1340EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(
const int64_t* from) {
1341 EIGEN_DEBUG_ALIGNED_LOAD
return _mm_load_si128(
reinterpret_cast<const __m128i*
>(from));
1344EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int* from) {
1345 EIGEN_DEBUG_ALIGNED_LOAD
return _mm_load_si128(
reinterpret_cast<const __m128i*
>(from));
1348EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(
const uint32_t* from) {
1349 EIGEN_DEBUG_ALIGNED_LOAD
return _mm_load_si128(
reinterpret_cast<const __m128i*
>(from));
1352EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(
const bool* from) {
1353 EIGEN_DEBUG_ALIGNED_LOAD
return _mm_load_si128(
reinterpret_cast<const __m128i*
>(from));
1358EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
1359 EIGEN_DEBUG_UNALIGNED_LOAD
1360 return _mm_loadu_ps(from);
1366EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
1367 EIGEN_DEBUG_UNALIGNED_LOAD
1368 return _mm_loadu_ps(from);
1373EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
1374 EIGEN_DEBUG_UNALIGNED_LOAD
1375 return _mm_loadu_pd(from);
1378EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(
const int64_t* from) {
1379 EIGEN_DEBUG_UNALIGNED_LOAD
1380 return _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(from));
1383EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int* from) {
1384 EIGEN_DEBUG_UNALIGNED_LOAD
1385 return _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(from));
1388EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(
const uint32_t* from) {
1389 EIGEN_DEBUG_UNALIGNED_LOAD
1390 return _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(from));
1393EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(
const bool* from) {
1394 EIGEN_DEBUG_UNALIGNED_LOAD
1395 return _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(from));
1399template <
typename Packet>
1400EIGEN_STRONG_INLINE Packet ploadl(
const typename unpacket_traits<Packet>::type* from);
1402EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(
const float* from) {
1403 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_castpd_ps(_mm_load_sd(
reinterpret_cast<const double*
>(from)));
1406EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(
const double* from) {
1407 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_load_sd(from);
1411template <
typename Packet>
1412EIGEN_STRONG_INLINE Packet ploads(
const typename unpacket_traits<Packet>::type* from);
1414EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(
const float* from) {
1415 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_load_ss(from);
1418EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(
const double* from) {
1419 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_load_sd(from);
1423EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
1424 return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(
reinterpret_cast<const double*
>(from))), 0, 0, 1, 1);
1427EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
1428 return pset1<Packet2d>(from[0]);
1431EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(
const int64_t* from) {
1432 return pset1<Packet2l>(from[0]);
1435EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int* from) {
1437 tmp = _mm_loadl_epi64(
reinterpret_cast<const __m128i*
>(from));
1438 return vec4i_swizzle1(tmp, 0, 0, 1, 1);
1441EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(
const uint32_t* from) {
1443 tmp = _mm_loadl_epi64(
reinterpret_cast<const __m128i*
>(from));
1444 return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
1450EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(
const bool* from) {
1451 __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(
reinterpret_cast<const double*
>(from)));
1452 return _mm_unpacklo_epi8(tmp, tmp);
1458EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(
const bool* from) {
1459 __m128i tmp = _mm_castps_si128(pload1<Packet4f>(
reinterpret_cast<const float*
>(from)));
1460 tmp = _mm_unpacklo_epi8(tmp, tmp);
1461 return _mm_unpacklo_epi16(tmp, tmp);
1465EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
1466 EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
1469EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
1470 EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
1473EIGEN_STRONG_INLINE
void pstore<int64_t>(int64_t* to,
const Packet2l& from) {
1474 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(
reinterpret_cast<__m128i*
>(to), from);
1477EIGEN_STRONG_INLINE
void pstore<int>(
int* to,
const Packet4i& from) {
1478 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(
reinterpret_cast<__m128i*
>(to), from);
1481EIGEN_STRONG_INLINE
void pstore<uint32_t>(uint32_t* to,
const Packet4ui& from) {
1482 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(
reinterpret_cast<__m128i*
>(to), from);
1485EIGEN_STRONG_INLINE
void pstore<bool>(
bool* to,
const Packet16b& from) {
1486 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(
reinterpret_cast<__m128i*
>(to), from);
1490EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
1491 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
1494EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
1495 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
1498EIGEN_STRONG_INLINE
void pstoreu<int64_t>(int64_t* to,
const Packet2l& from) {
1499 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
reinterpret_cast<__m128i*
>(to), from);
1502EIGEN_STRONG_INLINE
void pstoreu<int>(
int* to,
const Packet4i& from) {
1503 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
reinterpret_cast<__m128i*
>(to), from);
1506EIGEN_STRONG_INLINE
void pstoreu<uint32_t>(uint32_t* to,
const Packet4ui& from) {
1507 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
reinterpret_cast<__m128i*
>(to), from);
1510EIGEN_STRONG_INLINE
void pstoreu<bool>(
bool* to,
const Packet16b& from) {
1511 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
reinterpret_cast<__m128i*
>(to), from);
1514template <
typename Scalar,
typename Packet>
1515EIGEN_STRONG_INLINE
void pstorel(Scalar* to,
const Packet& from);
1517EIGEN_STRONG_INLINE
void pstorel(
float* to,
const Packet4f& from) {
1518 EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(
reinterpret_cast<__m64*
>(to), from);
1521EIGEN_STRONG_INLINE
void pstorel(
double* to,
const Packet2d& from) {
1522 EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
1525template <
typename Scalar,
typename Packet>
1526EIGEN_STRONG_INLINE
void pstores(Scalar* to,
const Packet& from);
1528EIGEN_STRONG_INLINE
void pstores(
float* to,
const Packet4f& from) {
1529 EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
1532EIGEN_STRONG_INLINE
void pstores(
double* to,
const Packet2d& from) {
1533 EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
1537EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
1538 return _mm_shuffle_ps(a, a, 0x1B);
1541EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
1542 return _mm_shuffle_pd(a, a, 0x1);
1545EIGEN_STRONG_INLINE Packet2l preverse(
const Packet2l& a) {
1546 return _mm_castpd_si128(preverse(_mm_castsi128_pd(a)));
1549EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
1550 return _mm_shuffle_epi32(a, 0x1B);
1553EIGEN_STRONG_INLINE Packet4ui preverse(
const Packet4ui& a) {
1554 return _mm_shuffle_epi32(a, 0x1B);
1557EIGEN_STRONG_INLINE Packet16b preverse(
const Packet16b& a) {
1558#ifdef EIGEN_VECTORIZE_SSSE3
1559 __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1560 return _mm_shuffle_epi8(a, mask);
1562 Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
1563 tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
1564 return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
1568#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
1572EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
1573 return a.m128_f32[0];
1576EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
1577 return a.m128d_f64[0];
1580EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(
const Packet2l& a) {
1581 int64_t x = _mm_extract_epi64_0(a);
1585EIGEN_STRONG_INLINE
int pfirst<Packet4i>(
const Packet4i& a) {
1586 int x = _mm_cvtsi128_si32(a);
1590EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(
const Packet4ui& a) {
1591 uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1594#elif EIGEN_COMP_MSVC_STRICT
1597EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
1598 float x = _mm_cvtss_f32(a);
1602EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
1603 double x = _mm_cvtsd_f64(a);
1607EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(
const Packet2l& a) {
1608 int64_t x = _mm_extract_epi64_0(a);
1612EIGEN_STRONG_INLINE
int pfirst<Packet4i>(
const Packet4i& a) {
1613 int x = _mm_cvtsi128_si32(a);
1617EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(
const Packet4ui& a) {
1618 uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1623EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
1624 return _mm_cvtss_f32(a);
1627EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
1628 return _mm_cvtsd_f64(a);
1631EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(
const Packet2l& a) {
1632 return _mm_extract_epi64_0(a);
1635EIGEN_STRONG_INLINE
int pfirst<Packet4i>(
const Packet4i& a) {
1636 return _mm_cvtsi128_si32(a);
1639EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(
const Packet4ui& a) {
1640 return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1644EIGEN_STRONG_INLINE
bool pfirst<Packet16b>(
const Packet16b& a) {
1645 int x = _mm_cvtsi128_si32(a);
1646 return static_cast<bool>(x & 1);
1650EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(
const float* from,
Index stride) {
1651 return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1654EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(
const double* from,
Index stride) {
1655 return _mm_set_pd(from[1 * stride], from[0 * stride]);
1658EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(
const int64_t* from,
Index stride) {
1659 return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
1662EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(
const int* from,
Index stride) {
1663 return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1666EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(
const uint32_t* from,
Index stride) {
1667 return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
1668 numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
1672EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(
const bool* from,
Index stride) {
1673 return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
1674 from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
1675 from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
1680EIGEN_STRONG_INLINE
void pscatter<float, Packet4f>(
float* to,
const Packet4f& from,
Index stride) {
1681 to[stride * 0] = pfirst(from);
1682 to[stride * 1] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 1)));
1683 to[stride * 2] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 2)));
1684 to[stride * 3] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 3)));
1687EIGEN_STRONG_INLINE
void pscatter<double, Packet2d>(
double* to,
const Packet2d& from,
Index stride) {
1688 to[stride * 0] = pfirst(from);
1689 to[stride * 1] = pfirst(preverse(from));
1692EIGEN_STRONG_INLINE
void pscatter<int64_t, Packet2l>(int64_t* to,
const Packet2l& from,
Index stride) {
1693 to[stride * 0] = pfirst(from);
1694 to[stride * 1] = pfirst(preverse(from));
1697EIGEN_STRONG_INLINE
void pscatter<int, Packet4i>(
int* to,
const Packet4i& from,
Index stride) {
1698 to[stride * 0] = _mm_cvtsi128_si32(from);
1699 to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
1700 to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
1701 to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
1704EIGEN_STRONG_INLINE
void pscatter<uint32_t, Packet4ui>(uint32_t* to,
const Packet4ui& from,
Index stride) {
1705 to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
1706 to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
1707 to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
1708 to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
1711EIGEN_STRONG_INLINE
void pscatter<bool, Packet16b>(
bool* to,
const Packet16b& from,
Index stride) {
1712 EIGEN_ALIGN16
bool tmp[16];
1714 to[stride * 0] = tmp[0];
1715 to[stride * 1] = tmp[1];
1716 to[stride * 2] = tmp[2];
1717 to[stride * 3] = tmp[3];
1718 to[stride * 4] = tmp[4];
1719 to[stride * 5] = tmp[5];
1720 to[stride * 6] = tmp[6];
1721 to[stride * 7] = tmp[7];
1722 to[stride * 8] = tmp[8];
1723 to[stride * 9] = tmp[9];
1724 to[stride * 10] = tmp[10];
1725 to[stride * 11] = tmp[11];
1726 to[stride * 12] = tmp[12];
1727 to[stride * 13] = tmp[13];
1728 to[stride * 14] = tmp[14];
1729 to[stride * 15] = tmp[15];
1734EIGEN_STRONG_INLINE
void pstore1<Packet4f>(
float* to,
const float& a) {
1735 Packet4f pa = _mm_set_ss(a);
1736 pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
1740EIGEN_STRONG_INLINE
void pstore1<Packet2d>(
double* to,
const double& a) {
1741 Packet2d pa = _mm_set_sd(a);
1742 pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
1745#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
1746typedef const void* SsePrefetchPtrType;
1748typedef const char* SsePrefetchPtrType;
1751#ifndef EIGEN_VECTORIZE_AVX
1753EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
1754 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1757EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
1758 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1761EIGEN_STRONG_INLINE
void prefetch<int>(
const int* addr) {
1762 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1765EIGEN_STRONG_INLINE
void prefetch<int64_t>(
const int64_t* addr) {
1766 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1769EIGEN_STRONG_INLINE
void prefetch<uint32_t>(
const uint32_t* addr) {
1770 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1775EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(
const Packet4f& a, Packet4f& exponent) {
1776 return pfrexp_generic(a, exponent);
1781EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(
const Packet2d& a) {
1782 const Packet2d cst_exp_mask = pset1frombits<Packet2d>(
static_cast<uint64_t
>(0x7ff0000000000000ull));
1783 __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
1784 return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
1788EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(
const Packet2d& a, Packet2d& exponent) {
1789 return pfrexp_generic(a, exponent);
1793EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(
const Packet4f& a,
const Packet4f& exponent) {
1794 return pldexp_generic(a, exponent);
1800EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(
const Packet2d& a,
const Packet2d& exponent) {
1802 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
1803 const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
1806 const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1809 const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1810 Packet4i b = parithmetic_shift_right<2>(ei);
1811 Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));
1812 Packet2d out = pmul(pmul(pmul(a, c), c), c);
1813 b = psub(psub(psub(ei, b), b), b);
1814 c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));
1822EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(
const Packet2d& a,
const Packet2d& exponent) {
1824 const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
1825 const Packet2d max_exponent = pset1<Packet2d>(1024.0);
1826 const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
1829 const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1832 const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1833 const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52));
1840EIGEN_STRONG_INLINE
void pbroadcast4<Packet4f>(
const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
1841 a3 = pload<Packet4f>(a);
1842 a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
1843 a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
1844 a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
1845 a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
1848EIGEN_STRONG_INLINE
void pbroadcast4<Packet2d>(
const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
1850#ifdef EIGEN_VECTORIZE_SSE3
1851 a0 = _mm_loaddup_pd(a + 0);
1852 a1 = _mm_loaddup_pd(a + 1);
1853 a2 = _mm_loaddup_pd(a + 2);
1854 a3 = _mm_loaddup_pd(a + 3);
1856 a1 = pload<Packet2d>(a);
1857 a0 = vec2d_swizzle1(a1, 0, 0);
1858 a1 = vec2d_swizzle1(a1, 1, 1);
1859 a3 = pload<Packet2d>(a + 2);
1860 a2 = vec2d_swizzle1(a3, 0, 0);
1861 a3 = vec2d_swizzle1(a3, 1, 1);
1866EIGEN_STRONG_INLINE
void punpackp(Packet4f* vecs) {
1867 vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
1868 vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
1869 vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
1870 vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
1873EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1874 _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
1877EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1878 __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
1879 kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
1880 kernel.packet[1] = tmp;
1883EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
1884 __m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
1885 kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
1886 kernel.packet[1] = tmp;
1889EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
1890 __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
1891 __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
1892 __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
1893 __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
1895 kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
1896 kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
1897 kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
1898 kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
1900EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
1901 ptranspose((PacketBlock<Packet4i, 4>&)kernel);
1904EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
1905 __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
1906 __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
1907 __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
1908 __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
1909 kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
1910 kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
1911 kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
1912 kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
1915EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
1928 _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
1930 _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
1932 _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
1934 _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
1936 _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]);
1937 __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]);
1938 __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
1939 __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
1940 __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
1941 __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
1942 __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
1943 __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
1944 __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
1945 __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
1946 __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
1947 __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
1949 __m128i s0 = _mm_unpacklo_epi16(t0, t2);
1950 __m128i s1 = _mm_unpackhi_epi16(t0, t2);
1951 __m128i s2 = _mm_unpacklo_epi16(t1, t3);
1952 __m128i s3 = _mm_unpackhi_epi16(t1, t3);
1953 __m128i s4 = _mm_unpacklo_epi16(t4, t6);
1954 __m128i s5 = _mm_unpackhi_epi16(t4, t6);
1955 __m128i s6 = _mm_unpacklo_epi16(t5, t7);
1956 __m128i s7 = _mm_unpackhi_epi16(t5, t7);
1957 __m128i s8 = _mm_unpacklo_epi16(t8, ta);
1958 __m128i s9 = _mm_unpackhi_epi16(t8, ta);
1959 __m128i sa = _mm_unpacklo_epi16(t9, tb);
1960 __m128i sb = _mm_unpackhi_epi16(t9, tb);
1961 __m128i sc = _mm_unpacklo_epi16(tc, te);
1962 __m128i sd = _mm_unpackhi_epi16(tc, te);
1963 __m128i se = _mm_unpacklo_epi16(td, tf);
1964 __m128i sf = _mm_unpackhi_epi16(td, tf);
1966 __m128i u0 = _mm_unpacklo_epi32(s0, s4);
1967 __m128i u1 = _mm_unpackhi_epi32(s0, s4);
1968 __m128i u2 = _mm_unpacklo_epi32(s1, s5);
1969 __m128i u3 = _mm_unpackhi_epi32(s1, s5);
1970 __m128i u4 = _mm_unpacklo_epi32(s2, s6);
1971 __m128i u5 = _mm_unpackhi_epi32(s2, s6);
1972 __m128i u6 = _mm_unpacklo_epi32(s3, s7);
1973 __m128i u7 = _mm_unpackhi_epi32(s3, s7);
1974 __m128i u8 = _mm_unpacklo_epi32(s8, sc);
1975 __m128i u9 = _mm_unpackhi_epi32(s8, sc);
1976 __m128i ua = _mm_unpacklo_epi32(s9, sd);
1977 __m128i ub = _mm_unpackhi_epi32(s9, sd);
1978 __m128i uc = _mm_unpacklo_epi32(sa, se);
1979 __m128i ud = _mm_unpackhi_epi32(sa, se);
1980 __m128i ue = _mm_unpacklo_epi32(sb, sf);
1981 __m128i uf = _mm_unpackhi_epi32(sb, sf);
1983 kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
1984 kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
1985 kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
1986 kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
1987 kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
1988 kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
1989 kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
1990 kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
1991 kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
1992 kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
1993 kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
1994 kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
1995 kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
1996 kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
1997 kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
1998 kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
2001EIGEN_STRONG_INLINE __m128i sse_blend_mask(
const Selector<2>& ifPacket) {
2002 return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2005EIGEN_STRONG_INLINE __m128i sse_blend_mask(
const Selector<4>& ifPacket) {
2006 return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2010EIGEN_STRONG_INLINE Packet2l pblend(
const Selector<2>& ifPacket,
const Packet2l& thenPacket,
2011 const Packet2l& elsePacket) {
2012 const __m128i true_mask = sse_blend_mask(ifPacket);
2013 return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
2016EIGEN_STRONG_INLINE Packet4i pblend(
const Selector<4>& ifPacket,
const Packet4i& thenPacket,
2017 const Packet4i& elsePacket) {
2018 const __m128i true_mask = sse_blend_mask(ifPacket);
2019 return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
2022EIGEN_STRONG_INLINE Packet4ui pblend(
const Selector<4>& ifPacket,
const Packet4ui& thenPacket,
2023 const Packet4ui& elsePacket) {
2024 return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
2027EIGEN_STRONG_INLINE Packet4f pblend(
const Selector<4>& ifPacket,
const Packet4f& thenPacket,
2028 const Packet4f& elsePacket) {
2029 const __m128i true_mask = sse_blend_mask(ifPacket);
2030 return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
2033EIGEN_STRONG_INLINE Packet2d pblend(
const Selector<2>& ifPacket,
const Packet2d& thenPacket,
2034 const Packet2d& elsePacket) {
2035 const __m128i true_mask = sse_blend_mask(ifPacket);
2036 return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
#if defined(EIGEN_VECTORIZE_FMA)
// Scalar fused multiply-add wrappers so the scalar path rounds the same
// way as the FMA-vectorized packet path.
template <>
EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
  return std::fmaf(a, b, c);
}
template <>
EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
  return std::fma(a, b, c);
}
template <>
EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
  return std::fmaf(a, b, -c);
}
template <>
EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
  return std::fma(a, b, -c);
}
template <>
EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
  return std::fmaf(-a, b, c);
}
template <>
EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
  return std::fma(-a, b, c);
}
template <>
EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
  return std::fmaf(-a, b, -c);
}
template <>
EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
  return std::fma(-a, b, -c);
}
#endif  // EIGEN_VECTORIZE_FMA
#ifdef EIGEN_VECTORIZE_SSE4_1
// Vectorized half <-> float conversions (direct vectorization of the scalar
// bit-twiddling routines). Both return the raw integer register so that
// subsequent _mm casts stay simple.
EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
  __m128i input = _mm_cvtepu16_epi32(h);

  // shifted_exp = half exponent mask moved to the float exponent position.
  __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
  // o.u = (h.x & 0x7fff) << 13;  // exponent/mantissa bits
  __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
  // exp = shifted_exp & o.u;  // just the exponent
  __m128i exp = _mm_and_si128(ou, shifted_exp);
  // o.u += (127 - 15) << 23;  // rebias exponent
  ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));

  // Inf/NaN lanes need an extra exponent adjustment.
  __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
  __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
  ou = _mm_add_epi32(ou, naninf_adj);

  // Zero/denormal lanes: renormalize via a magic-float subtraction.
  __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
  __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
  ou = _mm_add_epi32(ou, zeroden_adj);
  __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
  ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));

  // o.u |= (h.x & 0x8000) << 16;  // sign bit
  __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
  ou = _mm_or_si128(ou, sign);

  // Return the uint view of the floats to make _mm castings easier.
  return ou;
}

EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
  // Strip and remember the sign.
  __m128i sign = _mm_set1_epi32(0x80000000u);
  sign = _mm_and_si128(sign, _mm_castps_si128(f));
  f = _mm_xor_ps(f, _mm_castsi128_ps(sign));

  __m128i fu = _mm_castps_si128(f);

  __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
  __m128i f32infty = _mm_set1_epi32(255 << 23);
  // Values above the largest finite half become Inf/NaN.
  __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
  __m128i inf_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);
  __m128i nan_mask = _mm_andnot_si128(inf_mask, infnan_mask);
  __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7e00));
  __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7c00));
  // Combined Inf/NaN result.
  __m128i naninf_value = _mm_or_si128(inf_value, nan_value);

  // Subnormal halves: magic-float addition shifts the mantissa into place.
  __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
  __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
  f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
  __m128i o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
  o = _mm_and_si128(o, subnorm_mask);
  o = _mm_or_si128(o, naninf_value);

  // Lanes already handled (Inf/NaN or subnormal).
  __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
  o = _mm_and_si128(o, mask);

  // Normal case: round-to-nearest-even on the 13 dropped mantissa bits.
  __m128i mand_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
  fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
  fu = _mm_add_epi32(fu, mand_odd);
  fu = _mm_andnot_si128(mask, fu);
  fu = _mm_srli_epi32(fu, 13);
  o = _mm_or_si128(fu, o);

  // Re-apply the sign in the half sign position.
  o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));

  // Keep only the low 16 bits of each lane.
  return _mm_and_si128(o, _mm_set1_epi32(0xffff));
}
#endif  // EIGEN_VECTORIZE_SSE4_1
2180template<>
struct is_arithmetic<Packet4h> {
enum { value =
true }; };
2183struct packet_traits<Eigen::half> : default_packet_traits {
2184 typedef Packet4h type;
2186 typedef Packet4h half;
2189 AlignedOnScalar = 1,
2206template<>
struct unpacket_traits<Packet4h> {
typedef Eigen::half type;
enum {size=4, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef Packet4h half; };
2208template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(
const Eigen::half& from) {
2210 result.x = _mm_set1_pi16(from.x);
2214template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(
const Packet4h& from) {
2215 return half_impl::raw_uint16_to_half(
static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
2218template<> EIGEN_STRONG_INLINE Packet4h pconj(
const Packet4h& a) {
return a; }
2220template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(
const Packet4h& a,
const Packet4h& b) {
2221 __int64_t a64 = _mm_cvtm64_si64(a.x);
2222 __int64_t b64 = _mm_cvtm64_si64(b.x);
2226 Eigen::half ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64));
2227 Eigen::half hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64));
2229 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 16));
2230 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 16));
2232 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 32));
2233 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 32));
2235 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 48));
2236 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 48));
2239 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2243template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(
const Packet4h& a,
const Packet4h& b) {
2244 __int64_t a64 = _mm_cvtm64_si64(a.x);
2245 __int64_t b64 = _mm_cvtm64_si64(b.x);
2249 Eigen::half ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64));
2250 Eigen::half hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64));
2252 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 16));
2253 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 16));
2255 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 32));
2256 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 32));
2258 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 48));
2259 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 48));
2262 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2266template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(
const Packet4h& a,
const Packet4h& b) {
2267 __int64_t a64 = _mm_cvtm64_si64(a.x);
2268 __int64_t b64 = _mm_cvtm64_si64(b.x);
2272 Eigen::half ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64));
2273 Eigen::half hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64));
2275 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 16));
2276 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 16));
2278 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 32));
2279 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 32));
2281 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 48));
2282 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 48));
2285 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2289template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(
const Packet4h& a,
const Packet4h& b) {
2290 __int64_t a64 = _mm_cvtm64_si64(a.x);
2291 __int64_t b64 = _mm_cvtm64_si64(b.x);
2295 Eigen::half ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64));
2296 Eigen::half hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64));
2298 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 16));
2299 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 16));
2301 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 32));
2302 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 32));
2304 ha = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(a64 >> 48));
2305 hb = half_impl::raw_uint16_to_half(
static_cast<unsigned short>(b64 >> 48));
2308 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2312template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(
const Eigen::half* from) {
2314 result.x = _mm_cvtsi64_m64(*
reinterpret_cast<const __int64_t*
>(from));
2318template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(
const Eigen::half* from) {
2320 result.x = _mm_cvtsi64_m64(*
reinterpret_cast<const __int64_t*
>(from));
2324template<> EIGEN_STRONG_INLINE
void pstore<Eigen::half>(Eigen::half* to,
const Packet4h& from) {
2325 __int64_t r = _mm_cvtm64_si64(from.x);
2326 *(
reinterpret_cast<__int64_t*
>(to)) = r;
2329template<> EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(Eigen::half* to,
const Packet4h& from) {
2330 __int64_t r = _mm_cvtm64_si64(from.x);
2331 *(
reinterpret_cast<__int64_t*
>(to)) = r;
2334template<> EIGEN_STRONG_INLINE Packet4h
2335ploadquad<Packet4h>(
const Eigen::half* from) {
2336 return pset1<Packet4h>(*from);
2339template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(
const Eigen::half* from,
Index stride)
2342 result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
2346template<> EIGEN_STRONG_INLINE
void pscatter<Eigen::half, Packet4h>(Eigen::half* to,
const Packet4h& from,
Index stride)
2348 __int64_t a = _mm_cvtm64_si64(from.x);
2349 to[stride*0].x =
static_cast<unsigned short>(a);
2350 to[stride*1].x =
static_cast<unsigned short>(a >> 16);
2351 to[stride*2].x =
static_cast<unsigned short>(a >> 32);
2352 to[stride*3].x =
static_cast<unsigned short>(a >> 48);
2355EIGEN_STRONG_INLINE
void
2356ptranspose(PacketBlock<Packet4h,4>& kernel) {
2357 __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
2358 __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
2359 __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
2360 __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
2362 kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
2363 kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
2364 kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
2365 kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
// Workaround for PGI compilers older than 19.00, which lack the SSE
// _mm_cast* intrinsics: provide equivalent bit-preserving
// reinterpretations (no value conversion, only a type change).
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_exp_op< typename Derived::Scalar >, const Derived > exp(const Eigen::ArrayBase< Derived > &x)
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_sign_op< typename Derived::Scalar >, const Derived > sign(const Eigen::ArrayBase< Derived > &x)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82