#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

#include "../../InternalHeaderCheck.h"

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
#endif

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
     (__GXX_ABI_VERSION < 1004)) ||                                                                     \
    EIGEN_OS_QNX
// With GCC's default ABI version, __m128 and __m256 are not distinct C++ types, so overloads for
// both would clash; wrap the 128-bit types to give them distinct types.
typedef eigen_packet_wrapper<__m128> Packet4f;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
typedef __m128 Packet4f;
typedef __m128d Packet2d;
#endif

typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
typedef eigen_packet_wrapper<__m128i, 5> Packet2l;
template <>
struct is_arithmetic<__m128> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m128i> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m128d> {
  enum { value = true };
};
template <>
struct is_arithmetic<Packet4i> {
  enum { value = true };
};
template <>
struct is_arithmetic<Packet2l> {
  enum { value = true };
};
template <>
struct is_arithmetic<Packet4ui> {
  enum { value = false };
};
template <>
struct is_arithmetic<Packet16b> {
  enum { value = true };
};
template <int p, int q, int r, int s>
struct shuffle_mask {
  enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
};
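// The template parameters p, q, r, s are 2-bit lane indices packed into the 8-bit immediate
// expected by the _mm_shuffle_* intrinsics: p selects output lane 0, s selects output lane 3.
// For example, shuffle_mask<0, 1, 2, 3>::mask == 0xE4 is the identity permutation, while
// shuffle_mask<2, 3, 0, 1>::mask == 0x4E swaps the two 64-bit halves of the register.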
#define vec4f_swizzle1(v, p, q, r, s) \
  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))

#define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))

#define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))

#define vec2d_swizzle1(v, p, q) \
  Packet2d(_mm_castsi128_pd(    \
      _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * p, 2 * p + 1, 2 * q, 2 * q + 1>::mask))))

#define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))

#define vec4i_swizzle2(a, b, p, q, r, s) \
  Packet4i(                             \
      _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))

#define vec4ui_swizzle2(a, b, p, q, r, s) Packet4i(vec4i_swizzle2(a, b, p, q, r, s))
EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_movelh_ps(a, b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_movehl_ps(a, b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_unpacklo_ps(a, b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_unpackhi_ps(a, b));
}
#define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)

#define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))

EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
  return Packet2d(_mm_unpacklo_pd(a, b));
}
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
  return Packet2d(_mm_unpackhi_pd(a, b));
}
#define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)
#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
#if EIGEN_ARCH_x86_64
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) { return _mm_cvtsi128_si64(a); }
#ifdef EIGEN_VECTORIZE_SSE4_1
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) { return _mm_extract_epi64(a, 1); }
#else
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
  return _mm_cvtsi128_si64(_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
}
#endif
#else
// 32-bit targets have no 64-bit integer extraction, so go through double bit-casts instead.
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) {
  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_castsi128_pd(a)));
}
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
}
#endif
#ifndef EIGEN_VECTORIZE_AVX
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    // ... (remaining trait flags not shown)
    HasReciprocal = EIGEN_FAST_MATH,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    // ...
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasErfc = EIGEN_FAST_MATH
  };
};
template <>
struct packet_traits<double> : default_packet_traits {
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    // ... (remaining trait flags not shown)
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasErfc = EIGEN_FAST_MATH
  };
};
template <>
struct packet_traits<int> : default_packet_traits {
  typedef Packet4i type;
  typedef Packet4i half;
  // ... (trait flags not shown)
};
template <>
struct packet_traits<uint32_t> : default_packet_traits {
  typedef Packet4ui type;
  typedef Packet4ui half;
  // ... (trait flags not shown)
};
template <>
struct packet_traits<int64_t> : default_packet_traits {
  typedef Packet2l type;
  typedef Packet2l half;
  // ... (trait flags not shown)
};
#endif
template <>
struct packet_traits<bool> : default_packet_traits {
  typedef Packet16b type;
  typedef Packet16b half;
  // ... (trait flags not shown)
};
template <>
struct unpacket_traits<Packet4f> {
  typedef float type;
  typedef Packet4f half;
  typedef Packet4i integer_packet;
  enum { size = 4, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false };
};
template <>
struct unpacket_traits<Packet2d> {
  typedef double type;
  typedef Packet2d half;
  typedef Packet2l integer_packet;
  enum { size = 2, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false };
};
template <>
struct unpacket_traits<Packet2l> {
  typedef int64_t type;
  typedef Packet2l half;
  enum { size = 2, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false };
};
template <>
struct unpacket_traits<Packet4i> {
  typedef int type;
  typedef Packet4i half;
  enum { size = 4, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false };
};
template <>
struct unpacket_traits<Packet4ui> {
  typedef uint32_t type;
  typedef Packet4ui half;
  enum { size = 4, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false };
};
template <>
struct unpacket_traits<Packet16b> {
  typedef bool type;
  typedef Packet16b half;
  enum { size = 16, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false };
};
#ifndef EIGEN_VECTORIZE_AVX
template <>
struct scalar_div_cost<float, true> {
  enum { value = 7 };
};
template <>
struct scalar_div_cost<double, true> {
  enum { value = 8 };
};
#endif
template <> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
template <> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template <> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return _mm_set1_epi64x(from); }
template <> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
template <> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return _mm_set1_epi32(numext::bit_cast<int32_t>(from)); }
template <> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) { return _mm_set1_epi8(static_cast<char>(from)); }

template <> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
template <> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }

template <> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }
template <> EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) { return _mm_set_epi32(0, 0, -1, -1); }
template <> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
template <> EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
template <> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }

template <> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
template <> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
template <> EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) { return _mm_setzero_si128(); }
template <> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
template <> EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) { return _mm_setzero_si128(); }
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
template <>
EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
  return vec4f_swizzle1(_mm_load_ss(from), 0, 0, 0, 0);
}
#endif

template <> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0)); }
template <> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0)); }
template <> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) { return _mm_add_epi64(pset1<Packet2l>(a), _mm_set_epi64x(1, 0)); }
template <> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0)); }
template <> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) { return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0)); }
template <> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a, b); }
template <> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return _mm_add_epi64(a, b); }
template <> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a, b); }
template <> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_add_epi32(a, b); }

template <> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a, b); }

template <typename Packet>
EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
template <> EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ss(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_sd(a, b); }
template <> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a, b); }
template <> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return _mm_sub_epi64(a, b); }
template <> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a, b); }
template <> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_sub_epi32(a, b); }
template <> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a, b); }
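// For the boolean packet Packet16b the arithmetic ops operate on the canonical 0/1 encoding:
// addition saturates at 1 (bitwise OR), subtraction is the symmetric difference (XOR), and
// multiplication further below is bitwise AND.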
template <>
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
template <>
EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_ps(a, b);
#else
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
  return padd(a, pxor(mask, b));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
template <>
EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_pd(a, b);
#else
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
  return padd(a, pxor(mask, b));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
  return _mm_xor_ps(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
  return _mm_xor_pd(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
  return psub(pzero(a), a);
}
template <>
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
  return psub(pzero(a), a);
}

template <> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template <> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template <> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
template <> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
template <> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a, b); }
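// There is no 64-bit integer multiply below AVX-512, so pmul<Packet2l> is assembled from 32-bit
// partial products: with a = a_hi * 2^32 + a_lo and b = b_hi * 2^32 + b_lo,
// a * b = ((a_hi * b_lo + a_lo * b_hi) << 32) + a_lo * b_lo; the a_hi * b_hi term falls entirely
// outside the 64-bit lane and is dropped.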
template <>
EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
  __m128i upper32_a = _mm_srli_epi64(a, 32);
  __m128i upper32_b = _mm_srli_epi64(b, 32);

  // upper * lower
  __m128i mul1 = _mm_mul_epu32(upper32_a, b);
  __m128i mul2 = _mm_mul_epu32(upper32_b, a);
  // lower * lower
  __m128i mul3 = _mm_mul_epu32(a, b);

  __m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32);
  return _mm_add_epi64(high, mul3);
}
template <>
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a, b);
#else
  // Emulate _mm_mullo_epi32: multiply the even and odd lanes separately with _mm_mul_epu32 and interleave.
  return vec4i_swizzle1(
      vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
                     0, 2, 0, 2),
      0, 2, 1, 3);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a, b);
#else
  return vec4ui_swizzle1(
      vec4ui_swizzle2(_mm_mul_epu32(a, b),
                      _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
      0, 2, 1, 3);
#endif
}

template <> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a, b); }

template <>
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_AVX
  return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
#else
  __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
  __m128i q_hi = _mm_cvttpd_epi32(
      _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
  return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
#endif
}
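// Note: the integer division above is exact because every int32 is exactly representable as a
// double, and _mm_cvttpd_epi32 truncates toward zero, matching the rounding of C++ integer
// division.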
#ifdef EIGEN_VECTORIZE_FMA
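// Naming convention for the fused ops below: pmadd(a, b, c) = a * b + c, pmsub(a, b, c) = a * b - c,
// pnmadd(a, b, c) = c - a * b, and pnmsub(a, b, c) = -(a * b) - c, each computed with a single
// rounding thanks to the FMA instructions.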
template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a, b, c); }
template <> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a, b, c); }
template <> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmsub_ps(a, b, c); }
template <> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmsub_pd(a, b, c); }
template <> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmadd_ps(a, b, c); }
template <> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmadd_pd(a, b, c); }
template <> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmsub_ps(a, b, c); }
template <> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmsub_pd(a, b, c); }

template <typename Packet>
EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
template <> EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ss(a, b, c); }
template <> EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_sd(a, b, c); }
#endif
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
  return _mm_blendv_ps(b, a, mask);
}

template <>
EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
  return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask)));
}

template <>
EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
}

template <>
EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
}

template <>
EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
  return _mm_blendv_pd(b, a, mask);
}
#endif
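// pselect(mask, a, b) returns a where the mask lane is set and b elsewhere. _mm_blendv_ps/pd only
// inspect the most significant bit of each lane, which is sufficient because Eigen masks are
// produced as all-ones / all-zeros lanes (e.g. by the pcmp_* functions below).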
template <> EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(const Packet2l& a) { return _mm_cmpeq_epi32(a, a); }
template <> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
template <> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& /*a*/) { return pset1<Packet16b>(true); }
template <>
EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
  Packet4i b = _mm_castps_si128(a);
  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
}
template <>
EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
  Packet4i b = _mm_castpd_si128(a);
  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
}
template <> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a, b); }
template <> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return _mm_and_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_and_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a, b); }

template <> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a, b); }
template <> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) { return _mm_or_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_or_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a, b); }

template <> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a, b); }
template <> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) { return _mm_xor_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_xor_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a, b); }

template <> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b, a); }
template <> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b, a); }
template <> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) { return _mm_andnot_si128(b, a); }
template <> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b, a); }
template <> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_andnot_si128(b, a); }
template <> EIGEN_STRONG_INLINE Packet16b pandnot<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_andnot_si128(b, a); }
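// Note the operand order above: Eigen's pandnot(a, b) computes a & ~b, while the hardware
// intrinsic _mm_andnot_*(x, y) computes (~x) & y, hence the swapped arguments.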
template <> EIGEN_STRONG_INLINE Packet16b pcmp_lt(const Packet16b& a, const Packet16b& b) { return _mm_andnot_si128(a, b); }
template <> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a, b); }
template <> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a, b); }

template <> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a, b); }
template <> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a, b); }

template <> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a, b); }
template <> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a, b); }
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_cmpeq_epi32(a, _mm_min_epi32(a, b));
#else
  return por(pcmp_lt(a, b), pcmp_eq(a, b));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
#ifdef EIGEN_VECTORIZE_SSE4_2
  return _mm_cmpgt_epi64(b, a);
#else
  Packet4i eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
  Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
  Packet4i lt = pcmp_lt<Packet4i>(Packet4i(a), Packet4i(b));
  Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
  Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
  // return hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b))
  return por(hi_lt, pand(hi_eq, lo_lt));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_cmpeq_epi64(a, b);
#else
  Packet4i tmp = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
  return Packet2l(pand<Packet4i>(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask))));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) {
  return por(pcmp_lt(a, b), pcmp_eq(a, b));
}

template <>
EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
  const Packet16b kBoolMask = pset1<Packet16b>(true);
  return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
}

template <> EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) { return _mm_cmpeq_epi32(a, b); }
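// All pcmp_* functions return a bit mask: each lane is either all ones (condition holds) or all
// zeros. The Packet16b equality test additionally ANDs with pset1<Packet16b>(true) so that the
// result keeps the canonical 0x00/0x01 encoding of bool.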
template <>
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
  // Workaround for a GCC < 6.3 optimizer bug that may flip the argument order of _mm_min_ps;
  // inline asm pins the operand order.
#ifdef EIGEN_VECTORIZE_AVX
  Packet4f res;
  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
  Packet4f res = b;
  asm("minps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
  return res;
#else
  // Arguments are swapped to match the NaN propagation behavior of std::min.
  return _mm_min_ps(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
  // Same GCC < 6.3 workaround as in pmin<Packet4f>.
#ifdef EIGEN_VECTORIZE_AVX
  Packet2d res;
  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
  Packet2d res = b;
  asm("minpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
  return res;
#else
  // Arguments are swapped to match the NaN propagation behavior of std::min.
  return _mm_min_pd(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
  Packet2l a_lt_mask = pcmp_lt(a, b);
  return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask));
}
template <>
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a, b);
#else
  Packet4i mask = _mm_cmplt_epi32(a, b);
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epu32(a, b);
#else
  return padd((Packet4ui)pmin((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                              (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
              pset1<Packet4ui>(0x80000000UL));
#endif
}
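// The pre-SSE4.1 fallback has no unsigned 32-bit min/max, so both operands are biased by
// 0x80000000 (via psub) to map unsigned order onto signed order, the signed operation is applied,
// and the bias is added back.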
template <>
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
  // Same GCC < 6.3 workaround as in pmin<Packet4f>.
#ifdef EIGEN_VECTORIZE_AVX
  Packet4f res;
  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
  Packet4f res = b;
  asm("maxps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
  return res;
#else
  // Arguments are swapped to match the NaN propagation behavior of std::max.
  return _mm_max_ps(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
  // Same GCC < 6.3 workaround as in pmin<Packet4f>.
#ifdef EIGEN_VECTORIZE_AVX
  Packet2d res;
  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
  Packet2d res = b;
  asm("maxpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
  return res;
#else
  // Arguments are swapped to match the NaN propagation behavior of std::max.
  return _mm_max_pd(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
  Packet2l a_lt_mask = pcmp_lt(a, b);
  return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask));
}
template <>
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a, b);
#else
  Packet4i mask = _mm_cmpgt_epi32(a, b);
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epu32(a, b);
#else
  return padd((Packet4ui)pmax((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                              (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
              pset1<Packet4ui>(0x80000000UL));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
#else
  return (Packet4ui)pcmp_lt((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                            (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return pcmp_eq(a, pmin(a, b));
#else
  return (Packet4ui)pcmp_le((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                            (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
#endif
}
template <typename Packet, typename Op>
EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
  // As implemented above, pmin/pmax return their first argument when either input is NaN, so
  // op(a, b) already yields the numeric operand when b is NaN; it only remains to substitute b
  // when a itself is NaN.
  Packet not_nan_mask_a = pcmp_eq(a, a);
  Packet m = op(a, b);
  return pselect<Packet>(not_nan_mask_a, m, b);
}

template <typename Packet, typename Op>
EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
  // Swapping the operands makes op return b whenever b is NaN; when a is NaN the select below
  // forces the result to a, so a NaN in either input propagates to the output.
  Packet not_nan_mask_a = pcmp_eq(a, a);
  Packet m = op(b, a);
  return pselect<Packet>(not_nan_mask_a, m, a);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
}
template <>
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
  return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
}
template <>
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
  Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
#ifdef EIGEN_VECTORIZE_AVX
  return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#else
  return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) {
  return _mm_srai_epi32(a, 31);
}
template <>
EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
  return pzero(a);
}
template <>
EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) {
  Packet4i tmp = psignbit<Packet4i>(Packet4i(a));
  return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
}
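// psignbit returns, for each lane, all ones if the sign bit is set and all zeros otherwise: the
// arithmetic right shift by 31 broadcasts the sign bit across each 32-bit (half-)lane, and the
// 64-bit variant replicates the result of the high half into the full lane. For unsigned packets
// the result is identically zero.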
template <int N>
EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
  Packet2l signbit = psignbit(a);
  return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N));
}
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
  return _mm_srli_epi64(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
  return _mm_slli_epi64(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
  return _mm_srai_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
  return _mm_srli_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
  return _mm_slli_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
  return _mm_srli_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
  return _mm_srli_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
  return _mm_slli_epi32(a, N);
}
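// SSE has no 64-bit arithmetic right shift before AVX-512, so parithmetic_shift_right<N> for
// Packet2l ORs the logically shifted value with the sign mask shifted left by (64 - N) to
// replicate the sign bits. For unsigned lanes the arithmetic and logical right shifts coincide,
// which is why the Packet4ui "arithmetic" shift simply uses _mm_srli_epi32.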
template <>
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
  const __m128i mask = _mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF);
  return _mm_castsi128_ps(_mm_and_si128(mask, _mm_castps_si128(a)));
}
template <>
EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
  const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
  return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));
}
template <>
EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
  Packet2l signbit = psignbit(a);
  return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit);
}
template <>
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  Packet4i signbit = psignbit(a);
  return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
  return a;
}
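// The signed integer fallbacks use the identity |x| = (x ^ s) - s where s is the sign of x
// broadcast to all bits: for negative x this computes (~x) + 1, i.e. two's-complement negation,
// and for non-negative x it leaves the value unchanged. pabs of an unsigned packet is the
// identity.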
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}

template <>
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
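// _mm_round_ps/pd have no "round half away from zero" mode, so pround adds copysign(prev0dot5, a)
// and then truncates toward zero. prev0dot5 is the largest value strictly below 0.5; using it
// instead of 0.5 keeps inputs just below one half (e.g. 0.49999997f) from being rounded up, while
// exact halves still round away from zero.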
template <>
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
  return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
}
template <>
EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
  return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
}

template <> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
template <> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }

template <> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
template <> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }

template <> EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_TRUNC); }
template <> EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_TRUNC); }
#endif
template <>
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
template <>
EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
}
template <>
EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
}

template <typename Packet>
EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
template <>
EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from);
}
template <>
EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
}
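// ploadl loads only the low half (64 bits) of a packet and ploads loads a single scalar into
// lane 0; with these SSE implementations the remaining lanes are zero-filled by the underlying
// _mm_load_sd / _mm_load_ss intrinsics.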
template <>
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
template <>
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
  return pset1<Packet2d>(from[0]);
}
template <>
EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
  return pset1<Packet2l>(from[0]);
}
template <>
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
  Packet4i tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
template <>
EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
  Packet4ui tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
}

template <>
EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
  return _mm_unpacklo_epi8(tmp, tmp);
}

template <>
EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
  tmp = _mm_unpacklo_epi8(tmp, tmp);
  return _mm_unpacklo_epi16(tmp, tmp);
}
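// ploaddup reads size/2 scalars and duplicates each one twice, e.g. {a, b} -> {a, a, b, b};
// ploadquad reads size/4 scalars and repeats each four times, which for Packet16b expands the
// four bools {a, b, c, d} into {a, a, a, a, b, b, b, b, c, c, c, c, d, d, d, d}.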
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <typename Scalar, typename Packet>
EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
template <>
EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
}

template <typename Scalar, typename Packet>
EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
template <>
EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
}
template <>
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
  return _mm_shuffle_ps(a, a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
  return _mm_shuffle_pd(a, a, 0x1);
}
template <>
EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
  return _mm_castpd_si128(preverse(_mm_castsi128_pd(a)));
}
template <>
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  return _mm_shuffle_epi32(a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
  return _mm_shuffle_epi32(a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(a, mask);
#else
  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
#endif
}
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// Direct access to the __m128 struct members works around wrong-code issues with older MSVC on Win64.
template <> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
template <> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
template <> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
  int64_t x = _mm_extract_epi64_0(a);
  return x;
}
template <> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  int x = _mm_cvtsi128_si32(a);
  return x;
}
template <> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
  return x;
}
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable works around MSVC code-generation bugs when the intrinsic result is returned directly.
template <> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  float x = _mm_cvtss_f32(a);
  return x;
}
template <> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  double x = _mm_cvtsd_f64(a);
  return x;
}
template <> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
  int64_t x = _mm_extract_epi64_0(a);
  return x;
}
template <> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  int x = _mm_cvtsi128_si32(a);
  return x;
}
template <> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
  return x;
}
#else
template <> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
template <> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
template <> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return _mm_extract_epi64_0(a); }
template <> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
template <> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); }
#endif

template <>
EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
  int x = _mm_cvtsi128_si32(a);
  return static_cast<bool>(x & 1);
}
template <>
EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
  return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
  return _mm_set_pd(from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
  return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
  return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
  return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
                       numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
}

template <>
EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
  return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
                      from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
                      from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
                      from[0 * stride]);
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
  to[stride * 0] = pfirst(from);
  to[stride * 1] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 1)));
  to[stride * 2] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 2)));
  to[stride * 3] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 3)));
}
template <>
EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
  to[stride * 0] = pfirst(from);
  to[stride * 1] = pfirst(preverse(from));
}
template <>
EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride) {
  to[stride * 0] = pfirst(from);
  to[stride * 1] = pfirst(preverse(from));
}
template <>
EIGEN_STRONG_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
  to[stride * 0] = _mm_cvtsi128_si32(from);
  to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}
template <>
EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
  to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
  to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
  to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
  to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
}
template <>
EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
  EIGEN_ALIGN16 bool tmp[16];
  pstore(tmp, from);
  to[stride * 0] = tmp[0];
  to[stride * 1] = tmp[1];
  to[stride * 2] = tmp[2];
  to[stride * 3] = tmp[3];
  to[stride * 4] = tmp[4];
  to[stride * 5] = tmp[5];
  to[stride * 6] = tmp[6];
  to[stride * 7] = tmp[7];
  to[stride * 8] = tmp[8];
  to[stride * 9] = tmp[9];
  to[stride * 10] = tmp[10];
  to[stride * 11] = tmp[11];
  to[stride * 12] = tmp[12];
  to[stride * 13] = tmp[13];
  to[stride * 14] = tmp[14];
  to[stride * 15] = tmp[15];
}
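// SSE provides no gather/scatter instructions, so pgather/pscatter fall back to per-element
// accesses; `stride` is measured in scalars, not bytes. The bool version stores the whole packet
// into an aligned temporary once and then scatters the elements from there.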
template <>
EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) {
  Packet4f pa = _mm_set_ss(a);
  pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
}
template <>
EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) {
  Packet2d pa = _mm_set_sd(a);
  pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
}
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
typedef const void* SsePrefetchPtrType;
#else
typedef const char* SsePrefetchPtrType;
#endif

#ifndef EIGEN_VECTORIZE_AVX
template <> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
template <>
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
  return pfrexp_generic(a, exponent);
}

// Extract the biased exponent without relying on a full 64-bit integer packet implementation.
template <>
EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
  const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
}

template <>
EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
  return pfrexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
  return pldexp_generic(a, exponent);
}

template <>
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
  // Clamp the exponent to [-2099, 2099].
  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);

  // Convert e to integer and move the values into the low 32 bits of each 64-bit lane.
  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);

  // Split 2^e into several factors and multiply them in one at a time.
  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
  Packet4i b = parithmetic_shift_right<2>(ei);                       // floor(e/4)
  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^b
  Packet2d out = pmul(pmul(pmul(a, c), c), c);                       // a * 2^(3b)
  b = psub(psub(psub(ei, b), b), b);                                 // e - 3b
  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));           // 2^(e - 3b)
  out = pmul(out, c);
  return out;
}

template <>
EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
  // Clamp the exponent to [-1023, 1024].
  const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
  const Packet2d max_exponent = pset1<Packet2d>(1024.0);
  const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);

  // Convert e to integer and move the values into the low 32 bits of each 64-bit lane.
  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);

  // Multiply by 2^e in a single step.
  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
  const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52));
  return pmul(a, c);
}
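// Rationale for the two pldexp variants above: a * 2^e can overflow or underflow in one step when
// e is large in magnitude, so pldexp builds the scale from several partial powers of two (each
// with a representable exponent) and multiplies them in sequence, covering exponents up to about
// +/-2099. pldexp_fast assumes the clamped exponent already yields a normal, representable power
// of two and applies a single scaling multiply.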
template <>
EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
  a3 = pload<Packet4f>(a);
  a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
  a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
  a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
  a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
}
template <>
EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) {
#ifdef EIGEN_VECTORIZE_SSE3
  a0 = _mm_loaddup_pd(a + 0);
  a1 = _mm_loaddup_pd(a + 1);
  a2 = _mm_loaddup_pd(a + 2);
  a3 = _mm_loaddup_pd(a + 3);
#else
  a1 = pload<Packet2d>(a);
  a0 = vec2d_swizzle1(a1, 0, 0);
  a1 = vec2d_swizzle1(a1, 1, 1);
  a3 = pload<Packet2d>(a + 2);
  a2 = vec2d_swizzle1(a3, 0, 0);
  a3 = vec2d_swizzle1(a3, 1, 1);
#endif
}
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = tmp;
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
  __m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = tmp;
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
  ptranspose((PacketBlock<Packet4i, 4>&)kernel);
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
  __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
  // Four rounds of pairwise byte/word/dword/qword interleaving; afterwards output packet i holds
  // element i of each of the 16 input packets.
  __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
  __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
  __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]);
  __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]);
  __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
  __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
  __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
  __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
  __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
  __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
  __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
  __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
  __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
  __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);

  __m128i s0 = _mm_unpacklo_epi16(t0, t2);
  __m128i s1 = _mm_unpackhi_epi16(t0, t2);
  __m128i s2 = _mm_unpacklo_epi16(t1, t3);
  __m128i s3 = _mm_unpackhi_epi16(t1, t3);
  __m128i s4 = _mm_unpacklo_epi16(t4, t6);
  __m128i s5 = _mm_unpackhi_epi16(t4, t6);
  __m128i s6 = _mm_unpacklo_epi16(t5, t7);
  __m128i s7 = _mm_unpackhi_epi16(t5, t7);
  __m128i s8 = _mm_unpacklo_epi16(t8, ta);
  __m128i s9 = _mm_unpackhi_epi16(t8, ta);
  __m128i sa = _mm_unpacklo_epi16(t9, tb);
  __m128i sb = _mm_unpackhi_epi16(t9, tb);
  __m128i sc = _mm_unpacklo_epi16(tc, te);
  __m128i sd = _mm_unpackhi_epi16(tc, te);
  __m128i se = _mm_unpacklo_epi16(td, tf);
  __m128i sf = _mm_unpackhi_epi16(td, tf);

  __m128i u0 = _mm_unpacklo_epi32(s0, s4);
  __m128i u1 = _mm_unpackhi_epi32(s0, s4);
  __m128i u2 = _mm_unpacklo_epi32(s1, s5);
  __m128i u3 = _mm_unpackhi_epi32(s1, s5);
  __m128i u4 = _mm_unpacklo_epi32(s2, s6);
  __m128i u5 = _mm_unpackhi_epi32(s2, s6);
  __m128i u6 = _mm_unpacklo_epi32(s3, s7);
  __m128i u7 = _mm_unpackhi_epi32(s3, s7);
  __m128i u8 = _mm_unpacklo_epi32(s8, sc);
  __m128i u9 = _mm_unpackhi_epi32(s8, sc);
  __m128i ua = _mm_unpacklo_epi32(s9, sd);
  __m128i ub = _mm_unpackhi_epi32(s9, sd);
  __m128i uc = _mm_unpacklo_epi32(sa, se);
  __m128i ud = _mm_unpackhi_epi32(sa, se);
  __m128i ue = _mm_unpacklo_epi32(sb, sf);
  __m128i uf = _mm_unpackhi_epi32(sb, sf);

  kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
  kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
  kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
  kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
  kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
  kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
  kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
  kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
  kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
  kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
}
#if defined(EIGEN_VECTORIZE_FMA)
template <> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { return std::fmaf(a, b, c); }
template <> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { return std::fma(a, b, c); }
template <> EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) { return std::fmaf(a, b, -c); }
template <> EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) { return std::fma(a, b, -c); }
template <> EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) { return std::fmaf(-a, b, c); }
template <> EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) { return std::fma(-a, b, c); }
template <> EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) { return std::fmaf(-a, b, -c); }
template <> EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) { return std::fma(-a, b, -c); }
#endif
#ifdef EIGEN_VECTORIZE_SSE4_1
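// Vectorized conversion of four packed half values (in the low 64 bits of h)
// to four single-precision floats, using the usual bit-twiddling recipe:
// shift mantissa/exponent into place, rebias the exponent, then patch up the
// Inf/NaN and zero/denormal cases.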
EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
  __m128i input = _mm_cvtepu16_epi32(h);

  // Half exponent mask (0x7c00) shifted into float exponent position.
  __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
  // o.u = (h & 0x7fff) << 13;  // exponent/mantissa bits
  __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
  // exp = shifted_exp & o.u;   // just the exponent
  __m128i exp = _mm_and_si128(ou, shifted_exp);
  // o.u += (127 - 15) << 23;   // rebias the exponent
  ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));

  // Inf/NaN: apply an extra exponent adjustment.
  __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
  __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
  ou = _mm_add_epi32(ou, naninf_adj);

  // Zero/denormal: renormalize via a magic subtraction.
  __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
  __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
  ou = _mm_add_epi32(ou, zeroden_adj);
  __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
  ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));

  // Reinstate the sign bit: (h & 0x8000) << 16.
  __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
  ou = _mm_or_si128(ou, sign);
  return ou;
}
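// Vectorized conversion of four floats to four half values (packed in the low
// 16 bits of each 32-bit lane): strip the sign, classify Inf/NaN and subnormal
// inputs, round the remaining values to nearest-even, then reattach the sign.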
EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
  // Strip the sign bit; it is re-attached at the end.
  __m128i sign = _mm_set1_epi32(0x80000000u);
  sign = _mm_and_si128(sign, _mm_castps_si128(f));
  f = _mm_xor_ps(f, _mm_castsi128_ps(sign));

  __m128i fu = _mm_castps_si128(f);

  __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
  __m128i f32infty = _mm_set1_epi32(255 << 23);
  // Values above f16max map to Inf/NaN in half precision.
  __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
  __m128i inf_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);
  __m128i nan_mask = _mm_andnot_si128(inf_mask, infnan_mask);
  __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7e00));
  __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7c00));
  __m128i naninf_value = _mm_or_si128(inf_value, nan_value);

  // Inputs below 2^-14 (bit pattern 113 << 23) produce a half subnormal or
  // zero; handle them with a magic addition that performs the shift and
  // rounding in float arithmetic.
  __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
  __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
  f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
  __m128i o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
  o = _mm_and_si128(o, subnorm_mask);
  o = _mm_or_si128(o, naninf_value);

  __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
  o = _mm_and_si128(o, mask);

  // Normal case: rebias the exponent and round to nearest-even on bit 13.
  __m128i mand_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
  fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
  fu = _mm_add_epi32(fu, mand_odd);
  fu = _mm_andnot_si128(mask, fu);
  fu = _mm_srli_epi32(fu, 13);
  o = _mm_or_si128(fu, o);

  // Re-attach the sign bit and keep only the low 16 bits of each lane.
  o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));
  return _mm_and_si128(o, _mm_set1_epi32(0xffff));
}
#endif  // EIGEN_VECTORIZE_SSE4_1
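// Packet4h: four Eigen::half values stored in an MMX __m64 register. There is
// no native half arithmetic here, so the operations below unpack to scalar
// Eigen::half, compute elementwise, and repack.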
template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
template<>
struct packet_traits<Eigen::half> : default_packet_traits {
  typedef Packet4h type;
  typedef Packet4h half;
  enum { AlignedOnScalar = 1 /* remaining trait entries elided in this listing */ };
};
template<>
struct unpacket_traits<Packet4h> {
  typedef Eigen::half type;
  enum {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef Packet4h half;
};
template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
  Packet4h result;
  result.x = _mm_set1_pi16(from.x);
  return result;
}
template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
}
template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
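// The arithmetic kernels below all follow the same pattern: extract the four
// 16-bit lanes from the __m64 payload, convert each to Eigen::half, apply the
// scalar operation, and repack the results with _mm_set_pi16.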
template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);
  Eigen::half h[4];
  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
  h[0] = ha + hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
  h[1] = ha + hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
  h[2] = ha + hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
  h[3] = ha + hb;
  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
  return result;
}
template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);
  Eigen::half h[4];
  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
  h[0] = ha - hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
  h[1] = ha - hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
  h[2] = ha - hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
  h[3] = ha - hb;
  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
  return result;
}
template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);
  Eigen::half h[4];
  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
  h[0] = ha * hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
  h[1] = ha * hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
  h[2] = ha * hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
  h[3] = ha * hb;
  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
  return result;
}
template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
  __int64_t a64 = _mm_cvtm64_si64(a.x);
  __int64_t b64 = _mm_cvtm64_si64(b.x);
  Eigen::half h[4];
  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
  h[0] = ha / hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
  h[1] = ha / hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
  h[2] = ha / hb;
  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
  h[3] = ha / hb;
  Packet4h result;
  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
  return result;
}
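// Loads and stores move all four halves at once through a 64-bit integer; the
// aligned and unaligned variants are identical for this 8-byte packet.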
template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
  Packet4h result;
  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
  return result;
}
template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
  Packet4h result;
  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
  return result;
}
template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  __int64_t r = _mm_cvtm64_si64(from.x);
  *(reinterpret_cast<__int64_t*>(to)) = r;
}
template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
  __int64_t r = _mm_cvtm64_si64(from.x);
  *(reinterpret_cast<__int64_t*>(to)) = r;
}
template<> EIGEN_STRONG_INLINE Packet4h ploadquad<Packet4h>(const Eigen::half* from) {
  return pset1<Packet4h>(*from);
}
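// Strided gather/scatter for Packet4h: the i-th lane maps to from/to[i * stride].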
template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride) {
  Packet4h result;
  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
  return result;
}
template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride) {
  __int64_t a = _mm_cvtm64_si64(from.x);
  to[stride*0].x = static_cast<unsigned short>(a);
  to[stride*1].x = static_cast<unsigned short>(a >> 16);
  to[stride*2].x = static_cast<unsigned short>(a >> 32);
  to[stride*3].x = static_cast<unsigned short>(a >> 48);
}
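// 4x4 transpose of half packets using MMX 16-bit and 32-bit interleaves.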
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h,4>& kernel) {
  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);

  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
}
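// Workaround for older PGI compilers (EIGEN_COMP_PGI < 1900), which do not
// provide the _mm_cast* reinterpretation intrinsics: supply bit-reinterpreting
// replacements.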
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif