PacketMath.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

#include <cstdint>
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
// 32 bits => 8 registers
// 64 bits => 16 registers
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
#endif

#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif

#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
     (__GXX_ABI_VERSION < 1004)) ||                                                                     \
    EIGEN_OS_QNX
// With GCC's default ABI version, __m128 and __m256 are the same type, so we cannot
// provide overloads for both types without a linking error.
// One solution is to increase the ABI version using -fabi-version=4 (or greater).
// Otherwise, we work around this inconvenience by wrapping the 128-bit types in the
// following helper structure:
typedef eigen_packet_wrapper<__m128> Packet4f;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
typedef __m128 Packet4f;
typedef __m128d Packet2d;
#endif

typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
typedef eigen_packet_wrapper<__m128i, 5> Packet2l;

template <>
struct is_arithmetic<__m128> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m128i> {
  enum { value = true };
};
template <>
struct is_arithmetic<__m128d> {
  enum { value = true };
};
template <>
struct is_arithmetic<Packet4i> {
  enum { value = true };
};
template <>
struct is_arithmetic<Packet2l> {
  enum { value = true };
};
// Note that `Packet4ui` uses the underlying type `__m128i`, which is
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
template <>
struct is_arithmetic<Packet4ui> {
  enum { value = false };
};
template <>
struct is_arithmetic<Packet16b> {
  enum { value = true };
};

template <int p, int q, int r, int s>
struct shuffle_mask {
  enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
};

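// As a worked example of the mask packing: shuffle_mask<0, 1, 2, 3>::mask ==
// (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the identity selector expected by
// _mm_shuffle_ps/_mm_shuffle_epi32, and shuffle_mask<3, 2, 1, 0>::mask == 0x1B,
// which reverses all four lanes (the constant used by preverse further below).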
// TODO: change the implementation of all swizzle* ops from macro to template.
#define vec4f_swizzle1(v, p, q, r, s) \
  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))

#define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))

#define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))

#define vec2d_swizzle1(v, p, q) \
  Packet2d(_mm_castsi128_pd(    \
      _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * p, 2 * p + 1, 2 * q, 2 * q + 1>::mask))))

#define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))

#define vec4i_swizzle2(a, b, p, q, r, s) \
  Packet4i(                              \
      _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))

#define vec4ui_swizzle2(a, b, p, q, r, s) Packet4ui(vec4i_swizzle2(a, b, p, q, r, s))

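// Usage sketch (an illustration, not part of the upstream docs): the
// single-source swizzles select lanes of one vector, so
// vec4f_swizzle1(v, 1, 0, 3, 2) yields {v[1], v[0], v[3], v[2]}; the two-source
// variants take the low pair of result lanes from `a` and the high pair from
// `b`, as _mm_shuffle_ps does.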
EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_movelh_ps(a, b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_movehl_ps(a, b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_unpacklo_ps(a, b));
}
EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
  return Packet4f(_mm_unpackhi_ps(a, b));
}
#define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)

#define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))

EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
  return Packet2d(_mm_unpacklo_pd(a, b));
}
EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
  return Packet2d(_mm_unpackhi_pd(a, b));
}
#define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)

#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)

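// For instance, EIGEN_DECLARE_CONST_Packet4f(one, 1.0f) expands to
//   const Packet4f p4f_one = pset1<Packet4f>(1.0f);
// i.e. these macros only standardize the naming of broadcast constants.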
// Work around lack of extract/cvt for epi64 when compiling for 32-bit.
#if EIGEN_ARCH_x86_64
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) { return _mm_cvtsi128_si64(a); }
#ifdef EIGEN_VECTORIZE_SSE4_1
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) { return _mm_extract_epi64(a, 1); }
#else
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
  return _mm_cvtsi128_si64(_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
}
#endif
#else
// epi64 instructions are not available. The following seems to generate the same instructions
// with -O2 in GCC/Clang.
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) {
  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_castsi128_pd(a)));
}
EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
}
#endif

// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasCmp = 1,
    HasDiv = 1,
    HasReciprocal = EIGEN_FAST_MATH,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasACos = 1,
    HasASin = 1,
    HasATan = 1,
    HasATanh = 1,
    HasLog = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
    HasNdtri = 1,
    HasExp = 1,
    HasPow = 1,
    HasBessel = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasCbrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasErfc = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasSign = 0  // The manually vectorized version is slightly slower for SSE.
  };
};
template <>
struct packet_traits<double> : default_packet_traits {
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,

    HasCmp = 1,
    HasDiv = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasTanh = EIGEN_FAST_MATH,
    HasLog = 1,
    HasErf = EIGEN_FAST_MATH,
    HasErfc = EIGEN_FAST_MATH,
    HasExp = 1,
    HasPow = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasCbrt = 1,
    HasATan = 1,
    HasATanh = 1,
    HasBlend = 1
  };
};
template <>
struct packet_traits<int> : default_packet_traits {
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasCmp = 1,
    HasDiv = 1,
    HasShift = 1,
    HasBlend = 1
  };
};
template <>
struct packet_traits<uint32_t> : default_packet_traits {
  typedef Packet4ui type;
  typedef Packet4ui half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasDiv = 0,
    HasNegate = 0,
    HasCmp = 1,
    HasShift = 1,
    HasBlend = 1
  };
};
template <>
struct packet_traits<int64_t> : default_packet_traits {
  typedef Packet2l type;
  typedef Packet2l half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,

    HasDiv = 0,
    HasCmp = 1,
    HasShift = 1,
    HasBlend = 1
  };
};
#endif
template <>
struct packet_traits<bool> : default_packet_traits {
  typedef Packet16b type;
  typedef Packet16b half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,

    HasCmp = 1,
    HasShift = 0,
    HasAbs = 0,
    HasAbs2 = 0,
    HasMin = 0,
    HasMax = 0,
    HasConj = 0,
    HasSqrt = 1,
    HasNegate = 0,
    HasSign = 0  // Don't try to vectorize psign<bool> = identity.
  };
};

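// Orientation note (editorial, based on how these traits are consumed elsewhere
// in Eigen): packet_traits<T>::type names the packet used to vectorize T, and
// the Has* flags gate vectorized code paths; e.g. packet_traits<float>::type is
// Packet4f here, and HasDiv == 1 enables the vectorized pdiv for float
// expressions.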
template <>
struct unpacket_traits<Packet4f> {
  typedef float type;
  typedef Packet4f half;
  typedef Packet4i integer_packet;
  enum {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet2d> {
  typedef double type;
  typedef Packet2d half;
  typedef Packet2l integer_packet;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet2l> {
  typedef int64_t type;
  typedef Packet2l half;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet4i> {
  typedef int type;
  typedef Packet4i half;
  enum {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet4ui> {
  typedef uint32_t type;
  typedef Packet4ui half;
  enum {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet16b> {
  typedef bool type;
  typedef Packet16b half;
  enum {
    size = 16,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

#ifndef EIGEN_VECTORIZE_AVX
template <>
struct scalar_div_cost<float, true> {
  enum { value = 7 };
};
template <>
struct scalar_div_cost<double, true> {
  enum { value = 8 };
};
#endif

template <>
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
  return _mm_set_ps1(from);
}
template <>
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
  return _mm_set1_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
  return _mm_set1_epi64x(from);
}
template <>
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
  return _mm_set1_epi32(from);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
  return _mm_set1_epi32(numext::bit_cast<int32_t>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) {
  return _mm_set1_epi8(static_cast<char>(from));
}

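// A minimal usage sketch (illustrative only): broadcasting and then reading back
// the first lane is the identity on the scalar,
//   Packet4f p = pset1<Packet4f>(2.5f);  // {2.5f, 2.5f, 2.5f, 2.5f}
//   float x = pfirst(p);                 // 2.5f; pfirst is defined further below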
template <>
EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
  return _mm_castsi128_ps(pset1<Packet4i>(from));
}
template <>
EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
  return _mm_castsi128_pd(_mm_set1_epi64x(from));
}

template <>
EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
  return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
}
template <>
EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) {
  return _mm_set_epi32(0, 0, -1, -1);
}
template <>
EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
  return _mm_set_epi32(0, -1, 0, -1);
}
template <>
EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) {
  return _mm_set_epi32(0, -1, 0, -1);
}
template <>
EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) {
  return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1));
}

template <>
EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
  return _mm_setzero_ps();
}
template <>
EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
  return _mm_setzero_pd();
}
template <>
EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) {
  return _mm_setzero_si128();
}
template <>
EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
  return _mm_setzero_si128();
}
template <>
EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) {
  return _mm_setzero_si128();
}

// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203).
// Using inline assembly is also not an option because then gcc fails to properly reorder the instructions.
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
template <>
EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
  return vec4f_swizzle1(_mm_load_ss(from), 0, 0, 0, 0);
}
#endif

template <>
EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
  return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0));
}
template <>
EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
  return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
}
template <>
EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
  // Use a full 64-bit add: a 32-bit lane add would lose the carry out of the low word.
  return _mm_add_epi64(pset1<Packet2l>(a), _mm_set_epi64x(1, 0));
}
template <>
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
  return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
}
template <>
EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
  return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0));
}

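// plset produces an arithmetic sequence starting at `a`, e.g. (illustrative)
// plset<Packet4i>(10) == {10, 11, 12, 13}; the _mm_set_* intrinsics list lanes
// from the highest index down, hence the reversed-looking (3, 2, 1, 0).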
template <>
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_add_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_add_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
  return _mm_add_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return _mm_add_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return _mm_add_epi32(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) {
  return _mm_or_si128(a, b);
}

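// For bool packets, arithmetic is mapped onto bitwise ops that keep every byte
// a valid 0/1 value: padd is OR (true + true saturates to true), pmul below is
// AND, and psub is XOR.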
// Add the lowest elements of a and b, passing the remaining lanes of a through.
template <typename Packet>
EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
template <>
EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_add_ss(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_add_sd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_sub_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_sub_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
  return _mm_sub_epi64(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return _mm_sub_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return _mm_sub_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) {
  return _mm_xor_si128(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
// paddsub computes {a0 - b0, a1 + b1, a2 - b2, a3 + b3}.
template <>
EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_ps(a, b);
#else
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
  return padd(a, pxor(mask, b));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
template <>
EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
#ifdef EIGEN_VECTORIZE_SSE3
  return _mm_addsub_pd(a, b);
#else
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
  return padd(a, pxor(mask, b));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
  return _mm_xor_ps(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
  return _mm_xor_pd(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
  return psub(pzero(a), a);
}

template <>
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
  return psub(pzero(a), a);
}

template <>
EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
  return a;
}

template <>
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_mul_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_mul_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
  // A vectorized 64-bit mul requires AVX-512, so build it from 32-bit multiplications.
  __m128i upper32_a = _mm_srli_epi64(a, 32);
  __m128i upper32_b = _mm_srli_epi64(b, 32);

  // upper * lower cross terms
  __m128i mul1 = _mm_mul_epu32(upper32_a, b);
  __m128i mul2 = _mm_mul_epu32(upper32_b, a);
  // lower * lower (the upper*upper term overflows past bit 64 and is dropped)
  __m128i mul3 = _mm_mul_epu32(a, b);

  __m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32);
  return _mm_add_epi64(high, mul3);
}
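// The identity behind the Packet2l product, writing a = 2^32*ah + al and
// b = 2^32*bh + bl (a sanity check, not additional library code):
//   a*b mod 2^64 = ((ah*bl + al*bh) << 32) + al*bl,
// since the ah*bh term lands entirely above bit 64, and the upper halves of the
// cross products are likewise discarded by the 32-bit left shift.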
template <>
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a, b);
#else
  // this version is slightly faster than 4 scalar products
  return vec4i_swizzle1(
      vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
                     0, 2, 0, 2),
      0, 2, 1, 3);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a, b);
#else
  // this version is slightly faster than 4 scalar products
  return vec4ui_swizzle1(
      vec4ui_swizzle2(_mm_mul_epu32(a, b),
                      _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
      0, 2, 1, 3);
#endif
}

template <>
EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) {
  return _mm_and_si128(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_div_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_div_pd(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_AVX
  return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
#else
  __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
  __m128i q_hi = _mm_cvttpd_epi32(
      _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
  return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
#endif
}

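// Why the detour through doubles is sound (editorial note): every int32 is
// exactly representable as a double, and the rounded double quotient cannot
// cross an integer boundary, so truncating it toward zero (_mm_cvttpd_epi32)
// matches C's integer division. Each _mm_div_pd only covers two lanes, hence
// the SSE path splits the four lanes into a low and a high pair and
// re-interleaves the two quotient halves at the end.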
#ifdef EIGEN_VECTORIZE_FMA
template <>
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return _mm_fmadd_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return _mm_fmadd_pd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return _mm_fmsub_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return _mm_fmsub_pd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return _mm_fnmadd_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return _mm_fnmadd_pd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return _mm_fnmsub_ps(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return _mm_fnmsub_pd(a, b, c);
}

template <typename Packet>
EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
template <>
EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return _mm_fmadd_ss(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return _mm_fmadd_sd(a, b, c);
}
#endif

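// Sign conventions of the four fused variants, per lane (read off the
// intrinsics above): pmadd = a*b + c, pmsub = a*b - c, pnmadd = -(a*b) + c,
// pnmsub = -(a*b) - c. Each is computed with a single rounding, which is what
// distinguishes them from padd(pmul(a, b), c) and friends.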
#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
  return _mm_blendv_ps(b, a, mask);
}

template <>
EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
  return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask)));
}

template <>
EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
}

template <>
EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
}

template <>
EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
  return _mm_blendv_pd(b, a, mask);
}
#endif

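// pselect(mask, a, b) picks a lane from `a` where the mask lane is all-ones and
// from `b` where it is all-zeros, i.e. masks are expected in the form produced
// by the pcmp_*, ptrue, and pzero functions; e.g. pselect(peven_mask(x), a, b)
// takes the even lanes from `a` and the odd ones from `b`.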
template <>
EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(const Packet2l& a) {
  return _mm_cmpeq_epi32(a, a);
}
template <>
EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
  return _mm_cmpeq_epi32(a, a);
}
template <>
EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& /*a*/) {
  return pset1<Packet16b>(true);
}
template <>
EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
  Packet4i b = _mm_castps_si128(a);
  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
}
template <>
EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
  Packet4i b = _mm_castpd_si128(a);
  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
}

template <>
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_and_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_and_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
  return _mm_and_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return _mm_and_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return _mm_and_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) {
  return _mm_and_si128(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_or_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_or_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
  return _mm_or_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return _mm_or_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return _mm_or_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) {
  return _mm_or_si128(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_xor_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_xor_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
  return _mm_xor_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return _mm_xor_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return _mm_xor_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) {
  return _mm_xor_si128(a, b);
}

// pandnot(a, b) computes a & ~b; note that _mm_andnot_* negates its *first* argument.
template <>
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return _mm_andnot_ps(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return _mm_andnot_pd(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
  return _mm_andnot_si128(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return _mm_andnot_si128(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return _mm_andnot_si128(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet16b pandnot<Packet16b>(const Packet16b& a, const Packet16b& b) {
  return _mm_andnot_si128(b, a);
}
// For bools, a < b holds only where a is false and b is true, i.e. ~a & b.
template <>
EIGEN_STRONG_INLINE Packet16b pcmp_lt(const Packet16b& a, const Packet16b& b) {
  return _mm_andnot_si128(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
  return _mm_cmple_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
  return _mm_cmplt_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
  return _mm_cmpnge_ps(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
  return _mm_cmpeq_ps(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
  return _mm_cmple_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
  return _mm_cmplt_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
  return _mm_cmpnge_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
  return _mm_cmpeq_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
  return _mm_cmplt_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
  return _mm_cmpeq_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_cmpeq_epi32(a, _mm_min_epi32(a, b));
#else
  return por(pcmp_lt(a, b), pcmp_eq(a, b));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
#ifdef EIGEN_VECTORIZE_SSE4_2
  return _mm_cmpgt_epi64(b, a);
#else
  Packet4i eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
  Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
  // Bias the low words so that the 32-bit signed comparison below acts as an
  // unsigned comparison on them; the high words keep their signed semantics
  // since their bias lanes are zero.
  const __m128i lo_bias = _mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0);
  Packet4i lt = pcmp_lt<Packet4i>(Packet4i(_mm_xor_si128(a, lo_bias)), Packet4i(_mm_xor_si128(b, lo_bias)));
  Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
  Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
  // return hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b))
  return por(hi_lt, pand(hi_eq, lo_lt));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_cmpeq_epi64(a, b);
#else
  Packet4i tmp = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
  return Packet2l(pand<Packet4i>(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask))));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) {
  return por(pcmp_lt(a, b), pcmp_eq(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
  // Mask out invalid bool bits to avoid UB.
  const Packet16b kBoolMask = pset1<Packet16b>(true);
  return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
  return _mm_cmpeq_epi32(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
  Packet4f res;
  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
  Packet4f res = b;
  asm("minps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
  return res;
#else
  // Arguments are reversed to match NaN propagation behavior of std::min.
  return _mm_min_ps(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
  Packet2d res;
  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
  Packet2d res = b;
  asm("minpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
  return res;
#else
  // Arguments are reversed to match NaN propagation behavior of std::min.
  return _mm_min_pd(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
  Packet2l a_lt_mask = pcmp_lt(a, b);
  return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask));
}
template <>
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a, b);
#else
  // after some bench, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmplt_epi32(a, b);
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epu32(a, b);
#else
  return padd((Packet4ui)pmin((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                              (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
              pset1<Packet4ui>(0x80000000UL));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
  Packet4f res;
  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
  Packet4f res = b;
  asm("maxps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
  return res;
#else
  // Arguments are reversed to match NaN propagation behavior of std::max.
  return _mm_max_ps(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
  Packet2d res;
  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
  Packet2d res = b;
  asm("maxpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
  return res;
#else
  // Arguments are reversed to match NaN propagation behavior of std::max.
  return _mm_max_pd(b, a);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
  Packet2l a_lt_mask = pcmp_lt(a, b);
  return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask));
}
template <>
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a, b);
#else
  // after some bench, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmpgt_epi32(a, b);
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epu32(a, b);
#else
  return padd((Packet4ui)pmax((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                              (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
              pset1<Packet4ui>(0x80000000UL));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
#else
  return (Packet4ui)pcmp_lt((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                            (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
  return pcmp_eq(a, pmin(a, b));
#else
  return (Packet4ui)pcmp_le((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
                            (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
#endif
}

template <typename Packet, typename Op>
EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
  // always return a if either a or b is NaN.
  Packet not_nan_mask_a = pcmp_eq(a, a);
  Packet m = op(a, b);
  return pselect<Packet>(not_nan_mask_a, m, b);
}

template <typename Packet, typename Op>
EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
  // always return a if either a or b is NaN.
  Packet not_nan_mask_a = pcmp_eq(a, a);
  Packet m = op(b, a);
  return pselect<Packet>(not_nan_mask_a, m, a);
}

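// Concretely (illustrative values): with a = {NaN, 1.0f, ...} and
// b = {2.0f, NaN, ...}, the PropagateNumbers specializations below return
// {2.0f, 1.0f, ...} (prefer the number), while the PropagateNaN ones return
// {NaN, NaN, ...} (prefer the NaN); when both lanes hold numbers the two
// variants agree with plain pmin/pmax.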
// Add specializations for min/max with prescribed NaN propagation.
template <>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
}
template <>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
}

template <>
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
  return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
}
template <>
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
  Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
#ifdef EIGEN_VECTORIZE_AVX
  return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#else
  return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#endif  // EIGEN_VECTORIZE_AVX
}
template <>
EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) {
  return _mm_srai_epi32(a, 31);
}
template <>
EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
  return pzero(a);
}
template <>
EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) {
  Packet4i tmp = psignbit<Packet4i>(Packet4i(a));
  return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
}

template <int N>
EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
  // There is no _mm_srai_epi64 before AVX-512: combine a logical shift with the smeared sign bits.
  Packet2l signbit = psignbit(a);
  return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N));
}
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
  return _mm_srli_epi64(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
  return _mm_slli_epi64(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
  return _mm_srai_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
  return _mm_srli_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
  return _mm_slli_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
  // For unsigned values the arithmetic and logical right shifts coincide.
  return _mm_srli_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
  return _mm_srli_epi32(a, N);
}
template <int N>
EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
  return _mm_slli_epi32(a, N);
}

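// Shift semantics at a glance (illustrative values): for a Packet4i lane
// holding -8 (0xFFFFFFF8), parithmetic_shift_right<2> gives -2 (sign-extending),
// plogical_shift_right<2> gives 0x3FFFFFFE (zero-filling), and
// plogical_shift_left<2> gives -32 regardless of signedness.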
template <>
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
  const __m128i mask = _mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF);
  return _mm_castsi128_ps(_mm_and_si128(mask, _mm_castps_si128(a)));
}
template <>
EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
  const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
  return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));
}
template <>
EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
  Packet2l signbit = psignbit(a);
  return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit);
}
template <>
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
  Packet4i signbit = psignbit(a);
  return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
  return a;
}

#ifdef EIGEN_VECTORIZE_SSE4_1
template <>
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
  // Unfortunately _mm_round_ps doesn't have a rounding mode to implement numext::round.
  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}

template <>
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}

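// How the pround trick works (a sketch of the reasoning): por(pand(a, mask), prev0dot5)
// produces the largest float strictly below 0.5 (0x3EFFFFFF ~= 0.49999997f) with the
// sign of `a` copied onto it. Adding it and truncating toward zero then rounds half
// away from zero as numext::round requires; e.g. 2.5f + 0.49999997f rounds to exactly
// 3.0f in float arithmetic, which truncates to 3, while using 0.5f itself would
// incorrectly bump values just below 0.5f up to 1.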
template <>
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
  return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
}
template <>
EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
  return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
}

template <>
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
  return _mm_ceil_ps(a);
}
template <>
EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
  return _mm_ceil_pd(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
  return _mm_floor_ps(a);
}
template <>
EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
  return _mm_floor_pd(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
  return _mm_round_ps(a, _MM_FROUND_TRUNC);
}
template <>
EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
  return _mm_round_pd(a, _MM_FROUND_TRUNC);
}
#endif

template <>
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from);
}
template <>
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}

template <>
EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_ps(from);
}

template <>
EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_pd(from);
}
template <>
EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}

// Load the lower half of the packet and zero-extend the upper half.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
template <>
EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
}
template <>
EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
}

// Load a single scalar into the lowest lane, zeroing the rest.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
template <>
EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from);
}
template <>
EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
}

template <>
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
template <>
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
  return pset1<Packet2d>(from[0]);
}
template <>
EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
  return pset1<Packet2l>(from[0]);
}
template <>
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
  Packet4i tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
template <>
EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
  Packet4ui tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
}

// Loads 8 bools from memory and returns the packet
// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
template <>
EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
  return _mm_unpacklo_epi8(tmp, tmp);
}

// Loads 4 bools from memory and returns the packet
// {b0, b0, b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
template <>
EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
  tmp = _mm_unpacklo_epi8(tmp, tmp);
  return _mm_unpacklo_epi16(tmp, tmp);
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}

template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}

// Store the lower half of the packet.
template <typename Scalar, typename Packet>
EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
template <>
EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
}

// Store only the lowest element of the packet.
template <typename Scalar, typename Packet>
EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
template <>
EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
}

template <>
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
  return _mm_shuffle_ps(a, a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
  return _mm_shuffle_pd(a, a, 0x1);
}
template <>
EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
  return _mm_castpd_si128(preverse(_mm_castsi128_pd(a)));
}
template <>
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  return _mm_shuffle_epi32(a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
  return _mm_shuffle_epi32(a, 0x1B);
}
template <>
EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  return _mm_shuffle_epi8(a, mask);
#else
  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
#endif
}

#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010.
// Direct access to the struct members fixed bug #62.
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  return a.m128_f32[0];
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  return a.m128d_f64[0];
}
template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
  int64_t x = _mm_extract_epi64_0(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  int x = _mm_cvtsi128_si32(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
  return x;
}
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010.
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  float x = _mm_cvtss_f32(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  double x = _mm_cvtsd_f64(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
  int64_t x = _mm_extract_epi64_0(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  int x = _mm_cvtsi128_si32(a);
  return x;
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
  return x;
}
#else
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  return _mm_cvtss_f32(a);
}
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  return _mm_cvtsd_f64(a);
}
template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
  return _mm_extract_epi64_0(a);
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  return _mm_cvtsi128_si32(a);
}
template <>
EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
}
#endif
1643template <>
1644EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
1645 int x = _mm_cvtsi128_si32(a);
1646 return static_cast<bool>(x & 1);
1647}
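// Illustrative sketch (not part of Eigen): pfirst extracts element 0 of a
// packet, so combining it with preverse yields the last element:
//   Packet2d p = _mm_set_pd(2.0, 1.0);  // {1, 2}
//   double lo = pfirst(p);              // 1.0
//   double hi = pfirst(preverse(p));    // 2.0, the pattern used by pscatter below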
1648
1649template <>
1650EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1651 return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1652}
1653template <>
1654EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1655 return _mm_set_pd(from[1 * stride], from[0 * stride]);
1656}
1657template <>
1658EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
1659 return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
1660}
1661template <>
1662EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
1663 return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1664}
1665template <>
1666EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
1667 return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
1668 numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
1669}
1670
1671template <>
1672EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
1673 return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
1674 from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
1675 from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
1676 from[0 * stride]);
1677}
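// Illustrative sketch (not part of Eigen): pgather collects elements spaced
// `stride` apart, e.g. loading one column of a row-major 4x4 float matrix:
//   float m[16];  // row-major 4x4 matrix, filled elsewhere
//   Packet4f col0 = pgather<float, Packet4f>(m, 4);  // {m[0], m[4], m[8], m[12]}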
1678
1679template <>
1680EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1681 to[stride * 0] = pfirst(from);
1682 to[stride * 1] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 1)));
1683 to[stride * 2] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 2)));
1684 to[stride * 3] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 3)));
1685}
1686template <>
1687EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
1688 to[stride * 0] = pfirst(from);
1689 to[stride * 1] = pfirst(preverse(from));
1690}
1691template <>
1692EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride) {
1693 to[stride * 0] = pfirst(from);
1694 to[stride * 1] = pfirst(preverse(from));
1695}
1696template <>
1697EIGEN_STRONG_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
1698 to[stride * 0] = _mm_cvtsi128_si32(from);
1699 to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
1700 to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
1701 to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
1702}
1703template <>
1704EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
1705 to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
1706 to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
1707 to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
1708 to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
1709}
1710template <>
1711EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
1712 EIGEN_ALIGN16 bool tmp[16];
1713 pstore(tmp, from);
1714 to[stride * 0] = tmp[0];
1715 to[stride * 1] = tmp[1];
1716 to[stride * 2] = tmp[2];
1717 to[stride * 3] = tmp[3];
1718 to[stride * 4] = tmp[4];
1719 to[stride * 5] = tmp[5];
1720 to[stride * 6] = tmp[6];
1721 to[stride * 7] = tmp[7];
1722 to[stride * 8] = tmp[8];
1723 to[stride * 9] = tmp[9];
1724 to[stride * 10] = tmp[10];
1725 to[stride * 11] = tmp[11];
1726 to[stride * 12] = tmp[12];
1727 to[stride * 13] = tmp[13];
1728 to[stride * 14] = tmp[14];
1729 to[stride * 15] = tmp[15];
1730}
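// Illustrative sketch (not part of Eigen): pscatter is the inverse of pgather,
// so the pair implements a strided copy; `src`, `dst` and `stride` here are
// hypothetical:
//   Packet4f p = pgather<float, Packet4f>(src, stride);
//   pscatter<float, Packet4f>(dst, p, stride);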
1731
1732// Some compilers might be tempted to perform multiple scalar moves instead of using a vector path.
1733template <>
1734EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) {
1735 Packet4f pa = _mm_set_ss(a);
1736 pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
1737}
1738// Some compilers might be tempted to perform multiple scalar moves instead of using a vector path.
1739template <>
1740EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) {
1741 Packet2d pa = _mm_set_sd(a);
1742 pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
1743}
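// Illustrative sketch (not part of Eigen): pstore1 broadcasts one scalar into
// every lane of an aligned destination:
//   EIGEN_ALIGN16 float out[4];
//   pstore1<Packet4f>(out, 3.5f);  // out == {3.5, 3.5, 3.5, 3.5}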
1744
1745#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
1746typedef const void* SsePrefetchPtrType;
1747#else
1748typedef const char* SsePrefetchPtrType;
1749#endif
1750
1751#ifndef EIGEN_VECTORIZE_AVX
1752template <>
1753EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1754 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1755}
1756template <>
1757EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1758 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1759}
1760template <>
1761EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1762 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1763}
1764template <>
1765EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
1766 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1767}
1768template <>
1769EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
1770 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1771}
1772#endif
1773
1774template <>
1775EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1776 return pfrexp_generic(a, exponent);
1777}
1778
1779// Extract the biased exponent without requiring 64-bit integer (Packet2l) arithmetic.
1780template <>
1781EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
1782 const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
1783 __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
1784 return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
1785}
1786
1787template <>
1788EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
1789 return pfrexp_generic(a, exponent);
1790}
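// Illustrative sketch (not part of Eigen): like std::frexp, pfrexp splits each
// element a into a mantissa m with |m| in [0.5, 1) and an exponent e such that
// a == m * 2^e, both returned as packets:
//   Packet2d e;
//   Packet2d m = pfrexp<Packet2d>(pset1<Packet2d>(8.0), e);  // m == {0.5, 0.5}, e == {4, 4}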
1791
1792template <>
1793EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1794 return pldexp_generic(a, exponent);
1795}
1796
1797// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
1798// supported by SSE, and has more range than is needed for exponents.
1799template <>
1800EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1801 // Clamp exponent to [-2099, 2099]
1802 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
1803 const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
1804
1805 // Convert e to integer and swizzle it into the low 32 bits of each 64-bit lane.
1806 const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1807
1808 // Split 2^e into four factors and multiply:
1809 const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1810 Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4)
1811 Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b
1812 Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
1813 b = psub(psub(psub(ei, b), b), b); // e - 3b
1814 c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b)
1815 out = pmul(out, c); // a * 2^e
1816 return out;
1817}
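// Worked example of the split above (illustrative, not part of Eigen): for the
// extreme e = 2099, b = floor(e/4) = 524 and e - 3b = 527, so each of the four
// factors 2^b, 2^b, 2^b, 2^(e-3b) has a biased exponent within [0, 2046] and
// none of the intermediate scale factors overflows.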
1818
1819// We likewise specialize pldexp_fast, since the generic implementation uses Packet2l, which is not well
1820// supported by SSE, and has more range than is needed for exponents.
1821template <>
1822EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1823 // Clamp exponent to [-1023, 1024]
1824 const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
1825 const Packet2d max_exponent = pset1<Packet2d>(1024.0);
1826 const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
1827
1828 // Convert e to integer and swizzle it into the low 32 bits of each 64-bit lane.
1829 const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1830
1831 // Compute 2^e and multiply:
1832 const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1833 const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52)); // 2^e
1834 return pmul(a, c);
1835}
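// Illustrative note (not part of Eigen): pldexp_fast builds 2^e with a single
// bias-and-shift instead of the factor split used by pldexp above, so it is
// intended for exponents and results in the normal range; hence the tighter
// clamp to [-1023, 1024].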
1836
1837// With AVX, the default implementations based on pload1 are faster.
1838#ifndef __AVX__
1839template <>
1840EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
1841 a3 = pload<Packet4f>(a);
1842 a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
1843 a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
1844 a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
1845 a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
1846}
1847template <>
1848EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
1849 Packet2d& a3) {
1850#ifdef EIGEN_VECTORIZE_SSE3
1851 a0 = _mm_loaddup_pd(a + 0);
1852 a1 = _mm_loaddup_pd(a + 1);
1853 a2 = _mm_loaddup_pd(a + 2);
1854 a3 = _mm_loaddup_pd(a + 3);
1855#else
1856 a1 = pload<Packet2d>(a);
1857 a0 = vec2d_swizzle1(a1, 0, 0);
1858 a1 = vec2d_swizzle1(a1, 1, 1);
1859 a3 = pload<Packet2d>(a + 2);
1860 a2 = vec2d_swizzle1(a3, 0, 0);
1861 a3 = vec2d_swizzle1(a3, 1, 1);
1862#endif
1863}
1864#endif
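// Illustrative sketch (not part of Eigen): pbroadcast4 expands four
// consecutive scalars into four broadcast packets:
//   EIGEN_ALIGN16 float a[4] = {1.f, 2.f, 3.f, 4.f};
//   Packet4f a0, a1, a2, a3;
//   pbroadcast4<Packet4f>(a, a0, a1, a2, a3);  // a0 == {1,1,1,1} ... a3 == {4,4,4,4}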
1865
1866EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
1867 vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
1868 vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
1869 vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
1870 vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
1871}
1872
1873EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1874 _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
1875}
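// Illustrative sketch (not part of Eigen): each ptranspose overload here
// transposes, in place, the square matrix whose rows are kernel.packet[0..N-1]:
//   PacketBlock<Packet4f, 4> k;  // rows filled elsewhere
//   ptranspose(k);  // element j of k.packet[i] is now old element i of k.packet[j]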
1876
1877EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1878 __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
1879 kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
1880 kernel.packet[1] = tmp;
1881}
1882
1883EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
1884 __m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
1885 kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
1886 kernel.packet[1] = tmp;
1887}
1888
1889EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
1890 __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
1891 __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
1892 __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
1893 __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
1894
1895 kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
1896 kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
1897 kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
1898 kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
1899}
1900EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
1901 ptranspose((PacketBlock<Packet4i, 4>&)kernel);
1902}
1903
1904EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
1905 __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
1906 __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
1907 __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
1908 __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
1909 kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
1910 kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
1911 kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
1912 kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
1913}
1914
1915EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
1916 // If we number the elements in the input thus:
1917 // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
1918 // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
1919 // ...
1920 // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
1921 //
1922 // the desired output is:
1923 // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
1924 // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
1925 // ...
1926 // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
1927 __m128i t0 =
1928 _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1929 __m128i t1 =
1930 _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
1931 __m128i t2 =
1932 _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37
1933 __m128i t3 =
1934 _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f
1935 __m128i t4 =
1936 _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 ... 47 57
1937 __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a ... 4f 5f
1938 __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
1939 __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
1940 __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
1941 __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
1942 __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
1943 __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
1944 __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
1945 __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
1946 __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
1947 __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
1948
1949 __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1950 __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
1951 __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
1952 __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
1953 __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1954 __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
1955 __m128i s6 = _mm_unpacklo_epi16(t5, t7);
1956 __m128i s7 = _mm_unpackhi_epi16(t5, t7);
1957 __m128i s8 = _mm_unpacklo_epi16(t8, ta);
1958 __m128i s9 = _mm_unpackhi_epi16(t8, ta);
1959 __m128i sa = _mm_unpacklo_epi16(t9, tb);
1960 __m128i sb = _mm_unpackhi_epi16(t9, tb);
1961 __m128i sc = _mm_unpacklo_epi16(tc, te);
1962 __m128i sd = _mm_unpackhi_epi16(tc, te);
1963 __m128i se = _mm_unpacklo_epi16(td, tf);
1964 __m128i sf = _mm_unpackhi_epi16(td, tf);
1965
1966 __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1967 __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1968 __m128i u2 = _mm_unpacklo_epi32(s1, s5);
1969 __m128i u3 = _mm_unpackhi_epi32(s1, s5);
1970 __m128i u4 = _mm_unpacklo_epi32(s2, s6);
1971 __m128i u5 = _mm_unpackhi_epi32(s2, s6);
1972 __m128i u6 = _mm_unpacklo_epi32(s3, s7);
1973 __m128i u7 = _mm_unpackhi_epi32(s3, s7);
1974 __m128i u8 = _mm_unpacklo_epi32(s8, sc);
1975 __m128i u9 = _mm_unpackhi_epi32(s8, sc);
1976 __m128i ua = _mm_unpacklo_epi32(s9, sd);
1977 __m128i ub = _mm_unpackhi_epi32(s9, sd);
1978 __m128i uc = _mm_unpacklo_epi32(sa, se);
1979 __m128i ud = _mm_unpackhi_epi32(sa, se);
1980 __m128i ue = _mm_unpacklo_epi32(sb, sf);
1981 __m128i uf = _mm_unpackhi_epi32(sb, sf);
1982
1983 kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
1984 kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
1985 kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
1986 kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
1987 kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
1988 kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
1989 kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
1990 kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
1991 kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
1992 kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
1993 kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
1994 kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
1995 kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
1996 kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
1997 kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
1998 kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
1999}
2000
2001EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
2002 return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2003}
2004
2005EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {
2006 return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
2007}
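// Illustrative note (not part of Eigen): `0 - select[i]` maps a boolean
// selector to an all-ones or all-zeros lane, i.e. the bit-mask form that
// pselect expects.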
2008
2009template <>
2010EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
2011 const Packet2l& elsePacket) {
2012 const __m128i true_mask = sse_blend_mask(ifPacket);
2013 return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
2014}
2015template <>
2016EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
2017 const Packet4i& elsePacket) {
2018 const __m128i true_mask = sse_blend_mask(ifPacket);
2019 return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
2020}
2021template <>
2022EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
2023 const Packet4ui& elsePacket) {
2024 return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
2025}
2026template <>
2027EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
2028 const Packet4f& elsePacket) {
2029 const __m128i true_mask = sse_blend_mask(ifPacket);
2030 return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
2031}
2032template <>
2033EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
2034 const Packet2d& elsePacket) {
2035 const __m128i true_mask = sse_blend_mask(ifPacket);
2036 return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
2037}
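// Illustrative sketch (not part of Eigen): pblend picks lanes from thenPacket
// where the selector is true and from elsePacket otherwise; t and e below are
// hypothetical packets:
//   Selector<4> s;
//   s.select[0] = true; s.select[1] = false; s.select[2] = false; s.select[3] = true;
//   Packet4f r = pblend(s, t, e);  // {t[0], e[1], e[2], t[3]}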
2038
2039// Scalar path for pmadd with FMA, to ensure consistency with the vectorized path.
2040#if defined(EIGEN_VECTORIZE_FMA)
2041template <>
2042EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
2043 return std::fmaf(a, b, c);
2044}
2045template <>
2046EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
2047 return std::fma(a, b, c);
2048}
2049template <>
2050EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
2051 return std::fmaf(a, b, -c);
2052}
2053template <>
2054EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
2055 return std::fma(a, b, -c);
2056}
2057template <>
2058EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
2059 return std::fmaf(-a, b, c);
2060}
2061template <>
2062EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
2063 return std::fma(-a, b, c);
2064}
2065template <>
2066EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
2067 return std::fmaf(-a, b, -c);
2068}
2069template <>
2070EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
2071 return std::fma(-a, b, -c);
2072}
2073#endif
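// Illustrative sketch (not part of Eigen): with FMA enabled, the scalar and
// packet paths now round identically, e.g.
//   float s = pmadd(2.f, 3.f, 1.f);  // std::fmaf(2.f, 3.f, 1.f) == 7.f
//   Packet4f v = pmadd(pset1<Packet4f>(2.f), pset1<Packet4f>(3.f), pset1<Packet4f>(1.f));
//   // pfirst(v) == s, with a single rounding in both cases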
2074
2075#ifdef EIGEN_VECTORIZE_SSE4_1
2076// Helpers for half->float and float->half conversions.
2077// Currently only used by the AVX code.
2078EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
2079 __m128i input = _mm_cvtepu16_epi32(h);
2080
2081 // Direct vectorization of half_to_float; the corresponding scalar C code appears in the comments.
2082 __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
2083 // o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
2084 __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
2085 // exp = shifted_exp & o.u; // just the exponent
2086 __m128i exp = _mm_and_si128(ou, shifted_exp);
2087 // o.u += (127 - 15) << 23;
2088 ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));
2089
2090 // Inf/NaN?
2091 __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
2092 // Inf/NaN adjust
2093 __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
2094 // extra exp adjust for Inf/NaN
2095 ou = _mm_add_epi32(ou, naninf_adj);
2096
2097 // Zero/Denormal?
2098 __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
2099 __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
2100 // o.u += 1 << 23;
2101 ou = _mm_add_epi32(ou, zeroden_adj);
2102 // magic.u = 113 << 23
2103 __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
2104 // o.f -= magic.f
2105 ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
2106
2107 __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
2108 // o.u |= (h.x & 0x8000) << 16; // sign bit
2109 ou = _mm_or_si128(ou, sign);
2110 // return o.f;
2111 // We actually return the integer (bit-pattern) version, so that
2112 // _mm256_insertf128_si256 can be applied to the result.
2113 return ou;
2114}
2115
2116EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
2117 // unsigned int sign_mask = 0x80000000u;
2118 __m128i sign = _mm_set1_epi32(0x80000000u);
2119 // unsigned int sign = f.u & sign_mask;
2120 sign = _mm_and_si128(sign, _mm_castps_si128(f));
2121 // f.u ^= sign;
2122 f = _mm_xor_ps(f, _mm_castsi128_ps(sign));
2123
2124 __m128i fu = _mm_castps_si128(f);
2125
2126 __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
2127 __m128i f32infty = _mm_set1_epi32(255 << 23);
2128 // if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
2129 // There is no _mm_cmpge_epi32, so use lt with the operands swapped.
2130 __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
2131 __m128i inf_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);
2132 __m128i nan_mask = _mm_andnot_si128(inf_mask, infnan_mask);
2133 __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7e00));
2134 __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7c00));
2135 // o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
2136 __m128i naninf_value = _mm_or_si128(inf_value, nan_value);
2137
2138 __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
2139 __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
2140 // f.f += denorm_magic.f;
2141 f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
2142 // f.u - denorm_magic.u
2143 __m128i o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
2144 o = _mm_and_si128(o, subnorm_mask);
2145 // Correct result for inf/nan/zero/subnormal, 0 otherwise
2146 o = _mm_or_si128(o, naninf_value);
2147
2148 __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
2149 o = _mm_and_si128(o, mask);
2150
2151 // mant_odd = (f.u >> 13) & 1;
2152 __m128i mant_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
2153 // f.u += 0xc8000fffU;
2154 fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
2155 // f.u += mant_odd;
2156 fu = _mm_add_epi32(fu, mant_odd);
2157 fu = _mm_andnot_si128(mask, fu);
2158 // f.u >> 13
2159 fu = _mm_srli_epi32(fu, 13);
2160 o = _mm_or_si128(fu, o);
2161
2162 // o.x |= static_cast<numext::uint16_t>(sign >> 16);
2163 o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));
2164
2165 // 16 bit values
2166 return _mm_and_si128(o, _mm_set1_epi32(0xffff));
2167}
2168#endif
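// Illustrative sketch (not part of Eigen): half2floatsse widens four packed
// halves (in the low 64 bits of its argument) to four floats, while float2half
// leaves each 16-bit result in the low half of its 32-bit lane, so repacking
// needs an extra step, e.g. _mm_packus_epi32 (SSE4.1):
//   __m128i h  = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));  // src: 4 x uint16_t
//   __m128  f  = _mm_castsi128_ps(half2floatsse(h));
//   __m128i h2 = _mm_packus_epi32(float2half(f), _mm_setzero_si128());    // halves back in low 64 bits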
2169
2170// Packet math for Eigen::half
2171// Disable the following code since it's broken on too many platforms / compilers.
2172// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
2173#if 0
2174
2175typedef struct {
2176 __m64 x;
2177} Packet4h;
2178
2179
2180template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
2181
2182template <>
2183struct packet_traits<Eigen::half> : default_packet_traits {
2184 typedef Packet4h type;
2185 // There is no half-size packet for Packet4h.
2186 typedef Packet4h half;
2187 enum {
2188 Vectorizable = 1,
2189 AlignedOnScalar = 1,
2190 size = 4,
2191 HasAdd = 1,
2192 HasSub = 1,
2193 HasMul = 1,
2194 HasDiv = 1,
2195 HasNegate = 0,
2196 HasAbs = 0,
2197 HasAbs2 = 0,
2198 HasMin = 0,
2199 HasMax = 0,
2200 HasConj = 0,
2201 HasSetLinear = 0,
2202 };
2203};
2204
2205
2206template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
2207
2208template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
2209 Packet4h result;
2210 result.x = _mm_set1_pi16(from.x);
2211 return result;
2212}
2213
2214template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
2215 return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
2216}
2217
2218template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
2219
2220template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
2221 __int64_t a64 = _mm_cvtm64_si64(a.x);
2222 __int64_t b64 = _mm_cvtm64_si64(b.x);
2223
2224 Eigen::half h[4];
2225
2226 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2227 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2228 h[0] = ha + hb;
2229 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2230 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2231 h[1] = ha + hb;
2232 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2233 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2234 h[2] = ha + hb;
2235 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2236 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2237 h[3] = ha + hb;
2238 Packet4h result;
2239 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2240 return result;
2241}
2242
2243template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
2244 __int64_t a64 = _mm_cvtm64_si64(a.x);
2245 __int64_t b64 = _mm_cvtm64_si64(b.x);
2246
2247 Eigen::half h[4];
2248
2249 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2250 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2251 h[0] = ha - hb;
2252 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2253 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2254 h[1] = ha - hb;
2255 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2256 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2257 h[2] = ha - hb;
2258 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2259 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2260 h[3] = ha - hb;
2261 Packet4h result;
2262 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2263 return result;
2264}
2265
2266template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
2267 __int64_t a64 = _mm_cvtm64_si64(a.x);
2268 __int64_t b64 = _mm_cvtm64_si64(b.x);
2269
2270 Eigen::half h[4];
2271
2272 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2273 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2274 h[0] = ha * hb;
2275 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2276 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2277 h[1] = ha * hb;
2278 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2279 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2280 h[2] = ha * hb;
2281 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2282 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2283 h[3] = ha * hb;
2284 Packet4h result;
2285 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2286 return result;
2287}
2288
2289template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
2290 __int64_t a64 = _mm_cvtm64_si64(a.x);
2291 __int64_t b64 = _mm_cvtm64_si64(b.x);
2292
2293 Eigen::half h[4];
2294
2295 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2296 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2297 h[0] = ha / hb;
2298 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2299 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2300 h[1] = ha / hb;
2301 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2302 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2303 h[2] = ha / hb;
2304 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2305 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2306 h[3] = ha / hb;
2307 Packet4h result;
2308 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2309 return result;
2310}
2311
2312template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
2313 Packet4h result;
2314 result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
2315 return result;
2316}
2317
2318template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
2319 Packet4h result;
2320 result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
2321 return result;
2322}
2323
2324template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
2325 __int64_t r = _mm_cvtm64_si64(from.x);
2326 *(reinterpret_cast<__int64_t*>(to)) = r;
2327}
2328
2329template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
2330 __int64_t r = _mm_cvtm64_si64(from.x);
2331 *(reinterpret_cast<__int64_t*>(to)) = r;
2332}
2333
2334template<> EIGEN_STRONG_INLINE Packet4h
2335ploadquad<Packet4h>(const Eigen::half* from) {
2336 return pset1<Packet4h>(*from);
2337}
2338
2339template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
2340{
2341 Packet4h result;
2342 result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
2343 return result;
2344}
2345
2346template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
2347{
2348 __int64_t a = _mm_cvtm64_si64(from.x);
2349 to[stride*0].x = static_cast<unsigned short>(a);
2350 to[stride*1].x = static_cast<unsigned short>(a >> 16);
2351 to[stride*2].x = static_cast<unsigned short>(a >> 32);
2352 to[stride*3].x = static_cast<unsigned short>(a >> 48);
2353}
2354
2355EIGEN_STRONG_INLINE void
2356ptranspose(PacketBlock<Packet4h,4>& kernel) {
2357 __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
2358 __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
2359 __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
2360 __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
2361
2362 kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
2363 kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
2364 kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
2365 kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
2366}
2367
2368#endif
2369
2370} // end namespace internal
2371
2372} // end namespace Eigen
2373
2374#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
2375// PGI++ does not define the following intrinsics in C++ mode.
2376static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
2377static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
2378static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
2379static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
2380static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
2381static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
2382#endif
2383
2384#endif // EIGEN_PACKET_MATH_SSE_H