Eigen  5.0.1-dev+7c7d8473
 
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_PACKET_MATH_SSE_H
11#define EIGEN_PACKET_MATH_SSE_H
12
13#include <cstdint>
14// IWYU pragma: private
15#include "../../InternalHeaderCheck.h"
16
17namespace Eigen {
18
19namespace internal {
20
21#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
22#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
23#endif
24
25#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
26// 32 bits => 8 registers
27// 64 bits => 16 registers
28#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
29#endif
30
31#ifdef EIGEN_VECTORIZE_FMA
32#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
33#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
34#endif
35#endif
36
37#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
38 (__GXX_ABI_VERSION < 1004)) || \
39 EIGEN_OS_QNX
40// With GCC's default ABI version, __m128 and __m256 are the same type, and therefore we cannot
41// have overloads for both types without a linker error.
42// One solution is to increase the ABI version using -fabi-version=4 (or greater).
43// Otherwise, we work around this inconvenience by wrapping the 128-bit types in the following helper
44// structure:
45typedef eigen_packet_wrapper<__m128> Packet4f;
46typedef eigen_packet_wrapper<__m128d> Packet2d;
47#else
48typedef __m128 Packet4f;
49typedef __m128d Packet2d;
50#endif
51
52typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
53typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
54typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
55typedef eigen_packet_wrapper<__m128i, 5> Packet2l;
56
57template <>
58struct is_arithmetic<__m128> {
59 enum { value = true };
60};
61template <>
62struct is_arithmetic<__m128i> {
63 enum { value = true };
64};
65template <>
66struct is_arithmetic<__m128d> {
67 enum { value = true };
68};
69template <>
70struct is_arithmetic<Packet4i> {
71 enum { value = true };
72};
73template <>
74struct is_arithmetic<Packet2l> {
75 enum { value = true };
76};
77// Note that `Packet4ui` uses the underlying type `__m128i`, which is
78// interpreted as a vector of _signed_ `int32`s; this breaks some arithmetic
79// operations used in `GenericPacketMath.h`.
80template <>
81struct is_arithmetic<Packet4ui> {
82 enum { value = false };
83};
84template <>
85struct is_arithmetic<Packet16b> {
86 enum { value = true };
87};
88
89template <int p, int q, int r, int s>
90struct shuffle_mask {
91 enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
92};
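// As a sanity check, the mask packs the four 2-bit lane indices little-endian:
// the identity shuffle {0, 1, 2, 3} encodes as 0b11100100 == 0xE4, and the
// reversal {3, 2, 1, 0} as 0x1B (the constant used by preverse further down).
static_assert(shuffle_mask<0, 1, 2, 3>::mask == 0xE4, "identity shuffle");
static_assert(shuffle_mask<3, 2, 1, 0>::mask == 0x1B, "reversing shuffle");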
93
94// TODO: change the implementation of all swizzle* ops from macros to templates.
95#define vec4f_swizzle1(v, p, q, r, s) \
96 Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))
97
98#define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))
99
100#define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))
101
102#define vec2d_swizzle1(v, p, q) \
103 Packet2d(_mm_castsi128_pd( \
104 _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * p, 2 * p + 1, 2 * q, 2 * q + 1>::mask))))
105
106#define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))
107
108#define vec4i_swizzle2(a, b, p, q, r, s) \
109 Packet4i( \
110 _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))
111
112#define vec4ui_swizzle2(a, b, p, q, r, s) Packet4i(vec4i_swizzle2(a, b, p, q, r, s))
113
114EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
115 return Packet4f(_mm_movelh_ps(a, b));
116}
117EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
118 return Packet4f(_mm_movehl_ps(a, b));
119}
120EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
121 return Packet4f(_mm_unpacklo_ps(a, b));
122}
123EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
124 return Packet4f(_mm_unpackhi_ps(a, b));
125}
126#define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)
127
128#define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))
129
130EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
131 return Packet2d(_mm_unpacklo_pd(a, b));
132}
133EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
134 return Packet2d(_mm_unpackhi_pd(a, b));
135}
136#define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)
137
138#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
139
140#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)
141
142#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
143
144#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
145
146#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
147
148// Work around lack of extract/cvt for epi64 when compiling for 32-bit.
149#if EIGEN_ARCH_x86_64
150EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) { return _mm_cvtsi128_si64(a); }
151#ifdef EIGEN_VECTORIZE_SSE4_1
152EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) { return _mm_extract_epi64(a, 1); }
153#else
154EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
155 return _mm_cvtsi128_si64(_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
156}
157#endif
158#else
159// epi64 instructions are not available. The following seems to generate the same instructions
160// with -O2 in GCC/Clang.
161EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) {
162 return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_castsi128_pd(a)));
163}
164EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
165 return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
166}
167#endif
168
169// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
170// to leverage AVX instructions.
171#ifndef EIGEN_VECTORIZE_AVX
172template <>
173struct packet_traits<float> : default_packet_traits {
174 typedef Packet4f type;
175 typedef Packet4f half;
176 enum {
177 Vectorizable = 1,
178 AlignedOnScalar = 1,
179 size = 4,
180
181 HasCmp = 1,
182 HasDiv = 1,
183 HasReciprocal = EIGEN_FAST_MATH,
184 HasSin = EIGEN_FAST_MATH,
185 HasCos = EIGEN_FAST_MATH,
186 HasACos = 1,
187 HasASin = 1,
188 HasATan = 1,
189 HasATanh = 1,
190 HasLog = 1,
191 HasLog1p = 1,
192 HasExpm1 = 1,
193 HasNdtri = 1,
194 HasExp = 1,
195 HasPow = 1,
196 HasBessel = 1,
197 HasSqrt = 1,
198 HasRsqrt = 1,
199 HasCbrt = 1,
200 HasTanh = EIGEN_FAST_MATH,
201 HasErf = EIGEN_FAST_MATH,
202 HasErfc = EIGEN_FAST_MATH,
203 HasBlend = 1,
204 HasSign = 0 // The manually vectorized version is slightly slower for SSE.
205 };
206};
207template <>
208struct packet_traits<double> : default_packet_traits {
209 typedef Packet2d type;
210 typedef Packet2d half;
211 enum {
212 Vectorizable = 1,
213 AlignedOnScalar = 1,
214 size = 2,
215
216 HasCmp = 1,
217 HasDiv = 1,
218 HasSin = EIGEN_FAST_MATH,
219 HasCos = EIGEN_FAST_MATH,
220 HasTanh = EIGEN_FAST_MATH,
221 HasErf = EIGEN_FAST_MATH,
222 HasErfc = EIGEN_FAST_MATH,
223 HasLog = 1,
224 HasExp = 1,
225 HasLog1p = 1,
226 HasExpm1 = 1,
227 HasPow = 1,
228 HasSqrt = 1,
229 HasRsqrt = 1,
230 HasCbrt = 1,
231 HasATan = 1,
232 HasATanh = 1,
233 HasBlend = 1
234 };
235};
236template <>
237struct packet_traits<int> : default_packet_traits {
238 typedef Packet4i type;
239 typedef Packet4i half;
240 enum {
241 Vectorizable = 1,
242 AlignedOnScalar = 1,
243 size = 4,
244
245 HasCmp = 1,
246 HasDiv = 1,
247 HasShift = 1,
248 HasBlend = 1
249 };
250};
251template <>
252struct packet_traits<uint32_t> : default_packet_traits {
253 typedef Packet4ui type;
254 typedef Packet4ui half;
255 enum {
256 Vectorizable = 1,
257 AlignedOnScalar = 1,
258 size = 4,
259
260 HasDiv = 0,
261 HasNegate = 0,
262 HasCmp = 1,
263 HasShift = 1,
264 HasBlend = 1
265 };
266};
267template <>
268struct packet_traits<int64_t> : default_packet_traits {
269 typedef Packet2l type;
270 typedef Packet2l half;
271 enum {
272 Vectorizable = 1,
273 AlignedOnScalar = 1,
274 size = 2,
275
276 HasDiv = 0,
277 HasCmp = 1,
278 HasShift = 1,
279 HasBlend = 1
280 };
281};
282#endif
283template <>
284struct packet_traits<bool> : default_packet_traits {
285 typedef Packet16b type;
286 typedef Packet16b half;
287 enum {
288 Vectorizable = 1,
289 AlignedOnScalar = 1,
290 size = 16,
291
292 HasCmp = 1,
293 HasShift = 0,
294 HasAbs = 0,
295 HasMin = 0,
296 HasMax = 0,
297 HasConj = 0,
298 HasSqrt = 1,
299 HasNegate = 0,
300 HasSign = 0 // Don't try to vectorize psign<bool> = identity.
301 };
302};
303
304template <>
305struct unpacket_traits<Packet4f> {
306 typedef float type;
307 typedef Packet4f half;
308 typedef Packet4i integer_packet;
309 enum {
310 size = 4,
311 alignment = Aligned16,
312 vectorizable = true,
313 masked_load_available = false,
314 masked_store_available = false
315 };
316};
317template <>
318struct unpacket_traits<Packet2d> {
319 typedef double type;
320 typedef Packet2d half;
321 typedef Packet2l integer_packet;
322 enum {
323 size = 2,
324 alignment = Aligned16,
325 vectorizable = true,
326 masked_load_available = false,
327 masked_store_available = false
328 };
329};
330template <>
331struct unpacket_traits<Packet2l> {
332 typedef int64_t type;
333 typedef Packet2l half;
334 enum {
335 size = 2,
336 alignment = Aligned16,
337 vectorizable = true,
338 masked_load_available = false,
339 masked_store_available = false
340 };
341};
342template <>
343struct unpacket_traits<Packet4i> {
344 typedef int type;
345 typedef Packet4i half;
346 enum {
347 size = 4,
348 alignment = Aligned16,
349 vectorizable = true,
350 masked_load_available = false,
351 masked_store_available = false
352 };
353};
354template <>
355struct unpacket_traits<Packet4ui> {
356 typedef uint32_t type;
357 typedef Packet4ui half;
358 enum {
359 size = 4,
360 alignment = Aligned16,
361 vectorizable = true,
362 masked_load_available = false,
363 masked_store_available = false
364 };
365};
366template <>
367struct unpacket_traits<Packet16b> {
368 typedef bool type;
369 typedef Packet16b half;
370 enum {
371 size = 16,
372 alignment = Aligned16,
373 vectorizable = true,
374 masked_load_available = false,
375 masked_store_available = false
376 };
377};
378
379#ifndef EIGEN_VECTORIZE_AVX
380template <>
381struct scalar_div_cost<float, true> {
382 enum { value = 7 };
383};
384template <>
385struct scalar_div_cost<double, true> {
386 enum { value = 8 };
387};
388#endif
389
390template <>
391EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
392 return _mm_set_ps1(from);
393}
394template <>
395EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
396 return _mm_set1_pd(from);
397}
398template <>
399EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
400 return _mm_set1_epi64x(from);
401}
402template <>
403EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
404 return _mm_set1_epi32(from);
405}
406template <>
407EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
408 return _mm_set1_epi32(numext::bit_cast<int32_t>(from));
409}
410template <>
411EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) {
412 return _mm_set1_epi8(static_cast<char>(from));
413}
414
415template <>
416EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
417 return _mm_castsi128_ps(pset1<Packet4i>(from));
418}
419template <>
420EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
421 return _mm_castsi128_pd(_mm_set1_epi64x(from));
422}
423
424template <>
425EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
426 return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
427}
428template <>
429EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) {
430 return _mm_set_epi32(0, 0, -1, -1);
431}
432template <>
433EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
434 return _mm_set_epi32(0, -1, 0, -1);
435}
436template <>
437EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) {
438 return _mm_set_epi32(0, -1, 0, -1);
439}
440template <>
441EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) {
442 return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1));
443}
444
445template <>
446EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
447 return _mm_setzero_ps();
448}
449template <>
450EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
451 return _mm_setzero_pd();
452}
453template <>
454EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) {
455 return _mm_setzero_si128();
456}
457template <>
458EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
459 return _mm_setzero_si128();
460}
461template <>
462EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) {
463 return _mm_setzero_si128();
464}
465
466// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
467// However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203).
468// Using inline assembly is not an option either, because then gcc fails to properly reorder the instructions.
469// Therefore, we introduced the pload1 functions, to be used in product kernels for which bug 203 does not apply.
470// Also note that with AVX, we want it to generate a vbroadcastss.
471#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
472template <>
473EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
474 return vec4f_swizzle1(_mm_load_ss(from), 0, 0, 0, 0);
475}
476#endif
477
478template <>
479EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
480 return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0));
481}
482template <>
483EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
484 return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
485}
486template <>
487EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
488 return _mm_add_epi32(pset1<Packet2l>(a), _mm_set_epi64x(1, 0));
489}
490template <>
491EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
492 return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
493}
494template <>
495EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
496 return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0));
497}
498
499template <>
500EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
501 return _mm_add_ps(a, b);
502}
503template <>
504EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
505 return _mm_add_pd(a, b);
506}
507template <>
508EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
509 return _mm_add_epi64(a, b);
510}
511template <>
512EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
513 return _mm_add_epi32(a, b);
514}
515template <>
516EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
517 return _mm_add_epi32(a, b);
518}
519
520template <>
521EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) {
522 return _mm_or_si128(a, b);
523}
524
525template <typename Packet>
526EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
527template <>
528EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) {
529 return _mm_add_ss(a, b);
530}
531template <>
532EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) {
533 return _mm_add_sd(a, b);
534}
535
536template <>
537EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
538 return _mm_sub_ps(a, b);
539}
540template <>
541EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
542 return _mm_sub_pd(a, b);
543}
544template <>
545EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
546 return _mm_sub_epi64(a, b);
547}
548template <>
549EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
550 return _mm_sub_epi32(a, b);
551}
552template <>
553EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
554 return _mm_sub_epi32(a, b);
555}
556template <>
557EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) {
558 return _mm_xor_si128(a, b);
559}
560
561template <>
562EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
563template <>
564EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
565#ifdef EIGEN_VECTORIZE_SSE3
566 return _mm_addsub_ps(a, b);
567#else
568 const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
569 return padd(a, pxor(mask, b));
570#endif
571}
572
573template <>
574EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
575template <>
576EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
577#ifdef EIGEN_VECTORIZE_SSE3
578 return _mm_addsub_pd(a, b);
579#else
580 const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
581 return padd(a, pxor(mask, b));
582#endif
583}
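// For reference, paddsub computes {a0 - b0, a1 + b1, ...}: subtract in even
// lanes, add in odd lanes. Without SSE3 this is emulated, as above, by
// flipping the sign bit of b's even lanes with pxor and then adding; the
// scalar equivalent of the Packet4f path is
//   for (int i = 0; i < 4; ++i) r[i] = (i % 2 == 0) ? a[i] - b[i] : a[i] + b[i];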
584
585template <>
586EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
587 const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
588 return _mm_xor_ps(a, mask);
589}
590template <>
591EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
592 const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
593 return _mm_xor_pd(a, mask);
594}
595template <>
596EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
597 return psub(pzero(a), a);
598}
599
600template <>
601EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
602 return psub(pzero(a), a);
603}
604
605template <>
606EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
607 return a;
608}
609template <>
610EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
611 return a;
612}
613template <>
614EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
615 return a;
616}
617template <>
618EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
619 return a;
620}
621
622template <>
623EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
624 return _mm_mul_ps(a, b);
625}
626template <>
627EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
628 return _mm_mul_pd(a, b);
629}
630template <>
631EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
632  // A full 64-bit multiply requires AVX-512, so do this with 32-bit multiplications
633 __m128i upper32_a = _mm_srli_epi64(a, 32);
634 __m128i upper32_b = _mm_srli_epi64(b, 32);
635
636 // upper * lower
637 __m128i mul1 = _mm_mul_epu32(upper32_a, b);
638 __m128i mul2 = _mm_mul_epu32(upper32_b, a);
639 // Gives us both upper*upper and lower*lower
640 __m128i mul3 = _mm_mul_epu32(a, b);
641
642 __m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32);
643 return _mm_add_epi64(high, mul3);
644}
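// The identity behind the fallback above: writing a = a_hi * 2^32 + a_lo and
// b = b_hi * 2^32 + b_lo, the product modulo 2^64 is
//   a * b = ((a_hi * b_lo + a_lo * b_hi) << 32) + a_lo * b_lo  (mod 2^64),
// the a_hi * b_hi term being shifted out entirely. Each _mm_mul_epu32 call
// multiplies the low 32 bits of each 64-bit lane, supplying exactly the three
// surviving partial products (mul1, mul2, mul3).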
645template <>
646EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
647#ifdef EIGEN_VECTORIZE_SSE4_1
648 return _mm_mullo_epi32(a, b);
649#else
650 // this version is slightly faster than 4 scalar products
651 return vec4i_swizzle1(
652 vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
653 0, 2, 0, 2),
654 0, 2, 1, 3);
655#endif
656}
657template <>
658EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
659#ifdef EIGEN_VECTORIZE_SSE4_1
660 return _mm_mullo_epi32(a, b);
661#else
662 // this version is slightly faster than 4 scalar products
663 return vec4ui_swizzle1(
664 vec4ui_swizzle2(_mm_mul_epu32(a, b),
665 _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
666 0, 2, 1, 3);
667#endif
668}
669
670template <>
671EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) {
672 return _mm_and_si128(a, b);
673}
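// On Packet16b the arithmetic ops reduce to Boolean algebra on 0/1 bytes:
// padd is a saturating add (OR), psub is subtraction modulo 2 (XOR), and pmul
// is multiplication (AND). For example, padd(true, true) == true rather than
// wrapping around.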
674
675template <>
676EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
677 return _mm_div_ps(a, b);
678}
679template <>
680EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
681 return _mm_div_pd(a, b);
682}
683
684template <>
685EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
686#ifdef EIGEN_VECTORIZE_AVX
687 return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
688#else
689 __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
690 __m128i q_hi = _mm_cvttpd_epi32(
691 _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
692 return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
693#endif
694}
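// A note on exactness: every int32 converts to double without rounding (the
// 53-bit significand covers 32 bits), so the truncated double quotient above
// matches C++ integer division wherever the latter is defined; the overflowing
// INT_MIN / -1 case is UB for the scalar operation as well.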
695
696#ifdef EIGEN_VECTORIZE_FMA
697template <>
698EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
699 return _mm_fmadd_ps(a, b, c);
700}
701template <>
702EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
703 return _mm_fmadd_pd(a, b, c);
704}
705template <>
706EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
707 return _mm_fmsub_ps(a, b, c);
708}
709template <>
710EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
711 return _mm_fmsub_pd(a, b, c);
712}
713template <>
714EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
715 return _mm_fnmadd_ps(a, b, c);
716}
717template <>
718EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
719 return _mm_fnmadd_pd(a, b, c);
720}
721template <>
722EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
723 return _mm_fnmsub_ps(a, b, c);
724}
725template <>
726EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
727 return _mm_fnmsub_pd(a, b, c);
728}
729
730template <typename Packet>
731EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
732template <>
733EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
734 return _mm_fmadd_ss(a, b, c);
735}
736template <>
737EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
738 return _mm_fmadd_sd(a, b, c);
739}
740#endif
741
742#ifdef EIGEN_VECTORIZE_SSE4_1
743template <>
744EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
745 return _mm_blendv_ps(b, a, mask);
746}
747
748template <>
749EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
750 return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask)));
751}
752
753template <>
754EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
755 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
756}
757
758template <>
759EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
760 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
761}
762
763template <>
764EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
765 return _mm_blendv_pd(b, a, mask);
766}
767#endif
768
769template <>
770EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(const Packet2l& a) {
771 return _mm_cmpeq_epi32(a, a);
772}
773template <>
774EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
775 return _mm_cmpeq_epi32(a, a);
776}
777template <>
778EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& /*a*/) {
779 return pset1<Packet16b>(true);
780}
781template <>
782EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
783 Packet4i b = _mm_castps_si128(a);
784 return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
785}
786template <>
787EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
788 Packet4i b = _mm_castpd_si128(a);
789 return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
790}
791
792template <>
793EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
794 return _mm_and_ps(a, b);
795}
796template <>
797EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
798 return _mm_and_pd(a, b);
799}
800template <>
801EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
802 return _mm_and_si128(a, b);
803}
804template <>
805EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
806 return _mm_and_si128(a, b);
807}
808template <>
809EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
810 return _mm_and_si128(a, b);
811}
812template <>
813EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) {
814 return _mm_and_si128(a, b);
815}
816
817template <>
818EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
819 return _mm_or_ps(a, b);
820}
821template <>
822EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
823 return _mm_or_pd(a, b);
824}
825template <>
826EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
827 return _mm_or_si128(a, b);
828}
829template <>
830EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
831 return _mm_or_si128(a, b);
832}
833template <>
834EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
835 return _mm_or_si128(a, b);
836}
837template <>
838EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) {
839 return _mm_or_si128(a, b);
840}
841
842template <>
843EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
844 return _mm_xor_ps(a, b);
845}
846template <>
847EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
848 return _mm_xor_pd(a, b);
849}
850template <>
851EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
852 return _mm_xor_si128(a, b);
853}
854template <>
855EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
856 return _mm_xor_si128(a, b);
857}
858template <>
859EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
860 return _mm_xor_si128(a, b);
861}
862template <>
863EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) {
864 return _mm_xor_si128(a, b);
865}
866
867template <>
868EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
869 return _mm_andnot_ps(b, a);
870}
871template <>
872EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
873 return _mm_andnot_pd(b, a);
874}
875template <>
876EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
877 return _mm_andnot_si128(b, a);
878}
879template <>
880EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
881 return _mm_andnot_si128(b, a);
882}
883template <>
884EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
885 return _mm_andnot_si128(b, a);
886}
887template <>
888EIGEN_STRONG_INLINE Packet16b pandnot<Packet16b>(const Packet16b& a, const Packet16b& b) {
889 return _mm_andnot_si128(b, a);
890}
891template <>
892EIGEN_STRONG_INLINE Packet16b pcmp_lt(const Packet16b& a, const Packet16b& b) {
893 return _mm_andnot_si128(a, b);
894}
895template <>
896EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
897 return _mm_cmple_ps(a, b);
898}
899template <>
900EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
901 return _mm_cmplt_ps(a, b);
902}
903template <>
904EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
905 return _mm_cmpnge_ps(a, b);
906}
907template <>
908EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
909 return _mm_cmpeq_ps(a, b);
910}
911
912template <>
913EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
914 return _mm_cmple_pd(a, b);
915}
916template <>
917EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
918 return _mm_cmplt_pd(a, b);
919}
920template <>
921EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
922 return _mm_cmpnge_pd(a, b);
923}
924template <>
925EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
926 return _mm_cmpeq_pd(a, b);
927}
928template <>
929EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
930 return _mm_cmplt_epi32(a, b);
931}
932template <>
933EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
934 return _mm_cmpeq_epi32(a, b);
935}
936template <>
937EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
938#ifdef EIGEN_VECTORIZE_SSE4_1
939 return _mm_cmpeq_epi32(a, _mm_min_epi32(a, b));
940#else
941 return por(pcmp_lt(a, b), pcmp_eq(a, b));
942#endif
943}
944template <>
945EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
946#ifdef EIGEN_VECTORIZE_SSE4_2
947 return _mm_cmpgt_epi64(b, a);
948#else
949 Packet4i eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
950 Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
951 Packet4i lt = pcmp_lt<Packet4i>(Packet4i(a), Packet4i(b));
952 Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
953 Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
954 // return hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b))
955 return por(hi_lt, pand(hi_eq, lo_lt));
956#endif
957}
958template <>
959EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
960#ifdef EIGEN_VECTORIZE_SSE4_1
961 return _mm_cmpeq_epi64(a, b);
962#else
963 Packet4i tmp = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
964 return Packet2l(pand<Packet4i>(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask))));
965#endif
966}
967template <>
968EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) {
969 return por(pcmp_lt(a, b), pcmp_eq(a, b));
970}
971template <>
972EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
973 // Mask out invalid bool bits to avoid UB.
974 const Packet16b kBoolMask = pset1<Packet16b>(true);
975 return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
976}
977template <>
978EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
979 return _mm_cmpeq_epi32(a, b);
980}
981
982template <>
983EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
984#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
985// There appears to be a bug in GCC, by which the optimizer may
986// flip the argument order in calls to _mm_min_ps, so we have to
987// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
988// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
989#ifdef EIGEN_VECTORIZE_AVX
990 Packet4f res;
991 asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
992#else
993 Packet4f res = b;
994 asm("minps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
995#endif
996 return res;
997#else
998 // Arguments are reversed to match NaN propagation behavior of std::min.
999 return _mm_min_ps(b, a);
1000#endif
1001}
1002template <>
1003EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
1004#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1005// There appears to be a bug in GCC, by which the optimizer may
1006// flip the argument order in calls to _mm_min_pd, so we have to
1007// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1008// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1009#ifdef EIGEN_VECTORIZE_AVX
1010 Packet2d res;
1011 asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1012#else
1013 Packet2d res = b;
1014 asm("minpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
1015#endif
1016 return res;
1017#else
1018 // Arguments are reversed to match NaN propagation behavior of std::min.
1019 return _mm_min_pd(b, a);
1020#endif
1021}
1022template <>
1023EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
1024 Packet2l a_lt_mask = pcmp_lt(a, b);
1025 return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask));
1026}
1027template <>
1028EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1029#ifdef EIGEN_VECTORIZE_SSE4_1
1030 return _mm_min_epi32(a, b);
1031#else
1032  // after some benchmarking, this version *is* faster than a scalar implementation
1033 Packet4i mask = _mm_cmplt_epi32(a, b);
1034 return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
1035#endif
1036}
1037template <>
1038EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1039#ifdef EIGEN_VECTORIZE_SSE4_1
1040 return _mm_min_epu32(a, b);
1041#else
1042 return padd((Packet4ui)pmin((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1043 (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
1044 pset1<Packet4ui>(0x80000000UL));
1045#endif
1046}
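// The 0x80000000 bias used by the SSE2 fallback above maps unsigned order onto
// signed order: subtracting 0x80000000 (equivalently, flipping the sign bit)
// gives x <u y exactly when (x - 0x80000000) <s (y - 0x80000000). The same
// trick drives the unsigned pmax, pcmp_lt and pcmp_le fallbacks below.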
1047
1048template <>
1049EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1050#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1051// There appears to be a bug in GCC, by which the optimizer may
1052// flip the argument order in calls to _mm_max_ps, so we have to
1053// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1054// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1055#ifdef EIGEN_VECTORIZE_AVX
1056 Packet4f res;
1057 asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1058#else
1059 Packet4f res = b;
1060 asm("maxps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
1061#endif
1062 return res;
1063#else
1064 // Arguments are reversed to match NaN propagation behavior of std::max.
1065 return _mm_max_ps(b, a);
1066#endif
1067}
1068template <>
1069EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
1070#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
1071// There appears to be a bug in GCC, by which the optimizer may
1072// flip the argument order in calls to _mm_max_pd, so we have to
1073// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
1074// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
1075#ifdef EIGEN_VECTORIZE_AVX
1076 Packet2d res;
1077 asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
1078#else
1079 Packet2d res = b;
1080 asm("maxpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
1081#endif
1082 return res;
1083#else
1084 // Arguments are reversed to match NaN propagation behavior of std::max.
1085 return _mm_max_pd(b, a);
1086#endif
1087}
1088template <>
1089EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
1090 Packet2l a_lt_mask = pcmp_lt(a, b);
1091 return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask));
1092}
1093template <>
1094EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1095#ifdef EIGEN_VECTORIZE_SSE4_1
1096 return _mm_max_epi32(a, b);
1097#else
1098  // after some benchmarking, this version *is* faster than a scalar implementation
1099 Packet4i mask = _mm_cmpgt_epi32(a, b);
1100 return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
1101#endif
1102}
1103template <>
1104EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1105#ifdef EIGEN_VECTORIZE_SSE4_1
1106 return _mm_max_epu32(a, b);
1107#else
1108 return padd((Packet4ui)pmax((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1109 (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
1110 pset1<Packet4ui>(0x80000000UL));
1111#endif
1112}
1113
1114template <>
1115EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
1116#ifdef EIGEN_VECTORIZE_SSE4_1
1117 return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
1118#else
1119 return (Packet4ui)pcmp_lt((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1120 (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
1121#endif
1122}
1123template <>
1124EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
1125#ifdef EIGEN_VECTORIZE_SSE4_1
1126 return pcmp_eq(a, pmin(a, b));
1127#else
1128 return (Packet4ui)pcmp_le((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
1129 (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
1130#endif
1131}
1132
1133template <typename Packet, typename Op>
1134EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
1135 // In this implementation, we take advantage of the fact that pmin/pmax for SSE
1136 // always return a if either a or b is NaN.
1137 Packet not_nan_mask_a = pcmp_eq(a, a);
1138 Packet m = op(a, b);
1139 return pselect<Packet>(not_nan_mask_a, m, b);
1140}
1141
1142template <typename Packet, typename Op>
1143EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
1144 // In this implementation, we take advantage of the fact that pmin/pmax for SSE
1145 // always return a if either a or b is NaN.
1146 Packet not_nan_mask_a = pcmp_eq(a, a);
1147 Packet m = op(b, a);
1148 return pselect<Packet>(not_nan_mask_a, m, a);
1149}
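// Concretely, with op = pmin<Packet4f> (which returns its first argument when
// either input is NaN):
//   propagate_numbers: a NaN -> mask is false -> returns b;
//                      b NaN -> op(a, b) == a -> returns a (the number).
//   propagate_nan:     a NaN -> mask is false -> returns a (NaN);
//                      b NaN -> op(b, a) == b -> returns b (NaN).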
1150
1151// Add specializations for min/max with prescribed NaN propagation.
1152template <>
1153EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1154 return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
1155}
1156template <>
1157EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
1158 return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
1159}
1160template <>
1161EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
1162 return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
1163}
1164template <>
1165EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
1166 return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
1167}
1168template <>
1169EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1170 return pminmax_propagate_nan(a, b, pmin<Packet4f>);
1171}
1172template <>
1173EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
1174 return pminmax_propagate_nan(a, b, pmin<Packet2d>);
1175}
1176template <>
1177EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
1178 return pminmax_propagate_nan(a, b, pmax<Packet4f>);
1179}
1180template <>
1181EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
1182 return pminmax_propagate_nan(a, b, pmax<Packet2d>);
1183}
1184
1185template <>
1186EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
1187 return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
1188}
1189template <>
1190EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
1191 Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
1192#ifdef EIGEN_VECTORIZE_AVX
1193 return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1194#else
1195 return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1196#endif // EIGEN_VECTORIZE_AVX
1197}
1198template <>
1199EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) {
1200 return _mm_srai_epi32(a, 31);
1201}
1202template <>
1203EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
1204 return pzero(a);
1205}
1206template <>
1207EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) {
1208 Packet4i tmp = psignbit<Packet4i>(Packet4i(a));
1209 return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
1210}
1211
1212template <int N>
1213EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
1214 Packet2l signbit = psignbit(a);
1215 return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N));
1216}
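// SSE has no 64-bit arithmetic shift, so the Packet2l version above builds one
// from a logical shift: psignbit(a) is all-ones in negative lanes, and
// shifting it left by 64 - N plants N copies of the sign in the vacated high
// bits. E.g. for a = -8 and N = 2, _mm_srli_epi64 yields 0x3FFF...FE and the
// por restores 0xFFFF...FE, i.e. -2.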
1217template <int N>
1218EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
1219 return _mm_srli_epi64(a, N);
1220}
1221template <int N>
1222EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
1223 return _mm_slli_epi64(a, N);
1224}
1225template <int N>
1226EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1227 return _mm_srai_epi32(a, N);
1228}
1229template <int N>
1230EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1231 return _mm_srli_epi32(a, N);
1232}
1233template <int N>
1234EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1235 return _mm_slli_epi32(a, N);
1236}
1237template <int N>
1238EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
1239 return _mm_srli_epi32(a, N);
1240}
1241template <int N>
1242EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1243 return _mm_srli_epi32(a, N);
1244}
1245template <int N>
1246EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1247 return _mm_slli_epi32(a, N);
1248}
1249
1250template <>
1251EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1252 const __m128i mask = _mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF);
1253 return _mm_castsi128_ps(_mm_and_si128(mask, _mm_castps_si128(a)));
1254}
1255template <>
1256EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1257 const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
1258 return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));
1259}
1260template <>
1261EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
1262 Packet2l signbit = psignbit(a);
1263 return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit);
1264}
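// Two's-complement identity used above: with s = a >> 63 (0 or -1 per lane),
// (a ^ s) - s equals a when s == 0 and ~a + 1 == -a when s == -1. The SSE2
// fallback of the Packet4i version below uses the same identity.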
1265template <>
1266EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1267#ifdef EIGEN_VECTORIZE_SSSE3
1268 return _mm_abs_epi32(a);
1269#else
1270 Packet4i signbit = psignbit(a);
1271 return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
1272#endif
1273}
1274template <>
1275EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
1276 return a;
1277}
1278
1279#ifdef EIGEN_VECTORIZE_SSE4_1
1280template <>
1281EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
1282  // Unfortunately, _mm_round_ps doesn't have a rounding mode that implements numext::round (half away from zero).
1283 const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
1284 const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
1285 return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
1286}
1287
1288template <>
1289EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
1290 const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
1291 const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
1292 return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
1293}
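// How the rounding trick works: numext::round rounds halves away from zero,
// which no _MM_FROUND mode implements directly. por(pand(a, mask), prev0dot5)
// copies a's sign onto the largest float below 0.5 (0x3EFFFFFF ~ 0.49999997);
// adding that and truncating rounds halves away from zero, while a naive
// a + 0.5 would already round an input like 0.49999997 up to 1.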
1294
1295template <>
1296EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
1297 return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
1298}
1299template <>
1300EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
1301 return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
1302}
1303
1304template <>
1305EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
1306 return _mm_ceil_ps(a);
1307}
1308template <>
1309EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
1310 return _mm_ceil_pd(a);
1311}
1312
1313template <>
1314EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
1315 return _mm_floor_ps(a);
1316}
1317template <>
1318EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
1319 return _mm_floor_pd(a);
1320}
1321
1322template <>
1323EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
1324 return _mm_round_ps(a, _MM_FROUND_TRUNC);
1325}
1326template <>
1327EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
1328 return _mm_round_pd(a, _MM_FROUND_TRUNC);
1329}
1330#endif
1331
1332template <>
1333EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
1334 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from);
1335}
1336template <>
1337EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
1338 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
1339}
1340template <>
1341EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
1342 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1343}
1344template <>
1345EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
1346 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1347}
1348template <>
1349EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
1350 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1351}
1352template <>
1353EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) {
1354 EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
1355}
1356
1357#if EIGEN_COMP_MSVC
1358template <>
1359EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1360 EIGEN_DEBUG_UNALIGNED_LOAD
1361 return _mm_loadu_ps(from);
1362}
1363#else
1364// NOTE: with the code below, MSVC's compiler crashes!
1365
1366template <>
1367EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1368 EIGEN_DEBUG_UNALIGNED_LOAD
1369 return _mm_loadu_ps(from);
1370}
1371#endif
1372
1373template <>
1374EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
1375 EIGEN_DEBUG_UNALIGNED_LOAD
1376 return _mm_loadu_pd(from);
1377}
1378template <>
1379EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
1380 EIGEN_DEBUG_UNALIGNED_LOAD
1381 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1382}
1383template <>
1384EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
1385 EIGEN_DEBUG_UNALIGNED_LOAD
1386 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1387}
1388template <>
1389EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
1390 EIGEN_DEBUG_UNALIGNED_LOAD
1391 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1392}
1393template <>
1394EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
1395 EIGEN_DEBUG_UNALIGNED_LOAD
1396 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
1397}
1398
1399// Load the lower part of a packet, zero-extending the upper part.
1400template <typename Packet>
1401EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
1402template <>
1403EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
1404 EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
1405}
1406template <>
1407EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
1408 EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
1409}
1410
1411// Load scalar
1412template <typename Packet>
1413EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
1414template <>
1415EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) {
1416 EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from);
1417}
1418template <>
1419EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
1420 EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
1421}
1422
1423template <>
1424EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1425 return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
1426}
1427template <>
1428EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
1429 return pset1<Packet2d>(from[0]);
1430}
1431template <>
1432EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
1433 return pset1<Packet2l>(from[0]);
1434}
1435template <>
1436EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
1437 Packet4i tmp;
1438 tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
1439 return vec4i_swizzle1(tmp, 0, 0, 1, 1);
1440}
1441template <>
1442EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
1443 Packet4ui tmp;
1444 tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
1445 return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
1446}
1447
1448// Loads 8 bools from memory and returns the packet
1449// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
1450template <>
1451EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
1452 __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
1453 return _mm_unpacklo_epi8(tmp, tmp);
1454}
1455
1456// Loads 4 bools from memory and returns the packet
1457// {b0, b0, b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
1458template <>
1459EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
1460 __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
1461 tmp = _mm_unpacklo_epi8(tmp, tmp);
1462 return _mm_unpacklo_epi16(tmp, tmp);
1463}
1464
1465template <>
1466EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
1467 EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
1468}
1469template <>
1470EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1471 EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
1472}
1473template <>
1474EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
1475 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1476}
1477template <>
1478EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
1479 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1480}
1481template <>
1482EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
1483 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1484}
1485template <>
1486EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) {
1487 EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
1488}
1489
1490template <>
1491EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1492 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
1493}
1494template <>
1495EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1496 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
1497}
1498template <>
1499EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
1500 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1501}
1502template <>
1503EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
1504 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1505}
1506template <>
1507EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
1508 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1509}
1510template <>
1511EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) {
1512 EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
1513}
1514
1515template <typename Scalar, typename Packet>
1516EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
1517template <>
1518EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) {
1519 EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from);
1520}
1521template <>
1522EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) {
1523 EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
1524}
1525
1526template <typename Scalar, typename Packet>
1527EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
1528template <>
1529EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) {
1530 EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
1531}
1532template <>
1533EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
1534 EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
1535}
1536
1537template <>
1538EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1539 return _mm_shuffle_ps(a, a, 0x1B);
1540}
1541template <>
1542EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1543 return _mm_shuffle_pd(a, a, 0x1);
1544}
1545template <>
1546EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
1547 return _mm_castpd_si128(preverse(_mm_castsi128_pd(a)));
1548}
1549template <>
1550EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1551 return _mm_shuffle_epi32(a, 0x1B);
1552}
1553template <>
1554EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
1555 return _mm_shuffle_epi32(a, 0x1B);
1556}
1557template <>
1558EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
1559#ifdef EIGEN_VECTORIZE_SSSE3
1560 __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1561 return _mm_shuffle_epi8(a, mask);
1562#else
1563 Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
1564 tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
1565 return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
1566#endif
1567}
1568
1569#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
1570// The temporary variable fixes an internal compilation error in VS <= 2008 and a wrong-result bug in VS 2010.
1571// Direct access to the struct members fixed bug #62.
1572template <>
1573EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1574 return a.m128_f32[0];
1575}
1576template <>
1577EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1578 return a.m128d_f64[0];
1579}
1580template <>
1581EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1582 int64_t x = _mm_extract_epi64_0(a);
1583 return x;
1584}
1585template <>
1586EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1587 int x = _mm_cvtsi128_si32(a);
1588 return x;
1589}
1590template <>
1591EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1592 uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1593 return x;
1594}
1595#elif EIGEN_COMP_MSVC_STRICT
1596// The temporary variable fixes an internal compilation error in VS <= 2008 and a wrong-result bug in VS 2010.
1597template <>
1598EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1599 float x = _mm_cvtss_f32(a);
1600 return x;
1601}
1602template <>
1603EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1604 double x = _mm_cvtsd_f64(a);
1605 return x;
1606}
1607template <>
1608EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1609 int64_t x = _mm_extract_epi64_0(a);
1610 return x;
1611}
1612template <>
1613EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1614 int x = _mm_cvtsi128_si32(a);
1615 return x;
1616}
1617template <>
1618EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1619 uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1620 return x;
1621}
1622#else
1623template <>
1624EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1625 return _mm_cvtss_f32(a);
1626}
1627template <>
1628EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1629 return _mm_cvtsd_f64(a);
1630}
1631template <>
1632EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
1633 return _mm_extract_epi64_0(a);
1634}
1635template <>
1636EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1637 return _mm_cvtsi128_si32(a);
1638}
1639template <>
1640EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
1641 return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
1642}
1643#endif
1644template <>
1645EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
1646 int x = _mm_cvtsi128_si32(a);
1647 return static_cast<bool>(x & 1);
1648}
1649
1650template <>
1651EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
1652 return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1653}
1654template <>
1655EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1656 return _mm_set_pd(from[1 * stride], from[0 * stride]);
1657}
1658template <>
1659EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
1660 return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
1661}
1662template <>
1663EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
1664 return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
1665}
1666template <>
1667EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
1668 return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
1669 numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
1670}
1671
1672template <>
1673EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
1674 return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
1675 from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
1676 from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
1677 from[0 * stride]);
1678}
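// Illustration only (not part of Eigen): pgather/pscatter provide strided vector access,
// e.g. reading a column of a row-major matrix. A minimal usage sketch, assuming `data`
// points to at least four rows of `cols` floats each:
//
//   Packet4f col0 = pgather<float, Packet4f>(data, cols);  // data[0], data[cols], ...
//   pscatter<float, Packet4f>(out, col0, Index(1));        // stride 1 == contiguous store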
1679
1680template <>
1681EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
1682 to[stride * 0] = pfirst(from);
1683 to[stride * 1] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 1)));
1684 to[stride * 2] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 2)));
1685 to[stride * 3] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 3)));
1686}
1687template <>
1688EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
1689 to[stride * 0] = pfirst(from);
1690 to[stride * 1] = pfirst(preverse(from));
1691}
1692template <>
1693EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride) {
1694 to[stride * 0] = pfirst(from);
1695 to[stride * 1] = pfirst(preverse(from));
1696}
1697template <>
1698EIGEN_STRONG_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
1699 to[stride * 0] = _mm_cvtsi128_si32(from);
1700 to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
1701 to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
1702 to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
1703}
1704template <>
1705EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
1706 to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
1707 to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
1708 to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
1709 to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
1710}
1711template <>
1712EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
1713 EIGEN_ALIGN16 bool tmp[16];
1714 pstore(tmp, from);
1715 to[stride * 0] = tmp[0];
1716 to[stride * 1] = tmp[1];
1717 to[stride * 2] = tmp[2];
1718 to[stride * 3] = tmp[3];
1719 to[stride * 4] = tmp[4];
1720 to[stride * 5] = tmp[5];
1721 to[stride * 6] = tmp[6];
1722 to[stride * 7] = tmp[7];
1723 to[stride * 8] = tmp[8];
1724 to[stride * 9] = tmp[9];
1725 to[stride * 10] = tmp[10];
1726 to[stride * 11] = tmp[11];
1727 to[stride * 12] = tmp[12];
1728 to[stride * 13] = tmp[13];
1729 to[stride * 14] = tmp[14];
1730 to[stride * 15] = tmp[15];
1731}
1732
1733// Some compilers might otherwise emit multiple scalar moves instead of a single vector store.
1734template <>
1735EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) {
1736 Packet4f pa = _mm_set_ss(a);
1737 pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
1738}
1739// Likewise for doubles: avoid multiple scalar moves in favor of a single vector store.
1740template <>
1741EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) {
1742 Packet2d pa = _mm_set_sd(a);
1743 pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
1744}
1745
1746#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
1747typedef const void* SsePrefetchPtrType;
1748#else
1749typedef const char* SsePrefetchPtrType;
1750#endif
1751
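// The explicit cast below matches whichever pointer type the compiler's _mm_prefetch
// expects: old PGI declares it with `const void*`, everyone else with `const char*`.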
1752#ifndef EIGEN_VECTORIZE_AVX
1753template <>
1754EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1755 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1756}
1757template <>
1758EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1759 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1760}
1761template <>
1762EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1763 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1764}
1765template <>
1766EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
1767 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1768}
1769template <>
1770EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
1771 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1772}
1773#endif
1774
1775template <>
1776EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
1777 return pfrexp_generic(a, exponent);
1778}
1779
1780// Extract the biased exponent without relying on Packet2l.
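// The biased exponent occupies bits [62:52] of each double. After the 64-bit shift the two
// exponents sit in 32-bit lanes 0 and 2, so the (0, 2, 1, 3) swizzle packs them into lanes
// 0 and 1 ahead of the epi32 -> pd conversion.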
1781template <>
1782EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
1783 const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
1784 __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
1785 return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
1786}
1787
1788template <>
1789EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
1790 return pfrexp_generic(a, exponent);
1791}
1792
1793template <>
1794EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
1795 return pldexp_generic(a, exponent);
1796}
1797
1798// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
1799// supported by SSE, and has more range than is needed for exponents.
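// Worked example (for illustration): for e = 2099, b = floor(2099 / 4) = 524, so the three
// multiplies by c contribute 2^(3 * 524) = 2^1572 and the final factor contributes
// 2^(2099 - 1572) = 2^527; every biased exponent involved (b + 1023 = 1547, and
// e - 3b + 1023 = 1550) fits in the 11-bit exponent field of a double.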
1800template <>
1801EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1802 // Clamp exponent to [-2099, 2099]
1803 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
1804 const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
1805
1806 // Convert e to integer and swizzle to low-order bits.
1807 const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1808
1809 // Split 2^e into four factors and multiply:
1810 const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1811 Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4)
1812 Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b
1813 Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
1814 b = psub(psub(psub(ei, b), b), b); // e - 3b
1815 c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b)
1816 out = pmul(out, c); // a * 2^e
1817 return out;
1818}
1819
1820// We specialize pldexp_fast here as well, since the generic implementation uses Packet2l, which is not well
1821// supported by SSE, and has more range than is needed for exponents.
1822template <>
1823EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
1824 // Clamp exponent to [-1023, 1024]
1825 const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
1826 const Packet2d max_exponent = pset1<Packet2d>(1024.0);
1827 const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
1828
1829 // Convert e to integer and swizzle to low-order bits.
1830 const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
1831
1832 // Compute 2^e and multiply:
1833 const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
1834 const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52)); // 2^e
1835 return pmul(a, c);
1836}
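// Note: with a single scale factor the biased exponent e + 1023 spans [0, 2047], so c
// becomes +0.0 at e = -1023 and the Inf bit pattern at e = 1024. pldexp_fast therefore
// assumes inputs for which one scaling step neither underflows to a subnormal nor
// overflows, unlike the three-factor pldexp above.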
1837
1838// With AVX, the default implementations based on pload1 are faster.
1839#ifndef __AVX__
1840template <>
1841EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
1842 a3 = pload<Packet4f>(a);
1843 a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
1844 a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
1845 a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
1846 a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
1847}
1848template <>
1849EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
1850 Packet2d& a3) {
1851#ifdef EIGEN_VECTORIZE_SSE3
1852 a0 = _mm_loaddup_pd(a + 0);
1853 a1 = _mm_loaddup_pd(a + 1);
1854 a2 = _mm_loaddup_pd(a + 2);
1855 a3 = _mm_loaddup_pd(a + 3);
1856#else
1857 a1 = pload<Packet2d>(a);
1858 a0 = vec2d_swizzle1(a1, 0, 0);
1859 a1 = vec2d_swizzle1(a1, 1, 1);
1860 a3 = pload<Packet2d>(a + 2);
1861 a2 = vec2d_swizzle1(a3, 0, 0);
1862 a3 = vec2d_swizzle1(a3, 1, 1);
1863#endif
1864}
1865#endif
1866
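// punpackp broadcasts each lane of vecs[0] across a full register: 0x55, 0xAA, 0xFF and
// 0x00 are shuffle_mask<1,1,1,1>, <2,2,2,2>, <3,3,3,3> and <0,0,0,0> respectively, and
// vecs[0] is overwritten last so that its original contents survive the first three
// shuffles.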
1867EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
1868 vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
1869 vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
1870 vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
1871 vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
1872}
1873
1874EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
1875 _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
1876}
1877
1878EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1879 __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
1880 kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
1881 kernel.packet[1] = tmp;
1882}
1883
1884EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
1885 __m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
1886 kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
1887 kernel.packet[1] = tmp;
1888}
1889
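// 4x4 integer transpose via the classic two-stage unpack: the epi32 unpacks interleave the
// rows pairwise (T0 = {a0, b0, a1, b1}, T1 = {c0, d0, c1, d1}, ...), then the epi64 unpacks
// combine matching 64-bit halves into the transposed rows.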
1890EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
1891 __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
1892 __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
1893 __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
1894 __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
1895
1896 kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
1897 kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
1898 kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
1899 kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
1900}
1901EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
1902 ptranspose((PacketBlock<Packet4i, 4>&)kernel);
1903}
1904
1905EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
1906 __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
1907 __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
1908 __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
1909 __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
1910 kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
1911 kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
1912 kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
1913 kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
1914}
1915
1916EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
1917 // If we number the elements in the input thus:
1918 // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
1919 // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
1920 // ...
1921 // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
1922 //
1923 // the desired output is:
1924 // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
1925 // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
1926 // ...
1927 // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
1928 __m128i t0 =
1929 _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1930 __m128i t1 =
1931 _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
1932 __m128i t2 =
1933 _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37
1934 __m128i t3 =
1935 _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f
1936 __m128i t4 =
1937 _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57
1938 __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
1939 __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
1940 __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
1941 __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
1942 __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
1943 __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
1944 __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
1945 __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
1946 __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
1947 __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
1948 __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
1949
1950 __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1951 __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
1952 __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
1953 __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
1954 __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1955 __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
1956 __m128i s6 = _mm_unpacklo_epi16(t5, t7);
1957 __m128i s7 = _mm_unpackhi_epi16(t5, t7);
1958 __m128i s8 = _mm_unpacklo_epi16(t8, ta);
1959 __m128i s9 = _mm_unpackhi_epi16(t8, ta);
1960 __m128i sa = _mm_unpacklo_epi16(t9, tb);
1961 __m128i sb = _mm_unpackhi_epi16(t9, tb);
1962 __m128i sc = _mm_unpacklo_epi16(tc, te);
1963 __m128i sd = _mm_unpackhi_epi16(tc, te);
1964 __m128i se = _mm_unpacklo_epi16(td, tf);
1965 __m128i sf = _mm_unpackhi_epi16(td, tf);
1966
1967 __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1968 __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1969 __m128i u2 = _mm_unpacklo_epi32(s1, s5);
1970 __m128i u3 = _mm_unpackhi_epi32(s1, s5);
1971 __m128i u4 = _mm_unpacklo_epi32(s2, s6);
1972 __m128i u5 = _mm_unpackhi_epi32(s2, s6);
1973 __m128i u6 = _mm_unpacklo_epi32(s3, s7);
1974 __m128i u7 = _mm_unpackhi_epi32(s3, s7);
1975 __m128i u8 = _mm_unpacklo_epi32(s8, sc);
1976 __m128i u9 = _mm_unpackhi_epi32(s8, sc);
1977 __m128i ua = _mm_unpacklo_epi32(s9, sd);
1978 __m128i ub = _mm_unpackhi_epi32(s9, sd);
1979 __m128i uc = _mm_unpacklo_epi32(sa, se);
1980 __m128i ud = _mm_unpackhi_epi32(sa, se);
1981 __m128i ue = _mm_unpacklo_epi32(sb, sf);
1982 __m128i uf = _mm_unpackhi_epi32(sb, sf);
1983
1984 kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
1985 kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
1986 kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
1987 kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
1988 kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
1989 kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
1990 kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
1991 kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
1992 kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
1993 kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
1994 kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
1995 kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
1996 kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
1997 kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
1998 kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
1999 kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
2000}
2001
2002// Scalar path for pmadd with FMA to ensure consistency with the vectorized path.
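// With FMA, a * b + c is rounded once, whereas a separate mul followed by add rounds twice
// and can differ in the last ulp; using std::fma in the scalar fallback keeps scalar and
// packet results bit-for-bit identical.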
2003#if defined(EIGEN_VECTORIZE_FMA)
2004template <>
2005EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
2006 return std::fmaf(a, b, c);
2007}
2008template <>
2009EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
2010 return std::fma(a, b, c);
2011}
2012template <>
2013EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
2014 return std::fmaf(a, b, -c);
2015}
2016template <>
2017EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
2018 return std::fma(a, b, -c);
2019}
2020template <>
2021EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
2022 return std::fmaf(-a, b, c);
2023}
2024template <>
2025EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
2026 return std::fma(-a, b, c);
2027}
2028template <>
2029EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
2030 return std::fmaf(-a, b, -c);
2031}
2032template <>
2033EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
2034 return std::fma(-a, b, -c);
2035}
2036#endif
2037
2038#ifdef EIGEN_VECTORIZE_SSE4_1
2039// Helpers for half->float and float->half conversions.
2040// Currently only used by the AVX code.
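// The conversion uses the usual bit tricks (cf. the scalar half_to_float): shift the half's
// exponent/mantissa bits into float position, rebias the exponent by 127 - 15, then patch
// the special classes per lane with masked adds: Inf/NaN lanes get an extra (128 - 16) << 23
// bias, and zero/denormal lanes are rebuilt via the "magic" 113 << 23 subtraction.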
2041EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
2042 __m128i input = _mm_cvtepu16_epi32(h);
2043
2044 // Direct vectorization of half_to_float, C parts in the comments.
2045 __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
2046 // o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
2047 __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
2048 // exp = shifted_exp & o.u; // just the exponent
2049 __m128i exp = _mm_and_si128(ou, shifted_exp);
2050 // o.u += (127 - 15) << 23;
2051 ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));
2052
2053 // Inf/NaN?
2054 __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
2055 // Inf/NaN adjust
2056 __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
2057 // extra exp adjust for Inf/NaN
2058 ou = _mm_add_epi32(ou, naninf_adj);
2059
2060 // Zero/Denormal?
2061 __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
2062 __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
2063 // o.u += 1 << 23;
2064 ou = _mm_add_epi32(ou, zeroden_adj);
2065 // magic.u = 113 << 23
2066 __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
2067 // o.f -= magic.f
2068 ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
2069
2070 __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
2071 // o.u |= (h.x & 0x8000) << 16; // sign bit
2072 ou = _mm_or_si128(ou, sign);
2073 // return o.f;
2074 // We actually return the uint version, so that
2075 // _mm256_insertf128_si256 works on the result.
2076 return ou;
2077}
2078
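// float -> half, branch-free per lane: out-of-range and subnormal results are built with
// masked fixups, while the normal path rounds to nearest-even by adding 0xc8000fff (the
// (15 - 127) << 23 rebias plus 0xfff) plus the mantissa's own bit 13 ("mant_odd") before
// shifting right by 13.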
2079EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
2080 // unsigned int sign_mask = 0x80000000u;
2081 __m128i sign = _mm_set1_epi32(0x80000000u);
2082 // unsigned int sign = f.u & sign_mask;
2083 sign = _mm_and_si128(sign, _mm_castps_si128(f));
2084 // f.u ^= sign;
2085 f = _mm_xor_ps(f, _mm_castsi128_ps(sign));
2086
2087 __m128i fu = _mm_castps_si128(f);
2088
2089 __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
2090 __m128i f32infty = _mm_set1_epi32(255 << 23);
2091 // if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
2092 // there is no _mm_cmpge_epi32, so use lt and swap operands
2093 __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
2094 __m128i nan_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);    // above the Inf pattern -> NaN
2095 __m128i inf_mask = _mm_andnot_si128(nan_mask, infnan_mask);           // remaining out-of-range lanes -> Inf
2096 __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7e00));  // quiet NaN
2097 __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7c00));  // Inf
2098 // o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
2099 __m128i naninf_value = _mm_or_si128(inf_value, nan_value);
2100
2101 __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
2102 __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
2103 // f.f += denorm_magic.f;
2104 f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
2105 // f.u - denorm_magic.u
2106 __m128i o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
2107 o = _mm_and_si128(o, subnorm_mask);
2108 // Correct result for inf/nan/zero/subnormal, 0 otherwise
2109 o = _mm_or_si128(o, naninf_value);
2110
2111 __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
2112 o = _mm_and_si128(o, mask);
2113
2114 // mant_odd = (f.u >> 13) & 1;
2115 __m128i mant_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
2116 // f.u += 0xc8000fffU;
2117 fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
2118 // f.u += mant_odd;
2119 fu = _mm_add_epi32(fu, mant_odd);
2120 fu = _mm_andnot_si128(mask, fu);
2121 // f.u >> 13
2122 fu = _mm_srli_epi32(fu, 13);
2123 o = _mm_or_si128(fu, o);
2124
2125 // o.x |= static_cast<numext::uint16_t>(sign >> 16);
2126 o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));
2127
2128 // 16 bit values
2129 return _mm_and_si128(o, _mm_set1_epi32(0xffff));
2130}
2131#endif
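// Illustration only (not part of Eigen): the two helpers round-trip through float, e.g.
//
//   __m128i h = _mm_set1_epi16(0x3C00);             // bit pattern of half(1.0)
//   __m128 f = _mm_castsi128_ps(half2floatsse(h));  // four lanes of 1.0f
//   __m128i h2 = float2half(f);                     // low 16 bits of each lane: 0x3C00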
2132
2133// Packet math for Eigen::half
2134// Disable the following code since it's broken on too many platforms / compilers.
2135// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
2136#if 0
2137
2138typedef struct {
2139 __m64 x;
2140} Packet4h;
2141
2142
2143template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
2144
2145template <>
2146struct packet_traits<Eigen::half> : default_packet_traits {
2147 typedef Packet4h type;
2148 // There is no half-size packet for Packet4h.
2149 typedef Packet4h half;
2150 enum {
2151 Vectorizable = 1,
2152 AlignedOnScalar = 1,
2153 size = 4,
2154 HasAdd = 1,
2155 HasSub = 1,
2156 HasMul = 1,
2157 HasDiv = 1,
2158 HasNegate = 0,
2159 HasAbs = 0,
2160 HasMin = 0,
2161 HasMax = 0,
2162 HasConj = 0,
2163 HasSetLinear = 0,
2164 };
2165};
2166
2167
2168template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
2169
2170template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
2171 Packet4h result;
2172 result.x = _mm_set1_pi16(from.x);
2173 return result;
2174}
2175
2176template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
2177 return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
2178}
2179
2180template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
2181
2182template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
2183 __int64_t a64 = _mm_cvtm64_si64(a.x);
2184 __int64_t b64 = _mm_cvtm64_si64(b.x);
2185
2186 Eigen::half h[4];
2187
2188 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2189 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2190 h[0] = ha + hb;
2191 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2192 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2193 h[1] = ha + hb;
2194 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2195 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2196 h[2] = ha + hb;
2197 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2198 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2199 h[3] = ha + hb;
2200 Packet4h result;
2201 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2202 return result;
2203}
2204
2205template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
2206 __int64_t a64 = _mm_cvtm64_si64(a.x);
2207 __int64_t b64 = _mm_cvtm64_si64(b.x);
2208
2209 Eigen::half h[4];
2210
2211 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2212 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2213 h[0] = ha - hb;
2214 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2215 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2216 h[1] = ha - hb;
2217 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2218 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2219 h[2] = ha - hb;
2220 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2221 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2222 h[3] = ha - hb;
2223 Packet4h result;
2224 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2225 return result;
2226}
2227
2228template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
2229 __int64_t a64 = _mm_cvtm64_si64(a.x);
2230 __int64_t b64 = _mm_cvtm64_si64(b.x);
2231
2232 Eigen::half h[4];
2233
2234 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2235 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2236 h[0] = ha * hb;
2237 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2238 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2239 h[1] = ha * hb;
2240 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2241 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2242 h[2] = ha * hb;
2243 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2244 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2245 h[3] = ha * hb;
2246 Packet4h result;
2247 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2248 return result;
2249}
2250
2251template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
2252 __int64_t a64 = _mm_cvtm64_si64(a.x);
2253 __int64_t b64 = _mm_cvtm64_si64(b.x);
2254
2255 Eigen::half h[4];
2256
2257 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
2258 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
2259 h[0] = ha / hb;
2260 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
2261 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
2262 h[1] = ha / hb;
2263 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
2264 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
2265 h[2] = ha / hb;
2266 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
2267 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
2268 h[3] = ha / hb;
2269 Packet4h result;
2270 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
2271 return result;
2272}
2273
2274template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
2275 Packet4h result;
2276 result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
2277 return result;
2278}
2279
2280template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
2281 Packet4h result;
2282 result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
2283 return result;
2284}
2285
2286template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
2287 __int64_t r = _mm_cvtm64_si64(from.x);
2288 *(reinterpret_cast<__int64_t*>(to)) = r;
2289}
2290
2291template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
2292 __int64_t r = _mm_cvtm64_si64(from.x);
2293 *(reinterpret_cast<__int64_t*>(to)) = r;
2294}
2295
2296template<> EIGEN_STRONG_INLINE Packet4h
2297ploadquad<Packet4h>(const Eigen::half* from) {
2298 return pset1<Packet4h>(*from);
2299}
2300
2301template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
2302{
2303 Packet4h result;
2304 result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
2305 return result;
2306}
2307
2308template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
2309{
2310 __int64_t a = _mm_cvtm64_si64(from.x);
2311 to[stride*0].x = static_cast<unsigned short>(a);
2312 to[stride*1].x = static_cast<unsigned short>(a >> 16);
2313 to[stride*2].x = static_cast<unsigned short>(a >> 32);
2314 to[stride*3].x = static_cast<unsigned short>(a >> 48);
2315}
2316
2317EIGEN_STRONG_INLINE void
2318ptranspose(PacketBlock<Packet4h,4>& kernel) {
2319 __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
2320 __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
2321 __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
2322 __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
2323
2324 kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
2325 kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
2326 kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
2327 kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
2328}
2329
2330#endif
2331
2332} // end namespace internal
2333
2334} // end namespace Eigen
2335
2336#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
2337// PGI++ does not define the following intrinsics in C++ mode.
2338static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
2339static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
2340static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
2341static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
2342static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
2343static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
2344#endif
2345
2346#endif // EIGEN_PACKET_MATH_SSE_H