#ifndef EIGEN_COMPLEX_AVX_H
#define EIGEN_COMPLEX_AVX_H

#include "../../InternalHeaderCheck.h"

namespace Eigen {
namespace internal {
//---------- float ----------
struct Packet4cf {
  EIGEN_STRONG_INLINE Packet4cf() {}
  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
  __m256 v;
};
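// Note: Packet4cf (above) wraps a __m256 holding four std::complex<float> values as
// interleaved (real, imag) float pairs, i.e. lanes {re0, im0, re1, im1, re2, im2, re3, im3}.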
#ifndef EIGEN_VECTORIZE_AVX512
template <>
struct packet_traits<std::complex<float> > : default_packet_traits {
  typedef Packet4cf type;
  typedef Packet2cf half;
  // (the capability-flag enum of this specialization is omitted in this excerpt)
};
#endif
template <>
struct unpacket_traits<Packet4cf> {
  typedef std::complex<float> type;
  typedef Packet2cf half;
  typedef Packet8f as_real;
  enum {
    size = 4,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
  return Packet4cf(_mm256_add_ps(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
  return Packet4cf(_mm256_sub_ps(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) {
  return Packet4cf(pnegate(a.v));
}
template <>
EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) {
  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
                                                            0x80000000, 0x00000000, 0x80000000));
  return Packet4cf(_mm256_xor_ps(a.v, mask));
}
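// Note: the XOR mask above sets only the sign bit of every odd (imaginary) float lane, so the
// conjugate is obtained by flipping the sign of each imaginary part while leaving the real parts untouched.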
template <>
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) {
  __m256 tmp1 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
  __m256 tmp2 = _mm256_moveldup_ps(a.v);
#ifdef EIGEN_VECTORIZE_FMA
  __m256 result = _mm256_fmaddsub_ps(tmp2, b.v, tmp1);
#else
  __m256 result = _mm256_addsub_ps(_mm256_mul_ps(tmp2, b.v), tmp1);
#endif
  return Packet4cf(result);
}
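// Note: per complex lane, pmul above evaluates (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br).
// movehdup/moveldup broadcast ai resp. ar within each pair, the permute swaps (br, bi), and
// fmaddsub (or mul + addsub) subtracts in the real lanes and adds in the imaginary lanes.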
template <>
EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
  __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
  return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
}
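// Note: two complex numbers are equal only if both components match. The permute with 0xb1 swaps the
// (re, im) comparison results within each pair, so the AND leaves a lane set only when both the real
// and imaginary comparisons passed, making each complex lane all-ones or all-zeros.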
template <>
EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) {
  return Packet4cf(ptrue(Packet8f(a.v)));
}
template <>
EIGEN_STRONG_INLINE Packet4cf pand<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
  return Packet4cf(_mm256_and_ps(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet4cf por<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
  return Packet4cf(_mm256_or_ps(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet4cf pxor<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
  return Packet4cf(_mm256_xor_ps(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
  return Packet4cf(_mm256_andnot_ps(b.v, a.v));
}
template <>
EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(const std::complex<float>* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(_mm256_load_ps(&numext::real_ref(*from)));
}
template <>
EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(_mm256_loadu_ps(&numext::real_ref(*from)));
}
template <>
EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from) {
  const float re = std::real(from);
  const float im = std::imag(from);
  return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
}
template <>
EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) {
  Packet2cf a = ploaddup<Packet2cf>(from);
  Packet2cf b = ploaddup<Packet2cf>(from + 1);
  return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
}
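// Note: ploaddup above loads from[0] and from[1] and duplicates each, producing
// {from[0], from[0], from[1], from[1]}; each 128-bit half holds one duplicated complex value.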
template <>
EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(&numext::real_ref(*to), from.v);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(&numext::real_ref(*to), from.v);
}
template <>
EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from,
                                                                           Index stride) {
  return Packet4cf(_mm256_set_ps(std::imag(from[3 * stride]), std::real(from[3 * stride]), std::imag(from[2 * stride]),
                                 std::real(from[2 * stride]), std::imag(from[1 * stride]), std::real(from[1 * stride]),
                                 std::imag(from[0 * stride]), std::real(from[0 * stride])));
}
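// Note: _mm256_set_ps takes its arguments from the highest lane down, so from[0 * stride] lands in the
// lowest complex lane and from[3 * stride] in the highest; stride is counted in std::complex<float> elements.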
template <>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from,
                                                                       Index stride) {
  __m128 low = _mm256_extractf128_ps(from.v, 0);
  to[stride * 0] =
      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
  to[stride * 1] =
      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
  __m128 high = _mm256_extractf128_ps(from.v, 1);
  to[stride * 2] =
      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
  to[stride * 3] =
      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
}
template <>
EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a) {
  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
}
template <>
EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
  __m128 low = _mm256_extractf128_ps(a.v, 0);
  __m128 high = _mm256_extractf128_ps(a.v, 1);
  __m128d lowd = _mm_castps_pd(low);
  __m128d highd = _mm_castps_pd(high);
  low = _mm_castpd_ps(_mm_shuffle_pd(lowd, lowd, 0x1));
  high = _mm_castpd_ps(_mm_shuffle_pd(highd, highd, 0x1));
  __m256 result = _mm256_setzero_ps();
  result = _mm256_insertf128_ps(result, low, 1);
  result = _mm256_insertf128_ps(result, high, 0);
  return Packet4cf(result);
}
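// Note: each std::complex<float> occupies 64 bits, so reinterpreting the 128-bit halves as double pairs
// lets _mm_shuffle_pd swap the two complex values inside each half; re-inserting the halves in exchanged
// positions then yields the fully reversed order {c3, c2, c1, c0}.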
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a) {
  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
}
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a) {
  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
}
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf, Packet8f)

template <>
EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
  return pdiv_complex(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x) {
  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)));
}
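// Note: pcplxflip above swaps the real and imaginary parts of every complex lane; the shuffle
// exchanges the two floats within each (re, im) pair.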
//---------- double ----------
struct Packet2cd {
  EIGEN_STRONG_INLINE Packet2cd() {}
  EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
  __m256d v;
};
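// Note: Packet2cd (above) wraps a __m256d holding two std::complex<double> values, one per
// 128-bit half, each stored as a (real, imag) pair of doubles.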
#ifndef EIGEN_VECTORIZE_AVX512
template <>
struct packet_traits<std::complex<double> > : default_packet_traits {
  typedef Packet2cd type;
  typedef Packet1cd half;
  // (the capability-flag enum of this specialization is omitted in this excerpt)
};
#endif
template <>
struct unpacket_traits<Packet2cd> {
  typedef std::complex<double> type;
  typedef Packet1cd half;
  typedef Packet4d as_real;
  enum {
    size = 2,
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
  return Packet2cd(_mm256_add_pd(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
  return Packet2cd(_mm256_sub_pd(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) {
  return Packet2cd(pnegate(a.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) {
  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
  return Packet2cd(_mm256_xor_pd(a.v, mask));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) {
  __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0xF), _mm256_permute_pd(b.v, 0x5));
  __m256d tmp2 = _mm256_movedup_pd(a.v);
#ifdef EIGEN_VECTORIZE_FMA
  __m256d result = _mm256_fmaddsub_pd(tmp2, b.v, tmp1);
#else
  __m256d result = _mm256_addsub_pd(_mm256_mul_pd(tmp2, b.v), tmp1);
#endif
  return Packet2cd(result);
}
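// Note: this is the double-precision analogue of the float pmul: _mm256_permute_pd(a.v, 0xF)
// duplicates the imaginary part of each complex, _mm256_movedup_pd duplicates the real part,
// _mm256_permute_pd(b.v, 0x5) swaps (br, bi), and fmaddsub/addsub produces
// (ar*br - ai*bi) + i*(ar*bi + ai*br) per lane.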
template <>
EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
  __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
  return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
}
template <>
EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) {
  return Packet2cd(ptrue(Packet4d(a.v)));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pand<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
  return Packet2cd(_mm256_and_pd(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cd por<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
  return Packet2cd(_mm256_or_pd(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pxor<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
  return Packet2cd(_mm256_xor_pd(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
  return Packet2cd(_mm256_andnot_pd(b.v, a.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pload<Packet2cd>(const std::complex<double>* from) {
  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(_mm256_load_pd((const double*)from));
}
template <>
EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(_mm256_loadu_pd((const double*)from));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from) {
  return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
}
template <>
EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) {
  return pset1<Packet2cd>(*from);
}
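// Note: ploaddup duplicates each loaded element; since a Packet2cd holds only two complex<double>
// values, duplicating the single source element is the same as broadcasting it, hence the forward to pset1.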
template <>
EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd((double*)to, from.v);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd((double*)to, from.v);
}
template <>
EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from,
                                                                            Index stride) {
  return Packet2cd(_mm256_set_pd(std::imag(from[1 * stride]), std::real(from[1 * stride]), std::imag(from[0 * stride]),
                                 std::real(from[0 * stride])));
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to,
                                                                        const Packet2cd& from, Index stride) {
  __m128d low = _mm256_extractf128_pd(from.v, 0);
  to[stride * 0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
  __m128d high = _mm256_extractf128_pd(from.v, 1);
  to[stride * 1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
}
template <>
EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a) {
  __m128d low = _mm256_extractf128_pd(a.v, 0);
  EIGEN_ALIGN16 double res[2];
  _mm_store_pd(res, low);
  return std::complex<double>(res[0], res[1]);
}
template <>
EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
  return Packet2cd(result);
}
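// Note: _mm256_permute2f128_pd with control 1 swaps the two 128-bit lanes, i.e. it exchanges the
// two complex<double> values of the packet.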
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a) {
  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
}
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a) {
  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
}
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd, Packet4d)

template <>
EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
  return pdiv_complex(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x) {
  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4cf, 4>& kernel) {
  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);

  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);

  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
}
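// Note: each std::complex<float> is 64 bits wide, so the 4x4 block of complex values is transposed as a
// 4x4 matrix of 64-bit elements: the packets are reinterpreted as __m256d, _mm256_shuffle_pd pairs up
// matching elements within the 128-bit lanes, and _mm256_permute2f128_pd (controls 32 = 0x20 and
// 49 = 0x31) recombines the lower/upper lanes across packets.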
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cd, 2>& kernel) {
  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0 + (2 << 4));
  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1 + (3 << 4));
  kernel.packet[0].v = tmp;
}
template <>
EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
  return psqrt_complex<Packet2cd>(a);
}

template <>
EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
  return psqrt_complex<Packet4cf>(a);
}

template <>
EIGEN_STRONG_INLINE Packet2cd plog<Packet2cd>(const Packet2cd& a) {
  return plog_complex<Packet2cd>(a);
}

template <>
EIGEN_STRONG_INLINE Packet4cf plog<Packet4cf>(const Packet4cf& a) {
  return plog_complex<Packet4cf>(a);
}

template <>
EIGEN_STRONG_INLINE Packet4cf pexp<Packet4cf>(const Packet4cf& a) {
  return pexp_complex<Packet4cf>(a);
}
#ifdef EIGEN_VECTORIZE_FMA
template <>
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
  __m256 a_odd = _mm256_movehdup_ps(a.v);
  __m256 a_even = _mm256_moveldup_ps(a.v);
  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
  __m256 result = _mm256_fmaddsub_ps(a_even, b.v, _mm256_fmaddsub_ps(a_odd, b_swap, c.v));
  return Packet4cf(result);
}
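// Note: pmadd above evaluates a*b + c per complex lane with two fused operations: the inner fmaddsub
// combines the cross terms with c (subtracting in the real lane, adding in the imaginary lane), and the
// outer fmaddsub adds the ar*br / ar*bi terms, yielding (ar*br - ai*bi + cr) + i*(ar*bi + ai*br + ci).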
template <>
EIGEN_STRONG_INLINE Packet4cf pmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
  __m256 a_odd = _mm256_movehdup_ps(a.v);
  __m256 a_even = _mm256_moveldup_ps(a.v);
  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
  __m256 result = _mm256_fmaddsub_ps(a_even, b.v, _mm256_fmsubadd_ps(a_odd, b_swap, c.v));
  return Packet4cf(result);
}
template <>
EIGEN_STRONG_INLINE Packet4cf pnmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
  __m256 a_odd = _mm256_movehdup_ps(a.v);
  __m256 a_even = _mm256_moveldup_ps(a.v);
  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
  __m256 result = _mm256_fmaddsub_ps(a_odd, b_swap, _mm256_fmaddsub_ps(a_even, b.v, c.v));
  return Packet4cf(result);
}
template <>
EIGEN_STRONG_INLINE Packet4cf pnmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
  __m256 a_odd = _mm256_movehdup_ps(a.v);
  __m256 a_even = _mm256_moveldup_ps(a.v);
  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
  __m256 result = _mm256_fmaddsub_ps(a_odd, b_swap, _mm256_fmsubadd_ps(a_even, b.v, c.v));
  return Packet4cf(result);
}
template <>
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
  __m256d a_even = _mm256_movedup_pd(a.v);
  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
  __m256d result = _mm256_fmaddsub_pd(a_even, b.v, _mm256_fmaddsub_pd(a_odd, b_swap, c.v));
  return Packet2cd(result);
}
template <>
EIGEN_STRONG_INLINE Packet2cd pmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
  __m256d a_even = _mm256_movedup_pd(a.v);
  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
  __m256d result = _mm256_fmaddsub_pd(a_even, b.v, _mm256_fmsubadd_pd(a_odd, b_swap, c.v));
  return Packet2cd(result);
}
template <>
EIGEN_STRONG_INLINE Packet2cd pnmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
  __m256d a_even = _mm256_movedup_pd(a.v);
  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
  __m256d result = _mm256_fmaddsub_pd(a_odd, b_swap, _mm256_fmaddsub_pd(a_even, b.v, c.v));
  return Packet2cd(result);
}
template <>
EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
  __m256d a_even = _mm256_movedup_pd(a.v);
  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
  __m256d result = _mm256_fmaddsub_pd(a_odd, b_swap, _mm256_fmsubadd_pd(a_even, b.v, c.v));
  return Packet2cd(result);
}
#endif  // EIGEN_VECTORIZE_FMA

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_COMPLEX_AVX_H