// Include guard and tuning knobs for the AVX512 packet-math kernels.
10#ifndef EIGEN_PACKET_MATH_AVX512_H
11#define EIGEN_PACKET_MATH_AVX512_H
14#include "../../InternalHeaderCheck.h"
// Product-blocking threshold; overridable by the user before inclusion.
20#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
21#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
// AVX512 exposes 32 zmm registers (zmm0-zmm31).
24#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
25#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
// Advertise fused multiply-add when the FMA ISA is enabled.
28#ifdef EIGEN_VECTORIZE_FMA
29#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
30#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
// Packet aliases: raw 512-bit vector types for float/int32/double,
// and eigen_packet_wrapper-tagged types where the same raw __m512i/__m256i
// must map to distinct C++ types for overload resolution.
34typedef __m512 Packet16f;
35typedef __m512i Packet16i;
36typedef __m512d Packet8d;
37typedef eigen_packet_wrapper<__m512i, 1> Packet8l;
// Half/bfloat16 packets are only defined here when AVX512FP16 is absent.
38#ifndef EIGEN_VECTORIZE_AVX512FP16
39typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
41typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
// 16-bit signed integer helper packets (tag 6).
43typedef eigen_packet_wrapper<__m512i, 6> Packet32s;
44typedef eigen_packet_wrapper<__m256i, 6> Packet16s;
45typedef eigen_packet_wrapper<__m128i, 6> Packet8s;
// Mark the raw AVX512 vector types (and wrappers) as "arithmetic" so that
// Eigen's internal traits treat them as valid scalar-like packet carriers.
48struct is_arithmetic<__m512> {
49 enum { value =
true };
52struct is_arithmetic<__m512i> {
53 enum { value =
true };
56struct is_arithmetic<__m512d> {
57 enum { value =
true };
60struct is_arithmetic<Packet8l> {
61 enum { value =
true };
// Packet16h only exists when AVX512FP16 is not available (see typedefs).
64#ifndef EIGEN_VECTORIZE_AVX512FP16
66struct is_arithmetic<Packet16h> {
67 enum { value =
true };
// packet_traits<T>: per-scalar capability tables. `type` is the full-width
// packet, `half` the half-width packet used for tail handling. The Has*
// flags gated on EIGEN_FAST_MATH enable vectorized transcendental paths
// only when fast-math approximations are acceptable.
71struct packet_traits<half> : default_packet_traits {
72 typedef Packet16h type;
// NOTE(review): half's "half packet" is itself Packet16h here — presumably
// intentional (no narrower half packet defined); confirm against AVX code path.
74 typedef Packet16h half;
99 HasSin = EIGEN_FAST_MATH,
100 HasCos = EIGEN_FAST_MATH,
101 HasTanh = EIGEN_FAST_MATH,
102 HasErf = EIGEN_FAST_MATH,
109struct packet_traits<float> : default_packet_traits {
110 typedef Packet16f type;
111 typedef Packet8f half;
122 HasSin = EIGEN_FAST_MATH,
123 HasCos = EIGEN_FAST_MATH,
138 HasReciprocal = EIGEN_FAST_MATH,
139 HasTanh = EIGEN_FAST_MATH,
140 HasErf = EIGEN_FAST_MATH,
141 HasErfc = EIGEN_FAST_MATH,
147struct packet_traits<double> : default_packet_traits {
148 typedef Packet8d type;
149 typedef Packet4d half;
158 HasSin = EIGEN_FAST_MATH,
159 HasCos = EIGEN_FAST_MATH,
166 HasTanh = EIGEN_FAST_MATH,
167 HasErf = EIGEN_FAST_MATH,
168 HasErfc = EIGEN_FAST_MATH,
// Integer packets: division is emulated for int32 (HasDiv = 1, see pdiv),
// blend is disabled for Packet16i (HasBlend = 0).
176struct packet_traits<int> : default_packet_traits {
177 typedef Packet16i type;
178 typedef Packet8i half;
179 enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
183struct packet_traits<int64_t> : default_packet_traits {
184 typedef Packet8l type;
185 typedef Packet4l half;
186 enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 8 };
// unpacket_traits<P>: reverse mapping from packet to scalar plus metadata.
// mask_t is the integer type carrying one bit per lane for masked ops;
// the masked_*_available flags advertise the AVX512 masked load/store/fp
// entry points implemented further down in this file.
190struct unpacket_traits<Packet16f> {
192 typedef Packet8f half;
193 typedef Packet16i integer_packet;
// 16 lanes -> 16 mask bits.
194 typedef uint16_t mask_t;
199 masked_load_available =
true,
200 masked_store_available =
true,
201 masked_fpops_available =
true
205struct unpacket_traits<Packet8d> {
207 typedef Packet4d half;
208 typedef Packet8l integer_packet;
// 8 lanes -> 8 mask bits.
209 typedef uint8_t mask_t;
214 masked_load_available =
true,
215 masked_store_available =
true,
216 masked_fpops_available =
true
// Integer packets: no masked load/store paths are provided here.
220struct unpacket_traits<Packet16i> {
222 typedef Packet8i half;
227 masked_load_available =
false,
228 masked_store_available =
false
233struct unpacket_traits<Packet8l> {
234 typedef int64_t type;
235 typedef Packet4l half;
240 masked_load_available =
false,
241 masked_store_available =
false
245#ifndef EIGEN_VECTORIZE_AVX512FP16
247struct unpacket_traits<Packet16h> {
248 typedef Eigen::half type;
249 typedef Packet8h half;
254 masked_load_available =
false,
255 masked_store_available =
false
// 16-bit helper packets are internal-only: not vectorizable from the
// generic dispatch's point of view.
261struct unpacket_traits<Packet32s> {
262 typedef numext::int16_t type;
263 typedef Packet16s half;
267 vectorizable =
false,
272struct unpacket_traits<Packet16s> {
273 typedef numext::int16_t type;
274 typedef Packet8s half;
278 vectorizable =
false,
// NOTE(review): Packet8s's half is declared as Packet8s (itself) — likely
// because no narrower packet exists; confirm intended.
283struct unpacket_traits<Packet8s> {
284 typedef numext::int16_t type;
285 typedef Packet8s half;
289 vectorizable =
false,
// pset1: broadcast a single scalar to every lane of the packet.
294EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(
const float& from) {
295 return _mm512_set1_ps(from);
298EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(
const double& from) {
299 return _mm512_set1_pd(from);
302EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(
const int& from) {
303 return _mm512_set1_epi32(from);
306EIGEN_STRONG_INLINE Packet8l pset1<Packet8l>(
const int64_t& from) {
307 return _mm512_set1_epi64(from);
// pset1frombits: broadcast a raw bit pattern, reinterpreted as float/double.
311EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(
unsigned int from) {
312 return _mm512_castsi512_ps(_mm512_set1_epi32(from));
316EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(
const numext::uint64_t from) {
317 return _mm512_castsi512_pd(_mm512_set1_epi64(from));
// pzero: all-zero packet; the argument only selects the overload.
321EIGEN_STRONG_INLINE Packet16f pzero(
const Packet16f& ) {
322 return _mm512_setzero_ps();
325EIGEN_STRONG_INLINE Packet8d pzero(
const Packet8d& ) {
326 return _mm512_setzero_pd();
329EIGEN_STRONG_INLINE Packet16i pzero(
const Packet16i& ) {
330 return _mm512_setzero_si512();
334EIGEN_STRONG_INLINE Packet8l pzero(
const Packet8l& ) {
335 return _mm512_setzero_si512();
// peven_mask: all-ones in even-indexed lanes, zero in odd ones.
// _mm512_set_epi32 lists arguments from the HIGHEST lane down, so the
// trailing -1 is lane 0; 64-bit variants emit two 32-bit words per lane.
339EIGEN_STRONG_INLINE Packet16f peven_mask(
const Packet16f& ) {
340 return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
343EIGEN_STRONG_INLINE Packet16i peven_mask(
const Packet16i& ) {
344 return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
347EIGEN_STRONG_INLINE Packet8d peven_mask(
const Packet8d& ) {
348 return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1));
351EIGEN_STRONG_INLINE Packet8l peven_mask(
const Packet8l& ) {
352 return _mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1);
// pload1: load one scalar from memory and broadcast it to all lanes.
// On GCC/Clang an inline-asm vbroadcastss/sd is used — presumably to force
// a single memory-operand broadcast instruction (TODO confirm rationale
// against the original file's comment, which is not visible here).
356EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(
const float* from) {
357#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
// `ret` is declared on a line not visible in this extraction.
361 __asm__(
"vbroadcastss %[mem], %[dst]" : [dst]
"=v"(ret) : [mem]
"m"(*from));
// Non-GNU fallback: scalar load + broadcast via intrinsics.
364 return _mm512_broadcastss_ps(_mm_load_ps1(from));
368EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(
const double* from) {
369#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
371 __asm__(
"vbroadcastsd %[mem], %[dst]" : [dst]
"=v"(ret) : [mem]
"m"(*from));
374 return _mm512_set1_pd(*from);
// plset: returns {a, a+1, a+2, ...} — a broadcast plus a constant ramp.
379EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(
const float& a) {
380 return _mm512_add_ps(_mm512_set1_ps(a), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f,
381 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
384EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(
const double& a) {
385 return _mm512_add_pd(_mm512_set1_pd(a), _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
388EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(
const int& a) {
389 return _mm512_add_epi32(_mm512_set1_epi32(a), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
392EIGEN_STRONG_INLINE Packet8l plset<Packet8l>(
const int64_t& a) {
393 return _mm512_add_epi64(_mm512_set1_epi64(a), _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0));
// Lane-wise addition.
397EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
398 return _mm512_add_ps(a, b);
401EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
402 return _mm512_add_pd(a, b);
405EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
406 return _mm512_add_epi32(a, b);
409EIGEN_STRONG_INLINE Packet8l padd<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
410 return _mm512_add_epi64(a, b);
// Masked addition: lanes whose mask bit is clear are zeroed (maskz).
414EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(
const Packet16f& a,
const Packet16f& b, uint16_t umask) {
415 __mmask16 mask =
static_cast<__mmask16
>(umask);
416 return _mm512_maskz_add_ps(mask, a, b);
419EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(
const Packet8d& a,
const Packet8d& b, uint8_t umask) {
420 __mmask8 mask =
static_cast<__mmask8
>(umask);
421 return _mm512_maskz_add_pd(mask, a, b);
// Lane-wise subtraction (a - b).
425EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
426 return _mm512_sub_ps(a, b);
429EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
430 return _mm512_sub_pd(a, b);
433EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
434 return _mm512_sub_epi32(a, b);
437EIGEN_STRONG_INLINE Packet8l psub<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
438 return _mm512_sub_epi64(a, b);
// Float negation via sign-bit XOR: preserves the sign of zeros and NaN
// payloads, unlike a 0 - a subtraction.
442EIGEN_STRONG_INLINE Packet16f pnegate(
const Packet16f& a) {
// `mask` (all lanes = sign bit) is declared on a line not visible here.
447 _mm512_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
448 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
449 return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
452EIGEN_STRONG_INLINE Packet8d pnegate(
const Packet8d& a) {
454 _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
455 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
456 return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
// Integer negation: 0 - a.
459EIGEN_STRONG_INLINE Packet16i pnegate(
const Packet16i& a) {
460 return _mm512_sub_epi32(_mm512_setzero_si512(), a);
463EIGEN_STRONG_INLINE Packet8l pnegate(
const Packet8l& a) {
464 return _mm512_sub_epi64(_mm512_setzero_si512(), a);
// pconj: identity for real packets (bodies `return a;` are on lines not
// visible in this extraction).
468EIGEN_STRONG_INLINE Packet16f pconj(
const Packet16f& a) {
472EIGEN_STRONG_INLINE Packet8d pconj(
const Packet8d& a) {
476EIGEN_STRONG_INLINE Packet16i pconj(
const Packet16i& a) {
480EIGEN_STRONG_INLINE Packet8l pconj(
const Packet8l& a) {
// Lane-wise multiplication. For int32, mullo keeps the low 32 bits.
485EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
486 return _mm512_mul_ps(a, b);
489EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
490 return _mm512_mul_pd(a, b);
493EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
494 return _mm512_mullo_epi32(a, b);
// 64-bit low multiply needs AVX512DQ; otherwise use the slower
// _mm512_mullox_epi64 emulation.
497EIGEN_STRONG_INLINE Packet8l pmul<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
498#ifdef EIGEN_VECTORIZE_AVX512DQ
499 return _mm512_mullo_epi64(a, b);
501 return _mm512_mullox_epi64(a, b);
// Lane-wise division.
506EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
507 return _mm512_div_ps(a, b);
511EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
512 return _mm512_div_pd(a, b);
// No 512-bit integer divide exists: split into two 256-bit halves and
// delegate to the AVX Packet8i pdiv, then reassemble.
516EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
517 Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
518 Packet8i q_hi = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
519 return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1);
// Fused multiply-add family (single-rounding), only when FMA is enabled:
// pmadd = a*b+c, pmsub = a*b-c, pnmadd = c-a*b, pnmsub = -a*b-c.
522#ifdef EIGEN_VECTORIZE_FMA
524EIGEN_STRONG_INLINE Packet16f pmadd(
const Packet16f& a,
const Packet16f& b,
const Packet16f& c) {
525 return _mm512_fmadd_ps(a, b, c);
528EIGEN_STRONG_INLINE Packet8d pmadd(
const Packet8d& a,
const Packet8d& b,
const Packet8d& c) {
529 return _mm512_fmadd_pd(a, b, c);
533EIGEN_STRONG_INLINE Packet16f pmsub(
const Packet16f& a,
const Packet16f& b,
const Packet16f& c) {
534 return _mm512_fmsub_ps(a, b, c);
537EIGEN_STRONG_INLINE Packet8d pmsub(
const Packet8d& a,
const Packet8d& b,
const Packet8d& c) {
538 return _mm512_fmsub_pd(a, b, c);
542EIGEN_STRONG_INLINE Packet16f pnmadd(
const Packet16f& a,
const Packet16f& b,
const Packet16f& c) {
543 return _mm512_fnmadd_ps(a, b, c);
546EIGEN_STRONG_INLINE Packet8d pnmadd(
const Packet8d& a,
const Packet8d& b,
const Packet8d& c) {
547 return _mm512_fnmadd_pd(a, b, c);
551EIGEN_STRONG_INLINE Packet16f pnmsub(
const Packet16f& a,
const Packet16f& b,
const Packet16f& c) {
552 return _mm512_fnmsub_ps(a, b, c);
555EIGEN_STRONG_INLINE Packet8d pnmsub(
const Packet8d& a,
const Packet8d& b,
const Packet8d& c) {
556 return _mm512_fnmsub_pd(a, b, c);
// pselect(mask, a, b): per-lane select — a where the lane of `mask` is
// non-zero, b where it is zero. The vector mask is first reduced to a
// k-mask by comparing against zero; _mm512_mask_blend_* picks `b` for SET
// mask bits, so testing for equality-with-zero yields the right polarity.
561EIGEN_DEVICE_FUNC
inline Packet16f pselect(
const Packet16f& mask,
const Packet16f& a,
const Packet16f& b) {
562 __mmask16 mask16 = _mm512_cmpeq_epi32_mask(_mm512_castps_si512(mask), _mm512_setzero_epi32());
563 return _mm512_mask_blend_ps(mask16, a, b);
567EIGEN_DEVICE_FUNC
inline Packet16i pselect(
const Packet16i& mask,
const Packet16i& a,
const Packet16i& b) {
568 __mmask16 mask16 = _mm512_cmpeq_epi32_mask(mask, _mm512_setzero_epi32());
569 return _mm512_mask_blend_epi32(mask16, a, b);
573EIGEN_DEVICE_FUNC
inline Packet8l pselect(
const Packet8l& mask,
const Packet8l& a,
const Packet8l& b) {
574 __mmask8 mask8 = _mm512_cmpeq_epi64_mask(mask, _mm512_setzero_si512());
575 return _mm512_mask_blend_epi64(mask8, a, b);
579EIGEN_DEVICE_FUNC
inline Packet8d pselect(
const Packet8d& mask,
const Packet8d& a,
const Packet8d& b) {
580 __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
581 return _mm512_mask_blend_pd(mask8, a, b);
// pmin/pmax: lane-wise minimum/maximum. The operands are deliberately
// passed as (b, a): for floats, min/max return the SECOND operand when
// either input is NaN, so this ordering fixes which argument wins —
// matching the NaN semantics the propagate variants below rely on.
585EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
587 return _mm512_min_ps(b, a);
590EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
592 return _mm512_min_pd(b, a);
595EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
596 return _mm512_min_epi32(b, a);
599EIGEN_STRONG_INLINE Packet8l pmin<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
600 return _mm512_min_epi64(b, a);
604EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
606 return _mm512_max_ps(b, a);
609EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
611 return _mm512_max_pd(b, a);
614EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
615 return _mm512_max_epi32(b, a);
618EIGEN_STRONG_INLINE Packet8l pmax<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
619 return _mm512_max_epi64(b, a);
// NaN-policy variants, built on the generic helpers:
// PropagateNumbers — a NaN lane loses to a number; PropagateNaN — a NaN
// lane wins.
624EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(
const Packet16f& a,
const Packet16f& b) {
625 return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
628EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(
const Packet8d& a,
const Packet8d& b) {
629 return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
632EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(
const Packet16f& a,
const Packet16f& b) {
633 return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
636EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(
const Packet8d& a,
const Packet8d& b) {
637 return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
640EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(
const Packet16f& a,
const Packet16f& b) {
641 return pminmax_propagate_nan(a, b, pmin<Packet16f>);
644EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(
const Packet8d& a,
const Packet8d& b) {
645 return pminmax_propagate_nan(a, b, pmin<Packet8d>);
648EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(
const Packet16f& a,
const Packet16f& b) {
649 return pminmax_propagate_nan(a, b, pmax<Packet16f>);
652EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(
const Packet8d& a,
const Packet8d& b) {
653 return pminmax_propagate_nan(a, b, pmax<Packet8d>);
// Half-vector extraction/concatenation helpers. With AVX512DQ the native
// f32x8/f64x2 instructions are used; otherwise the same effect is obtained
// by routing through the integer domain with bitcasts (cast + extracti).
// (`I_` is a template parameter declared on lines not visible here.)
656#ifdef EIGEN_VECTORIZE_AVX512DQ
658EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
659 return _mm512_extractf32x8_ps(x, I_);
662EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
663 return _mm512_extractf64x2_pd(x, I_);
665EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
666 return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
668EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
669 return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1);
// Non-DQ fallbacks: identical results via integer-domain moves.
674EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
675 return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(x), I_));
680EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
681 return _mm_castsi128_pd(_mm512_extracti32x4_epi32(_mm512_castpd_si512(x), I_));
684EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
685 return _mm512_castsi512_ps(
686 _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), _mm256_castps_si256(b), 1));
688EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
689 return _mm512_inserti64x4(_mm512_castsi256_si512(a), b, 1);
// Pack32To16: narrow 16 x int32 (bit-cast from the float packet) to
// 16 x int16 with signed saturation, preserving lane order.
695EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
703 __m256i lo = _mm256_castps_si256(extract256<0>(rf));
704 __m256i hi = _mm256_castps_si256(extract256<1>(rf));
705 __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), _mm256_extractf128_si256(lo, 1));
706 __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), _mm256_extractf128_si256(hi, 1));
707 return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
// Comparisons return a full-width packet whose lanes are all-ones (true)
// or all-zero (false): the AVX512 k-mask is expanded back to a vector
// with mask_set1. Ordered (OQ) predicates return false when a NaN is
// involved; _CMP_NGE_UQ (lt_or_nan) returns true for NaN lanes.
711EIGEN_STRONG_INLINE Packet16f pisnan(
const Packet16f& a) {
// a != a (unordered self-compare) is true exactly for NaN lanes.
712 __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_UNORD_Q);
713 return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(mask, int32_t(-1)));
717EIGEN_STRONG_INLINE Packet16f pcmp_eq(
const Packet16f& a,
const Packet16f& b) {
718 __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
719 return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
722EIGEN_STRONG_INLINE Packet16f pcmp_le(
const Packet16f& a,
const Packet16f& b) {
723 __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
724 return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
728EIGEN_STRONG_INLINE Packet16f pcmp_lt(
const Packet16f& a,
const Packet16f& b) {
729 __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
730 return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
734EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(
const Packet16f& a,
const Packet16f& b) {
735 __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
736 return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
// Signed 32-bit integer comparisons.
740EIGEN_STRONG_INLINE Packet16i pcmp_eq(
const Packet16i& a,
const Packet16i& b) {
741 __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
742 return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
745EIGEN_STRONG_INLINE Packet16i pcmp_le(
const Packet16i& a,
const Packet16i& b) {
746 __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
747 return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
750EIGEN_STRONG_INLINE Packet16i pcmp_lt(
const Packet16i& a,
const Packet16i& b) {
751 __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
752 return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
// Signed 64-bit integer comparisons.
756EIGEN_STRONG_INLINE Packet8l pcmp_eq(
const Packet8l& a,
const Packet8l& b) {
757 __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ);
758 return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
761EIGEN_STRONG_INLINE Packet8l pcmp_le(
const Packet8l& a,
const Packet8l& b) {
762 __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LE);
763 return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
766EIGEN_STRONG_INLINE Packet8l pcmp_lt(
const Packet8l& a,
const Packet8l& b) {
767 __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT);
768 return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
// Double-precision comparisons.
772EIGEN_STRONG_INLINE Packet8d pcmp_eq(
const Packet8d& a,
const Packet8d& b) {
773 __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
774 return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
777EIGEN_STRONG_INLINE Packet8d pcmp_le(
const Packet8d& a,
const Packet8d& b) {
778 __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
779 return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
782EIGEN_STRONG_INLINE Packet8d pcmp_lt(
const Packet8d& a,
const Packet8d& b) {
783 __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
784 return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
787EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(
const Packet8d& a,
const Packet8d& b) {
788 __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
789 return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
// print: round to integral value using the CURRENT rounding mode
// (typically round-to-nearest-even).
793EIGEN_STRONG_INLINE Packet16f print<Packet16f>(
const Packet16f& a) {
794 return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION);
797EIGEN_STRONG_INLINE Packet8d print<Packet8d>(
const Packet8d& a) {
798 return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION);
// pceil / pfloor / ptrunc: directed rounding via roundscale immediates.
802EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(
const Packet16f& a) {
803 return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF);
806EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(
const Packet8d& a) {
807 return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF);
811EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(
const Packet16f& a) {
812 return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF);
815EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(
const Packet8d& a) {
816 return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF);
820EIGEN_STRONG_INLINE Packet16f ptrunc<Packet16f>(
const Packet16f& a) {
821 return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);
824EIGEN_STRONG_INLINE Packet8d ptrunc<Packet8d>(
const Packet8d& a) {
825 return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO);
// ptrue: all bits set. Float/double versions reuse the integer version
// through bitcasts; the argument only selects the overload.
829EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(
const Packet16i& ) {
830 return _mm512_set1_epi32(int32_t(-1));
834EIGEN_STRONG_INLINE Packet8l ptrue<Packet8l>(
const Packet8l& ) {
835 return _mm512_set1_epi64(int64_t(-1));
839EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(
const Packet16f& a) {
840 return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
844EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(
const Packet8d& a) {
845 return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
// Bitwise AND / OR / XOR / ANDNOT. The 512-bit float/double bitwise
// instructions require AVX512DQ; without it, operands are bit-cast to the
// integer domain (or split into 256-bit halves for pand<Packet8d>).
// Note pandnot(a, b) computes a & ~b, hence the swapped intrinsic operands
// (_mm512_andnot computes ~first & second).
849EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
850 return _mm512_and_si512(a, b);
854EIGEN_STRONG_INLINE Packet8l pand<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
855 return _mm512_and_si512(a, b);
859EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
860#ifdef EIGEN_VECTORIZE_AVX512DQ
861 return _mm512_and_ps(a, b);
863 return _mm512_castsi512_ps(pand(_mm512_castps_si512(a), _mm512_castps_si512(b)));
867EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
868#ifdef EIGEN_VECTORIZE_AVX512DQ
869 return _mm512_and_pd(a, b);
// Non-DQ fallback: AND each 256-bit lane with AVX and recombine.
871 Packet8d res = _mm512_undefined_pd();
872 Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
873 Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
874 res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
876 Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
877 Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
878 return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
883EIGEN_STRONG_INLINE Packet16i por<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
884 return _mm512_or_si512(a, b);
888EIGEN_STRONG_INLINE Packet8l por<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
889 return _mm512_or_si512(a, b);
893EIGEN_STRONG_INLINE Packet16f por<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
894#ifdef EIGEN_VECTORIZE_AVX512DQ
895 return _mm512_or_ps(a, b);
897 return _mm512_castsi512_ps(por(_mm512_castps_si512(a), _mm512_castps_si512(b)));
902EIGEN_STRONG_INLINE Packet8d por<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
903#ifdef EIGEN_VECTORIZE_AVX512DQ
904 return _mm512_or_pd(a, b);
906 return _mm512_castsi512_pd(por(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
911EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
912 return _mm512_xor_si512(a, b);
916EIGEN_STRONG_INLINE Packet8l pxor<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
917 return _mm512_xor_si512(a, b);
921EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
922#ifdef EIGEN_VECTORIZE_AVX512DQ
923 return _mm512_xor_ps(a, b);
925 return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a), _mm512_castps_si512(b)));
930EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
931#ifdef EIGEN_VECTORIZE_AVX512DQ
932 return _mm512_xor_pd(a, b);
934 return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
939EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(
const Packet16i& a,
const Packet16i& b) {
940 return _mm512_andnot_si512(b, a);
944EIGEN_STRONG_INLINE Packet8l pandnot<Packet8l>(
const Packet8l& a,
const Packet8l& b) {
945 return _mm512_andnot_si512(b, a);
949EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
950#ifdef EIGEN_VECTORIZE_AVX512DQ
951 return _mm512_andnot_ps(b, a);
953 return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a), _mm512_castps_si512(b)));
957EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(
const Packet8d& a,
const Packet8d& b) {
958#ifdef EIGEN_VECTORIZE_AVX512DQ
959 return _mm512_andnot_pd(b, a);
961 return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
// pround: round-half-away-from-zero. roundscale only offers half-to-even,
// so add the largest float strictly below 0.5 (with the sign of `a`) and
// truncate — this yields away-from-zero ties without double-rounding.
966EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(
const Packet16f& a) {
// mask = sign bit; prev0dot5 = 0.49999997f (largest float < 0.5).
968 const Packet16f mask = pset1frombits<Packet16f>(
static_cast<numext::uint32_t
>(0x80000000u));
969 const Packet16f prev0dot5 = pset1frombits<Packet16f>(
static_cast<numext::uint32_t
>(0x3EFFFFFFu));
970 return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
973EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(
const Packet8d& a) {
// Same trick in double precision: 0x3FDFFFFFFFFFFFFF is the largest
// double strictly below 0.5.
975 const Packet8d mask = pset1frombits<Packet8d>(
static_cast<numext::uint64_t
>(0x8000000000000000ull));
976 const Packet8d prev0dot5 = pset1frombits<Packet8d>(
static_cast<numext::uint64_t
>(0x3FDFFFFFFFFFFFFFull));
977 return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
// Compile-time-count shifts (N is a template parameter declared on lines
// not visible here): arithmetic right, logical right, logical left.
981EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
982 return _mm512_srai_epi32(a, N);
986EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
987 return _mm512_srli_epi32(a, N);
991EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
992 return _mm512_slli_epi32(a, N);
996EIGEN_STRONG_INLINE Packet8l parithmetic_shift_right(Packet8l a) {
997 return _mm512_srai_epi64(a, N);
1001EIGEN_STRONG_INLINE Packet8l plogical_shift_right(Packet8l a) {
1002 return _mm512_srli_epi64(a, N);
1006EIGEN_STRONG_INLINE Packet8l plogical_shift_left(Packet8l a) {
1007 return _mm512_slli_epi64(a, N);
// pload: 64-byte-aligned full-packet loads.
1011EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(
const float* from) {
1012 EIGEN_DEBUG_ALIGNED_LOAD
return _mm512_load_ps(from);
1015EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(
const double* from) {
1016 EIGEN_DEBUG_ALIGNED_LOAD
return _mm512_load_pd(from);
1019EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(
const int* from) {
// load_epi64 vs load_epi32 is immaterial here: both are the same plain
// 512-bit aligned load; element width only matters for masked forms.
1020 EIGEN_DEBUG_ALIGNED_LOAD
return _mm512_load_epi64(from);
1023EIGEN_STRONG_INLINE Packet8l pload<Packet8l>(
const int64_t* from) {
1024 EIGEN_DEBUG_ALIGNED_LOAD
return _mm512_load_epi64(from);
// ploadu: unaligned full-packet loads.
1028EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(
const float* from) {
1029 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm512_loadu_ps(from);
1032EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(
const double* from) {
1033 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm512_loadu_pd(from);
1036EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(
const int* from) {
1037 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm512_loadu_epi32(from);
1040EIGEN_STRONG_INLINE Packet8l ploadu<Packet8l>(
const int64_t* from) {
1041 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm512_loadu_epi64(from);
// Masked unaligned loads: lanes with a clear mask bit are zero-filled
// (maskz) and their memory is not touched, so partial packets at the end
// of a buffer can be loaded safely.
1045EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(
const float* from, uint16_t umask) {
1046 __mmask16 mask =
static_cast<__mmask16
>(umask);
1047 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm512_maskz_loadu_ps(mask, from);
1050EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(
const double* from, uint8_t umask) {
1051 __mmask8 mask =
static_cast<__mmask8
>(umask);
1052 EIGEN_DEBUG_UNALIGNED_LOAD
return _mm512_maskz_loadu_pd(mask, from);
// ploaddup: load half a packet's worth of scalars and duplicate each into
// adjacent lanes: {a0,a0,a1,a1,...}. Uses unaligned loads — callers do not
// guarantee packet alignment for ploaddup.
1058EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(
const float* from) {
// Widen each 32-bit element into a 64-bit slot (zero-extended), then
// duplicate the low word of every 64-bit pair in-lane.
1061 __m256i low_half = _mm256_castps_si256(_mm256_loadu_ps(from));
1062 __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
1063 __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
// (the `return` for this overload is on a line not visible here)
1070EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(
const double* from) {
// Cross-lane permute {0,0,1,1,2,2,3,3} duplicates the 4 loaded doubles.
1071 Packet8d tmp = _mm512_castpd256_pd512(ploadu<Packet4d>(from));
1072 const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);
1073 return _mm512_permutexvar_pd(scatter_mask, tmp);
1079EIGEN_STRONG_INLINE Packet8l ploaddup<Packet8l>(
const int64_t* from) {
1080 Packet8l tmp = _mm512_castsi256_si512(ploadu<Packet4l>(from));
1081 const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);
1082 return _mm512_permutexvar_epi64(scatter_mask, tmp);
1088EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(
const int* from) {
1089 __m256i low_half = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(from));
1090 __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
1091 __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
1092 return _mm512_castps_si512(pairs);
// ploadquad: load a quarter packet of scalars and replicate each one four
// times: {a0,a0,a0,a0, a1,a1,a1,a1, ...}.
1098EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(
const float* from) {
1099 Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
1100 const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
1101 return _mm512_permutexvar_ps(scatter_mask, tmp);
// Double/int64 variants: broadcast each of the two scalars into a 256-bit
// lane and stitch the two lanes together.
1107EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(
const double* from) {
1108 __m256d lane0 = _mm256_set1_pd(*from);
1109 __m256d lane1 = _mm256_set1_pd(*(from + 1));
1110 __m512d tmp = _mm512_undefined_pd();
1111 tmp = _mm512_insertf64x4(tmp, lane0, 0);
1112 return _mm512_insertf64x4(tmp, lane1, 1);
1118EIGEN_STRONG_INLINE Packet8l ploadquad<Packet8l>(
const int64_t* from) {
1119 __m256i lane0 = _mm256_set1_epi64x(*from);
1120 __m256i lane1 = _mm256_set1_epi64x(*(from + 1));
1121 __m512i tmp = _mm512_undefined_epi32();
1122 tmp = _mm512_inserti64x4(tmp, lane0, 0);
1123 return _mm512_inserti64x4(tmp, lane1, 1);
1129EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(
const int* from) {
1130 Packet16i tmp = _mm512_castsi128_si512(ploadu<Packet4i>(from));
1131 const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
1132 return _mm512_permutexvar_epi32(scatter_mask, tmp);
// pstore: 64-byte-aligned full-packet stores.
1136EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet16f& from) {
1137 EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
1140EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet8d& from) {
1141 EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
1144EIGEN_STRONG_INLINE
void pstore<int>(
int* to,
const Packet16i& from) {
1145 EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi32(to, from);
1148EIGEN_STRONG_INLINE
void pstore<int64_t>(int64_t* to,
const Packet8l& from) {
1149 EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi64(to, from);
// pstoreu: unaligned full-packet stores.
1153EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet16f& from) {
1154 EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
1157EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet8d& from) {
1158 EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
1161EIGEN_STRONG_INLINE
void pstoreu<int>(
int* to,
const Packet16i& from) {
1162 EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi32(to, from);
1165EIGEN_STRONG_INLINE
void pstoreu<int64_t>(int64_t* to,
const Packet8l& from) {
1166 EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi64(to, from);
// Masked unaligned stores: only lanes with a set mask bit are written,
// so a partial packet can be stored at the end of a buffer without
// touching memory past it.
1169EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet16f& from, uint16_t umask) {
1170 __mmask16 mask =
static_cast<__mmask16
>(umask);
1171 EIGEN_DEBUG_UNALIGNED_STORE
return _mm512_mask_storeu_ps(to, mask, from);
1174EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet8d& from, uint8_t umask) {
1175 __mmask8 mask =
static_cast<__mmask8
>(umask);
1176 EIGEN_DEBUG_UNALIGNED_STORE
return _mm512_mask_storeu_pd(to, mask, from);
// Forward declaration of the masked strided-gather primitive.
1179template <
typename Scalar,
typename Packet>
1180EIGEN_DEVICE_FUNC
inline Packet pgather(
const Packet& src,
const Scalar* from,
Index stride,
1181 typename unpacket_traits<Packet>::mask_t umask);
// Masked gather: element i reads from[i*stride] when mask bit i is set,
// otherwise keeps the lane from `src`. Indices are built as int32
// (i * stride); scale is the element size in bytes (4 / 8).
// NOTE(review): i*stride is computed in 32 bits — assumes the total byte
// offset fits an int32; confirm against Eigen's Index-range guarantees.
1183EIGEN_DEVICE_FUNC
inline Packet16f pgather<float, Packet16f>(
const Packet16f& src,
const float* from,
Index stride,
1185 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1186 Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1187 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1188 __mmask16 mask =
static_cast<__mmask16
>(umask);
1190 return _mm512_mask_i32gather_ps(src, mask, indices, from, 4);
1193EIGEN_DEVICE_FUNC
inline Packet8d pgather<double, Packet8d>(
const Packet8d& src,
const double* from,
Index stride,
1195 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1196 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1197 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1198 __mmask8 mask =
static_cast<__mmask8
>(umask);
1200 return _mm512_mask_i32gather_pd(src, mask, indices, from, 8);
// Unmasked gathers: every lane reads from[i*stride].
1204EIGEN_DEVICE_FUNC
inline Packet16f pgather<float, Packet16f>(
const float* from,
Index stride) {
1205 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1206 Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1207 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1209 return _mm512_i32gather_ps(indices, from, 4);
1212EIGEN_DEVICE_FUNC
inline Packet8d pgather<double, Packet8d>(
const double* from,
Index stride) {
1213 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1214 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1215 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1217 return _mm512_i32gather_pd(indices, from, 8);
1220EIGEN_DEVICE_FUNC
inline Packet8l pgather<int64_t, Packet8l>(
const int64_t* from,
Index stride) {
1221 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1222 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1223 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1225 return _mm512_i32gather_epi64(indices, from, 8);
1228EIGEN_DEVICE_FUNC
inline Packet16i pgather<int, Packet16i>(
const int* from,
Index stride) {
1229 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1230 Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1231 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1232 return _mm512_i32gather_epi32(indices, from, 4);
// Masked strided scatter: forward declaration; per-packet specializations follow.
// Lanes whose umask bit is 0 are not written.
template <typename Scalar, typename Packet>
EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index stride,
                                       typename unpacket_traits<Packet>::mask_t umask);
1239EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet16f>(
float* to,
const Packet16f& from,
Index stride,
1241 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1242 Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1243 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1244 __mmask16 mask =
static_cast<__mmask16
>(umask);
1245 _mm512_mask_i32scatter_ps(to, mask, indices, from, 4);
1248EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet8d>(
double* to,
const Packet8d& from,
Index stride,
1250 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1251 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1252 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1253 __mmask8 mask =
static_cast<__mmask8
>(umask);
1254 _mm512_mask_i32scatter_pd(to, mask, indices, from, 8);
1257EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet16f>(
float* to,
const Packet16f& from,
Index stride) {
1258 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1259 Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1260 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1261 _mm512_i32scatter_ps(to, indices, from, 4);
1264EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet8d>(
double* to,
const Packet8d& from,
Index stride) {
1265 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1266 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1267 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1268 _mm512_i32scatter_pd(to, indices, from, 8);
1271EIGEN_DEVICE_FUNC
inline void pscatter<int64_t, Packet8l>(int64_t* to,
const Packet8l& from,
Index stride) {
1272 Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
1273 Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1274 Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
1275 _mm512_i32scatter_epi64(to, indices, from, 8);
1278EIGEN_DEVICE_FUNC
inline void pscatter<int, Packet16i>(
int* to,
const Packet16i& from,
Index stride) {
1279 Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
1280 Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1281 Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
1282 _mm512_i32scatter_epi32(to, indices, from, 4);
1286EIGEN_STRONG_INLINE
void pstore1<Packet16f>(
float* to,
const float& a) {
1287 Packet16f pa = pset1<Packet16f>(a);
1291EIGEN_STRONG_INLINE
void pstore1<Packet8d>(
double* to,
const double& a) {
1292 Packet8d pa = pset1<Packet8d>(a);
1296EIGEN_STRONG_INLINE
void pstore1<Packet16i>(
int* to,
const int& a) {
1297 Packet16i pa = pset1<Packet16i>(a);
1301EIGEN_STRONG_INLINE
void pstore1<Packet8l>(int64_t* to,
const int64_t& a) {
1302 Packet8l pa = pset1<Packet8l>(a);
1307EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
1308 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1311EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
1312 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1315EIGEN_STRONG_INLINE
void prefetch<int>(
const int* addr) {
1316 _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
1320EIGEN_STRONG_INLINE
float pfirst<Packet16f>(
const Packet16f& a) {
1321 return _mm512_cvtss_f32(a);
1324EIGEN_STRONG_INLINE
double pfirst<Packet8d>(
const Packet8d& a) {
1325 return _mm512_cvtsd_f64(a);
1328EIGEN_STRONG_INLINE int64_t pfirst<Packet8l>(
const Packet8l& a) {
1329 int64_t x = _mm_extract_epi64_0(_mm512_extracti32x4_epi32(a, 0));
1333EIGEN_STRONG_INLINE
int pfirst<Packet16i>(
const Packet16i& a) {
1334#if EIGEN_GNUC_STRICT_LESS_THAN(11, 0, 0)
1335 return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
1337 return _mm512_cvtsi512_si32(a);
1342EIGEN_STRONG_INLINE Packet16f preverse(
const Packet16f& a) {
1343 return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
1347EIGEN_STRONG_INLINE Packet8d preverse(
const Packet8d& a) {
1348 return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
1352EIGEN_STRONG_INLINE Packet16i preverse(
const Packet16i& a) {
1353 return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
1357EIGEN_STRONG_INLINE Packet8l preverse(
const Packet8l& a) {
1358 return _mm512_permutexvar_epi64(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
1362EIGEN_STRONG_INLINE Packet16f pabs(
const Packet16f& a) {
1364 return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
1367EIGEN_STRONG_INLINE Packet8d pabs(
const Packet8d& a) {
1369 return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffff)));
1372EIGEN_STRONG_INLINE Packet16i pabs(
const Packet16i& a) {
1373 return _mm512_abs_epi32(a);
1376EIGEN_STRONG_INLINE Packet8l pabs(
const Packet8l& a) {
1377 return _mm512_abs_epi64(a);
1380#ifndef EIGEN_VECTORIZE_AVX512FP16
1382EIGEN_STRONG_INLINE Packet16h psignbit(
const Packet16h& a) {
1383 return _mm256_srai_epi16(a, 15);
1388EIGEN_STRONG_INLINE Packet16bf psignbit(
const Packet16bf& a) {
1389 return _mm256_srai_epi16(a, 15);
1392EIGEN_STRONG_INLINE Packet16f psignbit(
const Packet16f& a) {
1393 return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31));
1396EIGEN_STRONG_INLINE Packet8d psignbit(
const Packet8d& a) {
1397 return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63));
1401EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(
const Packet16f& a, Packet16f& exponent) {
1402 return pfrexp_generic(a, exponent);
1407EIGEN_STRONG_INLINE Packet8d pfrexp_generic_get_biased_exponent(
const Packet8d& a) {
1408 const Packet8d cst_exp_mask = pset1frombits<Packet8d>(
static_cast<uint64_t
>(0x7ff0000000000000ull));
1409#ifdef EIGEN_VECTORIZE_AVX512DQ
1410 return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
1412 return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
1417EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(
const Packet8d& a, Packet8d& exponent) {
1418 return pfrexp_generic(a, exponent);
1422EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(
const Packet16f& a,
const Packet16f& exponent) {
1423 return pldexp_generic(a, exponent);
1427EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(
const Packet8d& a,
const Packet8d& exponent) {
1429 const Packet8d max_exponent = pset1<Packet8d>(2099.0);
1430 const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
1433 const Packet8i bias = pset1<Packet8i>(1023);
1434 Packet8i b = parithmetic_shift_right<2>(e);
1437 const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
1438 Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
1439 Packet8i lo = _mm256_slli_epi64(hi, 52);
1440 hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
1441 Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
1442 Packet8d out = pmul(pmul(pmul(a, c), c), c);
1445 b = psub(psub(psub(e, b), b), b);
1446 hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
1447 lo = _mm256_slli_epi64(hi, 52);
1448 hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
1449 c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
#ifdef EIGEN_VECTORIZE_AVX512DQ
// Split a 512-bit register INPUT into two 256-bit halves, declaring
// OUTPUT##_0 (low half) and OUTPUT##_1 (high half). With DQ the 256-bit
// extract exists as a single instruction.
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)         \
  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0);  \
  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)

#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT)             \
  __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0);  \
  __m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1)
#else
// Plain AVX512F: emulate each 256-bit extract with two 128-bit extracts
// glued together via insertf128.
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                                                        \
  __m256 OUTPUT##_0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)),    \
                                           _mm512_extractf32x4_ps(INPUT, 1), 1);                        \
  __m256 OUTPUT##_1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)),    \
                                           _mm512_extractf32x4_ps(INPUT, 3), 1)

#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT)                                                            \
  __m256i OUTPUT##_0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \
                                               _mm512_extracti32x4_epi32(INPUT, 1), 1);                     \
  __m256i OUTPUT##_1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \
                                               _mm512_extracti32x4_epi32(INPUT, 3), 1)
#endif
#ifdef EIGEN_VECTORIZE_AVX512DQ
// Build a 512-bit register OUTPUT from two 256-bit halves:
// INPUTA becomes the low half, INPUTB the high half.
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);

#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
  OUTPUT = _mm512_inserti32x8(_mm512_castsi256_si512(INPUTA), INPUTB, 1);
#else
// Plain AVX512F: assemble OUTPUT from four 128-bit quarters.
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                   \
  OUTPUT = _mm512_undefined_ps();                                          \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);

#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB)                        \
  OUTPUT = _mm512_undefined_epi32();                                            \
  OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 0), 0);  \
  OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 1), 1);  \
  OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 0), 2);  \
  OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3);
#endif
1501EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(
const Packet16f& a) {
1502#ifdef EIGEN_VECTORIZE_AVX512DQ
1503 __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
1504 __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
1505 return _mm256_add_ps(lane0, lane1);
1507 __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
1508 __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
1509 __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
1510 __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
1511 __m128 sum0 = _mm_add_ps(lane0, lane2);
1512 __m128 sum1 = _mm_add_ps(lane1, lane3);
1513 return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
1517EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(
const Packet8d& a) {
1518 __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
1519 __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
1520 return _mm256_add_pd(lane0, lane1);
1523EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(
const Packet16i& a) {
1524#ifdef EIGEN_VECTORIZE_AVX512DQ
1525 __m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
1526 __m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
1527 return _mm256_add_epi32(lane0, lane1);
1529 __m128i lane0 = _mm512_extracti32x4_epi32(a, 0);
1530 __m128i lane1 = _mm512_extracti32x4_epi32(a, 1);
1531 __m128i lane2 = _mm512_extracti32x4_epi32(a, 2);
1532 __m128i lane3 = _mm512_extracti32x4_epi32(a, 3);
1533 __m128i sum0 = _mm_add_epi32(lane0, lane2);
1534 __m128i sum1 = _mm_add_epi32(lane1, lane3);
1535 return _mm256_inserti128_si256(_mm256_castsi128_si256(sum0), sum1, 1);
1540EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(
const Packet8l& a) {
1541 __m256i lane0 = _mm512_extracti64x4_epi64(a, 0);
1542 __m256i lane1 = _mm512_extracti64x4_epi64(a, 1);
1543 return _mm256_add_epi64(lane0, lane1);
// Pack two 8-float rows INPUT[INDEX] and INPUT[INDEX + STRIDE] into the single
// 16-float register OUTPUT[INDEX].
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
// In-place transpose of a 16x16 float block stored in 16 Packet16f rows.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
  // Stage 1: interleave adjacent row pairs at 32-bit granularity.
  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
  __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
  __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
  __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
  __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
  __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
  __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
  __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
  // Stage 2: combine groups of four rows at 64-bit granularity.
  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));

  // Stage 3: split each 512-bit row into 256-bit halves S*_0 / S*_1.
  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
  EIGEN_EXTRACT_8f_FROM_16f(S4, S4);
  EIGEN_EXTRACT_8f_FROM_16f(S5, S5);
  EIGEN_EXTRACT_8f_FROM_16f(S6, S6);
  EIGEN_EXTRACT_8f_FROM_16f(S7, S7);
  EIGEN_EXTRACT_8f_FROM_16f(S8, S8);
  EIGEN_EXTRACT_8f_FROM_16f(S9, S9);
  EIGEN_EXTRACT_8f_FROM_16f(S10, S10);
  EIGEN_EXTRACT_8f_FROM_16f(S11, S11);
  EIGEN_EXTRACT_8f_FROM_16f(S12, S12);
  EIGEN_EXTRACT_8f_FROM_16f(S13, S13);
  EIGEN_EXTRACT_8f_FROM_16f(S14, S14);
  EIGEN_EXTRACT_8f_FROM_16f(S15, S15);

  PacketBlock<Packet8f, 32> tmp;

  // Stage 4: 128-bit lane permutes producing the transposed 8-float rows.
  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
  tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
  tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
  tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
  tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
  tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
  tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
  tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);

  tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
  tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
  tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
  tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
  tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
  tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
  tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
  tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);

  tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
  tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
  tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
  tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
  tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
  tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
  tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
  tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);

  tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
  tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
  tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
  tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
  tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
  tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
  tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
  tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);

  // Stage 5: reassemble the 16-float output rows from 8-float halves.
  PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);

  PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);

  PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);

  PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
  PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
}
// Variant of PACK_OUTPUT used by the 4-row transpose: the two 8-float halves
// for OUTPUT[INDEX] live at INPUT[2*INDEX] and INPUT[2*INDEX + STRIDE].
#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
// Transpose an 8x16 block: after this call, kernel.packet[i] holds elements
// i, i+8 of the original columns, interleaved as required by Eigen's GEBP.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
  // Stage 1: interleave adjacent row pairs at 32-bit granularity.
  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);

  // Stage 2: interleave at 64-bit granularity via double-typed unpacks.
  kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
  kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
  kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
  kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
  kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
  kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
  kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
  kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));

  // Stage 3: rearrange 128-bit lanes across register pairs.
  T0 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0x44);
  T1 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0xee);
  T2 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0x44);
  T3 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0xee);
  T4 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0x44);
  T5 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0xee);
  T6 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0x44);
  T7 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0xee);

  // Stage 4: final 128-bit lane selection into the output registers.
  kernel.packet[0] = _mm512_shuffle_f32x4(T0, T2, 0x88);
  kernel.packet[2] = _mm512_shuffle_f32x4(T0, T2, 0xdd);
  kernel.packet[1] = _mm512_shuffle_f32x4(T4, T6, 0x88);
  kernel.packet[3] = _mm512_shuffle_f32x4(T4, T6, 0xdd);
  kernel.packet[4] = _mm512_shuffle_f32x4(T1, T3, 0x88);
  kernel.packet[6] = _mm512_shuffle_f32x4(T1, T3, 0xdd);
  kernel.packet[5] = _mm512_shuffle_f32x4(T5, T7, 0x88);
  kernel.packet[7] = _mm512_shuffle_f32x4(T5, T7, 0xdd);
}
// Transpose a 4x16 block held in 4 Packet16f rows.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
  // Stage 1: interleave the two row pairs at 32-bit granularity.
  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  // Stage 2: combine at 64-bit granularity.
  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));

  // Stage 3: split into 256-bit halves S*_0 / S*_1.
  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);

  PacketBlock<Packet8f, 8> tmp;

  // Stage 4: 128-bit lane permutes producing transposed 8-float rows.
  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
  tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
  tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
  tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);

  tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
  tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
  tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
  tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);

  // Stage 5: reassemble the four 16-float output rows.
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
  PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
}
// Pack two 4-double halves back into the 8-double OUTPUT[INDEX]
// (same-index variant: halves at INPUT[INDEX] and INPUT[INDEX + STRIDE]).
#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                   \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0);    \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);

// Paired variant: low half from INPUT[2*INDEX], high from INPUT[2*INDEX + STRIDE].
#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                          \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0);  \
  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);

// Same as PACK_OUTPUT_D but for 64-bit integer packets.
#define PACK_OUTPUT_L(OUTPUT, INPUT, INDEX, STRIDE)                          \
  OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0);  \
  OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
// Transpose a 4x8 block of doubles held in 4 Packet8d rows.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
  // Stage 1: interleave the two row pairs at 64-bit granularity.
  __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
  __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
  __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);

  PacketBlock<Packet4d, 8> tmp;

  // Stage 2: 128-bit lane permutes over the 256-bit halves of each row.
  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x20);
  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x20);
  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x31);
  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x31);

  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x20);
  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x20);
  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x31);
  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x31);

  // Stage 3: reassemble the four 8-double output rows.
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
  PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
}
// In-place transpose of an 8x8 double block held in 8 Packet8d rows.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
  // Stage 1: interleave adjacent row pairs at 64-bit granularity.
  __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
  __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
  __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
  __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);

  // Stage 2: swap 128-bit pairs (permutex 0x4E) and blend with mask 0xCC to
  // exchange the 2x2 sub-blocks between row pairs.
  kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E);
  kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]);
  kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E);
  kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2);
  kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E);
  kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]);
  kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E);
  kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3);
  kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E);
  kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]);
  kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E);
  kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6);
  kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E);
  kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]);
  kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E);
  kernel.packet[7] = _mm512_mask_blend_pd(0xCC, kernel.packet[7], T7);

  // Stage 3: swap 256-bit halves (shuffle 0x4E) and blend with mask 0xF0 to
  // exchange the 4x4 sub-blocks between the upper and lower half of the block.
  T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E);
  T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0);
  T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E);
  T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]);
  T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E);
  T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1);
  T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E);
  T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]);
  T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E);
  T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2);
  T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E);
  T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]);
  T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E);
  T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3);
  T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E);
  T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]);

  kernel.packet[0] = T0;
  kernel.packet[1] = T1;
  kernel.packet[2] = T2;
  kernel.packet[3] = T3;
  kernel.packet[4] = T4;
  kernel.packet[5] = T5;
  kernel.packet[6] = T6;
  kernel.packet[7] = T7;
}
// Transpose a 4x8 block of int64 held in 4 Packet8l rows. Mirrors the
// Packet8d version, routing through double-typed shuffles via bit casts.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 4>& kernel) {
  // Stage 1: interleave the two row pairs at 64-bit granularity.
  __m512i T0 = _mm512_castpd_si512(
      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0));
  __m512i T1 = _mm512_castpd_si512(
      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0xff));
  __m512i T2 = _mm512_castpd_si512(
      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0));
  __m512i T3 = _mm512_castpd_si512(
      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0xff));

  PacketBlock<Packet4l, 8> tmp;

  // Stage 2: 128-bit lane permutes over the 256-bit halves of each row.
  tmp.packet[0] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x20);
  tmp.packet[1] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x20);
  tmp.packet[2] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x31);
  tmp.packet[3] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x31);

  tmp.packet[4] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x20);
  tmp.packet[5] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x20);
  tmp.packet[6] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x31);
  tmp.packet[7] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x31);

  // Stage 3: reassemble the four 8-int64 output rows.
  PACK_OUTPUT_L(kernel.packet, tmp.packet, 0, 1);
  PACK_OUTPUT_L(kernel.packet, tmp.packet, 1, 1);
  PACK_OUTPUT_L(kernel.packet, tmp.packet, 2, 1);
  PACK_OUTPUT_L(kernel.packet, tmp.packet, 3, 1);
}
1853EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet8l, 8>& kernel) {
1854 __m512i T0 = _mm512_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
1855 __m512i T1 = _mm512_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
1856 __m512i T2 = _mm512_unpacklo_epi64(kernel.packet[2], kernel.packet[3]);
1857 __m512i T3 = _mm512_unpackhi_epi64(kernel.packet[2], kernel.packet[3]);
1858 __m512i T4 = _mm512_unpacklo_epi64(kernel.packet[4], kernel.packet[5]);
1859 __m512i T5 = _mm512_unpackhi_epi64(kernel.packet[4], kernel.packet[5]);
1860 __m512i T6 = _mm512_unpacklo_epi64(kernel.packet[6], kernel.packet[7]);
1861 __m512i T7 = _mm512_unpackhi_epi64(kernel.packet[6], kernel.packet[7]);
1863 kernel.packet[0] = _mm512_permutex_epi64(T2, 0x4E);
1864 kernel.packet[0] = _mm512_mask_blend_epi64(0xCC, T0, kernel.packet[0]);
1865 kernel.packet[2] = _mm512_permutex_epi64(T0, 0x4E);
1866 kernel.packet[2] = _mm512_mask_blend_epi64(0xCC, kernel.packet[2], T2);
1867 kernel.packet[1] = _mm512_permutex_epi64(T3, 0x4E);
1868 kernel.packet[1] = _mm512_mask_blend_epi64(0xCC, T1, kernel.packet[1]);
1869 kernel.packet[3] = _mm512_permutex_epi64(T1, 0x4E);
1870 kernel.packet[3] = _mm512_mask_blend_epi64(0xCC, kernel.packet[3], T3);
1871 kernel.packet[4] = _mm512_permutex_epi64(T6, 0x4E);
1872 kernel.packet[4] = _mm512_mask_blend_epi64(0xCC, T4, kernel.packet[4]);
1873 kernel.packet[6] = _mm512_permutex_epi64(T4, 0x4E);
1874 kernel.packet[6] = _mm512_mask_blend_epi64(0xCC, kernel.packet[6], T6);
1875 kernel.packet[5] = _mm512_permutex_epi64(T7, 0x4E);
1876 kernel.packet[5] = _mm512_mask_blend_epi64(0xCC, T5, kernel.packet[5]);
1877 kernel.packet[7] = _mm512_permutex_epi64(T5, 0x4E);
1878 kernel.packet[7] = _mm512_mask_blend_epi64(0xCC, kernel.packet[7], T7);
1880 T0 = _mm512_shuffle_i64x2(kernel.packet[4], kernel.packet[4], 0x4E);
1881 T0 = _mm512_mask_blend_epi64(0xF0, kernel.packet[0], T0);
1882 T4 = _mm512_shuffle_i64x2(kernel.packet[0], kernel.packet[0], 0x4E);
1883 T4 = _mm512_mask_blend_epi64(0xF0, T4, kernel.packet[4]);
1884 T1 = _mm512_shuffle_i64x2(kernel.packet[5], kernel.packet[5], 0x4E);
1885 T1 = _mm512_mask_blend_epi64(0xF0, kernel.packet[1], T1);
1886 T5 = _mm512_shuffle_i64x2(kernel.packet[1], kernel.packet[1], 0x4E);
1887 T5 = _mm512_mask_blend_epi64(0xF0, T5, kernel.packet[5]);
1888 T2 = _mm512_shuffle_i64x2(kernel.packet[6], kernel.packet[6], 0x4E);
1889 T2 = _mm512_mask_blend_epi64(0xF0, kernel.packet[2], T2);
1890 T6 = _mm512_shuffle_i64x2(kernel.packet[2], kernel.packet[2], 0x4E);
1891 T6 = _mm512_mask_blend_epi64(0xF0, T6, kernel.packet[6]);
1892 T3 = _mm512_shuffle_i64x2(kernel.packet[7], kernel.packet[7], 0x4E);
1893 T3 = _mm512_mask_blend_epi64(0xF0, kernel.packet[3], T3);
1894 T7 = _mm512_shuffle_i64x2(kernel.packet[3], kernel.packet[3], 0x4E);
1895 T7 = _mm512_mask_blend_epi64(0xF0, T7, kernel.packet[7]);
1897 kernel.packet[0] = T0;
1898 kernel.packet[1] = T1;
1899 kernel.packet[2] = T2;
1900 kernel.packet[3] = T3;
1901 kernel.packet[4] = T4;
1902 kernel.packet[5] = T5;
1903 kernel.packet[6] = T6;
1904 kernel.packet[7] = T7;
// Reassemble one 512-bit output row from two 256-bit halves produced by the
// 32-bit transpose helpers below.
#define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) \
  EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);

// Variant indexing the input at twice the output index (used by the 4-row
// transpose, where consecutive input halves belong to the same output row).
#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \
  EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);

// 32-bit integer shuffle expressed through the float shuffle unit (there is
// no direct two-operand epi32 shuffle).
#define SHUFFLE_EPI32(A, B, M) _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
1915EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
1916 __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
1917 __m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
1918 __m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
1919 __m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
1920 __m512i T4 = _mm512_unpacklo_epi32(kernel.packet[4], kernel.packet[5]);
1921 __m512i T5 = _mm512_unpackhi_epi32(kernel.packet[4], kernel.packet[5]);
1922 __m512i T6 = _mm512_unpacklo_epi32(kernel.packet[6], kernel.packet[7]);
1923 __m512i T7 = _mm512_unpackhi_epi32(kernel.packet[6], kernel.packet[7]);
1924 __m512i T8 = _mm512_unpacklo_epi32(kernel.packet[8], kernel.packet[9]);
1925 __m512i T9 = _mm512_unpackhi_epi32(kernel.packet[8], kernel.packet[9]);
1926 __m512i T10 = _mm512_unpacklo_epi32(kernel.packet[10], kernel.packet[11]);
1927 __m512i T11 = _mm512_unpackhi_epi32(kernel.packet[10], kernel.packet[11]);
1928 __m512i T12 = _mm512_unpacklo_epi32(kernel.packet[12], kernel.packet[13]);
1929 __m512i T13 = _mm512_unpackhi_epi32(kernel.packet[12], kernel.packet[13]);
1930 __m512i T14 = _mm512_unpacklo_epi32(kernel.packet[14], kernel.packet[15]);
1931 __m512i T15 = _mm512_unpackhi_epi32(kernel.packet[14], kernel.packet[15]);
1932 __m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
1933 __m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
1934 __m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
1935 __m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
1936 __m512i S4 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
1937 __m512i S5 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
1938 __m512i S6 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
1939 __m512i S7 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
1940 __m512i S8 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
1941 __m512i S9 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
1942 __m512i S10 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
1943 __m512i S11 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
1944 __m512i S12 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
1945 __m512i S13 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
1946 __m512i S14 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
1947 __m512i S15 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
1949 EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
1950 EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
1951 EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
1952 EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
1953 EIGEN_EXTRACT_8i_FROM_16i(S4, S4);
1954 EIGEN_EXTRACT_8i_FROM_16i(S5, S5);
1955 EIGEN_EXTRACT_8i_FROM_16i(S6, S6);
1956 EIGEN_EXTRACT_8i_FROM_16i(S7, S7);
1957 EIGEN_EXTRACT_8i_FROM_16i(S8, S8);
1958 EIGEN_EXTRACT_8i_FROM_16i(S9, S9);
1959 EIGEN_EXTRACT_8i_FROM_16i(S10, S10);
1960 EIGEN_EXTRACT_8i_FROM_16i(S11, S11);
1961 EIGEN_EXTRACT_8i_FROM_16i(S12, S12);
1962 EIGEN_EXTRACT_8i_FROM_16i(S13, S13);
1963 EIGEN_EXTRACT_8i_FROM_16i(S14, S14);
1964 EIGEN_EXTRACT_8i_FROM_16i(S15, S15);
1966 PacketBlock<Packet8i, 32> tmp;
1968 tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S4_0, 0x20);
1969 tmp.packet[1] = _mm256_permute2f128_si256(S1_0, S5_0, 0x20);
1970 tmp.packet[2] = _mm256_permute2f128_si256(S2_0, S6_0, 0x20);
1971 tmp.packet[3] = _mm256_permute2f128_si256(S3_0, S7_0, 0x20);
1972 tmp.packet[4] = _mm256_permute2f128_si256(S0_0, S4_0, 0x31);
1973 tmp.packet[5] = _mm256_permute2f128_si256(S1_0, S5_0, 0x31);
1974 tmp.packet[6] = _mm256_permute2f128_si256(S2_0, S6_0, 0x31);
1975 tmp.packet[7] = _mm256_permute2f128_si256(S3_0, S7_0, 0x31);
1977 tmp.packet[8] = _mm256_permute2f128_si256(S0_1, S4_1, 0x20);
1978 tmp.packet[9] = _mm256_permute2f128_si256(S1_1, S5_1, 0x20);
1979 tmp.packet[10] = _mm256_permute2f128_si256(S2_1, S6_1, 0x20);
1980 tmp.packet[11] = _mm256_permute2f128_si256(S3_1, S7_1, 0x20);
1981 tmp.packet[12] = _mm256_permute2f128_si256(S0_1, S4_1, 0x31);
1982 tmp.packet[13] = _mm256_permute2f128_si256(S1_1, S5_1, 0x31);
1983 tmp.packet[14] = _mm256_permute2f128_si256(S2_1, S6_1, 0x31);
1984 tmp.packet[15] = _mm256_permute2f128_si256(S3_1, S7_1, 0x31);
1987 tmp.packet[16] = _mm256_permute2f128_si256(S8_0, S12_0, 0x20);
1988 tmp.packet[17] = _mm256_permute2f128_si256(S9_0, S13_0, 0x20);
1989 tmp.packet[18] = _mm256_permute2f128_si256(S10_0, S14_0, 0x20);
1990 tmp.packet[19] = _mm256_permute2f128_si256(S11_0, S15_0, 0x20);
1991 tmp.packet[20] = _mm256_permute2f128_si256(S8_0, S12_0, 0x31);
1992 tmp.packet[21] = _mm256_permute2f128_si256(S9_0, S13_0, 0x31);
1993 tmp.packet[22] = _mm256_permute2f128_si256(S10_0, S14_0, 0x31);
1994 tmp.packet[23] = _mm256_permute2f128_si256(S11_0, S15_0, 0x31);
1996 tmp.packet[24] = _mm256_permute2f128_si256(S8_1, S12_1, 0x20);
1997 tmp.packet[25] = _mm256_permute2f128_si256(S9_1, S13_1, 0x20);
1998 tmp.packet[26] = _mm256_permute2f128_si256(S10_1, S14_1, 0x20);
1999 tmp.packet[27] = _mm256_permute2f128_si256(S11_1, S15_1, 0x20);
2000 tmp.packet[28] = _mm256_permute2f128_si256(S8_1, S12_1, 0x31);
2001 tmp.packet[29] = _mm256_permute2f128_si256(S9_1, S13_1, 0x31);
2002 tmp.packet[30] = _mm256_permute2f128_si256(S10_1, S14_1, 0x31);
2003 tmp.packet[31] = _mm256_permute2f128_si256(S11_1, S15_1, 0x31);
2006 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 0, 16);
2007 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 1, 16);
2008 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 2, 16);
2009 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 3, 16);
2011 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 4, 16);
2012 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 5, 16);
2013 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 6, 16);
2014 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 7, 16);
2016 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 8, 16);
2017 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 9, 16);
2018 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 10, 16);
2019 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 11, 16);
2021 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 12, 16);
2022 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 13, 16);
2023 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 14, 16);
2024 PACK_OUTPUT_I32(kernel.packet, tmp.packet, 15, 16);
2027EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
2028 __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
2029 __m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
2030 __m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
2031 __m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
2033 __m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
2034 __m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
2035 __m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
2036 __m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
2038 EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
2039 EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
2040 EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
2041 EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
2043 PacketBlock<Packet8i, 8> tmp;
2045 tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S1_0, 0x20);
2046 tmp.packet[1] = _mm256_permute2f128_si256(S2_0, S3_0, 0x20);
2047 tmp.packet[2] = _mm256_permute2f128_si256(S0_0, S1_0, 0x31);
2048 tmp.packet[3] = _mm256_permute2f128_si256(S2_0, S3_0, 0x31);
2050 tmp.packet[4] = _mm256_permute2f128_si256(S0_1, S1_1, 0x20);
2051 tmp.packet[5] = _mm256_permute2f128_si256(S2_1, S3_1, 0x20);
2052 tmp.packet[6] = _mm256_permute2f128_si256(S0_1, S1_1, 0x31);
2053 tmp.packet[7] = _mm256_permute2f128_si256(S2_1, S3_1, 0x31);
2055 PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 0, 1);
2056 PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 1, 1);
2057 PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 2, 1);
2058 PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1);
2062#ifndef EIGEN_VECTORIZE_AVX512FP16
2064EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(
const Eigen::half& from) {
2065 return _mm256_set1_epi16(from.x);
2069EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(
const Packet16h& from) {
2070 return half_impl::raw_uint16_to_half(
static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
2074EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(
const Eigen::half* from) {
2075 return _mm256_load_si256(
reinterpret_cast<const __m256i*
>(from));
2079EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(
const Eigen::half* from) {
2080 return _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(from));
2084EIGEN_STRONG_INLINE
void pstore<half>(Eigen::half* to,
const Packet16h& from) {
2087 EIGEN_DEBUG_ALIGNED_STORE
2088 _mm256_store_si256((__m256i*)(
void*)to, from);
2092EIGEN_STRONG_INLINE
void pstoreu<half>(Eigen::half* to,
const Packet16h& from) {
2095 EIGEN_DEBUG_UNALIGNED_STORE
2096 _mm256_storeu_si256((__m256i*)(
void*)to, from);
2100EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(
const Eigen::half* from) {
2101 unsigned short a = from[0].x;
2102 unsigned short b = from[1].x;
2103 unsigned short c = from[2].x;
2104 unsigned short d = from[3].x;
2105 unsigned short e = from[4].x;
2106 unsigned short f = from[5].x;
2107 unsigned short g = from[6].x;
2108 unsigned short h = from[7].x;
2109 return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
2113EIGEN_STRONG_INLINE Packet16h ploadquad(
const Eigen::half* from) {
2114 unsigned short a = from[0].x;
2115 unsigned short b = from[1].x;
2116 unsigned short c = from[2].x;
2117 unsigned short d = from[3].x;
2118 return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
2121EIGEN_STRONG_INLINE Packet16f half2float(
const Packet16h& a) {
return _mm512_cvtph_ps(a); }
2123EIGEN_STRONG_INLINE Packet16h float2half(
const Packet16f& a) {
2124 return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2128EIGEN_STRONG_INLINE Packet16h ptrue(
const Packet16h& a) {
2129 return Packet16h(ptrue(Packet8i(a)));
2133EIGEN_STRONG_INLINE Packet16h pabs(
const Packet16h& a) {
2134 const __m256i sign_mask = _mm256_set1_epi16(
static_cast<numext::uint16_t
>(0x8000));
2135 return _mm256_andnot_si256(sign_mask, a);
2139EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(
const Packet16h& a,
const Packet16h& b) {
2140 return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
2144EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(
const Packet16h& a,
const Packet16h& b) {
2145 return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
2149EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(
const half& a) {
2150 return float2half(plset<Packet16f>(
static_cast<float>(a)));
2154EIGEN_STRONG_INLINE Packet16h por(
const Packet16h& a,
const Packet16h& b) {
2157 return Packet16h(por(Packet8i(a), Packet8i(b)));
2160EIGEN_STRONG_INLINE Packet16h pxor(
const Packet16h& a,
const Packet16h& b) {
2161 return Packet16h(pxor(Packet8i(a), Packet8i(b)));
2164EIGEN_STRONG_INLINE Packet16h pand(
const Packet16h& a,
const Packet16h& b) {
2165 return Packet16h(pand(Packet8i(a), Packet8i(b)));
2168EIGEN_STRONG_INLINE Packet16h pandnot(
const Packet16h& a,
const Packet16h& b) {
2169 return Packet16h(pandnot(Packet8i(a), Packet8i(b)));
2173EIGEN_STRONG_INLINE Packet16h pselect(
const Packet16h& mask,
const Packet16h& a,
const Packet16h& b) {
2174 return _mm256_blendv_epi8(b, a, mask);
2178EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(
const Packet16h& a) {
2179 return float2half(pround<Packet16f>(half2float(a)));
2183EIGEN_STRONG_INLINE Packet16h print<Packet16h>(
const Packet16h& a) {
2184 return float2half(print<Packet16f>(half2float(a)));
2188EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(
const Packet16h& a) {
2189 return float2half(pceil<Packet16f>(half2float(a)));
2193EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(
const Packet16h& a) {
2194 return float2half(pfloor<Packet16f>(half2float(a)));
2198EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(
const Packet16h& a) {
2199 return float2half(ptrunc<Packet16f>(half2float(a)));
2203EIGEN_STRONG_INLINE Packet16h pcmp_eq(
const Packet16h& a,
const Packet16h& b) {
2204 Packet16f af = half2float(a);
2205 Packet16f bf = half2float(b);
2206 return Pack32To16(pcmp_eq(af, bf));
2210EIGEN_STRONG_INLINE Packet16h pcmp_le(
const Packet16h& a,
const Packet16h& b) {
2211 return Pack32To16(pcmp_le(half2float(a), half2float(b)));
2215EIGEN_STRONG_INLINE Packet16h pcmp_lt(
const Packet16h& a,
const Packet16h& b) {
2216 return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
2220EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(
const Packet16h& a,
const Packet16h& b) {
2221 return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
2225EIGEN_STRONG_INLINE Packet16h pconj(
const Packet16h& a) {
2230EIGEN_STRONG_INLINE Packet16h pnegate(
const Packet16h& a) {
2231 Packet16h sign_mask = _mm256_set1_epi16(
static_cast<unsigned short>(0x8000));
2232 return _mm256_xor_si256(a, sign_mask);
2236EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(
const Packet16h& a,
const Packet16h& b) {
2237 Packet16f af = half2float(a);
2238 Packet16f bf = half2float(b);
2239 Packet16f rf = padd(af, bf);
2240 return float2half(rf);
2244EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(
const Packet16h& a,
const Packet16h& b) {
2245 Packet16f af = half2float(a);
2246 Packet16f bf = half2float(b);
2247 Packet16f rf = psub(af, bf);
2248 return float2half(rf);
2252EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(
const Packet16h& a,
const Packet16h& b) {
2253 Packet16f af = half2float(a);
2254 Packet16f bf = half2float(b);
2255 Packet16f rf = pmul(af, bf);
2256 return float2half(rf);
2260EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(
const Packet16h& a,
const Packet16h& b) {
2261 Packet16f af = half2float(a);
2262 Packet16f bf = half2float(b);
2263 Packet16f rf = pdiv(af, bf);
2264 return float2half(rf);
2268EIGEN_STRONG_INLINE Packet16h pmadd<Packet16h>(
const Packet16h& a,
const Packet16h& b,
const Packet16h& c) {
2269 return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
2273EIGEN_STRONG_INLINE Packet16h pmsub<Packet16h>(
const Packet16h& a,
const Packet16h& b,
const Packet16h& c) {
2274 return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
2278EIGEN_STRONG_INLINE Packet16h pnmadd<Packet16h>(
const Packet16h& a,
const Packet16h& b,
const Packet16h& c) {
2279 return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
2283EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(
const Packet16h& a,
const Packet16h& b,
const Packet16h& c) {
2284 return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
2288EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(
const Packet16h& a) {
2289 Packet8h lane0 = _mm256_extractf128_si256(a, 0);
2290 Packet8h lane1 = _mm256_extractf128_si256(a, 1);
2291 return padd<Packet8h>(lane0, lane1);
2295EIGEN_STRONG_INLINE Packet16h preverse(
const Packet16h& a) {
2296 __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
2297 return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a, 1), m)),
2298 _mm_shuffle_epi8(_mm256_extractf128_si256(a, 0), m), 1);
2302EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(
const Eigen::half* from,
Index stride) {
2303 return _mm256_set_epi16(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
2304 from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
2305 from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
2306 from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
2310EIGEN_STRONG_INLINE
void pscatter<half, Packet16h>(half* to,
const Packet16h& from,
Index stride) {
2311 EIGEN_ALIGN64 half aux[16];
2313 to[stride * 0] = aux[0];
2314 to[stride * 1] = aux[1];
2315 to[stride * 2] = aux[2];
2316 to[stride * 3] = aux[3];
2317 to[stride * 4] = aux[4];
2318 to[stride * 5] = aux[5];
2319 to[stride * 6] = aux[6];
2320 to[stride * 7] = aux[7];
2321 to[stride * 8] = aux[8];
2322 to[stride * 9] = aux[9];
2323 to[stride * 10] = aux[10];
2324 to[stride * 11] = aux[11];
2325 to[stride * 12] = aux[12];
2326 to[stride * 13] = aux[13];
2327 to[stride * 14] = aux[14];
2328 to[stride * 15] = aux[15];
2331EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
2332 __m256i a = kernel.packet[0];
2333 __m256i b = kernel.packet[1];
2334 __m256i c = kernel.packet[2];
2335 __m256i d = kernel.packet[3];
2336 __m256i e = kernel.packet[4];
2337 __m256i f = kernel.packet[5];
2338 __m256i g = kernel.packet[6];
2339 __m256i h = kernel.packet[7];
2340 __m256i i = kernel.packet[8];
2341 __m256i j = kernel.packet[9];
2342 __m256i k = kernel.packet[10];
2343 __m256i l = kernel.packet[11];
2344 __m256i m = kernel.packet[12];
2345 __m256i n = kernel.packet[13];
2346 __m256i o = kernel.packet[14];
2347 __m256i p = kernel.packet[15];
2349 __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
2350 __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
2351 __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
2352 __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
2353 __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
2354 __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
2355 __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
2356 __m256i op_07 = _mm256_unpacklo_epi16(o, p);
2358 __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
2359 __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
2360 __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
2361 __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
2362 __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
2363 __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
2364 __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
2365 __m256i op_8f = _mm256_unpackhi_epi16(o, p);
2367 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
2368 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
2369 __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
2370 __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
2371 __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
2372 __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
2373 __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
2374 __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
2376 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
2377 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
2378 __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
2379 __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
2380 __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
2381 __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
2382 __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
2383 __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
2385 __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
2386 __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
2387 __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
2388 __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
2389 __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
2390 __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
2391 __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
2392 __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
2393 __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
2394 __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
2395 __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
2396 __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
2397 __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
2398 __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
2399 __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
2400 __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
2403 __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
2404 __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
2405 __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
2406 __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
2407 __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
2408 __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
2409 __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
2410 __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
2411 __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
2412 __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
2413 __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
2414 __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
2415 __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
2416 __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
2417 __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
2418 __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
2420 kernel.packet[0] = a_p_0;
2421 kernel.packet[1] = a_p_1;
2422 kernel.packet[2] = a_p_2;
2423 kernel.packet[3] = a_p_3;
2424 kernel.packet[4] = a_p_4;
2425 kernel.packet[5] = a_p_5;
2426 kernel.packet[6] = a_p_6;
2427 kernel.packet[7] = a_p_7;
2428 kernel.packet[8] = a_p_8;
2429 kernel.packet[9] = a_p_9;
2430 kernel.packet[10] = a_p_a;
2431 kernel.packet[11] = a_p_b;
2432 kernel.packet[12] = a_p_c;
2433 kernel.packet[13] = a_p_d;
2434 kernel.packet[14] = a_p_e;
2435 kernel.packet[15] = a_p_f;
2438EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
2439 EIGEN_ALIGN64 half in[8][16];
2440 pstore<half>(in[0], kernel.packet[0]);
2441 pstore<half>(in[1], kernel.packet[1]);
2442 pstore<half>(in[2], kernel.packet[2]);
2443 pstore<half>(in[3], kernel.packet[3]);
2444 pstore<half>(in[4], kernel.packet[4]);
2445 pstore<half>(in[5], kernel.packet[5]);
2446 pstore<half>(in[6], kernel.packet[6]);
2447 pstore<half>(in[7], kernel.packet[7]);
2449 EIGEN_ALIGN64 half out[8][16];
2451 for (
int i = 0; i < 8; ++i) {
2452 for (
int j = 0; j < 8; ++j) {
2453 out[i][j] = in[j][2 * i];
2455 for (
int j = 0; j < 8; ++j) {
2456 out[i][j + 8] = in[j][2 * i + 1];
2460 kernel.packet[0] = pload<Packet16h>(out[0]);
2461 kernel.packet[1] = pload<Packet16h>(out[1]);
2462 kernel.packet[2] = pload<Packet16h>(out[2]);
2463 kernel.packet[3] = pload<Packet16h>(out[3]);
2464 kernel.packet[4] = pload<Packet16h>(out[4]);
2465 kernel.packet[5] = pload<Packet16h>(out[5]);
2466 kernel.packet[6] = pload<Packet16h>(out[6]);
2467 kernel.packet[7] = pload<Packet16h>(out[7]);
2470EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
2471 EIGEN_ALIGN64 half in[4][16];
2472 pstore<half>(in[0], kernel.packet[0]);
2473 pstore<half>(in[1], kernel.packet[1]);
2474 pstore<half>(in[2], kernel.packet[2]);
2475 pstore<half>(in[3], kernel.packet[3]);
2477 EIGEN_ALIGN64 half out[4][16];
2479 for (
int i = 0; i < 4; ++i) {
2480 for (
int j = 0; j < 4; ++j) {
2481 out[i][j] = in[j][4 * i];
2483 for (
int j = 0; j < 4; ++j) {
2484 out[i][j + 4] = in[j][4 * i + 1];
2486 for (
int j = 0; j < 4; ++j) {
2487 out[i][j + 8] = in[j][4 * i + 2];
2489 for (
int j = 0; j < 4; ++j) {
2490 out[i][j + 12] = in[j][4 * i + 3];
2494 kernel.packet[0] = pload<Packet16h>(out[0]);
2495 kernel.packet[1] = pload<Packet16h>(out[1]);
2496 kernel.packet[2] = pload<Packet16h>(out[2]);
2497 kernel.packet[3] = pload<Packet16h>(out[3]);
2503struct is_arithmetic<Packet16bf> {
2504 enum { value =
true };
2508struct packet_traits<bfloat16> : default_packet_traits {
2509 typedef Packet16bf type;
2510 typedef Packet8bf half;
2513 AlignedOnScalar = 1,
2517 HasSin = EIGEN_FAST_MATH,
2518 HasCos = EIGEN_FAST_MATH,
2521#ifdef EIGEN_VECTORIZE_AVX512DQ
2529 HasTanh = EIGEN_FAST_MATH,
2530 HasErf = EIGEN_FAST_MATH,
2537struct unpacket_traits<Packet16bf> {
2538 typedef bfloat16 type;
2542 vectorizable =
true,
2543 masked_load_available =
false,
2544 masked_store_available =
false
2546 typedef Packet8bf half;
2550EIGEN_STRONG_INLINE Packet16bf pset1<Packet16bf>(
const bfloat16& from) {
2551 return _mm256_set1_epi16(from.value);
2555EIGEN_STRONG_INLINE bfloat16 pfirst<Packet16bf>(
const Packet16bf& from) {
2557 t.value =
static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
2562EIGEN_STRONG_INLINE Packet16bf pload<Packet16bf>(
const bfloat16* from) {
2563 return _mm256_load_si256(
reinterpret_cast<const __m256i*
>(from));
2567EIGEN_STRONG_INLINE Packet16bf ploadu<Packet16bf>(
const bfloat16* from) {
2568 return _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(from));
2572EIGEN_STRONG_INLINE
void pstore<bfloat16>(bfloat16* to,
const Packet16bf& from) {
2573 EIGEN_DEBUG_ALIGNED_STORE
2574 _mm256_store_si256(
reinterpret_cast<__m256i*
>(to), from);
2578EIGEN_STRONG_INLINE
void pstoreu<bfloat16>(bfloat16* to,
const Packet16bf& from) {
2579 EIGEN_DEBUG_UNALIGNED_STORE
2580 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(to), from);
2584EIGEN_STRONG_INLINE Packet16bf ploaddup<Packet16bf>(
const bfloat16* from) {
2585 unsigned short a = from[0].value;
2586 unsigned short b = from[1].value;
2587 unsigned short c = from[2].value;
2588 unsigned short d = from[3].value;
2589 unsigned short e = from[4].value;
2590 unsigned short f = from[5].value;
2591 unsigned short g = from[6].value;
2592 unsigned short h = from[7].value;
2593 return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
2597EIGEN_STRONG_INLINE Packet16bf ploadquad(
const bfloat16* from) {
2598 unsigned short a = from[0].value;
2599 unsigned short b = from[1].value;
2600 unsigned short c = from[2].value;
2601 unsigned short d = from[3].value;
2602 return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
2605EIGEN_STRONG_INLINE Packet16f Bf16ToF32(
const Packet16bf& a) {
2606 return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
2610EIGEN_STRONG_INLINE Packet16bf F32ToBf16(
const Packet16f& a) {
2613#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10, 1, 0)
2617 r = (__m256i)(_mm512_cvtneps_pbh(a));
2621 __m512i input = _mm512_castps_si512(a);
2622 __m512i nan = _mm512_set1_epi32(0x7fc0);
2625 t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1));
2627 t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
2629 t = _mm512_add_epi32(t, input);
2631 t = _mm512_srli_epi32(t, 16);
2634 __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
2636 t = _mm512_mask_blend_epi32(mask, nan, t);
2638 r = _mm512_cvtepi32_epi16(t);
2645EIGEN_STRONG_INLINE Packet16bf ptrue(
const Packet16bf& a) {
2646 return Packet16bf(ptrue<Packet8i>(Packet8i(a)));
2650EIGEN_STRONG_INLINE Packet16bf por(
const Packet16bf& a,
const Packet16bf& b) {
2651 return Packet16bf(por<Packet8i>(Packet8i(a), Packet8i(b)));
2655EIGEN_STRONG_INLINE Packet16bf pxor(
const Packet16bf& a,
const Packet16bf& b) {
2656 return Packet16bf(pxor<Packet8i>(Packet8i(a), Packet8i(b)));
2660EIGEN_STRONG_INLINE Packet16bf pand(
const Packet16bf& a,
const Packet16bf& b) {
2661 return Packet16bf(pand<Packet8i>(Packet8i(a), Packet8i(b)));
2665EIGEN_STRONG_INLINE Packet16bf pandnot(
const Packet16bf& a,
const Packet16bf& b) {
2666 return Packet16bf(pandnot<Packet8i>(Packet8i(a), Packet8i(b)));
2670EIGEN_STRONG_INLINE Packet16bf pselect(
const Packet16bf& mask,
const Packet16bf& a,
const Packet16bf& b) {
2673 return _mm256_blendv_epi8(b, a, mask);
2677EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(
const Packet16bf& a) {
2678 return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
2682EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(
const Packet16bf& a) {
2683 return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
2687EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(
const Packet16bf& a) {
2688 return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
2692EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(
const Packet16bf& a) {
2693 return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
2697EIGEN_STRONG_INLINE Packet16bf ptrunc<Packet16bf>(
const Packet16bf& a) {
2698 return F32ToBf16(ptrunc<Packet16f>(Bf16ToF32(a)));
2702EIGEN_STRONG_INLINE Packet16bf pcmp_eq(
const Packet16bf& a,
const Packet16bf& b) {
2703 return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
2707EIGEN_STRONG_INLINE Packet16bf pcmp_le(
const Packet16bf& a,
const Packet16bf& b) {
2708 return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
2712EIGEN_STRONG_INLINE Packet16bf pcmp_lt(
const Packet16bf& a,
const Packet16bf& b) {
2713 return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
2717EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(
const Packet16bf& a,
const Packet16bf& b) {
2718 return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
2722EIGEN_STRONG_INLINE Packet16bf pnegate(
const Packet16bf& a) {
2723 Packet16bf sign_mask = _mm256_set1_epi16(
static_cast<unsigned short>(0x8000));
2724 return _mm256_xor_si256(a, sign_mask);
2728EIGEN_STRONG_INLINE Packet16bf pconj(
const Packet16bf& a) {
2733EIGEN_STRONG_INLINE Packet16bf pabs(
const Packet16bf& a) {
2734 const __m256i sign_mask = _mm256_set1_epi16(
static_cast<numext::uint16_t
>(0x8000));
2735 return _mm256_andnot_si256(sign_mask, a);
2739EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b) {
2740 return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2744EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b) {
2745 return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2749EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b) {
2750 return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b)));
2754EIGEN_STRONG_INLINE Packet16bf pmadd<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b,
const Packet16bf& c) {
2755 return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2759EIGEN_STRONG_INLINE Packet16bf pmsub<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b,
const Packet16bf& c) {
2760 return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2764EIGEN_STRONG_INLINE Packet16bf pnmadd<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b,
const Packet16bf& c) {
2765 return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2769EIGEN_STRONG_INLINE Packet16bf pnmsub<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b,
const Packet16bf& c) {
2770 return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
2774EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b) {
2775 return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2779EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b) {
2780 return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2784EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(
const Packet16bf& a,
const Packet16bf& b) {
2785 return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
2789EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(
const bfloat16& a) {
2790 return F32ToBf16(plset<Packet16f>(
static_cast<float>(a)));
2794EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(
const Packet16bf& a) {
2795 Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
2796 Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
2797 return padd<Packet8bf>(lane0, lane1);
2801EIGEN_STRONG_INLINE Packet16bf preverse(
const Packet16bf& a) {
2802 __m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
2807 res = _mm256_permute2x128_si256(a, a, 1);
2809 return _mm256_shuffle_epi8(res, m);
2813EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(
const bfloat16* from,
Index stride) {
2814 return _mm256_set_epi16(
2815 from[15 * stride].value, from[14 * stride].value, from[13 * stride].value, from[12 * stride].value,
2816 from[11 * stride].value, from[10 * stride].value, from[9 * stride].value, from[8 * stride].value,
2817 from[7 * stride].value, from[6 * stride].value, from[5 * stride].value, from[4 * stride].value,
2818 from[3 * stride].value, from[2 * stride].value, from[1 * stride].value, from[0 * stride].value);
2822EIGEN_STRONG_INLINE
void pscatter<bfloat16, Packet16bf>(bfloat16* to,
const Packet16bf& from,
Index stride) {
2823 EIGEN_ALIGN64 bfloat16 aux[16];
2825 to[stride * 0] = aux[0];
2826 to[stride * 1] = aux[1];
2827 to[stride * 2] = aux[2];
2828 to[stride * 3] = aux[3];
2829 to[stride * 4] = aux[4];
2830 to[stride * 5] = aux[5];
2831 to[stride * 6] = aux[6];
2832 to[stride * 7] = aux[7];
2833 to[stride * 8] = aux[8];
2834 to[stride * 9] = aux[9];
2835 to[stride * 10] = aux[10];
2836 to[stride * 11] = aux[11];
2837 to[stride * 12] = aux[12];
2838 to[stride * 13] = aux[13];
2839 to[stride * 14] = aux[14];
2840 to[stride * 15] = aux[15];
2843EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16bf, 16>& kernel) {
2844 __m256i a = kernel.packet[0];
2845 __m256i b = kernel.packet[1];
2846 __m256i c = kernel.packet[2];
2847 __m256i d = kernel.packet[3];
2848 __m256i e = kernel.packet[4];
2849 __m256i f = kernel.packet[5];
2850 __m256i g = kernel.packet[6];
2851 __m256i h = kernel.packet[7];
2852 __m256i i = kernel.packet[8];
2853 __m256i j = kernel.packet[9];
2854 __m256i k = kernel.packet[10];
2855 __m256i l = kernel.packet[11];
2856 __m256i m = kernel.packet[12];
2857 __m256i n = kernel.packet[13];
2858 __m256i o = kernel.packet[14];
2859 __m256i p = kernel.packet[15];
2861 __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
2862 __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
2863 __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
2864 __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
2865 __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
2866 __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
2867 __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
2868 __m256i op_07 = _mm256_unpacklo_epi16(o, p);
2870 __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
2871 __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
2872 __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
2873 __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
2874 __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
2875 __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
2876 __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
2877 __m256i op_8f = _mm256_unpackhi_epi16(o, p);
2879 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
2880 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
2881 __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
2882 __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
2883 __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
2884 __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
2885 __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
2886 __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
2888 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
2889 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
2890 __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
2891 __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
2892 __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
2893 __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
2894 __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
2895 __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
2897 __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
2898 __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
2899 __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
2900 __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
2901 __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
2902 __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
2903 __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
2904 __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
2905 __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
2906 __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
2907 __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
2908 __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
2909 __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
2910 __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
2911 __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
2912 __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
2915 kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
2916 kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
2917 kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
2918 kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
2919 kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
2920 kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
2921 kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
2922 kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
2923 kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
2924 kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
2925 kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
2926 kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
2927 kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
2928 kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
2929 kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
2930 kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
2933EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16bf, 4>& kernel) {
2934 __m256i a = kernel.packet[0];
2935 __m256i b = kernel.packet[1];
2936 __m256i c = kernel.packet[2];
2937 __m256i d = kernel.packet[3];
2939 __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
2940 __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
2941 __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
2942 __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
2944 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
2945 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
2946 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
2947 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
2950 kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
2951 kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
2952 kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
2953 kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
2959EIGEN_STRONG_INLINE Packet32s pset1<Packet32s>(
const numext::int16_t& x) {
2960 return _mm512_set1_epi16(x);
2964EIGEN_STRONG_INLINE Packet16s pset1<Packet16s>(
const numext::int16_t& x) {
2965 return _mm256_set1_epi16(x);
2969EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(
const numext::int16_t& x) {
2970 return _mm_set1_epi16(x);
2974EIGEN_STRONG_INLINE
void pstore<numext::int16_t, Packet32s>(numext::int16_t* out,
const Packet32s& x) {
2975 EIGEN_DEBUG_ALIGNED_STORE
2976 _mm512_store_epi32(out, x);
2980EIGEN_STRONG_INLINE
void pstore<numext::int16_t, Packet16s>(numext::int16_t* out,
const Packet16s& x) {
2981 EIGEN_DEBUG_ALIGNED_STORE
2982#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
2983 _mm256_store_epi32(out, x);
2985 _mm256_store_si256(
reinterpret_cast<__m256i*
>(out), x);
2990EIGEN_STRONG_INLINE
void pstore<numext::int16_t, Packet8s>(numext::int16_t* out,
const Packet8s& x) {
2991 EIGEN_DEBUG_ALIGNED_STORE
2992#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
2993 _mm256_store_epi32(out, x);
2995 _mm_store_si128(
reinterpret_cast<__m128i*
>(out), x);
3000EIGEN_STRONG_INLINE
void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out,
const Packet32s& x) {
3001 EIGEN_DEBUG_UNALIGNED_STORE
3002 _mm512_storeu_epi32(out, x);
3006EIGEN_STRONG_INLINE
void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out,
const Packet16s& x) {
3007 EIGEN_DEBUG_UNALIGNED_STORE
3008 _mm256_storeu_epi32(out, x);
3012EIGEN_STRONG_INLINE
void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out,
const Packet8s& x) {
3013 EIGEN_DEBUG_UNALIGNED_STORE
3014 _mm_storeu_epi32(out, x);
3018EIGEN_STRONG_INLINE Packet32s padd(
const Packet32s& a,
const Packet32s& b) {
3019 return _mm512_add_epi16(a, b);
3023EIGEN_STRONG_INLINE Packet16s padd(
const Packet16s& a,
const Packet16s& b) {
3024 return _mm256_add_epi16(a, b);
3028EIGEN_STRONG_INLINE Packet8s padd(
const Packet8s& a,
const Packet8s& b) {
3029 return _mm_add_epi16(a, b);
3033EIGEN_STRONG_INLINE Packet32s psub(
const Packet32s& a,
const Packet32s& b) {
3034 return _mm512_sub_epi16(a, b);
3038EIGEN_STRONG_INLINE Packet16s psub(
const Packet16s& a,
const Packet16s& b) {
3039 return _mm256_sub_epi16(a, b);
3043EIGEN_STRONG_INLINE Packet8s psub(
const Packet8s& a,
const Packet8s& b) {
3044 return _mm_sub_epi16(a, b);
3048EIGEN_STRONG_INLINE Packet32s pmul(
const Packet32s& a,
const Packet32s& b) {
3049 return _mm512_mullo_epi16(a, b);
3053EIGEN_STRONG_INLINE Packet16s pmul(
const Packet16s& a,
const Packet16s& b) {
3054 return _mm256_mullo_epi16(a, b);
3058EIGEN_STRONG_INLINE Packet8s pmul(
const Packet8s& a,
const Packet8s& b) {
3059 return _mm_mullo_epi16(a, b);
3063EIGEN_STRONG_INLINE Packet32s pnegate(
const Packet32s& a) {
3064 return _mm512_sub_epi16(_mm512_setzero_si512(), a);
3068EIGEN_STRONG_INLINE Packet16s pnegate(
const Packet16s& a) {
3069 return _mm256_sub_epi16(_mm256_setzero_si256(), a);
3073EIGEN_STRONG_INLINE Packet8s pnegate(
const Packet8s& a) {
3074 return _mm_sub_epi16(_mm_setzero_si128(), a);
3078EIGEN_STRONG_INLINE Packet32s parithmetic_shift_right(Packet32s a) {
3079 return _mm512_srai_epi16(a, N);
3083EIGEN_STRONG_INLINE Packet16s parithmetic_shift_right(Packet16s a) {
3084 return _mm256_srai_epi16(a, N);
3088EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
3089 return _mm_srai_epi16(a, N);
3093EIGEN_STRONG_INLINE Packet32s plogical_shift_left(Packet32s a) {
3094 return _mm512_slli_epi16(a, N);
3098EIGEN_STRONG_INLINE Packet16s plogical_shift_left(Packet16s a) {
3099 return _mm256_slli_epi16(a, N);
3103EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
3104 return _mm_slli_epi16(a, N);
3108EIGEN_STRONG_INLINE Packet32s plogical_shift_right(Packet32s a) {
3109 return _mm512_srli_epi16(a, N);
3113EIGEN_STRONG_INLINE Packet16s plogical_shift_right(Packet16s a) {
3114 return _mm256_srli_epi16(a, N);
3118EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
3119 return _mm_srli_epi16(a, N);
// NOTE(review): the lines below are Doxygen cross-reference tooltips that the
// HTML extraction captured; they are not part of the header. Kept here,
// commented out, for reference:
//   @ Aligned64 — Definition: Constants.h:239
//   @ Aligned32 — Definition: Constants.h:238
//   @ Aligned16 — Definition: Constants.h:237
//   Namespace containing all symbols from the Eigen library.
//     Definition: B01_Experimental.dox:1
//   EIGEN_DEFAULT_DENSE_INDEX_TYPE Index — The Index type as used for the API.
//     Definition: Meta.h:82