10#ifndef EIGEN_PACKET_MATH_GPU_H
11#define EIGEN_PACKET_MATH_GPU_H
14#include "../../InternalHeaderCheck.h"
21#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
22#define EIGEN_GPU_HAS_LDG 1
26#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
27#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
30#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
31#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
37#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
38#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
40#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
46#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
49struct is_arithmetic<float4> {
50 enum { value =
true };
53struct is_arithmetic<double2> {
54 enum { value =
true };
58struct packet_traits<float> : default_packet_traits {
83 HasGammaSampleDerAlpha = 1,
89 HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
94struct packet_traits<double> : default_packet_traits {
117 HasGammaSampleDerAlpha = 1,
125struct unpacket_traits<float4> {
131 masked_load_available =
false,
132 masked_store_available =
false
137struct unpacket_traits<double2> {
143 masked_load_available =
false,
144 masked_store_available =
false
146 typedef double2 half;
150EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(
const float& from) {
151 return make_float4(from, from, from, from);
154EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(
const double& from) {
155 return make_double2(from, from);
158#if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
160EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float bitwise_and(
const float& a,
const float& b) {
161 return __int_as_float(__float_as_int(a) & __float_as_int(b));
163EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bitwise_and(
const double& a,
const double& b) {
164 return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
167EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float bitwise_or(
const float& a,
const float& b) {
168 return __int_as_float(__float_as_int(a) | __float_as_int(b));
170EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bitwise_or(
const double& a,
const double& b) {
171 return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
174EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float bitwise_xor(
const float& a,
const float& b) {
175 return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
177EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bitwise_xor(
const double& a,
const double& b) {
178 return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
181EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float bitwise_andnot(
const float& a,
const float& b) {
182 return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
184EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bitwise_andnot(
const double& a,
const double& b) {
185 return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
187EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float eq_mask(
const float& a,
const float& b) {
188 return __int_as_float(a == b ? 0xffffffffu : 0u);
190EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double eq_mask(
const double& a,
const double& b) {
191 return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
194EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float lt_mask(
const float& a,
const float& b) {
195 return __int_as_float(a < b ? 0xffffffffu : 0u);
198EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double lt_mask(
const double& a,
const double& b) {
199 return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
202EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float le_mask(
const float& a,
const float& b) {
203 return __int_as_float(a <= b ? 0xffffffffu : 0u);
206EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double le_mask(
const double& a,
const double& b) {
207 return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
211EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(
const float4& a,
const float4& b) {
212 return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
215EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(
const double2& a,
const double2& b) {
216 return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
220EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(
const float4& a,
const float4& b) {
221 return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
224EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(
const double2& a,
const double2& b) {
225 return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
229EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(
const float4& a,
const float4& b) {
230 return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
233EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(
const double2& a,
const double2& b) {
234 return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
238EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(
const float4& a,
const float4& b) {
239 return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
240 bitwise_andnot(a.w, b.w));
243EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(
const double2& a,
const double2& b) {
244 return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
248EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(
const float4& a,
const float4& b) {
249 return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
252EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(
const float4& a,
const float4& b) {
253 return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
256EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(
const float4& a,
const float4& b) {
257 return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
260EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(
const double2& a,
const double2& b) {
261 return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
264EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(
const double2& a,
const double2& b) {
265 return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
268EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(
const double2& a,
const double2& b) {
269 return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
274EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(
const float& a) {
275 return make_float4(a, a + 1, a + 2, a + 3);
278EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(
const double& a) {
279 return make_double2(a, a + 1);
283EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(
const float4& a,
const float4& b) {
284 return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
287EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(
const double2& a,
const double2& b) {
288 return make_double2(a.x + b.x, a.y + b.y);
292EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(
const float4& a,
const float4& b) {
293 return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
296EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(
const double2& a,
const double2& b) {
297 return make_double2(a.x - b.x, a.y - b.y);
301EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(
const float4& a) {
302 return make_float4(-a.x, -a.y, -a.z, -a.w);
305EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(
const double2& a) {
306 return make_double2(-a.x, -a.y);
310EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(
const float4& a) {
314EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(
const double2& a) {
319EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(
const float4& a,
const float4& b) {
320 return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
323EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(
const double2& a,
const double2& b) {
324 return make_double2(a.x * b.x, a.y * b.y);
328EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(
const float4& a,
const float4& b) {
329 return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
332EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(
const double2& a,
const double2& b) {
333 return make_double2(a.x / b.x, a.y / b.y);
337EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(
const float4& a,
const float4& b) {
338 return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
341EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(
const double2& a,
const double2& b) {
342 return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
346EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(
const float4& a,
const float4& b) {
347 return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
350EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(
const double2& a,
const double2& b) {
351 return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
355EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(
const float* from) {
356 return *
reinterpret_cast<const float4*
>(from);
360EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(
const double* from) {
361 return *
reinterpret_cast<const double2*
>(from);
365EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(
const float* from) {
366 return make_float4(from[0], from[1], from[2], from[3]);
369EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(
const double* from) {
370 return make_double2(from[0], from[1]);
374EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(
const float* from) {
375 return make_float4(from[0], from[0], from[1], from[1]);
378EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(
const double* from) {
379 return make_double2(from[0], from[0]);
383EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const float4& from) {
384 *
reinterpret_cast<float4*
>(to) = from;
388EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const double2& from) {
389 *
reinterpret_cast<double2*
>(to) = from;
393EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const float4& from) {
401EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const double2& from) {
407EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(
const float* from) {
408#if defined(EIGEN_GPU_HAS_LDG)
409 return __ldg(
reinterpret_cast<const float4*
>(from));
411 return make_float4(from[0], from[1], from[2], from[3]);
415EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(
const double* from) {
416#if defined(EIGEN_GPU_HAS_LDG)
417 return __ldg(
reinterpret_cast<const double2*
>(from));
419 return make_double2(from[0], from[1]);
424EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(
const float* from) {
425#if defined(EIGEN_GPU_HAS_LDG)
426 return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
428 return make_float4(from[0], from[1], from[2], from[3]);
432EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(
const double* from) {
433#if defined(EIGEN_GPU_HAS_LDG)
434 return make_double2(__ldg(from + 0), __ldg(from + 1));
436 return make_double2(from[0], from[1]);
441EIGEN_DEVICE_FUNC
inline float4 pgather<float, float4>(
const float* from,
Index stride) {
442 return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
446EIGEN_DEVICE_FUNC
inline double2 pgather<double, double2>(
const double* from,
Index stride) {
447 return make_double2(from[0 * stride], from[1 * stride]);
451EIGEN_DEVICE_FUNC
inline void pscatter<float, float4>(
float* to,
const float4& from,
Index stride) {
452 to[stride * 0] = from.x;
453 to[stride * 1] = from.y;
454 to[stride * 2] = from.z;
455 to[stride * 3] = from.w;
458EIGEN_DEVICE_FUNC
inline void pscatter<double, double2>(
double* to,
const double2& from,
Index stride) {
459 to[stride * 0] = from.x;
460 to[stride * 1] = from.y;
464EIGEN_DEVICE_FUNC
inline float pfirst<float4>(
const float4& a) {
468EIGEN_DEVICE_FUNC
inline double pfirst<double2>(
const double2& a) {
473EIGEN_DEVICE_FUNC
inline float predux<float4>(
const float4& a) {
474 return a.x + a.y + a.z + a.w;
477EIGEN_DEVICE_FUNC
inline double predux<double2>(
const double2& a) {
482EIGEN_DEVICE_FUNC
inline float predux_max<float4>(
const float4& a) {
483 return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
486EIGEN_DEVICE_FUNC
inline double predux_max<double2>(
const double2& a) {
487 return fmax(a.x, a.y);
491EIGEN_DEVICE_FUNC
inline float predux_min<float4>(
const float4& a) {
492 return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
495EIGEN_DEVICE_FUNC
inline double predux_min<double2>(
const double2& a) {
496 return fmin(a.x, a.y);
500EIGEN_DEVICE_FUNC
inline float predux_mul<float4>(
const float4& a) {
501 return a.x * a.y * a.z * a.w;
504EIGEN_DEVICE_FUNC
inline double predux_mul<double2>(
const double2& a) {
509EIGEN_DEVICE_FUNC
inline float4 pabs<float4>(
const float4& a) {
510 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
513EIGEN_DEVICE_FUNC
inline double2 pabs<double2>(
const double2& a) {
514 return make_double2(fabs(a.x), fabs(a.y));
518EIGEN_DEVICE_FUNC
inline float4 pfloor<float4>(
const float4& a) {
519 return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
522EIGEN_DEVICE_FUNC
inline double2 pfloor<double2>(
const double2& a) {
527EIGEN_DEVICE_FUNC
inline float4 pceil<float4>(
const float4& a) {
528 return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
531EIGEN_DEVICE_FUNC
inline double2 pceil<double2>(
const double2& a) {
532 return make_double2(
ceil(a.x),
ceil(a.y));
536EIGEN_DEVICE_FUNC
inline float4 print<float4>(
const float4& a) {
537 return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
540EIGEN_DEVICE_FUNC
inline double2 print<double2>(
const double2& a) {
541 return make_double2(
rint(a.x),
rint(a.y));
545EIGEN_DEVICE_FUNC
inline float4 ptrunc<float4>(
const float4& a) {
546 return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
549EIGEN_DEVICE_FUNC
inline double2 ptrunc<double2>(
const double2& a) {
553EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<float4, 4>& kernel) {
554 float tmp = kernel.packet[0].y;
555 kernel.packet[0].y = kernel.packet[1].x;
556 kernel.packet[1].x = tmp;
558 tmp = kernel.packet[0].z;
559 kernel.packet[0].z = kernel.packet[2].x;
560 kernel.packet[2].x = tmp;
562 tmp = kernel.packet[0].w;
563 kernel.packet[0].w = kernel.packet[3].x;
564 kernel.packet[3].x = tmp;
566 tmp = kernel.packet[1].z;
567 kernel.packet[1].z = kernel.packet[2].y;
568 kernel.packet[2].y = tmp;
570 tmp = kernel.packet[1].w;
571 kernel.packet[1].w = kernel.packet[3].y;
572 kernel.packet[3].y = tmp;
574 tmp = kernel.packet[2].w;
575 kernel.packet[2].w = kernel.packet[3].z;
576 kernel.packet[3].z = tmp;
579EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<double2, 2>& kernel) {
580 double tmp = kernel.packet[0].y;
581 kernel.packet[0].y = kernel.packet[1].x;
582 kernel.packet[1].x = tmp;
590#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
592typedef ulonglong2 Packet4h2;
594struct unpacket_traits<Packet4h2> {
595 typedef Eigen::half type;
600 masked_load_available =
false,
601 masked_store_available =
false
603 typedef Packet4h2 half;
606struct is_arithmetic<Packet4h2> {
607 enum { value =
true };
611struct unpacket_traits<half2> {
612 typedef Eigen::half type;
617 masked_load_available =
false,
618 masked_store_available =
false
623struct is_arithmetic<half2> {
624 enum { value =
true };
628struct packet_traits<Eigen::half> : default_packet_traits {
629 typedef Packet4h2 type;
630 typedef Packet4h2 half;
649EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(
const Eigen::half& from) {
650 return __half2half2(from);
654EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(
const Eigen::half& from) {
656 half2* p_alias =
reinterpret_cast<half2*
>(&r);
657 p_alias[0] = pset1<half2>(from);
658 p_alias[1] = pset1<half2>(from);
659 p_alias[2] = pset1<half2>(from);
660 p_alias[3] = pset1<half2>(from);
666EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(
const Eigen::half* from) {
667 return *
reinterpret_cast<const half2*
>(from);
670EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(
const Eigen::half* from) {
return __halves2half2(from[0], from[1]); }
672EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(
const Eigen::half* from) {
673 return __halves2half2(from[0], from[0]);
676EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore(Eigen::half* to,
const half2& from) {
677 *
reinterpret_cast<half2*
>(to) = from;
680EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu(Eigen::half* to,
const half2& from) {
681 to[0] = __low2half(from);
682 to[1] = __high2half(from);
685EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
const Eigen::half* from) {
686#if defined(EIGEN_GPU_HAS_LDG)
688 return __ldg(
reinterpret_cast<const half2*
>(from));
690 return __halves2half2(*(from + 0), *(from + 1));
694EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
const Eigen::half* from) {
695#if defined(EIGEN_GPU_HAS_LDG)
696 return __halves2half2(__ldg(from + 0), __ldg(from + 1));
698 return __halves2half2(*(from + 0), *(from + 1));
702EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(
const Eigen::half* from,
Index stride) {
703 return __halves2half2(from[0 * stride], from[1 * stride]);
706EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter(Eigen::half* to,
const half2& from,
Index stride) {
707 to[stride * 0] = __low2half(from);
708 to[stride * 1] = __high2half(from);
711EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(
const half2& a) {
return __low2half(a); }
713EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(
const half2& a) {
714 half a1 = __low2half(a);
715 half a2 = __high2half(a);
716 half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
717 half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
718 return __halves2half2(result1, result2);
721EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(
const half2& ) {
722 half true_half = half_impl::raw_uint16_to_half(0xffffu);
723 return pset1<half2>(true_half);
726EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(
const half2& ) {
727 half false_half = half_impl::raw_uint16_to_half(0x0000u);
728 return pset1<half2>(false_half);
731EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<half2, 2>& kernel) {
732 __half a1 = __low2half(kernel.packet[0]);
733 __half a2 = __high2half(kernel.packet[0]);
734 __half b1 = __low2half(kernel.packet[1]);
735 __half b2 = __high2half(kernel.packet[1]);
736 kernel.packet[0] = __halves2half2(a1, b1);
737 kernel.packet[1] = __halves2half2(a2, b2);
740EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(
const Eigen::half& a) {
741#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
742 return __halves2half2(a, __hadd(a, __float2half(1.0f)));
744 float f = __half2float(a) + 1.0f;
745 return __halves2half2(a, __float2half(f));
749EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(
const half2& mask,
const half2& a,
const half2& b) {
750 half mask_low = __low2half(mask);
751 half mask_high = __high2half(mask);
752 half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
753 half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
754 return __halves2half2(result_low, result_high);
757EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(
const half2& a,
const half2& b) {
758 half true_half = half_impl::raw_uint16_to_half(0xffffu);
759 half false_half = half_impl::raw_uint16_to_half(0x0000u);
760 half a1 = __low2half(a);
761 half a2 = __high2half(a);
762 half b1 = __low2half(b);
763 half b2 = __high2half(b);
764 half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
765 half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
766 return __halves2half2(eq1, eq2);
769EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(
const half2& a,
const half2& b) {
770 half true_half = half_impl::raw_uint16_to_half(0xffffu);
771 half false_half = half_impl::raw_uint16_to_half(0x0000u);
772 half a1 = __low2half(a);
773 half a2 = __high2half(a);
774 half b1 = __low2half(b);
775 half b2 = __high2half(b);
776 half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
777 half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
778 return __halves2half2(eq1, eq2);
781EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(
const half2& a,
const half2& b) {
782 half true_half = half_impl::raw_uint16_to_half(0xffffu);
783 half false_half = half_impl::raw_uint16_to_half(0x0000u);
784 half a1 = __low2half(a);
785 half a2 = __high2half(a);
786 half b1 = __low2half(b);
787 half b2 = __high2half(b);
788 half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
789 half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
790 return __halves2half2(eq1, eq2);
793EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(
const half2& a,
const half2& b) {
794 half a1 = __low2half(a);
795 half a2 = __high2half(a);
796 half b1 = __low2half(b);
797 half b2 = __high2half(b);
798 half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
799 half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
800 return __halves2half2(result1, result2);
803EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(
const half2& a,
const half2& b) {
804 half a1 = __low2half(a);
805 half a2 = __high2half(a);
806 half b1 = __low2half(b);
807 half b2 = __high2half(b);
808 half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
809 half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
810 return __halves2half2(result1, result2);
813EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(
const half2& a,
const half2& b) {
814 half a1 = __low2half(a);
815 half a2 = __high2half(a);
816 half b1 = __low2half(b);
817 half b2 = __high2half(b);
818 half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
819 half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
820 return __halves2half2(result1, result2);
823EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(
const half2& a,
const half2& b) {
824 half a1 = __low2half(a);
825 half a2 = __high2half(a);
826 half b1 = __low2half(b);
827 half b2 = __high2half(b);
828 half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
829 half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
830 return __halves2half2(result1, result2);
833EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(
const half2& a,
const half2& b) {
834#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
835 return __hadd2(a, b);
837 float a1 = __low2float(a);
838 float a2 = __high2float(a);
839 float b1 = __low2float(b);
840 float b2 = __high2float(b);
843 return __floats2half2_rn(r1, r2);
847EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(
const half2& a,
const half2& b) {
848#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
849 return __hsub2(a, b);
851 float a1 = __low2float(a);
852 float a2 = __high2float(a);
853 float b1 = __low2float(b);
854 float b2 = __high2float(b);
857 return __floats2half2_rn(r1, r2);
861EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(
const half2& a) {
862#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
865 float a1 = __low2float(a);
866 float a2 = __high2float(a);
867 return __floats2half2_rn(-a1, -a2);
871EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(
const half2& a) {
return a; }
873EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(
const half2& a,
const half2& b) {
874#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
875 return __hmul2(a, b);
877 float a1 = __low2float(a);
878 float a2 = __high2float(a);
879 float b1 = __low2float(b);
880 float b2 = __high2float(b);
883 return __floats2half2_rn(r1, r2);
887EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(
const half2& a,
const half2& b,
const half2& c) {
888#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
889 return __hfma2(a, b, c);
891 float a1 = __low2float(a);
892 float a2 = __high2float(a);
893 float b1 = __low2float(b);
894 float b2 = __high2float(b);
895 float c1 = __low2float(c);
896 float c2 = __high2float(c);
897 float r1 = a1 * b1 + c1;
898 float r2 = a2 * b2 + c2;
899 return __floats2half2_rn(r1, r2);
903EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(
const half2& a,
const half2& b) {
904#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
905 return __h2div(a, b);
907 float a1 = __low2float(a);
908 float a2 = __high2float(a);
909 float b1 = __low2float(b);
910 float b2 = __high2float(b);
913 return __floats2half2_rn(r1, r2);
917EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(
const half2& a,
const half2& b) {
918 float a1 = __low2float(a);
919 float a2 = __high2float(a);
920 float b1 = __low2float(b);
921 float b2 = __high2float(b);
922 __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
923 __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
924 return __halves2half2(r1, r2);
927EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(
const half2& a,
const half2& b) {
928 float a1 = __low2float(a);
929 float a2 = __high2float(a);
930 float b1 = __low2float(b);
931 float b2 = __high2float(b);
932 __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
933 __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
934 return __halves2half2(r1, r2);
937EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(
const half2& a) {
938#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
939 return __hadd(__low2half(a), __high2half(a));
941 float a1 = __low2float(a);
942 float a2 = __high2float(a);
943 return Eigen::half(__float2half(a1 + a2));
947EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(
const half2& a) {
948#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
949 __half first = __low2half(a);
950 __half second = __high2half(a);
951 return __hgt(first, second) ? first : second;
953 float a1 = __low2float(a);
954 float a2 = __high2float(a);
955 return a1 > a2 ? __low2half(a) : __high2half(a);
959EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(
const half2& a) {
960#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
961 __half first = __low2half(a);
962 __half second = __high2half(a);
963 return __hlt(first, second) ? first : second;
965 float a1 = __low2float(a);
966 float a2 = __high2float(a);
967 return a1 < a2 ? __low2half(a) : __high2half(a);
971EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(
const half2& a) {
972#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
973 return __hmul(__low2half(a), __high2half(a));
975 float a1 = __low2float(a);
976 float a2 = __high2float(a);
977 return Eigen::half(__float2half(a1 * a2));
981EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(
const half2& a) {
982 float a1 = __low2float(a);
983 float a2 = __high2float(a);
984 float r1 = log1pf(a1);
985 float r2 = log1pf(a2);
986 return __floats2half2_rn(r1, r2);
989EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(
const half2& a) {
990 float a1 = __low2float(a);
991 float a2 = __high2float(a);
992 float r1 = expm1f(a1);
993 float r2 = expm1f(a2);
994 return __floats2half2_rn(r1, r2);
997#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
999EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(
const half2& a) {
return h2log(a); }
1001EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(
const half2& a) {
return h2exp(a); }
1003EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(
const half2& a) {
return h2sqrt(a); }
1005EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(
const half2& a) {
return h2rsqrt(a); }
1009EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(
const half2& a) {
1010 float a1 = __low2float(a);
1011 float a2 = __high2float(a);
1012 float r1 = logf(a1);
1013 float r2 = logf(a2);
1014 return __floats2half2_rn(r1, r2);
1017EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(
const half2& a) {
1018 float a1 = __low2float(a);
1019 float a2 = __high2float(a);
1020 float r1 = expf(a1);
1021 float r2 = expf(a2);
1022 return __floats2half2_rn(r1, r2);
1025EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(
const half2& a) {
1026 float a1 = __low2float(a);
1027 float a2 = __high2float(a);
1028 float r1 = sqrtf(a1);
1029 float r2 = sqrtf(a2);
1030 return __floats2half2_rn(r1, r2);
1033EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(
const half2& a) {
1034 float a1 = __low2float(a);
1035 float a2 = __high2float(a);
1036 float r1 = rsqrtf(a1);
1037 float r2 = rsqrtf(a2);
1038 return __floats2half2_rn(r1, r2);
1044EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(
const Eigen::half* from) {
1045 return *
reinterpret_cast<const Packet4h2*
>(from);
1050EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(
const Eigen::half* from) {
1052 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1053 p_alias[0] = ploadu(from + 0);
1054 p_alias[1] = ploadu(from + 2);
1055 p_alias[2] = ploadu(from + 4);
1056 p_alias[3] = ploadu(from + 6);
1061EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(
const Eigen::half* from) {
1063 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1064 p_alias[0] = ploaddup(from + 0);
1065 p_alias[1] = ploaddup(from + 1);
1066 p_alias[2] = ploaddup(from + 2);
1067 p_alias[3] = ploaddup(from + 3);
1072EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<Eigen::half>(Eigen::half* to,
const Packet4h2& from) {
1073 *
reinterpret_cast<Packet4h2*
>(to) = from;
1077EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(Eigen::half* to,
const Packet4h2& from) {
1078 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1079 pstoreu(to + 0, from_alias[0]);
1080 pstoreu(to + 2, from_alias[1]);
1081 pstoreu(to + 4, from_alias[2]);
1082 pstoreu(to + 6, from_alias[3]);
1086EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(
const Eigen::half* from) {
1087#if defined(EIGEN_GPU_HAS_LDG)
1089 r = __ldg(
reinterpret_cast<const Packet4h2*
>(from));
1093 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1094 r_alias[0] = ploadt_ro_aligned(from + 0);
1095 r_alias[1] = ploadt_ro_aligned(from + 2);
1096 r_alias[2] = ploadt_ro_aligned(from + 4);
1097 r_alias[3] = ploadt_ro_aligned(from + 6);
1103EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(
const Eigen::half* from) {
1105 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1106 r_alias[0] = ploadt_ro_unaligned(from + 0);
1107 r_alias[1] = ploadt_ro_unaligned(from + 2);
1108 r_alias[2] = ploadt_ro_unaligned(from + 4);
1109 r_alias[3] = ploadt_ro_unaligned(from + 6);
1114EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(
const Eigen::half* from,
Index stride) {
1116 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1117 p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
1118 p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
1119 p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
1120 p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
1125EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<Eigen::half, Packet4h2>(Eigen::half* to,
const Packet4h2& from,
1127 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1128 pscatter(to + stride * 0, from_alias[0], stride);
1129 pscatter(to + stride * 2, from_alias[1], stride);
1130 pscatter(to + stride * 4, from_alias[2], stride);
1131 pscatter(to + stride * 6, from_alias[3], stride);
1135EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
const Packet4h2& a) {
1136 return pfirst(*(
reinterpret_cast<const half2*
>(&a)));
1140EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
const Packet4h2& a) {
1142 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1143 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1144 p_alias[0] = pabs(a_alias[0]);
1145 p_alias[1] = pabs(a_alias[1]);
1146 p_alias[2] = pabs(a_alias[2]);
1147 p_alias[3] = pabs(a_alias[3]);
1152EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
const Packet4h2& ) {
1153 half true_half = half_impl::raw_uint16_to_half(0xffffu);
1154 return pset1<Packet4h2>(true_half);
1158EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(
const Packet4h2& ) {
1159 half false_half = half_impl::raw_uint16_to_half(0x0000u);
1160 return pset1<Packet4h2>(false_half);
1163EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose_double(
double* d_row0,
double* d_row1,
double* d_row2,
1164 double* d_row3,
double* d_row4,
double* d_row5,
1165 double* d_row6,
double* d_row7) {
1168 d_row0[1] = d_row4[0];
1172 d_row1[1] = d_row5[0];
1176 d_row2[1] = d_row6[0];
1180 d_row3[1] = d_row7[0];
1184EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
1188 f_row0[1] = f_row2[0];
1192 f_row1[1] = f_row3[0];
1196EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose_half(half2& f0, half2& f1) {
1197 __half a1 = __low2half(f0);
1198 __half a2 = __high2half(f0);
1199 __half b1 = __low2half(f1);
1200 __half b2 = __high2half(f1);
1201 f0 = __halves2half2(a1, b1);
1202 f1 = __halves2half2(a2, b2);
1205EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
1206 double* d_row0 =
reinterpret_cast<double*
>(&kernel.packet[0]);
1207 double* d_row1 =
reinterpret_cast<double*
>(&kernel.packet[1]);
1208 double* d_row2 =
reinterpret_cast<double*
>(&kernel.packet[2]);
1209 double* d_row3 =
reinterpret_cast<double*
>(&kernel.packet[3]);
1210 double* d_row4 =
reinterpret_cast<double*
>(&kernel.packet[4]);
1211 double* d_row5 =
reinterpret_cast<double*
>(&kernel.packet[5]);
1212 double* d_row6 =
reinterpret_cast<double*
>(&kernel.packet[6]);
1213 double* d_row7 =
reinterpret_cast<double*
>(&kernel.packet[7]);
1214 ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
1216 half2* f_row0 =
reinterpret_cast<half2*
>(d_row0);
1217 half2* f_row1 =
reinterpret_cast<half2*
>(d_row1);
1218 half2* f_row2 =
reinterpret_cast<half2*
>(d_row2);
1219 half2* f_row3 =
reinterpret_cast<half2*
>(d_row3);
1220 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1221 ptranspose_half(f_row0[0], f_row1[0]);
1222 ptranspose_half(f_row0[1], f_row1[1]);
1223 ptranspose_half(f_row2[0], f_row3[0]);
1224 ptranspose_half(f_row2[1], f_row3[1]);
1226 f_row0 =
reinterpret_cast<half2*
>(d_row0 + 1);
1227 f_row1 =
reinterpret_cast<half2*
>(d_row1 + 1);
1228 f_row2 =
reinterpret_cast<half2*
>(d_row2 + 1);
1229 f_row3 =
reinterpret_cast<half2*
>(d_row3 + 1);
1230 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1231 ptranspose_half(f_row0[0], f_row1[0]);
1232 ptranspose_half(f_row0[1], f_row1[1]);
1233 ptranspose_half(f_row2[0], f_row3[0]);
1234 ptranspose_half(f_row2[1], f_row3[1]);
1236 f_row0 =
reinterpret_cast<half2*
>(d_row4);
1237 f_row1 =
reinterpret_cast<half2*
>(d_row5);
1238 f_row2 =
reinterpret_cast<half2*
>(d_row6);
1239 f_row3 =
reinterpret_cast<half2*
>(d_row7);
1240 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1241 ptranspose_half(f_row0[0], f_row1[0]);
1242 ptranspose_half(f_row0[1], f_row1[1]);
1243 ptranspose_half(f_row2[0], f_row3[0]);
1244 ptranspose_half(f_row2[1], f_row3[1]);
1246 f_row0 =
reinterpret_cast<half2*
>(d_row4 + 1);
1247 f_row1 =
reinterpret_cast<half2*
>(d_row5 + 1);
1248 f_row2 =
reinterpret_cast<half2*
>(d_row6 + 1);
1249 f_row3 =
reinterpret_cast<half2*
>(d_row7 + 1);
1250 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1251 ptranspose_half(f_row0[0], f_row1[0]);
1252 ptranspose_half(f_row0[1], f_row1[1]);
1253 ptranspose_half(f_row2[0], f_row3[0]);
1254 ptranspose_half(f_row2[1], f_row3[1]);
1258EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(
const Eigen::half& a) {
1259#if defined(EIGEN_HIP_DEVICE_COMPILE)
1262 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1263 p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
1264 p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
1265 p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
1266 p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
1268#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1270 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1272 half2 b = pset1<half2>(a);
1274 half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
1275 half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
1277 c = __hadd2(b, half_offset0);
1278 r_alias[0] = plset(__low2half(c));
1279 r_alias[1] = plset(__high2half(c));
1281 c = __hadd2(b, half_offset1);
1282 r_alias[2] = plset(__low2half(c));
1283 r_alias[3] = plset(__high2half(c));
1288 float f = __half2float(a);
1290 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1291 p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
1292 p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
1293 p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
1294 p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
1300EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(
const Packet4h2& mask,
const Packet4h2& a,
1301 const Packet4h2& b) {
1303 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1304 const half2* mask_alias =
reinterpret_cast<const half2*
>(&mask);
1305 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1306 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1307 r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
1308 r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
1309 r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
1310 r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
1315EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1317 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1318 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1319 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1320 r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
1321 r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
1322 r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
1323 r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
1328EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1330 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1331 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1332 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1333 r_alias[0] = pcmp_lt(a_alias[0], b_alias[0]);
1334 r_alias[1] = pcmp_lt(a_alias[1], b_alias[1]);
1335 r_alias[2] = pcmp_lt(a_alias[2], b_alias[2]);
1336 r_alias[3] = pcmp_lt(a_alias[3], b_alias[3]);
1341EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1343 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1344 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1345 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1346 r_alias[0] = pcmp_le(a_alias[0], b_alias[0]);
1347 r_alias[1] = pcmp_le(a_alias[1], b_alias[1]);
1348 r_alias[2] = pcmp_le(a_alias[2], b_alias[2]);
1349 r_alias[3] = pcmp_le(a_alias[3], b_alias[3]);
1354EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1356 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1357 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1358 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1359 r_alias[0] = pand(a_alias[0], b_alias[0]);
1360 r_alias[1] = pand(a_alias[1], b_alias[1]);
1361 r_alias[2] = pand(a_alias[2], b_alias[2]);
1362 r_alias[3] = pand(a_alias[3], b_alias[3]);
1367EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1369 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1370 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1371 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1372 r_alias[0] = por(a_alias[0], b_alias[0]);
1373 r_alias[1] = por(a_alias[1], b_alias[1]);
1374 r_alias[2] = por(a_alias[2], b_alias[2]);
1375 r_alias[3] = por(a_alias[3], b_alias[3]);
1380EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1382 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1383 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1384 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1385 r_alias[0] = pxor(a_alias[0], b_alias[0]);
1386 r_alias[1] = pxor(a_alias[1], b_alias[1]);
1387 r_alias[2] = pxor(a_alias[2], b_alias[2]);
1388 r_alias[3] = pxor(a_alias[3], b_alias[3]);
1393EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1395 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1396 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1397 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1398 r_alias[0] = pandnot(a_alias[0], b_alias[0]);
1399 r_alias[1] = pandnot(a_alias[1], b_alias[1]);
1400 r_alias[2] = pandnot(a_alias[2], b_alias[2]);
1401 r_alias[3] = pandnot(a_alias[3], b_alias[3]);
1406EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1408 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1409 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1410 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1411 r_alias[0] = padd(a_alias[0], b_alias[0]);
1412 r_alias[1] = padd(a_alias[1], b_alias[1]);
1413 r_alias[2] = padd(a_alias[2], b_alias[2]);
1414 r_alias[3] = padd(a_alias[3], b_alias[3]);
1419EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1421 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1422 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1423 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1424 r_alias[0] = psub(a_alias[0], b_alias[0]);
1425 r_alias[1] = psub(a_alias[1], b_alias[1]);
1426 r_alias[2] = psub(a_alias[2], b_alias[2]);
1427 r_alias[3] = psub(a_alias[3], b_alias[3]);
1432EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(
const Packet4h2& a) {
1434 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1435 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1436 r_alias[0] = pnegate(a_alias[0]);
1437 r_alias[1] = pnegate(a_alias[1]);
1438 r_alias[2] = pnegate(a_alias[2]);
1439 r_alias[3] = pnegate(a_alias[3]);
1444EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(
const Packet4h2& a) {
1449EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1451 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1452 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1453 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1454 r_alias[0] = pmul(a_alias[0], b_alias[0]);
1455 r_alias[1] = pmul(a_alias[1], b_alias[1]);
1456 r_alias[2] = pmul(a_alias[2], b_alias[2]);
1457 r_alias[3] = pmul(a_alias[3], b_alias[3]);
1462EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b,
1463 const Packet4h2& c) {
1465 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1466 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1467 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1468 const half2* c_alias =
reinterpret_cast<const half2*
>(&c);
1469 r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
1470 r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
1471 r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
1472 r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
1477EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1479 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1480 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1481 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1482 r_alias[0] = pdiv(a_alias[0], b_alias[0]);
1483 r_alias[1] = pdiv(a_alias[1], b_alias[1]);
1484 r_alias[2] = pdiv(a_alias[2], b_alias[2]);
1485 r_alias[3] = pdiv(a_alias[3], b_alias[3]);
1490EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1492 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1493 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1494 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1495 r_alias[0] = pmin(a_alias[0], b_alias[0]);
1496 r_alias[1] = pmin(a_alias[1], b_alias[1]);
1497 r_alias[2] = pmin(a_alias[2], b_alias[2]);
1498 r_alias[3] = pmin(a_alias[3], b_alias[3]);
1503EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1505 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1506 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1507 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1508 r_alias[0] = pmax(a_alias[0], b_alias[0]);
1509 r_alias[1] = pmax(a_alias[1], b_alias[1]);
1510 r_alias[2] = pmax(a_alias[2], b_alias[2]);
1511 r_alias[3] = pmax(a_alias[3], b_alias[3]);
1516EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
const Packet4h2& a) {
1517 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1519 return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
1523EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
const Packet4h2& a) {
1524 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1525 half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
1526 half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
1527 __half first = predux_max(m0);
1528 __half second = predux_max(m1);
1529#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1530 return (__hgt(first, second) ? first : second);
1532 float ffirst = __half2float(first);
1533 float fsecond = __half2float(second);
1534 return (ffirst > fsecond) ? first : second;
1539EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
const Packet4h2& a) {
1540 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1541 half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
1542 half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
1543 __half first = predux_min(m0);
1544 __half second = predux_min(m1);
1545#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1546 return (__hlt(first, second) ? first : second);
1548 float ffirst = __half2float(first);
1549 float fsecond = __half2float(second);
1550 return (ffirst < fsecond) ? first : second;
1556EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
const Packet4h2& a) {
1557 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1558 return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
1562EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(
const Packet4h2& a) {
1564 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1565 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1566 r_alias[0] = plog1p(a_alias[0]);
1567 r_alias[1] = plog1p(a_alias[1]);
1568 r_alias[2] = plog1p(a_alias[2]);
1569 r_alias[3] = plog1p(a_alias[3]);
1574EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(
const Packet4h2& a) {
1576 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1577 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1578 r_alias[0] = pexpm1(a_alias[0]);
1579 r_alias[1] = pexpm1(a_alias[1]);
1580 r_alias[2] = pexpm1(a_alias[2]);
1581 r_alias[3] = pexpm1(a_alias[3]);
1586EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(
const Packet4h2& a) {
1588 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1589 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1590 r_alias[0] = plog(a_alias[0]);
1591 r_alias[1] = plog(a_alias[1]);
1592 r_alias[2] = plog(a_alias[2]);
1593 r_alias[3] = plog(a_alias[3]);
1598EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(
const Packet4h2& a) {
1600 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1601 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1602 r_alias[0] = pexp(a_alias[0]);
1603 r_alias[1] = pexp(a_alias[1]);
1604 r_alias[2] = pexp(a_alias[2]);
1605 r_alias[3] = pexp(a_alias[3]);
1610EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(
const Packet4h2& a) {
1612 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1613 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1614 r_alias[0] = psqrt(a_alias[0]);
1615 r_alias[1] = psqrt(a_alias[1]);
1616 r_alias[2] = psqrt(a_alias[2]);
1617 r_alias[3] = psqrt(a_alias[3]);
1622EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(
const Packet4h2& a) {
1624 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1625 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1626 r_alias[0] = prsqrt(a_alias[0]);
1627 r_alias[1] = prsqrt(a_alias[1]);
1628 r_alias[2] = prsqrt(a_alias[2]);
1629 r_alias[3] = prsqrt(a_alias[3]);
1636EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(
const half2& a,
const half2& b) {
1637#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1638 return __hadd2(a, b);
1640 float a1 = __low2float(a);
1641 float a2 = __high2float(a);
1642 float b1 = __low2float(b);
1643 float b2 = __high2float(b);
1646 return __floats2half2_rn(r1, r2);
1651EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(
const half2& a,
const half2& b) {
1652#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1653 return __hmul2(a, b);
1655 float a1 = __low2float(a);
1656 float a2 = __high2float(a);
1657 float b1 = __low2float(b);
1658 float b2 = __high2float(b);
1661 return __floats2half2_rn(r1, r2);
1666EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(
const half2& a,
const half2& b) {
1667#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1668 return __h2div(a, b);
1670 float a1 = __low2float(a);
1671 float a2 = __high2float(a);
1672 float b1 = __low2float(b);
1673 float b2 = __high2float(b);
1676 return __floats2half2_rn(r1, r2);
1681EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(
const half2& a,
const half2& b) {
1682 float a1 = __low2float(a);
1683 float a2 = __high2float(a);
1684 float b1 = __low2float(b);
1685 float b2 = __high2float(b);
1686 __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
1687 __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
1688 return __halves2half2(r1, r2);
1692EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(
const half2& a,
const half2& b) {
1693 float a1 = __low2float(a);
1694 float a2 = __high2float(a);
1695 float b1 = __low2float(b);
1696 float b2 = __high2float(b);
1697 __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
1698 __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
1699 return __halves2half2(r1, r2);
1704#undef EIGEN_GPU_HAS_LDG
1705#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1706#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_rint_op< typename Derived::Scalar >, const Derived > rint(const Eigen::ArrayBase< Derived > &x)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_trunc_op< typename Derived::Scalar >, const Derived > trunc(const Eigen::ArrayBase< Derived > &x)
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_floor_op< typename Derived::Scalar >, const Derived > floor(const Eigen::ArrayBase< Derived > &x)
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_ceil_op< typename Derived::Scalar >, const Derived > ceil(const Eigen::ArrayBase< Derived > &x)