#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
// Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple gpu threads to safely accumulate at the same
// output address. It reads the current value of the output variable and attempts to
// update it with the new value. If another gpu thread updated the output address in
// the meantime, it tries again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      // The reduction doesn't change the output: nothing to write back.
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      // Another thread updated the output in the meantime: retry with the value read back.
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
#else   // EIGEN_CUDA_ARCH >= 300
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(accum);
  EIGEN_UNUSED_VARIABLE(reducer);
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
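// atomicExch has no double overload in CUDA, hence the bit-cast through
// unsigned long long above. The same type-punning idiom can be used to build other
// 64-bit atomics where hardware support is missing; an illustrative, non-Eigen
// sketch of a CAS-based double atomicAdd:
//
//   __device__ double atomic_add_double(double* address, double val) {
//     unsigned long long* addr = reinterpret_cast<unsigned long long*>(address);
//     unsigned long long old = *addr, assumed;
//     do {
//       assumed = old;
//       old = atomicCAS(addr, assumed,
//                       __double_as_longlong(val + __longlong_as_double(assumed)));
//     } while (assumed != old);  // retry if another thread updated *address meanwhile
//     return __longlong_as_double(old);
//   }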
#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#ifdef EIGEN_GPU_COMPILE_PHASE
// The reducer must be associative: the Packet4h2 update is not atomic as a whole,
// only each half2 half of it is updated atomically.
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
  half2* houtput = reinterpret_cast<half2*>(output);
  half2* haccum = reinterpret_cast<half2*>(&accum);
  for (int i = 0; i < 4; ++i) {
    atomicReduce(houtput + i, *(haccum + i), reducer);
  }
}
#endif  // EIGEN_GPU_COMPILE_PHASE
#endif  // EIGEN_HAS_GPU_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
#else   // EIGEN_CUDA_ARCH >= 300
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(accum);
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
                                                                 CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                 typename Self::CoeffReturnType* output,
                                                                 unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  } else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      } else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    // Use std::is_floating_point to pick the overload: hipcc only provides float and
    // int versions of __shfl_down, so calling it with e.g. double would be ambiguous.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    } else {
      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
    __threadfence_system();
#endif
  }
#else   // EIGEN_CUDA_ARCH >= 300
  EIGEN_UNUSED_VARIABLE(reducer);
  EIGEN_UNUSED_VARIABLE(input);
  EIGEN_UNUSED_VARIABLE(num_coeffs);
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(semaphore);
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
                                                                                   Index num_coeffs, half* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder = num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    // Seed the scratch buffer with the tail coefficients that don't fill a full packet.
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    packet_type reduce = reducer.template initializePacket<packet_type>();
    internal::pstoreu(scratch, reduce);
  }
}
template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/,
                                                                          Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef typename packet_traits<Eigen::half>::type PacketType;

  const Index num_packets = num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder = num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}
template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
                                                                          Index num_coeffs, half* output,
                                                                          half* scratch) {
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  eigen_assert(NumPerThread % packet_width == 0);
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the scratch buffer if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        pstoreu(scratch, reducer.template initializePacket<PacketType>());
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i),
                                      input.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        PacketType reduce = reducer.template initializePacket<PacketType>();
        pstoreu(scratch, reduce);
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    eigen_assert(index + packet_width < num_coeffs);
    PacketType val = input.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
      union {
        int i;
        half2 h;
      } wka_in, wka_out;
      wka_in.h = hacc[i];
      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
}
template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
  eigen_assert(threadIdx.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  if (unpacket_traits<packet_type>::size == 1) {
    *output = *scratch;
  } else {
    half2* pscratch = reinterpret_cast<half2*>(scratch);
    half tmp = __float2half(0.f);
    for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
      reducer.reduce(__low2half(*pscratch), &tmp);
      reducer.reduce(__high2half(*pscratch), &tmp);
      pscratch++;
    }
    *output = tmp;
  }
}
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    gpu_assert(false && "Should only be called on doubles, floats and half floats");
  }
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
                     void>> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
                      device, reducer, self, num_coeffs, output, semaphore);
  }
};
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
  }
};
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    half* scratch = static_cast<half*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the scratch buffer outside the reduction kernel when we can't be
      // sure that there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
                        num_coeffs, scratch);
    }

    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), num_blocks,
                      block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), 1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support well exotic types such as complex, so reduce the
  // scope of the optimized version of the code to the simple cases of floats, doubles and
  // half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                            reducer_traits<Op, GpuDevice>::PacketAccess));
#else   // EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device,
                                                                                                  output, num_coeffs);
  }
};
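// Host-side usage sketch (illustrative, not part of this header): a scalar reduction
// of a GPU-resident tensor is what ends up dispatching to the FullReducer
// specialization above. The device pointers d_in/d_out and the size num_elems are
// hypothetical:
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> in(d_in, num_elems);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(d_out);
//   out.device(gpu_device) = in.sum();  // full reduction on the GPU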
template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
                                                                  Index num_coeffs_to_reduce,
                                                                  Index num_preserved_coeffs,
                                                                  typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        // Use std::is_floating_point to pick the overload: hipcc only provides float and
        // int versions of __shfl_down, so calling it with e.g. double would be ambiguous.
        if (std::is_floating_point<Type>::value) {
          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
        } else {
          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else   // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif  // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16

template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
                                                                           Index num_coeffs_to_reduce,
                                                                           Index num_preserved_coeffs, half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);  // each iteration handles 2 rows

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                                    input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                                    input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel off the last, odd coefficient.
              const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
          union {
            int i;
            half2 h;
          } wka_in1, wka_out1;
          wka_in1.h = rv1[i];
          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
          hr1[i] = wka_out1.h;

          union {
            int i;
            half2 h;
          } wka_in2, wka_out2;
          wka_in2.h = rv2[i];
          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
          hr2[i] = wka_out2.h;
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int j = 0; j < packet_width / 2; j++) {
          hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
          hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#endif
      }

      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
      }
    }
  }
}

#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
                                    typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
                     void>> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure
      // that there won't be a race condition between multiple thread blocks.
      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path.
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure
      // that there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
                      device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif  // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex, so reduce the
  // scope of the optimized version of the code to the simple cases of floats, doubles and
  // half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                            reducer_traits<Op, GpuDevice>::PacketAccess));
#else   // EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code path if there are only a few coefficients to
    // reduce per output value, since the kernel relies on atomics to combine partial results.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
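// Host-side usage sketch (illustrative, not part of this header): reducing the
// contiguous, innermost dimension of a column-major GPU tensor is the case handled by
// the InnerReducer specialization above (provided enough coefficients are reduced per
// output value). Device pointers and sizes are hypothetical:
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, rows, cols);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, cols);
//   Eigen::array<int, 1> reduce_dim{{0}};  // dimension 0 is the contiguous one
//   out.device(gpu_device) = in.sum(reduce_dim);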
template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input,
                                                                  Index num_coeffs_to_reduce,
                                                                  Index num_preserved_coeffs,
                                                                  typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex, so reduce the
  // scope of the optimized version of the code to the simple cases of floats and doubles.
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static
#if !defined(EIGEN_HIPCC)
      // FIXME : with hipcc, marking this overload EIGEN_DEVICE_FUNC triggers a runtime
      // "No device code available" error in the cxx11_tensor_reduction_gpu test.
      EIGEN_DEVICE_FUNC
#endif
      bool
      run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code path if there are only a few coefficients to
    // reduce per output value, since the kernel relies on atomics to combine partial results.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure
      // that there won't be a race condition between multiple thread blocks.
      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
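// Host-side usage sketch (illustrative, not part of this header): reducing the
// outermost, strided dimension of a column-major GPU tensor of floats is the case
// handled by the OuterReducer specialization above. Device pointers and sizes are
// hypothetical:
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, rows, cols);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> out(d_out, rows);
//   Eigen::array<int, 1> reduce_dim{{1}};  // reduce across the strided dimension
//   out.device(gpu_device) = in.sum(reduce_dim);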
#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H