#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
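// Full reducers for GPU, don't vectorize for now.

// Reducer function that enables multiple gpu threads to safely accumulate at the same
// output address. It reads the current value of the output variable and attempts to
// update it with the new value. If another gpu thread updated the content of the output
// address in the meantime, it tries again.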
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
#ifdef EIGEN_HAS_GPU_FP16
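// CAS-based reduction for a pair of packed half values: the half2 payload is
// reinterpreted as a 32-bit word so that atomicCAS can be used.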
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#ifdef EIGEN_GPU_COMPILE_PHASE
// The wide packet is reduced one half2 at a time, so the reduction must be associative.
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
  half2* houtput = reinterpret_cast<half2*>(output);
  half2* haccum = reinterpret_cast<half2*>(&accum);
  for (int i = 0; i < 4; ++i) {
    atomicReduce(houtput + i, *(haccum + i), reducer);
  }
}
#endif // EIGEN_GPU_COMPILE_PHASE

#endif // EIGEN_HAS_GPU_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
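// Kernel that reduces an entire tensor to a single coefficient. Each thread accumulates
// NumPerThread inputs, partial results are combined with warp shuffles, and one lane per
// warp folds its value into *output with atomicReduce. A semaphore coordinates the
// initialization of *output when several blocks are in flight.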
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached.
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i+=BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
  #if defined(EIGEN_HIPCC)
    // Use std::is_floating_point to pick the __shfl_down overload: when the coefficient
    // type is double, hipcc otherwise reports a "call to __shfl_down is ambiguous" error.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    } else {
      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
  #else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
  #endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
    __threadfence_system();
#endif
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
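// Initializes the scratch buffer used by the half-float full reduction when the number
// of coefficients is not a multiple of the packet size.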
template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs, half* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch =
          __halves2half2(input.coeff(i), input.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    packet_type reduce = reducer.template initializePacket<packet_type>();
    internal::pstoreu(scratch, reduce);
  }
}
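// Grid-stride initialization of a half-valued output buffer, packet by packet, with a
// scalar tail for the remainder.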
template <typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef typename packet_traits<Eigen::half>::type PacketType;

  const Index num_packets =
      num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder =
      num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}
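// Half-float variant of the full reduction kernel: accumulates packets of halfs,
// combines them with warp shuffles (with per-platform workarounds for __shfl_down on
// half2), and folds the per-warp results into the scratch buffer.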
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
    Reducer reducer, const Self input, Index num_coeffs,
    half* output, half* scratch) {
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  eigen_assert(NumPerThread % packet_width == 0);
  const Index first_index =
      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the scratch buffer if it wasn't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        pstoreu(scratch, reducer.template initializePacket<PacketType>());
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(
              input.coeff(num_coeffs - packet_width + 2 * i),
              input.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        PacketType reduce = reducer.template initializePacket<PacketType>();
        pstoreu(scratch, reduce);
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width,
                          NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    eigen_assert(index + packet_width < num_coeffs);
    PacketType val = input.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
  #if defined(EIGEN_HIPCC)
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
      union { int i; half2 h; } wka_in, wka_out;
      wka_in.h = hacc[i];
      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
  #else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
  #endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
}
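// Final cleanup pass for multi-block half-float full reductions: folds the packed
// partial results left in scratch into the single half output.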
template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
  eigen_assert(threadIdx.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  if (unpacket_traits<packet_type>::size == 1) {
    *output = *scratch;
  } else {
    half2* pscratch = reinterpret_cast<half2*>(scratch);
    half tmp = __float2half(0.f);
    for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
      reducer.reduce(__low2half(*pscratch), &tmp);
      reducer.reduce(__high2half(*pscratch), &tmp);
      pscratch++;
    }
    *output = tmp;
  }
}

#endif // EIGEN_HAS_GPU_FP16
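// Launchers dispatch to the kernels above based on the output type; the primary
// template catches unsupported types.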
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    gpu_assert(false && "Should only be called on doubles, floats and half floats");
  }
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
  }
};
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half* scratch = static_cast<half*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we
      // can't be sure there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                        1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // The optimized code path is restricted to the simple cases of doubles, floats and,
  // when packet access is available, half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};
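// Kernel for reductions along the inner-most dimensions: each block walks over rows of
// the (preserved x reduced) view, accumulates NumPerThread columns per thread with a
// 16-way unrolled loop, then combines partial values with warp shuffles and atomicReduce.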
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
      #if defined(EIGEN_HIPCC)
        // Use std::is_floating_point to pick the __shfl_down overload: when Type is
        // double, hipcc otherwise reports a "call to __shfl_down is ambiguous" error.
        if (std::is_floating_point<Type>::value) {
          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
        } else {
          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
      #else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
      #endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
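// Half-float variant of the inner reduction kernel: processes two preserved rows at a
// time so that the pair of results can be written back as a single half2.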
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                            half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs;
         i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);  // each iteration handles two rows

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin =
          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col =
            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + packet_width <= num_coeffs_to_reduce;
               col += blockDim.x) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(
                row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>(
                (row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel off the last, odd coefficient of each row.
              const half last1 =
                  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 =
                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
                                     (row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
      #if defined(EIGEN_HIPCC)
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
          union { int i; half2 h; } wka_in1, wka_out1;
          wka_in1.h = rv1[i];
          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
          hr1[i] = wka_out1.h;

          union { int i; half2 h; } wka_in2, wka_out2;
          wka_in2.h = rv2[i];
          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
          hr2[i] = wka_out2.h;
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
      #else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] =
              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
          hr2[i] =
              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
      #endif
      }

      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce((half2*)loc, val, reducer);
      }
    }
  }
}
#endif // EIGEN_HAS_GPU_FP16
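// Launchers for inner reductions: the primary template rejects unsupported output
// types, the specializations below pick launch parameters and run the kernels.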
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
                        num_blocks, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path.
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure
      // there won't be a race condition between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                        1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // The optimized code path is restricted to the simple cases of doubles, floats and,
  // when packet access is available, half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code path when the reduction dimension is small.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
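// Kernel for reductions along the outer-most dimensions: each thread reduces
// NumPerThread coefficients of a column and accumulates the result with atomicReduce.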
template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }

    // Write the final result to the output memory.
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
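// Outer reductions are only optimized for float and double outputs; small reduction
// sizes fall back to the generic code path by returning true.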
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // The optimized code path is restricted to the simple cases of doubles and floats.
  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static
  #if !defined(EIGEN_HIPCC)
  // Keeping this EIGEN_DEVICE_FUNC for HIP makes hipcc fail with an "Unsupported
  // __device__ function used in host code" error, so it is only added on the
  // non-HIP path.
  EIGEN_DEVICE_FUNC
  #endif
  bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code path when the reduction dimension is small.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
      const int max_blocks = device.getNumGpuMultiProcessors() *
                             device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
                        num_blocks, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                      num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H