#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU, don't vectorize for now

// Reducer function that enables multiple cuda thread blocks to compute a
// reduction in parallel. Partial results are combined into *output with a
// compare-and-swap loop so that concurrent updates are never lost.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(accum);
  EIGEN_UNUSED_VARIABLE(reducer);
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}

#ifdef EIGEN_HAS_CUDA_FP16
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#endif // EIGEN_HAS_CUDA_FP16

template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(accum);
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

template <typename CoeffType, typename Index>
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}
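
// Illustrative note (not part of Eigen): the loop above is the classic grid-stride
// pattern, so any grid size produces a correct initialization. A minimal standalone
// equivalent for float outputs would look like:
//
//   __global__ void init_with(float val, int n, float* out) {
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
//       out[i] = val;
//     }
//   }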

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if __CUDA_ARCH__ >= 300
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  }
  else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      }
      else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        }
        while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
  for (Index i = 0; i < max_iter; i+=BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    // Pre-CUDA-9 shuffle without an explicit mask
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
  }
#else
  EIGEN_UNUSED_VARIABLE(reducer);
  EIGEN_UNUSED_VARIABLE(input);
  EIGEN_UNUSED_VARIABLE(num_coeffs);
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(semaphore);
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder = num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    // Seed the scratch area with the coefficients that don't fill a whole packet.
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    packet_type reduce = reducer.template initializePacket<packet_type>();
    internal::pstoreu(scratch, reduce);
  }
#else
  EIGEN_UNUSED_VARIABLE(input);
  EIGEN_UNUSED_VARIABLE(reducer);
  EIGEN_UNUSED_VARIABLE(num_coeffs);
  EIGEN_UNUSED_VARIABLE(scratch);
#endif
}

template <typename Self,
          typename Reducer, typename Index>
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef typename packet_traits<Eigen::half>::type PacketType;
  EIGEN_UNUSED_VARIABLE(input);

  const Index num_packets = num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder = num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                             half* output, half2* scratch) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  EIGEN_UNUSED_VARIABLE(num_coeffs);
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  eigen_assert(NumPerThread % packet_width == 0);
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the scratch area if it wasn't initialized by the ReductionInitFullReduxKernelHalfFloat
  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        pstoreu(scratch, reducer.template initializePacket<PacketType>());
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i),
                                      input.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        PacketType reduce = reducer.template initializePacket<PacketType>();
        pstoreu(scratch, reduce);
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    eigen_assert(index + packet_width < num_coeffs);
    PacketType val = input.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#endif
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
#else
  EIGEN_UNUSED_VARIABLE(reducer);
  EIGEN_UNUSED_VARIABLE(input);
  EIGEN_UNUSED_VARIABLE(num_coeffs);
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(scratch);
#endif
}

template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
  eigen_assert(threadIdx.x == 1);
  half tmp = __low2half(*scratch);
  reducer.reduce(__high2half(*scratch), &tmp);
  *output = tmp;
}
#endif // EIGEN_HAS_CUDA_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    assert(false && "Should only be called on doubles, floats and half floats");
  }
};

template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
  }
};
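
// Illustrative note (not part of Eigen): the LAUNCH_CUDA_KERNEL invocation above
// roughly corresponds to a plain triple-chevron launch on the device's stream, i.e.
// something like
//
//   FullReductionKernel<256, 128, Self, Op, Index>
//       <<<num_blocks, block_size, 0, device.stream()>>>(reducer, self, num_coeffs, output, semaphore);
//
// followed by an error check; the exact definition lives in the GPU device header.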

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    half2* scratch = static_cast<half2*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we can't be
      // sure that there won't be a race condition between multiple thread blocks.
      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
    }

    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
                         1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif // EIGEN_HAS_CUDA_FP16

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of doubles, floats and half floats
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
  }
};
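
// Illustrative sketch (not part of Eigen's sources): user-level code that reaches
// FullReducer<..., GpuDevice> above. The stream setup and the device buffer names
// (d_in, d_out) are hypothetical; the assignment on the last line is the point.
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 1024, 1024);
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > out(d_out);
//   out.device(gpu_device) = in.sum();  // full reduction: a single scalar output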

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
#if __CUDA_ARCH__ >= 300
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          // Last partial block: don't unroll, to avoid reading past the end of the row.
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else
  EIGEN_UNUSED_VARIABLE(reducer);
  EIGEN_UNUSED_VARIABLE(input);
  EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
  EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
  EIGEN_UNUSED_VARIABLE(output);
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

#ifdef EIGEN_HAS_CUDA_FP16

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                              half* output) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                                    input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                                    input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel off the last, odd coefficient of each row.
              const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int j = 0; j < packet_width / 2; j++) {
          hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
          hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#endif
      }

      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
      }
    }
  }
#else
  EIGEN_UNUSED_VARIABLE(reducer);
  EIGEN_UNUSED_VARIABLE(input);
  EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
  EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
  EIGEN_UNUSED_VARIABLE(output);
#endif
}

#endif // EIGEN_HAS_CUDA_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};

template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    typename internal::enable_if<
      internal::is_same<float, OutputType>::value ||
      internal::is_same<double, OutputType>::value,
    void>::type> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumCudaMultiProcessors() *
                              device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                         num_blocks2, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#ifdef EIGEN_HAS_CUDA_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 128;
    const int num_per_thread = 64;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // EIGEN_HAS_CUDA_FP16

template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of doubles, floats and half floats
#ifdef EIGEN_HAS_CUDA_FP16
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value ||
       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code path if the number of coefficients to reduce
    // is too small.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};
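
// Illustrative sketch (not part of Eigen's sources): a reduction along the innermost
// (contiguous) dimension that can be served by InnerReducer above. With the default
// column-major layout the innermost dimension is dimension 0; d_in/d_out are
// hypothetical device buffers and gpu_device is set up as in the earlier sketch.
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 1024, 512);  // 1024 coeffs to reduce per output
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, 512);      // 512 preserved coeffs
//   Eigen::array<int, 1> dims({0});
//   out.device(gpu_device) = in.sum(dims);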

template <int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                     typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }

    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}
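
// Illustrative sketch (not part of Eigen): a CPU reference for what OuterReductionKernel
// computes. Unlike the inner case, consecutive input coefficients now belong to
// different outputs, so each output element reduces a strided column of the input.
//
//   for (Index col = 0; col < num_preserved_coeffs; ++col) {
//     float acc = reducer.initialize();
//     for (Index j = 0; j < num_coeffs_to_reduce; ++j) {
//       reducer.reduce(input[j * num_preserved_coeffs + col], &acc);
//     }
//     output[col] = acc;  // assumes a reducer whose finalize() is the identity
//   }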

template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of floats and doubles
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
       internal::is_same<typename Self::CoeffReturnType, double>::value);

  template <typename Device, typename OutputType>
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code path if the number of coefficients to reduce
    // is too small.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that
      // there won't be a race condition between multiple thread blocks.
      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumCudaMultiProcessors() *
                              device.maxCudaThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                         num_blocks2, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
    }

    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#endif // defined(EIGEN_USE_GPU) && defined(__CUDACC__)

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H