Eigen-unsupported 5.0.1-dev+284dcc12
TensorReductionGpu.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace internal {

#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
// Full reducers for GPU, don't vectorize for now

// Reducer function that enables multiple GPU threads to safely accumulate at the same
// output address. It basically reads the current value of the output variable, and
// attempts to update it with the new value. If in the meantime another GPU thread has
// updated the content of the output address, it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
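  // 4- and 8-byte types are type-punned to unsigned integers so that atomicCAS can be
  // used to implement the compare-and-swap retry loop for arbitrary reducers.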
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
#else // EIGEN_CUDA_ARCH >= 300
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(accum);
  EIGEN_UNUSED_VARIABLE(reducer);
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}

// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
}

template <>
__device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}

#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
  unsigned int newval = oldval;
  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
  if (newval == oldval) {
    return;
  }
  unsigned int readback;
  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
    oldval = readback;
    newval = oldval;
    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
    if (newval == oldval) {
      return;
    }
  }
}
#ifdef EIGEN_GPU_COMPILE_PHASE
// The reduction must be associative, since reducing the wide packet is not atomic as a whole;
// only the individual half2 operations are atomic.
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
  half2* houtput = reinterpret_cast<half2*>(output);
  half2* haccum = reinterpret_cast<half2*>(&accum);
  for (int i = 0; i < 4; ++i) {
    atomicReduce(houtput + i, *(haccum + i), reducer);
  }
}
#endif // EIGEN_GPU_COMPILE_PHASE
#endif // EIGEN_HAS_GPU_FP16

template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(accum);
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}

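// Initializes the num_preserved_coeffs output coefficients to val using a grid-stride loop.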
template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
                                                                  CoeffType* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
    output[i] = val;
  }
}

template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output,
                                                                  unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = reducer.initialize();
    }
  } else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // We're the first block to run, initialize the output value
        atomicExchCustom(output, reducer.initialize());
        __threadfence();
        atomicExch(semaphore, 2u);
      } else {
        // Wait for the first block to initialize the output value.
        // Use atomicCAS here to ensure that the reads aren't cached
        unsigned int val;
        do {
          val = atomicCAS(semaphore, 2u, 2u);
        } while (val < 2u);
      }
    }
  }

  __syncthreads();

  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);

  typename Self::CoeffReturnType accum = reducer.initialize();
  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + i;
    eigen_assert(index < num_coeffs);
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

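  // Warp-level tree reduction via shuffle-down: after log2(warpSize) steps the first
  // lane of each warp holds the warp's partial result.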
#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    // use std::is_floating_point to determine the type of accum
    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
    // and list the float and int versions of __shfl_down as the candidate functions.
    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
    } else {
      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
    }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
  }

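  // Only the first lane of each warp merges its warp-level result into the global output.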
  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }

  if (gridDim.x > 1 && threadIdx.x == 0) {
    // Let the last block reset the semaphore
    atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
    __threadfence_system();
#endif
  }
#else // EIGEN_CUDA_ARCH >= 300
  EIGEN_UNUSED_VARIABLE(reducer);
  EIGEN_UNUSED_VARIABLE(input);
  EIGEN_UNUSED_VARIABLE(num_coeffs);
  EIGEN_UNUSED_VARIABLE(output);
  EIGEN_UNUSED_VARIABLE(semaphore);
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}

#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
                                                                                   Index num_coeffs, half* scratch) {
  eigen_assert(blockDim.x == 1);
  eigen_assert(gridDim.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  Index packet_remainder = num_coeffs % Index(unpacket_traits<packet_type>::size);
  if (packet_remainder != 0) {
    half2* h2scratch = reinterpret_cast<half2*>(scratch);
    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
      *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1));
      h2scratch++;
    }
    if ((num_coeffs & 1) != 0) {
      half lastCoeff = input.coeff(num_coeffs - 1);
      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
    }
  } else {
    packet_type reduce = reducer.template initializePacket<packet_type>();
    internal::pstoreu(scratch, reduce);
  }
}

template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/,
                                                                          Index num_coeffs, half* output) {
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const Index num_threads = blockDim.x * gridDim.x;
  typedef typename packet_traits<Eigen::half>::type PacketType;

  const Index num_packets = num_coeffs / Index(unpacket_traits<PacketType>::size);
  PacketType* p_output = reinterpret_cast<PacketType*>(output);
  for (Index i = thread_id; i < num_packets; i += num_threads) {
    p_output[i] = reducer.template initializePacket<PacketType>();
  }
  Index packet_remainder = num_coeffs % Index(unpacket_traits<PacketType>::size);
  if (thread_id < packet_remainder) {
    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
  }
}

template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
                                                                          Index num_coeffs, half* output,
                                                                          half* scratch) {
  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  eigen_assert(NumPerThread % packet_width == 0);
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;

  // Initialize the output value if it wasn't initialized by the ReductionInitKernel

  if (gridDim.x == 1) {
    if (first_index == 0) {
      int rem = num_coeffs % packet_width;
      if (rem != 0) {
        half2* p_scratch = reinterpret_cast<half2*>(scratch);
        pstoreu(scratch, reducer.template initializePacket<PacketType>());
        for (int i = 0; i < rem / 2; i++) {
          *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i),
                                      input.coeff(num_coeffs - packet_width + 2 * i + 1));
          p_scratch++;
        }
        if ((num_coeffs & 1) != 0) {
          half last = input.coeff(num_coeffs - 1);
          *p_scratch = __halves2half2(last, reducer.initialize());
        }
      } else {
        PacketType reduce = reducer.template initializePacket<PacketType>();
        pstoreu(scratch, reduce);
      }
    }
    __syncthreads();
  }

  PacketType accum = reducer.template initializePacket<PacketType>();
  const Index max_iter =
      numext::mini<Index>((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width);
  for (Index i = 0; i < max_iter; i += BlockSize) {
    const Index index = first_index + packet_width * i;
    eigen_assert(index + packet_width < num_coeffs);
    PacketType val = input.template packet<Unaligned>(index);
    reducer.reducePacket(val, &accum);
  }

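  // Warp-level reduction of the accumulated half2 packets via shuffle-down.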
#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
      union {
        int i;
        half2 h;
      } wka_in, wka_out;
      wka_in.h = hacc[i];
      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down(hacc[i], offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);
#else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
    half2* hacc = reinterpret_cast<half2*>(&accum);
    for (int i = 0; i < packet_width / 2; i++) {
      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
    }
    reducer.reducePacket(r1, &accum);

#endif
  }

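  // The first lane of each warp atomically accumulates its packet into the global scratch buffer.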
  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
  }

  __syncthreads();
  half2* rv1 = reinterpret_cast<half2*>(scratch);
  if (packet_width > 2) {
    reducer.reducePacket(rv1[2], rv1);
    reducer.reducePacket(rv1[3], rv1 + 1);
    reducer.reducePacket(rv1[1], rv1);
  }
  if (gridDim.x == 1) {
    if (first_index == 0) {
      half tmp = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &tmp);
      *output = tmp;
    }
  }
}

template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
  eigen_assert(threadIdx.x == 1);
  typedef packet_traits<Eigen::half>::type packet_type;
  if (unpacket_traits<packet_type>::size == 1) {
    *output = *scratch;
  } else {
    half2* pscratch = reinterpret_cast<half2*>(scratch);
    half tmp = __float2half(0.f);
    for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
      reducer.reduce(__low2half(*pscratch), &tmp);
      reducer.reduce(__high2half(*pscratch), &tmp);
      pscratch++;
    }
    *output = tmp;
  }
}

#endif // EIGEN_HAS_GPU_FP16

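// Fallback launcher: on GPU the full reduction is only specialized for float, double and
// (when packet access is available) Eigen::half outputs; any other output type hits the
// gpu_assert below.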
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
    gpu_assert(false && "Should only be called on doubles, floats and half floats");
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
                     void>> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);

    unsigned int* semaphore = NULL;
    if (num_blocks > 1) {
      semaphore = device.semaphore();
    }

    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
                      device, reducer, self, num_coeffs, output, semaphore);
  }
};

#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
  }
};

template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs) {
    typedef typename Self::Index Index;

    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    half* scratch = static_cast<half*>(device.scratchpad());

    if (num_blocks > 1) {
      // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
                        num_coeffs, scratch);
    }

    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), num_blocks,
                      block_size, 0, device, reducer, self, num_coeffs, output, scratch);

    if (num_blocks > 1) {
      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), 1, 1, 0, device, reducer, output, scratch);
    }
  }
};
#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support exotic types such as complex well,
  // so we reduce the scope of the optimized version of the code to the simple cases
  // of doubles, floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                            reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return;
    }

    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device,
                                                                                                  output, num_coeffs);
  }
};

template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce,
                                                                   Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  const int unroll_times = 16;
  eigen_assert(NumPerThread % unroll_times == 0);

  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

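  // Blocks are striped over (row, column-block) pairs: each pass of this loop reduces one
  // chunk of NumPerThread * blockDim.x coefficients belonging to a single output row.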
  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = i / input_col_blocks;

    if (row < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;

      Type reduced_val = reducer.initialize();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
        if (last_col >= num_coeffs_to_reduce) {
          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
            reducer.reduce(val, &reduced_val);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k);
            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        // use std::is_floating_point to determine the type of reduced_val
        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
        // and list the float and int versions of __shfl_down as the candidate functions.
        if (std::is_floating_point<Type>::value) {
          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
        } else {
          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
        }
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
        atomicReduce(&(output[row]), reduced_val, reducer);
      }
    }
  }
#else // EIGEN_CUDA_ARCH >= 300
  gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}

#ifdef EIGEN_HAS_GPU_FP16

template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
                                                                           Index num_coeffs_to_reduce,
                                                                           Index num_preserved_coeffs, half* output) {
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
  eigen_assert(gridDim.y == 1);
  eigen_assert(gridDim.z == 1);

  typedef typename packet_traits<Eigen::half>::type PacketType;
  const int packet_width = unpacket_traits<PacketType>::size;
  const int unroll_times = 16 / packet_width;
  eigen_assert(NumPerThread % unroll_times == 0);
  eigen_assert(unroll_times % 2 == 0);

  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);

  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    Index i = packet_width * thread_id;
    for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) {
      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
      *poutput = reducer.template initializePacket<PacketType>();
    }
    if (i < num_preserved_coeffs) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

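  // Each block processes two adjacent output rows at a time so that the final per-row
  // results can be combined into a single half2 and written with one atomic update.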
  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
    const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows

    if (row + 1 < num_preserved_coeffs) {
      const Index col_block = i % input_col_blocks;
      const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);

      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();

      for (Index j = 0; j < NumPerThread; j += unroll_times) {
        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
        if (last_col >= num_coeffs_to_reduce) {
          Index col = col_begin + blockDim.x * j;
          for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) {
            const PacketType val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
            reducer.reducePacket(val1, &reduced_val1);
            const PacketType val2 = input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col);
            reducer.reducePacket(val2, &reduced_val2);
          }
          if (col < num_coeffs_to_reduce) {
            PacketType r1 = reducer.template initializePacket<PacketType>();
            PacketType r2 = reducer.template initializePacket<PacketType>();
            half2* hr1 = reinterpret_cast<half2*>(&r1);
            half2* hr2 = reinterpret_cast<half2*>(&r2);
            while (col + 1 < num_coeffs_to_reduce) {
              *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col),
                                    input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
              *hr2 = __halves2half2(input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
                                    input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
              hr1++;
              hr2++;
              col += 2;
            }
            if (col < num_coeffs_to_reduce) {
              // Peel;
              const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
              *hr1 = __halves2half2(last1, reducer.initialize());
              const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
              *hr2 = __halves2half2(last2, reducer.initialize());
            }
            reducer.reducePacket(r1, &reduced_val1);
            reducer.reducePacket(r2, &reduced_val2);
          }
          break;
        } else {
          // Faster version of the loop with no branches after unrolling.
#pragma unroll
          for (int k = 0; k < unroll_times; ++k) {
            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col),
                                 &reduced_val1);
            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col),
                                 &reduced_val2);
          }
        }
      }

#pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
          union {
            int i;
            half2 h;
          } wka_in1, wka_out1;
          wka_in1.h = rv1[i];
          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
          hr1[i] = wka_out1.h;

          union {
            int i;
            half2 h;
          } wka_in2, wka_out2;
          wka_in2.h = rv2[i];
          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
          hr2[i] = wka_out2.h;
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int i = 0; i < packet_width / 2; i++) {
          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
#else
        PacketType r1;
        PacketType r2;
        half2* hr1 = reinterpret_cast<half2*>(&r1);
        half2* hr2 = reinterpret_cast<half2*>(&r2);
        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
        for (int j = 0; j < packet_width / 2; j++) {
          hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
          hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);

#endif
      }
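      // Collapse the per-warp packets: reduce each row's packet down to a single half,
      // pack the two row results into one half2, and update the output atomically.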
      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
      half2 val;
      if (packet_width > 2) {
        reducer.reducePacket(rv1[2], rv1);
        reducer.reducePacket(rv1[3], rv1 + 1);
        reducer.reducePacket(rv1[1], rv1);
        reducer.reducePacket(rv2[2], rv2);
        reducer.reducePacket(rv2[3], rv2 + 1);
        reducer.reducePacket(rv2[1], rv2);
      }
      half val1 = __low2half(*rv1);
      reducer.reduce(__high2half(*rv1), &val1);
      half val2 = __low2half(*rv2);
      reducer.reduce(__high2half(*rv2), &val2);
      val = __halves2half2(val1, val2);
      if ((threadIdx.x & (warpSize - 1)) == 0) {
        half* loc = output + row;
        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
      }
    }
  }
}

#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
                                    typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
    return true;
  }
};

// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
    Self, Op, OutputType, PacketAccess,
    std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
                     void>> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 128;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};

#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should not be called since there is no packet accessor");
    return true;
  }
};

template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    if (num_preserved_vals % 2 != 0) {
      // Not supported yet, revert to the slower code path
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = /*256*/ 128;
    const int num_per_thread = /*128*/ 64;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
                      device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
#endif // EIGEN_HAS_GPU_FP16

template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support exotic types such as complex well,
  // so we reduce the scope of the optimized version of the code to the simple cases
  // of floats, doubles and half floats.
#ifdef EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                            reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    // Don't crash when we're called with an input tensor of size 0.
    if (num_coeffs == 0) {
      return true;
    }
    // It's faster to use the usual code.
    if (num_coeffs_to_reduce <= 128) {
      return true;
    }

    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
        self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
  }
};

template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input,
                                                                   Index num_coeffs_to_reduce,
                                                                   Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
  const Index num_threads = blockDim.x * gridDim.x;
  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  // Initialize the output values if they weren't initialized by the ReductionInitKernel
  if (gridDim.x == 1) {
    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
      output[i] = reducer.initialize();
    }
    __syncthreads();
  }

  // Do the reduction.
  const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread);
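  // Each work item covers one preserved (output) column and a chunk of NumPerThread rows
  // of the reduced dimension; partial results are merged into the output with atomicReduce.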
  for (Index i = thread_id; i < max_iter; i += num_threads) {
    const Index input_col = i % num_preserved_coeffs;
    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
    typename Self::CoeffReturnType reduced_val = reducer.initialize();
    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
    for (Index j = input_row; j < max_row; j++) {
      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
      reducer.reduce(val, &reduced_val);
    }
    atomicReduce(&(output[input_col]), reduced_val, reducer);
  }
}

template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support exotic types such as complex well,
  // so we reduce the scope of the optimized version of the code to the simple case
  // of floats and doubles.
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
  template <typename Device, typename OutputType>
  static
#if !defined(EIGEN_HIPCC)
      // FIXME : leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
      // (in the cxx11_tensor_reduction_gpu test)
      //
      //   terminate called after throwing an instance of 'std::runtime_error'
      //   what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
      //
      // don't know why this happens (and why is it a runtime error instead of a compile time error)
      //
      // this will be fixed by HIP PR#457
      EIGEN_DEVICE_FUNC
#endif
      bool
      run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
    return true;
  }

  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
    typedef typename Self::Index Index;

    // It's faster to use the usual code.
    if (num_coeffs_to_reduce <= 32) {
      return true;
    }

    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
    const int block_size = 256;
    const int num_per_thread = 16;
    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);

    if (num_blocks > 1) {
      // We initialize the outputs outside the reduction kernel when we can't be sure that there
      // won't be race conditions between multiple thread blocks.
      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
      const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
                        num_preserved_vals, output);
    }

    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);

    return false;
  }
};
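
// Usage sketch (not part of this header): these FullReducer / InnerReducer / OuterReducer
// specializations are selected by the tensor reduction evaluator when a reduction expression
// is evaluated on a GpuDevice; user code never calls the kernels above directly. Roughly,
// assuming d_in and d_out are suitably sized device allocations:
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, rows, cols);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(d_out);
//   out.device(gpu_device) = in.sum();  // full reduction, dispatched to FullReducer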

#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

}  // end namespace internal
}  // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H