Eigen-unsupported  3.4.1 (git rev 28ded8800c26864e537852658428ab44c8399e87)
 
TensorReductionGpu.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
11#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
12
13namespace Eigen {
14namespace internal {
15
16
17#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
18// Full reducers for GPU, don't vectorize for now
19
20// Reducer function that enables multiple GPU threads to safely accumulate at the same
21// output address. It reads the current value of the output variable and attempts to
22// update it with the new value. If another GPU thread updated the content of the
23// output address in the meantime, it tries again.
24template <typename T, typename R>
25__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
26#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
27 if (sizeof(T) == 4)
28 {
29 unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
30 unsigned int newval = oldval;
31 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
32 if (newval == oldval) {
33 return;
34 }
35 unsigned int readback;
36 while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
37 oldval = readback;
38 newval = oldval;
39 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
40 if (newval == oldval) {
41 return;
42 }
43 }
44 }
45 else if (sizeof(T) == 8) {
46 unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
47 unsigned long long newval = oldval;
48 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
49 if (newval == oldval) {
50 return;
51 }
52 unsigned long long readback;
53 while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
54 oldval = readback;
55 newval = oldval;
56 reducer.reduce(accum, reinterpret_cast<T*>(&newval));
57 if (newval == oldval) {
58 return;
59 }
60 }
61 }
62 else {
63 gpu_assert(0 && "Wordsize not supported");
64 }
65#else // EIGEN_CUDA_ARCH >= 300
66 gpu_assert(0 && "Shouldn't be called on unsupported device");
67#endif // EIGEN_CUDA_ARCH >= 300
68}
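
The compare-and-swap retry loop above is the generic lock-free pattern for applying an arbitrary reducer to a shared output word when no matching hardware atomic exists. As an illustration only (this sketch is not part of the header, and the names are made up), the same read/modify/atomicCAS/retry idea can implement an atomic floating-point max:

// Illustrative sketch, not Eigen code: an atomic float "max" built from the
// same CAS retry loop that atomicReduce uses for arbitrary reducers.
__device__ void atomicMaxFloat(float* output, float value) {
  unsigned int* addr = reinterpret_cast<unsigned int*>(output);
  unsigned int oldbits = *addr;                       // speculative read
  while (true) {
    float current = __uint_as_float(oldbits);
    float desired = fmaxf(current, value);
    if (desired == current) return;                   // nothing to write
    unsigned int seen = atomicCAS(addr, oldbits, __float_as_uint(desired));
    if (seen == oldbits) return;                      // our CAS won the race
    oldbits = seen;                                   // lost the race: retry with the value we saw
  }
}

// Hypothetical usage: one atomic update per input element.
__global__ void maxKernel(const float* in, int n, float* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicMaxFloat(out, in[i]);
}

This is exactly why the generic atomicReduce exists: sums can fall back to hardware atomicAdd (see the SumReducer specialization below), but max, min, product and user-defined reducers need the CAS loop.
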
69
70// We extend atomicExch to support extra data types
71template <typename Type>
72__device__ inline Type atomicExchCustom(Type* address, Type val) {
73 return atomicExch(address, val);
74}
75
76template <>
77__device__ inline double atomicExchCustom(double* address, double val) {
78 unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
79 return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
80}
81
82#ifdef EIGEN_HAS_GPU_FP16
83template <typename R>
84__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
85 unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
86 unsigned int newval = oldval;
87 reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
88 if (newval == oldval) {
89 return;
90 }
91 unsigned int readback;
92 while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
93 oldval = readback;
94 newval = oldval;
95 reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
96 if (newval == oldval) {
97 return;
98 }
99 }
100}
101#ifdef EIGEN_GPU_COMPILE_PHASE
102// The reducer must be associative: the reduction over the wide vector is not atomic as a whole, only the individual half2 updates are.
103template <typename R>
104__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
105 half2* houtput=reinterpret_cast<half2*>(output);
106 half2* haccum=reinterpret_cast<half2*>(&accum);
107 for(int i=0;i<4;++i){
108 atomicReduce(houtput+i,*(haccum+i),reducer);
109 }
110}
111#endif // EIGEN_GPU_COMPILE_PHASE
112#endif // EIGEN_HAS_GPU_FP16
113
114template <>
115__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
116#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
117 atomicAdd(output, accum);
118#else // EIGEN_CUDA_ARCH >= 300
119 gpu_assert(0 && "Shouldn't be called on unsupported device");
120#endif // EIGEN_CUDA_ARCH >= 300
121}
122
123
124template <typename CoeffType, typename Index>
125__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
126 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
127 const Index num_threads = blockDim.x * gridDim.x;
128 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
129 output[i] = val;
130 }
131}
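
ReductionInitKernel above is a plain grid-stride loop: each thread starts at its global index and advances by the total number of launched threads, so any grid size covers any number of outputs. A stand-alone version of that pattern (illustrative only, not part of this header) looks like:

// Illustrative sketch of the grid-stride loop used by ReductionInitKernel.
__global__ void fillKernel(float* data, int n, float value) {
  const int stride = blockDim.x * gridDim.x;           // total threads launched
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) {
    data[i] = value;                                   // each index is written exactly once
  }
}
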
132
133
134template <int BlockSize, int NumPerThread, typename Self,
135 typename Reducer, typename Index>
136__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
137 typename Self::CoeffReturnType* output, unsigned int* semaphore) {
138#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
139 // Initialize the output value
140 const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
141 if (gridDim.x == 1) {
142 if (first_index == 0) {
143 *output = reducer.initialize();
144 }
145 }
146 else {
147 if (threadIdx.x == 0) {
148 unsigned int block = atomicCAS(semaphore, 0u, 1u);
149 if (block == 0) {
150 // We're the first block to run, initialize the output value
151 atomicExchCustom(output, reducer.initialize());
152 __threadfence();
153 atomicExch(semaphore, 2u);
154 }
155 else {
156 // Wait for the first block to initialize the output value.
157 // Use atomicCAS here to ensure that the reads aren't cached
158 unsigned int val;
159 do {
160 val = atomicCAS(semaphore, 2u, 2u);
161 }
162 while (val < 2u);
163 }
164 }
165 }
166
167 __syncthreads();
168
169 eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
170
171 typename Self::CoeffReturnType accum = reducer.initialize();
172 Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
173 for (Index i = 0; i < max_iter; i+=BlockSize) {
174 const Index index = first_index + i;
175 eigen_assert(index < num_coeffs);
176 typename Self::CoeffReturnType val = input.m_impl.coeff(index);
177 reducer.reduce(val, &accum);
178 }
179
180#pragma unroll
181 for (int offset = warpSize/2; offset > 0; offset /= 2) {
182 #if defined(EIGEN_HIPCC)
183 // use std::is_floating_point to determine the type of accum
184 // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
185 // and list the float and int versions of __shfl_down as the candidate functions.
186 if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
187 reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
188 } else {
189 reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
190 }
191 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
192 reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
193 #else
194 reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
195 #endif
196 }
197
198 if ((threadIdx.x & (warpSize - 1)) == 0) {
199 atomicReduce(output, accum, reducer);
200 }
201
202 if (gridDim.x > 1 && threadIdx.x == 0) {
203 // Let the last block reset the semaphore
204 atomicInc(semaphore, gridDim.x + 1);
205#if defined(EIGEN_HIPCC)
206 __threadfence_system();
207#endif
208 }
209#else // EIGEN_CUDA_ARCH >= 300
210 gpu_assert(0 && "Shouldn't be called on unsupported device");
211#endif // EIGEN_CUDA_ARCH >= 300
212}
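
After each thread has accumulated its strided slice of the input, FullReductionKernel combines the lane values of a warp with a shuffle-based tree reduction and then issues a single atomicReduce per warp, which cuts atomic traffic by a factor of warpSize. A stand-alone sketch of that warp-level step for a plain float sum (CUDA 9+ syntax; the names are illustrative, not Eigen's, and *out is assumed to be zeroed before launch):

// Warp-level tree reduction: after log2(warpSize) shuffle steps, lane 0 holds
// the sum of all lanes. Mirrors the unrolled shuffle loop in FullReductionKernel.
__device__ inline float warpReduceSum(float val) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    val += __shfl_down_sync(0xFFFFFFFF, val, offset);
  }
  return val;
}

__global__ void sumKernel(const float* in, int n, float* out) {
  float acc = 0.f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    acc += in[i];                                   // per-thread partial sum
  }
  acc = warpReduceSum(acc);
  if ((threadIdx.x & (warpSize - 1)) == 0) {        // first lane of each warp
    atomicAdd(out, acc);                            // cf. atomicReduce(output, accum, reducer)
  }
}
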
213
214
215#ifdef EIGEN_HAS_GPU_FP16
216template <typename Self,
217 typename Reducer, typename Index>
218__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
219 Reducer reducer, const Self input, Index num_coeffs, half* scratch) {
220 eigen_assert(blockDim.x == 1);
221 eigen_assert(gridDim.x == 1);
222 typedef packet_traits<Eigen::half>::type packet_type;
223 Index packet_remainder =
224 num_coeffs % Index(unpacket_traits<packet_type>::size);
225 if (packet_remainder != 0) {
226 half2* h2scratch = reinterpret_cast<half2*>(scratch);
227 for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
228 *h2scratch =
229 __halves2half2(input.coeff(i), input.coeff(i + 1));
230 h2scratch++;
231 }
232 if ((num_coeffs & 1) != 0) {
233 half lastCoeff = input.coeff(num_coeffs - 1);
234 *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
235 }
236 } else {
237 packet_type reduce = reducer.template initializePacket<packet_type>();
238 internal::pstoreu(scratch, reduce);
239 }
240}
241
242template <typename Self,
243 typename Reducer, typename Index>
244__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
245 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
246 const Index num_threads = blockDim.x * gridDim.x;
247 typedef typename packet_traits<Eigen::half>::type PacketType;
248
249 const Index num_packets =
250 num_coeffs / Index(unpacket_traits<PacketType>::size);
251 PacketType* p_output = reinterpret_cast<PacketType*>(output);
252 for (Index i = thread_id; i < num_packets; i += num_threads) {
253 p_output[i] = reducer.template initializePacket<PacketType>();
254 }
255 Index packet_remainder =
256 num_coeffs % Index(unpacket_traits<PacketType>::size);
257 if (thread_id < packet_remainder) {
258 output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
259 }
260}
261
262template <int BlockSize, int NumPerThread, typename Self,
263 typename Reducer, typename Index>
264__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(
265 Reducer reducer, const Self input, Index num_coeffs,
266 half* output, half* scratch) {
267 typedef typename packet_traits<Eigen::half>::type PacketType;
268 const int packet_width = unpacket_traits<PacketType>::size;
269 eigen_assert(NumPerThread % packet_width == 0);
270 const Index first_index =
271 blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
272
273 // Initialize the output value if it wasn't initialized by the ReductionInitKernel
274
275 if (gridDim.x == 1) {
276 if (first_index == 0) {
277 int rem = num_coeffs % packet_width;
278 if (rem != 0) {
279 half2* p_scratch = reinterpret_cast<half2*>(scratch);
280 pstoreu(scratch, reducer.template initializePacket<PacketType>());
281 for (int i = 0; i < rem / 2; i++) {
282 *p_scratch = __halves2half2(
283 input.coeff(num_coeffs - packet_width + 2 * i),
284 input.coeff(num_coeffs - packet_width + 2 * i + 1));
285 p_scratch++;
286 }
287 if ((num_coeffs & 1) != 0) {
288 half last = input.coeff(num_coeffs - 1);
289 *p_scratch = __halves2half2(last, reducer.initialize());
290 }
291 } else {
292 PacketType reduce = reducer.template initializePacket<PacketType>();
293 pstoreu(scratch, reduce);
294 }
295 }
296 __syncthreads();
297 }
298
299 PacketType accum = reducer.template initializePacket<PacketType>();
300 const Index max_iter =
301 numext::mini<Index>((num_coeffs - first_index) / packet_width,
302 NumPerThread * BlockSize / packet_width);
303 for (Index i = 0; i < max_iter; i += BlockSize) {
304 const Index index = first_index + packet_width * i;
305 eigen_assert(index + packet_width < num_coeffs);
306 PacketType val = input.template packet<Unaligned>(index);
307 reducer.reducePacket(val, &accum);
308 }
309
310#pragma unroll
311 for (int offset = warpSize/2; offset > 0; offset /= 2) {
312 #if defined(EIGEN_HIPCC)
313 PacketType r1;
314 half2* hr = reinterpret_cast<half2*>(&r1);
315 half2* hacc = reinterpret_cast<half2*>(&accum);
316 for (int i = 0; i < packet_width / 2; i++) {
317 // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
318 union { int i; half2 h; } wka_in, wka_out;
319 wka_in.h = hacc[i];
320 wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
321 hr[i] = wka_out.h;
322 }
323 reducer.reducePacket(r1, &accum);
324 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
325 PacketType r1;
326 half2* hr = reinterpret_cast<half2*>(&r1);
327 half2* hacc = reinterpret_cast<half2*>(&accum);
328 for (int i = 0; i < packet_width / 2; i++) {
329 hr[i] = __shfl_down(hacc[i], offset, warpSize);
330 }
331 reducer.reducePacket(r1, &accum);
332 #else
333 PacketType r1;
334 half2* hr = reinterpret_cast<half2*>(&r1);
335 half2* hacc = reinterpret_cast<half2*>(&accum);
336 for (int i = 0; i < packet_width / 2; i++) {
337 hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
338 }
339 reducer.reducePacket(r1, &accum);
340
341 #endif
342 }
343
344 if ((threadIdx.x & (warpSize - 1)) == 0) {
345 atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
346 }
347
348 __syncthreads();
349 half2* rv1 = reinterpret_cast<half2*>(scratch);
350 if (packet_width > 2) {
351 reducer.reducePacket(rv1[2], rv1);
352 reducer.reducePacket(rv1[3], rv1 + 1);
353 reducer.reducePacket(rv1[1], rv1);
354 }
355 if (gridDim.x == 1) {
356 if (first_index == 0) {
357 half tmp = __low2half(*rv1);
358 reducer.reduce(__high2half(*rv1), &tmp);
359 *output = tmp;
360 }
361 }
362}
363
364template <typename Op>
365__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
366 eigen_assert(threadIdx.x == 0);
367 typedef packet_traits<Eigen::half>::type packet_type;
368 if (unpacket_traits<packet_type>::size == 1) {
369 *output = *scratch;
370 } else {
371 half2* pscratch = reinterpret_cast<half2*>(scratch);
372 half tmp = __float2half(0.f);
373 for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
374 reducer.reduce(__low2half(*pscratch), &tmp);
375 reducer.reduce(__high2half(*pscratch), &tmp);
376 pscratch++;
377 }
378 *output = tmp;
379 }
380}
381
382#endif // EIGEN_HAS_GPU_FP16
383
384template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
385struct FullReductionLauncher {
386 static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
387 gpu_assert(false && "Should only be called on doubles, floats and half floats");
388 }
389};
390
391// Specialization for float and double
392template <typename Self, typename Op, typename OutputType, bool PacketAccess>
393struct FullReductionLauncher<
394 Self, Op, OutputType, PacketAccess,
395 typename internal::enable_if<
396 internal::is_same<float, OutputType>::value ||
397 internal::is_same<double, OutputType>::value,
398 void>::type> {
399 static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
400
401 typedef typename Self::Index Index;
402 const int block_size = 256;
403 const int num_per_thread = 128;
404 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
405
406 unsigned int* semaphore = NULL;
407 if (num_blocks > 1) {
408 semaphore = device.semaphore();
409 }
410
411 LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
412 num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
413 }
414};
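
To make the launch configuration concrete, the block-count arithmetic performed by this launcher works out as follows for a hypothetical input size (the numbers below are made up for illustration):

// Hypothetical example of the launch arithmetic above.
constexpr int  block_size     = 256;                  // threads per block
constexpr int  num_per_thread = 128;                  // coefficients per thread
constexpr long num_coeffs     = 1L << 24;             // 16,777,216 inputs (made up)

constexpr long per_block  = long(block_size) * num_per_thread;               // 32,768 coefficients
constexpr int  num_blocks = int((num_coeffs + per_block - 1) / per_block);   // divup -> 512

static_assert(num_blocks == 512, "16M coefficients map to 512 blocks");
// Since num_blocks > 1, a semaphore is requested from the device and
// FullReductionKernel uses it so that exactly one block initializes *output.
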
415
416#ifdef EIGEN_HAS_GPU_FP16
417template <typename Self, typename Op>
418struct FullReductionLauncher<Self, Op, Eigen::half, false> {
419 static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
420 gpu_assert(false && "Should not be called since there is no packet accessor");
421 }
422};
423
424template <typename Self, typename Op>
425struct FullReductionLauncher<Self, Op, Eigen::half, true> {
426 static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
427 typedef typename Self::Index Index;
428
429 const int block_size = 256;
430 const int num_per_thread = 128;
431 const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
432 half* scratch = static_cast<half*>(device.scratchpad());
433
434 if (num_blocks > 1) {
435 // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
436 // won't be a race condition between multiple thread blocks.
437 LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
438 1, 1, 0, device, reducer, self, num_coeffs, scratch);
439 }
440
441 LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
442 num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
443
444 if (num_blocks > 1) {
445 LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
446 1, 1, 0, device, reducer, output, scratch);
447 }
448 }
449};
450#endif // EIGEN_HAS_GPU_FP16
451
452
453template <typename Self, typename Op, bool Vectorizable>
454struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
455 // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
456 // so the optimized version of the code is restricted to the simple cases
457 // of doubles, floats and half floats.
458#ifdef EIGEN_HAS_GPU_FP16
459 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
460 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
461 internal::is_same<typename Self::CoeffReturnType, double>::value ||
462 (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
463#else // EIGEN_HAS_GPU_FP16
464 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
465 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
466 internal::is_same<typename Self::CoeffReturnType, double>::value);
467#endif // EIGEN_HAS_GPU_FP16
468
469 template <typename OutputType>
470 static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
471 gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
472 const Index num_coeffs = array_prod(self.m_impl.dimensions());
473 // Don't crash when we're called with an input tensor of size 0.
474 if (num_coeffs == 0) {
475 return;
476 }
477
478 FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
479 }
480};
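
For context, this FullReducer specialization is what a full (rank-0) reduction evaluated on a GpuDevice dispatches to. A minimal host-side sketch of such a call, with device buffers d_in/d_out assumed to be pre-allocated (e.g. with cudaMalloc) and error handling omitted, might look roughly like:

#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

// Illustrative only: sums a rows x cols device buffer into a single scalar.
void fullSumOnGpu(float* d_in, float* d_out, int rows, int cols) {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<float, 0> > out(d_out);

  // Evaluating a full sum on a GpuDevice routes through
  // FullReducer<..., GpuDevice, ...> and hence FullReductionKernel.
  out.device(gpu_device) = in.sum();
  gpu_device.synchronize();
}
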
481
482
483template <int NumPerThread, typename Self,
484 typename Reducer, typename Index>
485__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
486 typename Self::CoeffReturnType* output) {
487#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
488 typedef typename Self::CoeffReturnType Type;
489 eigen_assert(blockDim.y == 1);
490 eigen_assert(blockDim.z == 1);
491 eigen_assert(gridDim.y == 1);
492 eigen_assert(gridDim.z == 1);
493
494 const int unroll_times = 16;
495 eigen_assert(NumPerThread % unroll_times == 0);
496
497 const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
498 const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
499
500 const Index num_threads = blockDim.x * gridDim.x;
501 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
502
503 // Initialize the output values if they weren't initialized by the ReductionInitKernel
504 if (gridDim.x == 1) {
505 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
506 output[i] = reducer.initialize();
507 }
508 __syncthreads();
509 }
510
511 for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
512 const Index row = i / input_col_blocks;
513
514 if (row < num_preserved_coeffs) {
515 const Index col_block = i % input_col_blocks;
516 const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
517
518 Type reduced_val = reducer.initialize();
519
520 for (Index j = 0; j < NumPerThread; j += unroll_times) {
521 const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
522 if (last_col >= num_coeffs_to_reduce) {
523 for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
524 const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
525 reducer.reduce(val, &reduced_val);
526 }
527 break;
528 } else {
529 // Faster version of the loop with no branches after unrolling.
530#pragma unroll
531 for (int k = 0; k < unroll_times; ++k) {
532 const Index col = col_begin + blockDim.x * (j + k);
533 reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
534 }
535 }
536 }
537
538#pragma unroll
539 for (int offset = warpSize/2; offset > 0; offset /= 2) {
540 #if defined(EIGEN_HIPCC)
541 // use std::is_floating_point to determine the type of reduced_val
542 // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
543 // and list the float and int versions of __shfl_down as the candidate functions.
544 if (std::is_floating_point<Type>::value) {
545 reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
546 } else {
547 reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
548 }
549 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
550 reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
551 #else
552 reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
553 #endif
554 }
555
556 if ((threadIdx.x & (warpSize - 1)) == 0) {
557 atomicReduce(&(output[row]), reduced_val, reducer);
558 }
559 }
560 }
561#else // EIGEN_CUDA_ARCH >= 300
562 gpu_assert(0 && "Shouldn't be called on unsupported device");
563#endif // EIGEN_CUDA_ARCH >= 300
564}
565
566#ifdef EIGEN_HAS_GPU_FP16
567
568template <int NumPerThread, typename Self,
569 typename Reducer, typename Index>
570__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
571 half* output) {
572 eigen_assert(blockDim.y == 1);
573 eigen_assert(blockDim.z == 1);
574 eigen_assert(gridDim.y == 1);
575 eigen_assert(gridDim.z == 1);
576
577 typedef typename packet_traits<Eigen::half>::type PacketType;
578 const int packet_width = unpacket_traits<PacketType>::size;
579 const int unroll_times = 16 / packet_width;
580 eigen_assert(NumPerThread % unroll_times == 0);
581 eigen_assert(unroll_times % 2 == 0);
582
583 const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
584 const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
585
586 const Index num_threads = blockDim.x * gridDim.x;
587 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
588
589 // Initialize the output values if they weren't initialized by the ReductionInitKernel
590 if (gridDim.x == 1) {
591 Index i = packet_width * thread_id;
592 for (; i + packet_width <= num_preserved_coeffs;
593 i += packet_width * num_threads) {
594 PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
595 *poutput = reducer.template initializePacket<PacketType>();
596 }
597 if (i < num_preserved_coeffs) {
598 output[i] = reducer.initialize();
599 }
600 __syncthreads();
601 }
602
603 for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
604 const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
605
606 if (row + 1 < num_preserved_coeffs) {
607 const Index col_block = i % input_col_blocks;
608 const Index col_begin =
609 packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
610
611 PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
612 PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
613
614 for (Index j = 0; j < NumPerThread; j += unroll_times) {
615 const Index last_col =
616 col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
617 if (last_col >= num_coeffs_to_reduce) {
618 Index col = col_begin + blockDim.x * j;
619 for (; col + packet_width <= num_coeffs_to_reduce;
620 col += blockDim.x) {
621 const PacketType val1 = input.m_impl.template packet<Unaligned>(
622 row * num_coeffs_to_reduce + col);
623 reducer.reducePacket(val1, &reduced_val1);
624 const PacketType val2 = input.m_impl.template packet<Unaligned>(
625 (row + 1) * num_coeffs_to_reduce + col);
626 reducer.reducePacket(val2, &reduced_val2);
627 }
628 if (col < num_coeffs_to_reduce) {
629 PacketType r1 = reducer.template initializePacket<PacketType>();
630 PacketType r2 = reducer.template initializePacket<PacketType>();
631 half2* hr1 = reinterpret_cast<half2*>(&r1);
632 half2* hr2 = reinterpret_cast<half2*>(&r2);
633 while (col + 1 < num_coeffs_to_reduce) {
634 *hr1 = __halves2half2(
635 input.m_impl.coeff(row * num_coeffs_to_reduce + col),
636 input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
637 *hr2 = __halves2half2(
638 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
639 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
640 1));
641 hr1++;
642 hr2++;
643 col += 2;
644 }
645 if (col < num_coeffs_to_reduce) {
646 // Peel the final odd element.
647 const half last1 =
648 input.m_impl.coeff(row * num_coeffs_to_reduce + col);
649 *hr1 = __halves2half2(last1, reducer.initialize());
650 const half last2 =
651 input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
652 *hr2 = __halves2half2(last2, reducer.initialize());
653 }
654 reducer.reducePacket(r1, &reduced_val1);
655 reducer.reducePacket(r2, &reduced_val2);
656 }
657 break;
658 } else {
659 // Faster version of the loop with no branches after unrolling.
660#pragma unroll
661 for (int k = 0; k < unroll_times; ++k) {
662 const Index col = col_begin + blockDim.x * (j + k) * packet_width;
663 reducer.reducePacket(input.m_impl.template packet<Unaligned>(
664 row * num_coeffs_to_reduce + col),
665 &reduced_val1);
666 reducer.reducePacket(input.m_impl.template packet<Unaligned>(
667 (row + 1) * num_coeffs_to_reduce + col),
668 &reduced_val2);
669 }
670 }
671 }
672
673#pragma unroll
674 for (int offset = warpSize/2; offset > 0; offset /= 2) {
675 #if defined(EIGEN_HIPCC)
676 PacketType r1;
677 PacketType r2;
678 half2* hr1 = reinterpret_cast<half2*>(&r1);
679 half2* hr2 = reinterpret_cast<half2*>(&r2);
680 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
681 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
682 for (int i = 0; i < packet_width / 2; i++) {
683 // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
684 union { int i; half2 h; } wka_in1, wka_out1;
685 wka_in1.h = rv1[i];
686 wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
687 hr1[i] = wka_out1.h;
688
689 union { int i; half2 h; } wka_in2, wka_out2;
690 wka_in2.h = rv2[i];
691 wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
692 hr2[i] = wka_out2.h;
693 }
694 reducer.reducePacket(r1, &reduced_val1);
695 reducer.reducePacket(r2, &reduced_val2);
696 #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
697 PacketType r1;
698 PacketType r2;
699 half2* hr1 = reinterpret_cast<half2*>(&r1);
700 half2* hr2 = reinterpret_cast<half2*>(&r2);
701 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
702 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
703 for (int i = 0; i < packet_width / 2; i++) {
704 hr1[i] = __shfl_down(rv1[i], offset, warpSize);
705 hr2[i] = __shfl_down(rv2[i], offset, warpSize);
706 }
707 reducer.reducePacket(r1, &reduced_val1);
708 reducer.reducePacket(r2, &reduced_val2);
709 #else
710 PacketType r1;
711 PacketType r2;
712 half2* hr1 = reinterpret_cast<half2*>(&r1);
713 half2* hr2 = reinterpret_cast<half2*>(&r2);
714 half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
715 half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
716 for (int i = 0; i < packet_width / 2; i++) {
717 hr1[i] =
718 __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
719 hr2[i] =
720 __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
721 }
722 reducer.reducePacket(r1, &reduced_val1);
723 reducer.reducePacket(r2, &reduced_val2);
724
725 #endif
726 }
727 half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
728 half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
729 half2 val;
730 if (packet_width > 2) {
731 reducer.reducePacket(rv1[2], rv1);
732 reducer.reducePacket(rv1[3], rv1 + 1);
733 reducer.reducePacket(rv1[1], rv1);
734 reducer.reducePacket(rv2[2], rv2);
735 reducer.reducePacket(rv2[3], rv2 + 1);
736 reducer.reducePacket(rv2[1], rv2);
737 }
738 half val1 = __low2half(*rv1);
739 reducer.reduce(__high2half(*rv1), &val1);
740 half val2 = __low2half(*rv2);
741 reducer.reduce(__high2half(*rv2), &val2);
742 val = __halves2half2(val1, val2);
743 if ((threadIdx.x & (warpSize - 1)) == 0) {
744 half* loc = output + row;
745 atomicReduce((half2*)loc, val, reducer);
746 }
747 }
748 }
749}
750
751#endif // EIGEN_HAS_GPU_FP16
752
753template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
754struct InnerReductionLauncher {
755 static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
756 gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
757 return true;
758 }
759};
760
761// Specialization for float and double
762template <typename Self, typename Op, typename OutputType, bool PacketAccess>
763struct InnerReductionLauncher<
764 Self, Op, OutputType, PacketAccess,
765 typename internal::enable_if<
766 internal::is_same<float, OutputType>::value ||
767 internal::is_same<double, OutputType>::value,
768 void>::type> {
769 static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
770 typedef typename Self::Index Index;
771
772 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
773 const int block_size = 256;
774 const int num_per_thread = 128;
775 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
776 const int max_blocks = device.getNumGpuMultiProcessors() *
777 device.maxGpuThreadsPerMultiProcessor() / block_size;
778 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
779
780 if (num_blocks > 1) {
781 // We initialize the outputs outside the reduction kernel when we can't be sure that there
782 // won't be a race condition between multiple thread blocks.
783 const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
784 const int max_blocks = device.getNumGpuMultiProcessors() *
785 device.maxGpuThreadsPerMultiProcessor() / 1024;
786 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
787 LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
788 num_blocks, 1024, 0, device, reducer.initialize(),
789 num_preserved_vals, output);
790 }
791
792 LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
793 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
794
795 return false;
796 }
797};
798
799#ifdef EIGEN_HAS_GPU_FP16
800template <typename Self, typename Op>
801struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
802 static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
803 gpu_assert(false && "Should not be called since there is no packet accessor");
804 return true;
805 }
806};
807
808template <typename Self, typename Op>
809struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
810 static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
811 typedef typename Self::Index Index;
812
813 if (num_preserved_vals % 2 != 0) {
814 // Not supported yet, revert to the slower code path
815 return true;
816 }
817
818 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
819 const int block_size = /*256*/128;
820 const int num_per_thread = /*128*/64;
821 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
822 const int max_blocks = device.getNumGpuMultiProcessors() *
823 device.maxGpuThreadsPerMultiProcessor() / block_size;
824 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
825
826 if (num_blocks > 1) {
827 // We initialize the outputs outside the reduction kernel when we can't be sure that there
828 // won't be a race condition between multiple thread blocks.
829 LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
830 1, 1, 0, device, reducer, self, num_preserved_vals, output);
831 }
832
833 LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
834 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
835
836 return false;
837 }
838};
839#endif // EIGEN_HAS_GPU_FP16
840
841
842template <typename Self, typename Op>
843struct InnerReducer<Self, Op, GpuDevice> {
844 // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
845 // so the optimized version of the code is restricted to the simple cases
846 // of doubles, floats and half floats.
847#ifdef EIGEN_HAS_GPU_FP16
848 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
849 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
850 internal::is_same<typename Self::CoeffReturnType, double>::value ||
851 (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
852#else // EIGEN_HAS_GPU_FP16
853 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
854 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
855 internal::is_same<typename Self::CoeffReturnType, double>::value);
856#endif // EIGEN_HAS_GPU_FP16
857
858 template <typename OutputType>
859 static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
860 gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
861 const Index num_coeffs = array_prod(self.m_impl.dimensions());
862 // Don't crash when we're called with an input tensor of size 0.
863 if (num_coeffs == 0) {
864 return true;
865 }
866 // For small reductions the generic (non-GPU-optimized) code path is faster.
867 if (num_coeffs_to_reduce <= 128) {
868 return true;
869 }
870
871 return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
872 }
873};
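
A partial reduction that preserves some dimensions goes through InnerReducer (or OuterReducer below), depending on which dimensions are reduced and on the storage layout. Using the same EIGEN_USE_GPU / Tensor include as the sketch after FullReducer, a hedged host-side example that, for the default column-major layout and a sufficiently large reduce_len, can take the inner-reduction path (d_in/d_out are hypothetical pre-allocated device buffers):

// Illustrative only: reduce the innermost dimension of a 2D tensor on the GPU.
void innerSumsOnGpu(float* d_in, float* d_out, int reduce_len, int preserved) {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, reduce_len, preserved);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, preserved);

  Eigen::array<int, 1> reduce_dim;
  reduce_dim[0] = 0;  // innermost dimension in the default ColMajor layout

  out.device(gpu_device) = in.sum(reduce_dim);  // may dispatch to InnerReductionKernel
  gpu_device.synchronize();
}

Note that InnerReducer::run above returns true (falls back to the generic evaluator) when num_coeffs_to_reduce <= 128, so small inner reductions never reach InnerReductionKernel.
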
874
875template <int NumPerThread, typename Self,
876 typename Reducer, typename Index>
877__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
878 typename Self::CoeffReturnType* output) {
879 const Index num_threads = blockDim.x * gridDim.x;
880 const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
881 // Initialize the output values if they weren't initialized by the ReductionInitKernel
882 if (gridDim.x == 1) {
883 for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
884 output[i] = reducer.initialize();
885 }
886 __syncthreads();
887 }
888
889 // Do the reduction.
890 const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
891 for (Index i = thread_id; i < max_iter; i += num_threads) {
892 const Index input_col = i % num_preserved_coeffs;
893 const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
894 typename Self::CoeffReturnType reduced_val = reducer.initialize();
895 const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
896 for (Index j = input_row; j < max_row; j++) {
897 typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
898 reducer.reduce(val, &reduced_val);
899 }
900 atomicReduce(&(output[input_col]), reduced_val, reducer);
901 }
902}
903
904
905template <typename Self, typename Op>
906struct OuterReducer<Self, Op, GpuDevice> {
907 // Unfortunately NVIDIA GPUs don't handle exotic types such as complex well,
908 // so the optimized version of the code is restricted to the simple cases
909 // of floats and doubles.
910 static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
911 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
912 internal::is_same<typename Self::CoeffReturnType, double>::value);
913 template <typename Device, typename OutputType>
914 static
915 #if !defined(EIGEN_HIPCC)
916 // FIXME : leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
917 // (in the cxx11_tensor_reduction_gpu test)
918 //
919 // terminate called after throwing an instance of 'std::runtime_error'
920 // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
921 //
922 // don't know why this happens (or why it is a runtime error instead of a compile-time error)
923 //
924 // this will be fixed by HIP PR#457
925 EIGEN_DEVICE_FUNC
926 #endif
927 bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
928 gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
929 return true;
930 }
931
932 static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
933 typedef typename Self::Index Index;
934
935 // For small reductions the generic (non-GPU-optimized) code path is faster.
936 if (num_coeffs_to_reduce <= 32) {
937 return true;
938 }
939
940 const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
941 const int block_size = 256;
942 const int num_per_thread = 16;
943 const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
944 const int max_blocks = device.getNumGpuMultiProcessors() *
945 device.maxGpuThreadsPerMultiProcessor() / block_size;
946 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
947
948 if (num_blocks > 1) {
949 // We initialize the outputs outside the reduction kernel when we can't be sure that there
950 // won't be a race condition between multiple thread blocks.
951 const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
952 const int max_blocks = device.getNumGpuMultiProcessors() *
953 device.maxGpuThreadsPerMultiProcessor() / 1024;
954 const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
955 LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
956 num_blocks, 1024, 0, device, reducer.initialize(),
957 num_preserved_vals, output);
958 }
959
960 LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
961 num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
962
963 return false;
964 }
965};
966
967#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
968
969
970} // end namespace internal
971} // end namespace Eigen
972
973#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H