Eigen-unsupported  5.0.1-dev+284dcc12
 
TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 * This is the SYCL specialization of the reduction operation. A two-phase
 * reduction approach is used because the GPU provides no global-memory
 * synchronization across different work-groups/thread blocks. To work around
 * this, two kernels are launched: in the first phase each work-group/thread
 * block reduces its portion of the data locally and writes its partial result
 * to global memory; in the second phase (global reduction) a single
 * work-group/thread block reduces those intermediate results into one element.
 * An NVIDIA presentation explaining the optimized two-phase reduction
 * algorithm on GPUs:
 * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/
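// Illustrative usage (a minimal sketch, not part of this header; tensor
// shapes, names, and the queue-selector syntax are placeholders that depend on
// your SYCL implementation): a reduction expression evaluated on an
// Eigen::SyclDevice is what ultimately dispatches to the kernels below.
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector{});
//   Eigen::SyclDevice sycl_device(&queue_interface);
//
//   Eigen::Tensor<float, 2> in(512, 512);
//   in.setRandom();
//   float* gpu_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
//   float* gpu_out = static_cast<float*>(sycl_device.allocate(sizeof(float)));
//   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(gpu_in, 512, 512);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out_gpu(gpu_out);
//   out_gpu.device(sycl_device) = in_gpu.sum();  // full reduction -> FullReducer below
//
//   float result;
//   sycl_device.memcpyDeviceToHost(&result, gpu_out, sizeof(float));
//   sycl_device.deallocate(gpu_in);
//   sycl_device.deallocate(gpu_out);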

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

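// OpDefiner maps the user-visible reducer to the operation actually applied on
// the device. For most reducers it is an identity mapping; for MeanReducer it
// substitutes a SumReducer and performs the division by the number of reduced
// coefficients in finalise_op (scalar or packet-wise, depending on
// vectorization).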
template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};

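// Second phase of the full reduction: a single work-group loads the per-group
// partial results produced by the first phase (one element per work-item) and
// combines them with a tree reduction in local memory; work-item 0 writes the
// final value.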
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // Our empirical research shows that the best performance is achieved when
    // each thread has only one element to reduce in the second step; the
    // second-step reduction time is then almost negligible. Hence, in the
    // second step the input size is fixed to the local size, so each thread
    // reads exactly one element. The algorithm must be changed if the number
    // of reductions per thread in the second step becomes greater than 1;
    // otherwise, the result will be wrong.
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI + localid;
    auto aOutPtr = outAcc;
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};

// Full-reduction first phase. In this version vectorization is enabled and the
// reduction accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType,
                             CoeffReturnType>
      OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable part
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non-vectorizable tail
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // tree-reduction part; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    // scalar (non-vectorized) accumulation
    CoeffReturnType accumulator = op.initialize();
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // tree-reduction part; the local size is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};

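// Generic fallback reducer: one work-item per preserved output coefficient,
// each reducing all of its input values sequentially via GenericDimReducer.
// Used, e.g., for the argmax-style partial reductions dispatched through
// GenericReducer below; no inter-work-item coordination is required.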
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // Take a mutable copy of the functor to bypass the stateful behaviour of
    // Eigen's MeanReducer.
    Op non_const_functor;
    std::memcpy(&non_const_functor, &functor, sizeof(Op));
    auto output_accessor_ptr = output_accessor;
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

enum class reduction_dim { inner_most, outer_most };
// Partial (dimension-preserving) reduction.
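// First phase of a partial reduction: work-groups are tiled over panels of
// LocalThreadSizeP preserved coefficients by LocalThreadSizeR reduced
// coefficients. Each work-item accumulates a strided slice of the reduce
// dimension, the panel is then tree-reduced along R in local memory, and one
// partial result per (preserve index, reduce group) pair is written out. When
// more than one reduce group is used, SecondStepPartialReduction combines the
// per-group partial results.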
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) const {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    CoeffReturnType *scratchPtr = scratch.get_pointer();
    auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // The result has already been divided for the mean reducer in the
        // previous reduction, so there is no need to divide again.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no co-ordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};

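// Second phase of a partial reduction: for each preserved coefficient, one
// work-item accumulates the per-group partial results produced by the first
// phase (one value per reduce group) and writes the final value.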
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is not bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor[globalId] = op.finalize(accumulator);
  }
};

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static constexpr Index LocalThreadSizeP = LTP;
  static constexpr Index LocalThreadSizeR = LTR;
  static constexpr bool BC = BC_;
};

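// Host-side launcher for partial reductions. It picks the panel and group
// sizes, launches PartialReductionKernel and, when more than one reduce group
// is used, a follow-up SecondStepPartialReduction pass to combine the
// per-group partial results.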
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and <=
    // maxSyclThreadPerBlock; this helps us avoid an extra check in the kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    constexpr Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // Here we force the code to be at most a 2-step reduction: our empirical
    // research shows that if each thread reduces at least 64 elements
    // individually, we get better performance, although this can change on
    // different platforms. It also shows that for an inner_most dim reduction
    // it is better to have 8 groups in the reduce dimension for sizes > 1024
    // to achieve the best performance.
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    constexpr Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
             self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
             num_coeffs_to_reduce)
          .wait();
      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
             temp_accessor, output,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
             Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
          .wait();
      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
             self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
             num_coeffs_to_reduce)
          .wait();
    }
    return false;
  }
};
}  // namespace internal
}  // namespace TensorSycl

namespace internal {

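// Device-specific FullReducer specialization: the entry point used when an
// entire tensor is reduced to a single value on a SYCL device. It launches
// FullReductionKernelFunctor and, when more than one work-group is needed,
// SecondStepFullReducer to combine the per-group partial results.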
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static constexpr bool HasOptimizedImplementation = true;
  static constexpr int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    constexpr Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // Here we force the code to be at most a 2-step reduction: our empirical
    // research shows that if each thread reduces at least 512 elements
    // individually, we get better performance.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer)
          .wait();
      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
             tmp_global_accessor, data,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
             num_work_group, reducer)
          .wait();
      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer)
          .wait();
    }
  }
};
// Vectorizable inner_most dim preserver (column reduction).
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static constexpr bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// Row reduction.
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static constexpr bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// Generic partial reduction.
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static constexpr bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
           self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
           reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1))
        .wait();
    return false;
  }
};

}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP