#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};
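// The two specializations below handle Eigen::internal::MeanReducer: the mean is
// computed as a sum reduction (SumReducer) whose accumulated value is divided by
// the number of reduced coefficients in finalise_op(), either with a scalar
// quotient (non-vectorized case) or with pdiv/pset1 over a whole packet
// (vectorized case).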
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};
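// Second step of the two-step full reduction: a single work-group reads the
// per-group partial results produced by the first-step kernel (one value per
// work-item), folds them with a power-of-two tree reduction in local memory, and
// work-item 0 writes the final value to the output accessor.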
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI + localid;
    auto aOutPtr = outAcc;
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};
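// First step of the full reduction. Each work-item accumulates a grid-strided
// slice of the flattened input, using the packet (vectorized) path when both the
// reducer and the input evaluator support packet access and falling back to
// scalar accumulation otherwise; each work-group then combines its work-items'
// accumulators through a local-memory tree reduction and emits one partial
// result per group.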
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType,
                             CoeffReturnType>
      OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    CoeffReturnType accumulator = op.initialize();
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};
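// Reducer used by the GenericReducer specialization at the end of this file
// (e.g. for argmax-style partial reductions): one work-item per preserved output
// coefficient, each walking its reduced dimensions serially through
// GenericDimReducer. As the name suggests, the result is not guaranteed to be
// deterministic.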
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    Op non_const_functor;
    std::memcpy(&non_const_functor, &functor, sizeof(Op));
    auto output_accessor_ptr = output_accessor;
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};
enum class reduction_dim { inner_most, outer_most };
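// First step of a partial (dimension-preserving) reduction. The work-group is
// viewed as a LocalThreadSizeP x LocalThreadSizeR panel: the P dimension walks
// the preserved coefficients and the R dimension walks the coefficients being
// reduced, with `rt` selecting whether the reduced dimension is the inner-most
// or outer-most one. Each thread accumulates its strided share of the reduce
// dimension, the panel is folded in local memory along R (the BC flag adds one
// element of padding per scratch row, presumably to limit bank conflicts), and
// one partial result per preserved coefficient is written out per reduce-group.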
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) const {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    CoeffReturnType *scratchPtr = scratch.get_pointer();
    auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }

    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        *out_scratch_ptr = op.finalize(accumulator);
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};
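// Second step of the partial reduction, launched only when the reduce dimension
// was split across more than one work-group: each work-item owns one preserved
// output coefficient and folds the per-group partial results the first step
// wrote for it (stored num_coeffs_to_preserve apart in the temporary buffer).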
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor + globalId;

    OutScalar accumulator = op.initialize();
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor[globalId] = op.finalize(accumulator);
  }
};
template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static constexpr Index LocalThreadSizeP = LTP;
  static constexpr Index LocalThreadSizeR = LTR;
  static constexpr bool BC = BC_;
};
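// Host-side launcher for the partial reduction kernels above. It derives the
// launch geometry from the panel parameters and, when the reduce dimension is
// split over several groups (rNumGroups > 1), stages the partial results in a
// temporary buffer and runs SecondStepPartialReduction to combine them.
//
// A rough worked example with hypothetical numbers (assuming the default 16x16
// local dims, i.e. localRange = 256): for num_coeffs_to_preserve = 1000,
// roundUpP = 1008 and pNumGroups = 1008 / 16 = 63; with cu = 64 compute units,
// rGroups = (64 + 63 - 1) / 63 = 2, so rNumGroups is 2 when
// num_coeffs_to_reduce exceeds 64 * 256 elements and 1 otherwise.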
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    constexpr Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;

    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    constexpr Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
             self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
             num_coeffs_to_reduce)
          .wait();

      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;

      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
             temp_accessor, output,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
             Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
          .wait();

      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
             self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
             num_coeffs_to_reduce)
          .wait();
    }
    return false;
  }
};

}  // namespace internal
}  // namespace TensorSycl

namespace internal {

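// Entry point for full reductions (sum(), mean(), maximum(), ...) evaluated on a
// SYCL device. A minimal usage sketch, loosely following the pattern of Eigen's
// SYCL tensor tests; the device setup and pointer names below are illustrative
// only and are not defined in this header:
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   float *in_ptr = static_cast<float *>(sycl_device.allocate(n * sizeof(float)));
//   float *out_ptr = static_cast<float *>(sycl_device.allocate(sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> in_gpu(in_ptr, n);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out_gpu(out_ptr);
//   out_gpu.device(sycl_device) = in_gpu.sum();  // dispatches to FullReducer below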
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static constexpr bool HasOptimizedImplementation = true;
  static constexpr int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    constexpr Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();

    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer)
          .wait();

      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
             tmp_global_accessor, data,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
             num_work_group, reducer)
          .wait();

      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer)
          .wait();
    }
  }
};
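// Dimension-preserving reductions: OuterReducer and InnerReducer forward to
// PartialReducerLauncher with the outer_most and inner_most reduction_dim tag
// respectively; the launcher then picks the matching kernel indexing scheme.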
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static constexpr bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static constexpr bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
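// Generic fallback (HasOptimizedImplementation is false), which appears to be
// used when the reduction does not map onto the full/inner/outer specializations
// above, e.g. for argmax-type reductions; it launches
// GenericNondeterministicReducer with one work-item per preserved coefficient.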
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static constexpr bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
           self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
           reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1))
        .wait();
    return false;
  }
};

}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP