Eigen-unsupported  5.0.1-dev+284dcc12
 
TensorScanSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorScanSycl.h
 *
 * \brief:
 * Tensor Scan SYCL implements an extended version of
 * "Efficient parallel scan algorithms for GPUs" for Tensor operations.
 * The algorithm requires up to 3 stages (and consequently 3 kernels),
 * depending on the size of the tensor. In the first kernel
 * (ScanKernelFunctor), each thread within the work-group individually
 * scans the elements allocated to it, in order to reduce the total number
 * of blocks. In the next step, all threads within the work-group reduce
 * the associated blocks into temporary buffers. In the second kernel
 * (ScanKernelFunctor instantiated for scan_step::second), the temporary
 * buffer is given as input, and all threads within a work-group scan and
 * reduce the boundaries between the blocks (generated by the previous
 * kernel) and write the data back to the temporary buffer. If the second
 * kernel is required, the third and final kernel
 * (ScanAdjustmentKernelFunctor) folds these block offsets into the final
 * result in the output buffer.
 * The original algorithm for the parallel prefix sum can be found here:
 *
 * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient
 * parallel scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep.
 * NVR-2008-003, no. 1 (2008): 1-17.
 *****************************************************************/
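
// Host-side usage sketch (illustrative only, not part of this header): a
// cumulative sum along one dimension of a tensor evaluated on a SYCL device
// is what ultimately dispatches to the kernels below. The device selector
// and the tensor sizes here are arbitrary assumptions, following the
// pattern used in Eigen's SYCL tests:
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector{});
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::Tensor<float, 2> in(64, 128), out(64, 128);
//   in.setRandom();
//   float* gpu_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
//   float* gpu_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
//   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> tin(gpu_in, 64, 128);
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> tout(gpu_out, 64, 128);
//   tout.device(sycl_device) = tin.cumsum(0);  // inclusive scan along dim 0
//   sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(float));
//   sycl_device.deallocate(gpu_in);
//   sycl_device.deallocate(gpu_out);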

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif

template <typename index_t>
struct ScanParameters {
  // must be power of 2
  static constexpr index_t ScanPerThread = 8;
  const index_t total_size;
  const index_t non_scan_size;
  const index_t scan_size;
  const index_t non_scan_stride;
  const index_t scan_stride;
  const index_t panel_threads;
  const index_t group_threads;
  const index_t block_threads;
  const index_t elements_per_group;
  const index_t elements_per_block;
  const index_t loop_range;

  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
      : total_size(total_size_),
        non_scan_size(non_scan_size_),
        scan_size(scan_size_),
        non_scan_stride(non_scan_stride_),
        scan_stride(scan_stride_),
        panel_threads(panel_threads_),
        group_threads(group_threads_),
        block_threads(block_threads_),
        elements_per_group(elements_per_group_),
        elements_per_block(elements_per_block_),
        loop_range(loop_range_) {}
};
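
// Worked example (illustrative numbers, not fixed by this header): with
// ScanPerThread == 8, a scan of scan_size == 1000 elements is first padded
// to elements_per_group == 1024 (the next power of two), so group_threads
// == 1024 / 8 == 128. Assuming a work-group of local_range == 64 threads,
// max_elements_per_block == 64 * 8 == 512, hence elements_per_block == 512,
// block_threads == 64, and each group is covered by 1024 / 512 == 2 blocks
// whose partial results the second and third kernels stitch together (see
// ScanInfo below for the actual computation).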

enum class scan_step { first, second };
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
          scan_step stp>
struct ScanKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2;

  LocalAccessor scratch;
  Evaluator dev_eval;
  OutAccessor out_ptr;
  OutAccessor tmp_ptr;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  const bool inclusive;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
                                                          const bool inclusive_)
      : scratch(scratch_),
        dev_eval(dev_eval_),
        out_ptr(out_accessor_),
        tmp_ptr(temp_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_),
        inclusive(inclusive_) {}

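  // The read() overloads below dispatch on the scan step: the first-step
  // kernel reads its inputs through the tensor expression evaluator, while
  // the second-step kernel reads the raw per-block partial results written
  // to the temporary buffer by the first step.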
  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(
      const Input &inpt, Index global_id) const {
    return inpt.coeff(global_id);
  }

  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(
      const Input &inpt, Index global_id) const {
    return inpt[global_id];
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(
      InclusiveOp inclusive_op) const {
    inclusive_op();
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(
      InclusiveOp) const {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
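      // The flat thread index is decomposed hierarchically: panel (batch of
      // independent scans), then group (one scan line), then block (a
      // work-group-sized chunk of that line), then the thread's position
      // within the block.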
      // we put one element per packet in scratch_mem
      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
      CoeffReturnType inclusive_scan;
      // the actual panel size is scan_size * non_scan_size;
      // elements_per_group is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // this only has an effect when scan_size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
                           (global_id < scanParameters.total_size))
                              ? read(dev_eval, global_id)
                              : accumulator.initialize();
        next_elements += scanParameters.scan_stride;
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
        }
      });
      // This loop runs exactly twice: ScanPerThread / PacketSize == 2.
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        Index private_offset = 1;
        // build sum in place up the tree
        EIGEN_UNROLL_LOOP
        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[bi] = accumulator.finalize(accum);
          }
          private_offset *= 2;
        }
        scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
        // traverse down tree & build scan
        EIGEN_UNROLL_LOOP
        for (Index d = 1; d < PacketSize; d *= 2) {
          private_offset >>= 1;
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[ai] = private_scan[bi];
            private_scan[bi] = accumulator.finalize(accum);
          }
        }
      }
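      // The two loops above form a Blelloch scan in registers. Illustrated
      // on one packet [a, b, c, d] with a sum accumulator (example values,
      // for orientation only):
      //   up-sweep:    [a, a+b, c, c+d] -> [a, a+b, c, a+b+c+d]
      //   store the total to scratch, then clear the last slot: [a, a+b, c, 0]
      //   down-sweep:  [a, 0, c, a+b] -> [0, a, a+b, a+b+c]
      // i.e. an exclusive prefix sum of the packet, with the packet total
      // parked in local scratch for the cross-thread tree below.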

      Index offset = 1;
      // build sum in place up the tree
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[bi] = accumulator.finalize(accum);
        }
        offset *= 2;
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // publish this block's total for the next kernel, if one is needed
      if (local_id == 0) {
        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
                                    scanParameters.non_scan_size +
                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
                                block_id;
          tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
        }
        // clear the last element for the exclusive down-sweep
        scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
      }
      // traverse down tree & build scan
      for (Index d = 1; d < scratch_stride; d *= 2) {
        offset >>= 1;
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[ai] = scratch[bi];
          scratch[bi] = accumulator.finalize(accum);
        }
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // This loop runs exactly twice: ScanPerThread / PacketSize == 2.
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        EIGEN_UNROLL_LOOP
        for (Index i = 0; i < PacketSize; i++) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
          private_scan[packetIndex + i] = accumulator.finalize(accum);
        }
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
          private_scan[0] = accumulator.finalize(inclusive_scan);
        }
      });
      next_elements = 0;
      // write out the first set of private results
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
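          // For an exclusive scan the register results are written as-is;
          // for an inclusive scan they are shifted down by one, with
          // private_scan[0] holding the inclusive total computed above.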
          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
          out_ptr[global_id] = private_scan[private_id];
        }
        next_elements += scanParameters.scan_stride;
      }
    }  // end for loop
  }
};

template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
  InAccessor in_ptr;
  OutAccessor out_ptr;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
                                                                    OutAccessor out_accessor_,
                                                                    const ScanParameters<Index> scanParameters_,
                                                                    Op accumulator_)
      : in_ptr(in_accessor_), out_ptr(out_accessor_), scanParameters(scanParameters_), accumulator(accumulator_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;

      // the actual panel size is scan_size * non_scan_size;
      // elements_per_group is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // this only has an effect when scan_size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;

      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
      CoeffReturnType adjust_val = in_ptr[in_id];

      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          CoeffReturnType accum = adjust_val;
          accumulator.reduce(out_ptr[global_id], &accum);
          out_ptr[global_id] = accumulator.finalize(accum);
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};

template <typename Index>
struct ScanInfo {
  const Index &total_size;
  const Index &scan_size;
  const Index &panel_size;
  const Index &non_scan_size;
  const Index &scan_stride;
  const Index &non_scan_stride;

  Index max_elements_per_block;
  Index block_size;
  Index panel_threads;
  Index group_threads;
  Index block_threads;
  Index elements_per_group;
  Index elements_per_block;
  Index loop_range;
  Index global_range;
  Index local_range;
  const Eigen::SyclDevice &dev;
  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
                               const Eigen::SyclDevice &dev_)
      : total_size(total_size_),
        scan_size(scan_size_),
        panel_size(panel_size_),
        non_scan_size(non_scan_size_),
        scan_stride(scan_stride_),
        non_scan_stride(non_scan_stride_),
        dev(dev_) {
    // must be power of 2
    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));

    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;

    elements_per_group =
        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
    const Index elements_per_panel = elements_per_group * non_scan_size;
    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
    block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
    const Index max_threads = panel_threads * panel_size;
#endif
    global_range = roundUp(max_threads, local_range);
    loop_range = Index(
        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
  }
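  // Note on the computation above: global_range is capped (by
  // EIGEN_SYCL_MAX_GLOBAL_RANGE) and rounded up to a multiple of
  // local_range, so a single hardware thread may have to process several
  // ScanPerThread-element chunks; loop_range is the iteration count that
  // lets the grid cover all elements_per_panel * panel_size elements.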
  inline ScanParameters<Index> get_scan_parameter() {
    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
  }
  inline cl::sycl::nd_range<1> get_thread_range() {
    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
  }
};

template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
                                                           Reducer &accumulator, const Index total_size,
                                                           const Index scan_size, const Index panel_size,
                                                           const Index non_scan_size, const Index scan_stride,
                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);

    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
        AdjustFunctor;
    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFunctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
                                                                       scan_info.max_elements_per_block,
                                                                       scan_info.get_scan_parameter(), accumulator)
        .wait();
  }
};

template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
                                             const Index total_size, const Index scan_size, const Index panel_size,
                                             const Index non_scan_size, const Index scan_stride,
                                             const Index non_scan_stride, const bool inclusive,
                                             const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
    CoeffReturnType *temp_pointer =
        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);

    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
           in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
           scan_info.get_scan_parameter(), accumulator, inclusive)
        .wait();

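    // If the scan spans more than one block per group, the per-block totals
    // in the temporary buffer are themselves scanned (a second-step,
    // exclusive, stride-1 scan), and the resulting block offsets are then
    // folded into out_ptr by the adjustment kernel.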
    if (scan_info.block_size > 1) {
      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
          non_scan_size, Index(1), scan_info.block_size, false, dev);

      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
          non_scan_stride, dev);
    }
    dev.deallocate_temp(temp_pointer);
  }
};

}  // namespace internal
}  // namespace TensorSycl
namespace internal {
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::Index Index;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  void operator()(Self &self, EvaluatorPointerType data) const {
    const Index total_size = internal::array_prod(self.dimensions());
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
    // this is the scan op (e.g. sum or product)
    auto accumulator = self.accumulator();
    auto inclusive = !self.exclusive();
    auto consume_dim = self.consume_dim();
    auto dev = self.device();

    auto dims = self.inner().dimensions();

    Index non_scan_size = 1;
    Index panel_size = 1;
    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < consume_dim; i++) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
        panel_size *= dims[i];
      }
    } else {
      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim - 1; i >= 0; i--) {
        panel_size *= dims[i];
      }
    }
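    // If the scan dimension is not innermost (scan_stride > 1), consecutive
    // non-scan elements are contiguous, so their stride is 1; otherwise the
    // scan dimension itself is contiguous and consecutive non-scan slices
    // are scan_size elements apart.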
    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
    auto eval_impl = self.inner();
    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
        inclusive, dev);
  }
};
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP