Eigen-unsupported  3.4.1 (git rev 28ded8800c26864e537852658428ab44c8399e87)
 
TensorExecutor.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

namespace internal {
// TODO(ezhulenev): Add specializations for all other types of Tensor ops.

template<typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template<typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};
template<typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};
template<typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
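// Illustrative sketch (not part of the original header): how the trait above
// resolves for a concrete expression. Tensor names and shapes are hypothetical.
//
//   Tensor<float, 2> a(4, 1), b(4, 8), c(4, 8);
//   c = a.broadcast(Eigen::array<Eigen::Index, 2>{1, 8}) + b;
//
// The assignment builds a TensorAssignOp whose RHS is a TensorCwiseBinaryOp
// over a TensorBroadcastingOp, so ExpressionHasTensorBroadcastingOp recurses
// through the specializations above down to the TensorBroadcastingOp leaf and
// evaluates to true, presumably so callers can prefer block (tiled) evaluation
// for such expressions.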
// -------------------------------------------------------------------------- //

template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
  // violation. If this template is instantiated with a non-default device, it
  // means that this header file was included without defining
  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");
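  // Illustrative sketch (not part of the original header): the static_assert
  // above fires when a non-default device reaches this primary template, which
  // happens if the device macro was not defined before the first include in a
  // translation unit, e.g.
  //
  //   #define EIGEN_USE_THREADS            // or EIGEN_USE_GPU / EIGEN_USE_SYCL
  //   #include <unsupported/Eigen/CXX11/Tensor>
  //
  // Mixing translation units that include the header with and without these
  // macros is the ODR hazard described in the comment above.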
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong possibility to unroll the loop, but don't
      // insist on unrolling, because if the function is expensive the compiler
      // should not unroll the loop at the expense of inlining.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
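      // Worked example (illustrative, not in the original source): with
      // PacketSize == 8 and size == 1003, UnrolledSize == 992, so the unrolled
      // loop below evaluates 31 chunks of 4 packets; VectorizedSize == 1000,
      // so one more packet covers [992, 1000); the remaining 3 coefficients
      // fall through to the scalar loop.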
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static const int NumDims = traits<Expression>::NumDimensions;
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;

    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);

    // TODO(ezhulenev): Do not use tiling for small tensors?
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Query expression tree for desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
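// Illustrative note (not from the original source): the block mapper simply
// partitions the output shape into roughly equally sized blocks. For example,
// a 256x256 output with a requested block of about 4096 coefficients could be
// split into 16 blocks of 64x64; each TensorBlockDesc carries the block's
// linear offset and dimensions, and evalBlock() materializes intermediate
// results for that block in the shared scratch allocator before it is reset.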
#ifdef EIGEN_USE_THREADS

template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
// Computes block evaluation parameters, and allocates temporary memory buffer
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query expression tree for desired block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update target block size based on cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
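// Worked example (illustrative, not in the original source), assuming
// Scalar == float and EIGEN_MAX_ALIGN_BYTES == 64: if the block mapper settles
// on block_size == 1000 coefficients, the scratch buffer needs 4000 bytes, and
// aligned_blocksize == 64 * divup(4000, 64) == 64 * 63 == 4032 bytes. The
// returned cost is cost_per_coeff scaled by the 1000 coefficients per block.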
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong possibility to unroll the loop, but don't
      // insist on unrolling, because if the function is expensive the compiler
      // should not unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block size to packet size and account for unrolling in run above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
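// Worked example (illustrative, not in the original source), with
// PacketSize == 8: alignBlockSize(1000) rounds up to a multiple of
// 4 * PacketSize == 32, i.e. (1000 + 31) & ~31 == 1024, since 1000 >= 128.
// alignBlockSize(100) only rounds up to a multiple of PacketSize,
// (100 + 7) & ~7 == 104, to avoid growing a small block by more than ~25%.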
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};
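// Illustrative sketch (not part of the original header): how the ThreadPool
// executors in this file are typically reached from user code. The pool size
// of 4 is arbitrary.
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   Eigen::Tensor<float, 2> a(512, 512), b(512, 512);
//   b.setRandom();
//   a.device(device) = b.square() + b;  // dispatches to a ThreadPoolDevice
//                                       // executor, which splits [0, size)
//                                       // into EvalRange chunks via
//                                       // device.parallelFor().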
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};
400
401template <typename Expression, typename DoneCallback, bool Vectorizable,
402 TiledEvaluation Tiling>
403class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
404 Vectorizable, Tiling> {
405 public:
406 typedef typename Expression::Index StorageIndex;
407 typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
408
409 static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
410 const ThreadPoolDevice& device,
411 DoneCallback done) {
412 TensorAsyncExecutorContext* const ctx =
413 new TensorAsyncExecutorContext(expr, device, std::move(done));
414
415 const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
416 if (!need_assign) {
417 delete ctx;
418 return;
419 }
420
421 typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
422 const StorageIndex size = array_prod(ctx->evaluator.dimensions());
423 device.parallelForAsync(
424 size, ctx->evaluator.costPerCoeff(Vectorizable),
425 EvalRange::alignBlockSize,
426 [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
427 EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
428 },
429 [ctx]() { delete ctx; });
430 };
431
432 ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
433 }
434
435 private:
436 struct TensorAsyncExecutorContext {
437 TensorAsyncExecutorContext(const Expression& expr,
438 const ThreadPoolDevice& thread_pool,
439 DoneCallback done)
440 : evaluator(expr, thread_pool), on_done(std::move(done)) {}
441
442 ~TensorAsyncExecutorContext() {
443 evaluator.cleanup();
444 on_done();
445 }
446
447 Evaluator evaluator;
448
449 private:
450 DoneCallback on_done;
451 };
452};
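// Illustrative sketch (not part of the original header), assuming the
// asynchronous assignment entry point `t.device(device, done)` available in
// recent Eigen versions:
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   Eigen::Tensor<float, 1> a(1 << 20), b(1 << 20);
//   b.setRandom();
//   Eigen::Barrier done(1);
//   a.device(device, [&done]() { done.Notify(); }) = b * 2.0f;
//   done.Wait();  // runAsync() deletes its context (which invokes the done
//                 // callback) only after the last parallelForAsync task
//                 // has finished.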
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};

#endif  // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};

#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
           StorageIndex step_size) {
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
           StorageIndex step_size) {
    const StorageIndex PacketSize =
        unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the vector path.
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx;
         i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(
      eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void
TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {

    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when tensorflow calls
    // with tensors of size 0.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
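    // Worked example (illustrative, not in the original source): on a device
    // reporting 1024 threads per block, 80 multiprocessors and 2048 threads
    // per multiprocessor, max_blocks == 80 * 2048 / 1024 == 160. For a tensor
    // of 1,000,000 coefficients, divup(size, block_size) == 977, so
    // num_blocks == min(160, 977) == 160, and each thread strides over the
    // index space with step_size == 160 * 1024 inside EigenMetaKernel.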
    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU
// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;
  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
          typename Evaluator::CoeffReturnType,
          ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};

#endif

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H