TensorExecutor.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
11#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
12
13// IWYU pragma: private
14#include "./InternalHeaderCheck.h"
15
16namespace Eigen {
17
18namespace internal {
19
27
28// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
29
30template <typename Expression>
31struct ExpressionHasTensorBroadcastingOp {
 32 enum { value = false };
33};
34
35template <typename LhsXprType, typename RhsXprType>
36struct ExpressionHasTensorBroadcastingOp<const TensorAssignOp<LhsXprType, RhsXprType> > {
37 enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
38};
39
40template <typename UnaryOp, typename XprType>
41struct ExpressionHasTensorBroadcastingOp<const TensorCwiseUnaryOp<UnaryOp, XprType> > {
42 enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
43};
44
45template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
46struct ExpressionHasTensorBroadcastingOp<const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
47 enum {
48 value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value || ExpressionHasTensorBroadcastingOp<RhsXprType>::value
49 };
50};
51
52template <typename Broadcast, typename XprType>
53struct ExpressionHasTensorBroadcastingOp<const TensorBroadcastingOp<Broadcast, XprType> > {
54 enum { value = true };
55};
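// Illustrative sketch (not part of the original source, assuming the public
// Tensor API): for an assignment whose right-hand side contains a broadcast,
//
//   Eigen::Tensor<float, 2> a(4, 4), b(1, 4), c(4, 4);
//   Eigen::array<Eigen::Index, 2> bcast = {4, 1};
//   c = a + b.broadcast(bcast);
//
// the generated expression is a TensorAssignOp whose RHS is a
// TensorCwiseBinaryOp with a TensorBroadcastingOp operand, so the trait above
// reports ExpressionHasTensorBroadcastingOp<...>::value == true.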
56
57// -------------------------------------------------------------------------- //
58
75template <typename Expression, typename Device, bool Vectorizable, TiledEvaluation Tiling>
76class TensorExecutor {
77 public:
78 typedef typename Expression::Index StorageIndex;
79
80 // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
81 // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
82 // violation. If this template is instantiated with a non-default device, it
83 // means that this header file was included without defining
84 // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
85 static_assert(std::is_same<Device, DefaultDevice>::value,
86 "Default executor instantiated with non-default device. "
87 "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
88 "EIGEN_USE_SYCL before including Eigen headers.");
89
90 static EIGEN_STRONG_INLINE void run(const Expression& expr, const Device& device = DefaultDevice()) {
91 TensorEvaluator<Expression, Device> evaluator(expr, device);
92 const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
93 if (needs_assign) {
94 const StorageIndex size = array_prod(evaluator.dimensions());
95 for (StorageIndex i = 0; i < size; ++i) {
96 evaluator.evalScalar(i);
97 }
98 }
99 evaluator.cleanup();
100 }
101};
102
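// Usage sketch (illustrative, not part of the original source). The default
// executor above is what runs a plain tensor assignment when no device is
// specified:
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 3> a(8, 8, 8), b(8, 8, 8), c(8, 8, 8);
//   a.setRandom(); b.setRandom();
//   c = a * b + a.constant(1.0f);  // evaluated coefficient by coefficient
//
// To reach the ThreadPoolDevice, GpuDevice or SyclDevice specializations
// below, EIGEN_USE_THREADS, EIGEN_USE_GPU or EIGEN_USE_SYCL must be defined
// before including the Tensor header, as the static_assert above enforces.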
107template <typename Expression, typename Device, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
108class TensorAsyncExecutor {};
 109
113template <typename Expression>
114class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
115 /*Tiling=*/TiledEvaluation::Off> {
116 public:
117 typedef typename Expression::Index StorageIndex;
118
119 static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
120 TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
121 const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
122 if (needs_assign) {
123 const StorageIndex size = array_prod(evaluator.dimensions());
124 const int PacketSize =
125 unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
126
 127 // Give the compiler a good opportunity to unroll the loop, but don't insist
 128 // on unrolling: if the function is expensive, the compiler should not unroll
 129 // the loop at the expense of inlining.
130 const StorageIndex UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
131 for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
132 for (StorageIndex j = 0; j < 4; j++) {
133 evaluator.evalPacket(i + j * PacketSize);
134 }
135 }
136 const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
137 for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
138 evaluator.evalPacket(i);
139 }
140 for (StorageIndex i = VectorizedSize; i < size; ++i) {
141 evaluator.evalScalar(i);
142 }
143 }
144 evaluator.cleanup();
145 }
146};
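// Worked example (illustrative, not part of the original source): with a
// hypothetical PacketSize of 8 and size == 100, UnrolledSize == (100 / 32) * 32
// == 96, so the unrolled loop evaluates packets starting at offsets 0, 8, ...,
// 88 in groups of four; VectorizedSize == (100 / 8) * 8 == 96, so the second
// loop does nothing here, and the final scalar loop handles coefficients 96..99.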
147
152template <typename Expression, bool Vectorizable>
153class TensorExecutor<Expression, DefaultDevice, Vectorizable,
154 /*Tiling=*/TiledEvaluation::On> {
155 public:
156 typedef typename traits<Expression>::Scalar Scalar;
157 typedef std::remove_const_t<Scalar> ScalarNoConst;
158
 159 typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
 160 typedef typename traits<Expression>::Index StorageIndex;
161
162 static constexpr int NumDims = traits<Expression>::NumDimensions;
163
164 EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr,
165 const DefaultDevice& device = DefaultDevice()) {
166 typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex> TensorBlockMapper;
167
168 typedef internal::TensorBlockDescriptor<NumDims, StorageIndex> TensorBlockDesc;
169 typedef internal::TensorBlockScratchAllocator<DefaultDevice> TensorBlockScratch;
170
171 Evaluator evaluator(expr, device);
172
173 // TODO(ezhulenev): Do not use tiling for small tensors?
174 const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
175
176 if (needs_assign) {
177 // Query expression tree for desired block size/shape.
178 const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
179
180 const TensorBlockMapper block_mapper(typename TensorBlockDesc::Dimensions(evaluator.dimensions()), requirements);
181
182 // Share scratch memory allocator between all blocks.
183 TensorBlockScratch scratch(device);
184
185 const StorageIndex total_block_count = block_mapper.blockCount();
186 for (StorageIndex i = 0; i < total_block_count; ++i) {
187 TensorBlockDesc desc = block_mapper.blockDescriptor(i);
188 evaluator.evalBlock(desc, scratch);
189 scratch.reset();
190 }
191 }
192 evaluator.cleanup();
193 }
194};
195
207#ifdef EIGEN_USE_THREADS
208
209template <typename TensorBlockMapper>
210struct TensorExecutorTilingContext {
211 TensorExecutorTilingContext() = default;
212 TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, const TensorOpCost& b_cost, size_t b_aligned_size)
213 : block_mapper(b_mapper), cost(b_cost), aligned_blocksize(b_aligned_size) {}
214
215 TensorBlockMapper block_mapper; // navigate through blocks
216 TensorOpCost cost; // cost of computing a single block
217 size_t aligned_blocksize; // block size after memory alignment
218};
219
220// Computes block evaluation parameters and allocates a temporary memory buffer
221// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
222template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
223TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(const Evaluator& evaluator) {
224 // Query expression tree for desired block size/shape.
225 TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
226
227 // Update target block size based on cost model.
228 double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, requirements.cost_per_coeff);
229 requirements.size = static_cast<size_t>(1.0 / taskSize);
230
231 TensorBlockMapper block_mapper(typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements);
232
233 size_t block_size = block_mapper.blockTotalSize();
234 const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
235 const size_t aligned_blocksize =
236 align * numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
237
238 return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize};
239}
240
241template <typename Evaluator, typename StorageIndex, bool Vectorizable>
242struct EvalRange {
243 static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) {
244 Evaluator evaluator = *evaluator_in;
245 eigen_assert(lastIdx >= firstIdx);
246 for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
247 evaluator.evalScalar(i);
248 }
249 }
250
251 static StorageIndex alignBlockSize(StorageIndex size) { return size; }
252};
253
254template <typename Evaluator, typename StorageIndex>
255struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
256 static constexpr int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
257
258 static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) {
259 Evaluator evaluator = *evaluator_in;
260 eigen_assert(lastIdx >= firstIdx);
261 StorageIndex i = firstIdx;
262 if (lastIdx - firstIdx >= PacketSize) {
263 eigen_assert(firstIdx % PacketSize == 0);
264 StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
 265 // Give the compiler a good opportunity to unroll the loop, but don't insist
 266 // on unrolling: if the function is expensive, the compiler should not unroll
 267 // the loop at the expense of inlining.
268 for (; i <= last_chunk_offset; i += 4 * PacketSize) {
269 for (StorageIndex j = 0; j < 4; j++) {
270 evaluator.evalPacket(i + j * PacketSize);
271 }
272 }
273 last_chunk_offset = lastIdx - PacketSize;
274 for (; i <= last_chunk_offset; i += PacketSize) {
275 evaluator.evalPacket(i);
276 }
277 }
278 for (; i < lastIdx; ++i) {
279 evaluator.evalScalar(i);
280 }
281 }
282
283 static StorageIndex alignBlockSize(StorageIndex size) {
284 // Align block size to packet size and account for unrolling in run above.
285 if (size >= 16 * PacketSize) {
286 return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
287 }
288 // Aligning to 4 * PacketSize would increase block size by more than 25%.
289 return (size + PacketSize - 1) & ~(PacketSize - 1);
290 }
291};
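// Worked example (illustrative, not part of the original source): with a
// hypothetical PacketSize of 8, alignBlockSize(1000) rounds up to a multiple of
// 4 * PacketSize == 32 and returns 1024, while alignBlockSize(20) only rounds
// up to a multiple of PacketSize and returns 24, since 20 < 16 * PacketSize.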
292
293template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
294class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
295 public:
296 typedef typename Expression::Index StorageIndex;
297
298 static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
299 typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
300 typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
301
302 Evaluator evaluator(expr, device);
303 const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
304 if (needs_assign) {
305 const StorageIndex size = array_prod(evaluator.dimensions());
306 device.parallelFor(
307 size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
308 [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&evaluator, firstIdx, lastIdx); });
309 }
310 evaluator.cleanup();
311 }
312};
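// Usage sketch (illustrative, not part of the original source, assuming the
// public Tensor API): the thread-pool executor above is reached through the
// device() expression, e.g.
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
//   a.setRandom(); b.setRandom();
//   c.device(device) = a + b;  // parallelFor over EvalRange chunks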
313
314template <typename Expression, bool Vectorizable>
315class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
316 /*Tiling=*/TiledEvaluation::On> {
317 public:
318 typedef typename traits<Expression>::Index IndexType;
319 typedef typename traits<Expression>::Scalar Scalar;
320 typedef std::remove_const_t<Scalar> ScalarNoConst;
321
322 static constexpr int NumDims = traits<Expression>::NumDimensions;
323
324 typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
325 typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
326 typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
327
328 typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
329 typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;
330
331 static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
332 Evaluator evaluator(expr, device);
333
334 const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
335 if (needs_assign) {
336 const TilingContext tiling =
337 internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(evaluator);
338
339 auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) {
340 TensorBlockScratch scratch(device);
341
342 for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
343 TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
344 evaluator.evalBlock(desc, scratch);
345 scratch.reset();
346 }
347 };
348
349 // Evaluate small expressions directly as a single block.
350 if (tiling.block_mapper.blockCount() == 1) {
351 TensorBlockScratch scratch(device);
352 TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
353 evaluator.evalBlock(desc, scratch);
354 } else {
355 device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, std::move(eval_block));
356 }
357 }
358 evaluator.cleanup();
359 }
360};
361
362template <typename Expression, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
363class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, Tiling> {
364 public:
365 typedef typename Expression::Index StorageIndex;
366 typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
367
368 static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) {
369 TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done));
370
371 const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
372 if (!need_assign) {
373 delete ctx;
374 return;
375 }
376
377 typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
378 const StorageIndex size = array_prod(ctx->evaluator.dimensions());
379 device.parallelForAsync(
380 size, ctx->evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
381 [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); },
382 [ctx]() { delete ctx; });
383 };
384
385 ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
386 }
387
388 private:
389 struct TensorAsyncExecutorContext {
390 TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done)
391 : evaluator(expr, thread_pool), on_done(std::move(done)) {}
392
393 ~TensorAsyncExecutorContext() {
394 evaluator.cleanup();
395 on_done();
396 }
397
398 Evaluator evaluator;
399
400 private:
401 DoneCallback on_done;
402 };
403};
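// Usage sketch (illustrative, not part of the original source, assuming the
// callback-taking device() overload and the tensors from the previous sketch):
// the async executor above is reached via
//
//   Eigen::Barrier done(1);
//   c.device(device, [&done]() { done.Notify(); }) = a + b;
//   done.Wait();  // the DoneCallback fires after evaluator cleanup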
404
405template <typename Expression, typename DoneCallback, bool Vectorizable>
406class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, /*Tileable*/ TiledEvaluation::On> {
407 public:
408 typedef typename traits<Expression>::Index IndexType;
409 typedef typename traits<Expression>::Scalar Scalar;
410 typedef std::remove_const_t<Scalar> ScalarNoConst;
411
412 static constexpr int NumDims = traits<Expression>::NumDimensions;
413
414 typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
415 typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
416 typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
417
418 typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
419 typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;
420
421 static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) {
422 TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done));
423
424 const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
425 if (!need_assign) {
426 delete ctx;
427 return;
428 }
429
430 ctx->tiling = internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
431
432 auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
433 TensorBlockScratch scratch(ctx->device);
434
435 for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
436 TensorBlockDesc desc = ctx->tiling.block_mapper.blockDescriptor(block_idx);
437 ctx->evaluator.evalBlock(desc, scratch);
438 scratch.reset();
439 }
440 };
441
442 // Evaluate small expressions directly as a single block.
443 if (ctx->tiling.block_mapper.blockCount() == 1) {
444 TensorBlockScratch scratch(ctx->device);
445 TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
446 ctx->evaluator.evalBlock(desc, scratch);
447 delete ctx;
448 } else {
449 ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), ctx->tiling.cost, eval_block,
450 [ctx]() { delete ctx; });
451 }
452 };
453
454 ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
455 }
456
457 private:
458 struct TensorAsyncExecutorContext {
459 TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done)
460 : device(thread_pool), evaluator(expr, thread_pool), on_done(std::move(done)) {}
461
462 ~TensorAsyncExecutorContext() {
463 evaluator.cleanup();
464 on_done();
465 }
466
467 const ThreadPoolDevice& device;
468 Evaluator evaluator;
469 TilingContext tiling;
470
471 private:
472 DoneCallback on_done;
473 };
474};
475
476#endif // EIGEN_USE_THREADS
477
478// GPU: the evaluation of the expression is offloaded to a GPU.
479#if defined(EIGEN_USE_GPU)
480
481template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
482class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
483 public:
484 typedef typename Expression::Index StorageIndex;
485 static void run(const Expression& expr, const GpuDevice& device);
486};
487
488#if defined(EIGEN_GPUCC)
489// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
490template <typename Index>
491EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs, Index rhs) {
492 const Index highest = NumTraits<Index>::highest();
493 const Index lowest = NumTraits<Index>::lowest();
494 if (lhs > 0 && rhs > 0) {
495 return lhs > highest - rhs ? 1 : 0;
496 } else if (lhs < 0 && rhs < 0) {
497 return lhs < lowest - rhs ? -1 : 0;
498 } else {
499 return 0;
500 }
501}
502
503// Returns lhs + rhs, saturating to the highest/lowest representable value on
504// overflow/underflow respectively.
505template <typename Index>
506EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
507 const Index highest = NumTraits<Index>::highest();
508 const Index lowest = NumTraits<Index>::lowest();
509 int overflow = sum_will_overflow(lhs, rhs);
510 return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
511}
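// For example (illustrative), with Index == int: saturate_add(INT_MAX - 2, 5)
// returns INT_MAX, saturate_add(INT_MIN + 2, -5) returns INT_MIN, and
// saturate_add(3, 4) returns 7 via the plain-addition branch.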
512
513// A functor that adds step_size to a given index, saturating to avoid
514// overflow/underflow. If overflow/underflow is not possible, regular addition
515// is used (for efficiency).
516template <typename Index>
517struct SafeStep {
518 // lastIdx is one past the end of the possible indexes.
519 // step_size is the value that will be added to the given index when the
520 // functor is called.
521 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
522 : can_overflow_(sum_will_overflow(lastIdx, step_size)), step_size_(step_size) {}
523
524 // Adds step_size to index, saturating on overflow (if overflow is possible).
525 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
526 return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
527 }
528
529 private:
530 const bool can_overflow_;
531 const Index step_size_;
532};
533
534template <typename Evaluator, typename StorageIndex, bool Vectorizable>
535struct EigenMetaKernelEval {
536 static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
537 StorageIndex step_size) {
538 SafeStep<StorageIndex> safe_step(lastIdx, step_size);
539 for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
540 eval.evalScalar(i);
541 }
542 }
543};
544
545template <typename Evaluator, typename StorageIndex>
546struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
547 static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
548 StorageIndex step_size) {
549 const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
550 const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
551 const StorageIndex vectorized_step_size = step_size * PacketSize;
552
553 SafeStep<StorageIndex> safe_vectorized_step(vectorized_size, vectorized_step_size);
554 // Use the vector path
555 for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i = safe_vectorized_step(i)) {
556 eval.evalPacket(i);
557 }
558 SafeStep<StorageIndex> safe_step(lastIdx, step_size);
559 for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx; i = safe_step(i)) {
560 eval.evalScalar(i);
561 }
562 }
563};
564
565template <typename Evaluator, typename StorageIndex>
566__global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval, StorageIndex size) {
567 const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
568 const StorageIndex step_size = blockDim.x * gridDim.x;
569
570 const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
571 EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
572}
573
574/*static*/
575template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
576EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(const Expression& expr,
577 const GpuDevice& device) {
578 TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
579 const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
580 if (needs_assign) {
581 const int block_size = device.maxGpuThreadsPerBlock();
582 const int max_blocks = static_cast<int>(
583 numext::mini<int64_t>(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
584 NumTraits<StorageIndex>::highest()) /
585 block_size);
586 const StorageIndex size = array_prod(evaluator.dimensions());
 587 // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
588 const int num_blocks = numext::maxi<int>(
589 numext::mini<int>(max_blocks, static_cast<int>(numext::div_ceil<StorageIndex>(size, block_size))), 1);
590
591 LAUNCH_GPU_KERNEL((EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>), num_blocks, block_size,
592 0, device, evaluator, size);
593 }
594 evaluator.cleanup();
595}
596
597#endif // EIGEN_GPUCC
598#endif // EIGEN_USE_GPU
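// Usage sketch (illustrative, not part of the original source, assuming the
// stream-based GPU device API): when compiled with a GPU compiler and
// EIGEN_USE_GPU, evaluation is launched through EigenMetaKernel, e.g.
//
//   #define EIGEN_USE_GPU
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   const int n = 1024;
//   float* d_in = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
//   float* d_out = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> in(d_in, n), out(d_out, n);
//   out.device(gpu_device) = in + in.constant(1.0f);  // enqueued on the stream
//   gpu_device.synchronize();
//
// (deallocate() calls omitted for brevity)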
599
600// SYCL Executor policy
601#ifdef EIGEN_USE_SYCL
602
603template <typename Evaluator>
604struct ExecExprFunctorKernel {
605 typedef typename Evaluator::Index Index;
606 Evaluator evaluator;
607 const Index range;
608 template <typename Scratch>
609 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_, const Index range_)
610 : evaluator(evaluator_), range(range_) {}
611
612 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { compute(itemID); }
613 template <bool is_vec = Evaluator::PacketAccess>
614 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
615 Index gId = static_cast<Index>(itemID.get_global_linear_id());
616 Index total_threads = itemID.get_global_range(0);
617
618 for (Index i = gId; i < range; i += total_threads) {
619 evaluator.evalScalar(i);
620 }
621 }
622 template <bool is_vec = Evaluator::PacketAccess>
623 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
624 const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize;
625 Index gId = static_cast<Index>(itemID.get_global_linear_id());
626 const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
627 const Index start = Evaluator::PacketSize * gId;
628 for (Index i = start; i < vectorizedRange; i += step) {
629 evaluator.evalPacket(i);
630 }
631 gId += vectorizedRange;
632 for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
633 evaluator.evalScalar(i);
634 }
635 }
636};
637
638template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
639class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
640 public:
641 typedef typename Expression::Index Index;
642 static EIGEN_STRONG_INLINE void run(const Expression& expr, const Eigen::SyclDevice& dev) {
643 typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
644 Evaluator evaluator(expr, dev);
645 const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
646 if (needs_assign) {
647 Index range, GRange, tileSize;
648 Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
649 total_size = (total_size == 0) ? 1 : total_size;
650 const int PacketSize = Eigen::PacketType<typename Evaluator::CoeffReturnType, Eigen::SyclDevice>::size;
651 Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
652 dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
653 range = total_size;
654
655 dev.template nullary_kernel_launcher<typename Evaluator::CoeffReturnType, ExecExprFunctorKernel<Evaluator> >(
656 evaluator, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
657 range)
658 .wait();
659 }
660 evaluator.cleanup();
661 }
662};
663
664#endif
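// Usage sketch (illustrative, not part of the original source; the queue and
// allocation calls are assumptions based on Eigen's SYCL tests): with
// EIGEN_USE_SYCL defined, the executor above is reached via a SyclDevice, e.g.
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector{});
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   const int n = 1024;
//   float* d_a = static_cast<float*>(sycl_device.allocate(n * sizeof(float)));
//   float* d_c = static_cast<float*>(sycl_device.allocate(n * sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> a(d_a, n), c(d_c, n);
//   c.device(sycl_device) = a * a.constant(2.0f);
//   sycl_device.synchronize();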
665
666} // end namespace internal
667
668} // end namespace Eigen
669
670#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H