#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {

// TensorExecutor: launches the evaluation of a tensor expression on the
// specified computing device. `Vectorizable` enables packet math and `Tiling`
// enables block based (tiled) evaluation; see TensorBlock.h.
// Check whether an expression tree contains a TensorBroadcastingOp. Evaluating
// a broadcast through the coefficient/packet path is expensive, so expressions
// that contain one prefer block (tiled) evaluation.
template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};
template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};
template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};
template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};
template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
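// Illustrative example (not part of the original header): for an assignment
// whose right-hand side contains a broadcast, e.g.
//   out = in.broadcast(bcast) * weights;
// the specializations above recurse through the TensorAssignOp and the cwise
// ops and report value == true, which is what later selects block (tiled)
// evaluation even for small tensors.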
// Default strategy: evaluate the expression with a single CPU thread, without
// vectorization and without block (tiled) evaluation.
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Including this header in different translation units with and without
  // EIGEN_USE_THREADS / EIGEN_USE_GPU / EIGEN_USE_SYCL is an ODR violation, so
  // instantiating the default executor with a non-default device is an error.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
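// Usage sketch (illustrative, not part of the original header): user code
// reaches the executor through tensor assignment, e.g.
//   c.device(my_device) = a + b;
// which wraps the right-hand side in a TensorAssignOp and dispatches it to
// TensorExecutor<...>::run(assign_expr, my_device).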
// Default async execution strategy is not implemented: asynchronous execution
// is currently only available for ThreadPoolDevice (see below).
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
// Process all the data with a single CPU thread, using vectorized instructions.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;
      // Manually unroll the packet loop by 4; the compiler may still choose
      // not to unroll if the expression is expensive to inline.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
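// Worked example (illustrative): with size = 103 and PacketSize = 8, the loop
// split above gives UnrolledSize = (103 / 32) * 32 = 96, so indices [0, 96)
// run through the 4x-unrolled packet loop, VectorizedSize = (103 / 8) * 8 = 96
// adds no extra packet iterations here, and the scalar tail covers [96, 103).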
// Process all the data with a single CPU thread, using blocks of data: sizing
// a block to fit in the L1 cache gives better cache behavior.
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static const int NumDims = traits<Expression>::NumDimensions;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;

    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Query the expression tree for its preferred block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share a single scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
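// Note: each TensorBlockDesc identifies one tile of the output (its offset and
// dimensions); evalBlock() materializes that tile, drawing any temporary
// buffers it needs from the shared scratch allocator.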
// Multicore strategy: the index space is partitioned and each partition is
// evaluated on a single core. TensorExecutor blocks the caller until all tasks
// finish; TensorAsyncExecutor returns immediately and invokes the `done`
// callback once evaluation completes.
#ifdef EIGEN_USE_THREADS
template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
// Computes block evaluation parameters for the tiled ThreadPoolDevice
// executors below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for its preferred block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);
  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
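// Worked example (illustrative): with block_size = 1000, sizeof(Scalar) = 4
// and EIGEN_MAX_ALIGN_BYTES = 64, the raw buffer needs 4000 bytes,
// divup(4000, 64) = 63, so aligned_blocksize = 64 * 63 = 4032 bytes, i.e. the
// per-block scratch size rounded up to a multiple of the alignment.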
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Manually unroll the packet loop by 4; the scalar tail is handled below.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block sizes to packet boundaries, and to the 4x unroll factor for
    // large blocks (for small blocks that would inflate the size by over 25%).
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
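// Worked example (illustrative): with PacketSize = 8, a requested block size
// of 1000 is >= 16 * 8, so it is rounded up to a multiple of 4 * 8 = 32:
// (1000 + 31) & ~31 = 1024. A small request of 20 is instead rounded up to a
// multiple of 8: (20 + 7) & ~7 = 24.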
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};
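// Note: parallelFor splits [0, size) into ranges whose granularity is derived
// from the per-coefficient cost estimate; each range size is rounded by
// EvalRange::alignBlockSize so vectorized ranges start on packet boundaries,
// and EvalRange::run evaluates one range per task.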
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);
      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };
      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};
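// Note: tiling.cost is the estimated cost of one whole block
// (cost_per_coeff * block size), so parallelFor schedules block indices with a
// granularity derived from that per-block estimate.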
template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));
    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };
    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }
 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
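// Usage sketch (illustrative, not part of the original header): the async
// executor is reached through a device() assignment that carries a done
// callback, conceptually
//   Eigen::Barrier done(1);
//   c.device(thread_pool_device, [&done]() { done.Notify(); }) = a + b;
//   done.Wait();
// runAsync() keeps the evaluator alive in a heap-allocated context and deletes
// it (which fires the callback) once all ranges have been evaluated.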
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));
    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };
      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };
    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }
 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};

#endif  // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the vector path first, then a scalar loop for the remainder.
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Launch at least one block so that tensors of size 0 do not crash.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
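// Worked example (illustrative): on a device with 1024 threads per block,
// 80 multiprocessors and 2048 threads per multiprocessor, max_blocks =
// 80 * 2048 / 1024 = 160. For a tensor of 1,000,000 coefficients,
// divup(1000000, 1024) = 977 blocks would be requested, so the launch is
// capped at 160 blocks and each thread strides over several coefficients.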
#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;
  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};
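// Note: the vectorized path above covers [0, vectorizedRange) with packet
// evaluations (each work item strides by PacketSize * global range), and the
// remaining [vectorizedRange, range) coefficients are handled one scalar per
// work item.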
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;
      dev.template nullary_kernel_launcher<
             typename Evaluator::CoeffReturnType,
             ExecExprFunctorKernel<Evaluator> >(
             evaluator,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                   cl::sycl::range<1>(tileSize)),
             Index(1), range);
    }
    evaluator.cleanup();
  }
};

#endif  // EIGEN_USE_SYCL

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H