#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H

#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace internal {

// -------------------------------------------------------------------------- //
// Forward declarations for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;
// -------------------------------------------------------------------------- //
// Helper function to compute strides for a densely stored buffer with the
// given dimensions and layout.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}
template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const Eigen::array<IndexType, NumDims>& dimensions) {
  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}

template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
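
// Illustrative sketch (not part of the original header), assuming a 2x3x4
// tensor: the inner dimension of a column-major buffer is contiguous, so
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> s = internal::strides<ColMajor>(dims);
//   // s == {1, 2, 6}; for RowMajor the result would be {12, 4, 1}.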
// Shape preference for blocks extracted from the larger tensor.
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
struct TensorBlockResourceRequirements {
  TensorBlockShapeType shape_type;  // target block shape
  size_t size;                      // target block size
  TensorOpCost cost_per_coeff;      // cost of computing a single block element

#ifdef EIGEN_HIPCC
  // For HIPCC we need to explicitly declare as a "device function" the
  // constructor that is implicitly invoked in the "merge" / "any" routines;
  // otherwise HIPCC complains about a missing matching constructor.
  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
                                                    TensorOpCost cost_)
      : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
#endif

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
                                                                            size_t size_in_bytes, TensorOpCost cost) {
    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
    return {shape_type, size, cost};
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
                                                                            size_t size_in_bytes) {
    // This default cost per coefficient is valid for most materialized tensor
    // block evaluation implementations: they read coefficients from the
    // underlying tensor storage and write them to the block buffer with a
    // linear access pattern. Non-trivial block evaluation implementations must
    // provide their own cost approximation.
    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
                                    {/*bytes_loaded=*/sizeof(Scalar),
                                     /*bytes_stored=*/sizeof(Scalar),
                                     /*compute_cycles=*/0});
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes);
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, size_in_bytes);
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(const TensorBlockResourceRequirements& lhs,
                                                                   const TensorBlockResourceRequirements& rhs) {
    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
            merge(lhs.size, rhs.size),                       // size
            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
  }

  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) {
    cost_per_coeff += cost;
    return *this;
  }

  // This is a resource requirement that should be returned from expressions
  // that do not have any block evaluation preference (e.g. default tensor
  // expression with raw buffer access).
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
  }

 private:
  using Requirements = TensorBlockResourceRequirements;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
    return numext::maxi(lhs_size, rhs_size);
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
    return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims)
               ? TensorBlockShapeType::kSkewedInnerDims
               : TensorBlockShapeType::kUniformAllDims;
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) {
    return lhs_cost + rhs_cost;
  }
};
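
// Illustrative sketch (not part of the original header): merging the
// requirements of two sub-expressions keeps the most constrained shape and
// the largest size, e.g.
//
//   auto lhs = TensorBlockResourceRequirements::skewed<float>(48 * 1024);
//   auto rhs = TensorBlockResourceRequirements::uniform<float>(16 * 1024);
//   auto req = TensorBlockResourceRequirements::merge(lhs, rhs);
//   // req.shape_type == kSkewedInnerDims, req.size == 48 * 1024 / sizeof(float)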
// -------------------------------------------------------------------------- //
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If we evaluate a Tensor assignment and the expression on the left already
  // has a memory buffer, we might be able to evaluate a block directly into
  // the final output memory. The pointer type of the underlying storage is
  // erased to avoid propagating the Scalar type through all the expression
  // evaluation layers.
  class DestinationBuffer {
   public:
    enum DestinationBufferKind : int {
      // The destination buffer is not defined (`m_data` == NULL).
      kEmpty,

      // Tensor block defined by the owning block descriptor can fit
      // contiguously into the destination buffer.
      kContiguous,

      // Destination buffer strides do not match the strides of a contiguously
      // stored block, so a separate strided copy is required.
      kStrided
    };

    template <typename Scalar>
    Scalar* data() const {
      eigen_assert(m_data_type_size == sizeof(Scalar));
      return static_cast<Scalar*>(m_data);
    }

    const Dimensions& strides() const { return m_strides; }
    const DestinationBufferKind& kind() const { return m_kind; }

   private:
    friend class TensorBlockDescriptor<NumDims, IndexType>;

    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind)
        : m_data(static_cast<void*>(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {}

    template <int Layout, typename Scalar>
    static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) {
      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
    }

    template <int Layout>
    static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) {
      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      for (int i = 0; i < NumDims; ++i) {
        if (desc_dims[i] == 1) continue;
        if (desc_strides[i] != strides[i]) return kStrided;
      }
      return kContiguous;
    }

    // Storage pointer is type erased, to reduce template bloat, but we still
    // keep the size of the underlying element type for error checking.
    void* m_data;
    size_t m_data_type_size;

    // Destination buffer dimensions always match the dimensions of the tensor
    // block, however strides might be different from the block strides.
    Dimensions m_strides;

    DestinationBufferKind m_kind;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination)
      : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  const DestinationBuffer& destination() const { return m_destination; }

  template <int Layout, typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
    eigen_assert(dst_base != NULL);
    m_destination = DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
  }

  template <int Layout, typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
    // DSizes constructor will do index type promotion if it's safe.
    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    m_destination.m_kind = DestinationBuffer::kEmpty;
    return *this;
  }

  bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; }

  // Returns a copy of `*this` with the updated offset.
  TensorBlockDescriptor WithOffset(IndexType offset) const {
    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
  }

 private:
  // Offset and dimensions are immutable after construction. A block descriptor
  // can only be mutated by adding or dropping the destination buffer.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};
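
// Illustrative sketch (not part of the original header): describing a 2x3
// block starting at linear offset 10 of the underlying tensor, then attaching
// a destination buffer (`out`, hypothetical) so the block can be materialized
// directly into it:
//
//   DSizes<Eigen::Index, 2> dims(2, 3), out_strides(1, 2);
//   TensorBlockDescriptor<2> desc(/*offset=*/10, dims);
//   desc.AddDestinationBuffer<ColMajor>(out, out_strides);
//   eigen_assert(desc.HasDestinationBuffer());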
// -------------------------------------------------------------------------- //
// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
template <int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorBlockMapper {
  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  TensorBlockMapper() = default;
  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, const TensorBlockResourceRequirements& requirements)
      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
    // Compute block dimensions and the total number of blocks.
    InitializeBlockDimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>& blockDimensions() const {
    return m_block_dimensions;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const {
    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    IndexType offset = 0;
    DSizes<IndexType, NumDims> dimensions;

    if (NumDims == 0) return BlockDescriptor(offset, dimensions);

    // Iterate outer -> inner dimensions.
    for (int i = NumDims - 1; i >= 0; --i) {
      const int dim = isColMajor ? i : NumDims - i - 1;

      const IndexType idx = block_index / m_block_strides[dim];
      block_index -= idx * m_block_strides[dim];

      const IndexType coord = idx * m_block_dimensions[dim];
      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]);
      offset += coord * m_tensor_strides[dim];
    }

    return {offset, dimensions};
  }

 private:
  void InitializeBlockDimensions() {
    // Requested block shape and size.
    const TensorBlockShapeType shape_type = m_requirements.shape_type;
    IndexType target_block_size = numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));

    IndexType tensor_size = m_tensor_dimensions.TotalSize();

    // Corner case: one of the dimensions is zero. Logic below is too complex
    // to handle this case on a general basis, just use unit block size.
    // (The total block count will be zero).
    if (tensor_size == 0) {
      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] = 1;
      }
      m_total_block_count = 0;
      return;
    }

    // If the tensor fits into the target block size, evaluate it as a single block.
    if (tensor_size <= target_block_size) {
      m_block_dimensions = m_tensor_dimensions;
      m_total_block_count = 1;
      // The only valid block index is `0`, and in this case we do not need
      // to compute real strides for tensor or blocks (see blockDescriptor).
      for (int i = 0; i < NumDims; ++i) {
        m_tensor_strides[i] = 0;
        m_block_strides[i] = 1;
      }
      return;
    }

    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    // Block shape skewed towards the inner dimension.
    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
      IndexType coeff_to_allocate = target_block_size;

      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;
        m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
        coeff_to_allocate =
            numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
      }
      eigen_assert(coeff_to_allocate == 1);

    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
      // Tensor will not fit within the 'target_block_size' budget: calculate
      // block dimension sizes based on a "square" dimension size target.
      const IndexType dim_size_target = convert_index<IndexType>(
          std::pow(static_cast<float>(target_block_size), 1.0f / static_cast<float>(m_block_dimensions.rank())));

      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]);
      }

      // Add any un-allocated coefficients to inner dimension(s).
      IndexType total_size = m_block_dimensions.TotalSize();
      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;

        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
          const IndexType total_size_other_dims = total_size / m_block_dimensions[dim];
          const IndexType alloc_avail = numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);
          if (alloc_avail == m_block_dimensions[dim]) {
            // Insufficient excess coefficients to allocate.
            break;
          }
          m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail);
          total_size = total_size_other_dims * m_block_dimensions[dim];
        }
      }

    } else {
      eigen_assert(false);  // unknown block shape
    }

    eigen_assert(m_block_dimensions.TotalSize() >=
                 numext::mini<IndexType>(target_block_size, m_tensor_dimensions.TotalSize()));

    // Calculate block counts by dimension and the total block count.
    DSizes<IndexType, NumDims> block_count;
    for (int i = 0; i < NumDims; ++i) {
      block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
    }
    m_total_block_count = array_prod(block_count);

    // Calculate block strides (used for enumerating blocks).
    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
    m_block_strides = strides<Layout>(block_count);
  }

  DSizes<IndexType, NumDims> m_tensor_dimensions;
  TensorBlockResourceRequirements m_requirements;

  DSizes<IndexType, NumDims> m_block_dimensions;
  IndexType m_total_block_count;

  DSizes<IndexType, NumDims> m_tensor_strides;
  DSizes<IndexType, NumDims> m_block_strides;
};
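
// Illustrative sketch (not part of the original header): splitting a 100x100
// column-major tensor into blocks of at most ~1024 coefficients and walking
// over the resulting block descriptors:
//
//   DSizes<Eigen::Index, 2> dims(100, 100);
//   auto req = TensorBlockResourceRequirements::uniform<float>(1024 * sizeof(float));
//   TensorBlockMapper<2, ColMajor> mapper(dims, req);
//   for (Eigen::Index b = 0; b < mapper.blockCount(); ++b) {
//     auto desc = mapper.blockDescriptor(b);
//     // desc.offset() and desc.dimensions() describe one block of the tensor.
//   }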
// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator allocates temporary buffers for block evaluation
// (output or input block materialization). Eigen expression traversal order is
// deterministic, so all temporary allocations happen in the same order, and
// usually with the same sizes, which allows reusing allocations between blocks.
template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // Allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If the current allocation can't fit the requested size, deallocate it
    // and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  std::vector<Allocation> m_allocations;
};
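
// Illustrative sketch (not part of the original header): resetting the
// allocator between blocks reuses buffers of the same size instead of
// re-allocating them:
//
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   for (Eigen::Index b = 0; b < mapper.blockCount(); ++b) {
//     void* buf = scratch.allocate(block_size_in_bytes);  // reused across iterations
//     // ... evaluate block into buf ...
//     scratch.reset();
//   }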
// -------------------------------------------------------------------------- //
// TensorBlockKind represents all possible block kinds produced by TensorEvaluator::evalBlock.
enum TensorBlockKind {
  kExpr,                   // lazy expression block that must be assigned to a destination buffer
  kView,                   // view into a memory buffer owned by the underlying expression
  kMaterializedInScratch,  // materialized in a scratch buffer, still to be copied to a destination
  kMaterializedInOutput    // materialized directly into the final output memory buffer
};

// TensorBlock typedef for TensorEvaluators that do not support block evaluation.
class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// Extracts the Scalar type from an Eigen expression, or void if the expression
// type itself is void (i.e. block access is not supported).
template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};
// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Eigen expressions.
template <typename Scalar, int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions,
                          bool valid_expr = true)
      : m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }

  const XprType& expr() const {
    eigen_assert(m_valid_expr);
    return m_expr;
  }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // TensorMaterializedBlock can be backed by different types of storage:
  //
  //   (1) Contiguous block of memory allocated with the scratch allocator.
  //   (2) Contiguous block of memory reused from the block descriptor
  //       destination buffer.
  //   (3) Strided block of memory reused from the block descriptor
  //       destination buffer.
  //
  class Storage {
   public:
    Scalar* data() const { return m_data; }
    const Dimensions& dimensions() const { return m_dimensions; }
    const Dimensions& strides() const { return m_strides; }

    TensorMaterializedBlock AsTensorMaterializedBlock() const {
      return TensorMaterializedBlock(m_materialized_in_output ? internal::TensorBlockKind::kMaterializedInOutput
                                                              : internal::TensorBlockKind::kMaterializedInScratch,
                                     m_data, m_dimensions, !m_strided_storage);
    }

   private:
    friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;

    Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output,
            bool strided_storage)
        : m_data(data),
          m_dimensions(dimensions),
          m_strides(strides),
          m_materialized_in_output(materialized_in_output),
          m_strided_storage(strided_storage) {}

    Scalar* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;
    bool m_materialized_in_output;
    bool m_strided_storage;
  };

  // Creates storage for a materialized block, either from the block descriptor
  // destination buffer, or from a new buffer allocated with the scratch allocator.
  template <typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch,
                                                    bool allow_strided_storage = false) {
    // Try to reuse the destination as an output block buffer.
    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;

    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
                     /*materialized_in_output=*/true,
                     /*strided_storage=*/false);

    } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
                     /*materialized_in_output=*/true,
                     /*strided_storage=*/true);

    } else {
      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
      return Storage(static_cast<Scalar*>(mem), desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
                     /*materialized_in_output=*/false,
                     /*strided_storage=*/false);
    }
  }

  // Creates a materialized block for the given descriptor from a memory buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims,
                                                                 TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the tensor block dimensions cover a contiguous range of the underlying
    // memory, we can skip the block buffer memory allocation and construct a
    // block directly from the existing `data` memory buffer.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All the outer dimensions must be of size `1`, except a single dimension
    // right before the matching inner dimensions.
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions());

    } else {
      // Reuse the destination buffer or allocate a new buffer with the scratch allocator.
      const Storage storage = prepareStorage(desc, scratch);

      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), data, desc.offset());
      TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data());

      TensorBlockIO::Copy(dst, src);
      return storage.AsTensorMaterializedBlock();
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
  bool m_valid_expr;
};
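
// Illustrative sketch (not part of the original header): materializing a block
// of a plain tensor buffer. If the requested block happens to be contiguous in
// `data`, the result is a cheap `kView`; otherwise the block is copied into
// scratch (or the descriptor's destination buffer):
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> Block;
//   Block block = Block::materialize(data, data_dims, desc, scratch);
//   // block.expr() is a TensorMap over the materialized coefficients.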
// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies a
// UnaryOp functor to the blocks produced by the underlying tensor expression.
template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
  static constexpr bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef std::conditional_t<NoArgBlockAccess, void,
                             TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
// BinaryOp functor to the blocks produced by the underlying tensor expressions.
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
  static constexpr bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value ||
                                           internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType> >
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor)
      : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};
// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (a
// generalization of TensorCwiseUnaryBlock to arbitrary expressions).
template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
  typedef typename ArgTensorBlock::XprType ArgXprType;
  static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef std::conditional_t<NoArgBlockAccess, void, typename BlockFactory::template XprType<ArgXprType>::type> XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};
// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.
template <typename BlockFactory, typename Arg1TensorBlock, typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static constexpr bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
                                           internal::is_void<Arg2XprType>::value ||
                                           internal::is_void<Arg3XprType>::value;

 public:
  typedef std::conditional_t<NoArgBlockAccess, void,
                             typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, Arg3XprType>::type>
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block, const BlockFactory& factory)
      : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};
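
// Illustrative sketch (not part of the original header): an evaluator for
// `a + b` can return a lazy block built from its argument blocks (LhsBlock and
// RhsBlock are hypothetical argument block types); the expression is assigned
// to its destination later, e.g. by TensorBlockAssignment below:
//
//   typedef TensorCwiseBinaryBlock<internal::scalar_sum_op<float>, LhsBlock, RhsBlock> SumBlock;
//   SumBlock block(lhs_block, rhs_block, internal::scalar_sum_op<float>());
//   // block.kind() == kExpr; block.expr() is a TensorCwiseBinaryOp expression.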
// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy copies data between two linear buffers with
// (possibly) different strides, with optimized kernels for the common cases.
template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename unpacket_traits<Packet>::half HalfPacket;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size,
    HalfPacketSize = unpacket_traits<HalfPacket>::size,
    HasHalfPacket = static_cast<int>(HalfPacketSize) < static_cast<int>(PacketSize)
  };

 public:
  // Specifies the kind of copy, selected from the source and destination strides.
  enum class Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data);
  }

  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset,
                                                        const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
                                                        const IndexType src_offset, const IndexType src_stride,
                                                        const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = PacketSize * (count / PacketSize);
    IndexType i = 0;

    if (kind == StridedLinearBufferCopy::Kind::Linear) {
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
      eigen_assert(src_stride == 1 && dst_stride == 1);
      for (; i < unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i < vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
        if (i < vectorized_half_size) {
          HalfPacket p = ploadu<HalfPacket>(src + i);
          pstoreu<Scalar, HalfPacket>(dst + i, p);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
      // Scatter from `src` to `dst`.
      eigen_assert(src_stride == 1 && dst_stride != 1);
      for (; i < vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
        if (i < vectorized_half_size) {
          HalfPacket p = ploadu<HalfPacket>(src + i);
          pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
      // Fill `dst` with the value at `*src`.
      eigen_assert(src_stride == 0 && dst_stride == 1);
      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
      Scalar s = *src;
      Packet p = pset1<Packet>(s);
      for (; i < unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i < vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
        if (i < vectorized_half_size) {
          HalfPacket hp = pset1<HalfPacket>(s);
          pstoreu<Scalar, HalfPacket>(dst + i, hp);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i] = s;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
      // Scatter the value at `*src` into `dst`.
      eigen_assert(src_stride == 0 && dst_stride != 1);
      Scalar s = *src;
      Packet p = pset1<Packet>(s);
      for (; i < vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
        if (i < vectorized_half_size) {
          HalfPacket hp = pset1<HalfPacket>(s);
          pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = s;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
      // Gather from `src` into a contiguous `dst`.
      eigen_assert(dst_stride == 1);
      for (; i < vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      if (HasHalfPacket) {
        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
        if (i < vectorized_half_size) {
          HalfPacket p = pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
          pstoreu<Scalar, HalfPacket>(dst + i, p);
          i += HalfPacketSize;
        }
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
      // Random (strided on both sides).
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    } else {
      eigen_assert(false);
    }
  }
};
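
// Illustrative sketch (not part of the original header): copying 128 floats
// from a contiguous source into a destination with stride 3 uses the
// vectorized Scatter kernel:
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> Copy;
//   Copy::Run<Copy::Kind::Scatter>(Copy::Dst(/*offset=*/0, /*stride=*/3, dst),
//                                  Copy::Src(/*offset=*/0, /*stride=*/1, src),
//                                  /*count=*/128);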
// -------------------------------------------------------------------------- //
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a src->dst dimension mapping for the copy.
// Dimensions of `dst` specify how many elements have to be copied; for `src`
// we only need the strides to navigate through the source memory buffer.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO {
  static constexpr bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using the provided dimension mapping:
  //
  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
  //
  // Returns the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src,
                                                              const DimensionsMap& dst_to_src_dim_map) {
    // Copy a single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Both `dst` and `src` must have a contiguous innermost dimension. We also
    // accept the special case of stride '0', because it is used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to the dimension map.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dimension; we write data linearly into that dimension.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Outermost dimension in the dst with `stride == 1` (modulo squeezing).
    const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize the block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from `src` to `dst`.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                                                    \
  IndexType num_copied = 0;                                                                     \
  for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) {       \
    LinCopy::template Run<KIND>(typename LinCopy::Dst(output_offset, output_stride, dst.data),  \
                                typename LinCopy::Src(input_offset, input_stride, src.data),    \
                                dst_inner_dim_size);                                            \
                                                                                                \
    for (int j = 0; j < idx; ++j) {                                                             \
      if (++it[j].count < it[j].size) {                                                         \
        input_offset += it[j].input_stride;                                                     \
        output_offset += it[j].output_stride;                                                   \
        break;                                                                                  \
      }                                                                                         \
      it[j].count = 0;                                                                          \
      input_offset -= it[j].input_span;                                                         \
      output_offset -= it[j].output_span;                                                       \
    }                                                                                           \
  }                                                                                             \
  return num_copied;

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Kind::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
  // the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    return Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only
  // if they are not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};
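
// Illustrative sketch (not part of the original header): copying a 2D block
// out of a larger column-major tensor into a dense destination buffer:
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;
//   IO::Dst dst(block_dims, internal::strides<ColMajor>(block_dims), dst_buf);
//   IO::Src src(internal::strides<ColMajor>(tensor_dims), tensor_data,
//               /*src_offset=*/block_offset);
//   IO::Copy(dst, src);  // returns the number of copied coefficients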
// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr`
// to a tensor block defined by `desc`, backed by a memory buffer at `target`.
// The block expression is always materialized into a contiguous buffer first.
template <typename Scalar, int NumDims, typename TensorBlockExpr, typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use the coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice> TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum { Vectorizable = packet_traits<Scalar>::Vectorizable, PacketSize = packet_traits<Scalar>::size };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
      const IndexType vectorized_size = PacketSize * (count / PacketSize);
      IndexType i = 0;

      for (; i < unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i < vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
           IndexType target_offset = 0)
        : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(const DSizes<TargetDimsIndexType, NumDims>& target_dims,
                       const DSizes<TargetStridesIndexType, NumDims>& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    // DSizes constructor will do index type promotion if it's safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) {
    // Prepare an evaluator for the block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimensions should match the destination dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize the output inner dimension size based on the layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // Target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize the output block iterator state. Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read the block expression from the beginning, and start writing data
    // to `target` at the given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate copying data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at the current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, TensorBlockEvaluator>::Run(
          target.data + output_offset, output_inner_dim_size, eval, input_offset);

      // Move the input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update the index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};
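
// Illustrative sketch (not part of the original header): assigning a lazy
// block expression (e.g. SumBlock from the sketch above) into a dense
// destination buffer:
//
//   typedef TensorBlockAssignment<float, 2, SumBlock::XprType> Assign;
//   Assign::Run(Assign::target(block_dims, internal::strides<ColMajor>(block_dims),
//                              dst_buf, /*target_offset=*/0),
//               block.expr());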
}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H