Eigen-unsupported  5.0.1-dev+284dcc12
 
TensorBlock.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// This Source Code Form is subject to the terms of the Mozilla
5// Public License v. 2.0. If a copy of the MPL was not distributed
6// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
8#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
9#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
10
11// IWYU pragma: private
12#include "./InternalHeaderCheck.h"
13
14namespace Eigen {
15namespace internal {
16
17// -------------------------------------------------------------------------- //
18// Forward declarations for templates defined below.
19template <typename Scalar, typename IndexType, int NumDims, int Layout>
20class TensorBlockIO;
21
22// -------------------------------------------------------------------------- //
23// Helper function to compute strides for a densely stored buffer of given
24// dimensions.
25
26// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
27// this function instead everywhere.
28template <int Layout, typename IndexType, int NumDims>
29EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const DSizes<IndexType, NumDims>& dimensions) {
30 DSizes<IndexType, NumDims> strides;
31 if (NumDims == 0) return strides;
32
33 // TODO(ezhulenev): Use templates to unroll this loop (similar to
34 // h_array_reduce in CXX11meta.h)? Benchmark it.
35 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
36 strides[0] = 1;
37 for (int i = 1; i < NumDims; ++i) {
38 strides[i] = strides[i - 1] * dimensions[i - 1];
39 }
40 } else {
41 strides[NumDims - 1] = 1;
42 for (int i = NumDims - 2; i >= 0; --i) {
43 strides[i] = strides[i + 1] * dimensions[i + 1];
44 }
45 }
46
47 return strides;
48}
49
50template <int Layout, typename IndexType, size_t NumDims>
51EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const Eigen::array<IndexType, NumDims>& dimensions) {
52 return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
53}
54
55template <int Layout, std::ptrdiff_t... Indices>
56EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(const Sizes<Indices...>& sizes) {
57 return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
58}
59
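//
// Illustrative usage sketch (not part of the original source): computing dense
// strides with the helpers above for a small 2x3x4 tensor. The expected values
// in the comments follow directly from the column-/row-major recurrences above.
//
//   Eigen::DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   auto col_strides = strides<ColMajor>(dims);  // [1, 2, 6]
//   auto row_strides = strides<RowMajor>(dims);  // [12, 4, 1]
//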
60// -------------------------------------------------------------------------- //
61
62// Tensor block shape type defines the shape preference for the blocks
63// extracted from the larger tensor.
64//
65// Example: blocks of 100 elements from the large 100x100 tensor:
66// - tensor: 100x100
67// - target_block_size: 100
68//
69// TensorBlockShapeType:
70// - kUniformAllDims: 100 blocks of size 10x10
71// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
72// or row major layout)
73enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
74
75struct TensorBlockResourceRequirements {
76 TensorBlockShapeType shape_type; // target block shape
77 size_t size; // target block size
78 TensorOpCost cost_per_coeff; // cost of computing a single block element
79
80#ifdef EIGEN_HIPCC
81 // For HIPCC we need to explicitly declare the constructor below as a "device
82 // function", because it is implicitly invoked in the "merge" / "any" routines;
83 // otherwise HIPCC errors out complaining about the lack of a matching constructor.
84 EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, TensorOpCost cost_)
85 : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
86#endif
87
88 template <typename Scalar>
89 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
90 size_t size_in_bytes, TensorOpCost cost) {
91 const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
92 return {shape_type, size, cost};
93 }
94
95 template <typename Scalar>
96 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
97 size_t size_in_bytes) {
98 // This default cost per coefficient is valid for most materialized tensor
99 // block evaluation implementations, because they typically just read
100 // coefficients from the underlying tensor storage, and write to the tensor
101 // block buffer (scratch or destination memory; reads and writes have a linear
102 // access pattern). We ignore the fixed cost of block evaluation, because in
103 // practice it should be negligible.
104 //
105 // Lazy block evaluation adds the cost of calling a functor for each
106 // coefficient.
107 //
108 // All non-trivial block evaluation implementations must provide their own
109 // cost approximation (e.g. shuffling the inner dimension has a much higher cost
110 // because it reads memory randomly, although the total number of moved
111 // bytes is the same).
112 return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
113 {/*bytes_loaded=*/sizeof(Scalar),
114 /*bytes_stored=*/sizeof(Scalar),
115 /*compute_cycles=*/0});
116 }
117
118 template <typename Scalar>
119 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) {
120 return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes);
121 }
122
123 template <typename Scalar>
124 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) {
125 return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, size_in_bytes);
126 }
127
128 EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
129 merge(const TensorBlockResourceRequirements& lhs, const TensorBlockResourceRequirements& rhs) {
130 return {merge(lhs.shape_type, rhs.shape_type), // shape_type
131 merge(lhs.size, rhs.size), // size
132 merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
133 }
134
135 EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) {
136 cost_per_coeff += cost;
137 return *this;
138 }
139
140 // This is a resource requirement that should be returned from expressions
141 // that do not have any block evaluation preference (e.g. default tensor
142 // expression with raw buffer access).
143 EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
144 return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
145 }
146
147 private:
148 using Requirements = TensorBlockResourceRequirements;
149
150 EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
151 return numext::maxi(lhs_size, rhs_size);
152 }
153
154 EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs,
155 TensorBlockShapeType rhs) {
156 return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims)
157 ? TensorBlockShapeType::kSkewedInnerDims
158 : TensorBlockShapeType::kUniformAllDims;
159 }
160
161 EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) {
162 return lhs_cost + rhs_cost;
163 }
164};
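
// Illustrative usage sketch (not part of the original source): building and
// merging block resource requirements. The sizes below are arbitrary example
// budgets, not recommended values.
//
//   auto lhs = TensorBlockResourceRequirements::skewed<float>(/*size_in_bytes=*/48 * 1024);
//   auto rhs = TensorBlockResourceRequirements::uniform<float>(/*size_in_bytes=*/16 * 1024);
//   auto merged = TensorBlockResourceRequirements::merge(lhs, rhs);
//   // merged.shape_type == TensorBlockShapeType::kSkewedInnerDims (skewed wins),
//   // merged.size == 48 * 1024 / sizeof(float) (the larger of the two budgets).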
165
166// -------------------------------------------------------------------------- //
167// TensorBlockDescriptor specifies a block offset within a tensor and the block
168// sizes along each of the tensor dimensions.
169
170template <int NumDims, typename IndexType = Eigen::Index>
171class TensorBlockDescriptor {
172 public:
173 typedef DSizes<IndexType, NumDims> Dimensions;
174
175 // If we evaluate a Tensor assignment, and the expression on the left already has
176 // a memory buffer, then we might do a performance optimization and evaluate
177 // the root expression directly into the final output memory. Sometimes it's
178 // possible to reuse it for materializing subexpressions inside an expression
179 // tree, to avoid dynamic memory allocation.
180 //
181 // The pointer type of the underlying storage is erased, because passing the
182 // Scalar type through all the expression evaluation layers would require too
183 // many templates. In practice the destination buffer type should always match
184 // the evaluated expression scalar type.
185 class DestinationBuffer {
186 public:
187 enum DestinationBufferKind : int {
188 // The above explicit specification of "int" as the enum basetype is
189 // needed to get around a HIPCC link error ("the field type is not
190 // amp-compatible")
191 // which is issued for class members with the enum type.
192 // TODO(rocm):
193 // remove the "int" basetype once HIPCC has been fixed to not error out
194 // in the above scenario.
195
196 // Destination buffer is not defined (`m_data` == nullptr).
197 kEmpty,
198
199 // The tensor block defined by the owning tensor block descriptor fits
200 // contiguously into the destination buffer. In this case it's safe to
201 // materialize the tensor block in the destination buffer, wrap it in a
202 // TensorMap, and use it to build an Eigen expression on top of it.
203 kContiguous,
204
205 // Destination buffer strides do not match the strides of the contiguously
206 // stored block, and it's impossible to define a TensorMap over this
207 // buffer. However, if we are evaluating the root of an expression tree, we
208 // can still materialize an output into this destination, because we can
209 // guarantee that no one will ever access it through the block API.
210 //
211 // In theory it is possible to build a valid TensorStriding<TensorMap>
212 // expression on top of this destination buffer, however it has
213 // inefficient coeff/packet access, and defeats the purpose of the fast
214 // block evaluation API.
215 kStrided
216 };
217
218 template <typename Scalar>
219 Scalar* data() const {
220 eigen_assert(m_data_type_size == sizeof(Scalar));
221 return static_cast<Scalar*>(m_data);
222 }
223
224 const Dimensions& strides() const { return m_strides; }
225 const DestinationBufferKind& kind() const { return m_kind; }
226
227 private:
228 friend class TensorBlockDescriptor<NumDims, IndexType>;
229
230 DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
231
232 template <typename Scalar>
233 DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind)
234 : m_data(static_cast<void*>(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {}
235
236 template <int Layout, typename Scalar>
237 static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) {
238 return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
239 }
240
241 template <int Layout>
242 static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) {
243 const Dimensions& desc_dims = desc.dimensions();
244 const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
245 for (int i = 0; i < NumDims; ++i) {
246 if (desc_dims[i] == 1) continue;
247 if (desc_strides[i] != strides[i]) return kStrided;
248 }
249 return kContiguous;
250 }
251
252 // Storage pointer is type erased, to reduce template bloat, but we still
253 // keep the size of the underlying element type for error checking.
254 void* m_data;
255 size_t m_data_type_size;
256
257 // Destination buffer dimensions always match the dimensions of the tensor
258 // block descriptor it belongs to; however, the strides might be different.
259 Dimensions m_strides;
260
261 DestinationBufferKind m_kind;
262 };
263
264 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination)
265 : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {}
266
267 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
268 : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {}
269
270 IndexType offset() const { return m_offset; }
271 const Dimensions& dimensions() const { return m_dimensions; }
272 IndexType dimension(int index) const { return m_dimensions[index]; }
273 IndexType size() const { return array_prod<IndexType>(m_dimensions); }
274
275 const DestinationBuffer& destination() const { return m_destination; }
276
277 template <int Layout, typename Scalar>
278 void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
279 eigen_assert(dst_base != NULL);
280 m_destination = DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
281 }
282
283 template <int Layout, typename Scalar, typename DstStridesIndexType>
284 void AddDestinationBuffer(Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
285 // DSizes constructor will do index type promotion if it's safe.
286 AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
287 }
288
289 TensorBlockDescriptor& DropDestinationBuffer() {
290 m_destination.m_data = NULL;
291 m_destination.m_kind = DestinationBuffer::kEmpty;
292 return *this;
293 }
294
295 bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; }
296
297 // Returns a copy of `*this` with updated offset.
298 TensorBlockDescriptor WithOffset(IndexType offset) const {
299 return TensorBlockDescriptor(offset, m_dimensions, m_destination);
300 }
301
302 private:
303 // Offset and dimensions are immutable after construction. The block descriptor
304 // can only be mutated by adding or dropping the destination buffer.
305 const IndexType m_offset;
306 const Dimensions m_dimensions;
307 DestinationBuffer m_destination;
308};
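
// Illustrative usage sketch (not part of the original source): describing a 2x4
// block that starts at linear offset 6 of a larger tensor, and attaching a
// destination buffer so the block can be materialized directly into it. The
// `dst` pointer is a placeholder for caller-provided storage.
//
//   typedef TensorBlockDescriptor<2> BlockDesc;
//   BlockDesc::Dimensions block_dims(2, 4);
//   BlockDesc desc(/*offset=*/6, block_dims);
//
//   float* dst = /* caller-provided buffer of at least 8 floats */;
//   desc.AddDestinationBuffer<ColMajor>(dst, strides<ColMajor>(block_dims));
//   eigen_assert(desc.HasDestinationBuffer());
//   // desc.destination().kind() is kContiguous, because the provided strides
//   // match the block's own dense strides.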
309
310// -------------------------------------------------------------------------- //
311// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
312
313template <int NumDims, int Layout, typename IndexType = Eigen::Index>
314class TensorBlockMapper {
315 typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
316
317 public:
318 typedef DSizes<IndexType, NumDims> Dimensions;
319
320 TensorBlockMapper() = default;
321 TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, const TensorBlockResourceRequirements& requirements)
322 : m_tensor_dimensions(dimensions), m_requirements(requirements) {
323 // Compute block dimensions and the total number of blocks.
324 InitializeBlockDimensions();
325 }
326
327 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; }
328
329 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); }
330
331 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>& blockDimensions() const {
332 return m_block_dimensions;
333 }
334
335 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const {
336 static const bool isColMajor = Layout == static_cast<int>(ColMajor);
337
338 IndexType offset = 0;
339 DSizes<IndexType, NumDims> dimensions;
340
341 if (NumDims == 0) return BlockDescriptor(offset, dimensions);
342
343 // Iterate outer -> inner dimensions.
344 for (int i = NumDims - 1; i >= 0; --i) {
345 const int dim = isColMajor ? i : NumDims - i - 1;
346
347 const IndexType idx = block_index / m_block_strides[dim];
348 block_index -= idx * m_block_strides[dim];
349
350 const IndexType coord = idx * m_block_dimensions[dim];
351 dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]);
352 offset += coord * m_tensor_strides[dim];
353 }
354
355 return {offset, dimensions};
356 }
357
358 private:
359 void InitializeBlockDimensions() {
360 // Requested block shape and size.
361 const TensorBlockShapeType shape_type = m_requirements.shape_type;
362 IndexType target_block_size = numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
363
364 IndexType tensor_size = m_tensor_dimensions.TotalSize();
365
366 // Corner case: one of the dimensions is zero. The logic below is too complex
367 // to handle this case on a general basis, so just use a unit block size.
368 // Note: we must not yield blocks with zero dimensions (recipe for
369 // overflows/underflows, divisions by zero and NaNs later).
370 if (tensor_size == 0) {
371 for (int i = 0; i < NumDims; ++i) {
372 m_block_dimensions[i] = 1;
373 }
374 m_total_block_count = 0;
375 return;
376 }
377
378 // If tensor fits into a target block size, evaluate it as a single block.
379 if (tensor_size <= target_block_size) {
380 m_block_dimensions = m_tensor_dimensions;
381 m_total_block_count = 1;
382 // The only valid block index is `0`, and in this case we do not need
383 // to compute real strides for tensor or blocks (see blockDescriptor).
384 for (int i = 0; i < NumDims; ++i) {
385 m_tensor_strides[i] = 0;
386 m_block_strides[i] = 1;
387 }
388 return;
389 }
390
391 static const bool isColMajor = Layout == static_cast<int>(ColMajor);
392
393 // Block shape skewed towards inner dimension.
394 if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
395 IndexType coeff_to_allocate = target_block_size;
396
397 for (int i = 0; i < NumDims; ++i) {
398 const int dim = isColMajor ? i : NumDims - i - 1;
399 m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
400 coeff_to_allocate =
401 numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
402 }
403 eigen_assert(coeff_to_allocate == 1);
404
405 } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
406 // The tensor will not fit within the 'target_block_size' budget: calculate
407 // tensor block dimension sizes based on a "square" dimension size target.
408 const IndexType dim_size_target = convert_index<IndexType>(
409 std::pow(static_cast<float>(target_block_size), 1.0f / static_cast<float>(m_block_dimensions.rank())));
410
411 for (int i = 0; i < NumDims; ++i) {
412 // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
413 // a multiple of the packet size. Note that reducing
414 // 'block_dim_size' in this manner can increase the number of
415 // blocks, and so will amplify any per-block overhead.
416 m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]);
417 }
418
419 // Add any un-allocated coefficients to inner dimension(s).
420 IndexType total_size = m_block_dimensions.TotalSize();
421 for (int i = 0; i < NumDims; ++i) {
422 const int dim = isColMajor ? i : NumDims - i - 1;
423
424 if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
425 const IndexType total_size_other_dims = total_size / m_block_dimensions[dim];
426 const IndexType alloc_avail = numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);
427 if (alloc_avail == m_block_dimensions[dim]) {
428 // Insufficient excess coefficients to allocate.
429 break;
430 }
431 m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail);
432 total_size = total_size_other_dims * m_block_dimensions[dim];
433 }
434 }
435
436 } else {
437 eigen_assert(false); // unknown block shape
438 }
439
440 eigen_assert(m_block_dimensions.TotalSize() >=
441 numext::mini<IndexType>(target_block_size, m_tensor_dimensions.TotalSize()));
442
443 // Calculate block counts by dimension and total block count.
444 DSizes<IndexType, NumDims> block_count;
445 for (int i = 0; i < NumDims; ++i) {
446 block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
447 }
448 m_total_block_count = array_prod(block_count);
449
450 // Calculate block strides (used for enumerating blocks).
451 m_tensor_strides = strides<Layout>(m_tensor_dimensions);
452 m_block_strides = strides<Layout>(block_count);
453 }
454
455 DSizes<IndexType, NumDims> m_tensor_dimensions;
456 TensorBlockResourceRequirements m_requirements;
457
458 DSizes<IndexType, NumDims> m_block_dimensions;
459 IndexType m_total_block_count;
460
461 DSizes<IndexType, NumDims> m_tensor_strides;
462 DSizes<IndexType, NumDims> m_block_strides;
463};
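
// Illustrative usage sketch (not part of the original source): splitting a
// 100x100 tensor into blocks of at most 100 coefficients (the example from the
// TensorBlockShapeType comment above) and visiting every block.
//
//   DSizes<Eigen::Index, 2> dims(100, 100);
//   auto reqs = TensorBlockResourceRequirements::uniform<float>(100 * sizeof(float));
//   TensorBlockMapper<2, ColMajor> mapper(dims, reqs);
//
//   // With a uniform shape the mapper picks ~10x10 blocks, 100 blocks in total.
//   for (Eigen::Index b = 0; b < mapper.blockCount(); ++b) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(b);
//     // desc.offset() and desc.dimensions() describe one block of the tensor.
//   }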
464
465// -------------------------------------------------------------------------- //
466// TensorBlockScratchAllocator is responsible for allocating temporary buffers
467// for block evaluation (output or input block materialization). Given that
468 // Eigen expression traversal order is deterministic, all temporary allocations
469 // happen in the same order and usually have exactly the same size. The
470 // scratch allocator keeps track of all dynamic allocations, and after the
471 // first block evaluation is completed, we should be able to reuse all the
472// temporary buffers for the next block evaluation.
473
474template <typename Device>
475class TensorBlockScratchAllocator {
476 public:
477 explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {}
478
479 ~TensorBlockScratchAllocator() {
480 for (size_t i = 0; i < m_allocations.size(); ++i) {
481 m_device.deallocate(m_allocations[i].ptr);
482 }
483 }
484
485 void* allocate(size_t size) {
486 // TODO(ezhulenev): Remove when replaced with inlined vector.
487 if (m_allocations.capacity() == 0) m_allocations.reserve(8);
488
489 // Check if we already have an existing allocation at the current index.
490 const int num_allocations = static_cast<int>(m_allocations.size());
491 const bool has_allocation = m_allocation_index < num_allocations;
492
493 // Allocation index can't be larger than the number of allocations.
494 eigen_assert(m_allocation_index <= num_allocations);
495
496 // If we have an existing allocation, and its size is larger than or equal to
497 // the requested size, we do nothing.
498
499 // If the current allocation can't fit the requested size, we deallocate it,
500 // and replace it with a larger allocation.
501 if (has_allocation && m_allocations[m_allocation_index].size < size) {
502 m_device.deallocate(m_allocations[m_allocation_index].ptr);
503 m_allocations[m_allocation_index].ptr = m_device.allocate(size);
504 m_allocations[m_allocation_index].size = size;
505 }
506
507 // Make a new allocation if we don't have an existing one.
508 if (!has_allocation) {
509 Allocation allocation;
510 allocation.ptr = m_device.allocate(size);
511 allocation.size = size;
512 m_allocations.push_back(allocation);
513 }
514
515 eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
516 eigen_assert(m_allocations[m_allocation_index].size >= size);
517
518 return m_allocations[m_allocation_index++].ptr;
519 }
520
521 void reset() { m_allocation_index = 0; }
522
523 private:
524 struct Allocation {
525 void* ptr;
526 size_t size;
527 };
528
529 const Device& m_device;
530 int m_allocation_index;
531 // TODO(ezhulenev): This should be an inlined vector.
532 std::vector<Allocation> m_allocations;
533};
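
// Illustrative usage sketch (not part of the original source): reusing scratch
// buffers across block evaluations on the default (CPU) device. `num_blocks`
// and `block_size_in_bytes` are placeholders for values owned by the caller.
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//
//   for (Eigen::Index b = 0; b < num_blocks; ++b) {
//     void* buffer = scratch.allocate(block_size_in_bytes);
//     // ... materialize block `b` into `buffer` ...
//     scratch.reset();  // keep the allocation, reuse it for the next block
//   }
//   // All buffers are deallocated when `scratch` goes out of scope.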
534
535// -------------------------------------------------------------------------- //
536 // TensorBlockKind represents all possible block kinds that can be produced by
537// TensorEvaluator::evalBlock function.
538enum TensorBlockKind {
539 // Tensor block that is a lazy expression that must be assigned to a
540 // destination using TensorBlockAssign.
541 kExpr,
542
543 // Tensor block that is a view into a memory buffer owned by an underlying
544 // Tensor expression (e.g. it can be a view into a Tensor buffer).
545 kView,
546
547 // Tensor block that was materialized in a scratch memory buffer, allocated
548 // with TensorBlockScratchAllocator. This block must be copied to a
549 // destination, similar to a block of `kExpr` type.
550 kMaterializedInScratch,
551
552 // Tensor block that was materialized directly into the final output memory
553 // buffer. For example if the left side of an assignment is a Tensor, we can
554 // directly materialize the block in the destination memory.
555 //
556 // If strides in the output buffer do not match tensor block strides, the
557 // Tensor expression will be invalid, and should not be used by
558 // TensorBlockAssign or for constructing another block expression.
559 kMaterializedInOutput
560};
561
562// -------------------------------------------------------------------------- //
563 // TensorBlockNotImplemented should be used to define the TensorBlock typedef in
564// TensorEvaluators that do not support block evaluation.
565
566class TensorBlockNotImplemented {
567 public:
568 typedef void XprType;
569};
570
571// -------------------------------------------------------------------------- //
572 // XprScalar extracts the Scalar type from Eigen expressions (if the expression
573 // type is not void). It's required to be able to define a lazy block expression
574 // for argument types that do not support block evaluation.
575
576template <typename XprType>
577struct XprScalar {
578 typedef typename XprType::Scalar type;
579};
580template <>
581struct XprScalar<void> {
582 typedef void type;
583};
584
585// -------------------------------------------------------------------------- //
586// TensorMaterializedBlock is a fully evaluated block of the original tensor,
587// and XprType is just a TensorMap over the data. This block type is typically
588 // used to materialize blocks of tensor expressions that can't be efficiently
589// represented as lazy Tensor expressions with fast coeff/packet operations,
590// e.g. we materialize all broadcasts into evaluated blocks.
591//
592 // TensorMaterializedBlock does not own its memory buffer: it's either a memory
593 // buffer that backs the original expression (e.g. the block is just a view into a
594 // Tensor), or a memory buffer allocated with the scratch allocator, in which
595 // case the scratch allocator will deallocate it at the end of block-based
596 // expression execution.
597//
598// If the block was evaluated directly into the output buffer, and strides in
599// the output buffer do not match block strides, the TensorMap expression will
600// be invalid, and should never be used in block assignment or any other tensor
601// expression.
602
603template <typename Scalar, int NumDims, int Layout, typename IndexType = Eigen::Index>
604class TensorMaterializedBlock {
605 public:
606 typedef DSizes<IndexType, NumDims> Dimensions;
607 typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
608
609 TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions,
610 bool valid_expr = true)
611 : m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) {
612 eigen_assert(m_kind == internal::TensorBlockKind::kView ||
613 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
614 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
615 }
616
617 TensorBlockKind kind() const { return m_kind; }
618 // NOTE(ezhulenev): Returning XprType by value like in other block types
619 // causes asan failures. The theory is that XprType::Nested doesn't work
620 // properly for TensorMap.
621 const XprType& expr() const {
622 eigen_assert(m_valid_expr);
623 return m_expr;
624 }
625 const Scalar* data() const { return m_data; }
626 void cleanup() {}
627
628 typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
629
630 // TensorMaterializedBlock can be backed by different types of storage:
631 //
632 // (1) Contiguous block of memory allocated with scratch allocator.
633 // (2) Contiguous block of memory reused from tensor block descriptor
634 // destination buffer.
635 // (3) Strided block of memory reused from tensor block descriptor
636 // destination buffer.
637 //
638 class Storage {
639 public:
640 Scalar* data() const { return m_data; }
641 const Dimensions& dimensions() const { return m_dimensions; }
642 const Dimensions& strides() const { return m_strides; }
643
644 TensorMaterializedBlock AsTensorMaterializedBlock() const {
645 return TensorMaterializedBlock(m_materialized_in_output ? internal::TensorBlockKind::kMaterializedInOutput
646 : internal::TensorBlockKind::kMaterializedInScratch,
647 m_data, m_dimensions, !m_strided_storage);
648 }
649
650 private:
651 friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;
652
653 Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output,
654 bool strided_storage)
655 : m_data(data),
656 m_dimensions(dimensions),
657 m_strides(strides),
658 m_materialized_in_output(materialized_in_output),
659 m_strided_storage(strided_storage) {}
660
661 Scalar* m_data;
662 Dimensions m_dimensions;
663 Dimensions m_strides;
664 bool m_materialized_in_output;
665 bool m_strided_storage;
666 };
667
668 // Creates storage for a materialized block, either reusing the block descriptor
669 // destination buffer or allocating a new buffer with the scratch allocator.
670 template <typename TensorBlockScratch>
671 EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch,
672 bool allow_strided_storage = false) {
673 // Try to reuse destination as an output block buffer.
674 typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
675
676 if (desc.destination().kind() == DestinationBuffer::kContiguous) {
677 Scalar* buffer = desc.destination().template data<Scalar>();
678 desc.DropDestinationBuffer();
679 return Storage(buffer, desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
680 /*materialized_in_output=*/true,
681 /*strided_storage=*/false);
682
683 } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) {
684 Scalar* buffer = desc.destination().template data<Scalar>();
685 desc.DropDestinationBuffer();
686 return Storage(buffer, desc.dimensions(), desc.destination().strides(),
687 /*materialized_in_output=*/true, /*strided_storage=*/true);
688
689 } else {
690 void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
691 return Storage(static_cast<Scalar*>(mem), desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
692 /*materialized_in_output=*/false,
693 /*strided_storage=*/false);
694 }
695 }
696
697 // Creates a materialized block for the given descriptor from a memory buffer.
698 template <typename DataDimensions, typename TensorBlockScratch>
699 EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims,
700 TensorBlockDesc& desc, TensorBlockScratch& scratch) {
701 eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
702
703 // If the tensor block dimensions cover a contiguous block of the underlying
704 // memory, we can skip the block buffer memory allocation, and construct a block
705 // from the existing `data` memory buffer.
706 //
707 // Example: (RowMajor layout)
708 // data_dims: [11, 12, 13, 14]
709 // desc.dimensions(): [1, 1, 3, 14]
710 //
711 // In this case we can construct a TensorBlock starting at
712 // `data + desc.offset()`, with block sizes equal to `desc.dimensions()`.
713 static const bool is_col_major = Layout == ColMajor;
714
715 // Find out how many inner dimensions have a matching size.
716 int num_matching_inner_dims = 0;
717 for (int i = 0; i < NumDims; ++i) {
718 int dim = is_col_major ? i : NumDims - i - 1;
719 if (data_dims[dim] != desc.dimensions()[dim]) break;
720 ++num_matching_inner_dims;
721 }
722
723 // All the outer dimensions must be of size `1`, except a single dimension
724 // before the matching inner dimension (`3` in the example above).
725 bool can_use_direct_access = true;
726 for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
727 int dim = is_col_major ? i : NumDims - i - 1;
728 if (desc.dimension(dim) != 1) {
729 can_use_direct_access = false;
730 break;
731 }
732 }
733
734 if (can_use_direct_access) {
735 const Scalar* block_start = data + desc.offset();
736 return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions());
737
738 } else {
739 // Reuse destination buffer or allocate new buffer with scratch allocator.
740 const Storage storage = prepareStorage(desc, scratch);
741
742 typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> TensorBlockIO;
743 typedef typename TensorBlockIO::Dst TensorBlockIODst;
744 typedef typename TensorBlockIO::Src TensorBlockIOSrc;
745
746 TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), data, desc.offset());
747 TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data());
748
749 TensorBlockIO::Copy(dst, src);
750 return storage.AsTensorMaterializedBlock();
751 }
752 }
753
754 private:
755 TensorBlockKind m_kind;
756 const Scalar* m_data;
757 Dimensions m_dimensions;
758 XprType m_expr;
759 bool m_valid_expr;
760};
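
// Illustrative usage sketch (not part of the original source): materializing a
// 4x4 block out of an 8x8 column-major buffer. `data` and `device` are
// placeholders for caller-provided storage and a device instance. Because the
// requested block is not contiguous in `data`, the coefficients are copied via
// TensorBlockIO into scratch memory and the result is kMaterializedInScratch;
// a request covering whole leading columns would instead yield a cheap kView.
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> Block;
//   Block::Dimensions data_dims(8, 8), block_dims(4, 4);
//   Block::TensorBlockDesc desc(/*offset=*/0, block_dims);
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//
//   Block block = Block::materialize(data, data_dims, desc, scratch);
//   // block.expr() is a TensorMap over the materialized 4x4 coefficients.
//   block.cleanup();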
761
762// -------------------------------------------------------------------------- //
763 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies a UnaryOp
764// functor to the blocks produced by the underlying Tensor expression.
765
766template <typename UnaryOp, typename ArgTensorBlock>
767class TensorCwiseUnaryBlock {
768 static constexpr bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value;
769
770 public:
771 typedef std::conditional_t<NoArgBlockAccess, void,
772 TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
773 XprType;
774
775 typedef typename XprScalar<XprType>::type Scalar;
776
777 TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
778 : m_arg_block(arg_block), m_functor(functor) {}
779
780 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
781
782 XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
783 const Scalar* data() const { return NULL; }
784 void cleanup() { m_arg_block.cleanup(); }
785
786 private:
787 ArgTensorBlock m_arg_block;
788 UnaryOp m_functor;
789};
790
791// -------------------------------------------------------------------------- //
792 // TensorCwiseBinaryBlock is a lazy tensor expression block that applies a BinaryOp
793 // functor to the blocks produced by the underlying Tensor expressions.
794
795template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
796class TensorCwiseBinaryBlock {
797 static constexpr bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value ||
798 internal::is_void<typename RhsTensorBlock::XprType>::value;
799
800 public:
801 typedef std::conditional_t<
802 NoArgBlockAccess, void,
803 TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType> >
804 XprType;
805
806 typedef typename XprScalar<XprType>::type Scalar;
807
808 TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor)
809 : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {}
810
811 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
812
813 XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); }
814
815 const Scalar* data() const { return NULL; }
816
817 void cleanup() {
818 m_left_block.cleanup();
819 m_right_block.cleanup();
820 }
821
822 private:
823 LhsTensorBlock m_left_block;
824 RhsTensorBlock m_right_block;
825 BinaryOp m_functor;
826};
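
// Illustrative usage sketch (not part of the original source): how a binary
// evaluator typically wraps the blocks produced by its argument evaluators.
// `LeftBlock`/`RightBlock` and the `left`/`right` values are placeholders for
// the block types and blocks returned by the argument evaluators.
//
//   typedef TensorCwiseBinaryBlock<scalar_sum_op<float>, LeftBlock, RightBlock> SumBlock;
//   SumBlock block(left, right, scalar_sum_op<float>());
//   // block.expr() is a lazy TensorCwiseBinaryOp that TensorBlockAssignment can
//   // evaluate into a destination buffer; block.cleanup() releases the arguments.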
827
828// -------------------------------------------------------------------------- //
829// TensorUnaryExprBlock is a lazy tensor expression block that can construct
830// an arbitrary tensor expression from a block of the underlying type (this is a
831// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
832
833template <typename BlockFactory, typename ArgTensorBlock>
834class TensorUnaryExprBlock {
835 typedef typename ArgTensorBlock::XprType ArgXprType;
836 static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
837
838 public:
839 typedef std::conditional_t<NoArgBlockAccess, void, typename BlockFactory::template XprType<ArgXprType>::type> XprType;
840
841 typedef typename XprScalar<XprType>::type Scalar;
842
843 TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory)
844 : m_arg_block(arg_block), m_factory(factory) {}
845
846 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
847 XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
848 const Scalar* data() const { return NULL; }
849 void cleanup() { m_arg_block.cleanup(); }
850
851 private:
852 ArgTensorBlock m_arg_block;
853 BlockFactory m_factory;
854};
855
856// -------------------------------------------------------------------------- //
857// TensorTernaryExprBlock is a lazy tensor expression block that can construct
858// an arbitrary tensor expression from three blocks of the underlying type.
859
860template <typename BlockFactory, typename Arg1TensorBlock, typename Arg2TensorBlock, typename Arg3TensorBlock>
861class TensorTernaryExprBlock {
862 typedef typename Arg1TensorBlock::XprType Arg1XprType;
863 typedef typename Arg2TensorBlock::XprType Arg2XprType;
864 typedef typename Arg3TensorBlock::XprType Arg3XprType;
865
866 static constexpr bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
867 internal::is_void<Arg2XprType>::value ||
868 internal::is_void<Arg3XprType>::value;
869
870 public:
871 typedef std::conditional_t<NoArgBlockAccess, void,
872 typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, Arg3XprType>::type>
873 XprType;
874
875 typedef typename XprScalar<XprType>::type Scalar;
876
877 TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block,
878 const Arg3TensorBlock& arg3_block, const BlockFactory& factory)
879 : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {}
880
881 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
882 XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); }
883 const Scalar* data() const { return NULL; }
884 void cleanup() {
885 m_arg1_block.cleanup();
886 m_arg2_block.cleanup();
887 m_arg3_block.cleanup();
888 }
889
890 private:
891 Arg1TensorBlock m_arg1_block;
892 Arg2TensorBlock m_arg2_block;
893 Arg3TensorBlock m_arg3_block;
894 BlockFactory m_factory;
895};
896
897// -------------------------------------------------------------------------- //
898// StridedLinearBufferCopy provides a method to copy data between two linear
899// buffers with different strides, with optimized paths for scatter/gather.
900
901template <typename Scalar, typename IndexType>
902class StridedLinearBufferCopy {
903 typedef typename packet_traits<Scalar>::type Packet;
904 typedef typename unpacket_traits<Packet>::half HalfPacket;
905 enum {
906 Vectorizable = packet_traits<Scalar>::Vectorizable,
907 PacketSize = packet_traits<Scalar>::size,
908 HalfPacketSize = unpacket_traits<HalfPacket>::size,
909 HasHalfPacket = static_cast<int>(HalfPacketSize) < static_cast<int>(PacketSize)
910 };
911
912 public:
913 // Specifying linear copy kind statically gives ~30% speedup for small sizes.
914 enum class Kind {
915 Linear = 0, // src_stride == 1 && dst_stride == 1
916 Scatter = 1, // src_stride == 1 && dst_stride != 1
917 FillLinear = 2, // src_stride == 0 && dst_stride == 1
918 FillScatter = 3, // src_stride == 0 && dst_stride != 1
919 Gather = 4, // dst_stride == 1
920 Random = 5 // everything else
921 };
922
923 struct Dst {
924 Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
925
926 IndexType offset;
927 IndexType stride;
928 Scalar* data;
929 };
930
931 struct Src {
932 Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {}
933
934 IndexType offset;
935 IndexType stride;
936 const Scalar* data;
937 };
938
939 template <typename StridedLinearBufferCopy::Kind kind>
940 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) {
941 Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data);
942 }
943
944 private:
945 template <typename StridedLinearBufferCopy::Kind kind>
946 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset,
947 const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
948 const IndexType src_offset, const IndexType src_stride,
949 const Scalar* EIGEN_RESTRICT src_data) {
950 const Scalar* src = &src_data[src_offset];
951 Scalar* dst = &dst_data[dst_offset];
952
953 if (!Vectorizable) {
954 for (Index i = 0; i < count; ++i) {
955 dst[i * dst_stride] = src[i * src_stride];
956 }
957 return;
958 }
959
960 const IndexType vectorized_size = PacketSize * (count / PacketSize);
961 IndexType i = 0;
962
963 if (kind == StridedLinearBufferCopy::Kind::Linear) {
964 // ******************************************************************** //
965 // Linear copy from `src` to `dst`.
966 const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
967 eigen_assert(src_stride == 1 && dst_stride == 1);
968 for (; i < unrolled_size; i += 4 * PacketSize) {
969 for (int j = 0; j < 4; ++j) {
970 Packet p = ploadu<Packet>(src + i + j * PacketSize);
971 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
972 }
973 }
974 for (; i < vectorized_size; i += PacketSize) {
975 Packet p = ploadu<Packet>(src + i);
976 pstoreu<Scalar, Packet>(dst + i, p);
977 }
978 if (HasHalfPacket) {
979 const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
980 if (i < vectorized_half_size) {
981 HalfPacket p = ploadu<HalfPacket>(src + i);
982 pstoreu<Scalar, HalfPacket>(dst + i, p);
983 i += HalfPacketSize;
984 }
985 }
986 for (; i < count; ++i) {
987 dst[i] = src[i];
988 }
989 // ******************************************************************** //
990 } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
991 // Scatter from `src` to `dst`.
992 eigen_assert(src_stride == 1 && dst_stride != 1);
993 for (; i < vectorized_size; i += PacketSize) {
994 Packet p = ploadu<Packet>(src + i);
995 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
996 }
997 if (HasHalfPacket) {
998 const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
999 if (i < vectorized_half_size) {
1000 HalfPacket p = ploadu<HalfPacket>(src + i);
1001 pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
1002 i += HalfPacketSize;
1003 }
1004 }
1005 for (; i < count; ++i) {
1006 dst[i * dst_stride] = src[i];
1007 }
1008 // ******************************************************************** //
1009 } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1010 // Fill `dst` with value at `*src`.
1011 eigen_assert(src_stride == 0 && dst_stride == 1);
1012
1013 const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
1014 Scalar s = *src;
1015 Packet p = pset1<Packet>(s);
1016 for (; i < unrolled_size; i += 4 * PacketSize) {
1017 for (int j = 0; j < 4; ++j) {
1018 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1019 }
1020 }
1021 for (; i < vectorized_size; i += PacketSize) {
1022 pstoreu<Scalar, Packet>(dst + i, p);
1023 }
1024 if (HasHalfPacket) {
1025 const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
1026 if (i < vectorized_half_size) {
1027 HalfPacket hp = pset1<HalfPacket>(s);
1028 pstoreu<Scalar, HalfPacket>(dst + i, hp);
1029 i += HalfPacketSize;
1030 }
1031 }
1032 for (; i < count; ++i) {
1033 dst[i] = s;
1034 }
1035 // ******************************************************************** //
1036 } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1037 // Scatter `*src` into `dst`.
1038 eigen_assert(src_stride == 0 && dst_stride != 1);
1039 Scalar s = *src;
1040 Packet p = pset1<Packet>(s);
1041 for (; i < vectorized_size; i += PacketSize) {
1042 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1043 }
1044 if (HasHalfPacket) {
1045 const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
1046 if (i < vectorized_half_size) {
1047 HalfPacket hp = pset1<HalfPacket>(s);
1048 pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
1049 i += HalfPacketSize;
1050 }
1051 }
1052 for (; i < count; ++i) {
1053 dst[i * dst_stride] = s;
1054 }
1055 // ******************************************************************** //
1056 } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1057 // Gather from `src` into `dst`.
1058 eigen_assert(dst_stride == 1);
1059 for (; i < vectorized_size; i += PacketSize) {
1060 Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1061 pstoreu<Scalar, Packet>(dst + i, p);
1062 }
1063 if (HasHalfPacket) {
1064 const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
1065 if (i < vectorized_half_size) {
1066 HalfPacket p = pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
1067 pstoreu<Scalar, HalfPacket>(dst + i, p);
1068 i += HalfPacketSize;
1069 }
1070 }
1071 for (; i < count; ++i) {
1072 dst[i] = src[i * src_stride];
1073 }
1074 // ******************************************************************** //
1075 } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1076 // Random.
1077 for (; i < count; ++i) {
1078 dst[i * dst_stride] = src[i * src_stride];
1079 }
1080 } else {
1081 eigen_assert(false);
1082 }
1083 }
1084};
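
// Illustrative usage sketch (not part of the original source): scattering a
// packed row of 8 floats into a destination with stride 3 using the statically
// selected copy kind.
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> Copy;
//   float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//   float dst[22] = {};
//
//   Copy::Run<Copy::Kind::Scatter>(
//       Copy::Dst(/*offset=*/0, /*stride=*/3, dst),
//       Copy::Src(/*offset=*/0, /*stride=*/1, src),
//       /*count=*/8);
//   // dst[0] == 0, dst[3] == 1, dst[6] == 2, ..., dst[21] == 7.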
1085
1086// -------------------------------------------------------------------------- //
1087 // TensorBlockIO copies data from the `src` tensor block to the `dst` tensor block.
1088 // It's possible to specify a src->dst dimension mapping for the copy operation.
1089 // The dimensions of `dst` specify how many elements have to be copied; for
1090 // `src` we only need the strides to navigate through the source memory buffer.
1091
1092template <typename Scalar, typename IndexType, int NumDims, int Layout>
1093class TensorBlockIO {
1094 static constexpr bool IsColMajor = (Layout == ColMajor);
1095
1096 typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1097
1098 public:
1099 typedef DSizes<IndexType, NumDims> Dimensions;
1100 typedef DSizes<int, NumDims> DimensionsMap;
1101
1102 struct Dst {
1103 Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0)
1104 : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1105
1106 Dimensions dims;
1107 Dimensions strides;
1108 Scalar* data;
1109 IndexType offset;
1110 };
1111
1112 struct Src {
1113 Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0)
1114 : strides(src_strides), data(src), offset(src_offset) {}
1115
1116 Dimensions strides;
1117 const Scalar* data;
1118 IndexType offset;
1119 };
1120
1121 // Copies data to `dst` from `src`, using provided dimensions mapping:
1122 //
1123 // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1124 //
1125 // Returns the number of copied elements.
1126 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src,
1127 const DimensionsMap& dst_to_src_dim_map) {
1128 // Copy single scalar value from `src` to `dst`.
1129 if (NumDims == 0) {
1130 *(dst.data + dst.offset) = *(src.data + src.offset);
1131 return 1;
1132 }
1133
1134 // Both `dst` and `src` must have a contiguous innermost dimension. We also
1135 // accept the special case with stride '0', because it's used as a trick to
1136 // implement broadcasting.
1137 {
1138 int inner_dim = IsColMajor ? 0 : NumDims - 1;
1139 EIGEN_UNUSED_VARIABLE(inner_dim);
1140 eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1141 eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1142 }
1143
1144 // Give a shorter name to `dst_to_src_dim_map`.
1145 const DimensionsMap& dim_map = dst_to_src_dim_map;
1146
1147 // Do not squeeze reordered inner dimensions.
1148 int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1149
1150 // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1151 // block, and we write data linearly into that dimension, reading it from
1152 // the src. If dimensions are reordered, we might end up reading data from
1153 // the src with `stride != 1`.
1154 //
1155 // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1156 // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1157
1158 // Find the innermost dimension in the dst whose size is not 1. This is the
1159 // effective inner dim.
1160 int num_size_one_inner_dims = 0;
1161 for (int i = 0; i < num_squeezable_dims; ++i) {
1162 const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1163 if (dst.dims[dst_dim] != 1) break;
1164 num_size_one_inner_dims++;
1165 }
1166
1167 // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1168 if (num_size_one_inner_dims == NumDims) {
1169 *(dst.data + dst.offset) = *(src.data + src.offset);
1170 return 1;
1171 }
1172
1173 // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1174 const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1;
1175
1176 // Dimension in the src that corresponds to the dst innermost dimension.
1177 const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1178
1179 // Size of the innermost dimension (length of contiguous blocks of memory).
1180 IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1181
1182 // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1183 // `src` memory, so we can do fewer linear copy calls.
1184 for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1185 const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1186 const IndexType dst_stride = dst.strides[dst_dim];
1187 const IndexType src_stride = src.strides[dim_map[dst_dim]];
1188 if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1189 dst_inner_dim_size *= dst.dims[dst_dim];
1190 ++num_size_one_inner_dims;
1191 } else {
1192 break;
1193 }
1194 }
1195
1196 // Setup strides to read data from `src` and write to `dst`.
1197 IndexType input_offset = src.offset;
1198 IndexType output_offset = dst.offset;
1199 IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1200 IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1201
1202 const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1203 array<BlockIteratorState, at_least_1_dim> it;
1204
1205 // Initialize block iterator state. Squeeze away any dimension of size 1.
1206 int idx = 0; // currently initialized iterator state index
1207 for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1208 const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1209 if (dst.dims[dst_dim] == 1) continue;
1210
1211 it[idx].size = dst.dims[dst_dim];
1212 it[idx].input_stride = src.strides[dim_map[dst_dim]];
1213 it[idx].output_stride = dst.strides[dst_dim];
1214
1215 it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1216 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1217
1218 idx++;
1219 }
1220
1221 // Iterate copying data from src to dst.
1222 const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1223
1224#define COPY_INNER_DIM(KIND) \
1225 IndexType num_copied = 0; \
1226 for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) { \
1227 LinCopy::template Run<KIND>(typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1228 typename LinCopy::Src(input_offset, input_stride, src.data), dst_inner_dim_size); \
1229 \
1230 for (int j = 0; j < idx; ++j) { \
1231 if (++it[j].count < it[j].size) { \
1232 input_offset += it[j].input_stride; \
1233 output_offset += it[j].output_stride; \
1234 break; \
1235 } \
1236 it[j].count = 0; \
1237 input_offset -= it[j].input_span; \
1238 output_offset -= it[j].output_span; \
1239 } \
1240 } \
1241 return num_copied;
1242
1243 if (input_stride == 1 && output_stride == 1) {
1244 COPY_INNER_DIM(LinCopy::Kind::Linear);
1245 } else if (input_stride == 1 && output_stride != 1) {
1246 COPY_INNER_DIM(LinCopy::Kind::Scatter);
1247 } else if (input_stride == 0 && output_stride == 1) {
1248 COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1249 } else if (input_stride == 0 && output_stride != 1) {
1250 COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1251 } else if (output_stride == 1) {
1252 COPY_INNER_DIM(LinCopy::Kind::Gather);
1253 } else {
1254 COPY_INNER_DIM(LinCopy::Kind::Random);
1255 }
1256
1257#undef COPY_INNER_DIM
1258 }
1259
1260 // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1261 // the number of copied elements.
1262 static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) {
1263 DimensionsMap dst_to_src_map;
1264 for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1265 return Copy(dst, src, dst_to_src_map);
1266 }
1267
1268 private:
1269 struct BlockIteratorState {
1270 BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {}
1271
1272 IndexType size;
1273 IndexType count;
1274 IndexType input_stride;
1275 IndexType output_stride;
1276 IndexType input_span;
1277 IndexType output_span;
1278 };
1279
1280 // Compute how many inner dimensions we are allowed to squeeze when doing IO
1281 // between two tensor blocks. It's safe to squeeze inner dimensions only
1282 // if they are not reordered.
1283 static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1284 int num_squeezable_dims = 0;
1285 for (int i = 0; i < NumDims; ++i) {
1286 const int dim = IsColMajor ? i : NumDims - i - 1;
1287 if (dim_map[dim] != dim) break;
1288 num_squeezable_dims++;
1289 }
1290 return num_squeezable_dims;
1291 }
1292};
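
// Illustrative usage sketch (not part of the original source): copying a 2D
// block while swapping its dimensions (a transpose-like shuffle) via the
// dst->src dimension map. `src_data` and `dst_data` are placeholders for
// caller-provided buffers of 8 floats each.
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;
//   IO::Dimensions src_dims(4, 2), dst_dims(2, 4);
//
//   IO::Src src(strides<ColMajor>(src_dims), src_data);
//   IO::Dst dst(dst_dims, strides<ColMajor>(dst_dims), dst_data);
//
//   IO::DimensionsMap map;  // dst dimension i is read from src dimension map[i]
//   map[0] = 1;
//   map[1] = 0;
//   IO::Copy(dst, src, map);  // returns the number of copied elements (8)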
1293
1294// -------------------------------------------------------------------------- //
1295// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1296// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1297//
1298 // Currently there is no way to write from a Tensor expression to a block of
1299 // memory if dimensions are reordered. If you need to do that, you should
1300// materialize a Tensor block expression into a memory buffer, and then use
1301// TensorBlockIO to copy data between two memory buffers with a custom
1302// `target->src` dimension map (see definition above).
1303//
1304 // Also, currently the innermost dimension of `target` must have a stride of '1'
1305// (contiguous in memory). This restriction could be lifted with a `pscatter`,
1306// but in practice it's never needed, and there is a similar TensorBlockIO
1307// workaround for that.
1308//
1309// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1310// where `src` is a tensor expression. Explore if it is possible to rewrite IO
1311// to use expressions instead of pointers, and after that TensorBlockAssignment
1312// will become an alias to IO.
1313template <typename Scalar, int NumDims, typename TensorBlockExpr, typename IndexType = Eigen::Index>
1314class TensorBlockAssignment {
1315 // We will use coeff/packet path to evaluate block expressions.
1316 typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice> TensorBlockEvaluator;
1317
1318 typedef DSizes<IndexType, NumDims> Dimensions;
1319
1320 enum { Vectorizable = packet_traits<Scalar>::Vectorizable, PacketSize = packet_traits<Scalar>::size };
1321
1322 template <bool Vectorizable, typename Evaluator>
1323 struct InnerDimAssign {
1324 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {
1325 for (IndexType i = 0; i < count; ++i) {
1326 target[i] = eval.coeff(eval_offset + i);
1327 }
1328 }
1329 };
1330
1331 template <typename Evaluator>
1332 struct InnerDimAssign<true, Evaluator> {
1333 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {
1334 typedef typename packet_traits<Scalar>::type Packet;
1335
1336 const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
1337 const IndexType vectorized_size = PacketSize * (count / PacketSize);
1338 IndexType i = 0;
1339
1340 for (; i < unrolled_size; i += 4 * PacketSize) {
1341 for (int j = 0; j < 4; ++j) {
1342 const IndexType idx = eval_offset + i + j * PacketSize;
1343 Packet p = eval.template packet<Unaligned>(idx);
1344 pstoreu<Scalar>(target + i + j * PacketSize, p);
1345 }
1346 }
1347
1348 for (; i < vectorized_size; i += PacketSize) {
1349 Packet p = eval.template packet<Unaligned>(eval_offset + i);
1350 pstoreu<Scalar>(target + i, p);
1351 }
1352
1353 for (; i < count; ++i) {
1354 target[i] = eval.coeff(eval_offset + i);
1355 }
1356 }
1357 };
1358
1359 public:
1360 struct Target {
1361 Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
1362 IndexType target_offset = 0)
1363 : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {}
1364
1365 Dimensions dims;
1366 Dimensions strides;
1367 Scalar* data;
1368 IndexType offset;
1369 };
1370
1371 static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
1372 IndexType target_offset = 0) {
1373 return Target(target_dims, target_strides, target_data, target_offset);
1374 }
1375
1376 template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1377 static Target target(const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1378 const DSizes<TargetStridesIndexType, NumDims>& target_strides, Scalar* target_data,
1379 IndexType target_offset = 0) {
1380 // DSizes constructor will do index type promotion if it's safe.
1381 return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset);
1382 }
1383
1384 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) {
1385 // Prepare evaluator for block expression.
1386 DefaultDevice default_device;
1387 TensorBlockEvaluator eval(expr, default_device);
1388
1389 // Tensor block expression dimensions should match the destination dimensions.
1390 eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1391
1392 static const int Layout = TensorBlockEvaluator::Layout;
1393 static const bool is_col_major = Layout == ColMajor;
1394
1395 // Initialize output inner dimension size based on a layout.
1396 const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1397 const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1398 IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1399
1400 // Target inner dimension stride must be '1'.
1401 eigen_assert(target.strides[inner_dim_idx] == 1);
1402
1403 // Squeeze multiple inner dims into one if they are contiguous in `target`.
1404 IndexType num_squeezed_dims = 0;
1405 for (Index i = 1; i < NumDims; ++i) {
1406 const Index dim = is_col_major ? i : NumDims - i - 1;
1407 const IndexType target_stride = target.strides[dim];
1408
1409 if (output_inner_dim_size == target_stride) {
1410 output_inner_dim_size *= target.dims[dim];
1411 num_squeezed_dims++;
1412 } else {
1413 break;
1414 }
1415 }
1416
1417 // Initialize output block iterator state. Dimensions in this array are
1418 // always in inner_most -> outer_most order (col major layout).
1419 array<BlockIteratorState, NumDims> it;
1420
1421 int idx = 0; // currently initialized iterator state index
1422 for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1423 const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1424
1425 it[idx].count = 0;
1426 it[idx].size = target.dims[dim];
1427 it[idx].output_stride = target.strides[dim];
1428 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1429 idx++;
1430 }
1431
1432 // We read the block expression from the beginning, and start writing data to
1433 // `target` at the given offset.
1434 IndexType input_offset = 0;
1435 IndexType output_offset = target.offset;
1436
1437 // Iterate copying data from `eval` to `target`.
1438 for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1439 // Assign to `target` at current offset.
1440 InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, TensorBlockEvaluator>::Run(
1441 target.data + output_offset, output_inner_dim_size, eval, input_offset);
1442
1443 // Move input offset forward by the number of assigned coefficients.
1444 input_offset += output_inner_dim_size;
1445
1446 // Update index.
1447 for (int j = 0; j < idx; ++j) {
1448 if (++it[j].count < it[j].size) {
1449 output_offset += it[j].output_stride;
1450 break;
1451 }
1452 it[j].count = 0;
1453 output_offset -= it[j].output_span;
1454 }
1455 }
1456 }
1457
1458 private:
1459 struct BlockIteratorState {
1460 BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {}
1461
1462 IndexType count;
1463 IndexType size;
1464 IndexType output_stride;
1465 IndexType output_span;
1466 };
1467};
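
// Illustrative usage sketch (not part of the original source): evaluating a
// small block expression directly into a contiguous target buffer. The buffers
// are plain stack arrays here just to keep the example self-contained.
//
//   float input_data[6] = {0, 1, 2, 3, 4, 5};
//   float output_data[6];
//
//   typedef TensorMap<Tensor<float, 2> > Map;
//   Map input(input_data, 2, 3);
//
//   DSizes<Eigen::Index, 2> dims(2, 3);
//   typedef decltype(input.exp()) ExpExpr;
//   typedef TensorBlockAssignment<float, 2, ExpExpr> Assign;
//
//   Assign::Run(Assign::target(dims, strides<ColMajor>(dims), output_data),
//               input.exp());
//   // output_data[i] == std::exp(input_data[i]) for every coefficient.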
1468
1469// -------------------------------------------------------------------------- //
1470
1471} // namespace internal
1472} // namespace Eigen
1473
1474#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H