Eigen-unsupported  5.0.1-dev+284dcc12
 
TensorBroadcasting.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace internal {
template <typename Broadcast, typename XprType>
struct traits<TensorBroadcastingOp<Broadcast, XprType>> : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef std::remove_reference_t<Nested> Nested_;
  static constexpr int NumDimensions = XprTraits::NumDimensions;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
  enum {
    // Broadcast is read-only.
    Flags = traits<XprType>::Flags & ~LvalueBit
  };
};

template <typename Broadcast, typename XprType>
struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> {
  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
};

template <typename Broadcast, typename XprType>
struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1,
              typename eval<TensorBroadcastingOp<Broadcast, XprType>>::type> {
  typedef TensorBroadcastingOp<Broadcast, XprType> type;
};

template <typename Dims>
struct is_input_scalar {
  static const bool value = false;
};
template <>
struct is_input_scalar<Sizes<>> {
  static const bool value = true;
};
template <typename std::ptrdiff_t... Indices>
struct is_input_scalar<Sizes<Indices...>> {
  static constexpr bool value = (Sizes<Indices...>::total_size == 1);
};

} // end namespace internal

template <typename Broadcast, typename XprType>
class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> {
 public:
  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
      : m_xpr(expr), m_broadcast(broadcast) {}

  EIGEN_DEVICE_FUNC const Broadcast& broadcast() const { return m_broadcast; }

  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }

 protected:
  typename XprType::Nested m_xpr;
  const Broadcast m_broadcast;
};

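// Example: broadcasting is normally reached through TensorBase::broadcast(), which
// returns a TensorBroadcastingOp. For illustration (example shapes, not taken from
// this header):
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setRandom();
//   Eigen::array<Eigen::Index, 2> bcast{{2, 3}};     // replicate 2x along dim 0, 3x along dim 1
//   Eigen::Tensor<float, 2> r = t.broadcast(bcast);  // r has dimensions 4x9
//
// Each output dimension is input_dims[i] * broadcast[i], as computed by the evaluator below.
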
// Eval as rvalue
template <typename Broadcast, typename ArgType, typename Device>
struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> {
  typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;

 protected:  // All the non-static fields must have the same access control; otherwise the
             // TensorEvaluator won't be standard layout.
  bool isCopy, nByOne, oneByN;

 public:
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
    PreferBlockAccess = true,
    RawAccess = false
  };
  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;

  typedef std::remove_const_t<Scalar> ScalarNoConst;

  // We do block-based broadcasting using a trick with 2x tensor rank and 0
  // strides. See the block method implementation for details.
  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
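  // For illustration (example values, not taken from this header): broadcasting a ColMajor
  // [2, 3] input by [2, 2] produces a [4, 6] output. The block machinery views that output
  // through the rank-4 shape [d_0, b_0, d_1, b_1] = [2, 2, 3, 2], where the input strides of
  // the b_* "broadcast" dimensions are set to 0 so that every copy reads the same input
  // coefficients (see the bcast_block_* arrays built in blockBroadcastingParams()).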

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock ArgTensorBlock;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, Layout, Index> TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : isCopy(false),
        nByOne(false),
        oneByN(false),
        m_device(device),
        m_broadcast(op.broadcast()),
        m_impl(op.expression(), device) {
    // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
    // and store the result in a scalar. Instead, one should first reshape the scalar into an
    // N-D tensor (N >= 1) with one element, and then broadcast that.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    const InputDimensions& input_dims = m_impl.dimensions();
    isCopy = true;
    for (int i = 0; i < NumDims; ++i) {
      eigen_assert(input_dims[i] > 0);
      m_dimensions[i] = input_dims[i] * m_broadcast[i];
      if (m_broadcast[i] != 1) {
        isCopy = false;
      }
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStrides[0] = 1;
      m_outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
      }
    } else {
      m_inputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
      }
    }

    if (input_dims[0] == 1) {
      oneByN = true;
      for (int i = 1; i < NumDims; ++i) {
        if (m_broadcast[i] != 1) {
          oneByN = false;
          break;
        }
      }
    } else if (input_dims[NumDims - 1] == 1) {
      nByOne = true;
      for (int i = 0; i < NumDims - 1; ++i) {
        if (m_broadcast[i] != 1) {
          nByOne = false;
          break;
        }
      }
    }

    // Handle special formats like NCHW, where the input shape is '[1, N..., 1]' and the
    // broadcast shape is '[N, 1..., N]'.
    if (!oneByN && !nByOne) {
      if (input_dims[0] == 1 && input_dims[NumDims - 1] == 1 && NumDims > 2) {
        nByOne = true;
        oneByN = true;
        for (int i = 1; i < NumDims - 1; ++i) {
          if (m_broadcast[i] != 1) {
            nByOne = false;
            oneByN = false;
            break;
          }
        }
      }
    }
  }
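  // For illustration (example values, not taken from this header): with a ColMajor input of
  // dimensions [2, 3] and broadcast factors [2, 2], the constructor above produces
  // m_dimensions = [4, 6], m_inputStrides = [1, 2] and m_outputStrides = [1, 4]; isCopy stays
  // false because at least one broadcast factor differs from 1. A [1, 5] input broadcast by
  // [3, 1] sets oneByN, and a [5, 1] input broadcast by [1, 3] sets nByOne.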

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif // EIGEN_USE_THREADS

  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
    if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
      return m_impl.coeff(0);
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      if (isCopy) {
        return m_impl.coeff(index);
      } else {
        return coeffColMajor(index);
      }
    } else {
      if (isCopy) {
        return m_impl.coeff(index);
      } else {
        return coeffRowMajor(index);
      }
    }
  }

  // TODO: attempt to speed this up. The integer divisions and modulo are slow
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = NumDims - 1; i > 0; --i) {
      const Index idx = index / m_outputStrides[i];
      if (internal::index_statically_eq<Broadcast>(i, 1)) {
        eigen_assert(idx < m_impl.dimensions()[i]);
        inputIndex += idx * m_inputStrides[i];
      } else {
        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
          eigen_assert(idx % m_impl.dimensions()[i] == 0);
        } else {
          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
        }
      }
      index -= idx * m_outputStrides[i];
    }
    if (internal::index_statically_eq<Broadcast>(0, 1)) {
      eigen_assert(index < m_impl.dimensions()[0]);
      inputIndex += index;
    } else {
      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
        eigen_assert(index % m_impl.dimensions()[0] == 0);
      } else {
        inputIndex += (index % m_impl.dimensions()[0]);
      }
    }
    return inputIndex;
  }
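  // Worked example (illustrative, not taken from this header): input dims [2, 3] broadcast
  // by [2, 2] gives output dims [4, 6] with m_outputStrides = [1, 4]. For output index 13
  // (column 3, row 1), the loop peels off idx = 13 / 4 = 3, maps it to input column
  // 3 % 3 = 0 (input stride 2), leaving index = 1 for dim 0, which maps to input row
  // 1 % 2 = 1; the returned input index is 0 * 2 + 1 = 1.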

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const {
    return m_impl.coeff(indexColMajor(index));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < NumDims - 1; ++i) {
      const Index idx = index / m_outputStrides[i];
      if (internal::index_statically_eq<Broadcast>(i, 1)) {
        eigen_assert(idx < m_impl.dimensions()[i]);
        inputIndex += idx * m_inputStrides[i];
      } else {
        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
          eigen_assert(idx % m_impl.dimensions()[i] == 0);
        } else {
          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
        }
      }
      index -= idx * m_outputStrides[i];
    }
    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
      inputIndex += index;
    } else {
      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
      } else {
        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
      }
    }
    return inputIndex;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const {
    return m_impl.coeff(indexRowMajor(index));
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
    if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
      return internal::pset1<PacketReturnType>(m_impl.coeff(0));
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      if (isCopy) {
#ifdef EIGEN_GPU_COMPILE_PHASE
        // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speedup by enforcing
        // unaligned loads here. The reason is unclear, though.
        return m_impl.template packet<Unaligned>(index);
#else
        return m_impl.template packet<LoadMode>(index);
#endif
      } else if (oneByN && !nByOne) {
        return packetNByOne<LoadMode>(index);
      } else if (!oneByN && nByOne) {
        return packetOneByN<LoadMode>(index);
      } else if (oneByN && nByOne) {
        return packetOneByNByOne<LoadMode>(index);
      } else {
        return packetColMajor<LoadMode>(index);
      }
    } else {
      if (isCopy) {
#ifdef EIGEN_GPU_COMPILE_PHASE
        // See above.
        return m_impl.template packet<Unaligned>(index);
#else
        return m_impl.template packet<LoadMode>(index);
#endif
      } else if (oneByN && !nByOne) {
        return packetOneByN<LoadMode>(index);
      } else if (!oneByN && nByOne) {
        return packetNByOne<LoadMode>(index);
      } else if (oneByN && nByOne) {
        return packetOneByNByOne<LoadMode>(index);
      } else {
        return packetRowMajor<LoadMode>(index);
      }
    }
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne(Index index) const {
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    Index startDim, endDim;
    Index inputIndex, outputOffset, batchedIndex;

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      startDim = NumDims - 1;
      endDim = 1;
    } else {
      startDim = 0;
      endDim = NumDims - 2;
    }

    batchedIndex = index % m_outputStrides[startDim];
    inputIndex = batchedIndex / m_outputStrides[endDim];
    outputOffset = batchedIndex % m_outputStrides[endDim];

    if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
      values[0] = m_impl.coeff(inputIndex);
      return internal::pload1<PacketReturnType>(values);
    } else {
      EIGEN_UNROLL_LOOP
      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
        if (outputOffset + cur < m_outputStrides[endDim]) {
          values[i] = m_impl.coeff(inputIndex);
        } else {
          ++inputIndex;
          inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
          values[i] = m_impl.coeff(inputIndex);
          outputOffset = 0;
          cur = 0;
        }
      }
      return internal::pload<PacketReturnType>(values);
    }
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const {
    // Consider the flattened tensor [v0, ..., vN]. This concatenates m_broadcast[dim] copies,
    //   [v0, ..., vN, v0, ..., vN, ...]
    // with dim == NumDims - 1 for col-major, dim == 0 for row-major.
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());

    // Size of the flattened tensor.
    const Index M =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_inputStrides[NumDims - 1] : m_inputStrides[0];
    Index inputIndex = index % M;
    if (inputIndex + PacketSize <= M) {
      return m_impl.template packet<Unaligned>(inputIndex);
    } else {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        if (inputIndex > M - 1) {
          inputIndex = 0;
        }
        values[i] = m_impl.coeff(inputIndex++);
      }
      return internal::pload<PacketReturnType>(values);
    }
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const {
    // Consider the flattened tensor [v0, ..., vN]. This interleaves m_broadcast[dim] copies,
    //   [v0, v0, ..., v1, v1, ..., vN, vN, ...]
    // with dim == 0 for col-major, dim == NumDims - 1 for row-major.
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());

    const Index M =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_broadcast[0] : m_broadcast[NumDims - 1];

    Index inputIndex = index / M;
    Index outputOffset = index % M;
    if (outputOffset + PacketSize <= M) {
      return internal::pset1<PacketReturnType>(m_impl.coeff(inputIndex));
    } else {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        if (outputOffset < M) {
          values[i] = m_impl.coeff(inputIndex);
          ++outputOffset;
        } else {
          values[i] = m_impl.coeff(++inputIndex);
          outputOffset = 1;  // Next offset.
        }
      }
      return internal::pload<PacketReturnType>(values);
    }
  }

  // Ignore the LoadMode and always use unaligned loads, since we can't guarantee
  // the alignment at compile time.
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const {
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());

    const Index originalIndex = index;

    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = NumDims - 1; i > 0; --i) {
      const Index idx = index / m_outputStrides[i];
      if (internal::index_statically_eq<Broadcast>(i, 1)) {
        eigen_assert(idx < m_impl.dimensions()[i]);
        inputIndex += idx * m_inputStrides[i];
      } else {
        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
          eigen_assert(idx % m_impl.dimensions()[i] == 0);
        } else {
          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
        }
      }
      index -= idx * m_outputStrides[i];
    }
    Index innermostLoc;
    if (internal::index_statically_eq<Broadcast>(0, 1)) {
      eigen_assert(index < m_impl.dimensions()[0]);
      innermostLoc = index;
    } else {
      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
        eigen_assert(index % m_impl.dimensions()[0] == 0);
        innermostLoc = 0;
      } else {
        innermostLoc = index % m_impl.dimensions()[0];
      }
    }
    inputIndex += innermostLoc;

    // TODO: this could be extended to the second dimension if we're not
    // broadcasting along the first dimension, and so on.
    if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
      return m_impl.template packet<Unaligned>(inputIndex);
    } else {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      values[0] = m_impl.coeff(inputIndex);
      EIGEN_UNROLL_LOOP
      for (int i = 1; i < PacketSize; ++i) {
        if (innermostLoc + i < m_impl.dimensions()[0]) {
          values[i] = m_impl.coeff(inputIndex + i);
        } else {
          values[i] = coeffColMajor(originalIndex + i);
        }
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const {
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());

    const Index originalIndex = index;

    Index inputIndex = 0;
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < NumDims - 1; ++i) {
      const Index idx = index / m_outputStrides[i];
      if (internal::index_statically_eq<Broadcast>(i, 1)) {
        eigen_assert(idx < m_impl.dimensions()[i]);
        inputIndex += idx * m_inputStrides[i];
      } else {
        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
          eigen_assert(idx % m_impl.dimensions()[i] == 0);
        } else {
          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
        }
      }
      index -= idx * m_outputStrides[i];
    }
    Index innermostLoc;
    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
      innermostLoc = index;
    } else {
      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
        innermostLoc = 0;
      } else {
        innermostLoc = index % m_impl.dimensions()[NumDims - 1];
      }
    }
    inputIndex += innermostLoc;

    // TODO: this could be extended to the second dimension if we're not
    // broadcasting along the first dimension, and so on.
    if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims - 1]) {
      return m_impl.template packet<Unaligned>(inputIndex);
    } else {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      values[0] = m_impl.coeff(inputIndex);
      EIGEN_UNROLL_LOOP
      for (int i = 1; i < PacketSize; ++i) {
        if (innermostLoc + i < m_impl.dimensions()[NumDims - 1]) {
          values[i] = m_impl.coeff(inputIndex + i);
        } else {
          values[i] = coeffRowMajor(originalIndex + i);
        }
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    double compute_cost = TensorOpCost::AddCost<Index>();
    if (!isCopy && NumDims > 0) {
      EIGEN_UNROLL_LOOP
      for (int i = NumDims - 1; i > 0; --i) {
        compute_cost += TensorOpCost::DivCost<Index>();
        if (internal::index_statically_eq<Broadcast>(i, 1)) {
          compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
        } else {
          if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
            compute_cost +=
                TensorOpCost::MulCost<Index>() + TensorOpCost::ModCost<Index>() + TensorOpCost::AddCost<Index>();
          }
        }
        compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
      }
    }
    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
    // TODO(wuke): Targeting the L1 size is 30% faster than targeting L{-1} on large
    // tensors. But this might need further tuning.
    const size_t target_size = m_device.firstLevelCacheSize();
    return internal::TensorBlockResourceRequirements::merge(
        m_impl.getResourceRequirements(), internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
                                                          bool /*root_of_expr_ast*/ = false) const {
    BlockBroadcastingParams params = blockBroadcastingParams(desc);

    if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
      return emptyBlock();
    }

    // Prepare storage for the materialized broadcasting result.
    const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch);
    ScalarNoConst* materialized_output = block_storage.data();

    // We may need to materialize input blocks.
    size_t materialized_input_size = 0;
    ScalarNoConst* materialized_input = NULL;

    // Initialize the block broadcasting iterator state for the outer dimensions (outer
    // with regard to the bcast dimension). Dimensions in this array are always in
    // inner_most -> outer_most order (col-major layout).
    array<BlockBroadcastingIteratorState, NumDims> it;
    int idx = 0;

    for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
      const Index dim = IsColMajor ? i : NumDims - 1 - i;
      it[idx].size = params.output_dims[dim];
      it[idx].count = 0;
      it[idx].output_stride = m_outputStrides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // Write output into the beginning of `materialized_output`.
    Index output_offset = 0;

    // We fill the output block by broadcasting along the bcast dim and
    // iterating over the outer dimensions.
    const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();

    for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
      ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
      Index bcast_offset = desc.offset() + output_offset;

      // Broadcast along the bcast dimension.
      num_output_coeffs += BroadcastBlockAlongBcastDim(params, bcast_offset, scratch, bcast_output, &materialized_input,
                                                       &materialized_input_size);

      // Switch to the next outer dimension.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }

    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

  Broadcast functor() const { return m_broadcast; }

 private:
  static constexpr bool IsColMajor = static_cast<int>(Layout) == static_cast<int>(ColMajor);

  // We build the general-case block broadcasting on top of a broadcasting
  // primitive that broadcasts only the inner dimension(s), up to the first
  // dimension whose block size is smaller than the full size (called `bcast_dim`).
  //
  // Example:
  //   dim:        0  1  2   (ColMajor)
  //   input size: [9, 3, 6]
  //   block size: [9, 2, 6]
  //
  // We compute the broadcasted block by iterating over the dimensions outer to
  // `bcast_dim` (only dimension `2` in this example) and computing broadcasts
  // along the `bcast_dim` (dimension `1` in this example).

  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
  // single block along the broadcasting dimension. Sizes and strides along the
  // `bcast_dim` might be invalid; they will be adjusted later in
  // `BroadcastBlockAlongBcastDim`.
  struct BlockBroadcastingParams {
    Dimensions input_dims;      // input expression dimensions
    Dimensions output_dims;     // output block sizes
    Dimensions output_strides;  // output block strides

    int inner_dim_count;   // number of inner dimensions matching in size
    int bcast_dim;         // broadcasting dimension index
    Index bcast_dim_size;  // broadcasting dimension size
    Index inner_dim_size;  // inner dimensions size

    // Block sizes and strides for the input block where all dimensions before
    // `bcast_dim` are equal to `1`.
    Dimensions input_block_sizes;
    Dimensions input_block_strides;

    // Block sizes and strides for blocks with extra dimensions and strides `0`.
    BroadcastDimensions bcast_block_sizes;
    BroadcastDimensions bcast_block_strides;
    BroadcastDimensions bcast_input_strides;
  };

  struct BlockBroadcastingIteratorState {
    Index size;
    Index count;
    Index output_stride;
    Index output_span;
  };

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const {
    BlockBroadcastingParams params;

    params.input_dims = Dimensions(m_impl.dimensions());

    // Output block sizes and strides.
    params.output_dims = desc.dimensions();
    params.output_strides = internal::strides<Layout>(params.output_dims);

    // Find the broadcasting dimension (the first dimension where the output
    // block size is smaller than the full size).
    params.bcast_dim = 0;
    params.bcast_dim_size = 1;
    params.inner_dim_size = 1;

    // Count the number of inner dimensions that have the same size in the block
    // and in the broadcast expression.
    params.inner_dim_count = 0;

    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;

      if (params.output_dims[dim] == m_dimensions[dim]) {
        params.inner_dim_size *= params.output_dims[dim];
        ++params.inner_dim_count;
        continue;
      }

      // The first non-matching dimension is the broadcasting dimension.
      eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
      params.bcast_dim = dim;
      params.bcast_dim_size = params.output_dims[dim];
      break;
    }

    // Calculate the input block size for looking into the input.
    for (int i = 0; i < params.inner_dim_count; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      params.input_block_sizes[dim] = params.input_dims[dim];
    }
    for (int i = params.inner_dim_count; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      params.input_block_sizes[dim] = 1;
    }
    params.input_block_strides = internal::strides<Layout>(params.input_block_sizes);

    // Broadcast with the 0-stride trick: create one extra dim for each
    // broadcast, and set its input stride to 0.
    //
    // When ColMajor:
    //
    // - bcast_block_sizes:
    //     [d_0, b_0, d_1, b_1, ...]
    //
    // - bcast_block_strides:
    //     [output_block_strides[0], output_block_strides[0] * d_0,
    //      output_block_strides[1], output_block_strides[1] * d_1,
    //      ...]
    //
    // - bcast_input_strides:
    //     [input_block_strides[0], 0,
    //      input_block_strides[1], 0,
    //      ...].
    //
    for (int i = 0; i < params.inner_dim_count; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;

      const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
      const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;

      params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
      params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
      params.bcast_block_strides[copy_dim] = params.output_strides[dim];
      params.bcast_block_strides[broadcast_dim] = params.output_strides[dim] * params.input_dims[dim];
      params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
      params.bcast_input_strides[broadcast_dim] = 0;
    }

    for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
      const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
      params.bcast_block_sizes[dim] = 1;
      params.bcast_block_strides[dim] = 0;
      params.bcast_input_strides[dim] = 0;
    }

    return params;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
    DSizes<Index, NumDims> dimensions;
    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
      BlockBroadcastingParams params, Index bcast_offset, TensorBlockScratch& scratch,
      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const {
    if (params.bcast_dim_size == 1) {
      // We just need one block read using the already-set values above.
      return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
                            params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
                            materialized_output, materialized_input, materialized_input_size);

    } else if (params.input_dims[params.bcast_dim] == 1) {
      // Broadcast the bcast dimension (< NumDims) by bcast_dim_size.
      const int broadcast_bcast_dim =
          IsColMajor ? 2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2;

      params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
      params.bcast_input_strides[broadcast_bcast_dim] = 0;
      params.bcast_block_strides[broadcast_bcast_dim] = params.output_strides[params.bcast_dim];

      return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
                            params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
                            materialized_output, materialized_input, materialized_input_size);

    } else {
      // Keep track of the total number of coefficients written to the
      // output block.
      Index num_output_coeffs = 0;

      // The general case. Let's denote the output block as
      //
      //   x[..., a:a+bcast_dim_size, :, ..., :]
      //
      // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
      // (< NumDims). We need to split a:a+bcast_dim_size into up to 3
      // sub-blocks:
      //
      // (1) a:b, where b is the smallest multiple of
      //     input_dims[bcast_dim] in [a, a+bcast_dim_size].
      //
      // (2) b:c, where c is the largest multiple of input_dims[bcast_dim]
      //     in [a, a+bcast_dim_size].
      //
      // (3) c:a+bcast_dim_size.
      //
      // Or, when b and c do not exist, we just process the whole block
      // together.
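      //
      // Numeric illustration (example values, not taken from this header): if
      // input_dims[bcast_dim] == 3, a == 2 and bcast_dim_size == 8 (so the slice is 2:10),
      // then b == 3 and c == 9, giving the sub-blocks 2:3 (head copy), 3:9 (two full
      // broadcast periods) and 9:10 (tail copy).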

      // Find a.
      const Index bcast_dim_left_index = bcast_offset / m_outputStrides[params.bcast_dim];

      // Find b and c.
      const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];

      // The first multiple of input_bcast_dim_size that is >= a. This is b when it is
      // <= bcast_dim_left_index + bcast_dim_size.
      const Index first_multiple =
          numext::div_ceil<Index>(bcast_dim_left_index, input_bcast_dim_size) * input_bcast_dim_size;

      if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
        // b exists, and so does c. Find it.
        const Index last_multiple =
            (bcast_dim_left_index + params.bcast_dim_size) / input_bcast_dim_size * input_bcast_dim_size;
        const int copy_bcast_dim =
            IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1;
        const int broadcast_bcast_dim =
            IsColMajor ? 2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2;

        if (first_multiple > bcast_dim_left_index) {
          const Index head_size = first_multiple - bcast_dim_left_index;
          params.input_block_sizes[params.bcast_dim] = head_size;
          params.bcast_block_sizes[copy_bcast_dim] = head_size;
          params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
          params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
          params.bcast_input_strides[broadcast_bcast_dim] = 0;
          params.bcast_block_strides[broadcast_bcast_dim] =
              params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];

          num_output_coeffs +=
              BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
                             params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
                             materialized_output, materialized_input, materialized_input_size);
        }
        if (first_multiple < last_multiple) {
          params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
          params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
          params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
          params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
          params.bcast_block_sizes[broadcast_bcast_dim] = (last_multiple - first_multiple) / input_bcast_dim_size;
          params.bcast_input_strides[broadcast_bcast_dim] = 0;
          params.bcast_block_strides[broadcast_bcast_dim] =
              params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];
          const Index offset = (first_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim];

          num_output_coeffs +=
              BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
                             params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch,
                             materialized_output, materialized_input, materialized_input_size);
        }
        if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
          const Index tail_size = bcast_dim_left_index + params.bcast_dim_size - last_multiple;
          params.input_block_sizes[params.bcast_dim] = tail_size;
          params.bcast_block_sizes[copy_bcast_dim] = tail_size;
          params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
          params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
          params.bcast_input_strides[broadcast_bcast_dim] = 0;
          params.bcast_block_strides[broadcast_bcast_dim] =
              params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];
          const Index offset = (last_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim];

          num_output_coeffs +=
              BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
                             params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch,
                             materialized_output, materialized_input, materialized_input_size);
        }
      } else {
        // b and c do not exist.
        const int copy_bcast_dim =
            IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1;
        params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
        params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
        params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
        params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];

        num_output_coeffs +=
            BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
                           params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
                           materialized_output, materialized_input, materialized_input_size);
      }

      return num_output_coeffs;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
      const Dimensions& input_block_sizes, const Dimensions& input_block_strides,
      const BroadcastDimensions& bcast_block_sizes, const BroadcastDimensions& bcast_block_strides,
      const BroadcastDimensions& bcast_input_strides, Index bcast_offset, Index offset, TensorBlockScratch& scratch,
      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const {
    // ---------------------------------------------------------------------- //
    // Tensor block descriptor for reading block from the input.
    const Index input_offset = bcast_offset + offset;
    TensorBlockDesc input_desc(IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
                               input_block_sizes);

    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);

    // ---------------------------------------------------------------------- //
    // Materialize input block into a temporary memory buffer only if it's not
    // already available in the arg block.
    const ScalarNoConst* input_buffer = NULL;

    if (input_block.data() != NULL) {
      // Input block already has raw data, there is no need to materialize it.
      input_buffer = input_block.data();

    } else {
      // Otherwise we have to do block assignment into a temporary buffer.

      // Maybe reuse previously allocated buffer, or allocate a new one with a
      // scratch allocator.
      const size_t input_total_size = input_block_sizes.TotalSize();
      if (*materialized_input == NULL || *materialized_input_size < input_total_size) {
        *materialized_input_size = input_total_size;
        void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
        *materialized_input = static_cast<ScalarNoConst*>(mem);
      }

      typedef internal::TensorBlockAssignment<ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
          TensorBlockAssignment;

      TensorBlockAssignment::Run(
          TensorBlockAssignment::target(input_block_sizes, input_block_strides, *materialized_input),
          input_block.expr());

      input_buffer = *materialized_input;
    }

    // ---------------------------------------------------------------------- //
    // Copy data from materialized input block to the materialized output, using
    // given broadcast strides (strides with zeroes).
    typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout> TensorBlockIO;

    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, materialized_output + offset);

    return TensorBlockIO::Copy(dst, src);
  }

 protected:
  const Device EIGEN_DEVICE_REF m_device;
  const std::remove_reference_t<Broadcast> m_broadcast;
  Dimensions m_dimensions;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_inputStrides;
  TensorEvaluator<ArgType, Device> m_impl;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H