Eigen-unsupported  3.4.1 (git rev 28ded8800c26864e537852658428ab44c8399e87)
 
TensorBroadcasting.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
11#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
12
13namespace Eigen {
14
15namespace internal {
16template<typename Broadcast, typename XprType>
17struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
18{
19 typedef typename XprType::Scalar Scalar;
20 typedef traits<XprType> XprTraits;
21 typedef typename XprTraits::StorageKind StorageKind;
22 typedef typename XprTraits::Index Index;
23 typedef typename XprType::Nested Nested;
24 typedef typename remove_reference<Nested>::type _Nested;
25 static const int NumDimensions = XprTraits::NumDimensions;
26 static const int Layout = XprTraits::Layout;
27 typedef typename XprTraits::PointerType PointerType;
28};
29
30template<typename Broadcast, typename XprType>
31struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
32{
33 typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
34};
35
36template<typename Broadcast, typename XprType>
37struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
38{
39 typedef TensorBroadcastingOp<Broadcast, XprType> type;
40};
41
42template <typename Dims>
43struct is_input_scalar {
44 static const bool value = false;
45};
46template <>
47struct is_input_scalar<Sizes<> > {
48 static const bool value = true;
49};
50#ifndef EIGEN_EMULATE_CXX11_META_H
51template <typename std::ptrdiff_t... Indices>
52struct is_input_scalar<Sizes<Indices...> > {
53 static const bool value = (Sizes<Indices...>::total_size == 1);
54};
55#endif
56
57} // end namespace internal
58
62template <typename Broadcast, typename XprType>
63class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> {
64 public:
65 typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
66 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
67 typedef typename XprType::CoeffReturnType CoeffReturnType;
68 typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
69 typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
70 typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
71
72 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
73 : m_xpr(expr), m_broadcast(broadcast) {}
74
75 EIGEN_DEVICE_FUNC
76 const Broadcast& broadcast() const { return m_broadcast; }
77
78 EIGEN_DEVICE_FUNC
79 const typename internal::remove_all<typename XprType::Nested>::type&
80 expression() const { return m_xpr; }
81
82 protected:
83 typename XprType::Nested m_xpr;
84 const Broadcast m_broadcast;
85};
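// Illustrative usage sketch (not part of Eigen's source): TensorBroadcastingOp is
// normally created through TensorBase::broadcast(). Each broadcast factor gives how
// many times the corresponding dimension is replicated, so each output dimension is
// input_dim * factor. For example:
//
//   Eigen::Tensor<float, 2> t(2, 3);                  // input dims [2, 3]
//   t.setRandom();
//   Eigen::array<Eigen::Index, 2> bcast = {2, 2};     // repeat each dim twice
//   Eigen::Tensor<float, 2> r = t.broadcast(bcast);   // result dims [4, 6]
//
//   // A rank-0 (scalar) tensor cannot be broadcast directly (see the static assert
//   // in the evaluator below); reshape it to a rank-1 tensor of one element first,
//   // then broadcast that.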
86
87
88// Eval as rvalue
89template<typename Broadcast, typename ArgType, typename Device>
90struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
91{
92 typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
93 typedef typename XprType::Index Index;
94 static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
95 typedef DSizes<Index, NumDims> Dimensions;
96 typedef typename XprType::Scalar Scalar;
97 typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
98 typedef typename XprType::CoeffReturnType CoeffReturnType;
99 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
100 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
101 protected: // all the non-static fields must have the same access control; otherwise the TensorEvaluator won't be standard layout.
102 bool isCopy, nByOne, oneByN;
103 public:
104 typedef StorageMemory<CoeffReturnType, Device> Storage;
105 typedef typename Storage::Type EvaluatorPointerType;
106
107 enum {
108 IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
109 PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
110 BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
111 PreferBlockAccess = true,
112 Layout = TensorEvaluator<ArgType, Device>::Layout,
113 RawAccess = false
114 };
115
116 typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
117
118 // We do block-based broadcasting using a trick with 2x tensor rank and 0
119 // strides. See the block() method implementation for details.
120 typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
121
122 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
123 typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
124 typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
125
126 typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
127 ArgTensorBlock;
128
129 typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
130 Layout, Index>
131 TensorBlock;
132 //===--------------------------------------------------------------------===//
133
134 EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
135 : isCopy(false), nByOne(false), oneByN(false),
136 m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
137 {
138
139 // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
140 // and store the result in a scalar. Instead, one should first reshape the scalar into an
141 // N-D tensor (N >= 1) with a single element and then broadcast.
142 EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
143 const InputDimensions& input_dims = m_impl.dimensions();
144 isCopy = true;
145 for (int i = 0; i < NumDims; ++i) {
146 eigen_assert(input_dims[i] > 0);
147 m_dimensions[i] = input_dims[i] * m_broadcast[i];
148 if (m_broadcast[i] != 1) {
149 isCopy = false;
150 }
151 }
152
153 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
154 m_inputStrides[0] = 1;
155 m_outputStrides[0] = 1;
156 for (int i = 1; i < NumDims; ++i) {
157 m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
158 m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
159 }
160 } else {
161 m_inputStrides[NumDims-1] = 1;
162 m_outputStrides[NumDims-1] = 1;
163 for (int i = NumDims-2; i >= 0; --i) {
164 m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
165 m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
166 }
167 }
168
169 if (input_dims[0] == 1) {
170 oneByN = true;
171 for (int i = 1; i < NumDims; ++i) {
172 if (m_broadcast[i] != 1) {
173 oneByN = false;
174 break;
175 }
176 }
177 } else if (input_dims[NumDims-1] == 1) {
178 nByOne = true;
179 for (int i = 0; i < NumDims-1; ++i) {
180 if (m_broadcast[i] != 1) {
181 nByOne = false;
182 break;
183 }
184 }
185 }
186
187 // Handle special formats like NCHW, where the input shape is '[1, N..., 1]' and
188 // the broadcast shape is '[N, 1..., N]'.
189 if (!oneByN && !nByOne) {
190 if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
191 nByOne = true;
192 oneByN = true;
193 for (int i = 1; i < NumDims-1; ++i) {
194 if (m_broadcast[i] != 1) {
195 nByOne = false;
196 oneByN = false;
197 break;
198 }
199 }
200 }
201 }
202 }
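// Worked example for the constructor above (illustrative, not part of Eigen): with a
// ColMajor input of dims [2, 3] and broadcast factors [2, 2]:
//   m_dimensions    = [4, 6]    // input_dims[i] * m_broadcast[i]
//   m_inputStrides  = [1, 2]    // strides of the input tensor
//   m_outputStrides = [1, 4]    // strides of the broadcasted output
// isCopy stays false because at least one broadcast factor differs from 1, and
// neither oneByN nor nByOne is set since no input dimension equals 1.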
203
204 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
205
206 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
207 m_impl.evalSubExprsIfNeeded(NULL);
208 return true;
209 }
210
211#ifdef EIGEN_USE_THREADS
212 template <typename EvalSubExprsCallback>
213 EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
214 EvaluatorPointerType, EvalSubExprsCallback done) {
215 m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
216 }
217#endif // EIGEN_USE_THREADS
218
219 EIGEN_STRONG_INLINE void cleanup() {
220 m_impl.cleanup();
221 }
222
223 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
224 {
225 if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
226 return m_impl.coeff(0);
227 }
228
229 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
230 if (isCopy) {
231 return m_impl.coeff(index);
232 } else {
233 return coeffColMajor(index);
234 }
235 } else {
236 if (isCopy) {
237 return m_impl.coeff(index);
238 } else {
239 return coeffRowMajor(index);
240 }
241 }
242 }
243
244 // TODO: attempt to speed this up. The integer divisions and modulo operations are slow.
245 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
246 Index inputIndex = 0;
247 EIGEN_UNROLL_LOOP
248 for (int i = NumDims - 1; i > 0; --i) {
249 const Index idx = index / m_outputStrides[i];
250 if (internal::index_statically_eq<Broadcast>(i, 1)) {
251 eigen_assert(idx < m_impl.dimensions()[i]);
252 inputIndex += idx * m_inputStrides[i];
253 } else {
254 if (internal::index_statically_eq<InputDimensions>(i, 1)) {
255 eigen_assert(idx % m_impl.dimensions()[i] == 0);
256 } else {
257 inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
258 }
259 }
260 index -= idx * m_outputStrides[i];
261 }
262 if (internal::index_statically_eq<Broadcast>(0, 1)) {
263 eigen_assert(index < m_impl.dimensions()[0]);
264 inputIndex += index;
265 } else {
266 if (internal::index_statically_eq<InputDimensions>(0, 1)) {
267 eigen_assert(index % m_impl.dimensions()[0] == 0);
268 } else {
269 inputIndex += (index % m_impl.dimensions()[0]);
270 }
271 }
272 return inputIndex;
273 }
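// Example of the mapping above (illustrative, not part of Eigen): with input dims
// [2, 3] broadcast to [4, 6] in ColMajor, the output linear index 5 is coefficient
// (1, 1):
//   i = 1: idx = 5 / m_outputStrides[1] = 5 / 4 = 1;
//          inputIndex += (1 % 3) * m_inputStrides[1] = 1 * 2 = 2; index = 5 - 4 = 1
//   dim 0: inputIndex += 1 % 2 = 1   -->   inputIndex = 3, i.e. input coeff (1, 1)
// so output coefficient (i0, i1) reads input coefficient (i0 % 2, i1 % 3).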
274
275 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
276 {
277 return m_impl.coeff(indexColMajor(index));
278 }
279
280 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
281 Index inputIndex = 0;
282 EIGEN_UNROLL_LOOP
283 for (int i = 0; i < NumDims - 1; ++i) {
284 const Index idx = index / m_outputStrides[i];
285 if (internal::index_statically_eq<Broadcast>(i, 1)) {
286 eigen_assert(idx < m_impl.dimensions()[i]);
287 inputIndex += idx * m_inputStrides[i];
288 } else {
289 if (internal::index_statically_eq<InputDimensions>(i, 1)) {
290 eigen_assert(idx % m_impl.dimensions()[i] == 0);
291 } else {
292 inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
293 }
294 }
295 index -= idx * m_outputStrides[i];
296 }
297 if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
298 eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
299 inputIndex += index;
300 } else {
301 if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
302 eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
303 } else {
304 inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
305 }
306 }
307 return inputIndex;
308 }
309
310 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
311 {
312 return m_impl.coeff(indexRowMajor(index));
313 }
314
315 template<int LoadMode>
316 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
317 {
318 if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
319 return internal::pset1<PacketReturnType>(m_impl.coeff(0));
320 }
321
322 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
323 if (isCopy) {
324 #ifdef EIGEN_GPU_COMPILE_PHASE
325 // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speed-up by enforcing
326 // unaligned loads here. The reason is unclear, though.
327 return m_impl.template packet<Unaligned>(index);
328 #else
329 return m_impl.template packet<LoadMode>(index);
330 #endif
331 } else if (oneByN && !nByOne) {
332 return packetNByOne<LoadMode>(index);
333 } else if (!oneByN && nByOne) {
334 return packetOneByN<LoadMode>(index);
335 } else if (oneByN && nByOne) {
336 return packetOneByNByOne<LoadMode>(index);
337 } else {
338 return packetColMajor<LoadMode>(index);
339 }
340 } else {
341 if (isCopy) {
342 #ifdef EIGEN_GPU_COMPILE_PHASE
343 // See above.
344 return m_impl.template packet<Unaligned>(index);
345 #else
346 return m_impl.template packet<LoadMode>(index);
347 #endif
348 } else if (oneByN && !nByOne) {
349 return packetOneByN<LoadMode>(index);
350 } else if (!oneByN && nByOne) {
351 return packetNByOne<LoadMode>(index);
352 } else if (oneByN && nByOne) {
353 return packetOneByNByOne<LoadMode>(index);
354 } else {
355 return packetRowMajor<LoadMode>(index);
356 }
357 }
358 }
359
360 template<int LoadMode>
361 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
362 (Index index) const
363 {
364 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
365 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
366
367 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
368 Index startDim, endDim;
369 Index inputIndex, outputOffset, batchedIndex;
370
371 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
372 startDim = NumDims - 1;
373 endDim = 1;
374 } else {
375 startDim = 0;
376 endDim = NumDims - 2;
377 }
378
379 batchedIndex = index % m_outputStrides[startDim];
380 inputIndex = batchedIndex / m_outputStrides[endDim];
381 outputOffset = batchedIndex % m_outputStrides[endDim];
382
383 if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
384 values[0] = m_impl.coeff(inputIndex);
385 return internal::pload1<PacketReturnType>(values);
386 } else {
387 EIGEN_UNROLL_LOOP
388 for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
389 if (outputOffset + cur < m_outputStrides[endDim]) {
390 values[i] = m_impl.coeff(inputIndex);
391 } else {
392 ++inputIndex;
393 inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
394 values[i] = m_impl.coeff(inputIndex);
395 outputOffset = 0;
396 cur = 0;
397 }
398 }
399 return internal::pload<PacketReturnType>(values);
400 }
401 }
402
403 template<int LoadMode>
404 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
405 {
406 // Consider the flattened tensor [v0, ..., vN]; this concatenates
407 // m_broadcast[dim] copies of it,
408 // [v0, ..., vN, v0, ..., vN, ... ],
409 // with dim == NumDims - 1 for col-major and dim == 0 for row-major.
410 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
411 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
412
413 // Size of flattened tensor.
414 const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
415 m_inputStrides[NumDims - 1] : m_inputStrides[0];
416 Index inputIndex = index % M;
417 if (inputIndex + PacketSize <= M) {
418 return m_impl.template packet<Unaligned>(inputIndex);
419 } else {
420 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
421 EIGEN_UNROLL_LOOP
422 for (int i = 0; i < PacketSize; ++i) {
423 if (inputIndex > M - 1) {
424 inputIndex = 0;
425 }
426 values[i] = m_impl.coeff(inputIndex++);
427 }
428 return internal::pload<PacketReturnType>(values);
429 }
430 }
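// Example (illustrative, not part of Eigen): with a flattened input of size M = 3,
// say [v0, v1, v2], broadcast into [v0, v1, v2, v0, v1, v2, ...], a packet of size 4
// starting at output index 2 crosses the wrap-around boundary, so the scalar path
// above gathers [v2, v0, v1, v2] into `values` and loads it with pload.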
431
432 template<int LoadMode>
433 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
434 {
435 // Consider the flattened tensor [v0, ..., vN]; this interleaves
436 // m_broadcast[dim] copies of each coefficient,
437 // [v0, v0, ..., v1, v1, ..., vN, vN, ... ],
438 // with dim == 0 for col-major and dim == NumDims - 1 for row-major.
439 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
440 eigen_assert(index + PacketSize-1 < dimensions().TotalSize());
441
442 const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
443 m_broadcast[0] : m_broadcast[NumDims - 1];
444
445 Index inputIndex = index / M;
446 Index outputOffset = index % M;
447 if (outputOffset + PacketSize <= M) {
448 return internal::pset1<PacketReturnType>(m_impl.coeff(inputIndex));
449 } else {
450 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
451 EIGEN_UNROLL_LOOP
452 for (int i = 0; i < PacketSize; ++i) {
453 if (outputOffset < M) {
454 values[i] = m_impl.coeff(inputIndex);
455 ++outputOffset;
456 } else {
457 values[i] = m_impl.coeff(++inputIndex);
458 outputOffset = 1; // Next offset.
459 }
460 }
461 return internal::pload<PacketReturnType>(values);
462 }
463 }
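// Example (illustrative, not part of Eigen): with broadcast factor M = 3 the output
// interleaves each input coefficient three times, [v0, v0, v0, v1, v1, v1, ...]. A
// packet of size 4 starting at output index 2 lands on the scalar path above and
// gathers [v0, v1, v1, v1] into `values`.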
464
465 // Ignore the LoadMode and always use unaligned loads since we can't guarantee
466 // the alignment at compile time.
467 template<int LoadMode>
468 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
469 {
470 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
471 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
472
473 const Index originalIndex = index;
474
475 Index inputIndex = 0;
476 EIGEN_UNROLL_LOOP
477 for (int i = NumDims - 1; i > 0; --i) {
478 const Index idx = index / m_outputStrides[i];
479 if (internal::index_statically_eq<Broadcast>(i, 1)) {
480 eigen_assert(idx < m_impl.dimensions()[i]);
481 inputIndex += idx * m_inputStrides[i];
482 } else {
483 if (internal::index_statically_eq<InputDimensions>(i, 1)) {
484 eigen_assert(idx % m_impl.dimensions()[i] == 0);
485 } else {
486 inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
487 }
488 }
489 index -= idx * m_outputStrides[i];
490 }
491 Index innermostLoc;
492 if (internal::index_statically_eq<Broadcast>(0, 1)) {
493 eigen_assert(index < m_impl.dimensions()[0]);
494 innermostLoc = index;
495 } else {
496 if (internal::index_statically_eq<InputDimensions>(0, 1)) {
497 eigen_assert(index % m_impl.dimensions()[0] == 0);
498 innermostLoc = 0;
499 } else {
500 innermostLoc = index % m_impl.dimensions()[0];
501 }
502 }
503 inputIndex += innermostLoc;
504
505 // TODO: this could be extended to the second dimension if we're not
506 // broadcasting along the first dimension, and so on.
507 if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
508 return m_impl.template packet<Unaligned>(inputIndex);
509 } else {
510 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
511 values[0] = m_impl.coeff(inputIndex);
512 EIGEN_UNROLL_LOOP
513 for (int i = 1; i < PacketSize; ++i) {
514 if (innermostLoc + i < m_impl.dimensions()[0]) {
515 values[i] = m_impl.coeff(inputIndex+i);
516 } else {
517 values[i] = coeffColMajor(originalIndex+i);
518 }
519 }
520 PacketReturnType rslt = internal::pload<PacketReturnType>(values);
521 return rslt;
522 }
523 }
524
525 template<int LoadMode>
526 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
527 {
528 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
529 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
530
531 const Index originalIndex = index;
532
533 Index inputIndex = 0;
534 EIGEN_UNROLL_LOOP
535 for (int i = 0; i < NumDims - 1; ++i) {
536 const Index idx = index / m_outputStrides[i];
537 if (internal::index_statically_eq<Broadcast>(i, 1)) {
538 eigen_assert(idx < m_impl.dimensions()[i]);
539 inputIndex += idx * m_inputStrides[i];
540 } else {
541 if (internal::index_statically_eq<InputDimensions>(i, 1)) {
542 eigen_assert(idx % m_impl.dimensions()[i] == 0);
543 } else {
544 inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
545 }
546 }
547 index -= idx * m_outputStrides[i];
548 }
549 Index innermostLoc;
550 if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
551 eigen_assert(index < m_impl.dimensions()[NumDims-1]);
552 innermostLoc = index;
553 } else {
554 if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
555 eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
556 innermostLoc = 0;
557 } else {
558 innermostLoc = index % m_impl.dimensions()[NumDims-1];
559 }
560 }
561 inputIndex += innermostLoc;
562
563 // TODO: this could be extended to the second dimension if we're not
564 // broadcasting along the first dimension, and so on.
565 if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
566 return m_impl.template packet<Unaligned>(inputIndex);
567 } else {
568 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
569 values[0] = m_impl.coeff(inputIndex);
570 EIGEN_UNROLL_LOOP
571 for (int i = 1; i < PacketSize; ++i) {
572 if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
573 values[i] = m_impl.coeff(inputIndex+i);
574 } else {
575 values[i] = coeffRowMajor(originalIndex+i);
576 }
577 }
578 PacketReturnType rslt = internal::pload<PacketReturnType>(values);
579 return rslt;
580 }
581 }
582
583 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
584 costPerCoeff(bool vectorized) const {
585 double compute_cost = TensorOpCost::AddCost<Index>();
586 if (!isCopy && NumDims > 0) {
587 EIGEN_UNROLL_LOOP
588 for (int i = NumDims - 1; i > 0; --i) {
589 compute_cost += TensorOpCost::DivCost<Index>();
590 if (internal::index_statically_eq<Broadcast>(i, 1)) {
591 compute_cost +=
592 TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
593 } else {
594 if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
595 compute_cost += TensorOpCost::MulCost<Index>() +
596 TensorOpCost::ModCost<Index>() +
597 TensorOpCost::AddCost<Index>();
598 }
599 }
600 compute_cost +=
601 TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
602 }
603 }
604 return m_impl.costPerCoeff(vectorized) +
605 TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
606 }
607
608 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
609 internal::TensorBlockResourceRequirements getResourceRequirements() const {
610 // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
611 // tensors. But this might need further tuning.
612 const size_t target_size = m_device.firstLevelCacheSize();
613 return internal::TensorBlockResourceRequirements::merge(
614 m_impl.getResourceRequirements(),
615 internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
616 }
617
618 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
619 block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
620 bool /*root_of_expr_ast*/ = false) const {
621 BlockBroadcastingParams params = blockBroadcastingParams(desc);
622
623 if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
624 return emptyBlock();
625 }
626
627 // Prepare storage for the materialized broadcasting result.
628 const typename TensorBlock::Storage block_storage =
629 TensorBlock::prepareStorage(desc, scratch);
630 ScalarNoConst* materialized_output = block_storage.data();
631
632 // We may need to materialize input blocks.
633 size_t materialized_input_size = 0;
634 ScalarNoConst* materialized_input = NULL;
635
636 // Initialize the block broadcasting iterator state for the outer dimensions
637 // (outer with regard to the bcast dimension). Dimensions in this array are
638 // always in innermost -> outermost order (col-major layout).
639 array<BlockBroadcastingIteratorState, NumDims> it;
640 int idx = 0;
641
642 for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
643 const Index dim = IsColMajor ? i : NumDims - 1 - i;
644 it[idx].size = params.output_dims[dim];
645 it[idx].count = 0;
646 it[idx].output_stride = m_outputStrides[dim];
647 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
648 idx++;
649 }
650
651 // Write output into the beginning of `materialized_output`.
652 Index output_offset = 0;
653
654 // We will fill the output block by broadcasting along the bcast dim and
655 // iterating over the outer dimensions.
656 const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
657
658 for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
659 ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
660 Index bcast_offset = desc.offset() + output_offset;
661
662 // Broadcast along the bcast dimension.
663 num_output_coeffs += BroadcastBlockAlongBcastDim(
664 params, bcast_offset, scratch, bcast_output, &materialized_input,
665 &materialized_input_size);
666
667 // Switch to the next outer dimension.
668 for (int j = 0; j < idx; ++j) {
669 if (++it[j].count < it[j].size) {
670 output_offset += it[j].output_stride;
671 break;
672 }
673 it[j].count = 0;
674 output_offset -= it[j].output_span;
675 }
676 }
677
678 return block_storage.AsTensorMaterializedBlock();
679 }
680
681 EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
682
683 const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
684
685 Broadcast functor() const { return m_broadcast; }
686#ifdef EIGEN_USE_SYCL
687 // binding placeholder accessors to a command group handler for SYCL
688 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
689 cl::sycl::handler& cgh) const {
690 m_impl.bind(cgh);
691 }
692#endif
693 private:
694 static const bool IsColMajor =
695 static_cast<int>(Layout) == static_cast<int>(ColMajor);
696
697 // We will build general-case block broadcasting on top of a broadcasting
698 // primitive that broadcasts only the inner dimension(s), up to the first
699 // dimension whose block size is smaller than the expression size (this is the `bcast_dim`).
700 //
701 // Example:
702 // dim: 0 1 2 (ColMajor)
703 // input size: [9, 3, 6]
704 // block size: [9, 2, 6]
705 //
706 // We compute the broadcasted block by iterating over the outer dimensions
707 // beyond `bcast_dim` (only dimension `2` in this example) and computing
708 // broadcasts along `bcast_dim` (dimension `1` in this example).
709
710 // BlockBroadcastingParams holds precomputed parameters for broadcasting a
711 // single block along the broadcasting dimension. Sizes and strides along the
712 // `bcast_dim` might be invalid; they will be adjusted later in
713 // `BroadcastBlockAlongBcastDim`.
714 struct BlockBroadcastingParams {
715 Dimensions input_dims; // input expression dimensions
716 Dimensions output_dims; // output block sizes
717 Dimensions output_strides; // output block strides
718
719 int inner_dim_count; // number of inner dimensions with matching size
720 int bcast_dim; // broadcasting dimension index
721 Index bcast_dim_size; // broadcasting dimension size
722 Index inner_dim_size; // total size of the inner dimensions
723
724 // Block sizes and strides for the input block where all dimensions before
725 // `bcast_dim` are equal to `1`.
726 Dimensions input_block_sizes;
727 Dimensions input_block_strides;
728
729 // Block sizes and strides for blocks with extra dimensions and strides `0`.
730 BroadcastDimensions bcast_block_sizes;
731 BroadcastDimensions bcast_block_strides;
732 BroadcastDimensions bcast_input_strides;
733 };
734
735 struct BlockBroadcastingIteratorState {
736 Index size;
737 Index count;
738 Index output_stride;
739 Index output_span;
740 };
741
742 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
743 blockBroadcastingParams(TensorBlockDesc& desc) const {
744 BlockBroadcastingParams params;
745
746 params.input_dims = Dimensions(m_impl.dimensions());
747
748 // Output block sizes and strides.
749 params.output_dims = desc.dimensions();
750 params.output_strides = internal::strides<Layout>(params.output_dims);
751
752 // Find the broadcasting dimension (the first dimension where the output block
753 // size is smaller than the corresponding broadcast expression dimension).
754 params.bcast_dim = 0;
755 params.bcast_dim_size = 1;
756 params.inner_dim_size = 1;
757
758 // Count the number of inner dimensions that have the same size in the block
759 // and in the broadcast expression.
760 params.inner_dim_count = 0;
761
762 for (int i = 0; i < NumDims; ++i) {
763 const int dim = IsColMajor ? i : NumDims - i - 1;
764
765 if (params.output_dims[dim] == m_dimensions[dim]) {
766 params.inner_dim_size *= params.output_dims[dim];
767 ++params.inner_dim_count;
768 continue;
769 }
770
771 // First non-matching dimension is the broadcasting dimension.
772 eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
773 params.bcast_dim = dim;
774 params.bcast_dim_size = params.output_dims[dim];
775 break;
776 }
777
778 // Calculate the input block sizes used for reading from the input.
779 for (int i = 0; i < params.inner_dim_count; ++i) {
780 const int dim = IsColMajor ? i : NumDims - i - 1;
781 params.input_block_sizes[dim] = params.input_dims[dim];
782 }
783 for (int i = params.inner_dim_count; i < NumDims; ++i) {
784 const int dim = IsColMajor ? i : NumDims - i - 1;
785 params.input_block_sizes[dim] = 1;
786 }
787 params.input_block_strides =
788 internal::strides<Layout>(params.input_block_sizes);
789
790 // Broadcast with the 0-stride trick: create one extra dimension for each
791 // broadcast and set its input stride to 0.
792 //
793 // When ColMajor:
794 //
795 // - bcast_block_sizes:
796 // [d_0, b_0, d_1, b_1, ...]
797 //
798 // - bcast_block_strides:
799 // [output_block_strides[0], output_block_strides[0] * d_0,
800 // output_block_strides[1], output_block_strides[1] * d_1,
801 // ...]
802 //
803 // - bcast_input_strides:
804 // [input_block_strides[0], 0,
805 // input_block_strides[1], 0,
806 // ...].
807 //
808 for (int i = 0; i < params.inner_dim_count; ++i) {
809 const int dim = IsColMajor ? i : NumDims - i - 1;
810
811 const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
812 const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
813
814 params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
815 params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
816 params.bcast_block_strides[copy_dim] = params.output_strides[dim];
817 params.bcast_block_strides[broadcast_dim] =
818 params.output_strides[dim] * params.input_dims[dim];
819 params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
820 params.bcast_input_strides[broadcast_dim] = 0;
821 }
822
823 for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
824 const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
825 params.bcast_block_sizes[dim] = 1;
826 params.bcast_block_strides[dim] = 0;
827 params.bcast_input_strides[dim] = 0;
828 }
829
830 return params;
831 }
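// Example of the parameters above (illustrative, not part of Eigen): for a ColMajor
// input of dims [2, 3], broadcast factors [2, 2] (full output [4, 6]) and an output
// block of dims [4, 2]:
//   inner_dim_count = 1 (only dim 0 matches: 4 == 4), bcast_dim = 1,
//   bcast_dim_size = 2, inner_dim_size = 4,
//   input_block_sizes = [2, 1], input_block_strides = [1, 2],
//   bcast_block_sizes   = [2, 2, 1, 1]   // [d_0, b_0, 1, 1]
//   bcast_block_strides = [1, 2, 0, 0]   // [out_stride_0, out_stride_0 * d_0, 0, 0]
//   bcast_input_strides = [1, 0, 0, 0]   // the zero stride repeats the copied run
// The sizes/strides along bcast_dim are filled in later by BroadcastBlockAlongBcastDim.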
832
833 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
834 DSizes<Index, NumDims> dimensions;
835 for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
836 return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
837 }
838
839 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
840 BlockBroadcastingParams params, Index bcast_offset,
841 TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
842 ScalarNoConst** materialized_input,
843 size_t* materialized_input_size) const {
844 if (params.bcast_dim_size == 1) {
845 // We just need one block read, using the values already set up above.
846 return BroadcastBlock(
847 params.input_block_sizes, params.input_block_strides,
848 params.bcast_block_sizes, params.bcast_block_strides,
849 params.bcast_input_strides, bcast_offset, 0, scratch,
850 materialized_output, materialized_input, materialized_input_size);
851
852 } else if (params.input_dims[params.bcast_dim] == 1) {
853 // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
854 const int broadcast_bcast_dim =
855 IsColMajor ? 2 * params.inner_dim_count + 1
856 : 2 * NumDims - 2 * params.inner_dim_count - 2;
857
858 params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
859 params.bcast_input_strides[broadcast_bcast_dim] = 0;
860 params.bcast_block_strides[broadcast_bcast_dim] =
861 params.output_strides[params.bcast_dim];
862
863 return BroadcastBlock(
864 params.input_block_sizes, params.input_block_strides,
865 params.bcast_block_sizes, params.bcast_block_strides,
866 params.bcast_input_strides, bcast_offset, 0, scratch,
867 materialized_output, materialized_input, materialized_input_size);
868
869 } else {
870 // Keep track of the total number of coefficients written to the
871 // output block.
872 Index num_output_coeffs = 0;
873
874 // The general case. Let's denote the output block as
875 //
876 // x[..., a:a+bcast_dim_size, :, ..., :]
877 //
878 // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
879 // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
880 // sub-blocks:
881 //
882 // (1) a:b, where b is the smallest multiple of
883 // input_dims[bcast_dim] in [a, a+bcast_dim_size].
884 //
885 // (2) b:c, where c is the largest multiple of input_dims[bcast_dim]
886 // in [a, a+bcast_dim_size].
887 //
888 // (3) c:a+bcast_dim_size .
889 //
890 // Or, when b and c do not exist, we just need to process the whole block
891 // together.
892
893 // Find a.
894 const Index bcast_dim_left_index =
895 bcast_offset / m_outputStrides[params.bcast_dim];
896
897 // Find b and c.
898 const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
899
900 // The first multiple of input_bcast_dim_size at or after a. This is b when it is
901 // <= bcast_dim_left_index + bcast_dim_size.
902 const Index first_multiple =
903 divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
904 input_bcast_dim_size;
905
906 if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
907 // b exists, so does c. Find it.
908 const Index last_multiple =
909 (bcast_dim_left_index + params.bcast_dim_size) /
910 input_bcast_dim_size * input_bcast_dim_size;
911 const int copy_bcast_dim =
912 IsColMajor ? 2 * params.inner_dim_count
913 : 2 * NumDims - 2 * params.inner_dim_count - 1;
914 const int broadcast_bcast_dim =
915 IsColMajor ? 2 * params.inner_dim_count + 1
916 : 2 * NumDims - 2 * params.inner_dim_count - 2;
917
918 if (first_multiple > bcast_dim_left_index) {
919 const Index head_size = first_multiple - bcast_dim_left_index;
920 params.input_block_sizes[params.bcast_dim] = head_size;
921 params.bcast_block_sizes[copy_bcast_dim] = head_size;
922 params.bcast_input_strides[copy_bcast_dim] =
923 params.input_block_strides[params.bcast_dim];
924 params.bcast_block_strides[copy_bcast_dim] =
925 params.output_strides[params.bcast_dim];
926 params.bcast_block_sizes[broadcast_bcast_dim] = 1;
927 params.bcast_input_strides[broadcast_bcast_dim] = 0;
928 params.bcast_block_strides[broadcast_bcast_dim] =
929 params.output_strides[params.bcast_dim] *
930 params.input_dims[params.bcast_dim];
931
932 num_output_coeffs += BroadcastBlock(
933 params.input_block_sizes, params.input_block_strides,
934 params.bcast_block_sizes, params.bcast_block_strides,
935 params.bcast_input_strides, bcast_offset, 0, scratch,
936 materialized_output, materialized_input, materialized_input_size);
937 }
938 if (first_multiple < last_multiple) {
939 params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
940 params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
941 params.bcast_input_strides[copy_bcast_dim] =
942 params.input_block_strides[params.bcast_dim];
943 params.bcast_block_strides[copy_bcast_dim] =
944 params.output_strides[params.bcast_dim];
945 params.bcast_block_sizes[broadcast_bcast_dim] =
946 (last_multiple - first_multiple) / input_bcast_dim_size;
947 params.bcast_input_strides[broadcast_bcast_dim] = 0;
948 params.bcast_block_strides[broadcast_bcast_dim] =
949 params.output_strides[params.bcast_dim] *
950 params.input_dims[params.bcast_dim];
951 const Index offset = (first_multiple - bcast_dim_left_index) *
952 m_outputStrides[params.bcast_dim];
953
954 num_output_coeffs += BroadcastBlock(
955 params.input_block_sizes, params.input_block_strides,
956 params.bcast_block_sizes, params.bcast_block_strides,
957 params.bcast_input_strides, bcast_offset, offset, scratch,
958 materialized_output, materialized_input, materialized_input_size);
959 }
960 if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
961 const Index tail_size =
962 bcast_dim_left_index + params.bcast_dim_size - last_multiple;
963 params.input_block_sizes[params.bcast_dim] = tail_size;
964 params.bcast_block_sizes[copy_bcast_dim] = tail_size;
965 params.bcast_input_strides[copy_bcast_dim] =
966 params.input_block_strides[params.bcast_dim];
967 params.bcast_block_strides[copy_bcast_dim] =
968 params.output_strides[params.bcast_dim];
969 params.bcast_block_sizes[broadcast_bcast_dim] = 1;
970 params.bcast_input_strides[broadcast_bcast_dim] = 0;
971 params.bcast_block_strides[broadcast_bcast_dim] =
972 params.output_strides[params.bcast_dim] *
973 params.input_dims[params.bcast_dim];
974 const Index offset = (last_multiple - bcast_dim_left_index) *
975 m_outputStrides[params.bcast_dim];
976
977 num_output_coeffs += BroadcastBlock(
978 params.input_block_sizes, params.input_block_strides,
979 params.bcast_block_sizes, params.bcast_block_strides,
980 params.bcast_input_strides, bcast_offset, offset, scratch,
981 materialized_output, materialized_input, materialized_input_size);
982 }
983 } else {
984 // b and c do not exist.
985 const int copy_bcast_dim =
986 IsColMajor ? 2 * params.inner_dim_count
987 : 2 * NumDims - 2 * params.inner_dim_count - 1;
988 params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
989 params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
990 params.bcast_input_strides[copy_bcast_dim] =
991 params.input_block_strides[params.bcast_dim];
992 params.bcast_block_strides[copy_bcast_dim] =
993 params.output_strides[params.bcast_dim];
994
995 num_output_coeffs += BroadcastBlock(
996 params.input_block_sizes, params.input_block_strides,
997 params.bcast_block_sizes, params.bcast_block_strides,
998 params.bcast_input_strides, bcast_offset, 0, scratch,
999 materialized_output, materialized_input, materialized_input_size);
1000 }
1001
1002 return num_output_coeffs;
1003 }
1004 }
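// Example of the general-case split above (illustrative, not part of Eigen): if
// input_dims[bcast_dim] = 3, the block starts at a = bcast_dim_left_index = 2 and
// bcast_dim_size = 7 (so the block covers [2, 9) along bcast_dim), then
//   b = first_multiple = divup(2, 3) * 3 = 3   and   c = last_multiple = 9 / 3 * 3 = 9,
// giving a head slice [2, 3) of size 1, a middle slice [3, 9) made of two full copies
// of the input run, and an empty tail slice [9, 9).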
1005
1006 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
1007 const Dimensions& input_block_sizes,
1008 const Dimensions& input_block_strides,
1009 const BroadcastDimensions& bcast_block_sizes,
1010 const BroadcastDimensions& bcast_block_strides,
1011 const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
1012 Index offset, TensorBlockScratch& scratch,
1013 ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
1014 size_t* materialized_input_size) const {
1015 // ---------------------------------------------------------------------- //
1016 // Tensor block descriptor for reading block from the input.
1017 const Index input_offset = bcast_offset + offset;
1018 TensorBlockDesc input_desc(
1019 IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
1020 input_block_sizes);
1021
1022 ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
1023
1024 // ---------------------------------------------------------------------- //
1025 // Materialize input block into a temporary memory buffer only if it's not
1026 // already available in the arg block.
1027 const ScalarNoConst* input_buffer = NULL;
1028
1029 if (input_block.data() != NULL) {
1030 // Input block already has raw data; there is no need to materialize it.
1031 input_buffer = input_block.data();
1032
1033 } else {
1034 // Otherwise we have to do block assignment into a temporary buffer.
1035
1036 // Reuse a previously allocated buffer if possible, or allocate a new one with the
1037 // scratch allocator.
1038 const size_t input_total_size = input_block_sizes.TotalSize();
1039 if (*materialized_input == NULL ||
1040 *materialized_input_size < input_total_size) {
1041 *materialized_input_size = input_total_size;
1042 void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
1043 *materialized_input = static_cast<ScalarNoConst*>(mem);
1044 }
1045
1046 typedef internal::TensorBlockAssignment<
1047 ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
1048 TensorBlockAssignment;
1049
1050 TensorBlockAssignment::Run(
1051 TensorBlockAssignment::target(input_block_sizes, input_block_strides,
1052 *materialized_input),
1053 input_block.expr());
1054
1055 input_buffer = *materialized_input;
1056 }
1057
1058 // ---------------------------------------------------------------------- //
1059 // Copy data from the materialized input block to the materialized output, using
1060 // the given broadcast strides (strides with zeros).
1061 typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
1062 TensorBlockIO;
1063
1064 typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
1065 typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
1066 materialized_output + offset);
1067
1068 return TensorBlockIO::Copy(dst, src);
1069 }
1070
1071protected:
1072 const Device EIGEN_DEVICE_REF m_device;
1073 const typename internal::remove_reference<Broadcast>::type m_broadcast;
1074 Dimensions m_dimensions;
1075 array<Index, NumDims> m_outputStrides;
1076 array<Index, NumDims> m_inputStrides;
1077 TensorEvaluator<ArgType, Device> m_impl;
1078};
1079
1080
1081} // end namespace Eigen
1082
1083#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H