#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
#else
#define KERNEL_FRIEND friend
#endif

#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace internal {
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> > : traits<XprType> {
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;

  template <class T>
  struct MakePointer {
    // Intermediate typedef to work around an MSVC issue.
    typedef MakePointer_<T> MakePointerT;
    typedef typename MakePointerT::Type Type;
  };
};
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> {
  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1,
              typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type> {
  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};
template <typename OutputDims>
struct DimInitializer {
  template <typename InputDims, typename ReducedDims>
  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims,
                                    const array<bool, internal::array_size<InputDims>::value>& reduced,
                                    OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};
template <>
struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank>
  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array<bool, Rank>&, Sizes<>*,
                                    array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};
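
// Illustrative sketch (not part of the implementation): how DimInitializer splits an
// input shape. Assuming a rank-3 input of size 4x5x6 where dimension 1 is reduced,
// run() fills output_dims with {4, 6} and reduced_dims with {5}:
//
//   Eigen::DSizes<Eigen::Index, 3> input_dims(4, 5, 6);
//   Eigen::array<bool, 3> reduced = {{false, true, false}};
//   Eigen::DSizes<Eigen::Index, 2> output_dims;
//   Eigen::array<Eigen::Index, 1> reduced_dims;
//   Eigen::internal::DimInitializer<Eigen::DSizes<Eigen::Index, 2> >::run(input_dims, reduced, &output_dims,
//                                                                         &reduced_dims);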
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 =
      index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, array_size<ReducedDims>::value - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
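
// Worked example (illustrative): these traits only evaluate to true when the reduced
// dimensions are statically known. For a rank-4 tensor:
//   - ColMajor: reducing {0, 1} (e.g. IndexList<type2index<0>, type2index<1> >) makes
//     are_inner_most_dims true, since dimensions 0 and 1 are the fastest varying.
//   - RowMajor: reducing {2, 3} makes are_inner_most_dims true, since the trailing
//     dimensions are the fastest varying.
// With runtime dimension lists (e.g. array<int, 2>) the static index checks return
// false, so the value is conservatively false.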
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
                                                           Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
                                                           Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer,
                                                           typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && !Self::ReducerTraits::IsExactlyAssociative &&
                                   !Self::RunningOnGPU)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, false> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
    using Index = typename Self::Index;
    constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    Index start = 0;
    typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
    if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) {
      const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize);
      typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      const Index offset0 = firstIndex;
      const Index offset1 = firstIndex + packetSize;
      const Index offset2 = firstIndex + 2 * packetSize;
      const Index offset3 = firstIndex + 3 * packetSize;
      for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) {
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
      }
      reducer0.reducePacket(paccum1, &paccum0);
      reducer0.reducePacket(paccum2, &paccum0);
      reducer0.reducePacket(paccum3, &paccum0);
      start = VectorizedSize4;
    }
    if (start <= (numValuesToReduce - packetSize)) {
      const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
      for (Index j = start; j < VectorizedSize; j += packetSize) {
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
      }
      start = VectorizedSize;
    }
    typename Self::CoeffReturnType accum = reducer0.initialize();
    for (Index j = start; j < numValuesToReduce; ++j) {
      reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer0.finalizeBoth(accum, paccum0);
  }
};
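
// The vectorized specialization above keeps four independent packet accumulators so the
// reduction does not serialize on a single dependency chain. A minimal scalar sketch of
// the same idea (illustrative only, plain C++):
//
//   float sum4(const float* x, long n) {
//     float a0 = 0, a1 = 0, a2 = 0, a3 = 0;
//     long j = 0;
//     for (; j + 4 <= n; j += 4) {  // four independent accumulators
//       a0 += x[j]; a1 += x[j + 1]; a2 += x[j + 2]; a3 += x[j + 3];
//     }
//     for (; j < n; ++j) a0 += x[j];  // scalar tail
//     return (a0 + a1) + (a2 + a3);   // combine the accumulators at the end
//   }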
#if !defined(EIGEN_HIPCC)
// Leaf sizes for the tree-based reduction used when the reducer is not exactly associative.
template <typename T>
EIGEN_DEVICE_FUNC inline Index LeafSize() {
  return 1024;
}
template <>
EIGEN_DEVICE_FUNC inline Index LeafSize<half>() {
  return 200;
}
template <>
EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() {
  return 128;
}
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, false, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
    typename Self::CoeffReturnType accum = reducer.initialize();
    if (numValuesToReduce > kLeafSize) {
      const typename Self::Index half = numValuesToReduce / 2;
      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
      reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum);
      return reducer.finalize(accum);
    } else {
      return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    typename Self::CoeffReturnType accum = reducer.initialize();
    if (numValuesToReduce > packetSize * kLeafSize) {
      // Split the range at a packet boundary.
      const typename Self::Index split =
          packetSize *
          numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)), packetSize);
      const typename Self::Index num_left = numext::mini(split - firstIndex, numValuesToReduce);
      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
      if (num_left < numValuesToReduce) {
        reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
      }
      return reducer.finalize(accum);
    } else {
      return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
    }
  }
};
#endif
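
// The tree reduction above bounds the length of each accumulation chain, which improves
// the accuracy of reducers that are not exactly associative (e.g. float sums). A minimal
// scalar sketch of the recursion (illustrative only):
//
//   float tree_sum(const float* x, long n, long leaf_size) {
//     if (n <= leaf_size) {      // small ranges: plain sequential sum
//       float acc = 0;
//       for (long j = 0; j < n; ++j) acc += x[j];
//       return acc;
//     }
//     const long half = n / 2;   // large ranges: reduce the halves independently
//     return tree_sum(x, half, leaf_size) + tree_sum(x + half, n - half, leaf_size);
//   }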
template <int DimIndex, typename Self, typename Op,
          bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
                                                           typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};
template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
                                                           Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
                                                           Op& reducer0, typename Self::PacketReturnType* accum0) {
    using Index = typename Self::Index;
    const Index stride = self.m_reducedStrides[0];
    const Index size = self.m_reducedDims[0];
    if (!Self::ReducerTraits::IsStateful && size >= 16) {
      const Index unrolled_size4 = (size / 4) * 4;
      typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      for (Index j = 0; j < unrolled_size4; j += 4) {
        const Index input0 = firstIndex + j * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
        const Index input1 = firstIndex + (j + 1) * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
        const Index input2 = firstIndex + (j + 2) * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
        const Index input3 = firstIndex + (j + 3) * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
      }
      reducer0.reducePacket(accum1, accum0);
      reducer0.reducePacket(accum2, accum0);
      reducer0.reducePacket(accum3, accum0);
      for (Index j = unrolled_size4; j < size; ++j) {
        Index input = firstIndex + j * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
      }
    } else {
      for (Index j = 0; j < size; ++j) {
        Index input = firstIndex + j * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
      }
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
                                                           typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};
// Default full reducer.
template <typename Self, typename Op, typename Device,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducer {
  static constexpr bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&,
                                    typename Self::EvaluatorPointerType output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};
#ifdef EIGEN_USE_THREADS
// Multithreaded full reducer: shards the flat input range across the thread pool.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
  static constexpr Index PacketSize = unpacket_traits<typename Self::PacketReturnType>::size;

  // Launch one reducer per block and accumulate the partial results.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) +
                              TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
    const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize = num_coeffs / num_threads;
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      auto run_shard = [i, blocksize, &self, &barrier, &shards, &reducer]() {
        shards[i] = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, i * blocksize, blocksize, reducer);
        barrier.Notify();
      };
      device.enqueue(std::move(run_shard));
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, numblocks * blocksize,
                                                                       num_coeffs - numblocks * blocksize, reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};
#endif
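
// The ThreadPoolDevice full reducer above splits the flat index range into blocks, reduces
// each block on a worker thread, and combines the per-block results once the barrier is
// released. A device-free sketch of the same pattern (illustrative only; std::thread
// instead of Eigen's thread pool):
//
//   #include <thread>
//   #include <vector>
//   inline float parallel_sum(const float* x, long n, int num_threads) {
//     const long block = n / num_threads;
//     std::vector<float> shard(num_threads, 0.f);
//     std::vector<std::thread> workers;
//     for (int t = 0; t < num_threads; ++t) {
//       const long begin = t * block;
//       const long end = (t == num_threads - 1) ? n : begin + block;
//       workers.emplace_back([&shard, x, t, begin, end] {
//         for (long j = begin; j < end; ++j) shard[t] += x[j];
//       });
//     }
//     for (std::thread& w : workers) w.join();  // analogous to barrier.Wait()
//     float result = 0;
//     for (float s : shard) result += s;        // combine the shards
//     return result;
//   }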
// Default inner reducer: no device-specific optimized implementation.
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static constexpr bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
                                    typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
// Default outer reducer: no device-specific optimized implementation.
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static constexpr bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
                                    typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
// Default generic reducer: no device-specific optimized implementation.
template <typename Self, typename Op, typename Device>
struct GenericReducer {
  static constexpr bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
                                    typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
                                                                 unsigned int*);

#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*,
                                                                          internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
template <typename Op, typename CoeffReturnType>
struct ReductionReturnType {
#if defined(EIGEN_USE_SYCL)
  typedef std::remove_const_t<decltype(std::declval<Op>().initialize())> type;
#else
  typedef std::remove_const_t<CoeffReturnType> type;
#endif
};

}  // end namespace internal
/** \class TensorReductionOp
 * \brief Tensor reduction class.
 */
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
 public:
  typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims)
      : m_expr(expr), m_dims(dims) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer)
      : m_expr(expr), m_dims(dims), m_reducer(reducer) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; }

 protected:
  typename XprType::Nested m_expr;
  const Dims m_dims;
  const Op m_reducer;
};
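
// Example usage (illustrative): reduction expressions are normally built through the
// TensorBase API rather than by constructing TensorReductionOp directly, e.g.
//
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::array<int, 1> dims = {{1}};
//   Eigen::Tensor<float, 2> s = t.sum(dims);  // reduce dimension 1 -> 2x4 result
//   Eigen::Tensor<float, 0> m = t.maximum();  // full reduction -> rank-0 tensor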
template <typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;
// Evaluator for the (const) tensor reduction expression.
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
  typedef internal::reducer_traits<Op, Device> ReducerTraits;
  typedef Dims ReducedDims;
  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static constexpr int NumInputDims = internal::array_size<InputDimensions>::value;
  static constexpr int NumReducedDims = internal::array_size<Dims>::value;
  static constexpr int NumOutputDims = NumInputDims - NumReducedDims;
  typedef std::conditional_t<NumOutputDims == 0, Sizes<>, DSizes<Index, NumOutputDims> > Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
  static constexpr bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr Index PacketSize = PacketType<CoeffReturnType, Device>::size;

  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  // Number of preserved strides (at least 1, even for full reductions).
  static constexpr int NumPreservedStrides = max_n_1<NumOutputDims>::size;

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  static constexpr bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
  static constexpr bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
  static constexpr bool RunningOnSycl = internal::is_same<internal::remove_all_t<Device>, Eigen::SyclDevice>::value;
  static constexpr bool RunningOnGPU = false;
#else
  static constexpr bool RunningOnGPU = false;
  static constexpr bool RunningOnSycl = false;
#endif

  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
    BlockAccess = false,
    PreferBlockAccess = true,
    CoordAccess = false,
    RawAccess = false
  };

  typedef std::remove_const_t<Scalar> ScalarNoConst;

  // Block-based evaluation is not implemented for reductions.
  typedef internal::TensorBlockNotImplemented TensorBlock;

  static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static constexpr bool RunningFullReduction = (NumOutputDims == 0);
  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap of reduced dimensions.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
        }
      } else {
        m_outputStrides[static_cast<size_t>(NumOutputDims - 1)] = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
        }
      }
    }

    // Precompute input strides and split them into reduced and preserved strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          m_output_to_input_dim_map[outputIndex] = i;
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions.
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }

    m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims)
                          : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                              ? m_preservedStrides[0]
                              : m_preservedStrides[static_cast<size_t>(NumOutputDims - 1)];
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
    // Use the FullReducer if possible.
    if ((RunningFullReduction && RunningOnSycl) ||
        (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<EvaluatorPointerType>(
            m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
        data = m_result;
        need_assign = true;
      }
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }

    // Attempt to use an optimized partial reduction.
    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
               num_values_to_reduce > 128) ||
              (RunningOnSycl)) {
            data = static_cast<EvaluatorPointerType>(m_device.get(
                (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
            m_result = data;
          } else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
                                                          num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation && preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
               num_values_to_reduce > 32) ||
              (RunningOnSycl)) {
            data = static_cast<EvaluatorPointerType>(m_device.get(
                (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
            m_result = data;
          } else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
                                                          num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

#if defined(EIGEN_USE_SYCL)
      // If no optimized implementation applies, fall back to the generic SYCL reducer.
      if (RunningOnSycl) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          data = static_cast<EvaluatorPointerType>(m_device.get(
              (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
          m_result = data;
        }
        Op reducer(m_reducer);
        internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
                                                        num_coeffs_to_preserve);
        return (m_result != NULL);
      }
#endif
    }
    return true;
  }
#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); });
  }
#endif

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return evalSubExprsIfNeededCommon(data);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate_temp(m_result);
      m_result = NULL;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    if ((RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                                             ? m_preservedStrides[0]
                                             : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims - 1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                                             ? m_preservedStrides[0]
                                             : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims - 1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
             TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }
  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
 private:
  template <int, typename, typename>
  friend struct internal::GenericDimReducer;
  template <typename, typename, bool, bool>
  friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool>
  friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V>
  friend struct internal::FullReducer;
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  template <int B, int N, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
  template <typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
                                                                     internal::packet_traits<Eigen::half>::type*);
  template <int B, int N, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*,
                                                            internal::packet_traits<Eigen::half>::type*);
  template <int NPT, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
  template <int NPT, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
  template <int NPT, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif

#if defined(EIGEN_USE_SYCL)
  template <typename Evaluator_, typename Op__>
  friend class TensorSycl::internal::GenericNondeterministicReducer;
  // SYCL needs the generic reducer when the reduction is neither inner, outer nor full.
  template <typename, typename, typename>
  friend struct internal::GenericReducer;
#endif

  template <typename S, typename O, typename D>
  friend struct internal::InnerReducer;

  struct BlockIteratorState {
    Index input_dim;
    Index output_size;
    Index output_count;
  };
  // Returns the index in the input tensor of the first value that needs to be used to
  // compute the reduction at output index "index".
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }
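
  // Worked example (illustrative): for a ColMajor 4x5x6 input reduced over dimension 1,
  // the output is 4x6, m_outputStrides = {1, 4} and m_preservedStrides = {1, 20}. Output
  // index 9 (output coordinate (1, 2)) maps to first input index 2 * 20 + 1 * 1 = 41, and
  // the reduction then visits 5 values from there with stride m_reducedStrides[0] = 4.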
  // Bitmap indicating whether each input dimension is reduced.
  array<bool, NumInputDims> m_reduced;
  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, (std::max)(NumOutputDims, 1)> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, (std::max)(NumOutputDims, 1)> m_fastOutputStrides;
  array<Index, (std::max)(NumPreservedStrides, 1)> m_preservedStrides;
  // Map from output dimension index to the corresponding input dimension index.
  array<Index, (std::max)(NumOutputDims, 1)> m_output_to_input_dim_map;
  // How many values go into each reduction.
  Index m_numValuesToReduce;

  // Strides of the input tensor for the reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Sizes of the input dimensions that are reduced.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  EvaluatorPointerType m_result;

  const Device EIGEN_DEVICE_REF m_device;
};
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {}
};
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
                                          Eigen::SyclDevice> {
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
      Base;
  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device)
      : Base(op, device) {}

  // For SYCL the reduction result is already available in the evaluator buffer after
  // evalSubExprsIfNeeded(), so coeff() and packet() read it directly.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
    return *(this->data() + index);
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
  }
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H