Eigen-unsupported 5.0.1-dev+284dcc12
 
TensorReduction.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
6//
7// This Source Code Form is subject to the terms of the Mozilla
8// Public License v. 2.0. If a copy of the MPL was not distributed
9// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
11#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
12#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
13
14// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
15// so we'll use a macro to make clang happy.
16#ifndef KERNEL_FRIEND
17#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
18#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
19#else
20#define KERNEL_FRIEND friend
21#endif
22#endif
23
24// IWYU pragma: private
25#include "./InternalHeaderCheck.h"
26
27namespace Eigen {
28
29namespace internal {
30template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
31struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> > : traits<XprType> {
32 typedef traits<XprType> XprTraits;
33 typedef typename XprTraits::Scalar Scalar;
34 typedef typename XprTraits::StorageKind StorageKind;
35 typedef typename XprTraits::Index Index;
36 typedef typename XprType::Nested Nested;
37 static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
38 static constexpr int Layout = XprTraits::Layout;
39 typedef typename XprTraits::PointerType PointerType;
40
41 template <class T>
42 struct MakePointer {
43 // Intermediate typedef to work around an MSVC issue.
44 typedef MakePointer_<T> MakePointerT;
45 typedef typename MakePointerT::Type Type;
46 };
47};
48
49template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
50struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> {
51 typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
52};
53
54template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
55struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1,
56 typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type> {
57 typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
58};
59
60template <typename OutputDims>
61struct DimInitializer {
62 template <typename InputDims, typename ReducedDims>
63 EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims,
64 const array<bool, internal::array_size<InputDims>::value>& reduced,
65 OutputDims* output_dims, ReducedDims* reduced_dims) {
66 const int NumInputDims = internal::array_size<InputDims>::value;
67 int outputIndex = 0;
68 int reduceIndex = 0;
69 for (int i = 0; i < NumInputDims; ++i) {
70 if (reduced[i]) {
71 (*reduced_dims)[reduceIndex] = input_dims[i];
72 ++reduceIndex;
73 } else {
74 (*output_dims)[outputIndex] = input_dims[i];
75 ++outputIndex;
76 }
77 }
78 }
79};
80
81template <>
82struct DimInitializer<Sizes<> > {
83 template <typename InputDims, typename Index, size_t Rank>
84 EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array<bool, Rank>&, Sizes<>*,
85 array<Index, Rank>* reduced_dims) {
86 const int NumInputDims = internal::array_size<InputDims>::value;
87 for (int i = 0; i < NumInputDims; ++i) {
88 (*reduced_dims)[i] = input_dims[i];
89 }
90 }
91};
92
93template <typename ReducedDims, int NumTensorDims, int Layout>
94struct are_inner_most_dims {
95 static const bool value = false;
96};
97template <typename ReducedDims, int NumTensorDims, int Layout>
98struct preserve_inner_most_dims {
99 static const bool value = false;
100};
101
102template <typename ReducedDims, int NumTensorDims>
103struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
104 static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
105 static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
106 static const bool tmp3 =
107 index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, array_size<ReducedDims>::value - 1);
108 static const bool value = tmp1 & tmp2 & tmp3;
109};
110template <typename ReducedDims, int NumTensorDims>
111struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
112 static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
113 static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
114 static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
115 static const bool value = tmp1 & tmp2 & tmp3;
116};
117template <typename ReducedDims, int NumTensorDims>
118struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
119 static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
120 static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
121 static const bool value = tmp1 & tmp2;
122};
123template <typename ReducedDims, int NumTensorDims>
124struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
125 static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
126 static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
127 static const bool value = tmp1 & tmp2;
128};
129
130template <int DimIndex, typename Self, typename Op>
131struct GenericDimReducer {
132 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
133 Op& reducer, typename Self::CoeffReturnType* accum) {
134 EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
135 for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
136 const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
137 GenericDimReducer<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
138 }
139 }
140};
141template <typename Self, typename Op>
142struct GenericDimReducer<0, Self, Op> {
143 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
144 Op& reducer, typename Self::CoeffReturnType* accum) {
145 for (int j = 0; j < self.m_reducedDims[0]; ++j) {
146 const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
147 reducer.reduce(self.m_impl.coeff(input), accum);
148 }
149 }
150};
151template <typename Self, typename Op>
152struct GenericDimReducer<-1, Self, Op> {
153 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer,
154 typename Self::CoeffReturnType* accum) {
155 reducer.reduce(self.m_impl.coeff(index), accum);
156 }
157};
158
159template <typename Self, typename Op,
160 bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
161 bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && !Self::ReducerTraits::IsExactlyAssociative &&
162 // GPU threads can quickly run out of stack space
163 // for moderately sized inputs.
164 !Self::RunningOnGPU)>
165struct InnerMostDimReducer {
166 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
167 const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
168 typename Self::CoeffReturnType accum = reducer.initialize();
169 for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
170 reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
171 }
172 return reducer.finalize(accum);
173 }
174};
175
176template <typename Self, typename Op>
177struct InnerMostDimReducer<Self, Op, true, false> {
178 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
179 const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
180 using Index = typename Self::Index;
181 constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
182 Index start = 0;
183 typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
184 if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) {
185 const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize);
186 typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
187 typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
188 typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
189 const Index offset0 = firstIndex;
190 const Index offset1 = firstIndex + packetSize;
191 const Index offset2 = firstIndex + 2 * packetSize;
192 const Index offset3 = firstIndex + 3 * packetSize;
193 for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) {
194 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
195 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
196 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
197 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
198 }
199 reducer0.reducePacket(paccum1, &paccum0);
200 reducer0.reducePacket(paccum2, &paccum0);
201 reducer0.reducePacket(paccum3, &paccum0);
202 start = VectorizedSize4;
203 }
204 if (start <= (numValuesToReduce - packetSize)) {
205 const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
206 for (Index j = start; j < VectorizedSize; j += packetSize) {
207 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
208 }
209 start = VectorizedSize;
210 }
211 typename Self::CoeffReturnType accum = reducer0.initialize();
212 for (Index j = start; j < numValuesToReduce; ++j) {
213 reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
214 }
215 return reducer0.finalizeBoth(accum, paccum0);
216 }
217};
218
219#if !defined(EIGEN_HIPCC)
220
221// The following implements tree-based reduction, which improves the accuracy
222// of sum and mean reductions, since each of the n inputs only participates in
223// O(log n) additions.
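// For example, pairwise-summing n = 1024 floats gives each input an addition depth of
// log2(1024) = 10, so worst-case rounding error grows like O(eps * log n) rather than
// the O(eps * n) of a strictly sequential accumulation.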
224template <typename T>
225EIGEN_DEVICE_FUNC inline Index LeafSize() {
226 return 1024;
227}
228template <>
229EIGEN_DEVICE_FUNC inline Index LeafSize<half>() {
230 return 200;
231}
232template <>
233EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() {
234 return 128;
235}
236
237template <typename Self, typename Op>
238struct InnerMostDimReducer<Self, Op, false, true> {
239 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
240 const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
241 const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
242 typename Self::CoeffReturnType accum = reducer.initialize();
243 if (numValuesToReduce > kLeafSize) {
244 const typename Self::Index half = numValuesToReduce / 2;
245 // Recursively reduce the two halves.
246 reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
247 reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum);
248 return reducer.finalize(accum);
249 } else {
250 return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
251 }
252 }
253};
254
255template <typename Self, typename Op>
256struct InnerMostDimReducer<Self, Op, true, true> {
257 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
258 const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
259 const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
260 const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
261 typename Self::CoeffReturnType accum = reducer.initialize();
262 if (numValuesToReduce > packetSize * kLeafSize) {
263 // Make sure the split point is aligned on a packet boundary.
264 const typename Self::Index split =
265 packetSize *
266 numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)), packetSize);
267 const typename Self::Index num_left = numext::mini(split - firstIndex, numValuesToReduce);
268 reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
269 if (num_left < numValuesToReduce) {
270 reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
271 }
272 return reducer.finalize(accum);
273 } else {
274 return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
275 }
276 }
277};
278#endif
279
280template <int DimIndex, typename Self, typename Op,
281 bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
282struct InnerMostDimPreserver {
283 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
284 typename Self::PacketReturnType*) {
285 eigen_assert(false && "should never be called");
286 }
287};
288
289template <int DimIndex, typename Self, typename Op>
290struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
291 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
292 Op& reducer, typename Self::PacketReturnType* accum) {
293 EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
294 for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
295 const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
296 InnerMostDimPreserver<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
297 }
298 }
299};
300
301template <typename Self, typename Op>
302struct InnerMostDimPreserver<0, Self, Op, true> {
303 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
304 Op& reducer0, typename Self::PacketReturnType* accum0) {
305 using Index = typename Self::Index;
306 const Index stride = self.m_reducedStrides[0];
307 const Index size = self.m_reducedDims[0];
308 if (!Self::ReducerTraits::IsStateful && size >= 16) {
309 const Index unrolled_size4 = (size / 4) * 4;
310 typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
311 typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
312 typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
313 for (Index j = 0; j < unrolled_size4; j += 4) {
314 const Index input0 = firstIndex + j * stride;
315 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
316 const Index input1 = firstIndex + (j + 1) * stride;
317 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
318 const Index input2 = firstIndex + (j + 2) * stride;
319 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
320 const Index input3 = firstIndex + (j + 3) * stride;
321 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
322 }
323 reducer0.reducePacket(accum1, accum0);
324 reducer0.reducePacket(accum2, accum0);
325 reducer0.reducePacket(accum3, accum0);
326 for (Index j = unrolled_size4; j < size; ++j) {
327 Index input = firstIndex + j * stride;
328 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
329 }
330 } else {
331 for (Index j = 0; j < size; ++j) {
332 Index input = firstIndex + j * stride;
333 reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
334 }
335 }
336 }
337};
338template <typename Self, typename Op>
339struct InnerMostDimPreserver<-1, Self, Op, true> {
340 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
341 typename Self::PacketReturnType*) {
342 eigen_assert(false && "should never be called");
343 }
344};
345
346// Default full reducer
347template <typename Self, typename Op, typename Device,
348 bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
349struct FullReducer {
350 static constexpr bool HasOptimizedImplementation = false;
351
352 static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&,
353 typename Self::EvaluatorPointerType output) {
354 const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
355 *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
356 }
357};
358
359#ifdef EIGEN_USE_THREADS
360// Multithreaded full reducer
361template <typename Self, typename Op, bool Vectorizable>
362struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
363 static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
364 static constexpr Index PacketSize = unpacket_traits<typename Self::PacketReturnType>::size;
365
366 // launch one reducer per thread and accumulate the result.
367 static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
368 typename Self::CoeffReturnType* output) {
369 typedef typename Self::Index Index;
370 const Index num_coeffs = array_prod(self.m_impl.dimensions());
371 if (num_coeffs == 0) {
372 *output = reducer.finalize(reducer.initialize());
373 return;
374 }
375 const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) +
376 TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
377 const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(num_coeffs, cost, device.numThreads());
378 if (num_threads == 1) {
379 *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
380 return;
381 }
382 const Index blocksize = num_coeffs / num_threads;
383 const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
384 eigen_assert(num_coeffs >= numblocks * blocksize);
385
386 Barrier barrier(internal::convert_index<unsigned int>(numblocks));
387 MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
388 for (Index i = 0; i < numblocks; ++i) {
389 auto run_shard = [i, blocksize, &self, &barrier, &shards, &reducer](){
390 shards[i] = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, i * blocksize, blocksize, reducer);
391 barrier.Notify();
392 };
393 device.enqueue(std::move(run_shard));
394 }
395 typename Self::CoeffReturnType finalShard;
396 if (numblocks * blocksize < num_coeffs) {
397 finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, numblocks * blocksize,
398 num_coeffs - numblocks * blocksize, reducer);
399 } else {
400 finalShard = reducer.initialize();
401 }
402 barrier.Wait();
403
404 for (Index i = 0; i < numblocks; ++i) {
405 reducer.reduce(shards[i], &finalShard);
406 }
407 *output = reducer.finalize(finalShard);
408 }
409};
410
411#endif
412
413// Default inner reducer
414template <typename Self, typename Op, typename Device>
415struct InnerReducer {
416 static constexpr bool HasOptimizedImplementation = false;
417
418 EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
419 typename Self::Index, typename Self::Index) {
420 eigen_assert(false && "Not implemented");
421 return true;
422 }
423};
424
425// Default outer reducer
426template <typename Self, typename Op, typename Device>
427struct OuterReducer {
428 static constexpr bool HasOptimizedImplementation = false;
429
430 EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
431 typename Self::Index, typename Self::Index) {
432 eigen_assert(false && "Not implemented");
433 return true;
434 }
435};
436
437#ifdef EIGEN_USE_SYCL
438// Default Generic reducer
439template <typename Self, typename Op, typename Device>
440struct GenericReducer {
441 static constexpr bool HasOptimizedImplementation = false;
442
443 EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
444 typename Self::Index, typename Self::Index) {
445 eigen_assert(false && "Not implemented");
446 return true;
447 }
448};
449#endif
450
451#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
452template <int B, int N, typename S, typename R, typename I_>
453__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
454 unsigned int*);
455
456#if defined(EIGEN_HAS_GPU_FP16)
457template <typename S, typename R, typename I_>
458__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
459 R, const S, I_, internal::packet_traits<half>::type*);
460template <int B, int N, typename S, typename R, typename I_>
461__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*,
462 internal::packet_traits<half>::type*);
463template <int NPT, typename S, typename R, typename I_>
464__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
465
466#endif
467
468template <int NPT, typename S, typename R, typename I_>
469__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
470
471template <int NPT, typename S, typename R, typename I_>
472__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
473#endif
474
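// The helper below deduces the reduction's return type: for SYCL it is taken from
// Op::initialize(), so the accumulator may use a different type than the input scalar;
// otherwise it is simply the non-const CoeffReturnType.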
483template <typename Op, typename CoeffReturnType>
484struct ReductionReturnType {
485#if defined(EIGEN_USE_SYCL)
486 typedef std::remove_const_t<decltype(std::declval<Op>().initialize())> type;
487#else
488 typedef std::remove_const_t<CoeffReturnType> type;
489#endif
490};
491
492} // end namespace internal
493
494/** \class TensorReductionOp
495 * \ingroup CXX11_Tensor_Module
496 *
497 * \brief Tensor reduction class.
498 *
499 */
500template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
501class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
502 public:
503 typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
504 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
505 typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
506 typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
507 typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
508 typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
509
510 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims)
511 : m_expr(expr), m_dims(dims) {}
512 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer)
513 : m_expr(expr), m_dims(dims), m_reducer(reducer) {}
514
515 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
516 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; }
517 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; }
518
519 protected:
520 typename XprType::Nested m_expr;
521 const Dims m_dims;
522 const Op m_reducer;
523};
524
525template <typename ArgType, typename Device>
526struct TensorReductionEvaluatorBase;
527
528// Eval as rvalue
529template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
530struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
531 typedef internal::reducer_traits<Op, Device> ReducerTraits;
532 typedef Dims ReducedDims;
533 typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
534 typedef typename XprType::Index Index;
535 typedef ArgType ChildType;
536 typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
537 static constexpr int NumInputDims = internal::array_size<InputDimensions>::value;
538 static constexpr int NumReducedDims = internal::array_size<Dims>::value;
539 static constexpr int NumOutputDims = NumInputDims - NumReducedDims;
540 typedef std::conditional_t<NumOutputDims == 0, Sizes<>, DSizes<Index, NumOutputDims> > Dimensions;
541 typedef typename XprType::Scalar Scalar;
542 typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
543 static constexpr bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
544 typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
545 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
546 static constexpr Index PacketSize = PacketType<CoeffReturnType, Device>::size;
547
548 typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
549 typedef StorageMemory<CoeffReturnType, Device> Storage;
550 typedef typename Storage::Type EvaluatorPointerType;
551
552 // Subset of strides of the input tensor for the non-reduced dimensions.
553 // Indexed by output dimensions.
554 static constexpr int NumPreservedStrides = max_n_1<NumOutputDims>::size;
555
556 // For full reductions
557#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
558 static constexpr bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
559 static constexpr bool RunningOnSycl = false;
560#elif defined(EIGEN_USE_SYCL)
561 static constexpr bool RunningOnSycl = internal::is_same<internal::remove_all_t<Device>, Eigen::SyclDevice>::value;
562 static constexpr bool RunningOnGPU = false;
563#else
564 static constexpr bool RunningOnGPU = false;
565 static constexpr bool RunningOnSycl = false;
566#endif
567
568 static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
569 enum {
570 IsAligned = false,
571 PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
572 BlockAccess = false,
573 PreferBlockAccess = true,
574 CoordAccess = false, // to be implemented
575 RawAccess = false
576 };
577
578 typedef std::remove_const_t<Scalar> ScalarNoConst;
579
580 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
581 typedef internal::TensorBlockNotImplemented TensorBlock;
582 //===--------------------------------------------------------------------===//
583
584 static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
585 static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
586 static constexpr bool RunningFullReduction = (NumOutputDims == 0);
587
588 EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
589 : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) {
590 EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
591 EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
592 YOU_MADE_A_PROGRAMMING_MISTAKE);
593
594 // Build the bitmap indicating if an input dimension is reduced or not.
595 for (int i = 0; i < NumInputDims; ++i) {
596 m_reduced[i] = false;
597 }
598 for (int i = 0; i < NumReducedDims; ++i) {
599 eigen_assert(op.dims()[i] >= 0);
600 eigen_assert(op.dims()[i] < NumInputDims);
601 m_reduced[op.dims()[i]] = true;
602 }
603
604 const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
605 internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
606
607 // Precompute output strides.
608 if (NumOutputDims > 0) {
609 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
610 m_outputStrides[0] = 1;
611 for (int i = 1; i < NumOutputDims; ++i) {
612 m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
613 m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
614 }
615 } else {
616 m_outputStrides[static_cast<size_t>(NumOutputDims - 1)] = 1;
617 for (int i = NumOutputDims - 2; i >= 0; --i) {
618 m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
619 m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
620 }
621 }
622 }
623
624 // Precompute input strides.
625 if (NumInputDims > 0) {
626 array<Index, NumInputDims> input_strides;
627 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
628 input_strides[0] = 1;
629 for (int i = 1; i < NumInputDims; ++i) {
630 input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
631 }
632 } else {
633 input_strides.back() = 1;
634 for (int i = NumInputDims - 2; i >= 0; --i) {
635 input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
636 }
637 }
638
639 int outputIndex = 0;
640 int reduceIndex = 0;
641 for (int i = 0; i < NumInputDims; ++i) {
642 if (m_reduced[i]) {
643 m_reducedStrides[reduceIndex] = input_strides[i];
644 ++reduceIndex;
645 } else {
646 m_preservedStrides[outputIndex] = input_strides[i];
647 m_output_to_input_dim_map[outputIndex] = i;
648 ++outputIndex;
649 }
650 }
651 }
652
653 // Special case for full reductions
654 if (NumOutputDims == 0) {
655 m_preservedStrides[0] = internal::array_prod(input_dims);
656 }
657
658 m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims)
659 : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
660 ? m_preservedStrides[0]
661 : m_preservedStrides[static_cast<size_t>(NumOutputDims - 1)];
662 }
663
664 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
665
666 EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
667 // Use the FullReducer if possible.
668 if ((RunningFullReduction && RunningOnSycl) ||
669 (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
670 ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) {
671 bool need_assign = false;
672 if (!data) {
673 m_result = static_cast<EvaluatorPointerType>(
674 m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
675 data = m_result;
676 need_assign = true;
677 }
678 Op reducer(m_reducer);
679 internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
680 return need_assign;
681 }
682
683 // Attempt to use an optimized reduction.
684 else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
685 bool reducing_inner_dims = true;
686 for (int i = 0; i < NumReducedDims; ++i) {
687 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
688 reducing_inner_dims &= m_reduced[i];
689 } else {
690 reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
691 }
692 }
693 if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
694 (reducing_inner_dims || ReducingInnerMostDims)) {
695 const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
696 const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
697 if (!data) {
698 if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
699 num_values_to_reduce > 128) ||
700 (RunningOnSycl)) {
701 data = static_cast<EvaluatorPointerType>(m_device.get(
702 (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
703 m_result = data;
704 } else {
705 return true;
706 }
707 }
708 Op reducer(m_reducer);
709 // For SYCL this call always returns false
710 if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
711 num_coeffs_to_preserve)) {
712 if (m_result) {
713 m_device.deallocate_temp(m_result);
714 m_result = NULL;
715 }
716 return true;
717 } else {
718 return (m_result != NULL);
719 }
720 }
721
722 bool preserving_inner_dims = true;
723 for (int i = 0; i < NumReducedDims; ++i) {
724 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
725 preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
726 } else {
727 preserving_inner_dims &= m_reduced[i];
728 }
729 }
730 if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation && preserving_inner_dims) {
731 const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
732 const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
733 if (!data) {
734 if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
735 num_values_to_reduce > 32) ||
736 (RunningOnSycl)) {
737 data = static_cast<EvaluatorPointerType>(m_device.get(
738 (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
739 m_result = data;
740 } else {
741 return true;
742 }
743 }
744 Op reducer(m_reducer);
745 // For SYCL this call always returns false
746 if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
747 num_coeffs_to_preserve)) {
748 if (m_result) {
749 m_device.deallocate_temp(m_result);
750 m_result = NULL;
751 }
752 return true;
753 } else {
754 return (m_result != NULL);
755 }
756 }
757#if defined(EIGEN_USE_SYCL)
758 // If there is no optimised version for SYCL, the reduction expression
759 // must be broken into two subexpressions and use the generic SYCL reducer on the device.
760 if (RunningOnSycl) {
761 const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
762 const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
763 if (!data) {
764 data = static_cast<EvaluatorPointerType>(
765 m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
766 m_result = data;
767 }
768 Op reducer(m_reducer);
769 internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
770 num_coeffs_to_preserve);
771 return (m_result != NULL);
772 }
773#endif
774 }
775 return true;
776 }
777
778#ifdef EIGEN_USE_THREADS
779 template <typename EvalSubExprsCallback>
780 EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
781 m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); });
782 }
783#endif
784
785 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
786 m_impl.evalSubExprsIfNeeded(NULL);
787 return evalSubExprsIfNeededCommon(data);
788 }
789
790 EIGEN_STRONG_INLINE void cleanup() {
791 m_impl.cleanup();
792 if (m_result) {
793 m_device.deallocate_temp(m_result);
794 m_result = NULL;
795 }
796 }
797
798 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
799 if ((RunningFullReduction || RunningOnGPU) && m_result) {
800 return *(m_result + index);
801 }
802 Op reducer(m_reducer);
803 if (ReducingInnerMostDims || RunningFullReduction) {
804 const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
805 ? m_preservedStrides[0]
806 : m_preservedStrides[NumPreservedStrides - 1];
807 return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), num_values_to_reduce, reducer);
808 } else {
809 typename Self::CoeffReturnType accum = reducer.initialize();
810 internal::GenericDimReducer<NumReducedDims - 1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
811 return reducer.finalize(accum);
812 }
813 }
814
815 // TODO(bsteiner): provide a more efficient implementation.
816 template <int LoadMode>
817 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
818 eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
819
820 if (RunningOnGPU && m_result) {
821 return internal::pload<PacketReturnType>(m_result + index);
822 }
823
824 EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
825 if (ReducingInnerMostDims) {
826 const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
827 ? m_preservedStrides[0]
828 : m_preservedStrides[NumPreservedStrides - 1];
829 const Index firstIndex = firstInput(index);
830 for (Index i = 0; i < PacketSize; ++i) {
831 Op reducer(m_reducer);
832 values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
833 num_values_to_reduce, reducer);
834 }
835 } else if (PreservingInnerMostDims) {
836 const Index firstIndex = firstInput(index);
837 const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
838 // TBD: extend this to the n innermost dimensions that we preserve.
839 if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
840 Op reducer(m_reducer);
841 typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
842 internal::InnerMostDimPreserver<NumReducedDims - 1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
843 return reducer.finalizePacket(accum);
844 } else {
845 for (int i = 0; i < PacketSize; ++i) {
846 values[i] = coeff(index + i);
847 }
848 }
849 } else {
850 for (int i = 0; i < PacketSize; ++i) {
851 values[i] = coeff(index + i);
852 }
853 }
854 PacketReturnType rslt = internal::pload<PacketReturnType>(values);
855 return rslt;
856 }
857
858 // Must be called after evalSubExprsIfNeeded().
859 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
860 if (RunningFullReduction && m_result) {
861 return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
862 } else {
863 const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
864 const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
865 return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
866 TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
867 }
868 }
869
870 EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
871 EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
872 EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
873
874 private:
875 template <int, typename, typename>
876 friend struct internal::GenericDimReducer;
877 template <typename, typename, bool, bool>
878 friend struct internal::InnerMostDimReducer;
879 template <int, typename, typename, bool>
880 friend struct internal::InnerMostDimPreserver;
881 template <typename S, typename O, typename D, bool V>
882 friend struct internal::FullReducer;
883#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
884 template <int B, int N, typename S, typename R, typename I_>
885 KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
886#if defined(EIGEN_HAS_GPU_FP16)
887 template <typename S, typename R, typename I_>
888 KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
889 internal::packet_traits<Eigen::half>::type*);
890 template <int B, int N, typename S, typename R, typename I_>
891 KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*,
892 internal::packet_traits<Eigen::half>::type*);
893 template <int NPT, typename S, typename R, typename I_>
894 KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
895#endif
896 template <int NPT, typename S, typename R, typename I_>
897 KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
898
899 template <int NPT, typename S, typename R, typename I_>
900 KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
901#endif
902
903#if defined(EIGEN_USE_SYCL)
904 template <typename Evaluator_, typename Op__>
905 friend class TensorSycl::internal::GenericNondeterministicReducer;
906 // SYCL needs the generic reducer when the reduction algorithm is neither an inner, outer, nor full reduction
907 template <typename, typename, typename>
908 friend struct internal::GenericReducer;
909#endif
910
911 template <typename S, typename O, typename D>
912 friend struct internal::InnerReducer;
913
914 struct BlockIteratorState {
915 Index input_dim;
916 Index output_size;
917 Index output_count;
918 };
919
920 // Returns the Index in the input tensor of the first value that needs to be
921 // used to compute the reduction at output index "index".
922 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
923 if (ReducingInnerMostDims) {
924 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
925 return index * m_preservedStrides[0];
926 } else {
927 return index * m_preservedStrides[NumPreservedStrides - 1];
928 }
929 }
930 // TBD: optimize the case where we preserve the innermost dimensions.
931 Index startInput = 0;
932 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
933 for (int i = NumOutputDims - 1; i > 0; --i) {
934 // This is index_i in the output tensor.
935 const Index idx = index / m_outputStrides[i];
936 startInput += idx * m_preservedStrides[i];
937 index -= idx * m_outputStrides[i];
938 }
939 if (PreservingInnerMostDims) {
940 eigen_assert(m_preservedStrides[0] == 1);
941 startInput += index;
942 } else {
943 startInput += index * m_preservedStrides[0];
944 }
945 } else {
946 for (int i = 0; i < NumOutputDims - 1; ++i) {
947 // This is index_i in the output tensor.
948 const Index idx = index / m_outputStrides[i];
949 startInput += idx * m_preservedStrides[i];
950 index -= idx * m_outputStrides[i];
951 }
952 if (PreservingInnerMostDims) {
953 eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
954 startInput += index;
955 } else {
956 startInput += index * m_preservedStrides[NumPreservedStrides - 1];
957 }
958 }
959 return startInput;
960 }
961
962 // Bitmap indicating if an input dimension is reduced or not.
963 array<bool, NumInputDims> m_reduced;
964 // Dimensions of the output of the operation.
965 Dimensions m_dimensions;
966 // Precomputed strides for the output tensor.
967 // Avoid zero-sized arrays, since element access fails to compile on GPU.
968 array<Index, (std::max)(NumOutputDims, 1)> m_outputStrides;
969 array<internal::TensorIntDivisor<Index>, (std::max)(NumOutputDims, 1)> m_fastOutputStrides;
970 array<Index, (std::max)(NumPreservedStrides, 1)> m_preservedStrides;
971 // Map from output to input dimension index.
972 array<Index, (std::max)(NumOutputDims, 1)> m_output_to_input_dim_map;
973 // How many values go into each reduction
974 Index m_numValuesToReduce;
975
976 // Subset of strides of the input tensor for the reduced dimensions.
977 // Indexed by reduced dimensions.
978 array<Index, NumReducedDims> m_reducedStrides;
979 // Size of the input dimensions that are reduced.
980 // Indexed by reduced dimensions.
981 array<Index, NumReducedDims> m_reducedDims;
982
983 // Evaluator for the input expression.
984 TensorEvaluator<ArgType, Device> m_impl;
985
986 // Operation to apply for computing the reduction.
987 Op m_reducer;
988
989 EvaluatorPointerType m_result;
990
991 const Device EIGEN_DEVICE_REF m_device;
992};
993
994template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
995struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
996 : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
997 typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
998 EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {}
999};
1000
1001template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
1002struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
1003 : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
1004 typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
1005 Base;
1006 EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device)
1007 : Base(op, device) {}
1008 // The coeff function in the base class is a recursive method that cannot be used inside a SYCL
1009 // kernel.
1010 // Therefore the coeff function is overridden for the SYCL kernel.
1011 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
1012 return *(this->data() + index);
1013 }
1014 // The packet function in the base class is a recursive method that cannot be used inside a SYCL
1015 // kernel.
1016 // Therefore the packet function is overridden for the SYCL kernel.
1017 template <int LoadMode>
1018 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
1019 return internal::pload<typename Base::PacketReturnType>(this->data() + index);
1020 }
1021};
1022
1023} // end namespace Eigen
1024
1025#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
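
A minimal usage sketch, assuming the standard unsupported Tensor module API (the tensor sizes and the reduced dimension below are arbitrary example values): partial reductions such as sum(dims) and full reductions such as sum() are what build a TensorReductionOp and drive the evaluators defined above.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  // Rank-3 tensor with arbitrary example dimensions.
  Eigen::Tensor<float, 3> t(2, 3, 4);
  t.setRandom();

  // Partial reduction: sum over dimension 1, yielding a rank-2 tensor of shape (2, 4).
  Eigen::array<Eigen::Index, 1> dims{{1}};
  Eigen::Tensor<float, 2> partial = t.sum(dims);

  // Full reduction: sum over all dimensions, yielding a rank-0 (scalar) tensor.
  Eigen::Tensor<float, 0> total = t.sum();

  std::cout << partial(0, 0) << " " << total() << std::endl;
  return 0;
}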