Eigen-unsupported 5.0.1-dev+284dcc12
TensorConvolution.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12
13// IWYU pragma: private
14#include "./InternalHeaderCheck.h"
15
16namespace Eigen {
17
18namespace internal {
19
20template <typename Index, typename InputDims, int NumKernelDims, int Layout>
21class IndexMapper {
22 public:
23 IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
24 const array<Index, NumKernelDims>& indices) {
25 array<Index, NumDims> dimensions = input_dims;
26 for (int i = 0; i < NumKernelDims; ++i) {
27 const Index index = indices[i];
28 const Index input_dim = input_dims[index];
29 const Index kernel_dim = kernel_dims[i];
30 const Index result_dim = input_dim - kernel_dim + 1;
31 dimensions[index] = result_dim;
32 }
33
34 array<Index, NumDims> inputStrides;
35 array<Index, NumDims> outputStrides;
36 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
37 inputStrides[0] = 1;
38 outputStrides[0] = 1;
39 for (int i = 1; i < NumDims; ++i) {
40 inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
41 outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
42 }
43 } else {
44 inputStrides[NumDims - 1] = 1;
45 outputStrides[NumDims - 1] = 1;
46 for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
47 inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
48 outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
49 }
50 }
51
52 array<Index, NumDims> gpuInputDimensions;
53 array<Index, NumDims> gpuOutputDimensions;
54 array<Index, NumDims> tmp = dimensions;
55 array<Index, NumDims> ordering;
56 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
57 for (int i = 0; i < NumKernelDims; ++i) {
58 const Index index = i + offset;
59 ordering[index] = indices[i];
60 tmp[indices[i]] = -1;
61 gpuInputDimensions[index] = input_dims[indices[i]];
62 gpuOutputDimensions[index] = dimensions[indices[i]];
63 }
64
65 int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0;
66 for (int i = 0; i < NumDims; ++i) {
67 if (tmp[i] >= 0) {
68 ordering[written] = i;
69 gpuInputDimensions[written] = input_dims[i];
70 gpuOutputDimensions[written] = dimensions[i];
71 ++written;
72 }
73 }
74
75 for (int i = 0; i < NumDims; ++i) {
76 m_inputStrides[i] = inputStrides[ordering[i]];
77 m_outputStrides[i] = outputStrides[ordering[i]];
78 }
79
80 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
81 for (int i = 0; i < NumDims; ++i) {
82 if (i > NumKernelDims) {
83 m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
84 m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
85 } else {
86 m_gpuInputStrides[i] = 1;
87 m_gpuOutputStrides[i] = 1;
88 }
89 }
90 } else {
91 for (int i = NumDims - 1; i >= 0; --i) {
92 if (static_cast<size_t>(i + 1) < offset) {
93 m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
94 m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
95 } else {
96 m_gpuInputStrides[i] = 1;
97 m_gpuOutputStrides[i] = 1;
98 }
99 }
100 }
101 }
102
103 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
104 Index inputIndex = 0;
105 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
106 for (int d = NumDims - 1; d > NumKernelDims; --d) {
107 const Index idx = p / m_gpuInputStrides[d];
108 inputIndex += idx * m_inputStrides[d];
109 p -= idx * m_gpuInputStrides[d];
110 }
111 if (NumKernelDims < NumDims) {
112 inputIndex += p * m_inputStrides[NumKernelDims];
113 }
114 } else {
115 std::ptrdiff_t limit = 0;
116 if (NumKernelDims < NumDims) {
117 limit = NumDims - NumKernelDims - 1;
118 }
119 for (int d = 0; d < limit; ++d) {
120 const Index idx = p / m_gpuInputStrides[d];
121 inputIndex += idx * m_inputStrides[d];
122 p -= idx * m_gpuInputStrides[d];
123 }
124 inputIndex += p * m_inputStrides[limit];
125 }
126 return inputIndex;
127 }
128
129 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
130 Index outputIndex = 0;
131 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
132 for (int d = NumDims - 1; d > NumKernelDims; --d) {
133 const Index idx = p / m_gpuOutputStrides[d];
134 outputIndex += idx * m_outputStrides[d];
135 p -= idx * m_gpuOutputStrides[d];
136 }
137 if (NumKernelDims < NumDims) {
138 outputIndex += p * m_outputStrides[NumKernelDims];
139 }
140 } else {
141 std::ptrdiff_t limit = 0;
142 if (NumKernelDims < NumDims) {
143 limit = NumDims - NumKernelDims - 1;
144 }
145 for (int d = 0; d < limit; ++d) {
146 const Index idx = p / m_gpuOutputStrides[d];
147 outputIndex += idx * m_outputStrides[d];
148 p -= idx * m_gpuOutputStrides[d];
149 }
150 outputIndex += p * m_outputStrides[limit];
151 }
152 return outputIndex;
153 }
154
155 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
156 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
157 return i * m_inputStrides[offset];
158 }
159
160 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
161 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
162 return i * m_outputStrides[offset];
163 }
164
165 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
166 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
167 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
168 }
169
170 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
171 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
172 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
173 }
174
175 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
176 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
177 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2];
178 }
179
180 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
181 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
182 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2];
183 }
184
185 private:
186 static constexpr int NumDims = internal::array_size<InputDims>::value;
187 array<Index, NumDims> m_inputStrides;
188 array<Index, NumDims> m_outputStrides;
189 array<Index, NumDims> m_gpuInputStrides;
190 array<Index, NumDims> m_gpuOutputStrides;
191};
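// Illustrative sketch (not part of the original header): how the IndexMapper
// above is meant to be exercised by the GPU path further below. The convolved
// dimensions are gathered at the front (ColMajor) or back (RowMajor) of an
// internal ordering, and the remaining dimensions are collapsed into "plane"
// indices. Assumes a separate translation unit that includes
// <unsupported/Eigen/CXX11/Tensor>; the function and variable names are hypothetical.
inline void index_mapper_sketch() {
  typedef Eigen::DenseIndex Index;
  // 4 x 10 x 5 input convolved with a length-3 kernel along dimension 1.
  Eigen::DSizes<Index, 3> input_dims(4, 10, 5);
  Eigen::array<Index, 1> kernel_dims = {{3}};
  Eigen::array<Index, 1> conv_dims = {{1}};

  Eigen::internal::IndexMapper<Index, Eigen::DSizes<Index, 3>, 1, Eigen::ColMajor> mapper(
      input_dims, kernel_dims, conv_dims);

  // The non-convolved dimensions (4 and 5) yield 4 * 5 = 20 planes. For each
  // plane p the mapper returns the linear offset of that plane's first element
  // in the input (resp. output) tensor; kernel offsets are added on top of it.
  const Index in_plane = mapper.mapGpuInputPlaneToTensorInputOffset(7);
  const Index out_plane = mapper.mapGpuOutputPlaneToTensorOutputOffset(7);
  const Index first_tap = in_plane + mapper.mapGpuInputKernelToTensorInputOffset(0);
  (void)out_plane;
  (void)first_tap;
}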
192
193template <typename Dimensions, typename InputXprType, typename KernelXprType>
194struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > {
195 // Type promotion to handle the case where the types of the lhs and the rhs are different.
196 typedef typename promote_storage_type<typename InputXprType::Scalar, typename KernelXprType::Scalar>::ret Scalar;
197 typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
198 typename traits<KernelXprType>::StorageKind>::ret StorageKind;
199 typedef typename promote_index_type<typename traits<InputXprType>::Index, typename traits<KernelXprType>::Index>::type
200 Index;
201 typedef typename InputXprType::Nested LhsNested;
202 typedef typename KernelXprType::Nested RhsNested;
203 typedef std::remove_reference_t<LhsNested> LhsNested_;
204 typedef std::remove_reference_t<RhsNested> RhsNested_;
205 static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;
206 static constexpr int Layout = traits<InputXprType>::Layout;
207 typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
208 typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>
209 PointerType;
210
211 enum { Flags = 0 };
212};
213
214template <typename Dimensions, typename InputXprType, typename KernelXprType>
215struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> {
216 typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
217};
218
219template <typename Dimensions, typename InputXprType, typename KernelXprType>
220struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
221 typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> {
222 typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
223};
224
225} // end namespace internal
226
230template <typename Indices, typename InputXprType, typename KernelXprType>
231class TensorConvolutionOp
232 : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {
233 public:
234 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
235 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
236 typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
237 typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
238 typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
239 typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
240 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
241
242 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel,
243 const Indices& dims)
244 : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
245
246 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }
247
249 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename InputXprType::Nested>& inputExpression()
250 const {
251 return m_input_xpr;
252 }
253
254 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename KernelXprType::Nested>& kernelExpression()
255 const {
256 return m_kernel_xpr;
257 }
258
259 protected:
260 typename InputXprType::Nested m_input_xpr;
261 typename KernelXprType::Nested m_kernel_xpr;
262 const Indices m_indices;
263};
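// Illustrative usage sketch (not part of the original header): the expression
// above is normally obtained through TensorBase::convolve() rather than built
// directly. Each convolved dimension shrinks to input_dim - kernel_dim + 1, as
// computed by the evaluators below. Assumes a separate translation unit that
// includes <unsupported/Eigen/CXX11/Tensor>.
inline void convolve_usage_sketch() {
  Eigen::Tensor<float, 3> input(4, 10, 5);
  Eigen::Tensor<float, 1> kernel(3);
  input.setRandom();
  kernel.setRandom();

  // Convolve along dimension 1: the result is 4 x (10 - 3 + 1) x 5 = 4 x 8 x 5.
  Eigen::array<Eigen::DenseIndex, 1> dims = {{1}};
  Eigen::Tensor<float, 3> result = input.convolve(kernel, dims);
  (void)result;
}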
264
265template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
266struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> {
267 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
268
269 static constexpr int NumDims =
270 internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
271 static constexpr int NumKernelDims = internal::array_size<Indices>::value;
272 typedef typename XprType::Index Index;
273 typedef DSizes<Index, NumDims> Dimensions;
274
275 typedef typename XprType::Scalar Scalar;
276 typedef typename XprType::CoeffReturnType CoeffReturnType;
277 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
278 static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
279 typedef StorageMemory<Scalar, Device> Storage;
280 typedef typename Storage::Type EvaluatorPointerType;
281
282 static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;
283 enum {
284 IsAligned =
285 int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
286 PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) &
287 int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
288 BlockAccess = false,
289 PreferBlockAccess = false,
290 CoordAccess = false, // to be implemented
291 RawAccess = false
292 };
293
294 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
295 typedef internal::TensorBlockNotImplemented TensorBlock;
296 //===--------------------------------------------------------------------===//
297
298 EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
299 : m_inputImpl(op.inputExpression(), device),
300 m_kernelImpl(op.kernelExpression(), device),
301 m_kernelArg(op.kernelExpression()),
302 m_kernel(NULL),
303 m_local_kernel(false),
304 m_device(device) {
305 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) ==
306 static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)),
307 YOU_MADE_A_PROGRAMMING_MISTAKE);
308
309 const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
310 const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
311
312 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
313 m_inputStride[0] = 1;
314 for (int i = 1; i < NumDims; ++i) {
315 m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
316 }
317 } else {
318 m_inputStride[NumDims - 1] = 1;
319 for (int i = NumDims - 2; i >= 0; --i) {
320 m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
321 }
322 }
323
324 m_dimensions = m_inputImpl.dimensions();
325 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
326 for (int i = 0; i < NumKernelDims; ++i) {
327 const Index index = op.indices()[i];
328 const Index input_dim = input_dims[index];
329 const Index kernel_dim = kernel_dims[i];
330 const Index result_dim = input_dim - kernel_dim + 1;
331 m_dimensions[index] = result_dim;
332 if (i > 0) {
333 m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
334 } else {
335 m_kernelStride[0] = 1;
336 }
337 m_indexStride[i] = m_inputStride[index];
338 }
339
340 m_outputStride[0] = 1;
341 for (int i = 1; i < NumDims; ++i) {
342 m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
343 }
344 } else {
345 for (int i = NumKernelDims - 1; i >= 0; --i) {
346 const Index index = op.indices()[i];
347 const Index input_dim = input_dims[index];
348 const Index kernel_dim = kernel_dims[i];
349 const Index result_dim = input_dim - kernel_dim + 1;
350 m_dimensions[index] = result_dim;
351 if (i < NumKernelDims - 1) {
352 m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
353 } else {
354 m_kernelStride[NumKernelDims - 1] = 1;
355 }
356 m_indexStride[i] = m_inputStride[index];
357 }
358
359 m_outputStride[NumDims - 1] = 1;
360 for (int i = NumDims - 2; i >= 0; --i) {
361 m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
362 }
363 }
364 }
365
366 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
367
368 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
369 m_inputImpl.evalSubExprsIfNeeded(NULL);
370 preloadKernel();
371 return true;
372 }
373 EIGEN_STRONG_INLINE void cleanup() {
374 m_inputImpl.cleanup();
375 if (m_local_kernel) {
376 m_device.deallocate((void*)m_kernel);
377 m_local_kernel = false;
378 }
379 m_kernel = NULL;
380 }
381
382 void evalTo(typename XprType::Scalar* buffer) {
383 evalSubExprsIfNeeded(NULL);
384 for (int i = 0; i < dimensions().TotalSize(); ++i) {
385 buffer[i] += coeff(i);
386 }
387 cleanup();
388 }
389
390 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
391 CoeffReturnType result = CoeffReturnType(0);
392 convolve(firstInput(index), 0, NumKernelDims - 1, result);
393 return result;
394 }
395
396 template <int LoadMode>
397 EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
398 Index indices[2] = {index, index + PacketSize - 1};
399 Index startInputs[2] = {0, 0};
400 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
401 for (int i = NumDims - 1; i > 0; --i) {
402 const Index idx0 = indices[0] / m_outputStride[i];
403 const Index idx1 = indices[1] / m_outputStride[i];
404 startInputs[0] += idx0 * m_inputStride[i];
405 startInputs[1] += idx1 * m_inputStride[i];
406 indices[0] -= idx0 * m_outputStride[i];
407 indices[1] -= idx1 * m_outputStride[i];
408 }
409 } else {
410 for (int i = 0; i < NumDims - 1; ++i) {
411 const Index idx0 = indices[0] / m_outputStride[i];
412 const Index idx1 = indices[1] / m_outputStride[i];
413 startInputs[0] += idx0 * m_inputStride[i];
414 startInputs[1] += idx1 * m_inputStride[i];
415 indices[0] -= idx0 * m_outputStride[i];
416 indices[1] -= idx1 * m_outputStride[i];
417 }
418 }
419 startInputs[0] += indices[0];
420 startInputs[1] += indices[1];
421
422 if (startInputs[1] - startInputs[0] == PacketSize - 1) {
423 PacketReturnType result = internal::pset1<PacketReturnType>(0);
424 convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
425 return result;
426 } else {
427 EIGEN_ALIGN_MAX Scalar data[PacketSize];
428 data[0] = Scalar(0);
429 convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
430 for (int i = 1; i < PacketSize - 1; ++i) {
431 data[i] = Scalar(0);
432 convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
433 }
434 data[PacketSize - 1] = Scalar(0);
435 convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
436 return internal::pload<PacketReturnType>(data);
437 }
438 }
439
440 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
441 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
442 // We ignore the use of fused multiply-add.
443 const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
444 const double firstIndex_compute_cost =
445 NumDims *
446 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
447 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
448 kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
449 TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
450 }
451
452 EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
453
454 private:
455 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
456 Index startInput = 0;
457 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
458 for (int i = NumDims - 1; i > 0; --i) {
459 const Index idx = index / m_outputStride[i];
460 startInput += idx * m_inputStride[i];
461 index -= idx * m_outputStride[i];
462 }
463 } else {
464 for (int i = 0; i < NumDims - 1; ++i) {
465 const Index idx = index / m_outputStride[i];
466 startInput += idx * m_inputStride[i];
467 index -= idx * m_outputStride[i];
468 }
469 }
470 startInput += index;
471 return startInput;
472 }
473
474 EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
475 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
476 const Index input = firstIndex + j * m_indexStride[DimIndex];
477 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
478 if (DimIndex > 0) {
479 convolve(input, kernel, DimIndex - 1, accum);
480 } else {
481 accum += m_inputImpl.coeff(input) * m_kernel[kernel];
482 }
483 }
484 }
485
486 template <typename Packet>
487 EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
488 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
489 const Index input = firstIndex + j * m_indexStride[DimIndex];
490 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
491 if (DimIndex > 0) {
492 convolvePacket(input, kernel, DimIndex - 1, accum);
493 } else {
494 accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
495 internal::pset1<Packet>(m_kernel[kernel]), accum);
496 }
497 }
498 }
499
500 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
501 // Don't make a local copy of the kernel unless we have to (i.e. it's an
502 // expression that needs to be evaluated)
503 const Scalar* in_place = m_kernelImpl.data();
504 if (in_place) {
505 m_kernel = in_place;
506 m_local_kernel = false;
507 } else {
508 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
509 Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
510 typedef TensorEvalToOp<const KernelArgType> EvalTo;
511 EvalTo evalToTmp(local, m_kernelArg);
512 const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
513 internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
514
515 m_kernel = local;
516 m_local_kernel = true;
517 }
518 }
519
520 array<Index, NumDims> m_inputStride;
521 array<Index, NumDims> m_outputStride;
522
523 array<Index, NumKernelDims> m_indexStride;
524 array<Index, NumKernelDims> m_kernelStride;
525 TensorEvaluator<InputArgType, Device> m_inputImpl;
526 TensorEvaluator<KernelArgType, Device> m_kernelImpl;
527 Dimensions m_dimensions;
528
529 KernelArgType m_kernelArg;
530 const Scalar* m_kernel;
531 bool m_local_kernel;
532 const Device EIGEN_DEVICE_REF m_device;
533};
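// Reference semantics of the evaluator above (a sketch, not part of the
// original header): for a single kernel dimension the recursive convolve()
// helper reduces to a plain correlation -- the kernel is not flipped. A
// hypothetical scalar reference for a 3-D input convolved along dimension 1:
//
//   result(i, j, k) = sum_l input(i, j + l, k) * kernel(l)
inline float reference_convolution_coeff(const Eigen::Tensor<float, 3>& input,
                                         const Eigen::Tensor<float, 1>& kernel,
                                         Eigen::DenseIndex i, Eigen::DenseIndex j,
                                         Eigen::DenseIndex k) {
  float accum = 0.0f;
  for (Eigen::DenseIndex l = 0; l < kernel.dimension(0); ++l) {
    accum += input(i, j + l, k) * kernel(l);
  }
  return accum;
}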
534
535// Use an optimized implementation of the evaluation code for GPUs whenever possible.
536#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
537
538template <int StaticKernelSize>
539struct GetKernelSize {
540 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }
541};
542template <>
543struct GetKernelSize<Dynamic> {
544 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; }
545};
546
547template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
548__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
549 InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
550 const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,
551 float* buffer) {
552#if defined(EIGEN_HIPCC)
553 HIP_DYNAMIC_SHARED(float, s)
554#else
555 extern __shared__ float s[];
556#endif
557
558 const int first_x = blockIdx.x * maxX;
559 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
560 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
561 const int num_x_output = last_x - first_x + 1;
562
563 const int first_plane = blockIdx.y * blockDim.y;
564 const int plane_stride = blockDim.y * gridDim.y;
565
566 for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
567 // Load inputs to shared memory
568 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
569 const int plane_kernel_offset = threadIdx.y * num_x_input;
570#pragma unroll
571 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
572 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
573 s[i + plane_kernel_offset] = eval.coeff(tensor_index);
574 }
575
576 __syncthreads();
577
578 // Compute the convolution
579 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
580
581#pragma unroll
582 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
583 const int kernel_offset = plane_kernel_offset + i;
584 float result = 0.0f;
585#pragma unroll
586 for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
587 result += s[k + kernel_offset] * kernel[k];
588 }
589 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
590 buffer[tensor_index] = result;
591 }
592 __syncthreads();
593 }
594};
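// Tile-size arithmetic for the 1-D kernel above, worked through for one
// hypothetical configuration: with maxX = 128 outputs per block and
// kernelSize = 4, a full block loads num_x_input = (128 - 1) + 4 = 131 input
// values per plane row into shared memory and writes num_x_output = 128
// results, so each loaded value is reused for up to kernelSize = 4 outputs.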
595
596template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
597__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
598 InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
599 const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY,
600 const int kernelSizeX, const int kernelSizeY, float* buffer) {
601#if defined(EIGEN_HIPCC)
602 HIP_DYNAMIC_SHARED(float, s)
603#else
604 extern __shared__ float s[];
605#endif
606
607 const int first_x = blockIdx.x * maxX;
608 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
609 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
610 const int num_x_output = last_x - first_x + 1;
611
612 const int first_y = blockIdx.y * maxY;
613 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
614 const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
615 const int num_y_output = last_y - first_y + 1;
616
617 const int first_plane = blockIdx.z * blockDim.z;
618 const int plane_stride = blockDim.z * gridDim.z;
619
620 for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
621 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
622 const int plane_kernel_offset = threadIdx.z * num_y_input;
623
624// Load inputs to shared memory
625#pragma unroll
626 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
627 const int input_offset = num_x_input * (j + plane_kernel_offset);
628#pragma unroll
629 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
630 const int tensor_index =
631 plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
632 s[i + input_offset] = eval.coeff(tensor_index);
633 }
634 }
635
636 __syncthreads();
637
638 // Convolution
639 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
640
641#pragma unroll
642 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
643#pragma unroll
644 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
645 float result = 0.0f;
646#pragma unroll
647 for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
648 const int kernel_offset = kernelSizeX * l;
649 const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
650#pragma unroll
651 for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
652 result += s[k + input_offset] * kernel[k + kernel_offset];
653 }
654 }
655 const int tensor_index =
656 plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
657 buffer[tensor_index] = result;
658 }
659 }
660
661 __syncthreads();
662 }
663};
664
665template <typename InputEvaluator, typename Index, typename InputDims>
666__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
667 InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
668 const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,
669 const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
670 const size_t kernelSizeZ, float* buffer) {
671#if defined(EIGEN_HIPCC)
672 HIP_DYNAMIC_SHARED(float, s)
673#else
674 extern __shared__ float s[];
675#endif
676
677 // Load inputs to shared memory
678 const int first_x = blockIdx.x * maxX;
679 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
680 const int num_x_input = last_x - first_x + kernelSizeX;
681
682 const int first_y = blockIdx.y * maxY;
683 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
684 const int num_y_input = last_y - first_y + kernelSizeY;
685
686 const int first_z = blockIdx.z * maxZ;
687 const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
688 const int num_z_input = last_z - first_z + kernelSizeZ;
689
690 for (int p = 0; p < numPlanes; ++p) {
691 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
692 const int plane_kernel_offset = 0;
693
694 for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
695 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
696 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
697 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
698 i + first_x, j + first_y, k + first_z);
699 s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
700 }
701 }
702 }
703
704 __syncthreads();
705
706 // Convolution
707 const int num_z_output = last_z - first_z + 1;
708 const int num_y_output = last_y - first_y + 1;
709 const int num_x_output = last_x - first_x + 1;
710 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
711
712 for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
713 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
714 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
715 float result = 0.0f;
716 for (int n = 0; n < kernelSizeZ; ++n) {
717 for (int m = 0; m < kernelSizeY; ++m) {
718 for (int l = 0; l < kernelSizeX; ++l) {
719 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
720 kernel[l + kernelSizeX * (m + kernelSizeY * n)];
721 }
722 }
723 }
724 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(
725 i + first_x, j + first_y, k + first_z);
726 buffer[tensor_index] = result;
727 }
728 }
729 }
730 __syncthreads();
731 }
732};
733
734template <typename Indices, typename InputArgType, typename KernelArgType>
735struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {
736 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
737
738 static constexpr int NumDims =
739 internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
740 static constexpr int NumKernelDims = internal::array_size<Indices>::value;
741 typedef typename XprType::Index Index;
742 typedef DSizes<Index, NumDims> Dimensions;
743 typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
744
745 static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
746 enum {
747 IsAligned =
748 TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
749 PacketAccess = false,
750 BlockAccess = false,
751 PreferBlockAccess = false,
752 CoordAccess = false, // to be implemented
753 RawAccess = false
754 };
755
756 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
757 typedef internal::TensorBlockNotImplemented TensorBlock;
758 //===--------------------------------------------------------------------===//
759
760 TensorEvaluator(const XprType& op, const GpuDevice& device)
761 : m_inputImpl(op.inputExpression(), device),
762 m_kernelImpl(op.kernelExpression(), device),
763 m_kernelArg(op.kernelExpression()),
764 m_indices(op.indices()),
765 m_buf(NULL),
766 m_kernel(NULL),
767 m_local_kernel(false),
768 m_device(device) {
769 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) ==
770 static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)),
771 YOU_MADE_A_PROGRAMMING_MISTAKE);
772
773 const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
774 const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
775
776 m_dimensions = m_inputImpl.dimensions();
777 for (int i = 0; i < NumKernelDims; ++i) {
778 const Index index = op.indices()[i];
779 const Index input_dim = input_dims[index];
780 const Index kernel_dim = kernel_dims[i];
781 const Index result_dim = input_dim - kernel_dim + 1;
782 m_dimensions[index] = result_dim;
783 }
784 }
785
786 typedef typename XprType::CoeffReturnType CoeffReturnType;
787 typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
788 typedef typename InputArgType::Scalar Scalar;
789 static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
790
791 EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
792
793 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
794 preloadKernel();
795 m_inputImpl.evalSubExprsIfNeeded(NULL);
796 if (data) {
797 executeEval(data);
798 return false;
799 } else {
800 m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
801 executeEval(m_buf);
802 return true;
803 }
804 }
805
806 EIGEN_STRONG_INLINE void cleanup() {
807 m_inputImpl.cleanup();
808 if (m_buf) {
809 m_device.deallocate(m_buf);
810 m_buf = NULL;
811 }
812 if (m_local_kernel) {
813 m_device.deallocate((void*)m_kernel);
814 m_local_kernel = false;
815 }
816 m_kernel = NULL;
817 }
818
819 EIGEN_STRONG_INLINE void preloadKernel() {
820 // Don't make a local copy of the kernel unless we have to (i.e. it's an
821 // expression that needs to be evaluated)
822 const Scalar* in_place = m_kernelImpl.data();
823 if (in_place) {
824 m_kernel = in_place;
825 m_local_kernel = false;
826 } else {
827 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
828 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
829 typedef TensorEvalToOp<const KernelArgType> EvalTo;
830 EvalTo evalToTmp(local, m_kernelArg);
831 const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
832 internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
833
834 m_kernel = local;
835 m_local_kernel = true;
836 }
837 }
838
839 static unsigned int ceil(unsigned int num, unsigned int denom) {
840 const unsigned int rounded_toward_zero = num / denom;
841 if (num > rounded_toward_zero * denom) {
842 return rounded_toward_zero + 1;
843 }
844 return rounded_toward_zero;
845 }
846
847 void executeEval(Scalar* data) const {
848 typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
849
850 const int maxSharedMem = m_device.sharedMemPerBlock();
851 const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
852 const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
853 const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
854 const int warpSize = 32;
855
856 switch (NumKernelDims) {
857 case 1: {
858 const int kernel_size = m_kernelImpl.dimensions().TotalSize();
859
860 const int numX = dimensions()[m_indices[0]];
861 const int numP = dimensions().TotalSize() / numX;
862 int maxX;
863 dim3 block_size;
864
865 const int single_stride_dim =
866 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1;
867 if (m_indices[0] == single_stride_dim) {
868 // Maximize the reuse
869 const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
870 maxX = numext::mini<int>(inner_dim, numX);
871 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
872 block_size.x = numext::mini(maxThreadsPerBlock, maxX);
873 block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
874 } else {
875 // Read as much as possible along the innermost dimension, that is, the plane
876 const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
877 const int maxP = numext::mini<int>(inner_dim, numP);
878 maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
879
880 block_size.x = numext::mini(warpSize, maxX);
881 block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
882 }
883
884 const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
885 gpu_assert(shared_mem <= maxSharedMem);
886
887 const int num_x_blocks = ceil(numX, maxX);
888 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
889 const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
890
891 dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
892
893 // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
894 // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: "
895 // << shared_mem << " in stream " << m_device.stream() << endl;
896
897 const array<Index, 1> indices{m_indices[0]};
898 const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]};
899 internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
900 switch (kernel_size) {
901 case 4: {
902 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
903 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
904 numX, maxX, 4, data);
905 break;
906 }
907 case 7: {
908 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
909 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
910 numX, maxX, 7, data);
911 break;
912 }
913 default: {
914 LAUNCH_GPU_KERNEL(
915 (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
916 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
917 kernel_size, data);
918 }
919 }
920 break;
921 }
922
923 case 2: {
924 const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
925 const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
926 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
927 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
928
929 const int numX = dimensions()[m_indices[idxX]];
930 const int numY = dimensions()[m_indices[idxY]];
931 const int numP = dimensions().TotalSize() / (numX * numY);
932
933 const float scaling_factor =
934 sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
935
936 // Snap maxX to warp size
937 int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
938 const int maxX = numext::mini<int>(inner_dim, numX);
939 const int maxY =
940 numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
941 const int maxP = numext::mini<int>(
942 maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
943
944 dim3 block_size;
945 block_size.x = numext::mini(1024, maxX);
946 block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
947 block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);
948
949 const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
950 gpu_assert(shared_mem <= maxSharedMem);
951
952 const int num_x_blocks = ceil(numX, maxX);
953 const int num_y_blocks = ceil(numY, maxY);
954 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
955 const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
956
957 dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
958
959 // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
960 // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<
961 // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << "
962 // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
963
964 const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]};
965 const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]};
966 internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
967 switch (kernel_size_x) {
968 case 4: {
969 switch (kernel_size_y) {
970 case 7: {
971 LAUNCH_GPU_KERNEL(
972 (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
973 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
974 numY, maxY, 4, 7, data);
975 break;
976 }
977 default: {
978 LAUNCH_GPU_KERNEL(
979 (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
980 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
981 numY, maxY, 4, kernel_size_y, data);
982 break;
983 }
984 }
985 break;
986 }
987 case 7: {
988 switch (kernel_size_y) {
989 case 4: {
990 LAUNCH_GPU_KERNEL(
991 (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
992 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
993 numY, maxY, 7, 4, data);
994 break;
995 }
996 default: {
997 LAUNCH_GPU_KERNEL(
998 (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
999 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
1000 numY, maxY, 7, kernel_size_y, data);
1001 break;
1002 }
1003 }
1004 break;
1005 }
1006 default: {
1007 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims,
1008 Dynamic, Dynamic>),
1009 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
1010 numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1011 break;
1012 }
1013 }
1014 break;
1015 }
1016
1017 case 3: {
1018 const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1019 const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1020 const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1021
1022 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1023 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1024 const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1025
1026 const int numX = dimensions()[m_indices[idxX]];
1027 const int numY = dimensions()[m_indices[idxY]];
1028 const int numZ = dimensions()[m_indices[idxZ]];
1029 const int numP = dimensions().TotalSize() / (numX * numY * numZ);
1030
1031 const int maxX = numext::mini<int>(
1032 128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1,
1033 numX));
1034 const int maxY = numext::mini<int>(
1035 128, numext::mini<int>(
1036 maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,
1037 numY));
1038 const int maxZ = numext::mini<int>(
1039 128, numext::mini<int>(
1040 maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -
1041 kernel_size_z + 1,
1042 numZ));
1043
1044 dim3 block_size;
1045 block_size.x = numext::mini(32, maxX);
1046 block_size.y = numext::mini(32, maxY);
1047 block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
1048 dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1049
1050 const int shared_mem =
1051 (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1052 gpu_assert(shared_mem <= maxSharedMem);
1053
1054 // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
1055 // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<
1056 // " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() <<
1057 // endl;
1058 const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]};
1059 const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],
1060 m_kernelImpl.dimensions()[idxZ]};
1061 internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
1062
1063 LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
1064 num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX,
1065 maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1066 break;
1067 }
1068
1069 default: {
1070 EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
1071 THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1072 }
1073 }
1074 }
1075
1076 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
1077 eigen_assert(m_buf);
1078 eigen_assert(index < m_dimensions.TotalSize());
1079 return m_buf[index];
1080 }
1081
1082 template <int LoadMode>
1083 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
1084 eigen_assert(m_buf);
1085 eigen_assert(index < m_dimensions.TotalSize());
1086 return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
1087 }
1088
1089 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
1090 // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1091 // model.
1092 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1093 // We ignore the use of fused multiply-add.
1094 const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1095 const double firstIndex_compute_cost =
1096 NumDims *
1097 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
1098 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1099 kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
1100 TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
1101 }
1102
1103 private:
1104 TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1105 TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1106 KernelArgType m_kernelArg;
1107 Indices m_indices;
1108 Dimensions m_dimensions;
1109 Scalar* m_buf;
1110 const Scalar* m_kernel;
1111 bool m_local_kernel;
1112
1113 const GpuDevice& m_device;
1114};
1115#endif
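// Host-side sketch of driving the GPU specialization above (not part of the
// original header; error handling omitted). Assumes a separate translation
// unit compiled by a CUDA/HIP compiler with EIGEN_USE_GPU defined and
// <unsupported/Eigen/CXX11/Tensor> included; sizes and the elided data
// transfer are placeholders.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
inline void gpu_convolve_sketch() {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  // 4 x 10 x 5 input, length-3 kernel along dimension 1, 4 x 8 x 5 output.
  float* d_in = static_cast<float*>(gpu_device.allocate(4 * 10 * 5 * sizeof(float)));
  float* d_kernel = static_cast<float*>(gpu_device.allocate(3 * sizeof(float)));
  float* d_out = static_cast<float*>(gpu_device.allocate(4 * 8 * 5 * sizeof(float)));
  // ... copy the input and kernel values to the device here ...

  Eigen::TensorMap<Eigen::Tensor<float, 3> > input(d_in, 4, 10, 5);
  Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel(d_kernel, 3);
  Eigen::TensorMap<Eigen::Tensor<float, 3> > output(d_out, 4, 8, 5);

  Eigen::array<Eigen::DenseIndex, 1> dims = {{1}};
  // Evaluating on a GpuDevice dispatches to one of the EigenConvolutionKernel*
  // launchers defined above.
  output.device(gpu_device) = input.convolve(kernel, dims);
  gpu_device.synchronize();

  gpu_device.deallocate(d_in);
  gpu_device.deallocate(d_kernel);
  gpu_device.deallocate(d_out);
}
#endif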
1116
1117} // end namespace Eigen
1118
1119#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H