Eigen-unsupported  3.4.1 (git rev 28ded8800c26864e537852658428ab44c8399e87)
 
TensorConvolution.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12
13namespace Eigen {
14
15namespace internal {
16
17template <typename Index, typename InputDims, int NumKernelDims, int Layout>
18class IndexMapper {
19 public:
20 IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
21 const array<Index, NumKernelDims>& indices) {
22
23 array<Index, NumDims> dimensions = input_dims;
24 for (int i = 0; i < NumKernelDims; ++i) {
25 const Index index = indices[i];
26 const Index input_dim = input_dims[index];
27 const Index kernel_dim = kernel_dims[i];
28 const Index result_dim = input_dim - kernel_dim + 1;
29 dimensions[index] = result_dim;
30 }
31
32 array<Index, NumDims> inputStrides;
33 array<Index, NumDims> outputStrides;
34 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
35 inputStrides[0] = 1;
36 outputStrides[0] = 1;
37 for (int i = 1; i < NumDims; ++i) {
38 inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
39 outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
40 }
41 } else {
42 inputStrides[NumDims - 1] = 1;
43 outputStrides[NumDims - 1] = 1;
44 for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
45 inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
46 outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
47 }
48 }
49
50 array<Index, NumDims> gpuInputDimensions;
51 array<Index, NumDims> gpuOutputDimensions;
52 array<Index, NumDims> tmp = dimensions;
53 array<Index, NumDims> ordering;
54 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
55 ? 0
56 : NumDims - NumKernelDims;
57 for (int i = 0; i < NumKernelDims; ++i) {
58 const Index index = i + offset;
59 ordering[index] = indices[i];
60 tmp[indices[i]] = -1;
61 gpuInputDimensions[index] = input_dims[indices[i]];
62 gpuOutputDimensions[index] = dimensions[indices[i]];
63 }
64
65 int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
66 ? NumKernelDims
67 : 0;
68 for (int i = 0; i < NumDims; ++i) {
69 if (tmp[i] >= 0) {
70 ordering[written] = i;
71 gpuInputDimensions[written] = input_dims[i];
72 gpuOutputDimensions[written] = dimensions[i];
73 ++written;
74 }
75 }
76
77 for (int i = 0; i < NumDims; ++i) {
78 m_inputStrides[i] = inputStrides[ordering[i]];
79 m_outputStrides[i] = outputStrides[ordering[i]];
80 }
81
82 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
83 for (int i = 0; i < NumDims; ++i) {
84 if (i > NumKernelDims) {
85 m_gpuInputStrides[i] =
86 m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
87 m_gpuOutputStrides[i] =
88 m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
89 } else {
90 m_gpuInputStrides[i] = 1;
91 m_gpuOutputStrides[i] = 1;
92 }
93 }
94 } else {
95 for (int i = NumDims - 1; i >= 0; --i) {
96 if (static_cast<size_t>(i + 1) < offset) {
97 m_gpuInputStrides[i] =
98 m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
99 m_gpuOutputStrides[i] =
100 m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
101 } else {
102 m_gpuInputStrides[i] = 1;
103 m_gpuOutputStrides[i] = 1;
104 }
105 }
106 }
107 }
108
109 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
110 Index inputIndex = 0;
111 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
112 for (int d = NumDims - 1; d > NumKernelDims; --d) {
113 const Index idx = p / m_gpuInputStrides[d];
114 inputIndex += idx * m_inputStrides[d];
115 p -= idx * m_gpuInputStrides[d];
116 }
117 inputIndex += p * m_inputStrides[NumKernelDims];
118 } else {
119 std::ptrdiff_t limit = 0;
120 if (NumKernelDims < NumDims) {
121 limit = NumDims - NumKernelDims - 1;
122 }
123 for (int d = 0; d < limit; ++d) {
124 const Index idx = p / m_gpuInputStrides[d];
125 inputIndex += idx * m_inputStrides[d];
126 p -= idx * m_gpuInputStrides[d];
127 }
128 inputIndex += p * m_inputStrides[limit];
129 }
130 return inputIndex;
131 }
132
133 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
134 Index outputIndex = 0;
135 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
136 for (int d = NumDims - 1; d > NumKernelDims; --d) {
137 const Index idx = p / m_gpuOutputStrides[d];
138 outputIndex += idx * m_outputStrides[d];
139 p -= idx * m_gpuOutputStrides[d];
140 }
141 outputIndex += p * m_outputStrides[NumKernelDims];
142 } else {
143 std::ptrdiff_t limit = 0;
144 if (NumKernelDims < NumDims) {
145 limit = NumDims - NumKernelDims - 1;
146 }
147 for (int d = 0; d < limit; ++d) {
148 const Index idx = p / m_gpuOutputStrides[d];
149 outputIndex += idx * m_outputStrides[d];
150 p -= idx * m_gpuOutputStrides[d];
151 }
152 outputIndex += p * m_outputStrides[limit];
153 }
154 return outputIndex;
155 }
156
157 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
158 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
159 ? 0
160 : NumDims - NumKernelDims;
161 return i * m_inputStrides[offset];
162 }
163
164 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
165 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166 ? 0
167 : NumDims - NumKernelDims;
168 return i * m_outputStrides[offset];
169 }
170
171 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
172 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173 ? 0
174 : NumDims - NumKernelDims;
175 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
176 }
177
178 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
179 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180 ? 0
181 : NumDims - NumKernelDims;
182 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
183 }
184
185 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
186 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187 ? 0
188 : NumDims - NumKernelDims;
189 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
190 k * m_inputStrides[offset + 2];
191 }
192
193 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
194 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
195 ? 0
196 : NumDims - NumKernelDims;
197 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
198 k * m_outputStrides[offset + 2];
199 }
200
201 private:
202 static const int NumDims = internal::array_size<InputDims>::value;
203 array<Index, NumDims> m_inputStrides;
204 array<Index, NumDims> m_outputStrides;
205 array<Index, NumDims> m_gpuInputStrides;
206 array<Index, NumDims> m_gpuOutputStrides;
207};
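
The IndexMapper above remaps the tensor's strides so that, for the GPU path, the convolved ("kernel") dimensions and the remaining "plane" dimensions can be addressed independently. The two basic computations its constructor performs are the "valid" output extent, input_dim - kernel_dim + 1, and the cumulative strides. A minimal standalone sketch of those two steps for a column-major layout (plain C++, illustrative sizes only, not Eigen's actual types):

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical 3-D input, convolved along dimension 1 with a kernel of size 3.
  std::vector<long> input_dims = {5, 7, 4};
  const int conv_dim = 1;
  const long kernel_dim = 3;

  // Output ("valid" convolution) dimensions: the convolved extent shrinks to
  // input_dim - kernel_dim + 1, the others are unchanged.
  std::vector<long> output_dims = input_dims;
  output_dims[conv_dim] = input_dims[conv_dim] - kernel_dim + 1;

  // Column-major strides: stride[0] = 1, stride[i] = stride[i-1] * dim[i-1].
  std::vector<long> input_strides(3), output_strides(3);
  input_strides[0] = output_strides[0] = 1;
  for (int i = 1; i < 3; ++i) {
    input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
    output_strides[i] = output_strides[i - 1] * output_dims[i - 1];
  }

  for (int i = 0; i < 3; ++i)
    std::printf("dim %d: in=%ld out=%ld in_stride=%ld out_stride=%ld\n",
                i, input_dims[i], output_dims[i], input_strides[i], output_strides[i]);
  // Expected: output dims {5, 5, 4}, input strides {1, 5, 35}, output strides {1, 5, 25}.
}

These are the same formulas the constructor applies before reordering the strides for the GPU kernels.
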
208
209
210
211template<typename Dimensions, typename InputXprType, typename KernelXprType>
212struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
213{
214 // Type promotion to handle the case where the types of the lhs and the rhs are different.
215 typedef typename promote_storage_type<typename InputXprType::Scalar,
216 typename KernelXprType::Scalar>::ret Scalar;
217 typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
218 typename traits<KernelXprType>::StorageKind>::ret StorageKind;
219 typedef typename promote_index_type<typename traits<InputXprType>::Index,
220 typename traits<KernelXprType>::Index>::type Index;
221 typedef typename InputXprType::Nested LhsNested;
222 typedef typename KernelXprType::Nested RhsNested;
223 typedef typename remove_reference<LhsNested>::type _LhsNested;
224 typedef typename remove_reference<RhsNested>::type _RhsNested;
225 static const int NumDimensions = traits<InputXprType>::NumDimensions;
226 static const int Layout = traits<InputXprType>::Layout;
227 typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
228 typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
229
230 enum {
231 Flags = 0
232 };
233};
234
235template<typename Dimensions, typename InputXprType, typename KernelXprType>
236struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
237{
238 typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
239};
240
241template<typename Dimensions, typename InputXprType, typename KernelXprType>
242struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
243{
244 typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
245};
246
247} // end namespace internal
248
252template <typename Indices, typename InputXprType, typename KernelXprType>
253class TensorConvolutionOp
254 : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {
255 public:
256 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
257 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
258 typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
259 typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
260 typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
261 typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
262 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
263
264 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
265 : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
266
267 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
268 const Indices& indices() const { return m_indices; }
269
271 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
272 const typename internal::remove_all<typename InputXprType::Nested>::type&
273 inputExpression() const { return m_input_xpr; }
274
275 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
276 const typename internal::remove_all<typename KernelXprType::Nested>::type&
277 kernelExpression() const { return m_kernel_xpr; }
278
279 protected:
280 typename InputXprType::Nested m_input_xpr;
281 typename KernelXprType::Nested m_kernel_xpr;
282 const Indices m_indices;
283};
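
TensorConvolutionOp is normally not constructed directly; it is the expression returned by the convolve() method of TensorBase, and evaluating it (for example by assigning it to a tensor) runs the evaluators defined below. A short usage sketch performing a "valid" 2-D convolution over dimensions 1 and 2 of a rank-4 tensor (sizes chosen for illustration):

#include <cstddef>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 4> input(3, 3, 7, 11);
  Eigen::Tensor<float, 2> kernel(2, 2);
  input.setRandom();
  kernel.setRandom();

  // Convolve over dimensions 1 and 2: those extents become 3 - 2 + 1 = 2 and
  // 7 - 2 + 1 = 6, the other dimensions are unchanged.
  Eigen::array<std::ptrdiff_t, 2> dims({1, 2});
  Eigen::Tensor<float, 4> output(3, 2, 6, 11);
  output = input.convolve(kernel, dims);
  return 0;
}

Each convolved extent shrinks by kernel_size - 1, matching the result_dim = input_dim - kernel_dim + 1 computation in the evaluators.
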
284
285
286template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
287struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
288{
289 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
290
291 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
292 static const int NumKernelDims = internal::array_size<Indices>::value;
293 typedef typename XprType::Index Index;
294 typedef DSizes<Index, NumDims> Dimensions;
295
296 typedef typename XprType::Scalar Scalar;
297 typedef typename XprType::CoeffReturnType CoeffReturnType;
298 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
299 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
300 typedef StorageMemory<Scalar, Device> Storage;
301 typedef typename Storage::Type EvaluatorPointerType;
302
303 enum {
304 IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
305 PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
306 BlockAccess = false,
307 PreferBlockAccess = false,
308 Layout = TensorEvaluator<InputArgType, Device>::Layout,
309 CoordAccess = false, // to be implemented
310 RawAccess = false
311 };
312
313 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
314 typedef internal::TensorBlockNotImplemented TensorBlock;
315 //===--------------------------------------------------------------------===//
316
317 EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
318 : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
319 {
320 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
321
322 const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
323 const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
324
325 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
326 m_inputStride[0] = 1;
327 for (int i = 1; i < NumDims; ++i) {
328 m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
329 }
330 } else {
331 m_inputStride[NumDims - 1] = 1;
332 for (int i = NumDims - 2; i >= 0; --i) {
333 m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
334 }
335 }
336
337 m_dimensions = m_inputImpl.dimensions();
338 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
339 for (int i = 0; i < NumKernelDims; ++i) {
340 const Index index = op.indices()[i];
341 const Index input_dim = input_dims[index];
342 const Index kernel_dim = kernel_dims[i];
343 const Index result_dim = input_dim - kernel_dim + 1;
344 m_dimensions[index] = result_dim;
345 if (i > 0) {
346 m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
347 } else {
348 m_kernelStride[0] = 1;
349 }
350 m_indexStride[i] = m_inputStride[index];
351 }
352
353 m_outputStride[0] = 1;
354 for (int i = 1; i < NumDims; ++i) {
355 m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
356 }
357 } else {
358 for (int i = NumKernelDims - 1; i >= 0; --i) {
359 const Index index = op.indices()[i];
360 const Index input_dim = input_dims[index];
361 const Index kernel_dim = kernel_dims[i];
362 const Index result_dim = input_dim - kernel_dim + 1;
363 m_dimensions[index] = result_dim;
364 if (i < NumKernelDims - 1) {
365 m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
366 } else {
367 m_kernelStride[NumKernelDims - 1] = 1;
368 }
369 m_indexStride[i] = m_inputStride[index];
370 }
371
372 m_outputStride[NumDims - 1] = 1;
373 for (int i = NumDims - 2; i >= 0; --i) {
374 m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
375 }
376 }
377 }
378
379 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
380
381 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
382 m_inputImpl.evalSubExprsIfNeeded(NULL);
383 preloadKernel();
384 return true;
385 }
386 EIGEN_STRONG_INLINE void cleanup() {
387 m_inputImpl.cleanup();
388 if (m_local_kernel) {
389 m_device.deallocate((void*)m_kernel);
390 m_local_kernel = false;
391 }
392 m_kernel = NULL;
393 }
394
395 void evalTo(typename XprType::Scalar* buffer) {
396 evalSubExprsIfNeeded(NULL);
397 for (int i = 0; i < dimensions().TotalSize(); ++i) {
398 buffer[i] += coeff(i);
399 }
400 cleanup();
401 }
402
403 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
404 {
405 CoeffReturnType result = CoeffReturnType(0);
406 convolve(firstInput(index), 0, NumKernelDims-1, result);
407 return result;
408 }
409
410 template<int LoadMode>
411 EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
412 {
413 Index indices[2] = {index, index+PacketSize-1};
414 Index startInputs[2] = {0, 0};
415 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
416 for (int i = NumDims - 1; i > 0; --i) {
417 const Index idx0 = indices[0] / m_outputStride[i];
418 const Index idx1 = indices[1] / m_outputStride[i];
419 startInputs[0] += idx0 * m_inputStride[i];
420 startInputs[1] += idx1 * m_inputStride[i];
421 indices[0] -= idx0 * m_outputStride[i];
422 indices[1] -= idx1 * m_outputStride[i];
423 }
424 } else {
425 for (int i = 0; i < NumDims - 1; ++i) {
426 const Index idx0 = indices[0] / m_outputStride[i];
427 const Index idx1 = indices[1] / m_outputStride[i];
428 startInputs[0] += idx0 * m_inputStride[i];
429 startInputs[1] += idx1 * m_inputStride[i];
430 indices[0] -= idx0 * m_outputStride[i];
431 indices[1] -= idx1 * m_outputStride[i];
432 }
433 }
434 startInputs[0] += indices[0];
435 startInputs[1] += indices[1];
436
437 if (startInputs[1]-startInputs[0] == PacketSize-1) {
438 PacketReturnType result = internal::pset1<PacketReturnType>(0);
439 convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
440 return result;
441 } else {
442 EIGEN_ALIGN_MAX Scalar data[PacketSize];
443 data[0] = Scalar(0);
444 convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
445 for (int i = 1; i < PacketSize-1; ++i) {
446 data[i] = Scalar(0);
447 convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
448 }
449 data[PacketSize-1] = Scalar(0);
450 convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
451 return internal::pload<PacketReturnType>(data);
452 }
453 }
454
455 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
456 costPerCoeff(bool vectorized) const {
457 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
458 // We ignore the use of fused multiply-add.
459 const double convolve_compute_cost =
460 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
461 const double firstIndex_compute_cost =
462 NumDims *
463 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
464 TensorOpCost::DivCost<Index>());
465 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
466 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
467 m_kernelImpl.costPerCoeff(vectorized) +
468 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
469 PacketSize));
470 }
471
472 EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
473
474 private:
475 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
476 Index startInput = 0;
477 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
478 for (int i = NumDims - 1; i > 0; --i) {
479 const Index idx = index / m_outputStride[i];
480 startInput += idx * m_inputStride[i];
481 index -= idx * m_outputStride[i];
482 }
483 } else {
484 for (int i = 0; i < NumDims - 1; ++i) {
485 const Index idx = index / m_outputStride[i];
486 startInput += idx * m_inputStride[i];
487 index -= idx * m_outputStride[i];
488 }
489 }
490 startInput += index;
491 return startInput;
492 }
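  // Illustration of the decomposition above (ColMajor, made-up sizes): with
  // output dims {4, 6} the output strides are {1, 4} and the input strides
  // (for input dims {6, 8}) are {1, 6}. For flat output index 9 the loop
  // peels off idx = 9 / 4 = 2 for dimension 1, leaving remainder 1, and
  // re-encodes the coordinates (1, 2) with the input strides:
  // startInput = 1 * 1 + 2 * 6 = 13, the first input element feeding that
  // output coefficient.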
493
494 EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
495 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
496 const Index input = firstIndex + j * m_indexStride[DimIndex];
497 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
498 if (DimIndex > 0) {
499 convolve(input, kernel, DimIndex-1, accum);
500 } else {
501 accum += m_inputImpl.coeff(input) * m_kernel[kernel];
502 }
503 }
504 }
505
506 template <typename Packet>
507 EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
508 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
509 const Index input = firstIndex + j * m_indexStride[DimIndex];
510 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
511 if (DimIndex > 0) {
512 convolvePacket(input, kernel, DimIndex-1, accum);
513 } else {
514 accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
515 }
516 }
517 }
518
519 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
520 // Don't make a local copy of the kernel unless we have to (i.e. it's an
521 // expression that needs to be evaluated)
522 const Scalar* in_place = m_kernelImpl.data();
523 if (in_place) {
524 m_kernel = in_place;
525 m_local_kernel = false;
526 } else {
527 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
528 Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
529 typedef TensorEvalToOp<const KernelArgType> EvalTo;
530 EvalTo evalToTmp(local, m_kernelArg);
531 const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
532 internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
533
534 m_kernel = local;
535 m_local_kernel = true;
536 }
537 }
538
539 array<Index, NumDims> m_inputStride;
540 array<Index, NumDims> m_outputStride;
541
542 array<Index, NumKernelDims> m_indexStride;
543 array<Index, NumKernelDims> m_kernelStride;
544 TensorEvaluator<InputArgType, Device> m_inputImpl;
545 TensorEvaluator<KernelArgType, Device> m_kernelImpl;
546 Dimensions m_dimensions;
547
548 KernelArgType m_kernelArg;
549 const Scalar* m_kernel;
550 bool m_local_kernel;
551 const Device EIGEN_DEVICE_REF m_device;
552};
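
Putting the pieces of the CPU evaluator together: coeff(index) calls firstInput() to translate the flat output index into the offset of the first contributing input element, and convolve() then walks the kernel dimensions recursively, accumulating input * kernel products (no kernel flipping is performed). A standalone sketch of that control flow for the 1-D case (plain C++, not Eigen's API; names are illustrative):

#include <cstdio>
#include <vector>

// Naive 1-D "valid" convolution: out[i] = sum_j in[i + j] * k[j],
// mirroring what coeff()/convolve() compute one output index at a time.
std::vector<float> convolve1d(const std::vector<float>& in,
                              const std::vector<float>& k) {
  std::vector<float> out(in.size() - k.size() + 1, 0.0f);
  for (size_t i = 0; i < out.size(); ++i)      // one call to coeff(i)
    for (size_t j = 0; j < k.size(); ++j)      // the convolve() loop
      out[i] += in[i + j] * k[j];
  return out;
}

int main() {
  std::vector<float> in = {1, 2, 3, 4, 5};
  std::vector<float> k  = {1, 0, -1};
  std::vector<float> out = convolve1d(in, k);  // expected: {-2, -2, -2}
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
}
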
553
554
555
556
557// Use an optimized implementation of the evaluation code for GPUs whenever possible.
558#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
559
560template <int StaticKernelSize>
561struct GetKernelSize {
562 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
563 return StaticKernelSize;
564 }
565};
566template <>
567struct GetKernelSize<Dynamic> {
568 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
569 return kernelSize;
570 }
571};
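
GetKernelSize lets the GPU kernels below be instantiated either with a compile-time kernel size, so the inner loops over the kernel can be unrolled, or with Dynamic, in which case the runtime argument is used. A condensed illustration of the same dispatch pattern (hypothetical names, plain C++):

#include <cstdio>

const int kDynamic = -1;  // stand-in for Eigen::Dynamic in this sketch

template <int StaticSize>
struct PickSize {
  int operator()(int /*runtimeSize*/) const { return StaticSize; }
};
template <>
struct PickSize<kDynamic> {
  int operator()(int runtimeSize) const { return runtimeSize; }
};

int main() {
  std::printf("%d\n", PickSize<4>()(99));        // prints 4: static size wins
  std::printf("%d\n", PickSize<kDynamic>()(99)); // prints 99: falls back to runtime
}
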
572
573template <typename InputEvaluator, typename Index, typename InputDims,
574 int StaticKernelSize>
575__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
576 InputEvaluator eval,
577 const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
578 indexMapper,
579 const float* __restrict kernel, const int numPlanes, const int numX,
580 const int maxX, const int kernelSize, float* buffer) {
581#if defined(EIGEN_HIPCC)
582 HIP_DYNAMIC_SHARED(float, s)
583#else
584 extern __shared__ float s[];
585#endif
586
587 const int first_x = blockIdx.x * maxX;
588 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
589 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
590 const int num_x_output = last_x - first_x + 1;
591
592 const int first_plane = blockIdx.y * blockDim.y;
593 const int plane_stride = blockDim.y * gridDim.y;
594
595 for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
596 // Load inputs to shared memory
597 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
598 const int plane_kernel_offset = threadIdx.y * num_x_input;
599 #pragma unroll
600 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
601 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
602 s[i + plane_kernel_offset] = eval.coeff(tensor_index);
603 }
604
605 __syncthreads();
606
607 // Compute the convolution
608 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
609
610 #pragma unroll
611 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
612 const int kernel_offset = plane_kernel_offset + i;
613 float result = 0.0f;
614 #pragma unroll
615 for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
616 result += s[k + kernel_offset] * kernel[k];
617 }
618 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
619 buffer[tensor_index] = result;
620 }
621 __syncthreads();
622 }
623};
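
Each block of the 1-D kernel above loads a tile of last_x - first_x + kernelSize inputs per plane into shared memory, synchronizes, and then every thread produces outputs by sliding the kernel over the cached tile, so each input value is read from global memory once per block rather than kernelSize times. The following is a CPU model of that tiling idea, a sketch with made-up sizes rather than the actual device code:

#include <cstdio>
#include <vector>

int main() {
  const int numX = 10, kernelSize = 3, maxX = 4;   // outputs per tile
  std::vector<float> in(numX), kernel = {0.25f, 0.5f, 0.25f};
  for (int i = 0; i < numX; ++i) in[i] = float(i);
  const int numOut = numX - kernelSize + 1;
  std::vector<float> out(numOut, 0.0f);

  // One iteration of this loop corresponds to one block along x.
  for (int first_x = 0; first_x < numOut; first_x += maxX) {
    const int last_x = (first_x + maxX < numOut ? first_x + maxX : numOut) - 1;
    const int num_x_input = last_x - first_x + kernelSize;   // tile size incl. halo
    std::vector<float> tile(num_x_input);                    // plays the role of shared memory
    for (int i = 0; i < num_x_input; ++i) tile[i] = in[first_x + i];

    // Each output in the tile reuses the cached inputs.
    for (int i = 0; i <= last_x - first_x; ++i) {
      float result = 0.0f;
      for (int k = 0; k < kernelSize; ++k) result += tile[i + k] * kernel[k];
      out[first_x + i] = result;
    }
  }
  for (float v : out) std::printf("%g ", v);   // smoothed ramp: 1 2 3 4 5 6 7 8
  std::printf("\n");
}
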
624
625template <typename InputEvaluator, typename Index, typename InputDims,
626 int StaticKernelSizeX, int StaticKernelSizeY>
627__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
628 InputEvaluator eval,
629 const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
630 indexMapper,
631 const float* __restrict kernel, const int numPlanes, const int numX,
632 const int maxX, const int numY, const int maxY, const int kernelSizeX,
633 const int kernelSizeY, float* buffer) {
634#if defined(EIGEN_HIPCC)
635 HIP_DYNAMIC_SHARED(float, s)
636#else
637 extern __shared__ float s[];
638#endif
639
640 const int first_x = blockIdx.x * maxX;
641 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
642 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
643 const int num_x_output = last_x - first_x + 1;
644
645 const int first_y = blockIdx.y * maxY;
646 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
647 const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
648 const int num_y_output = last_y - first_y + 1;
649
650 const int first_plane = blockIdx.z * blockDim.z;
651 const int plane_stride = blockDim.z * gridDim.z;
652
653 for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
654
655 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
656 const int plane_kernel_offset = threadIdx.z * num_y_input;
657
658 // Load inputs to shared memory
659 #pragma unroll
660 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
661 const int input_offset = num_x_input * (j + plane_kernel_offset);
662 #pragma unroll
663 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
664 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
665 s[i + input_offset] = eval.coeff(tensor_index);
666 }
667 }
668
669 __syncthreads();
670
671 // Convolution
672 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
673
674 #pragma unroll
675 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
676 #pragma unroll
677 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
678 float result = 0.0f;
679 #pragma unroll
680 for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
681 const int kernel_offset = kernelSizeX * l;
682 const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
683 #pragma unroll
684 for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
685 result += s[k + input_offset] * kernel[k + kernel_offset];
686 }
687 }
688 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
689 buffer[tensor_index] = result;
690 }
691 }
692
693 __syncthreads();
694 }
695};
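
The 2-D kernel caches a (num_x_input x num_y_input) tile per plane handled by the block, which is why the launch code later reserves block_size.z * (maxX + kernelSizeX - 1) * (maxY + kernelSizeY - 1) * sizeof(Scalar) bytes of shared memory. A tiny worked calculation of that footprint (numbers chosen arbitrarily for illustration):

#include <cstdio>

int main() {
  const int maxX = 32, maxY = 8;          // outputs per block tile
  const int kernelSizeX = 4, kernelSizeY = 7;
  const int planesPerBlock = 2;           // block_size.z in the launch code
  const size_t bytes = planesPerBlock
                     * size_t(maxX + kernelSizeX - 1)
                     * size_t(maxY + kernelSizeY - 1)
                     * sizeof(float);
  // (32 + 3) * (8 + 6) = 490 floats per plane, 980 floats total = 3920 bytes.
  std::printf("shared memory needed: %zu bytes\n", bytes);
}
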
696
697template <typename InputEvaluator, typename Index, typename InputDims>
698__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
699 InputEvaluator eval,
700 const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
701 indexMapper,
702 const float* __restrict kernel, const size_t numPlanes, const size_t numX,
703 const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
704 const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
705 const size_t kernelSizeZ, float* buffer) {
706#if defined(EIGEN_HIPCC)
707 HIP_DYNAMIC_SHARED(float, s)
708#else
709 extern __shared__ float s[];
710#endif
711
712 // Load inputs to shared memory
713 const int first_x = blockIdx.x * maxX;
714 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
715 const int num_x_input = last_x - first_x + kernelSizeX;
716
717 const int first_y = blockIdx.y * maxY;
718 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
719 const int num_y_input = last_y - first_y + kernelSizeY;
720
721 const int first_z = blockIdx.z * maxZ;
722 const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
723 const int num_z_input = last_z - first_z + kernelSizeZ;
724
725 for (int p = 0; p < numPlanes; ++p) {
726
727 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
728 const int plane_kernel_offset = 0;
729
730 for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
731 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
732 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
733 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
734 s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
735 }
736 }
737 }
738
739 __syncthreads();
740
741 // Convolution
742 const int num_z_output = last_z - first_z + 1;
743 const int num_y_output = last_y - first_y + 1;
744 const int num_x_output = last_x - first_x + 1;
745 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
746
747 for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
748 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
749 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
750 float result = 0.0f;
751 for (int n = 0; n < kernelSizeZ; ++n) {
752 for (int m = 0; m < kernelSizeY; ++m) {
753 for (int l = 0; l < kernelSizeX; ++l) {
754 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
755 }
756 }
757 }
758 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
759 buffer[tensor_index] = result;
760 }
761 }
762 }
763 __syncthreads();
764 }
765};
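
The 3-D kernel flattens its shared-memory tile with x fastest, then y, then z: index i + num_x_input * (j + num_y_input * k). A quick sanity check of that formula (illustrative sizes):

#include <cassert>

int main() {
  const int num_x_input = 5, num_y_input = 4, num_z_input = 3;
  // Flattened index used by the 3D kernel: x fastest, then y, then z.
  auto flat = [&](int i, int j, int k) {
    return i + num_x_input * (j + num_y_input * k);
  };
  assert(flat(0, 0, 0) == 0);
  assert(flat(1, 0, 0) == 1);                          // +1 along x
  assert(flat(0, 1, 0) == num_x_input);                // +num_x_input along y
  assert(flat(0, 0, 1) == num_x_input * num_y_input);  // +x*y along z
  assert(flat(4, 3, 2) == num_x_input * num_y_input * num_z_input - 1);
  return 0;
}
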
766
767
768
769template<typename Indices, typename InputArgType, typename KernelArgType>
770struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
771{
772 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
773
774 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
775 static const int NumKernelDims = internal::array_size<Indices>::value;
776 typedef typename XprType::Index Index;
777 typedef DSizes<Index, NumDims> Dimensions;
778 typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
779
780 enum {
781 IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
782 PacketAccess = false,
783 BlockAccess = false,
784 PreferBlockAccess = false,
785 Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
786 CoordAccess = false, // to be implemented
787 RawAccess = false
788 };
789
790 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
791 typedef internal::TensorBlockNotImplemented TensorBlock;
792 //===--------------------------------------------------------------------===//
793
794 TensorEvaluator(const XprType& op, const GpuDevice& device)
795 : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
796 {
797 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
798
799 const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
800 const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
801
802 m_dimensions = m_inputImpl.dimensions();
803 for (int i = 0; i < NumKernelDims; ++i) {
804 const Index index = op.indices()[i];
805 const Index input_dim = input_dims[index];
806 const Index kernel_dim = kernel_dims[i];
807 const Index result_dim = input_dim - kernel_dim + 1;
808 m_dimensions[index] = result_dim;
809 }
810 }
811
812 typedef typename XprType::CoeffReturnType CoeffReturnType;
813 typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
814 typedef typename InputArgType::Scalar Scalar;
815 static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
816
817 EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
818
819 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
820 preloadKernel();
821 m_inputImpl.evalSubExprsIfNeeded(NULL);
822 if (data) {
823 executeEval(data);
824 return false;
825 } else {
826 m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
827 executeEval(m_buf);
828 return true;
829 }
830 }
831
832 EIGEN_STRONG_INLINE void cleanup() {
833 m_inputImpl.cleanup();
834 if (m_buf) {
835 m_device.deallocate(m_buf);
836 m_buf = NULL;
837 }
838 if (m_local_kernel) {
839 m_device.deallocate((void*)m_kernel);
840 m_local_kernel = false;
841 }
842 m_kernel = NULL;
843 }
844
845 EIGEN_STRONG_INLINE void preloadKernel() {
846 // Don't make a local copy of the kernel unless we have to (i.e. it's an
847 // expression that needs to be evaluated)
848 const Scalar* in_place = m_kernelImpl.data();
849 if (in_place) {
850 m_kernel = in_place;
851 m_local_kernel = false;
852 } else {
853 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
854 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
855 typedef TensorEvalToOp<const KernelArgType> EvalTo;
856 EvalTo evalToTmp(local, m_kernelArg);
857 const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
858 internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
859
860 m_kernel = local;
861 m_local_kernel = true;
862 }
863 }
864
865 static unsigned int ceil(unsigned int num, unsigned int denom) {
866 const unsigned int rounded_toward_zero = num / denom;
867 if (num > rounded_toward_zero * denom) {
868 return rounded_toward_zero + 1;
869 }
870 return rounded_toward_zero;
871 }
872
873 void executeEval(Scalar* data) const {
874 typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
875
876 const int maxSharedMem = m_device.sharedMemPerBlock();
877 const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
878 const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
879 const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
880 const int warpSize = 32;
881
882 switch (NumKernelDims) {
883 case 1: {
884 const int kernel_size = m_kernelImpl.dimensions().TotalSize();
885
886 const int numX = dimensions()[m_indices[0]];
887 const int numP = dimensions().TotalSize() / numX;
888 int maxX;
889 dim3 block_size;
890
891 const int single_stride_dim =
892 static_cast<int>(Layout) == static_cast<int>(ColMajor)
893 ? 0
894 : m_inputImpl.dimensions().rank() - 1;
895 if (m_indices[0] == single_stride_dim) {
896 // Maximize the reuse
897 const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
898 maxX = numext::mini<int>(inner_dim, numX);
899 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
900 block_size.x = numext::mini(maxThreadsPerBlock, maxX);
901 block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
902 }
903 else {
904 // Read as much as possible along the innermost dimension, that is, the plane
905 const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
906 const int maxP = numext::mini<int>(inner_dim, numP);
907 maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
908
909 block_size.x = numext::mini(warpSize, maxX);
910 block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
911 }
912
913 const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
914 gpu_assert(shared_mem <= maxSharedMem);
915
916 const int num_x_blocks = ceil(numX, maxX);
917 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
918 const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
919
920 dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
921
922
923 //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
924
925 const array<Index, 1> indices(m_indices[0]);
926 const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
927 internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
928 m_inputImpl.dimensions(), kernel_dims, indices);
929 switch(kernel_size) {
930 case 4: {
931 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
932 break;
933 }
934 case 7: {
935 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
936 break;
937 }
938 default: {
939 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
940 }
941 }
942 break;
943 }
944
945 case 2: {
946 const int idxX =
947 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
948 const int idxY =
949 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
950 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
951 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
952
953 const int numX = dimensions()[m_indices[idxX]];
954 const int numY = dimensions()[m_indices[idxY]];
955 const int numP = dimensions().TotalSize() / (numX*numY);
956
957 const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
958
959 // Snap maxX to warp size
960 int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
961 const int maxX = numext::mini<int>(inner_dim, numX);
962 const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
963 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
964
965 dim3 block_size;
966 block_size.x = numext::mini(1024, maxX);
967 block_size.y = numext::mini<int>(1024/block_size.x, maxY);
968 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
969
970 const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
971 gpu_assert(shared_mem <= maxSharedMem);
972
973 const int num_x_blocks = ceil(numX, maxX);
974 const int num_y_blocks = ceil(numY, maxY);
975 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
976 const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
977
978 dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
979
980
981 //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
982
983 const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
984 const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
985 m_kernelImpl.dimensions()[idxY]);
986 internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
987 m_inputImpl.dimensions(), kernel_dims, indices);
988 switch (kernel_size_x) {
989 case 4: {
990 switch (kernel_size_y) {
991 case 7: {
992 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
993 break;
994 }
995 default: {
996 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
997 break;
998 }
999 }
1000 break;
1001 }
1002 case 7: {
1003 switch (kernel_size_y) {
1004 case 4: {
1005 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
1006 break;
1007 }
1008 default: {
1009 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
1010 break;
1011 }
1012 }
1013 break;
1014 }
1015 default: {
1016 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1017 break;
1018 }
1019 }
1020 break;
1021 }
1022
1023 case 3: {
1024 const int idxX =
1025 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1026 const int idxY =
1027 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1028 const int idxZ =
1029 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1030
1031 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1032 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1033 const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1034
1035 const int numX = dimensions()[m_indices[idxX]];
1036 const int numY = dimensions()[m_indices[idxY]];
1037 const int numZ = dimensions()[m_indices[idxZ]];
1038 const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1039
1040 const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1041 const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1042 const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1043
1044 dim3 block_size;
1045 block_size.x = numext::mini(32, maxX);
1046 block_size.y = numext::mini(32, maxY);
1047 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1048 dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1049
1050 const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1051 gpu_assert(shared_mem <= maxSharedMem);
1052
1053 //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1054 const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1055 m_indices[idxZ]);
1056 const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1057 m_kernelImpl.dimensions()[idxY],
1058 m_kernelImpl.dimensions()[idxZ]);
1059 internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1060 m_inputImpl.dimensions(), kernel_dims, indices);
1061
1062 LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1063 break;
1064 }
1065
1066 default: {
1067 EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1068 }
1069 }
1070 }
1071
1072 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1073 {
1074 eigen_assert(m_buf);
1075 eigen_assert(index < m_dimensions.TotalSize());
1076 return m_buf[index];
1077 }
1078
1079 template<int LoadMode>
1080 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1081 {
1082 eigen_assert(m_buf);
1083 eigen_assert(index < m_dimensions.TotalSize());
1084 return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1085 }
1086
1087 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1088 costPerCoeff(bool vectorized) const {
1089 // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1090 // model.
1091 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1092 // We ignore the use of fused multiply-add.
1093 const double convolve_compute_cost =
1094 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1095 const double firstIndex_compute_cost =
1096 NumDims *
1097 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1098 TensorOpCost::DivCost<Index>());
1099 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1100 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1101 m_kernelImpl.costPerCoeff(vectorized) +
1102 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1103 PacketSize));
1104 }
1105
1106 private:
1107 // No assignment (copies are needed by the kernels)
1108 TensorEvaluator& operator = (const TensorEvaluator&);
1109
1110 TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1111 TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1112 KernelArgType m_kernelArg;
1113 Indices m_indices;
1114 Dimensions m_dimensions;
1115 Scalar* m_buf;
1116 const Scalar* m_kernel;
1117 bool m_local_kernel;
1118
1119 const GpuDevice& m_device;
1120};
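
executeEval() derives the launch configuration from the device limits: the tile sizes (maxX, maxY, maxZ) are chosen so the cached inputs fit in shared memory, block_size is capped by the maximum threads per block, and the grid is sized with the round-up division helper ceil() so that every output point is covered (the real code additionally bounds the plane-wise grid dimension by multiprocessor occupancy). A small worked example of that round-up grid sizing for the 1-D case, with made-up numbers:

#include <cstdio>

// Round-up integer division, same behavior as the ceil() helper above.
static unsigned ceil_div(unsigned num, unsigned denom) {
  return num / denom + (num % denom != 0 ? 1 : 0);
}

int main() {
  // Hypothetical problem size and per-block limits.
  const unsigned numX = 1000;   // output points along the convolved dimension
  const unsigned numP = 64;     // number of planes
  const unsigned maxX = 256;    // outputs handled per block along x
  const unsigned blockY = 4;    // planes handled per block

  const unsigned num_x_blocks = ceil_div(numX, maxX);    // 4
  const unsigned num_y_blocks = ceil_div(numP, blockY);  // 16
  std::printf("grid: %u x %u blocks\n", num_x_blocks, num_y_blocks);
}
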
1121#endif
1122
1123
1124} // end namespace Eigen
1125
1126#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H