TensorConvolution.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12
13namespace Eigen {
14
15namespace internal {
16
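// IndexMapper precomputes the strides needed to translate between the
// "plane index + kernel-dimension offset" coordinates used by the GPU
// convolution kernels below and flat offsets into the input and output
// tensors. The convolved dimensions (given by 'indices') are moved to the
// front (ColMajor) or to the back (RowMajor) of the reordered index space;
// the remaining dimensions form the planes the kernels iterate over.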
17template <typename Index, typename InputDims, int NumKernelDims, int Layout>
18class IndexMapper {
19 public:
20 IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
21 const array<Index, NumKernelDims>& indices) {
22
23 array<Index, NumDims> dimensions = input_dims;
24 for (int i = 0; i < NumKernelDims; ++i) {
25 const Index index = indices[i];
26 const Index input_dim = input_dims[index];
27 const Index kernel_dim = kernel_dims[i];
28 const Index result_dim = input_dim - kernel_dim + 1;
29 dimensions[index] = result_dim;
30 }
31
32 array<Index, NumDims> inputStrides;
33 array<Index, NumDims> outputStrides;
34 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
35 inputStrides[0] = 1;
36 outputStrides[0] = 1;
37 for (int i = 1; i < NumDims; ++i) {
38 inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
39 outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
40 }
41 } else {
42 inputStrides[NumDims - 1] = 1;
43 outputStrides[NumDims - 1] = 1;
44 for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
45 inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
46 outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
47 }
48 }
49
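    // Reorder the dimensions so that the convolved dimensions occupy the
    // first (ColMajor) or last (RowMajor) slots of the cuda index space;
    // the remaining dimensions are appended in their original order.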
50 array<Index, NumDims> cudaInputDimensions;
51 array<Index, NumDims> cudaOutputDimensions;
52 array<Index, NumDims> tmp = dimensions;
53 array<Index, NumDims> ordering;
54 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
55 ? 0
56 : NumDims - NumKernelDims;
57 for (int i = 0; i < NumKernelDims; ++i) {
58 const Index index = i + offset;
59 ordering[index] = indices[i];
60 tmp[indices[i]] = -1;
61 cudaInputDimensions[index] = input_dims[indices[i]];
62 cudaOutputDimensions[index] = dimensions[indices[i]];
63 }
64
65 int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
66 ? NumKernelDims
67 : 0;
68 for (int i = 0; i < NumDims; ++i) {
69 if (tmp[i] >= 0) {
70 ordering[written] = i;
71 cudaInputDimensions[written] = input_dims[i];
72 cudaOutputDimensions[written] = dimensions[i];
73 ++written;
74 }
75 }
76
77 for (int i = 0; i < NumDims; ++i) {
78 m_inputStrides[i] = inputStrides[ordering[i]];
79 m_outputStrides[i] = outputStrides[ordering[i]];
80 }
81
82 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
83 for (int i = 0; i < NumDims; ++i) {
84 if (i > NumKernelDims) {
85 m_cudaInputStrides[i] =
86 m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
87 m_cudaOutputStrides[i] =
88 m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
89 } else {
90 m_cudaInputStrides[i] = 1;
91 m_cudaOutputStrides[i] = 1;
92 }
93 }
94 } else {
95 for (int i = NumDims - 1; i >= 0; --i) {
96 if (i + 1 < offset) {
97 m_cudaInputStrides[i] =
98 m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
99 m_cudaOutputStrides[i] =
100 m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
101 } else {
102 m_cudaInputStrides[i] = 1;
103 m_cudaOutputStrides[i] = 1;
104 }
105 }
106 }
107 }
108
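  // Map a plane index p (a flat index over the non-convolved dimensions) to
  // the offset of the start of that plane in the input/output tensor.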
109 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
110 Index inputIndex = 0;
111 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
112 for (int d = NumDims - 1; d > NumKernelDims; --d) {
113 const Index idx = p / m_cudaInputStrides[d];
114 inputIndex += idx * m_inputStrides[d];
115 p -= idx * m_cudaInputStrides[d];
116 }
117 inputIndex += p * m_inputStrides[NumKernelDims];
118 } else {
119 std::ptrdiff_t limit = 0;
120 if (NumKernelDims < NumDims) {
121 limit = NumDims - NumKernelDims - 1;
122 }
123 for (int d = 0; d < limit; ++d) {
124 const Index idx = p / m_cudaInputStrides[d];
125 inputIndex += idx * m_inputStrides[d];
126 p -= idx * m_cudaInputStrides[d];
127 }
128 inputIndex += p * m_inputStrides[limit];
129 }
130 return inputIndex;
131 }
132
133 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
134 Index outputIndex = 0;
135 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
136 for (int d = NumDims - 1; d > NumKernelDims; --d) {
137 const Index idx = p / m_cudaOutputStrides[d];
138 outputIndex += idx * m_outputStrides[d];
139 p -= idx * m_cudaOutputStrides[d];
140 }
141 outputIndex += p * m_outputStrides[NumKernelDims];
142 } else {
143 std::ptrdiff_t limit = 0;
144 if (NumKernelDims < NumDims) {
145 limit = NumDims - NumKernelDims - 1;
146 }
147 for (int d = 0; d < limit; ++d) {
148 const Index idx = p / m_cudaOutputStrides[d];
149 outputIndex += idx * m_outputStrides[d];
150 p -= idx * m_cudaOutputStrides[d];
151 }
152 outputIndex += p * m_outputStrides[limit];
153 }
154 return outputIndex;
155 }
156
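  // Map coordinates within the convolved dimensions (one overload per number
  // of kernel dimensions) to an offset relative to the start of the plane.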
157 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
158 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
159 ? 0
160 : NumDims - NumKernelDims;
161 return i * m_inputStrides[offset];
162 }
163
164 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
165 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166 ? 0
167 : NumDims - NumKernelDims;
168 return i * m_outputStrides[offset];
169 }
170
171 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
172 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173 ? 0
174 : NumDims - NumKernelDims;
175 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
176 }
177
178 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
179 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180 ? 0
181 : NumDims - NumKernelDims;
182 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
183 }
184
185 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
186 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187 ? 0
188 : NumDims - NumKernelDims;
189 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
190 k * m_inputStrides[offset + 2];
191 }
192
193 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
194 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
195 ? 0
196 : NumDims - NumKernelDims;
197 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
198 k * m_outputStrides[offset + 2];
199 }
200
201 private:
202 static const int NumDims = internal::array_size<InputDims>::value;
203 array<Index, NumDims> m_inputStrides;
204 array<Index, NumDims> m_outputStrides;
205 array<Index, NumDims> m_cudaInputStrides;
206 array<Index, NumDims> m_cudaOutputStrides;
207};
208
209
210
211template<typename Dimensions, typename InputXprType, typename KernelXprType>
212struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
213{
214 // Type promotion to handle the case where the types of the lhs and the rhs are different.
215 typedef typename promote_storage_type<typename InputXprType::Scalar,
216 typename KernelXprType::Scalar>::ret Scalar;
217 typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
218 typename traits<KernelXprType>::StorageKind>::ret StorageKind;
219 typedef typename promote_index_type<typename traits<InputXprType>::Index,
220 typename traits<KernelXprType>::Index>::type Index;
221 typedef typename InputXprType::Nested LhsNested;
222 typedef typename KernelXprType::Nested RhsNested;
223 typedef typename remove_reference<LhsNested>::type _LhsNested;
224 typedef typename remove_reference<RhsNested>::type _RhsNested;
225 static const int NumDimensions = traits<InputXprType>::NumDimensions;
226 static const int Layout = traits<InputXprType>::Layout;
227
228 enum {
229 Flags = 0
230 };
231};
232
233template<typename Dimensions, typename InputXprType, typename KernelXprType>
234struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
235{
236 typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
237};
238
239template<typename Dimensions, typename InputXprType, typename KernelXprType>
240struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
241{
242 typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
243};
244
245} // end namespace internal
246
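// TensorConvolutionOp is the expression type returned by TensorBase::convolve.
// A rough usage sketch (the concrete sizes below are illustrative only):
//   Eigen::Tensor<float, 3> input(20, 30, 40);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   Eigen::array<ptrdiff_t, 2> dims({1, 2});  // convolve along dimensions 1 and 2
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
//   // output dimensions: (20, 30 - 3 + 1, 40 - 3 + 1) = (20, 28, 38)
// Each convolved dimension shrinks to input_dim - kernel_dim + 1 ("valid" convolution).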
250template <typename Indices, typename InputXprType, typename KernelXprType>
251class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
252{
253 public:
254 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
255 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
256 typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
257 typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
258 typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
259 typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
260 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
261
262 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
263 : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
264
265 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
266 const Indices& indices() const { return m_indices; }
267
269 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
270 const typename internal::remove_all<typename InputXprType::Nested>::type&
271 inputExpression() const { return m_input_xpr; }
272
273 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
274 const typename internal::remove_all<typename KernelXprType::Nested>::type&
275 kernelExpression() const { return m_kernel_xpr; }
276
277 protected:
278 typename InputXprType::Nested m_input_xpr;
279 typename KernelXprType::Nested m_kernel_xpr;
280 const Indices m_indices;
281};
282
283
284template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
285struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
286{
287 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
288
289 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
290 static const int NumKernelDims = internal::array_size<Indices>::value;
291 typedef typename XprType::Index Index;
292 typedef DSizes<Index, NumDims> Dimensions;
293
294 typedef typename XprType::Scalar Scalar;
295 typedef typename XprType::CoeffReturnType CoeffReturnType;
296 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
297 static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
298
299 enum {
300 IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
301 PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
302 Layout = TensorEvaluator<InputArgType, Device>::Layout,
303 CoordAccess = false, // to be implemented
304 RawAccess = false
305 };
306
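  // The constructor precomputes the input, output and kernel strides. Along
  // each convolved dimension the output size is input_dim - kernel_dim + 1;
  // all other dimensions are passed through unchanged.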
307 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
308 : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
309 {
310 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
311
312 const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
313 const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
314
315 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
316 m_inputStride[0] = 1;
317 for (int i = 1; i < NumDims; ++i) {
318 m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
319 }
320 } else {
321 m_inputStride[NumDims - 1] = 1;
322 for (int i = NumDims - 2; i >= 0; --i) {
323 m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
324 }
325 }
326
327 m_dimensions = m_inputImpl.dimensions();
328 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
329 for (int i = 0; i < NumKernelDims; ++i) {
330 const Index index = op.indices()[i];
331 const Index input_dim = input_dims[index];
332 const Index kernel_dim = kernel_dims[i];
333 const Index result_dim = input_dim - kernel_dim + 1;
334 m_dimensions[index] = result_dim;
335 if (i > 0) {
336 m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
337 } else {
338 m_kernelStride[0] = 1;
339 }
340 m_indexStride[i] = m_inputStride[index];
341 }
342
343 m_outputStride[0] = 1;
344 for (int i = 1; i < NumDims; ++i) {
345 m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
346 }
347 } else {
348 for (int i = NumKernelDims - 1; i >= 0; --i) {
349 const Index index = op.indices()[i];
350 const Index input_dim = input_dims[index];
351 const Index kernel_dim = kernel_dims[i];
352 const Index result_dim = input_dim - kernel_dim + 1;
353 m_dimensions[index] = result_dim;
354 if (i < NumKernelDims - 1) {
355 m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
356 } else {
357 m_kernelStride[NumKernelDims - 1] = 1;
358 }
359 m_indexStride[i] = m_inputStride[index];
360 }
361
362 m_outputStride[NumDims - 1] = 1;
363 for (int i = NumDims - 2; i >= 0; --i) {
364 m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
365 }
366 }
367 }
368
369 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
370
371 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
372 m_inputImpl.evalSubExprsIfNeeded(NULL);
373 preloadKernel();
374 return true;
375 }
376 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
377 m_inputImpl.cleanup();
378 if (m_local_kernel) {
379 m_device.deallocate((void*)m_kernel);
380 m_local_kernel = false;
381 }
382 m_kernel = NULL;
383 }
384
385 void evalTo(typename XprType::Scalar* buffer) {
386 evalSubExprsIfNeeded(NULL);
387 for (int i = 0; i < dimensions().TotalSize(); ++i) {
388 buffer[i] += coeff(i);
389 }
390 cleanup();
391 }
392
393 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
394 {
395 CoeffReturnType result = CoeffReturnType(0);
396 convolve(firstInput(index), 0, NumKernelDims-1, result);
397 return result;
398 }
399
400 template<int LoadMode>
401 EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
402 {
403 Index indices[2] = {index, index+PacketSize-1};
404 Index startInputs[2] = {0, 0};
405 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
406 for (int i = NumDims - 1; i > 0; --i) {
407 const Index idx0 = indices[0] / m_outputStride[i];
408 const Index idx1 = indices[1] / m_outputStride[i];
409 startInputs[0] += idx0 * m_inputStride[i];
410 startInputs[1] += idx1 * m_inputStride[i];
411 indices[0] -= idx0 * m_outputStride[i];
412 indices[1] -= idx1 * m_outputStride[i];
413 }
414 } else {
415 for (int i = 0; i < NumDims - 1; ++i) {
416 const Index idx0 = indices[0] / m_outputStride[i];
417 const Index idx1 = indices[1] / m_outputStride[i];
418 startInputs[0] += idx0 * m_inputStride[i];
419 startInputs[1] += idx1 * m_inputStride[i];
420 indices[0] -= idx0 * m_outputStride[i];
421 indices[1] -= idx1 * m_outputStride[i];
422 }
423 }
424 startInputs[0] += indices[0];
425 startInputs[1] += indices[1];
426
427 if (startInputs[1]-startInputs[0] == PacketSize-1) {
428 PacketReturnType result = internal::pset1<PacketReturnType>(0);
429 convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
430 return result;
431 } else {
432 EIGEN_ALIGN_MAX Scalar data[PacketSize];
433 data[0] = Scalar(0);
434 convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
435 for (int i = 1; i < PacketSize-1; ++i) {
436 data[i] = Scalar(0);
437 convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
438 }
439 data[PacketSize-1] = Scalar(0);
440 convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
441 return internal::pload<PacketReturnType>(data);
442 }
443 }
444
445 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
446 costPerCoeff(bool vectorized) const {
447 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
448 // We ignore the use of fused multiply-add.
449 const double convolve_compute_cost =
450 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
451 const double firstIndex_compute_cost =
452 NumDims *
453 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
454 TensorOpCost::DivCost<Index>());
455 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
456 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
457 m_kernelImpl.costPerCoeff(vectorized) +
458 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
459 PacketSize));
460 }
461
462 EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
463
464 private:
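  // Translate an output coefficient index into the index of the first input
  // coefficient that contributes to it (the kernel anchored at offset 0).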
465 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
466 Index startInput = 0;
467 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
468 for (int i = NumDims - 1; i > 0; --i) {
469 const Index idx = index / m_outputStride[i];
470 startInput += idx * m_inputStride[i];
471 index -= idx * m_outputStride[i];
472 }
473 } else {
474 for (int i = 0; i < NumDims - 1; ++i) {
475 const Index idx = index / m_outputStride[i];
476 startInput += idx * m_inputStride[i];
477 index -= idx * m_outputStride[i];
478 }
479 }
480 startInput += index;
481 return startInput;
482 }
483
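  // Recursively walk the kernel dimensions, accumulating input * kernel
  // products into 'accum'. DimIndex counts down from NumKernelDims - 1 to 0.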
484 EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
485 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
486 const Index input = firstIndex + j * m_indexStride[DimIndex];
487 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
488 if (DimIndex > 0) {
489 convolve(input, kernel, DimIndex-1, accum);
490 } else {
491 accum += m_inputImpl.coeff(input) * m_kernel[kernel];
492 }
493 }
494 }
495
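  // Vectorized variant of convolve(): performs one pmadd per kernel tap on a
  // whole packet of contiguous input coefficients.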
496 template <typename Packet>
497 EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
498 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
499 const Index input = firstIndex + j * m_indexStride[DimIndex];
500 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
501 if (DimIndex > 0) {
502 convolvePacket(input, kernel, DimIndex-1, accum);
503 } else {
504 accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
505 }
506 }
507 }
508
509 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
510 // Don't make a local copy of the kernel unless we have to (i.e. it's an
511 // expression that needs to be evaluated)
512 const Scalar* in_place = m_kernelImpl.data();
513 if (in_place) {
514 m_kernel = in_place;
515 m_local_kernel = false;
516 } else {
517 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
518 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
519 typedef TensorEvalToOp<const KernelArgType> EvalTo;
520 EvalTo evalToTmp(local, m_kernelArg);
521 const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
522 internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
523
524 m_kernel = local;
525 m_local_kernel = true;
526 }
527 }
528
529 array<Index, NumDims> m_inputStride;
530 array<Index, NumDims> m_outputStride;
531
532 array<Index, NumKernelDims> m_indexStride;
533 array<Index, NumKernelDims> m_kernelStride;
534 TensorEvaluator<InputArgType, Device> m_inputImpl;
535 TensorEvaluator<KernelArgType, Device> m_kernelImpl;
536 Dimensions m_dimensions;
537
538 KernelArgType m_kernelArg;
539 const Scalar* m_kernel;
540 bool m_local_kernel;
541 const Device& m_device;
542};
543
544
545
546
547// Use an optimized implementation of the evaluation code for GPUs whenever possible.
548#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
549
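// GetKernelSize lets the kernels below be instantiated either with a
// compile-time kernel size (which allows the inner loops to be fully
// unrolled) or with a runtime size (StaticKernelSize == Dynamic).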
550template <int StaticKernelSize>
551struct GetKernelSize {
552 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
553 return StaticKernelSize;
554 }
555};
556template <>
557struct GetKernelSize<Dynamic> {
558 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
559 return kernelSize;
560 }
561};
562
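// 1D GPU convolution: each block loads a tile of the input (plus a halo of
// kernelSize - 1 values) into shared memory, then each thread computes the
// outputs of its slice of the tile. Planes (the non-convolved dimensions)
// are distributed over the y block/thread indices.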
563template <typename InputEvaluator, typename Index, typename InputDims,
564 int StaticKernelSize>
565__global__ void EigenConvolutionKernel1D(
566 InputEvaluator eval,
567 const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
568 indexMapper,
569 const float* __restrict kernel, const int numPlanes, const int numX,
570 const int maxX, const int kernelSize, float* buffer) {
571 extern __shared__ float s[];
572
573 const int first_x = static_cast<int>(blockIdx.x * maxX);
574 const int last_x = static_cast<int>((first_x + maxX < numX ? first_x + maxX : numX) - 1);
575 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
576 const int num_x_output = last_x - first_x + 1;
577
578 const int first_plane = blockIdx.y * blockDim.y;
579 const int plane_stride = blockDim.y * gridDim.y;
580
581 for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
582 // Load inputs to shared memory
583 const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
584 const int plane_kernel_offset = threadIdx.y * num_x_input;
585 #pragma unroll
586 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
587 const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
588 s[i + plane_kernel_offset] = eval.coeff(tensor_index);
589 }
590
591 __syncthreads();
592
593 // Compute the convolution
594 const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
595
596 #pragma unroll
597 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
598 const int kernel_offset = plane_kernel_offset + i;
599 float result = 0.0f;
600 #pragma unroll
601 for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
602 result += s[k + kernel_offset] * kernel[k];
603 }
604 const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
605 buffer[tensor_index] = result;
606 }
607 __syncthreads();
608 }
609}
610
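// 2D GPU convolution: same tiling scheme as the 1D kernel, with a shared
// memory tile of (maxX + kernelSizeX - 1) x (maxY + kernelSizeY - 1) values
// per plane slot and planes distributed over the z block/thread indices.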
611template <typename InputEvaluator, typename Index, typename InputDims,
612 int StaticKernelSizeX, int StaticKernelSizeY>
613__global__ void EigenConvolutionKernel2D(
614 InputEvaluator eval,
615 const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
616 indexMapper,
617 const float* __restrict kernel, const int numPlanes, const int numX,
618 const int maxX, const int numY, const int maxY, const int kernelSizeX,
619 const int kernelSizeY, float* buffer) {
620 extern __shared__ float s[];
621
622 const int first_x = static_cast<int>(blockIdx.x * maxX);
623 const int last_x = static_cast<int>((first_x + maxX < numX ? first_x + maxX : numX) - 1);
624 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
625 const int num_x_output = last_x - first_x + 1;
626
627 const int first_y = static_cast<int>(blockIdx.y * maxY);
628 const int last_y = static_cast<int>((first_y + maxY < numY ? first_y + maxY : numY) - 1);
629 const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
630 const int num_y_output = last_y - first_y + 1;
631
632 const int first_plane = blockIdx.z * blockDim.z;
633 const int plane_stride = blockDim.z * gridDim.z;
634
635 for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
636
637 const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
638 const int plane_kernel_offset = threadIdx.z * num_y_input;
639
640 // Load inputs to shared memory
641 #pragma unroll
642 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
643 const int input_offset = num_x_input * (j + plane_kernel_offset);
644 #pragma unroll
645 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
646 const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
647 s[i + input_offset] = eval.coeff(tensor_index);
648 }
649 }
650
651 __syncthreads();
652
653 // Convolution
654 const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
655
656 #pragma unroll
657 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
658 #pragma unroll
659 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
660 float result = 0.0f;
661 #pragma unroll
662 for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
663 const int kernel_offset = kernelSizeX * l;
664 const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
665 #pragma unroll
666 for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
667 result += s[k + input_offset] * kernel[k + kernel_offset];
668 }
669 }
670 const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
671 buffer[tensor_index] = result;
672 }
673 }
674
675 __syncthreads();
676 }
677}
678
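// 3D GPU convolution: each block iterates over all the planes and tiles the
// three convolved dimensions in shared memory.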
679template <typename InputEvaluator, typename Index, typename InputDims>
680__global__ void EigenConvolutionKernel3D(
681 InputEvaluator eval,
682 const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
683 indexMapper,
684 const float* __restrict kernel, const size_t numPlanes, const size_t numX,
685 const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
686 const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
687 const size_t kernelSizeZ, float* buffer) {
688 extern __shared__ float s[];
689
690 // Load inputs to shared memory
691 const int first_x = static_cast<int>(blockIdx.x * maxX);
692 const int last_x = static_cast<int>((first_x + maxX < numX ? first_x + maxX : numX) - 1);
693 const int num_x_input = static_cast<int>(last_x - first_x + kernelSizeX);
694
695 const int first_y = static_cast<int>(blockIdx.y * maxY);
696 const int last_y = static_cast<int>((first_y + maxY < numY ? first_y + maxY : numY) - 1);
697 const int num_y_input = static_cast<int>(last_y - first_y + kernelSizeY);
698
699 const int first_z = static_cast<int>(blockIdx.z * maxZ);
700 const int last_z = static_cast<int>((first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1);
701 const int num_z_input = static_cast<int>(last_z - first_z + kernelSizeZ);
702
703 for (size_t p = 0; p < numPlanes; ++p) {
704
705 const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
706 const int plane_kernel_offset = 0;
707
708 for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
709 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
710 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
711 const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
712 s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
713 }
714 }
715 }
716
717 __syncthreads();
718
719 // Convolution
720 const int num_z_output = last_z - first_z + 1;
721 const int num_y_output = last_y - first_y + 1;
722 const int num_x_output = last_x - first_x + 1;
723 const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
724
725 for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
726 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
727 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
728 float result = 0.0f;
729 for (size_t n = 0; n < kernelSizeZ; ++n) {
730 for (size_t m = 0; m < kernelSizeY; ++m) {
731 for (size_t l = 0; l < kernelSizeX; ++l) {
732 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
733 }
734 }
735 }
736 const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
737 buffer[tensor_index] = result;
738 }
739 }
740 }
741 __syncthreads();
742 }
743}
744
745template<typename Indices, typename InputArgType, typename KernelArgType>
746struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
747{
748 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
749
750 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
751 static const int NumKernelDims = internal::array_size<Indices>::value;
752 typedef typename XprType::Index Index;
753 typedef DSizes<Index, NumDims> Dimensions;
754 typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
755
756 enum {
757 IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
758 PacketAccess = false,
759 Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
760 CoordAccess = false, // to be implemented
761 RawAccess = false
762 };
763
764 EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
765 : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
766 {
767 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
768
769 const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
770 const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
771
772 m_dimensions = m_inputImpl.dimensions();
773 for (int i = 0; i < NumKernelDims; ++i) {
774 const Index index = op.indices()[i];
775 const Index input_dim = input_dims[index];
776 const Index kernel_dim = kernel_dims[i];
777 const Index result_dim = input_dim - kernel_dim + 1;
778 m_dimensions[index] = result_dim;
779 }
780 }
781
782 typedef typename XprType::CoeffReturnType CoeffReturnType;
783 typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
784 typedef typename InputArgType::Scalar Scalar;
785 static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
786
787 EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
788
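  // When an output buffer is provided the convolution is evaluated directly
  // into it and the evaluator reports that no per-coefficient evaluation is
  // needed (returns false); otherwise a temporary buffer is allocated and
  // coeff()/packet() read from it.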
789 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
790 preloadKernel();
791 m_inputImpl.evalSubExprsIfNeeded(NULL);
792 if (data) {
793 executeEval(data);
794 return false;
795 } else {
796 m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
797 executeEval(m_buf);
798 return true;
799 }
800 }
801
802 EIGEN_STRONG_INLINE void cleanup() {
803 m_inputImpl.cleanup();
804 if (m_buf) {
805 m_device.deallocate(m_buf);
806 m_buf = NULL;
807 }
808 if (m_local_kernel) {
809 m_device.deallocate((void*)m_kernel);
810 m_local_kernel = false;
811 }
812 m_kernel = NULL;
813 }
814
815 EIGEN_STRONG_INLINE void preloadKernel() {
816 // Don't make a local copy of the kernel unless we have to (i.e. it's an
817 // expression that needs to be evaluated)
818 const Scalar* in_place = m_kernelImpl.data();
819 if (in_place) {
820 m_kernel = in_place;
821 m_local_kernel = false;
822 } else {
823 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
824 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
825 typedef TensorEvalToOp<const KernelArgType> EvalTo;
826 EvalTo evalToTmp(local, m_kernelArg);
827 const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
828 internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
829
830 m_kernel = local;
831 m_local_kernel = true;
832 }
833 }
834
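  // Integer division rounding up, used to size the CUDA grid.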
835 static unsigned int ceil(unsigned int num, unsigned int denom) {
836 const unsigned int rounded_toward_zero = num / denom;
837 if (num > rounded_toward_zero * denom) {
838 return rounded_toward_zero + 1;
839 }
840 return rounded_toward_zero;
841 }
842
843 void executeEval(Scalar* data) const {
844 typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
845
846 const int maxSharedMem = m_device.sharedMemPerBlock();
847 const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
848 const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
849 const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
850 const int warpSize = 32;
851
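    // Dispatch on the number of convolved dimensions. Block and tile sizes
    // are chosen so that each block's input tile (including the kernel halo)
    // fits in the available shared memory.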
852 switch (NumKernelDims) {
853 case 1: {
854 const int kernel_size = m_kernelImpl.dimensions().TotalSize();
855
856 const int numX = dimensions()[m_indices[0]];
857 const int numP = dimensions().TotalSize() / numX;
858 int maxX;
859 dim3 block_size;
860
861 const int single_stride_dim =
862 static_cast<int>(Layout) == static_cast<int>(ColMajor)
863 ? 0
864 : m_inputImpl.dimensions().rank() - 1;
865 if (m_indices[0] == single_stride_dim) {
 866 // Maximize the reuse of the data loaded into shared memory
867 const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
868 maxX = numext::mini<int>(inner_dim, numX);
869 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
870 block_size.x = numext::mini(maxThreadsPerBlock, maxX);
871 block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
872 }
873 else {
 874 // Read as much as possible along the innermost dimension, that is, the plane
875 const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
876 const int maxP = numext::mini<int>(inner_dim, numP);
877 maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
878
879 block_size.x = numext::mini(warpSize, maxX);
880 block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
881 }
882
883 const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
884 assert(shared_mem <= maxSharedMem);
885
886 const int num_x_blocks = ceil(numX, maxX);
887 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
888 const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
889
890 dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
891
892
893 //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
894
895 const array<Index, 1> indices(m_indices[0]);
896 const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
897 internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
898 m_inputImpl.dimensions(), kernel_dims, indices);
899 switch(kernel_size) {
900 case 4: {
901 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
902 break;
903 }
904 case 7: {
905 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
906 break;
907 }
908 default: {
909 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
910 }
911 }
912 break;
913 }
914
915 case 2: {
916 const int idxX =
917 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
918 const int idxY =
919 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
920 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
921 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
922
923 const int numX = dimensions()[m_indices[idxX]];
924 const int numY = dimensions()[m_indices[idxY]];
925 const int numP = dimensions().TotalSize() / (numX*numY);
926
927 const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
928
929 // Snap maxX to warp size
930 int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
931 const int maxX = numext::mini<int>(inner_dim, numX);
932 const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
933 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
934
935 dim3 block_size;
936 block_size.x = numext::mini(1024, maxX);
937 block_size.y = numext::mini<int>(1024/block_size.x, maxY);
938 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
939
940 const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
941 assert(shared_mem <= maxSharedMem);
942
943 const int num_x_blocks = ceil(numX, maxX);
944 const int num_y_blocks = ceil(numY, maxY);
945 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
946 const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
947
948 dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
949
950
951 //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
952
953 const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
954 const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
955 m_kernelImpl.dimensions()[idxY]);
956 internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
957 m_inputImpl.dimensions(), kernel_dims, indices);
958 switch (kernel_size_x) {
959 case 4: {
960 switch (kernel_size_y) {
961 case 7: {
962 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
963 break;
964 }
965 default: {
966 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
967 break;
968 }
969 }
970 break;
971 }
972 case 7: {
973 switch (kernel_size_y) {
974 case 4: {
975 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
976 break;
977 }
978 default: {
979 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
980 break;
981 }
982 }
983 break;
984 }
985 default: {
986 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
987 break;
988 }
989 }
990 break;
991 }
992
993 case 3: {
994 const int idxX =
995 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
996 const int idxY =
997 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
998 const int idxZ =
999 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1000
1001 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1002 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1003 const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1004
1005 const int numX = dimensions()[m_indices[idxX]];
1006 const int numY = dimensions()[m_indices[idxY]];
1007 const int numZ = dimensions()[m_indices[idxZ]];
1008 const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1009
1010 const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1011 const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1012 const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1013
1014 dim3 block_size;
1015 block_size.x = numext::mini(32, maxX);
1016 block_size.y = numext::mini(32, maxY);
1017 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1018 dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1019
1020 const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1021 assert(shared_mem <= maxSharedMem);
1022
1023 //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1024 const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1025 m_indices[idxZ]);
1026 const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1027 m_kernelImpl.dimensions()[idxY],
1028 m_kernelImpl.dimensions()[idxZ]);
1029 internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1030 m_inputImpl.dimensions(), kernel_dims, indices);
1031
1032 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1033 break;
1034 }
1035
1036 default: {
1037 EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1038 }
1039 }
1040 }
1041
1042 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1043 {
1044 eigen_assert(m_buf);
1045 eigen_assert(index < m_dimensions.TotalSize());
1046 return m_buf[index];
1047 }
1048
1049 template<int LoadMode>
1050 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1051 {
1052 eigen_assert(m_buf);
1053 eigen_assert(index < m_dimensions.TotalSize());
1054 return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1055 }
1056
1057 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1058 costPerCoeff(bool vectorized) const {
1059 // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1060 // model.
1061 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1062 // We ignore the use of fused multiply-add.
1063 const double convolve_compute_cost =
1064 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1065 const double firstIndex_compute_cost =
1066 NumDims *
1067 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1068 TensorOpCost::DivCost<Index>());
1069 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1070 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1071 m_kernelImpl.costPerCoeff(vectorized) +
1072 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1073 PacketSize));
1074 }
1075
1076 private:
1077 // No assignment (copies are needed by the kernels)
1078 TensorEvaluator& operator = (const TensorEvaluator&);
1079
1080 TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1081 TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1082 KernelArgType m_kernelArg;
1083 Indices m_indices;
1084 Dimensions m_dimensions;
1085 Scalar* m_buf;
1086 const Scalar* m_kernel;
1087 bool m_local_kernel;
1088
1089 const GpuDevice& m_device;
1090};
1091#endif
1092
1093
1094} // end namespace Eigen
1095
1096#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H