#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

#include "./InternalHeaderCheck.h"
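// IndexMapper translates the (plane, x[, y[, z]]) coordinates used by the GPU
// convolution kernels below into linear offsets into the input and output
// tensors, for an arbitrary subset of convolved dimensions and either storage
// layout.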
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {
    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }
    array<Index, NumDims> gpuInputDimensions;
    array<Index, NumDims> gpuOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      gpuInputDimensions[index] = input_dims[indices[i]];
      gpuOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        gpuInputDimensions[written] = input_dims[i];
        gpuOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (static_cast<size_t>(i + 1) < offset) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    }
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      if (NumKernelDims < NumDims) {
        inputIndex += p * m_inputStrides[NumKernelDims];
      }
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      if (NumKernelDims < NumDims) {
        outputIndex += p * m_outputStrides[NumKernelDims];
      }
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2];
  }
 private:
  static constexpr int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_gpuInputStrides;
  array<Index, NumDims> m_gpuOutputStrides;
};
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > {
  // Type promotion to handle the case where the types of the lhs and the rhs are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar, typename KernelXprType::Scalar>::ret Scalar;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef std::remove_reference_t<LhsNested> LhsNested_;
  typedef std::remove_reference_t<RhsNested> RhsNested_;
  static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;
  static constexpr int Layout = traits<InputXprType>::Layout;
  typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
                             typename traits<InputXprType>::PointerType,
                             typename traits<KernelXprType>::PointerType>
      PointerType;

  enum { Flags = 0 };
};
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> {
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
              typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> {
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};
template <typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp
    : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {
 public:
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel,
                                                            const Indices& dims)
      : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }

  /** \returns the nested expressions */
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename InputXprType::Nested>& inputExpression()
      const {
    return m_input_xpr;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename KernelXprType::Nested>&
  kernelExpression() const {
    return m_kernel_xpr;
  }

 protected:
  typename InputXprType::Nested m_input_xpr;
  typename KernelXprType::Nested m_kernel_xpr;
  const Indices m_indices;
};
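// Illustrative usage sketch (the sizes below are arbitrary example values):
// a TensorConvolutionOp is normally created through TensorBase::convolve.
//
//   Eigen::Tensor<float, 3> input(20, 30, 50);
//   Eigen::Tensor<float, 1> kernel(7);
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::array<ptrdiff_t, 1> dims({1});  // convolve along dimension 1
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
//   // output is 20 x 24 x 50, since 30 - 7 + 1 = 24 ("valid" convolution).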
template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;
  enum {
    IsAligned =
        int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) &
                   int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  typedef internal::TensorBlockNotImplemented TensorBlock;
  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }
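  // The constructor above precomputes m_inputStride/m_outputStride, the usual
  // strides of the full input and output tensors for the chosen layout, and
  // m_indexStride/m_kernelStride, the input and kernel strides restricted to
  // the convolved dimensions, which the recursive convolve() helpers below
  // step through.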
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims - 1, result);
    return result;
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
    Index indices[2] = {index, index + PacketSize - 1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1] - startInputs[0] == PacketSize - 1) {
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
      return result;
    } else {
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
      for (int i = 1; i < PacketSize - 1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
      }
      data[PacketSize - 1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
      return internal::pload<PacketReturnType>(data);
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }
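  // convolve()/convolvePacket() recurse over the convolved dimensions: for
  // each tap of the current kernel dimension they either recurse into the next
  // dimension or, once DimIndex reaches 0, accumulate input * kernel into
  // `accum`.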
  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex - 1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }
  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex - 1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
                                        internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }
  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device EIGEN_DEVICE_REF m_device;
};
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; }
};
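// The GPU kernels below tile the convolved dimensions: each block first copies
// a tile of the input (the output extent plus the kernel-sized halo) into
// shared memory, synchronizes, and then has every thread accumulate its
// outputs from that tile. StaticKernelSize(X/Y) is either a compile-time
// constant (e.g. 4 or 7) or Dynamic, in which case the runtime kernel size is
// used.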
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,
    float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load inputs to shared memory.
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution.
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
      buffer[tensor_index] = result;
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY,
    const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load inputs to shared memory.
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index =
            plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Compute the convolution.
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index =
            plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,
    const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  // Load inputs to shared memory.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                            i + first_x, j + first_y, k + first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Compute the convolution.
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
                          kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(
                                                             i + first_x, j + first_y, k + first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
}
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
  enum {
    IsAligned =
        TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  typedef internal::TensorBlockNotImplemented TensorBlock;

  TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }
  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }
  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }
  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }
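  // executeEval() below picks tile sizes so that a tile plus its kernel halo
  // fits in shared memory, derives block and grid dimensions from them (using
  // the integer ceil() helper above, e.g. ceil(10, 4) == 3), and launches the
  // 1D, 2D or 3D kernel, with dedicated instantiations for the common static
  // kernel sizes 4 and 7.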
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse.
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        } else {
          // Read as much as possible alongside the inner most dimension, that is the plane.
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));

        const array<Index, 1> indices{m_indices[0]};
        const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size) {
          case 4: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
                              numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
                              numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL(
                (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
                num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                kernel_size, data);
          }
        }
        break;
      }
      case 2: {
        const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX * numY);

        const float scaling_factor =
            sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY =
            numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(
            maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]};
        const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_GPU_KERNEL(
                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                    numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL(
                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                    numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_GPU_KERNEL(
                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                    numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL(
                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                    numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims,
                                                        Dynamic, Dynamic>),
                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
                              numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }
      case 3: {
        const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX * numY * numZ);

        const int maxX = numext::mini<int>(
            128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1,
                                   numX));
        const int maxY = numext::mini<int>(
            128, numext::mini<int>(
                     maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,
                     numY));
        const int maxZ = numext::mini<int>(
            128, numext::mini<int>(
                     maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -
                         kernel_size_z + 1,
                     numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem =
            (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]};
        const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]};
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
                          num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX,
                          maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
 private:
  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const GpuDevice& m_device;
};

#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)