#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

enum class convolution_type { CONV1D, CONV2D, CONV3D };
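// The EigenConvolutionKernel functors below implement the convolution on the
// device: each work-group first stages its input tile (work-group range plus
// the kernel-sized halo) in local memory, synchronizes, and then lets every
// work-item accumulate one output coefficient from the staged tile.
// internal::IndexMapper translates between these GPU-friendly coordinates and
// the linearized tensor indices.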
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
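// 1D specialization: nd_range dimension 0 walks the convolved axis; dimension
// 1 indexes the remaining (plane) dimensions of the tensor.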
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
    return (boolean_check[0] && boolean_check[1]);
  }

  void operator()(cl::sycl::nd_item<2> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    // The local-memory tile covers the work-group range plus the kernel halo.
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    // Fill the shared memory, padding out-of-range elements with zero.
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // Calculate the convolution for this work-item's output coefficient.
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};
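// 2D specialization: nd_range dimensions 0 and 1 walk the two convolved axes;
// dimension 2 indexes the planes.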
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}

  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    // The local-memory tile covers the work-group range plus the kernel halo.
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // Fill the shared memory, padding out-of-range elements with zero.
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] =
            (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2)
                ? device_evaluator.coeff(tensor_index)
                : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // Calculate the convolution for this work-item's output coefficient.
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};
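// 3D specialization: all three nd_range dimensions walk convolved axes, so
// the planes are traversed by an explicit loop over numP inside the kernel.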
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}

  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) const {
    auto buffer_ptr = buffer_acc;
    auto kernel_ptr = kernel_filter;
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      // Fill the shared memory for plane p, padding out-of-range elements with zero.
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // Calculate the convolution for this work-item's output coefficient.
      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};
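// Evaluator for TensorConvolutionOp on a SYCL device. evalSubExprsIfNeeded()
// always materializes the result: it preloads the kernel, launches one of the
// EigenConvolutionKernel functors above, and coeff()/packet() then read from
// the resulting buffer.
//
// A minimal usage sketch, modelled on the SYCL convolution tests; the device
// selector, sizes, and variable names here are illustrative only:
//
//   Eigen::QueueInterface queue_interface((cl::sycl::default_selector()));
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::Tensor<float, 1> input(100), kernel(5), result(96);
//   input.setRandom();
//   kernel.setRandom();
//   float *d_in = static_cast<float *>(sycl_device.allocate(input.size() * sizeof(float)));
//   float *d_kernel = static_cast<float *>(sycl_device.allocate(kernel.size() * sizeof(float)));
//   float *d_out = static_cast<float *>(sycl_device.allocate(result.size() * sizeof(float)));
//   sycl_device.memcpyHostToDevice(d_in, input.data(), input.size() * sizeof(float));
//   sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel.size() * sizeof(float));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, 100);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_kernel(d_kernel, 5);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(d_out, 96);
//   Eigen::array<Eigen::Index, 1> dims{{0}};  // convolve along dimension 0
//   gpu_out.device(sycl_device) = gpu_in.convolve(gpu_kernel, dims);  // CONV1D path
//   sycl_device.memcpyDeviceToHost(result.data(), d_out, result.size() * sizeof(float));
//   sycl_device.deallocate(d_in);
//   sycl_device.deallocate(d_kernel);
//   sycl_device.deallocate(d_out);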
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  static constexpr int Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout;
  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,
    RawAccess = false
  };

  // Tensor block evaluation is not implemented for the SYCL backend.
  typedef internal::TensorBlockNotImplemented TensorBlock;
  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    // The output shrinks by (kernel_dim - 1) along every convolved dimension.
    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    // Evaluate into the caller's buffer when one is supplied; otherwise
    // allocate a temporary result buffer owned by this evaluator.
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }
  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to: reuse the
    // kernel's own storage when it is already materialized in memory.
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }
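  // Selects and launches the CONV1D/CONV2D/CONV3D functor according to
  // NumKernelDims. The tensor dimensions that are not being convolved are
  // collapsed into a plane count (numP), and parallel_for_setup picks the
  // global/local nd_range for the device.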
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{1, 1};
        auto local_range = cl::sycl::range<2>{1, 1};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;
        m_device
            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
                m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
                indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]))
            .wait();
        break;
      }

      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{1, 1, 1};
        auto local_range = cl::sycl::range<3>{1, 1, 1};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device
            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
                m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
                indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]})
            .wait();
        break;
      }

      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{1, 1, 1};
        auto local_range = cl::sycl::range<3>{1, 1, 1};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device
            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
                m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
                indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP)
            .wait();
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // For now this mirrors the CPU cost model; fused multiply-add is ignored.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
 private:
  // No assignment (copies are needed by the device kernels).
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H