#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

namespace Eigen {
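// SYCL backend for Eigen's tensor convolution. The EigenConvolutionKernel
// specializations below stage an input tile (plus the kernel halo) in local
// memory and let each work-item accumulate one output coefficient; the
// TensorEvaluator at the end sets up the nd-ranges and launches them.
//
// A minimal usage sketch (illustrative, not part of this header; it assumes a
// working SYCL runtime and TensorMaps over device-accessible buffers):
//
//   Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   Eigen::array<Eigen::Index, 1> dims{{0}};  // convolve along dimension 0
//   gpu_result.device(sycl_device) = gpu_input.convolve(gpu_kernel, dims);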
enum class convolution_type { CONV1D, CONV2D, CONV3D };
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
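// 1D convolution: work-group dimension 0 tiles the convolved axis and
// dimension 1 indexes the remaining (plane) dimensions of the tensor.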
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}
  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
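    // Cooperatively stage the work-group's tile plus the kernel halo in local
    // memory; elements outside the valid input range are zero-padded.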
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);
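    // Every in-range work-item now accumulates one output coefficient from
    // the staged data.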
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};
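// 2D convolution: work-group dimensions 0 and 1 tile the two convolved axes;
// dimension 2 indexes the planes.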
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
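    // Stage the 2D tile plus halo in local memory, zero-padding out-of-range
    // reads.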
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
                                  in_range_dim1 && in_range_dim2)
                                     ? device_evaluator.coeff(tensor_index)
                                     : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);
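    // Convolve the staged tile; each in-range work-item produces one output
    // coefficient.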
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);
      buffer_ptr[tensor_index] = result;
    }
  }
};
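// 3D convolution: all three work-group dimensions tile the convolved axes, so
// the remaining planes (numP) are looped over inside the kernel itself.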
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
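    // Planes are processed sequentially; the same local-memory tile is reused
    // (and re-synchronized) for each plane.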
    for (size_t p = 0; p < numP; p++) {
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};
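// Evaluator for TensorConvolutionOp on a SYCL device. It computes the output
// dimensions, makes the kernel coefficients available on the device, and
// dispatches one of the EigenConvolutionKernel specializations above.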
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
  static const int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
    CoordAccess = false,
    RawAccess = false
  };

  typedef internal::TensorBlockNotImplemented TensorBlock;
  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }
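  // The constructor's loop implements a "valid" (no padding) convolution: for
  // example, a 128x128 input convolved with a 3x3 kernel over dims {0, 1}
  // produces a 126x126 result.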
  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    // If a destination buffer was provided, evaluate directly into it;
    // otherwise allocate a temporary buffer and report that one is in use.
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }
  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }
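  // Selects the kernel specialization from the compile-time NumKernelDims;
  // global and local nd-range sizes come from the device's parallel_for_setup.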
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{};
        auto local_range = cl::sycl::range<2>{};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
        break;
      }
      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
        break;
      }
      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
        break;
      }
      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
  // Binds the placeholder accessors to a SYCL command group handler.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_kernelImpl.bind(cgh);
    m_inputImpl.bind(cgh);
    m_buf.bind(cgh);
  }
 private:
  // No assignment (copies are needed by the kernels).
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H