#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

// This header defines gpu* macros that resolve to their hip* or cuda*
// equivalents depending on the compiler in use; they are undefined again at
// the end of the file.
#include "./InternalHeaderCheck.h"

#include "../../../../../Eigen/src/Core/util/GpuHipCudaDefines.inc"

namespace Eigen {

static const int kGpuScratchSize = 1024;
// The interface that GpuDevice uses to run on a HIP/CUDA stream.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kGpuScratchSize bytes.
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initially set to 0, and each kernel
  // using it is responsible for resetting it to 0 on completion, so that it
  // is always 0 when a kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
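// GpuStreamDevice, defined below, is the stock implementation of this
// interface. A custom implementation only needs to route the virtual calls
// to an existing stream and allocator. Illustrative sketch, not Eigen API;
// "my_pool_alloc"/"my_pool_free" are assumed helpers, and scratchpad() and
// semaphore() must be implemented as well:
//
//   class PoolStreamDevice : public StreamInterface {
//    public:
//     explicit PoolStreamDevice(const gpuStream_t* s) : s_(s) {}
//     const gpuStream_t& stream() const override { return *s_; }
//     const gpuDeviceProp_t& deviceProperties() const override {
//       return GetGpuDeviceProperties(0);  // defined later in this header
//     }
//     void* allocate(size_t n) const override { return my_pool_alloc(n); }
//     void deallocate(void* p) const override { my_pool_free(p); }
//     // ... scratchpad() and semaphore() omitted for brevity ...
//    private:
//     const gpuStream_t* s_;
//   };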
class GpuDeviceProperties {
 public:
  GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; }

  EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; }

  void initialize() {
    if (!initialized_) {
      // Attempt to ensure correct behavior when multiple threads call this
      // function simultaneously. This would be trivial with std::mutex, but
      // mutexes don't compile with nvcc, so we resort to atomics and thread
      // fences instead.
      if (first_.exchange(false)) {
        // We're the first thread to reach this point: query and cache the
        // properties of every device.
        int num_devices;
        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
          std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl;
          gpu_assert(status == gpuSuccess);
        }
        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }

        std::atomic_thread_fence(std::memory_order_release);
        initialized_ = true;
      } else {
        // Wait for the other thread to finish the initialization.
        while (!initialized_) {
          std::atomic_thread_fence(std::memory_order_acquire);
          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};
EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}
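// Usage sketch (illustrative, not part of this header): query the cached
// properties of device 0 to size a kernel launch.
//
//   const gpuDeviceProp_t& props = Eigen::GetGpuDeviceProperties(0);
//   int max_threads = props.maxThreadsPerBlock;  // e.g. 1024 on most devices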
static const gpuStream_t default_stream = gpuStreamDefault;
class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuError_t status = gpuGetDevice(&device_);
    if (status != gpuSuccess) {
      std::cerr << "Failed to get the current GPU device: " << gpuGetErrorString(status) << std::endl;
      gpu_assert(status == gpuSuccess);
    }
  }
  // Use the default stream on the specified device.
  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. It is the caller's responsibility to ensure
  // that the stream can run on the specified device. If no device is
  // specified, the stream is assumed to be associated with the current device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuError_t status = gpuGetDevice(&device_);
      if (status != gpuSuccess) {
        std::cerr << "Failed to get the current GPU device: " << gpuGetErrorString(status) << std::endl;
        gpu_assert(status == gpuSuccess);
      }
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }
  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const { return GetGpuDeviceProperties(device_); }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }
  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
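// Usage sketch (illustrative, not part of this header), assuming a CUDA
// build where gpuStream_t resolves to cudaStream_t:
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   Eigen::GpuStreamDevice stream_device(&stream, /*device=*/0);
//   Eigen::GpuDevice gpu_device(&stream_device);  // GpuDevice is defined below
//   // ... evaluate tensor expressions with .device(gpu_device) ...
//   cudaStreamDestroy(stream);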
struct GpuDevice {
  // The StreamInterface is not owned: the caller is responsible for its
  // initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { eigen_assert(stream); }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return stream_->allocate(num_bytes); }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { stream_->deallocate(buffer); }

  template <typename Type>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst);
    EIGEN_UNUSED_VARIABLE(src);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }
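  // Note that the three memcpy variants above are asynchronous with respect
  // to the host: they only enqueue work on the stream, so a round trip needs
  // a synchronize() before the host may read the result. Illustrative sketch
  // (not part of this header), with a GpuDevice "gpu_device" as constructed
  // in the earlier sketch:
  //
  //   float host_in[4] = {1.f, 2.f, 3.f, 4.f}, host_out[4];
  //   float* dev = static_cast<float*>(gpu_device.allocate(sizeof(host_in)));
  //   gpu_device.memcpyHostToDevice(dev, host_in, sizeof(host_in));
  //   gpu_device.memcpyDeviceToHost(host_out, dev, sizeof(host_in));
  //   gpu_device.synchronize();  // defined below
  //   gpu_device.deallocate(dev);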
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(buffer)
    EIGEN_UNUSED_VARIABLE(c)
    EIGEN_UNUSED_VARIABLE(n)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  template <typename T>
  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    const size_t count = end - begin;
    // Split the value into its bytes and run one strided memset per byte.
    const int value_size = sizeof(value);
    char* buffer = (char*)begin;
    char* value_bytes = (char*)(&value);
    gpuError_t err;
    EIGEN_UNUSED_VARIABLE(err)

    // If all bytes of the value are equal, a single 1D memset is much faster.
    bool use_single_memset = true;
    for (int i = 1; i < value_size; ++i) {
      if (value_bytes[i] != value_bytes[0]) {
        use_single_memset = false;
      }
    }

    if (use_single_memset) {
      err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
      gpu_assert(err == gpuSuccess);
    } else {
      for (int b = 0; b < value_size; ++b) {
        err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream());
        gpu_assert(err == gpuSuccess);
      }
    }
#else
    EIGEN_UNUSED_VARIABLE(begin)
    EIGEN_UNUSED_VARIABLE(end)
    EIGEN_UNUSED_VARIABLE(value)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48 * 1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on HIP/CUDA devices.
    return firstLevelCacheSize();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; }

  EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; }

  // This function checks whether the GPU runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef EIGEN_GPUCC
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
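// Usage sketch (illustrative, not part of this header): with the
// stream_device/gpu_device pair from the earlier sketch and device pointers
// dev_a, dev_b, dev_c of n floats each, Tensor expressions run on the stream
// and the stream can be polled for errors:
//
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> a(dev_a, n), b(dev_b, n), c(dev_c, n);
//   c.device(gpu_device) = a + b;  // enqueues the kernel on the stream
//   gpu_assert(gpu_device.ok());   // gpuSuccess or gpuErrorNotReady
//   gpu_device.synchronize();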
#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)                              \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif
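// Usage sketch (illustrative; "MyKernel", its arguments, and the launch
// geometry are assumptions, not part of this header): the macro hides the
// HIP/CUDA launch-syntax difference and checks for a launch error.
//
//   LAUNCH_GPU_KERNEL(MyKernel, /*gridsize=*/128, /*blocksize=*/256,
//                     /*sharedmem=*/0, gpu_device, dev_ptr, n);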
// FIXME: Should be device- and kernel-specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  gpu_assert(status == gpuSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
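// Usage sketch (illustrative), assuming a CUDA build where gpuSharedMemConfig
// resolves to cudaSharedMemConfig: request eight-byte shared-memory banks
// before launching kernels that mostly access double-precision shared data.
//
//   Eigen::setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);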
}  // end namespace Eigen

// Undefine the gpu* macros defined at the top of this file.
#include "../../../../../Eigen/src/Core/util/GpuHipCudaUndefines.inc"

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H