#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

namespace Eigen {
static const int kCudaScratchSize = 1024;
// This defines an interface that GpuDevice can take to use CUDA streams
// underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the device on which the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of size kCudaScratchSize.
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initially set to 0, and each kernel
  // that uses it is responsible for resetting it to 0 upon completion, so
  // that the semaphore is always 0 when a kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
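// Usage sketch (illustrative only, not part of the original header): code
// written against StreamInterface stays agnostic of the concrete stream
// implementation, e.g.
//
//   void zero_buffer(const Eigen::StreamInterface& si, size_t n) {
//     void* buf = si.allocate(n);                // device memory
//     cudaMemsetAsync(buf, 0, n, si.stream());   // queued on si's stream
//     cudaStreamSynchronize(si.stream());        // wait before freeing
//     si.deallocate(buf);
//   }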
static cudaDeviceProp* m_deviceProperties;
static volatile bool m_devicePropInitialized = false;
static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    // Attempt to ensure proper behavior when multiple threads call this
    // function concurrently. std::mutex does not compile with nvcc, so we
    // rely on atomics and thread fences instead. Note that without C++11
    // support the initialization cannot be made thread safe.
#if __cplusplus >= 201103L
    static std::atomic<bool> first(true);
    if (first.exchange(false)) {
#else
    static bool first = true;
    if (first) {
      first = false;
#endif
      // We're the first thread to reach this point.
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      if (status != cudaSuccess) {
        std::cerr << "Failed to get the number of CUDA devices: "
                  << cudaGetErrorString(status) << std::endl;
        assert(status == cudaSuccess);
      }
      m_deviceProperties = new cudaDeviceProp[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
        if (status != cudaSuccess) {
          std::cerr << "Failed to initialize CUDA device #" << i << ": "
                    << cudaGetErrorString(status) << std::endl;
          assert(status == cudaSuccess);
        }
      }

#if __cplusplus >= 201103L
      std::atomic_thread_fence(std::memory_order_release);
#endif
      m_devicePropInitialized = true;
    } else {
      // Wait for the other thread to finish the initialization.
      while (!m_devicePropInitialized) {
#if __cplusplus >= 201103L
        std::atomic_thread_fence(std::memory_order_acquire);
        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
#endif
      }
    }
  }
}
static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device.
  CudaStreamDevice(int device)
      : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    initializeDeviceProp();
  }
  // Use the specified stream. It is the caller's responsibility to ensure
  // that the stream can run on the specified device. If no device is
  // specified, the stream is assumed to be associated with the current
  // device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }

  virtual ~CudaStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }
  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }
  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }
  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      // The semaphore lives just past the scratchpad, in the extra
      // sizeof(unsigned int) bytes allocated by scratchpad().
      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      cudaError_t err =
          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
    }
    return semaphore_;
  }
 private:
  const cudaStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
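// Usage sketch (illustrative only; creating and destroying the stream is
// the caller's responsibility, and the stream must outlive the device):
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   Eigen::CudaStreamDevice stream_device(&stream, /*device=*/0);
//   ...
//   cudaStreamDestroy(stream);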
struct GpuDevice {
  // The StreamInterface is not owned: the caller is responsible for its
  // initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream)
      : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks)
      : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    EIGEN_ONLY_USED_FOR_DEBUG(err);
    assert(err == cudaSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst);
    EIGEN_UNUSED_VARIABLE(src);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    EIGEN_UNUSED_VARIABLE(buffer);
    EIGEN_UNUSED_VARIABLE(c);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on CUDA devices.
    return firstLevelCacheSize();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    if (err != cudaSuccess) {
      std::cerr << "Error detected in CUDA stream: "
                << cudaGetErrorString(err) << std::endl;
      assert(err == cudaSuccess);
    }
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }
  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks whether the CUDA runtime recorded an error for the
  // underlying stream.
  inline bool ok() const {
#ifdef __CUDACC__
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
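// Usage sketch (illustrative only; d_in and d_out are assumed to be device
// pointers of n floats, e.g. obtained via gpu_device.allocate()):
//
//   Eigen::CudaStreamDevice stream_device;
//   Eigen::GpuDevice gpu_device(&stream_device);
//   Eigen::TensorMap<Eigen::Tensor<float, 1> > in(d_in, n), out(d_out, n);
//   out.device(gpu_device) = in * 2.0f;   // evaluated on the GPU stream
//   gpu_device.synchronize();             // wait for the result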
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)           \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  assert(cudaGetLastError() == cudaSuccess);
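// Usage sketch (my_kernel is a hypothetical __global__ function; gpu_device
// is a GpuDevice as constructed above):
//
//   LAUNCH_CUDA_KERNEL(my_kernel, /*gridsize=*/32, /*blocksize=*/256,
//                      /*sharedmem=*/0, gpu_device, data_ptr, n);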
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  assert(status == cudaSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
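// Usage sketch (illustrative only): request eight-byte shared memory banks
// before launching kernels that mostly access doubles:
//
//   Eigen::setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);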
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H