TensorDeviceCuda.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

namespace Eigen {

static const int kCudaScratchSize = 1024;

// This defines the interface that GPUDevice uses to run computations on top
// of CUDA streams.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kCudaScratchSize (1 KiB) bytes.
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // that uses it is responsible for resetting it to 0 upon completion, so
  // that the semaphore is always equal to 0 when a kernel starts (see the
  // sketch after this class).
  virtual unsigned int* semaphore() const = 0;
};
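
// A minimal sketch (not part of Eigen) of how a kernel could honor the
// semaphore contract above: blocks use the counter to coordinate, and the
// counter wraps back to 0 once every block has finished, so the next kernel
// starts with a zeroed semaphore. The kernel name is hypothetical.
//
//   __global__ void ExampleKernel(unsigned int* semaphore) {
//     // ... per-block work that uses *semaphore for coordination ...
//     __syncthreads();
//     if (threadIdx.x == 0) {
//       // atomicInc wraps to 0 after gridDim.x increments, restoring the
//       // "semaphore == 0" invariant once the last block gets here.
//       atomicInc(semaphore, gridDim.x - 1);
//     }
//   }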

static cudaDeviceProp* m_deviceProperties;
static volatile bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    // Attempts to ensure proper behavior in the case of multiple threads
    // calling this function simultaneously. This would be trivial to
    // implement if we could use std::mutex, but unfortunately mutexes don't
    // compile with nvcc, so we resort to atomics and thread fences instead.
    // Note that if the caller uses a compiler that doesn't support C++11 we
    // can't ensure that the initialization is thread safe.
#if __cplusplus >= 201103L
    static std::atomic<bool> first(true);
    if (first.exchange(false)) {
#else
    static bool first = true;
    if (first) {
      first = false;
#endif
      // We're the first thread to reach this point.
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      if (status != cudaSuccess) {
        std::cerr << "Failed to get the number of CUDA devices: "
                  << cudaGetErrorString(status)
                  << std::endl;
        assert(status == cudaSuccess);
      }
      m_deviceProperties = new cudaDeviceProp[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
        if (status != cudaSuccess) {
          std::cerr << "Failed to initialize CUDA device #"
                    << i
                    << ": "
                    << cudaGetErrorString(status)
                    << std::endl;
          assert(status == cudaSuccess);
        }
      }

#if __cplusplus >= 201103L
      std::atomic_thread_fence(std::memory_order_release);
#endif
      m_devicePropInitialized = true;
    } else {
      // Wait for the other thread to initialize the properties.
      while (!m_devicePropInitialized) {
#if __cplusplus >= 201103L
        std::atomic_thread_fence(std::memory_order_acquire);
        std::this_thread::sleep_for(std::chrono::milliseconds(1000));
#elif defined(_WIN32)
        Sleep(1);
#else
        sleep(1);
#endif
      }
    }
  }
}

static const cudaStream_t default_stream = cudaStreamDefault;

class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device.
  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    initializeDeviceProp();
  }
  // Use the specified stream. Note that it's the caller's responsibility to
  // ensure that the stream can run on the specified device. If no device is
  // specified, the code assumes that the stream is associated with the
  // current GPU device (see the usage sketch after this class).
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }

  virtual ~CudaStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
    }
    return semaphore_;
  }

 private:
  const cudaStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
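
// Usage sketch (an illustration, not part of this header): wrapping a
// caller-owned stream in a CudaStreamDevice and handing it to a GpuDevice.
// The stream must outlive both objects, and the caller must make sure the
// stream can run on the chosen device. Variable names are hypothetical.
//
//   cudaStream_t my_stream;
//   cudaStreamCreate(&my_stream);
//   Eigen::CudaStreamDevice stream_device(&my_stream, /*device=*/0);
//   Eigen::GpuDevice gpu_device(&stream_device);
//   // ... evaluate tensor expressions via .device(gpu_device) = ... ;
//   gpu_device.synchronize();
//   cudaStreamDestroy(my_stream);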

struct GpuDevice {
  // The StreamInterface is not owned: the caller is responsible for its
  // initialization and eventual destruction (a usage sketch follows this
  // struct).
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API, we should not expose it.
  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return stream_->stream();
  }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
    return stream_->allocate(num_bytes);
  }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
    stream_->deallocate(buffer);
  }

  EIGEN_STRONG_INLINE void* scratchpad() const {
    return stream_->scratchpad();
  }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
    return stream_->semaphore();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    EIGEN_ONLY_USED_FOR_DEBUG(err);
    assert(err == cudaSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst);
    EIGEN_UNUSED_VARIABLE(src);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    EIGEN_UNUSED_VARIABLE(buffer);
    EIGEN_UNUSED_VARIABLE(c);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    if (err != cudaSuccess) {
      std::cerr << "Error detected in CUDA stream: "
                << cudaGetErrorString(err)
                << std::endl;
      assert(err == cudaSuccess);
    }
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
    return stream_->deviceProperties().minor;
  }

  EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the CUDA runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef __CUDACC__
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
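
// Illustrative sketch (an assumption about typical use, not Eigen code) of
// the memory and transfer API defined above. host_src, host_dst, and the
// buffer size are hypothetical.
//
//   const size_t bytes = 1024 * sizeof(float);
//   float* device_buf = static_cast<float*>(gpu_device.allocate(bytes));
//   gpu_device.memcpyHostToDevice(device_buf, host_src, bytes);
//   // ... launch kernels on gpu_device.stream() ...
//   gpu_device.memcpyDeviceToHost(host_dst, device_buf, bytes);
//   gpu_device.synchronize();
//   gpu_device.deallocate(device_buf);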

#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
  assert(cudaGetLastError() == cudaSuccess);

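// Example use of the macro above (a sketch; the kernel, its arguments, and
// the launch dimensions are hypothetical). Grid sizes are typically capped
// by device.maxBlocks() and block sizes by device.maxCudaThreadsPerBlock().
//
//   LAUNCH_CUDA_KERNEL(ExampleKernel, /*gridsize=*/32, /*blocksize=*/256,
//                      /*sharedmem=*/0, gpu_device, arg1, arg2);
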
// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  assert(status == cudaSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
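
// Hedged usage sketch: a caller could select the shared-memory bank width
// before launching bank-conflict-sensitive kernels.
// cudaSharedMemBankSizeEightByte is a standard CUDA runtime enumerator; in
// device code the call above is a no-op.
//
//   Eigen::setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);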

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H