Eigen-unsupported 5.0.1-dev+284dcc12
TensorDeviceGpu.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

#include "../../../../../Eigen/src/Core/util/GpuHipCudaDefines.inc"

namespace Eigen {

static const int kGpuScratchSize = 1024;

// This defines the interface that GpuDevice uses to access the
// underlying HIP / CUDA streams.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const gpuStream_t& stream() const = 0;
  virtual const gpuDeviceProp_t& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kGpuScratchSize (1024) bytes
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, to
  // maintain the invariant that the semaphore always equals 0 when each
  // kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
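
// A hedged sketch (illustration only, not part of Eigen) of a custom
// StreamInterface that reuses an existing implementation for the stream and
// device-property queries but routes allocations through a user-supplied
// pool. "MyPoolAllocator" and its alloc()/free() methods are hypothetical:
//
//   class PoolBackedStream : public Eigen::StreamInterface {
//    public:
//     PoolBackedStream(const Eigen::StreamInterface* base, MyPoolAllocator* pool) : base_(base), pool_(pool) {}
//     const gpuStream_t& stream() const override { return base_->stream(); }
//     const gpuDeviceProp_t& deviceProperties() const override { return base_->deviceProperties(); }
//     void* allocate(size_t num_bytes) const override { return pool_->alloc(num_bytes); }
//     void deallocate(void* buffer) const override { pool_->free(buffer); }
//     void* scratchpad() const override { return base_->scratchpad(); }
//     unsigned int* semaphore() const override { return base_->semaphore(); }
//
//    private:
//     const Eigen::StreamInterface* base_;
//     MyPoolAllocator* pool_;
//   };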

class GpuDeviceProperties {
 public:
  GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {}

  ~GpuDeviceProperties() {
    if (device_properties_) {
      delete[] device_properties_;
    }
  }

  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; }

  EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; }

  void initialize() {
    if (!initialized_) {
      // Attempts to ensure proper behavior in the case of multiple threads
      // calling this function simultaneously. This would be trivial to
      // implement if we could use std::mutex, but unfortunately mutexes don't
      // compile with nvcc, so we resort to atomics and thread fences instead.
      // Note that if the caller uses a compiler that doesn't support C++11 we
      // can't ensure that the initialization is thread safe.
      if (first_.exchange(false)) {
        // We're the first thread to reach this point.
        int num_devices;
        gpuError_t status = gpuGetDeviceCount(&num_devices);
        if (status != gpuSuccess) {
          std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl;
          gpu_assert(status == gpuSuccess);
        }
        device_properties_ = new gpuDeviceProp_t[num_devices];
        for (int i = 0; i < num_devices; ++i) {
          status = gpuGetDeviceProperties(&device_properties_[i], i);
          if (status != gpuSuccess) {
            std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl;
            gpu_assert(status == gpuSuccess);
          }
        }

        std::atomic_thread_fence(std::memory_order_release);
        initialized_ = true;
      } else {
        // Wait for the other thread to initialize the properties.
        while (!initialized_) {
          std::atomic_thread_fence(std::memory_order_acquire);
          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
        }
      }
    }
  }

 private:
  volatile bool initialized_;
  std::atomic<bool> first_;
  gpuDeviceProp_t* device_properties_;
};

EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
  if (!deviceProperties->isInitialized()) {
    deviceProperties->initialize();
  }
  return *deviceProperties;
}

EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
  return GetGpuDeviceProperties().get(device);
}

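// Example sketch: query per-device properties before choosing launch
// parameters. Assumes the runtime reports at least one device:
//
//   const gpuDeviceProp_t& prop = Eigen::GetGpuDeviceProperties(/*device=*/0);
//   const int threads_per_block = prop.maxThreadsPerBlock;
//   const int num_multiprocessors = prop.multiProcessorCount;
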
static const gpuStream_t default_stream = gpuStreamDefault;

class GpuStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device
  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    gpuError_t status = gpuGetDevice(&device_);
    if (status != gpuSuccess) {
      std::cerr << "Failed to get the current GPU device: " << gpuGetErrorString(status) << std::endl;
      gpu_assert(status == gpuSuccess);
    }
  }
  // Use the default stream on the specified device
  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. Note that it is the caller's responsibility
  // to ensure that the stream can run on the specified device. If no device
  // is specified, the code assumes that the stream is associated with the
  // current GPU device.
  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      gpuError_t status = gpuGetDevice(&device_);
      if (status != gpuSuccess) {
        std::cerr << "Failed to get the current GPU device: " << gpuGetErrorString(status) << std::endl;
        gpu_assert(status == gpuSuccess);
      }
    } else {
      int num_devices;
      gpuError_t err = gpuGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
      gpu_assert(device < num_devices);
      device_ = device;
    }
  }

  virtual ~GpuStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }

  const gpuStream_t& stream() const { return *stream_; }
  const gpuDeviceProp_t& deviceProperties() const { return GetGpuDeviceProperties(device_); }
  virtual void* allocate(size_t num_bytes) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    void* result;
    err = gpuMalloc(&result, num_bytes);
    gpu_assert(err == gpuSuccess);
    gpu_assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    gpuError_t err = gpuSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
    gpu_assert(buffer != NULL);
    err = gpuFree(buffer);
    gpu_assert(err == gpuSuccess);
  }

  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }

  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      gpu_assert(err == gpuSuccess);
    }
    return semaphore_;
  }

 private:
  const gpuStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
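
// Example sketch of wrapping an externally owned stream; "my_stream" is an
// assumed gpuStream_t that was created elsewhere, is bound to device 0, and
// outlives the GpuStreamDevice (the stream is not owned or destroyed here):
//
//   Eigen::GpuStreamDevice stream_device(&my_stream, /*device=*/0);
//   void* device_buffer = stream_device.allocate(1024);
//   // ... enqueue work on the stream via GpuDevice (defined below) ...
//   stream_device.deallocate(device_buffer);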

struct GpuDevice {
  // The StreamInterface is not owned: the caller is
  // responsible for its initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { eigen_assert(stream); }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  // TODO(bsteiner): This is an internal API; we should not expose it.
  EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); }

  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); }

  EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); }

  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return stream_->allocate(num_bytes); }

  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { stream_->deallocate(buffer); }

  template <typename Type>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); }

  EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(dst);
    EIGEN_UNUSED_VARIABLE(src);
    EIGEN_UNUSED_VARIABLE(n);
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    gpu_assert(err == gpuSuccess);
#else
    EIGEN_UNUSED_VARIABLE(buffer)
    EIGEN_UNUSED_VARIABLE(c)
    EIGEN_UNUSED_VARIABLE(n)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  template <typename T>
  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    const size_t count = end - begin;
    // Split value into bytes and run memset with stride.
    const int value_size = sizeof(value);
    char* buffer = (char*)begin;
    char* value_bytes = (char*)(&value);
    gpuError_t err;
    EIGEN_UNUSED_VARIABLE(err)

    // If all value bytes are equal, then a single memset can be much faster.
    bool use_single_memset = true;
    for (int i = 1; i < value_size; ++i) {
      if (value_bytes[i] != value_bytes[0]) {
        use_single_memset = false;
      }
    }

    if (use_single_memset) {
      err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
      gpu_assert(err == gpuSuccess);
    } else {
      for (int b = 0; b < value_size; ++b) {
        err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream());
        gpu_assert(err == gpuSuccess);
      }
    }
#else
    EIGEN_UNUSED_VARIABLE(begin)
    EIGEN_UNUSED_VARIABLE(end)
    EIGEN_UNUSED_VARIABLE(value)
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
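
  // Worked example of the strided path above (illustration only): filling a
  // range of floats with 1.0f, whose little-endian byte pattern is
  // 0x00 0x00 0x80 0x3F, issues four gpuMemset2DAsync calls; call b writes
  // byte value_bytes[b] at offset b of every element, with a pitch of
  // sizeof(float) bytes between consecutive writes.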

  EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48 * 1024;
  }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on HIP/CUDA devices.
    return firstLevelCacheSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
    gpuError_t err = gpuStreamSynchronize(stream_->stream());
    if (err != gpuSuccess) {
      std::cerr << "Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl;
      gpu_assert(err == gpuSuccess);
    }
#else
    gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }

  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; }
  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; }
  EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; }

  EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; }

  // This function checks if the GPU runtime recorded an error for the
  // underlying stream device.
  inline bool ok() const {
#ifdef EIGEN_GPUCC
    gpuError_t error = gpuStreamQuery(stream_->stream());
    return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
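
// Example sketch of evaluating a tensor expression on the GPU. The sizes and
// the device pointers d_a, d_b and d_out (each holding 128 floats allocated
// with gpu_device.allocate) are assumptions for illustration:
//
//   Eigen::GpuStreamDevice stream;  // default stream on the current device
//   Eigen::GpuDevice gpu_device(&stream);
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> a(d_a, 128), b(d_b, 128), out(d_out, 128);
//   out.device(gpu_device) = a + b;  // enqueued asynchronously on the stream
//   gpu_device.synchronize();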

#if defined(EIGEN_HIPCC)

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)                              \
  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
  gpu_assert(hipGetLastError() == hipSuccess);

#else

#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)        \
  (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \
  gpu_assert(cudaGetLastError() == cudaSuccess);

#endif
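
// Example sketch of launching a custom kernel through the macro above;
// "MyKernel", "d_data" and "n" are assumptions for illustration, and
// "gpu_device" is an Eigen::GpuDevice as constructed earlier:
//
//   __global__ void MyKernel(float* data, int n);  // defined in GPU-compiled code
//
//   LAUNCH_GPU_KERNEL(MyKernel, /*gridsize=*/32, /*blocksize=*/256, /*sharedmem=*/0, gpu_device, d_data, n);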

// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  gpu_assert(status == gpuSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif

}  // end namespace Eigen

// undefine all the gpu* macros we defined at the beginning of the file
#include "../../../../../Eigen/src/Core/util/GpuHipCudaUndefines.inc"

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H