Eigen-unsupported  5.0.1-dev+284dcc12
 
Loading...
Searching...
No Matches
TensorRandom.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
6//
7// This Source Code Form is subject to the terms of the Mozilla
8// Public License v. 2.0. If a copy of the MPL was not distributed
9// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
11#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
12#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
13
14// IWYU pragma: private
15#include "./InternalHeaderCheck.h"
16
17namespace Eigen {
18namespace internal {
19
20EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() {
21#if defined(EIGEN_GPU_COMPILE_PHASE)
22 // We don't support 3d kernels since we currently only use 1 and
23 // 2d kernels.
24 gpu_assert(threadIdx.z == 0);
25 return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
26#else
27 // Rely on Eigen's random implementation.
28 return random<uint64_t>();
29#endif
30}
31
32EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
33 // TODO: Unify with the implementation in the non blocking thread pool.
34 uint64_t current = *state;
35 // Update the internal state
36 *state = current * 6364136223846793005ULL + (stream << 1 | 1);
37 // Generate the random output (using the PCG-XSH-RS scheme)
38 return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
39}
40
41EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
42 seed = seed ? seed : get_random_seed();
43 return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
44}
45
46template <typename T>
47EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
48 unsigned rnd = PCG_XSH_RS_generator(state, stream);
49 return static_cast<T>(rnd);
50}
51
52template <>
53EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool RandomToTypeUniform<bool>(uint64_t* state, uint64_t stream) {
54 unsigned rnd = PCG_XSH_RS_generator(state, stream);
55 return (rnd & 0x1) != 0;
56}
57
58template <>
59EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
60 // Generate 10 random bits for the mantissa, merge with exponent.
61 unsigned rnd = PCG_XSH_RS_generator(state, stream);
62 const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
63 Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
64 // Return the final result
65 return result - Eigen::half(1.0f);
66}
67
68template <>
69EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state,
70 uint64_t stream) {
71 // Generate 7 random bits for the mantissa, merge with exponent.
72 unsigned rnd = PCG_XSH_RS_generator(state, stream);
73 const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
74 Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
75 // Return the final result
76 return result - Eigen::bfloat16(1.0f);
77}
78
79template <>
80EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
81 typedef union {
82 uint32_t raw;
83 float fp;
84 } internal;
85 internal result;
86 // Generate 23 random bits for the mantissa mantissa
87 const unsigned rnd = PCG_XSH_RS_generator(state, stream);
88 result.raw = rnd & 0x7fffffu;
89 // Set the exponent
90 result.raw |= (static_cast<uint32_t>(127) << 23);
91 // Return the final result
92 return result.fp - 1.0f;
93}
94
95template <>
96EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
97 typedef union {
98 uint64_t raw;
99 double dp;
100 } internal;
101 internal result;
102 result.raw = 0;
103 // Generate 52 random bits for the mantissa
104 // First generate the upper 20 bits
105 unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
106 // The generate the lower 32 bits
107 unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
108 result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
109 // Set the exponent
110 result.raw |= (static_cast<uint64_t>(1023) << 52);
111 // Return the final result
112 return result.dp - 1.0;
113}
114
115template <>
116EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state,
117 uint64_t stream) {
118 return std::complex<float>(RandomToTypeUniform<float>(state, stream), RandomToTypeUniform<float>(state, stream));
119}
120template <>
121EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state,
122 uint64_t stream) {
123 return std::complex<double>(RandomToTypeUniform<double>(state, stream), RandomToTypeUniform<double>(state, stream));
124}
125
126template <typename T>
127class UniformRandomGenerator {
128 public:
129 static constexpr bool PacketAccess = true;
130
131 // Uses the given "seed" if non-zero, otherwise uses a random seed.
132 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0) {
133 m_state = PCG_XSH_RS_state(seed);
134#ifdef EIGEN_USE_SYCL
135 // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
136 // Therefore, we need two steps to initializate the m_state.
137 // IN SYCL, the constructor of the functor is s called on the CPU
138 // and we get the clock seed here from the CPU. However, This seed is
139 // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
140 // and only available on the Operator() function (which is called on the GPU).
141 // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each
142 // thread but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each
143 // thread adds the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the
144 // construction similar to CUDA Therefore, the thread Id injection is not available at this stage.
145 // However when the operator() is called the thread ID will be available. So inside the operator,
146 // we add the thrreadID, BlockId,... (which is equivalent of i)
147 // to the seed and construct the unique m_state per thead similar to cuda.
148 m_exec_once = false;
149#endif
150 }
151 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator& other) {
152 m_state = other.m_state;
153#ifdef EIGEN_USE_SYCL
154 m_exec_once = other.m_exec_once;
155#endif
156 }
157
158 template <typename Index>
159 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
160#ifdef EIGEN_USE_SYCL
161 if (!m_exec_once) {
162 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
163 // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
164 m_state += (i * 6364136223846793005ULL);
165 m_exec_once = true;
166 }
167#endif
168 T result = RandomToTypeUniform<T>(&m_state, i);
169 return result;
170 }
171
172 template <typename Packet, typename Index>
173 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
174 const int packetSize = internal::unpacket_traits<Packet>::size;
175 EIGEN_ALIGN_MAX T values[packetSize];
176#ifdef EIGEN_USE_SYCL
177 if (!m_exec_once) {
178 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
179 m_state += (i * 6364136223846793005ULL);
180 m_exec_once = true;
181 }
182#endif
183 EIGEN_UNROLL_LOOP
184 for (int j = 0; j < packetSize; ++j) {
185 values[j] = RandomToTypeUniform<T>(&m_state, i);
186 }
187 return internal::pload<Packet>(values);
188 }
189
190 private:
191 mutable uint64_t m_state;
192#ifdef EIGEN_USE_SYCL
193 mutable bool m_exec_once;
194#endif
195};
196
197template <typename Scalar>
198struct functor_traits<UniformRandomGenerator<Scalar> > {
199 enum {
200 // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
201 Cost = 12 * NumTraits<Scalar>::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
202 PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
203 };
204};
205
206template <typename T>
207EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
208 // Use the ratio of uniform method to generate numbers following a normal
209 // distribution. See for example Numerical Recipes chapter 7.3.9 for the
210 // details.
211 T u, v, q;
212 do {
213 u = RandomToTypeUniform<T>(state, stream);
214 v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
215 const T x = u - T(0.449871);
216 const T y = numext::abs(v) + T(0.386595);
217 q = x * x + y * (T(0.196) * y - T(0.25472) * x);
218 } while (q > T(0.27597) && (q > T(0.27846) || v * v > T(-4) * numext::log(u) * u * u));
219
220 return v / u;
221}
222
223template <>
224EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state,
225 uint64_t stream) {
226 return std::complex<float>(RandomToTypeNormal<float>(state, stream), RandomToTypeNormal<float>(state, stream));
227}
228template <>
229EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state,
230 uint64_t stream) {
231 return std::complex<double>(RandomToTypeNormal<double>(state, stream), RandomToTypeNormal<double>(state, stream));
232}
233
234template <typename T>
235class NormalRandomGenerator {
236 public:
237 static constexpr bool PacketAccess = true;
238
239 // Uses the given "seed" if non-zero, otherwise uses a random seed.
240 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
241 m_state = PCG_XSH_RS_state(seed);
242#ifdef EIGEN_USE_SYCL
243 // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
244 // Therefore, we need two steps to initializate the m_state.
245 // IN SYCL, the constructor of the functor is s called on the CPU
246 // and we get the clock seed here from the CPU. However, This seed is
247 // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
248 // and only available on the Operator() function (which is called on the GPU).
249 // Therefore, the thread Id injection is not available at this stage. However when the operator()
250 // is called the thread ID will be available. So inside the operator,
251 // we add the thrreadID, BlockId,... (which is equivalent of i)
252 // to the seed and construct the unique m_state per thead similar to cuda.
253 m_exec_once = false;
254#endif
255 }
256 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator& other) {
257 m_state = other.m_state;
258#ifdef EIGEN_USE_SYCL
259 m_exec_once = other.m_exec_once;
260#endif
261 }
262
263 template <typename Index>
264 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
265#ifdef EIGEN_USE_SYCL
266 if (!m_exec_once) {
267 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
268 m_state += (i * 6364136223846793005ULL);
269 m_exec_once = true;
270 }
271#endif
272 T result = RandomToTypeNormal<T>(&m_state, i);
273 return result;
274 }
275
276 template <typename Packet, typename Index>
277 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
278 const int packetSize = internal::unpacket_traits<Packet>::size;
279 EIGEN_ALIGN_MAX T values[packetSize];
280#ifdef EIGEN_USE_SYCL
281 if (!m_exec_once) {
282 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
283 m_state += (i * 6364136223846793005ULL);
284 m_exec_once = true;
285 }
286#endif
287 EIGEN_UNROLL_LOOP
288 for (int j = 0; j < packetSize; ++j) {
289 values[j] = RandomToTypeNormal<T>(&m_state, i);
290 }
291 return internal::pload<Packet>(values);
292 }
293
294 private:
295 mutable uint64_t m_state;
296#ifdef EIGEN_USE_SYCL
297 mutable bool m_exec_once;
298#endif
299};
300
301template <typename Scalar>
302struct functor_traits<NormalRandomGenerator<Scalar> > {
303 enum {
304 // On average, we need to generate about 3 random numbers
305 // 15 mul, 8 add, 1.5 logs
306 Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + 15 * NumTraits<Scalar>::AddCost +
307 8 * NumTraits<Scalar>::AddCost + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
308 PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
309 };
310};
311
312} // end namespace internal
313} // end namespace Eigen
314
315#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
Namespace containing all symbols from the Eigen library.
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index