Eigen-unsupported  3.4.1 (git rev 28ded8800c26864e537852658428ab44c8399e87)
 
Loading...
Searching...
No Matches
TensorRandom.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
6//
7// This Source Code Form is subject to the terms of the Mozilla
8// Public License v. 2.0. If a copy of the MPL was not distributed
9// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
11#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
12#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
13
14namespace Eigen {
15namespace internal {
16
17EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() {
18#if defined(EIGEN_GPU_COMPILE_PHASE)
19 // We don't support 3d kernels since we currently only use 1 and
20 // 2d kernels.
21 gpu_assert(threadIdx.z == 0);
22 return blockIdx.x * blockDim.x + threadIdx.x
23 + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
24#else
25 // Rely on Eigen's random implementation.
26 return random<uint64_t>();
27#endif
28}
29
30EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
31 // TODO: Unify with the implementation in the non blocking thread pool.
32 uint64_t current = *state;
33 // Update the internal state
34 *state = current * 6364136223846793005ULL + (stream << 1 | 1);
35 // Generate the random output (using the PCG-XSH-RS scheme)
36 return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
37}
38
39EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
40 seed = seed ? seed : get_random_seed();
41 return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
42}
43
44template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
45T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
46 unsigned rnd = PCG_XSH_RS_generator(state, stream);
47 return static_cast<T>(rnd);
48}
49
50
51template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
52Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
53 // Generate 10 random bits for the mantissa, merge with exponent.
54 unsigned rnd = PCG_XSH_RS_generator(state, stream);
55 const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
56 Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
57 // Return the final result
58 return result - Eigen::half(1.0f);
59}
60
61template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
62Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
63
64 // Generate 7 random bits for the mantissa, merge with exponent.
65 unsigned rnd = PCG_XSH_RS_generator(state, stream);
66 const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
67 Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
68 // Return the final result
69 return result - Eigen::bfloat16(1.0f);
70}
71
72template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
73float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
74 typedef union {
75 uint32_t raw;
76 float fp;
77 } internal;
78 internal result;
79 // Generate 23 random bits for the mantissa mantissa
80 const unsigned rnd = PCG_XSH_RS_generator(state, stream);
81 result.raw = rnd & 0x7fffffu;
82 // Set the exponent
83 result.raw |= (static_cast<uint32_t>(127) << 23);
84 // Return the final result
85 return result.fp - 1.0f;
86}
87
88template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
89double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
90 typedef union {
91 uint64_t raw;
92 double dp;
93 } internal;
94 internal result;
95 result.raw = 0;
96 // Generate 52 random bits for the mantissa
97 // First generate the upper 20 bits
98 unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
99 // The generate the lower 32 bits
100 unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
101 result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
102 // Set the exponent
103 result.raw |= (static_cast<uint64_t>(1023) << 52);
104 // Return the final result
105 return result.dp - 1.0;
106}
107
108template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
109std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
110 return std::complex<float>(RandomToTypeUniform<float>(state, stream),
111 RandomToTypeUniform<float>(state, stream));
112}
113template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
114std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
115 return std::complex<double>(RandomToTypeUniform<double>(state, stream),
116 RandomToTypeUniform<double>(state, stream));
117}
118
119template <typename T> class UniformRandomGenerator {
120 public:
121 static const bool PacketAccess = true;
122
123 // Uses the given "seed" if non-zero, otherwise uses a random seed.
124 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
125 uint64_t seed = 0) {
126 m_state = PCG_XSH_RS_state(seed);
127 #ifdef EIGEN_USE_SYCL
128 // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
129 // Therefor, we need two step to initializate the m_state.
130 // IN SYCL, the constructor of the functor is s called on the CPU
131 // and we get the clock seed here from the CPU. However, This seed is
132 //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
133 // and only available on the Operator() function (which is called on the GPU).
134 // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
135 // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
136 // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
137 // similar to CUDA Therefore, the thread Id injection is not available at this stage.
138 //However when the operator() is called the thread ID will be avilable. So inside the opeator,
139 // we add the thrreadID, BlockId,... (which is equivalent of i)
140 //to the seed and construct the unique m_state per thead similar to cuda.
141 m_exec_once =false;
142 #endif
143 }
144 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
145 const UniformRandomGenerator& other) {
146 m_state = other.m_state;
147 #ifdef EIGEN_USE_SYCL
148 m_exec_once =other.m_exec_once;
149 #endif
150 }
151
152 template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
153 T operator()(Index i) const {
154 #ifdef EIGEN_USE_SYCL
155 if(!m_exec_once) {
156 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
157 // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
158 m_state += (i * 6364136223846793005ULL);
159 m_exec_once =true;
160 }
161 #endif
162 T result = RandomToTypeUniform<T>(&m_state, i);
163 return result;
164 }
165
166 template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
167 Packet packetOp(Index i) const {
168 const int packetSize = internal::unpacket_traits<Packet>::size;
169 EIGEN_ALIGN_MAX T values[packetSize];
170 #ifdef EIGEN_USE_SYCL
171 if(!m_exec_once) {
172 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
173 m_state += (i * 6364136223846793005ULL);
174 m_exec_once =true;
175 }
176 #endif
177 EIGEN_UNROLL_LOOP
178 for (int j = 0; j < packetSize; ++j) {
179 values[j] = RandomToTypeUniform<T>(&m_state, i);
180 }
181 return internal::pload<Packet>(values);
182 }
183
184 private:
185 mutable uint64_t m_state;
186 #ifdef EIGEN_USE_SYCL
187 mutable bool m_exec_once;
188 #endif
189};
190
191template <typename Scalar>
192struct functor_traits<UniformRandomGenerator<Scalar> > {
193 enum {
194 // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
195 Cost = 12 * NumTraits<Scalar>::AddCost *
196 ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
197 PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
198 };
199};
200
201
202
203template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
204T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
205 // Use the ratio of uniform method to generate numbers following a normal
206 // distribution. See for example Numerical Recipes chapter 7.3.9 for the
207 // details.
208 T u, v, q;
209 do {
210 u = RandomToTypeUniform<T>(state, stream);
211 v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
212 const T x = u - T(0.449871);
213 const T y = numext::abs(v) + T(0.386595);
214 q = x*x + y * (T(0.196)*y - T(0.25472)*x);
215 } while (q > T(0.27597) &&
216 (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
217
218 return v/u;
219}
220
221template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
222std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
223 return std::complex<float>(RandomToTypeNormal<float>(state, stream),
224 RandomToTypeNormal<float>(state, stream));
225}
226template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
227std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
228 return std::complex<double>(RandomToTypeNormal<double>(state, stream),
229 RandomToTypeNormal<double>(state, stream));
230}
231
232
233template <typename T> class NormalRandomGenerator {
234 public:
235 static const bool PacketAccess = true;
236
237 // Uses the given "seed" if non-zero, otherwise uses a random seed.
238 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
239 m_state = PCG_XSH_RS_state(seed);
240 #ifdef EIGEN_USE_SYCL
241 // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
242 // Therefor, we need two steps to initializate the m_state.
243 // IN SYCL, the constructor of the functor is s called on the CPU
244 // and we get the clock seed here from the CPU. However, This seed is
245 //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
246 // and only available on the Operator() function (which is called on the GPU).
247 // Therefore, the thread Id injection is not available at this stage. However when the operator()
248 //is called the thread ID will be avilable. So inside the opeator,
249 // we add the thrreadID, BlockId,... (which is equivalent of i)
250 //to the seed and construct the unique m_state per thead similar to cuda.
251 m_exec_once =false;
252 #endif
253 }
254 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
255 const NormalRandomGenerator& other) {
256 m_state = other.m_state;
257#ifdef EIGEN_USE_SYCL
258 m_exec_once=other.m_exec_once;
259#endif
260 }
261
262 template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
263 T operator()(Index i) const {
264 #ifdef EIGEN_USE_SYCL
265 if(!m_exec_once) {
266 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
267 m_state += (i * 6364136223846793005ULL);
268 m_exec_once =true;
269 }
270 #endif
271 T result = RandomToTypeNormal<T>(&m_state, i);
272 return result;
273 }
274
275 template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
276 Packet packetOp(Index i) const {
277 const int packetSize = internal::unpacket_traits<Packet>::size;
278 EIGEN_ALIGN_MAX T values[packetSize];
279 #ifdef EIGEN_USE_SYCL
280 if(!m_exec_once) {
281 // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
282 m_state += (i * 6364136223846793005ULL);
283 m_exec_once =true;
284 }
285 #endif
286 EIGEN_UNROLL_LOOP
287 for (int j = 0; j < packetSize; ++j) {
288 values[j] = RandomToTypeNormal<T>(&m_state, i);
289 }
290 return internal::pload<Packet>(values);
291 }
292
293 private:
294 mutable uint64_t m_state;
295 #ifdef EIGEN_USE_SYCL
296 mutable bool m_exec_once;
297 #endif
298};
299
300
301template <typename Scalar>
302struct functor_traits<NormalRandomGenerator<Scalar> > {
303 enum {
304 // On average, we need to generate about 3 random numbers
305 // 15 mul, 8 add, 1.5 logs
306 Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
307 15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
308 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
309 PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
310 };
311};
312
313
314} // end namespace internal
315} // end namespace Eigen
316
317#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
Namespace containing all symbols from the Eigen library.
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index