import math

from numba import (config, cuda, float32, float64, uint32, int64, uint64,
                   from_dtype, jit)
import numpy as np

# This implementation is based upon the xoroshiro128+ and splitmix64 algorithms
# described at:
#
#     http://xoroshiro.di.unimi.it/
#
# and originally implemented by David Blackman and Sebastiano Vigna.
#
# The implementations below are based on the C source code:
#
#  * http://xoroshiro.di.unimi.it/xoroshiro128plus.c
#  * http://xoroshiro.di.unimi.it/splitmix64.c
#
# Splitmix64 is used to generate the initial state of the xoroshiro128+
# generator to ensure that small seeds don't result in predictable output.

# **WARNING**: There is a lot of verbose casting in this file to ensure that
# NumPy casting conventions (which cast uint64 [op] int32 to float64) don't
# turn integers into floats when using these functions in the CUDA simulator.
#
# There are also no function type signatures to ensure that compilation is
# deferred so that import is quick, and Sphinx autodoc works.  We are also
# using the CPU @jit decorator everywhere to create functions that work as
# both CPU and CUDA device functions.

# Aligned record dtype for one generator state: two unsigned 64-bit words
# named 's0' and 's1'.
xoroshiro128p_dtype = np.dtype([('s0', np.uint64), ('s1', np.uint64)],
                               align=True)
# Numba type derived from the record dtype above via ``from_dtype``.
xoroshiro128p_type = from_dtype(xoroshiro128p_dtype)

# When cudasim is enabled, Fake CUDA arrays are passed to some of the
# @jit-decorated functions. This requires a fallback to object mode. With
# Numba 0.59.0 object mode must be explicitly enabled.
# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
# In order to avoid the warning / future error, we explicitly specify that
# object mode with loop lifting is acceptable when using the simulator.
# Object mode (with loop lifting) is requested only when the CUDA simulator is
# enabled; otherwise nopython compilation is required.
_forceobj = _looplift = config.ENABLE_CUDASIM
_nopython = not config.ENABLE_CUDASIM


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def init_xoroshiro128p_state(states, index, seed):
    '''Use SplitMix64 to generate an xoroshiro128p state from 64-bit seed.

    This ensures that manually set small seeds don't result in a predictable
    initial sequence from the random number generator.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: uint64
    :param index: offset in states to update
    :type seed: int64
    :param seed: seed value to use when initializing state
    '''
    index = int64(index)
    seed = uint64(seed)

    # One round of the SplitMix64 mixing function applied to the seed.
    z = seed + uint64(0x9E3779B97F4A7C15)
    z = (z ^ (z >> uint32(30))) * uint64(0xBF58476D1CE4E5B9)
    z = (z ^ (z >> uint32(27))) * uint64(0x94D049BB133111EB)
    z = z ^ (z >> uint32(31))

    # Both state words receive the same mixed value.
    states[index]['s0'] = z
    states[index]['s1'] = z


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def rotl(x, k):
    '''Left rotate x by k bits.

    :type x: uint64
    :param x: value to rotate
    :type k: uint32
    :param k: number of bit positions to rotate by
    :rtype: uint64
    '''
    x = uint64(x)
    k = uint32(k)
    return (x << k) | (x >> uint32(64 - k))


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_next(states, index):
    '''Return the next random uint64 and advance the RNG in states[index].

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: uint64
    '''
    index = int64(index)
    s0 = states[index]['s0']
    s1 = states[index]['s1']
    # The output is the sum of the two state words *before* the update.
    result = s0 + s1

    s1 ^= s0
    states[index]['s0'] = uint64(rotl(s0, uint32(55))) ^ s1 ^ (s1 << uint32(14))
    states[index]['s1'] = uint64(rotl(s1, uint32(36)))

    return result


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_jump(states, index):
    '''Advance the RNG in ``states[index]`` by 2**64 steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    '''
    index = int64(index)

    jump = (uint64(0xbeac0467eba5facb), uint64(0xd86b048b86aa9922))

    s0 = uint64(0)
    s1 = uint64(0)

    # Walk over all 128 bits of the jump constants: whenever a bit is set,
    # fold the current state into the accumulators; the generator is stepped
    # once per bit regardless.
    for i in range(2):
        for b in range(64):
            if jump[i] & (uint64(1) << uint32(b)):
                s0 ^= states[index]['s0']
                s1 ^= states[index]['s1']
            xoroshiro128p_next(states, index)

    states[index]['s0'] = s0
    states[index]['s1'] = s1


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def uint64_to_unit_float64(x):
    '''Convert uint64 to float64 value in the range [0.0, 1.0)

    Only the upper 53 bits of ``x`` are used; they are scaled by 2**-53.
    '''
    x = uint64(x)
    return (x >> uint32(11)) * (float64(1) / (uint64(1) << uint32(53)))


# Largest float32 strictly below 1.0; used to keep float32 uniforms inside
# the half-open interval [0.0, 1.0).
_MAX_UNIT_FLOAT32 = np.nextafter(np.float32(1.0), np.float32(0.0))


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def uint64_to_unit_float32(x):
    '''Convert uint64 to float32 value in the range [0.0, 1.0)

    The value is computed in float64 and then narrowed to float32.  Narrowing
    rounds to nearest, so float64 values just below 1.0 would become exactly
    1.0; those are clamped to the largest float32 below 1.0 so that the
    documented upper bound stays exclusive.
    '''
    x = uint64(x)
    result = float32(uint64_to_unit_float64(x))
    if result >= float32(1.0):
        result = _MAX_UNIT_FLOAT32
    return result


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_uniform_float32(states, index):
    '''Return a float32 in range [0.0, 1.0) and advance ``states[index]``.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    '''
    index = int64(index)
    return uint64_to_unit_float32(xoroshiro128p_next(states, index))


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_uniform_float64(states, index):
    '''Return a float64 in range [0.0, 1.0) and advance ``states[index]``.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float64
    '''
    index = int64(index)
    return uint64_to_unit_float64(xoroshiro128p_next(states, index))


TWO_PI_FLOAT32 = np.float32(2 * math.pi)
TWO_PI_FLOAT64 = np.float64(2 * math.pi)


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_normal_float32(states, index):
    '''Return a normally distributed float32 and advance ``states[index]``.

    The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
    Box-Muller transform.  This advances the RNG sequence by two steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float32
    '''
    index = int64(index)

    u1 = xoroshiro128p_uniform_float32(states, index)
    u2 = xoroshiro128p_uniform_float32(states, index)

    # NOTE(review): u1 can be exactly 0.0 (when the top 53 bits of the draw
    # are all zero), for which math.log(u1) is not finite -- confirm whether
    # callers need this guarded.
    z0 = math.sqrt(-float32(2.0) * math.log(u1)) * math.cos(TWO_PI_FLOAT32 * u2)
    # Box-Muller also yields a second normal value (the same radius times
    # math.sin(TWO_PI_FLOAT32 * u2)); it is discarded here.
    return z0


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def xoroshiro128p_normal_float64(states, index):
    '''Return a normally distributed float64 and advance ``states[index]``.

    The return value is drawn from a Gaussian of mean=0 and sigma=1 using the
    Box-Muller transform.  This advances the RNG sequence by two steps.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type index: int64
    :param index: offset in states to update
    :rtype: float64
    '''
    index = int64(index)

    # Draw float64 uniforms so the result carries full double resolution
    # rather than being limited to float32 inputs.
    u1 = xoroshiro128p_uniform_float64(states, index)
    u2 = xoroshiro128p_uniform_float64(states, index)

    # NOTE(review): u1 can be exactly 0.0 (when the top 53 bits of the draw
    # are all zero), for which math.log(u1) is not finite -- confirm whether
    # callers need this guarded.
    z0 = math.sqrt(-float64(2.0) * math.log(u1)) * math.cos(TWO_PI_FLOAT64 * u2)
    # Box-Muller also yields a second normal value (the same radius times
    # math.sin(TWO_PI_FLOAT64 * u2)); it is discarded here.
    return z0


@jit(forceobj=_forceobj, looplift=_looplift, nopython=_nopython)
def init_xoroshiro128p_states_cpu(states, seed, subsequence_start):
    '''Fill ``states`` in place with a chain of jump-separated RNG states.

    The first state is seeded with ``init_xoroshiro128p_state`` and jumped
    ``subsequence_start`` times; every following state is a copy of its
    predecessor advanced by one further jump.  Nothing is written when
    ``states`` is empty.

    :type states: 1D array, dtype=xoroshiro128p_dtype
    :param states: array of RNG states to initialize
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of jumps applied to the first state
    '''
    n = states.shape[0]
    seed = uint64(seed)
    subsequence_start = uint64(subsequence_start)

    if n >= 1:
        init_xoroshiro128p_state(states, 0, seed)

        # advance to starting subsequence number
        for _ in range(subsequence_start):
            xoroshiro128p_jump(states, 0)

        # populate the rest of the array
        for i in range(1, n):
            states[i] = states[i - 1]  # take state of previous generator
            xoroshiro128p_jump(states, i)  # and jump forward 2**64 steps


def init_xoroshiro128p_states(states, seed, subsequence_start=0, stream=0):
    '''Initialize RNG states on the GPU for parallel generators.

    This initializes the RNG states so that each state in the array
    corresponds to a subsequence of the main sequence, with consecutive
    subsequences separated by 2**64 steps.  Therefore, as long as no CUDA
    thread requests more than 2**64 random numbers, the subsequences used by
    the states produced by this function do not overlap.

    The subsequence_start parameter can be used to advance the first RNG
    state by a multiple of 2**64 steps.

    :type states: 1D DeviceNDArray, dtype=xoroshiro128p_dtype
    :param states: array of RNG states
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of 2**64-step jumps applied to the first
        state before the remaining states are derived from it
    :type stream: CUDA stream
    :param stream: stream used for the host-to-device copy of the states
    '''
    # Initialization on CPU is much faster than the GPU
    states_cpu = np.empty(shape=states.shape, dtype=xoroshiro128p_dtype)
    init_xoroshiro128p_states_cpu(states_cpu, seed, subsequence_start)

    states.copy_to_device(states_cpu, stream=stream)


def create_xoroshiro128p_states(n, seed, subsequence_start=0, stream=0):
    '''Returns a new device array initialized for n random number generators.

    This initializes the RNG states so that each state in the array
    corresponds to a subsequence of the main sequence, with consecutive
    subsequences separated by 2**64 steps.  Therefore, as long as no CUDA
    thread requests more than 2**64 random numbers, the subsequences used by
    the states produced by this function do not overlap.

    The subsequence_start parameter can be used to advance the first RNG
    state by a multiple of 2**64 steps.

    :type n: int
    :param n: number of RNG states to create
    :type seed: uint64
    :param seed: starting seed for list of generators
    :type subsequence_start: uint64
    :param subsequence_start: number of 2**64-step jumps applied to the first
        state before the remaining states are derived from it
    :type stream: CUDA stream
    :param stream: stream used for the device allocation and for copying the
        initial states to the device
    :rtype: 1D DeviceNDArray, dtype=xoroshiro128p_dtype
    '''
    states = cuda.device_array(n, dtype=xoroshiro128p_dtype, stream=stream)
    init_xoroshiro128p_states(states, seed, subsequence_start, stream)
    return states