Created
January 18, 2023 11:42
-
-
Save mratsim/967e649f19896b52f13a679a6c66bfe9 to your computer and use it in GitHub Desktop.
LCG
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Constantine | |
# Copyright (c) 2018-2019 Status Research & Development GmbH | |
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy | |
# Licensed and distributed under either of | |
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). | |
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). | |
# at your option. This file may not be copied, modified, or distributed except according to those terms. | |
# TODO: while there is likely no attacks available that may exploit | |
# how work is stolen from threads by other threads, | |
# it would be disappointing to not use a CSPRNG in a cryptographic library. | |
# | |
# See: | |
# | |
# - Parallel Random Numbers: As Easy as 1, 2, 3 | |
# John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw | |
# https://thesalmons.org/john/random123/papers/random123sc11.pdf | |
# https://github.com/DEShawResearch/random123 | |
# | |
# - Fast-key-erasure random-number generators | |
# Daniel J. Bernstein | |
# https://blog.cr.yp.to/20170723-random.html | |
# https://github.com/floodyberry/supercop/blob/a351f2c/crypto_sign/ntrumls439x/ref/fastrandombytes.c | |
# | |
# - Randen - fast backtracking-resistant random generator with AES+Feistel+Reverie | |
# Jan Wassenberg, Robert Obryk, Jyrki Alakuijala, Emmanuel Mogenet | |
# https://arxiv.org/abs/1810.02227 | |
# https://github.com/google/randen | |
import ../../../helpers/prng_unsafe | |
export prng_unsafe | |
import | |
../bithacks, | |
instrumentation/contracts | |
# Work-stealing is more efficient when the thread we steal from is randomized. | |
# If all threads steal in the same order, we increase contention | |
# on the start victims task queues. | |
# | |
# The randomness quality is not important besides distributing potential contention, | |
# i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough. | |
# | |
# Hence for efficiency, so that a thread can go to sleep faster, we want to | |
# reduce calls to to the RNG as: | |
# - Getting a random value itself can be expensive, especially if we use a CSPRNG (not a requirement) | |
# - a CSPRNG can be starved of entropy as with small tasks, threads might make millions of calls. | |
# - If we want unbiaised thread ID generation in a range, rejection sampling is costly (not a requirement). | |
# | |
# Instead of using Fisher-Yates | |
# - generates the victim set eagerly, inefficient if the first steal attempts are successful | |
# - needs a RNG call when sampling a victim | |
# - memory usage: numThreads per thread so numthreads² uint8 (255 threads max) or uint32 | |
# or a sparseset | |
# - 1 RNG call when sampling a victim | |
# - memory usage: 2*numThreads per thread so 2*numthreads² uint8 (255 threads max) or uint32 | |
# | |
# we can use Linear Congruential Generators, a recurrence relation of the form Xₙ₊₁ = aXₙ+c (mod m) | |
# If we respect the Hull-Dobell theorem requirements, we can generate pseudo-random permutations in [0, m) | |
# with fixed memory usage whatever the number of potential victims: just 4 registers for a, x, c, m | |
# | |
# References: | |
# - Fisher-Yates: https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle | |
# - Sparse sets: https://dl.acm.org/doi/pdf/10.1145/176454.176484 | |
# https://github.com/mratsim/weave/blob/7682784/weave/datatypes/sparsesets.nim | |
# https://github.com/status-im/nim-taskpools/blob/4bc0b59/taskpools/sparsesets.nim | |
# - Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator | |
# | |
# And if we want cryptographic strength: | |
# - Ciphers with Arbitrary Finite Domains | |
# John Black and Phillip Rogaway, 2001 | |
# https://eprint.iacr.org/2001/012 | |
# - An Enciphering Scheme Based on a Card Shuffle | |
# Viet Tung Hoang, Ben Morris, Phillip Rogaway | |
# https://www.iacr.org/archive/crypto2012/74170001/74170001.pdf | |
func nextPowerOfTwo_vartime(n: uint32): uint32 {.inline.} = | |
## Returns x if x is a power of 2 | |
## or the next biggest power of 2 | |
1'u32 shl (log2_vartime(n-1) + 1) | |
func gcd_vartime(u, v: uint32): uint32 {.used.} = | |
## Compute the greatest common divisor | |
## gcd(u, v) == 1 <=> u and v are coprimes | |
if u == 0: return v | |
if v == 0: return u | |
let shift = countTrailingZeroBits(u or v) | |
var u = u shr u.countTrailingZeroBits() | |
var v = v | |
while true: | |
v = v shr v.countTrailingZeroBits() | |
if u > v: | |
swap(u, v) | |
v = v - u | |
if v == 0: | |
return u shl shift | |
func fastRange(seed, maxPow2: uint32): uint32 {.inline.} = | |
## Constrain n in range [0, m) with m a power of 2 | |
## | |
## Alternatively we could use | |
## https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ | |
## | |
## However the number of threads is often in the 2-4 bit range (2 to 16 threads) | |
## so using the low bits as well avoids hammering the 0 thread. | |
seed and (maxPow2 - 1) | |
iterator pseudoRandomPermutation*(randomSeed, maxExclusive: uint32): uint32 = | |
## Create a (low-quality) pseudo-random permutation from [0, max) | |
# Linear Congruential Generator: https://en.wikipedia.org/wiki/Linear_congruential_generator | |
# | |
# Xₙ₊₁ = aXₙ+c (mod m) generates all random number mod m without repetition | |
# if and only if (Hull-Dobell theorem): | |
# 1. c and m are coprime | |
# 2. a-1 is divisible by all prime factors of m | |
# 3. a-1 is divisible by 4 if m is divisible by 4 | |
# | |
# By choosing a=1, all conditions are easy to reach. | |
# | |
# The randomness quality is not important besides distributing potential contention, | |
# i.e. randomly trying thread i, then i+1, then i+n-1 (mod n) is good enough. | |
# | |
# Assuming 6 threads, co-primes are [1, 5], which means the following permutations | |
# assuming we start with victim 0: | |
# - [0, 1, 2, 3, 4, 5] | |
# - [0, 5, 4, 3, 2, 1] | |
# | |
# We can choose m to be the next power of 2, meaning all odd integers are co-primes, | |
# consequently: | |
# - we don't need a GCD to find them | |
# - we don't need to cache coprimes, removing a cache-miss potential | |
# - we replace a conditional substraction, with multiplication+modulo power-of-two per iteration | |
# which are both cheap | |
let M = maxExclusive.nextPowerOfTwo_vartime() | |
let c = randomSeed.fastRange(M shr 1) * 2 + 1 # c odd and c ∈ [0, M) | |
let a = randomSeed.fastRange(M shr 2) * 4 + 1 # a-1 divisible by 2 (all prime factors of m) and by 4 if m divisible by 4 | |
let start = randomSeed.fastRange(M) | |
let mask = M-1 # for mod M | |
var x = start | |
while true: | |
if x < maxExclusive: | |
yield x | |
x = (a*x + c) and mask # ax + c (mod M), with M power of 2 | |
if x == start: | |
break | |
when isMainModule: | |
import std/[sequtils, strutils] | |
proc main(numThreads: uint32) = | |
const s = 0x0EADBEEF'u32 | |
const H = 0x01000000'u32 | |
let iters = 2*numThreads | |
echo "NumThreads: ", numThreads | |
# Low-bits | |
echo " Testing low-bits change in seed" | |
for seed in s ..< s+iters: | |
echo " seed ", seed.toHex(), ": ", seed.pseudoRandomPermutation(numThreads).toSeq().mapIt(align($it, 2)).`$`().replace("\"", "") | |
echo " ----" | |
# high-bits | |
echo " Testing high-bits change in seed" | |
for seed in countup(s, s+iters*H-1, H): | |
echo " seed ", seed.toHex(), ": ", seed.pseudoRandomPermutation(numThreads).toSeq().mapIt(align($it, 2)).`$`().replace("\"", "") | |
echo " ====" | |
main(2) | |
main(4) | |
main(8) | |
main(12) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment