// vrd 0.0.11 - Docs.rs

// Copyright © 2023-2026 vrd. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! SIMD-batched Xoshiro256++ for `fill_bytes`.
//!
//! Holds **K independent** Xoshiro256++ states in SIMD registers
//! (AArch64 NEON: two groups of 2 lanes, four states in total;
//! x86_64 AVX2: one group of 4 lanes) and advances all of them in
//! one inner-loop iteration. Each `fill_bytes` call derives the lane
//! states by SplitMix64-whitening the scalar generator's state with a
//! distinct lane-specific constant: cheap (~10 ns of setup), and
//! statistically independent lanes by construction. After the SIMD
//! loop the scalar state is replaced by lane 0's final state, so
//! subsequent calls continue from that state rather than from where
//! the scalar-only path would have been.
//!
//! An earlier draft used [`crate::xoshiro::Xoshiro256PlusPlus::jump`]
//! for 2¹²⁸-step
//! separation per lane, but at 256 scalar `next_u64`s per call its
//! ~256 ns setup wiped out the SIMD win for buffers under ~4 KiB. The
//! SplitMix derivation keeps lanes uncorrelated (probability of state
//! collision is ≤ K²/2²⁵⁶ — negligible) at a fraction of the cost.
//!
//! # Reproducibility contract
//!
//! The same seed produces a **different byte stream** between the
//! scalar path and the SIMD path. This is fundamental: there is no
//! correctness-preserving way to interleave K independent Xoshiro
//! generators into the *same* sequence a single-threaded generator
//! would produce. Code that depends on bit-for-bit reproducibility
//! across feature sets must use the scalar path.
//!
//! Statistical quality is unchanged — each lane is a full Xoshiro256++
//! and inherits all of its properties.

#![allow(unsafe_code)]

use crate::xoshiro::Xoshiro256PlusPlus;

/// Derives `K` Xoshiro256++ lane states from the scalar generator's
/// current state.
///
/// For lane index `k`, a SplitMix64 stream is seeded with the scalar
/// state's first word XORed with `LANE_SALT[k]`; four successive
/// SplitMix64 outputs become the lane's four state words. Each word
/// is additionally XORed with the matching scalar state word rotated
/// left by `13 * (k + 1)` bits, so a lane depends on the whole scalar
/// state and not on word 0 alone.
///
/// `rng` is only read (through `state_snapshot`); it is not advanced.
///
/// SplitMix64 constants per <https://prng.di.unimi.it/splitmix64.c>.
///
/// # Panics
///
/// Panics if `K` is larger than the number of lane salts (4).
#[inline]
fn derive_lanes<const K: usize>(
    rng: &Xoshiro256PlusPlus,
) -> [[u64; 4]; K] {
    const LANE_SALT: [u64; 4] = [
        0xA076_1D64_78BD_642F,
        0xE703_7ED1_A0B4_28DB,
        0x8EBC_6AF0_9C88_C6E3,
        0x5899_65CC_7537_4CC3,
    ];
    // `K` is a compile-time constant, so this check folds away for
    // every supported lane count; it turns an opaque index panic into
    // a clear message if a wider back-end is ever added without
    // extending the salt table.
    assert!(
        K <= LANE_SALT.len(),
        "derive_lanes: at most 4 lanes are supported"
    );

    let base = rng.state_snapshot();
    let mut out = [[0u64; 4]; K];
    for (k, lane) in out.iter_mut().enumerate() {
        // Per-lane rotation amount: 13, 26, 39, 52 for k = 0..4.
        let rot = ((k as u32 + 1) * 13) % 64;
        let mut sm = base[0] ^ LANE_SALT[k];
        for (slot, &b) in lane.iter_mut().zip(base.iter()) {
            // Whitened word, then fold in the full base state.
            *slot = splitmix64(&mut sm) ^ b.rotate_left(rot);
        }
    }
    out
}

/// Advances a SplitMix64 generator by one step and returns its output.
///
/// `state` is incremented (wrapping) by the SplitMix64 increment
/// constant, and the returned value is the finalised mix of the
/// updated state.
#[inline]
fn splitmix64(state: &mut u64) -> u64 {
    const INCREMENT: u64 = 0x9E37_79B9_7F4A_7C15;
    const MUL_A: u64 = 0xBF58_476D_1CE4_E5B9;
    const MUL_B: u64 = 0x94D0_49BB_1331_11EB;

    let next = state.wrapping_add(INCREMENT);
    *state = next;

    let first = (next ^ (next >> 30)).wrapping_mul(MUL_A);
    let second = (first ^ (first >> 27)).wrapping_mul(MUL_B);
    second ^ (second >> 31)
}

/// Minimum buffer length, in bytes, for which `fill_bytes` takes a
/// SIMD path. Below this many bytes, the SIMD setup cost outweighs the
/// per-byte savings, so the scalar generator is used instead.
/// Calibrated for the SplitMix-derived lanes, not for jump-derived
/// lanes (which had a ~4 KiB break-even).
const SIMD_THRESHOLD: usize = 64;

/// Fills `dest` with random bytes, AArch64 flavour.
///
/// Buffers of at least `SIMD_THRESHOLD` (64) bytes are handed to the
/// NEON implementation; anything shorter is filled by the scalar
/// generator, where the SIMD setup would cost more than it saves.
/// Trailing bytes that do not fill a whole SIMD store are produced by
/// the scalar generator as well.
///
/// This is one of three `fill_bytes` definitions in this module;
/// rustc only compiles the one matching the active `target_arch`.
#[cfg(target_arch = "aarch64")]
#[inline]
pub fn fill_bytes(rng: &mut Xoshiro256PlusPlus, dest: &mut [u8]) {
    if dest.len() >= SIMD_THRESHOLD {
        aarch64::fill_bytes_neon(rng, dest);
    } else {
        rng.fill_bytes_scalar(dest);
    }
}

/// Fills `dest` with random bytes, x86_64 flavour.
///
/// The AVX2 implementation is used only when the buffer holds at
/// least `SIMD_THRESHOLD` bytes *and* AVX2 is reported as available;
/// every other case is served by the scalar generator. The length is
/// tested first, so short buffers never pay for feature detection.
#[cfg(target_arch = "x86_64")]
#[inline]
pub fn fill_bytes(rng: &mut Xoshiro256PlusPlus, dest: &mut [u8]) {
    let use_avx2 = dest.len() >= SIMD_THRESHOLD && is_avx2_available();
    if use_avx2 {
        // SAFETY: `use_avx2` is only true when `is_avx2_available`
        // confirmed AVX2 support, which is the sole requirement of
        // `fill_bytes_avx2`.
        unsafe { x86_64::fill_bytes_avx2(rng, dest) };
    } else {
        rng.fill_bytes_scalar(dest);
    }
}

/// Fills `dest` with random bytes on targets that have no SIMD
/// back-end in this module: the request is forwarded unchanged to the
/// scalar generator.
#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
#[inline]
pub fn fill_bytes(rng: &mut Xoshiro256PlusPlus, dest: &mut [u8]) {
    Xoshiro256PlusPlus::fill_bytes_scalar(rng, dest);
}

/// Reports whether the AVX2 back-end may be used.
///
/// With the `std` feature the answer comes from runtime CPU feature
/// detection. Without `std` we can't runtime-detect, so the answer is
/// fixed at compile time by whether the build enables the `avx2`
/// target feature.
#[cfg(target_arch = "x86_64")]
#[inline]
fn is_avx2_available() -> bool {
    #[cfg(feature = "std")]
    {
        std::is_x86_feature_detected!("avx2")
    }
    #[cfg(not(feature = "std"))]
    {
        cfg!(target_feature = "avx2")
    }
}

// --------------------------- AArch64 NEON -----------------------------
//
// 2-lane Xoshiro256++ groups. Each `step` of a group yields 16 output
// bytes; the computation is the scalar update with `uint64x2_t`
// substituted for `u64`. The fill loop interleaves two such groups,
// writing 64 bytes per main-loop iteration. Throughput target on
// Apple M-series: ~20 GB/s (vs. 7.5 GB/s scalar baseline).

#[cfg(target_arch = "aarch64")]
mod aarch64 {
    use super::Xoshiro256PlusPlus;
    use core::arch::aarch64::*;

    /// A group of two independent Xoshiro256++ states stored in
    /// 4 × `uint64x2_t` registers: lane i of register `s[j]` holds
    /// word j of state i.
    struct Lanes {
        s: [uint64x2_t; 4],
    }

    impl Lanes {
        /// Packs two pre-computed Xoshiro256++ states into one group.
        /// `lane0` lands in lane 0 of every register and `lane1` in
        /// lane 1.
        #[inline]
        fn from_pair(lane0: [u64; 4], lane1: [u64; 4]) -> Self {
            // SAFETY: register-only NEON operations; lane indices 0
            // and 1 are both valid for a `uint64x2_t`.
            let pack = |w0: u64, w1: u64| unsafe {
                let low_only = vsetq_lane_u64::<0>(w0, vdupq_n_u64(0));
                vsetq_lane_u64::<1>(w1, low_only)
            };
            Self {
                s: [
                    pack(lane0[0], lane1[0]),
                    pack(lane0[1], lane1[1]),
                    pack(lane0[2], lane1[2]),
                    pack(lane0[3], lane1[3]),
                ],
            }
        }

        /// Returns the four state words currently held in lane 0.
        #[inline]
        fn lane0_state(&self) -> [u64; 4] {
            // SAFETY: lane index 0 is valid for a `uint64x2_t`.
            self.s.map(|reg| unsafe { vgetq_lane_u64::<0>(reg) })
        }

        /// Advances both lanes by one Xoshiro256++ step and returns
        /// their outputs side by side in one `uint64x2_t` (16 bytes
        /// when stored).
        ///
        /// The update order below mirrors the scalar algorithm and
        /// must not be rearranged: later XORs consume the results of
        /// earlier ones.
        #[inline]
        unsafe fn step(&mut self) -> uint64x2_t {
            let [s0, s1, s2, s3] = self.s;

            // out = rotl(s0 + s3, 23) + s0
            let out = vaddq_u64(rotl::<23, 41>(vaddq_u64(s0, s3)), s0);

            // Saved before s1 is overwritten.
            let shifted = vshlq_n_u64::<17>(s1);

            let s2 = veorq_u64(s2, s0);
            let s3 = veorq_u64(s3, s1);
            let s1 = veorq_u64(s1, s2);
            let s0 = veorq_u64(s0, s3);

            let s2 = veorq_u64(s2, shifted);
            let s3 = rotl::<45, 19>(s3);

            self.s = [s0, s1, s2, s3];
            out
        }
    }

    /// Rotates each 64-bit lane of `x` left by `N` bits.
    ///
    /// The rotation is assembled from a left shift by `N` OR-ed with
    /// a right shift by `N_INV`, so callers must pass
    /// `N_INV == 64 - N`.
    #[inline]
    unsafe fn rotl<const N: i32, const N_INV: i32>(
        x: uint64x2_t,
    ) -> uint64x2_t {
        let upper = vshlq_n_u64::<N>(x);
        let lower = vshrq_n_u64::<N_INV>(x);
        vorrq_u64(upper, lower)
    }

    /// Fills `dest` from four SplitMix-derived Xoshiro256++ states
    /// arranged as two 2-lane NEON groups, `a` and `b`.
    ///
    /// Output order:
    /// * every full 64-byte block receives `a, b, a, b` (one step and
    ///   16 bytes each);
    /// * at most one following 32-byte block receives `a, b`;
    /// * at most one following 16-byte block receives `a`.
    ///
    /// Afterwards the scalar state of `rng` is overwritten with lane 0
    /// of group `a`, and any bytes still unfilled (fewer than 16) are
    /// produced by the scalar generator from that state.
    ///
    /// `super::fill_bytes` only dispatches here for buffers of at
    /// least `SIMD_THRESHOLD` bytes, so no separate short-buffer
    /// early-return is needed.
    pub(super) fn fill_bytes_neon(
        rng: &mut Xoshiro256PlusPlus,
        dest: &mut [u8],
    ) {
        let [a0, a1, b0, b1] = super::derive_lanes::<4>(rng);
        let mut group_a = Lanes::from_pair(a0, a1);
        let mut group_b = Lanes::from_pair(b0, b1);

        // Main loop: two steps from each group per 64-byte block.
        let mut wide = dest.chunks_exact_mut(64);
        for block in wide.by_ref() {
            // SAFETY: each step is a pure register update; `block` is
            // exactly 64 bytes long, so the four 16-byte stores at
            // byte offsets 0, 16, 32 and 48 stay inside it.
            unsafe {
                let a_first = group_a.step();
                let b_first = group_b.step();
                let a_second = group_a.step();
                let b_second = group_b.step();
                let out = block.as_mut_ptr() as *mut u64;
                vst1q_u64(out, a_first);
                vst1q_u64(out.add(2), b_first);
                vst1q_u64(out.add(4), a_second);
                vst1q_u64(out.add(6), b_second);
            }
        }

        // Fewer than 64 bytes remain, so this runs at most once.
        let mut half = wide.into_remainder().chunks_exact_mut(32);
        for block in half.by_ref() {
            // SAFETY: `block` is exactly 32 bytes long; the two
            // 16-byte stores at byte offsets 0 and 16 stay inside it.
            unsafe {
                let from_a = group_a.step();
                let from_b = group_b.step();
                let out = block.as_mut_ptr() as *mut u64;
                vst1q_u64(out, from_a);
                vst1q_u64(out.add(2), from_b);
            }
        }

        // Fewer than 32 bytes remain, so this also runs at most once.
        let mut quarter = half.into_remainder().chunks_exact_mut(16);
        for block in quarter.by_ref() {
            // SAFETY: `block` is exactly 16 bytes long, the size of
            // one store.
            unsafe {
                let from_a = group_a.step();
                vst1q_u64(block.as_mut_ptr() as *mut u64, from_a);
            }
        }

        // Hand group a's lane-0 final state back to the scalar
        // generator before it produces the last few bytes.
        rng.set_state(group_a.lane0_state());
        let tail = quarter.into_remainder();
        if !tail.is_empty() {
            rng.fill_bytes_scalar(tail);
        }
    }
}

// ---------------------------- x86_64 AVX2 -----------------------------
//
// 4-lane Xoshiro256++. Each iteration writes 32 output bytes.
// Throughput target on a modern AVX2 part: ~25–40 GB/s.

#[cfg(target_arch = "x86_64")]
mod x86_64 {
    use super::Xoshiro256PlusPlus;
    use core::arch::x86_64::*;

    /// Four independent Xoshiro256++ states stored in 4 × `__m256i`
    /// registers: 64-bit element i of register `s[j]` holds word j of
    /// lane i's state.
    struct Lanes {
        s: [__m256i; 4],
    }

    impl Lanes {
        /// Derives four lane states from `rng` and transposes them
        /// into registers. `rng` is only read here.
        ///
        /// # Safety
        ///
        /// The CPU must support AVX2.
        #[target_feature(enable = "avx2")]
        unsafe fn from_rng(rng: &Xoshiro256PlusPlus) -> Self {
            let [l0, l1, l2, l3] = super::derive_lanes::<4>(rng);
            let mut s = [_mm256_setzero_si256(); 4];
            // Walk word j of all four lanes together.
            let columns = l0.iter().zip(&l1).zip(&l2).zip(&l3);
            for (reg, (((&w0, &w1), &w2), &w3)) in
                s.iter_mut().zip(columns)
            {
                // `_mm256_set_epi64x` lists elements from the highest
                // down to the lowest, hence lane 3 first.
                *reg = _mm256_set_epi64x(
                    w3 as i64,
                    w2 as i64,
                    w1 as i64,
                    w0 as i64,
                );
            }
            Self { s }
        }

        /// Advances all four lanes by one Xoshiro256++ step and
        /// returns their outputs in one `__m256i` (32 bytes when
        /// stored).
        ///
        /// The update order mirrors the scalar algorithm and must not
        /// be rearranged: later XORs consume the results of earlier
        /// ones.
        ///
        /// # Safety
        ///
        /// The CPU must support AVX2.
        #[inline]
        #[target_feature(enable = "avx2")]
        unsafe fn step(&mut self) -> __m256i {
            let [s0, s1, s2, s3] = self.s;

            // out = rotl(s0 + s3, 23) + s0
            let out = _mm256_add_epi64(
                rotl::<23, 41>(_mm256_add_epi64(s0, s3)),
                s0,
            );

            // Saved before s1 is overwritten.
            let shifted = _mm256_slli_epi64::<17>(s1);

            let s2 = _mm256_xor_si256(s2, s0);
            let s3 = _mm256_xor_si256(s3, s1);
            let s1 = _mm256_xor_si256(s1, s2);
            let s0 = _mm256_xor_si256(s0, s3);

            let s2 = _mm256_xor_si256(s2, shifted);
            let s3 = rotl::<45, 19>(s3);

            self.s = [s0, s1, s2, s3];
            out
        }

        /// Returns the four state words currently held in lane 0,
        /// i.e. the lowest 64-bit element of each register.
        ///
        /// # Safety
        ///
        /// The CPU must support AVX2.
        #[target_feature(enable = "avx2")]
        unsafe fn lane0_state(&self) -> [u64; 4] {
            [
                _mm256_extract_epi64::<0>(self.s[0]) as u64,
                _mm256_extract_epi64::<0>(self.s[1]) as u64,
                _mm256_extract_epi64::<0>(self.s[2]) as u64,
                _mm256_extract_epi64::<0>(self.s[3]) as u64,
            ]
        }
    }

    /// Rotates each 64-bit element of `x` left by `N` bits.
    ///
    /// The rotation is assembled from a left shift by `N` OR-ed with
    /// a logical right shift by `N_INV`, so callers must pass
    /// `N_INV == 64 - N`.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2.
    #[inline]
    #[target_feature(enable = "avx2")]
    unsafe fn rotl<const N: i32, const N_INV: i32>(
        x: __m256i,
    ) -> __m256i {
        let upper = _mm256_slli_epi64::<N>(x);
        let lower = _mm256_srli_epi64::<N_INV>(x);
        _mm256_or_si256(upper, lower)
    }

    /// Fills `dest` from four SplitMix-derived Xoshiro256++ lanes,
    /// writing 32 bytes per step.
    ///
    /// Every full 32-byte block of `dest` receives one step's output.
    /// Afterwards the scalar state of `rng` is overwritten with lane
    /// 0's final state, and any bytes still unfilled (fewer than 32)
    /// are produced by the scalar generator from that state.
    ///
    /// `super::fill_bytes` only dispatches here for buffers of at
    /// least `SIMD_THRESHOLD` bytes, which is above 32, so no separate
    /// short-buffer early-return is needed.
    ///
    /// # Safety
    ///
    /// The CPU must support AVX2.
    #[target_feature(enable = "avx2")]
    pub(super) unsafe fn fill_bytes_avx2(
        rng: &mut Xoshiro256PlusPlus,
        dest: &mut [u8],
    ) {
        let mut lanes = Lanes::from_rng(rng);

        let mut blocks = dest.chunks_exact_mut(32);
        for block in blocks.by_ref() {
            let out = lanes.step();
            // `block` is exactly 32 bytes long, the size of one
            // unaligned 256-bit store.
            _mm256_storeu_si256(block.as_mut_ptr() as *mut __m256i, out);
        }

        // Hand lane 0's final state back to the scalar generator
        // before it produces the last few bytes.
        rng.set_state(lanes.lane0_state());
        let tail = blocks.into_remainder();
        if !tail.is_empty() {
            rng.fill_bytes_scalar(tail);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(feature = "alloc")]
    use alloc::vec;
    #[cfg(all(not(feature = "alloc"), feature = "std"))]
    use std::vec;

    /// Statistical smoke test: byte values should be uniformly
    /// distributed. A 64 KiB buffer gives an expected count of 256
    /// per byte value, enough for a χ² goodness-of-fit check against
    /// a fair 8-bit distribution.
    #[test]
    fn fill_produces_uniform_bytes() {
        const LEN: usize = 64 * 1024;

        let mut rng = Xoshiro256PlusPlus::from_u64_seed(0xC0DE_BEEF);
        let mut buf = [0u8; LEN];
        fill_bytes(&mut rng, &mut buf);

        // Histogram of byte values.
        let mut histogram = [0u32; 256];
        for byte in buf.iter().copied() {
            histogram[usize::from(byte)] += 1;
        }

        let expected = (LEN / 256) as f64;
        let mut chi2 = 0.0_f64;
        for &count in histogram.iter() {
            let delta = f64::from(count) - expected;
            chi2 += delta * delta / expected;
        }

        // χ² with 255 degrees of freedom has mean 255; the bound of
        // 500 is deliberately loose so the test holds for whichever
        // back-end (scalar, NEON, AVX2) produced the stream.
        assert!(chi2 < 500.0, "χ² = {chi2} too high");
    }

    /// Exercises lengths below, at and just beyond the SIMD block
    /// sizes so that both the scalar fallback and the scalar tail of
    /// the SIMD path are covered. The same generator is reused across
    /// all lengths.
    #[test]
    fn fill_handles_short_and_unaligned_lengths() {
        let lengths =
            [0usize, 1, 7, 15, 16, 17, 31, 33, 63, 65, 127, 129];
        let mut rng = Xoshiro256PlusPlus::from_u64_seed(1);

        for len in lengths.iter().copied() {
            let mut buf = vec![0u8; len];
            fill_bytes(&mut rng, &mut buf);

            // Only buffers longer than 4 bytes are checked for
            // content; the shorter ones (including the empty buffer)
            // just have to complete without panicking.
            if len <= 4 {
                continue;
            }
            let has_nonzero = buf.iter().any(|&b| b != 0);
            assert!(has_nonzero, "no entropy at len {len}");
        }
    }

    /// Documented contract: from the same seed, the SIMD dispatch and
    /// the scalar generator yield different byte streams. Compiled
    /// only for architectures that have a SIMD back-end here.
    #[test]
    #[cfg(any(target_arch = "aarch64", target_arch = "x86_64"))]
    fn simd_diverges_from_scalar() {
        const SEED: u64 = 42;

        let mut via_dispatch = [0u8; 256];
        let mut rng_dispatch = Xoshiro256PlusPlus::from_u64_seed(SEED);
        fill_bytes(&mut rng_dispatch, &mut via_dispatch);

        let mut via_scalar = [0u8; 256];
        let mut rng_scalar = Xoshiro256PlusPlus::from_u64_seed(SEED);
        rng_scalar.fill_bytes_scalar(&mut via_scalar);

        assert_ne!(
            via_dispatch, via_scalar,
            "SIMD and scalar must diverge"
        );
    }
}