structured-zstd 0.0.27

use crate::cpu_kernel::{CpuKernel, ScalarKernel};
use core::convert::TryInto;
use core::marker::PhantomData;
#[cfg(all(feature = "std", target_arch = "x86_64"))]
use std::sync::OnceLock;

/// Lookup table of low-bit masks: entry `n` has its `n` least-significant
/// bits set, for every `n` in `0..=64`.
///
/// Entries `0..=63` equal `(1u64 << n) - 1`; entry 64 is `u64::MAX`, which
/// the shift formula cannot express because `1u64 << 64` overflows.
///
/// `mask_lower_bits` derives its mask from a shift rather than reading this
/// table. The table is still indexed by the x86-64 BMI2 PEXT triple-extract
/// path (`extract_triple_pext`) to build the `_pext_u64` masks, and by the
/// unit tests that check mask values directly — hence the `test`-or-`x86_64`
/// gate below.
#[cfg(any(test, target_arch = "x86_64"))]
const BIT_MASK: [u64; 65] = {
    // Start from all-ones so the widest entry (n = 64) needs no special
    // case, then overwrite every narrower width with its real mask.
    let mut masks = [u64::MAX; 65];
    let mut width: u32 = 0;
    while width < 64 {
        masks[width as usize] = (1u64 << width) - 1;
        width += 1;
    }
    masks
};

/// Keep the `n` least-significant bits of `value` and clear everything
/// above them.
///
/// When the crate is compiled for x86-64 with the `bmi2` target feature the
/// body is a single `_bzhi_u64` call. On every other configuration the mask
/// is derived as `u64::MAX >> (64 - n)` rather than loaded from `BIT_MASK`.
///
/// Valid widths are `0..=64`. The `debug_assert!(n <= 64)` at the top of the
/// body is the input-validation gate: with debug assertions enabled, an
/// out-of-range width (for example one derived from a malformed table)
/// panics right here. Without it, the fallback path would quietly hand back
/// `0` for `n > 64`, hiding the upstream bug.
// Only the in-file unit tests call this helper; production code goes
// through `K::mask_lower_bits` on the `CpuKernel` trait. The `#[cfg(test)]`
// gate keeps the helper available to those tests without producing a
// `dead_code` warning (fatal under `-D warnings`) in normal builds.
#[cfg(test)]
#[inline(always)]
fn mask_lower_bits(value: u64, n: u8) -> u64 {
    // Must stay the first statement: it is what turns an invalid width into
    // a loud failure instead of a silently wrong result.
    debug_assert!(n <= 64, "mask_lower_bits: n must be <= 64, got {}", n);
    #[cfg(all(target_arch = "x86_64", target_feature = "bmi2"))]
    {
        // SAFETY: this branch is only compiled when the build target itself
        // enables BMI2, so the instruction is available.
        unsafe { core::arch::x86_64::_bzhi_u64(value, u32::from(n)) }
    }
    #[cfg(not(all(target_arch = "x86_64", target_feature = "bmi2")))]
    {
        // `64 - n` is a legal shift count only for `1 <= n <= 64`.
        //   * `n == 0` yields 64, which is out of range for a u64 shift;
        //     the correct mask for zero bits is 0.
        //   * `n > 64` wraps around to a huge count; 0 is used as the
        //     fallback mask for that invalid range as well.
        let shift = 64u32.wrapping_sub(u32::from(n));
        let mask = if shift < u64::BITS {
            u64::MAX >> shift
        } else {
            0
        };
        value & mask
    }
}

#[cfg(all(feature = "std", target_arch = "x86_64"))]
/// Process-wide answer to "should triple extraction use PEXT?", produced by
/// `detect_triple_extract_dispatch` and cached in `TRIPLE_EXTRACT_DISPATCH`.
#[derive(Clone, Copy)]
struct TripleExtractDispatch {
    /// `true` when BMI2 was detected at runtime and `should_use_pext`
    /// accepted the CPU's vendor/family; `false` selects the scalar
    /// shift-and-mask extraction instead.
    use_pext: bool,
}

/// Lazily-initialised, process-wide cache of the PEXT dispatch decision.
///
/// Starts empty; `triple_extract_dispatch` fills it on first use through
/// `OnceLock::get_or_init`, so CPU detection is not repeated on later calls.
#[cfg(all(feature = "std", target_arch = "x86_64"))]
static TRIPLE_EXTRACT_DISPATCH: OnceLock<TripleExtractDispatch> = OnceLock::new();

#[cfg(all(feature = "std", target_arch = "x86_64"))]
/// Decide whether `_pext_u64` is worth using on a CPU that already reports
/// BMI2.
///
/// * `vendor` — the 12-byte CPUID leaf-0 vendor string (EBX, EDX, ECX).
/// * `family` — the display family decoded from CPUID leaf 1 (base family,
///   plus the extended family when the base family is `0xF`).
///
/// Returns `false` for CPUs known to execute PEXT/PDEP through slow
/// microcode, `true` otherwise. This is purely a performance policy: the
/// scalar extraction path produces the same results on every CPU.
///
/// Policy:
/// * `AuthenticAMD`: only family `0x19` (Zen 3) and newer are accepted.
///   Family `0x17` (Zen/Zen+/Zen 2) and the earlier BMI2-capable family
///   `0x15` both implement PEXT in microcode.
/// * `HygonGenuine`: rejected — these parts are derived from Zen 1.
/// * Any other vendor (e.g. `GenuineIntel`): accepted.
#[inline(always)]
fn should_use_pext(vendor: [u8; 12], family: u32) -> bool {
    match &vendor {
        // Fast PEXT arrived with Zen 3; everything older is microcoded.
        b"AuthenticAMD" => family >= 0x19,
        // Zen 1 derivative: same slow microcode path as AMD family 0x17.
        b"HygonGenuine" => false,
        _ => true,
    }
}

/// Return the process-wide PEXT dispatch decision.
///
/// The first call runs `detect_triple_extract_dispatch` and stores the
/// result in `TRIPLE_EXTRACT_DISPATCH`; later calls return the cached value.
#[cfg(all(feature = "std", target_arch = "x86_64"))]
#[inline(always)]
fn triple_extract_dispatch() -> &'static TripleExtractDispatch {
    let cache: &'static OnceLock<TripleExtractDispatch> = &TRIPLE_EXTRACT_DISPATCH;
    cache.get_or_init(detect_triple_extract_dispatch)
}

/// Probe the running CPU and decide whether triple extraction should use
/// PEXT.
///
/// PEXT is ruled out immediately when BMI2 is not detected at runtime.
/// Otherwise the CPUID vendor string (leaf 0) and display family (leaf 1)
/// are handed to `should_use_pext`, which vetoes CPUs where PEXT/PDEP run
/// through a slow microcode path.
#[cfg(all(feature = "std", target_arch = "x86_64"))]
fn detect_triple_extract_dispatch() -> TripleExtractDispatch {
    use core::arch::x86_64::__cpuid;
    use std::arch::is_x86_feature_detected;

    // CPUID is only queried once BMI2 is known to be present.
    let use_pext = is_x86_feature_detected!("bmi2") && {
        // Leaf 0 spells the vendor string across EBX, EDX, ECX — in that
        // order — four little-endian bytes per register.
        let vendor_leaf = __cpuid(0);
        let vendor_registers = [vendor_leaf.ebx, vendor_leaf.edx, vendor_leaf.ecx];
        let mut vendor = [0u8; 12];
        for (chunk, register) in vendor.chunks_exact_mut(4).zip(vendor_registers) {
            chunk.copy_from_slice(&register.to_le_bytes());
        }

        // Leaf 1 EAX: base family in bits 8..12, extended family in bits
        // 20..28. The extended family only contributes when the base family
        // is saturated at 0xF.
        let signature = __cpuid(1).eax;
        let family = match (signature >> 8) & 0xF {
            0xF => 0xF + ((signature >> 20) & 0xFF),
            base_family => base_family,
        };

        should_use_pext(vendor, family)
    };

    TripleExtractDispatch { use_pext }
}

// Test-only helper for the in-file extract_triple correctness tests.
// Since `peek_bits_triple` switched to the per-reader `use_pext_triple`
// cached flag (commit 8805122f), production reaches `extract_triple_pext`
// through that path and no longer calls this wrapper. The `test` gate keeps
// it available to the tests without a `dead_code` warning under
// `-D warnings`.
//
// Returns `None` when the cached dispatch decision says PEXT should not be
// used, otherwise `Some` of the PEXT-extracted triple.
#[cfg(all(test, feature = "std", target_arch = "x86_64"))]
#[inline(always)]
fn try_extract_triple_with_pext(all_three: u64, n1: u8, n2: u8, n3: u8) -> Option<(u64, u64, u64)> {
    if triple_extract_dispatch().use_pext {
        // SAFETY: `use_pext` is only `true` after
        // `detect_triple_extract_dispatch` observed BMI2 through
        // `is_x86_feature_detected!`, which is the one requirement of
        // `extract_triple_pext`.
        Some(unsafe { extract_triple_pext(all_three, n1, n2, n3) })
    } else {
        None
    }
}

/// Split `all_three` into three adjacent bit fields using BMI2 `pext`.
///
/// The fields are laid out as `| n1 bits | n2 bits | n3 bits |`, with the
/// `n3`-bit field in the least-significant position. Returns
/// `(field1, field2, field3)`, each right-aligned.
///
/// # Panics
/// Indexing `BIT_MASK` panics if any of `n1`, `n2`, `n3` exceeds 64.
///
/// # Safety
/// The running CPU must support BMI2: the function is compiled with
/// `#[target_feature(enable = "bmi2")]`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
unsafe fn extract_triple_pext(all_three: u64, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) {
    use core::arch::x86_64::_pext_u64;

    // Bit offset of each field's lowest bit inside `all_three`.
    let offset2 = u32::from(n3);
    let offset1 = u32::from(n2) + u32::from(n3);

    // One mask per field: a run of `width` ones positioned at the field's
    // offset. `pext` then gathers exactly those bits into the low end.
    let low_mask = BIT_MASK[usize::from(n3)];
    let mid_mask = BIT_MASK[usize::from(n2)].wrapping_shl(offset2);
    let high_mask = BIT_MASK[usize::from(n1)].wrapping_shl(offset1);

    let field1 = _pext_u64(all_three, high_mask);
    let field2 = _pext_u64(all_three, mid_mask);
    let field3 = _pext_u64(all_three, low_mask);
    (field1, field2, field3)
}

/// Zstandard encodes some types of data in a way that the data must be read
/// back to front to decode it properly. `BitReaderReversed` provides a
/// convenient interface to do that.
///
/// The reader keeps a 64-bit window of the source in `bit_container` and
/// hands out bits from its most-significant end; `refill` slides the window
/// toward the start of `source` as whole bytes are used up.
///
/// The type parameter `K` selects the `CpuKernel` whose `mask_lower_bits`
/// the peek methods call; it defaults to `ScalarKernel`.
pub struct BitReaderReversed<'s, K: CpuKernel = ScalarKernel> {
    /// Start offset (in bytes) of the 8-byte source window currently
    /// loaded into `bit_container`. Decreases monotonically as bytes
    /// are consumed: `refill` walks it backward toward 0 by
    /// `bits_consumed / 8`, and `bits_remaining()` uses
    /// `index * 8 + (64 - bits_consumed)` to compute how many stream
    /// bits remain. The byte at `source[index]` is the LSB of the
    /// `from_le_bytes` u64 in `bit_container`; the byte at
    /// `source[index + 7]` is the MSB (= the next stream bit at
    /// position 63 of `bit_container`, before any consumption).
    ///
    /// `new` initialises it to `source.len()`, i.e. one window past the
    /// data, so the first refill loads the final 8 bytes.
    ///
    /// `pub(crate)` so the HUF 4-stream burst hot loop in
    /// `decoding::literals_section_decoder` can run donor's
    /// `ip[s] -= nb_bytes; bits[s] = MEM_read64(ip[s]) | 1` reload
    /// pattern directly against the byte stream — see
    /// [`Self::bits_consumed`] for the broader rationale.
    pub(crate) index: usize,

    /// How many bits have been consumed from `bit_container`.
    ///
    /// Expected to stay within `0..=64` (`consume` debug-asserts the upper
    /// bound). `new` starts it at 64, which marks the container as empty.
    ///
    /// `pub(crate)` so the HUF 4-stream hot loop in
    /// `decoding::literals_section_decoder` can lift the reader state
    /// into a local `bits[4]` register layout (donor parity with
    /// `huf_decompress.c:HUF_decompress4X1_usingDTable_internal_fast_c_loop`):
    /// inside the burst, all symbol-decode work happens against a
    /// `bits[s]` u64 that fuses the decoder state with pending input
    /// bits, and the field is written back only at the burst boundary.
    /// Outside the burst the field is treated as opaque internal state.
    pub(crate) bits_consumed: u8,

    /// Correction term maintained by `refill_slow`: every time that path
    /// shifts consumed bits out of the container (filling with zeroes) it
    /// adds the shifted bit count here, and `bits_remaining()` subtracts the
    /// total. It stays zero for as long as refills take the mid-stream path.
    extra_bits: usize,

    /// The source data to read from.
    ///
    /// `pub(crate)` — paired with [`Self::index`], the HUF 4-stream
    /// burst hot loop needs direct slice access for the per-iter
    /// donor-pattern reload (`MEM_read64(source[ip..ip+8])`).
    pub(crate) source: &'s [u8],

    /// The reader doesn't read directly from the source, it reads bits from
    /// here, and the container is "refilled" as it's emptied. Bits are
    /// handed out starting from the most-significant end.
    ///
    /// `pub(crate)` — see [`Self::bits_consumed`] for the rationale.
    pub(crate) bit_container: u64,

    /// Phantom marker for the CPU kernel type parameter `K`. Zero-sized;
    /// drives monomorphisation of methods that route through `K::mask_lower_bits`
    /// without forcing the struct itself to carry runtime kernel state.
    _kernel: PhantomData<K>,

    /// Cached `triple_extract_dispatch().use_pext` snapshot, populated
    /// once in `new()`. `peek_bits_triple` reads this field instead of
    /// re-checking the global `OnceLock` on every sequence — the
    /// per-call atomic load + dispatch-branch was paying ~3 cycles on
    /// every sequence decode (thousands per block × many blocks per
    /// frame). One bool per `BitReaderReversed` lifetime, amortised
    /// across every `peek_bits_triple` in the same decode pass.
    ///
    /// `peek_bits_triple` makes an `unsafe` BMI2 call when this is `true`,
    /// so crate-internal code must not set it to `true` unless BMI2 is
    /// actually available.
    #[cfg(all(feature = "std", target_arch = "x86_64"))]
    pub(crate) use_pext_triple: bool,
}

impl<'s, K: CpuKernel> BitReaderReversed<'s, K> {
    /// Number of stream bits that have not been consumed yet.
    ///
    /// Adds the bits of the `index` source bytes lying below the loaded
    /// window to the unconsumed part of `bit_container`, then subtracts
    /// `extra_bits`. The result goes negative once more bits have been
    /// requested than the source contained.
    pub fn bits_remaining(&self) -> isize {
        let below_window = self.index as isize * 8;
        let in_container = 64 - self.bits_consumed as isize;
        below_window + in_container - self.extra_bits as isize
    }

    /// Whether the bmi2-direct triple-extract path should be used.
    ///
    /// With `std`, this is the `use_pext_triple` flag captured in `new()`
    /// from the global dispatch decision: `true` only when BMI2 was detected
    /// and [`should_use_pext`] accepted the CPU (PEXT on AMD Zen1/Zen2 is
    /// microcoded and slower than the scalar 3× shift+mask path, so the
    /// policy rejects those).
    ///
    /// `no_std` x86_64 builds have no runtime detection (`use_pext_triple`
    /// is std-gated), so the answer is unconditionally `true`: such callers
    /// rely on compile-time `target_feature = "bmi2"` and implicitly trust
    /// that the chosen target CPU has fast PEXT. Avoiding slow-PEXT CPUs is
    /// a build-time concern there — pin a known-good target with
    /// `RUSTFLAGS="-C target-cpu=..."`.
    #[cfg(target_arch = "x86_64")]
    #[inline(always)]
    pub(crate) fn use_pext_triple_fast(&self) -> bool {
        // The whole method is already x86_64-only, so `feature = "std"` is
        // the only condition left to decide whether the cached field exists.
        #[cfg(feature = "std")]
        {
            self.use_pext_triple
        }
        #[cfg(not(feature = "std"))]
        {
            true
        }
    }

    pub fn new(source: &'s [u8]) -> BitReaderReversed<'s, K> {
        BitReaderReversed {
            index: source.len(),
            bits_consumed: 64,
            source,
            bit_container: 0,
            extra_bits: 0,
            _kernel: PhantomData,
            #[cfg(all(feature = "std", target_arch = "x86_64"))]
            use_pext_triple: triple_extract_dispatch().use_pext,
        }
    }

    /// Slide the 64-bit window toward the start of `source` by every byte
    /// that has been fully consumed.
    ///
    /// Does nothing when fewer than 8 bits have been consumed. In the common
    /// mid-stream case (`index` is at least the number of consumed bytes)
    /// the work is: move `index` back, keep only the sub-byte part of
    /// `bits_consumed`, and load 8 bytes. That path is `#[inline(always)]`
    /// so it folds into every caller — refill runs frequently during
    /// sequence decoding and must not be treated as cold.
    ///
    /// The rare end-of-stream situations — the window would reach past the
    /// start of `source`, or the source is exhausted — are delegated to
    /// [`Self::refill_slow`], which is `#[cold] #[inline(never)]`.
    #[inline(always)]
    fn refill(&mut self) {
        let whole_bytes = usize::from(self.bits_consumed >> 3);
        if whole_bytes == 0 {
            return;
        }

        match self.index.checked_sub(whole_bytes) {
            Some(window_start) => {
                // The window can move down by `whole_bytes`. If the reader
                // was not byte aligned, the partially read byte ends up in
                // the highest-order bits of `bit_container`...
                self.index = window_start;
                // ...which is why the sub-byte remainder stays recorded as
                // already consumed.
                self.bits_consumed &= 7;
                self.bit_container =
                    u64::from_le_bytes((&self.source[self.index..][..8]).try_into().unwrap());
            }
            None => self.refill_slow(),
        }

        // Assert that at least `56 = 64 - 8` bits are available to read.
        debug_assert!(self.bits_consumed < 8);
    }

    /// End-of-stream half of [`refill`]: handles every case where moving the
    /// window down by the consumed byte count would go below the start of
    /// `source`. Kept `#[cold] #[inline(never)]` so the hot mid-stream path
    /// folds into call sites without dragging these branches along.
    ///
    /// Three situations are covered:
    /// * `index > 0` — load the first (up to) 8 source bytes, zero-padded
    ///   when the source is shorter than 8 bytes, and shift away the bits
    ///   of that window that were already consumed;
    /// * `index == 0` with fewer than 64 bits consumed — shift the consumed
    ///   bits out, filling with zeroes;
    /// * `index == 0` with the container fully consumed — from now on only
    ///   zeroes are returned.
    ///
    /// Every case leaves `bits_consumed == 0` and adds the shifted/dropped
    /// bit count to `extra_bits`.
    #[cold]
    #[inline(never)]
    fn refill_slow(&mut self) {
        if self.index > 0 {
            // Load the head of the source; a short source is zero-padded at
            // the high end of the window.
            let mut window = [0u8; 8];
            let available = self.source.len().min(8);
            window[..available].copy_from_slice(&self.source[..available]);
            self.bit_container = u64::from_le_bytes(window);

            // The window only moved down by `index` bytes, so that many
            // bytes' worth of consumption is accounted for; whatever is left
            // is shifted out below.
            self.bits_consumed -= 8 * self.index as u8;
            self.index = 0;
        } else if self.bits_consumed >= 64 {
            // All useful bits have already been read: record the consumption
            // and serve zeroes from here on.
            self.extra_bits += self.bits_consumed as usize;
            self.bits_consumed = 0;
            self.bit_container = 0;
            return;
        }

        // Shift out already used bits and fill up with zeroes.
        self.bit_container <<= self.bits_consumed;
        self.extra_bits += self.bits_consumed as usize;
        self.bits_consumed = 0;
    }

    /// Read and consume the next `n` bits of the stream.
    ///
    /// `n` may be at most 56 — the number of bits a refill is guaranteed to
    /// make available. Once the source is exhausted, zero bits are returned
    /// in place of real data.
    #[inline(always)]
    pub fn get_bits(&mut self, n: u8) -> u64 {
        let needs_refill = self.bits_consumed + n > 64;
        if needs_refill {
            self.refill();
        }

        let bits = self.peek_bits(n);
        self.consume(n);
        bits
    }

    /// Make sure the container holds at least `n` unconsumed bits, refilling
    /// if it does not.
    ///
    /// After this call, [`get_bits_unchecked`](Self::get_bits_unchecked) may
    /// be used for a combined total of up to `n` bits without any further
    /// refill checks.
    ///
    /// `n` must be at most 56.
    #[inline(always)]
    pub fn ensure_bits(&mut self, n: u8) {
        debug_assert!(n <= 56);
        if self.bits_consumed + n <= 64 {
            return;
        }
        self.refill();
    }

    /// Read and consume `n` bits **without** a refill check.
    ///
    /// The caller **must** have made the bits available beforehand, e.g.
    /// with [`ensure_bits`](Self::ensure_bits). The precondition is only
    /// verified by debug assertions.
    #[inline(always)]
    pub fn get_bits_unchecked(&mut self, n: u8) -> u64 {
        debug_assert!(n <= 56);
        debug_assert!(
            self.bits_consumed + n <= 64,
            "get_bits_unchecked: not enough bits (consumed={}, requested={})",
            self.bits_consumed,
            n
        );
        let bits = self.peek_bits(n);
        self.consume(n);
        bits
    }

    /// Return the next `n` bits of the stream without consuming them.
    ///
    /// The caller is responsible for having refilled at least `n` bits.
    ///
    /// Branchless: for `n == 0` the mask is zero, so the result is zero
    /// without a dedicated check. The shift uses `wrapping_shr` because the
    /// computed amount is exactly 64 when `bits_consumed == 0` and `n == 0`,
    /// which a plain shift would reject in debug builds.
    #[inline(always)]
    pub fn peek_bits(&mut self, n: u8) -> u64 {
        // n == 0 is valid (branchless no-op); otherwise the caller must
        // guarantee bits_consumed + n <= 64 via ensure_bits / get_bits.
        debug_assert!(
            n == 0 || self.bits_consumed + n <= 64,
            "peek_bits: not enough bits (consumed={}, requested={})",
            self.bits_consumed,
            n
        );
        // Unconsumed bits sit just below the consumed ones at the top of
        // the container; shift so the wanted `n` bits land at the bottom.
        let unconsumed = 64u8 - self.bits_consumed;
        let shift_by = u32::from(unconsumed.wrapping_sub(n));
        let aligned = self.bit_container.wrapping_shr(shift_by);
        K::mask_lower_bits(aligned, n)
    }

    /// Return the next three fields — `n1`, `n2` and `n3` bits wide, in
    /// stream order — without consuming them.
    ///
    /// The caller is responsible for having refilled at least `sum` bits.
    ///
    /// # Contract
    /// `sum` **must** equal `n1 + n2 + n3`. This is enforced by
    /// `debug_assert` but not checked in release builds for performance.
    ///
    /// Branchless on the scalar path: when all widths are zero the masks are
    /// zero, producing `(0, 0, 0)`.
    #[inline(always)]
    pub fn peek_bits_triple(&mut self, sum: u8, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) {
        debug_assert_eq!(
            u16::from(sum),
            u16::from(n1) + u16::from(n2) + u16::from(n3),
            "peek_bits_triple: sum ({}) must equal n1+n2+n3 ({}+{}+{})",
            sum,
            n1,
            n2,
            n3
        );
        debug_assert!(
            sum == 0 || self.bits_consumed + sum <= 64,
            "peek_bits_triple: not enough bits (consumed={}, requested={})",
            self.bits_consumed,
            sum
        );
        // After the shift `all_three` looks like |XXXX..XXX111122223333|:
        // X are already consumed bits, 1/2/3 are the bits of the respective
        // field, and lower bits are to the right.
        let unconsumed = 64u8 - self.bits_consumed;
        let shift_by = u32::from(unconsumed.wrapping_sub(sum));
        let all_three = self.bit_container.wrapping_shr(shift_by);

        #[cfg(all(feature = "std", target_arch = "x86_64"))]
        if self.use_pext_triple {
            // SAFETY: `use_pext_triple` is set in `new()` from
            // `triple_extract_dispatch().use_pext`, which is only `true`
            // when BMI2 was detected at runtime — the same check that
            // `try_extract_triple_with_pext` performs per call.
            return unsafe { extract_triple_pext(all_three, n1, n2, n3) };
        }

        // Scalar fallback: shift each field down to bit 0, then mask it to
        // its own width.
        let offset2 = u32::from(n3);
        let offset1 = u32::from(n3) + u32::from(n2);
        let val1 = K::mask_lower_bits(all_three.wrapping_shr(offset1), n1);
        let val2 = K::mask_lower_bits(all_three.wrapping_shr(offset2), n2);
        let val3 = K::mask_lower_bits(all_three, n3);

        (val1, val2, val3)
    }

    /// BMI2-scoped variant of [`peek_bits`](Self::peek_bits).
    ///
    /// The whole body is compiled under `#[target_feature(enable = "bmi2")]`
    /// and masks with `_bzhi_u64` directly, so the instruction can be
    /// emitted at the call site rather than behind the
    /// `mask_lower_bits_bmi2_impl` CALL boundary (the issue documented in
    /// #279 round 3).
    ///
    /// Intended for callers that are themselves
    /// `#[target_feature(enable = "bmi2")]`-scoped and have verified that
    /// the running CPU supports BMI2.
    ///
    /// # Safety
    /// Caller MUST ensure BMI2 is available on the running CPU. The `bzhi`
    /// instruction faults with #UD on hardware that does not advertise BMI2.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "bmi2")]
    #[inline]
    // NOTE(review): presumably allowed because no production caller is
    // wired up yet — the in-file parity test describes the bmi2 fast-path
    // chain as "when wired". Confirm before removing.
    #[allow(dead_code)]
    pub(crate) unsafe fn peek_bits_bmi2(&mut self, n: u8) -> u64 {
        debug_assert!(
            n == 0 || self.bits_consumed + n <= 64,
            "peek_bits_bmi2: not enough bits (consumed={}, requested={})",
            self.bits_consumed,
            n
        );
        // Same alignment as `peek_bits`: bring the wanted bits down to
        // bit 0, then let `bzhi` clear everything from bit `n` upward.
        let unconsumed = 64u8 - self.bits_consumed;
        let shift_by = u32::from(unconsumed.wrapping_sub(n));
        let aligned = self.bit_container.wrapping_shr(shift_by);
        core::arch::x86_64::_bzhi_u64(aligned, u32::from(n))
    }

    /// BMI2-scoped variant of [`peek_bits_triple`](Self::peek_bits_triple).
    ///
    /// Aligns the three fields exactly like the scalar/K-trait variant and
    /// then always extracts them with `extract_triple_pext`. Both functions
    /// carry `#[target_feature(enable = "bmi2")]`, which is what lets the
    /// extraction be inlined here rather than called across a
    /// feature boundary.
    ///
    /// On AMD Zen1/Zen2 (vendor=AuthenticAMD family=0x17) `_pext_u64` goes
    /// through slow microcode. This method does not look at
    /// `self.use_pext_triple`; callers are expected to consult that flag
    /// (populated at construction from the global dispatch cache) and route
    /// to the scalar variant on those CPUs.
    ///
    /// # Safety
    /// Caller MUST ensure BMI2 is available AND the running CPU benefits
    /// from `_pext_u64` (i.e. not Zen1/Zen2).
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "bmi2")]
    #[inline]
    pub(crate) unsafe fn peek_bits_triple_bmi2(
        &mut self,
        sum: u8,
        n1: u8,
        n2: u8,
        n3: u8,
    ) -> (u64, u64, u64) {
        debug_assert_eq!(
            u16::from(sum),
            u16::from(n1) + u16::from(n2) + u16::from(n3),
            "peek_bits_triple_bmi2: sum ({}) must equal n1+n2+n3 ({}+{}+{})",
            sum,
            n1,
            n2,
            n3
        );
        debug_assert!(
            sum == 0 || self.bits_consumed + sum <= 64,
            "peek_bits_triple_bmi2: not enough bits (consumed={}, requested={})",
            self.bits_consumed,
            sum
        );
        // Bring the `sum` wanted bits down to the bottom of the word; the
        // bits above them are ignored by the PEXT masks.
        let unconsumed = 64u8 - self.bits_consumed;
        let shift_by = u32::from(unconsumed.wrapping_sub(sum));
        let all_three = self.bit_container.wrapping_shr(shift_by);
        // SAFETY: the `# Safety` contract of this method requires BMI2 on
        // the running CPU, which is the only requirement of
        // `extract_triple_pext`.
        unsafe { extract_triple_pext(all_three, n1, n2, n3) }
    }

    /// Mark the next `n` bits of the container as consumed.
    ///
    /// Does not refill; the caller must already have made `n` bits
    /// available. A debug assertion checks that no more than the 64 bits
    /// of the container end up consumed.
    #[inline(always)]
    pub fn consume(&mut self, n: u8) {
        self.bits_consumed = self.bits_consumed + n;
        debug_assert!(self.bits_consumed <= 64);
    }

    /// Read and consume three consecutive fields of `n1`, `n2` and `n3`
    /// bits; equivalent to three `get_bits` calls but slightly faster.
    ///
    /// When the combined width fits into a single refill (at most 56 bits)
    /// one conditional refill via [`ensure_bits`](Self::ensure_bits) covers
    /// all three fields, avoiding redundant work when the container already
    /// holds enough bits. Wider requests fall back to three separate
    /// `get_bits` calls.
    #[inline(always)]
    pub fn get_bits_triple(&mut self, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) {
        // Summed in u16 to avoid u8 overflow (max realistic sum is ~26, but
        // the type system allows up to 3×255).
        let total = u16::from(n1) + u16::from(n2) + u16::from(n3);
        if total > 56 {
            return (self.get_bits(n1), self.get_bits(n2), self.get_bits(n3));
        }

        let sum = total as u8;
        self.ensure_bits(sum);

        let triple = self.peek_bits_triple(sum, n1, n2, n3);
        self.consume(sum);
        triple
    }
}

#[cfg(test)]
mod test {
    #[cfg(all(feature = "std", target_arch = "x86_64"))]
    use std::arch::is_x86_feature_detected;

    #[cfg(all(feature = "std", target_arch = "x86_64"))]
    #[inline]
    fn scalar_extract_triple(all_three: u64, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) {
        let val3 = all_three & super::BIT_MASK[n3 as usize];
        let val2 = all_three.wrapping_shr(u32::from(n3)) & super::BIT_MASK[n2 as usize];
        let val1 =
            all_three.wrapping_shr(u32::from(n2) + u32::from(n3)) & super::BIT_MASK[n1 as usize];
        (val1, val2, val3)
    }

    #[cfg(all(feature = "std", target_arch = "x86_64"))]
    #[inline]
    fn next_test_value(state: &mut u64) -> u64 {
        let mut x = *state;
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        *state = x;
        x
    }

    #[test]
    fn it_works() {
        let data = [0b10101010, 0b01010101];
        let mut br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);
        assert_eq!(br.get_bits(1), 0);
        assert_eq!(br.get_bits(1), 1);
        assert_eq!(br.get_bits(1), 0);
        assert_eq!(br.get_bits(4), 0b1010);
        assert_eq!(br.get_bits(4), 0b1101);
        assert_eq!(br.get_bits(4), 0b0101);
        // Last 0 from source, three zeroes filled in
        assert_eq!(br.get_bits(4), 0b0000);
        // All zeroes filled in
        assert_eq!(br.get_bits(4), 0b0000);
        assert_eq!(br.bits_remaining(), -7);
    }

    /// Verify that `ensure_bits(n)` + `get_bits_unchecked(..)` returns the same
    /// values as plain `get_bits(..)`, including across refill boundaries and
    /// for edge cases like n=0.
    #[test]
    fn ensure_and_unchecked_match_get_bits() {
        // 10 bytes = 80 bits — enough to force multiple refills
        let data: [u8; 10] = [0xDE, 0xAD, 0xBE, 0xEF, 0x42, 0x13, 0x37, 0xCA, 0xFE, 0x01];

        // Reference: read with get_bits
        let mut ref_br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);
        let r1 = ref_br.get_bits(0);
        let r2 = ref_br.get_bits(7);
        let r3 = ref_br.get_bits(13);
        let r4 = ref_br.get_bits(9);
        let r5 = ref_br.get_bits(8);
        let r5b = ref_br.get_bits(2);
        // After 39 bits consumed, ensure_bits(26) triggers a real refill
        // because 39 + 26 = 65 > 64.
        let r6 = ref_br.get_bits(9);
        let r7 = ref_br.get_bits(9);
        let r8 = ref_br.get_bits(8);

        // Unchecked path: same reads via ensure_bits + get_bits_unchecked
        let mut fast_br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);

        // n=0 edge case
        fast_br.ensure_bits(0);
        assert_eq!(fast_br.get_bits_unchecked(0), r1);

        // Single reads
        fast_br.ensure_bits(7);
        assert_eq!(fast_br.get_bits_unchecked(7), r2);

        fast_br.ensure_bits(13);
        assert_eq!(fast_br.get_bits_unchecked(13), r3);

        fast_br.ensure_bits(9);
        assert_eq!(fast_br.get_bits_unchecked(9), r4);

        fast_br.ensure_bits(8);
        assert_eq!(fast_br.get_bits_unchecked(8), r5);

        fast_br.ensure_bits(2);
        assert_eq!(fast_br.get_bits_unchecked(2), r5b);

        // Batched: one ensure covering 9+9+8 = 26 bits.
        // At 39 bits consumed, this forces a real refill (39+26=65 > 64).
        fast_br.ensure_bits(26);
        assert_eq!(fast_br.get_bits_unchecked(9), r6);
        assert_eq!(fast_br.get_bits_unchecked(9), r7);
        assert_eq!(fast_br.get_bits_unchecked(8), r8);

        assert_eq!(ref_br.bits_remaining(), fast_br.bits_remaining());
    }

    /// Verify that the pre-computed BIT_MASK table produces correct values.
    #[test]
    fn mask_table_correctness() {
        assert_eq!(super::BIT_MASK[0], 0);
        assert_eq!(super::BIT_MASK[1], 1);
        assert_eq!(super::BIT_MASK[8], 0xFF);
        assert_eq!(super::BIT_MASK[16], 0xFFFF);
        assert_eq!(super::BIT_MASK[32], 0xFFFF_FFFF);
        assert_eq!(super::BIT_MASK[63], (1u64 << 63) - 1);
        assert_eq!(super::BIT_MASK[64], u64::MAX);
        for n in 0..64u32 {
            assert_eq!(
                super::BIT_MASK[n as usize],
                (1u64 << n) - 1,
                "BIT_MASK[{n}] mismatch"
            );
        }
    }

    /// Verify mask_lower_bits matches manual computation for edge values.
    #[test]
    fn mask_lower_bits_edge_cases() {
        assert_eq!(super::mask_lower_bits(u64::MAX, 0), 0);
        assert_eq!(super::mask_lower_bits(u64::MAX, 1), 1);
        assert_eq!(
            super::mask_lower_bits(0xABCD_1234_5678_9ABC, 64),
            0xABCD_1234_5678_9ABC
        );
        assert_eq!(super::mask_lower_bits(0xABCD_1234_5678_9ABC, 8), 0xBC);
        assert_eq!(super::mask_lower_bits(0xABCD_1234_5678_9ABC, 16), 0x9ABC);
    }

    /// peek_bits(0) must return 0 in all states, including when
    /// bits_consumed is 0 (post-exhaustion refill).
    #[test]
    fn peek_bits_zero_is_always_zero() {
        let data = [0xFF; 8];
        let mut br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);

        // Initial state: bits_consumed = 64
        assert_eq!(br.peek_bits(0), 0);

        // After reading some bits: bits_consumed < 64
        br.get_bits(7);
        assert_eq!(br.peek_bits(0), 0);

        // Force bits_consumed == 0 to exercise the shift-by-64 edge case
        // in peek_bits. This state occurs naturally during refill() when the
        // source is exhausted. We set it directly because get_bits always
        // calls consume(n) after refill, making bits_consumed > 0 by the
        // time it returns.
        br.bits_consumed = 0;
        assert_eq!(br.peek_bits(0), 0);
    }

    /// get_bits_triple must produce the same values as three individual
    /// get_bits calls, both with and without a refill in between.
    #[test]
    fn get_bits_triple_matches_individual() {
        let data: [u8; 16] = [
            0xDE, 0xAD, 0xBE, 0xEF, 0x42, 0x13, 0x37, 0xCA, 0xFE, 0x01, 0x99, 0x88, 0x77, 0x66,
            0x55, 0x44,
        ];

        // Reference: individual reads
        let mut ref_br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);
        let r1 = ref_br.get_bits(8);
        let r2 = ref_br.get_bits(9);
        let r3 = ref_br.get_bits(9);

        // Triple read
        let mut triple_br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);
        let (t1, t2, t3) = triple_br.get_bits_triple(8, 9, 9);

        assert_eq!((r1, r2, r3), (t1, t2, t3));
        assert_eq!(ref_br.bits_remaining(), triple_br.bits_remaining());

        // No-refill fast path: 8 bits already consumed, so the next 26 bits
        // still fit in the current container and `ensure_bits(26)` should
        // skip `refill()`.
        let mut ref_br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);
        let mut triple_br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);
        let _ = ref_br.get_bits(8);
        let _ = triple_br.get_bits(8);

        let r1 = ref_br.get_bits(8);
        let r2 = ref_br.get_bits(9);
        let r3 = ref_br.get_bits(9);
        let (t1, t2, t3) = triple_br.get_bits_triple(8, 9, 9);

        assert_eq!((r1, r2, r3), (t1, t2, t3));
        assert_eq!(ref_br.bits_remaining(), triple_br.bits_remaining());

        // Mixed zero-widths: individual sequence extra-bit fields can be zero.
        let mut ref_br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);
        let mut triple_br = super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&data);

        let r1 = ref_br.get_bits(5);
        let r2 = ref_br.get_bits(0);
        let r3 = ref_br.get_bits(4);
        let (t1, t2, t3) = triple_br.get_bits_triple(5, 0, 4);

        assert_eq!((r1, r2, r3), (t1, t2, t3));
        assert_eq!(ref_br.bits_remaining(), triple_br.bits_remaining());
    }

    /// Parity check: the BMI2 peek must agree with the scalar peek.
    ///
    /// For every probed width `n`, a fresh reader over the same 16-byte
    /// buffer is topped up with `ensure_bits(n)` and peeked once through
    /// `peek_bits` and once through `peek_bits_bmi2`; the two values must
    /// be equal. On hosts where BMI2 is not detected at runtime the test
    /// returns immediately without asserting anything.
    #[cfg(all(feature = "std", target_arch = "x86_64"))]
    #[test]
    fn peek_bits_bmi2_matches_scalar() {
        let bmi2_available = is_x86_feature_detected!("bmi2");
        if !bmi2_available {
            return;
        }

        let bytes: [u8; 16] = [
            0xDE, 0xAD, 0xBE, 0xEF, 0x42, 0x13, 0x37, 0xCA, 0xFE, 0x01, 0x99, 0x88, 0x77, 0x66,
            0x55, 0x44,
        ];
        // Both sides read through an identically constructed reader, so any
        // difference can only come from the peek implementation itself.
        let fresh_reader =
            || super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&bytes);

        for n in [0u8, 1, 5, 8, 13, 24, 32, 48, 56] {
            let expected = {
                let mut reader = fresh_reader();
                reader.ensure_bits(n);
                reader.peek_bits(n)
            };
            let actual = {
                let mut reader = fresh_reader();
                reader.ensure_bits(n);
                // SAFETY: BMI2 availability was confirmed by the runtime
                // feature check at the top of this test.
                unsafe { reader.peek_bits_bmi2(n) }
            };
            assert_eq!(expected, actual, "mismatch at n={}", n);
        }
    }

    /// Parity check: the BMI2 triple peek must agree with the scalar one.
    ///
    /// Each `(n1, n2, n3)` row is summed, two identically constructed
    /// readers are topped up with `ensure_bits(sum)`, and the tuple from
    /// `peek_bits_triple_bmi2` is required to equal the tuple from
    /// `peek_bits_triple`. The rows include all-zero widths and a row with
    /// a zero-width middle field. The test returns immediately when BMI2
    /// is not detected at runtime.
    #[cfg(all(feature = "std", target_arch = "x86_64"))]
    #[test]
    fn peek_bits_triple_bmi2_matches_scalar() {
        let bmi2_available = is_x86_feature_detected!("bmi2");
        if !bmi2_available {
            return;
        }

        let bytes: [u8; 16] = [
            0xDE, 0xAD, 0xBE, 0xEF, 0x42, 0x13, 0x37, 0xCA, 0xFE, 0x01, 0x99, 0x88, 0x77, 0x66,
            0x55, 0x44,
        ];
        // Both sides read through an identically constructed reader, so any
        // difference can only come from the peek implementation itself.
        let fresh_reader =
            || super::BitReaderReversed::<crate::cpu_kernel::ScalarKernel>::new(&bytes);

        let width_rows = [
            (0, 0, 0),
            (1, 1, 1),
            (3, 5, 7),
            (8, 8, 8),
            (15, 16, 17),
            (5, 0, 4),
        ];
        for (n1, n2, n3) in width_rows {
            let total = n1 + n2 + n3;

            let expected = {
                let mut reader = fresh_reader();
                reader.ensure_bits(total);
                reader.peek_bits_triple(total, n1, n2, n3)
            };
            let actual = {
                let mut reader = fresh_reader();
                reader.ensure_bits(total);
                // SAFETY: BMI2 availability was confirmed by the runtime
                // feature check at the top of this test.
                unsafe { reader.peek_bits_triple_bmi2(total, n1, n2, n3) }
            };
            assert_eq!(
                expected, actual,
                "mismatch at widths=({},{},{})",
                n1, n2, n3
            );
        }
    }

    /// Pins the vendor/family decisions returned by `should_use_pext`.
    ///
    /// Each row is `(vendor bytes, family, expected decision)`. The table
    /// expects `false` for `AuthenticAMD` family `0x17` and `true` for
    /// `AuthenticAMD` family `0x19` and `GenuineIntel` family `0x06`.
    ///
    /// NOTE(review): the AMD `0x17` exclusion presumably reflects slow
    /// (microcoded) PEXT on that family — confirm against the rationale
    /// documented on `should_use_pext` itself.
    #[cfg(all(feature = "std", target_arch = "x86_64"))]
    #[test]
    fn should_use_pext_policy_table() {
        let cases = [
            (*b"AuthenticAMD", 0x17, false),
            (*b"AuthenticAMD", 0x19, true),
            (*b"GenuineIntel", 0x06, true),
        ];

        for (vendor, family, expected) in cases {
            // Name the failing row in the panic message; without it a
            // failure inside the loop cannot be attributed to a case.
            assert_eq!(
                super::should_use_pext(vendor, family),
                expected,
                "should_use_pext policy mismatch for vendor={}, family={:#x}",
                core::str::from_utf8(&vendor).unwrap_or("<non-utf8>"),
                family
            );
        }
    }

    /// Cross-checks the PEXT-based triple extraction against the scalar
    /// reference helper.
    ///
    /// For every `(n1, n2, n3)` width row the test feeds first a set of
    /// fixed bit patterns and then 64 values drawn from `next_test_value`
    /// (seeded once, so the sequence is deterministic) through:
    ///
    /// * `scalar_extract_triple` — the expected result,
    /// * `extract_triple_pext` — must always equal the expected result,
    /// * `try_extract_triple_with_pext` — must equal the expected result
    ///   whenever it returns `Some`; a `None` is accepted and not checked.
    ///
    /// The width rows include zero-width fields and a row whose widths sum
    /// to 64. The test returns immediately when BMI2 is not detected at
    /// runtime.
    #[cfg(all(feature = "std", target_arch = "x86_64"))]
    #[test]
    fn bmi2_triple_extract_matches_scalar_reference() {
        if !is_x86_feature_detected!("bmi2") {
            return;
        }

        let widths = [
            (0, 0, 0),
            (1, 1, 1),
            (3, 5, 7),
            (8, 8, 8),
            (15, 16, 17),
            (21, 21, 21),
            (0, 13, 27),
            (31, 0, 1),
            (1, 31, 0),
            (20, 20, 24),
        ];
        let fixed_values = [
            0,
            1,
            u64::MAX,
            0x0123_4567_89AB_CDEF,
            0xFEDC_BA98_7654_3210,
            0xAAAA_AAAA_AAAA_AAAA,
            0x5555_5555_5555_5555,
            1u64 << 63,
            (1u64 << 32) - 1,
        ];

        // Single definition of the per-input check, shared by the fixed and
        // the pseudo-random sweeps so the two cannot drift apart.
        let check = |all_three: u64, n1, n2, n3| {
            let expected = scalar_extract_triple(all_three, n1, n2, n3);

            // SAFETY: gated on `is_x86_feature_detected!("bmi2")` above.
            let pext = unsafe { super::extract_triple_pext(all_three, n1, n2, n3) };
            assert_eq!(
                pext, expected,
                "extract_triple_pext mismatch for value={:#018x}, widths=({},{},{})",
                all_three, n1, n2, n3
            );

            // The dispatching wrapper may decline (`None`); only a produced
            // value is held to the reference.
            if let Some(dispatched) = super::try_extract_triple_with_pext(all_three, n1, n2, n3)
            {
                assert_eq!(
                    dispatched, expected,
                    "try_extract_triple_with_pext mismatch for value={:#018x}, widths=({},{},{})",
                    all_three, n1, n2, n3
                );
            }
        };

        // Sweep 1: hand-picked bit patterns.
        for &(n1, n2, n3) in &widths {
            for &all_three in &fixed_values {
                check(all_three, n1, n2, n3);
            }
        }

        // Sweep 2: 64 generated values per width row; the generator state is
        // carried across rows, so each row sees a different slice of the
        // sequence.
        let mut state = 0xD6E8_FD9D_5A2C_19B7u64;
        for &(n1, n2, n3) in &widths {
            for _ in 0..64 {
                let all_three = next_test_value(&mut state);
                check(all_three, n1, n2, n3);
            }
        }
    }
}