lac 0.1.0 - Docs.rs

//! Partitioned Rice entropy coding for LPC residuals.
//!
//! Rice codes are the power-of-two subset of Golomb codes, which are the
//! optimal prefix codes for geometrically distributed non-negative integers —
//! a close match for the distribution LPC residuals follow after zigzag
//! mapping. Signed residuals are zigzag-mapped to unsigned integers, then
//! Rice-coded with a per-partition parameter `k`.
//!
//! # Why partitioned
//!
//! Audio residual magnitudes vary substantially within a frame: silence
//! between notes, transients during note attacks, sustain during the body
//! of a note. A single frame-wide `k` picks a compromise value that's
//! wrong for most samples. Splitting the residual stream into
//! `2^partition_order` equal-size partitions and choosing `k` per partition
//! adapts the parameter to local activity, typically saving 5-15% over a
//! single-k frame on music.
//!
//! # Codeword structure
//!
//! For a zigzag value `v` with Rice parameter `k`:
//!
//! ```text
//! unary part:    (v >> k) zero-bits, followed by a single 1-bit
//! remainder:     k bits of (v & ((1<<k)-1)), MSB-first
//! ```
//!
//! Total bits per codeword = `(v >> k) + 1 + k`.
//!
//! # Wire layout
//!
//! The partitioned Rice payload is a flat bitstream:
//!
//! ```text
//! partition 0:  5-bit k_0, then Rice codewords for residuals[0 .. N/P]
//! partition 1:  5-bit k_1, then Rice codewords for residuals[N/P .. 2N/P]
//! …
//! partition P-1: 5-bit k_{P-1}, then Rice codewords for residuals[(P-1)N/P .. N]
//! ```
//!
//! where `P = 1 << partition_order`, `N` is the residual count. The frame
//! header carries `partition_order` so the decoder can reconstruct the layout.

use alloc::vec::Vec;

use crate::bit_io::{BitReader, BitWriter};
use crate::frame::DecodeError;
use crate::{MAX_PARTITION_ORDER, MAX_RICE_K};

/// Width, in bits, of the per-partition Rice parameter `k` field in the
/// bitstream.
///
/// Five bits can represent 0..=31, which covers `MAX_RICE_K = 23` with
/// headroom. The spare codes are not valid parameters: the decoder rejects
/// any `k` above `MAX_RICE_K` as malformed.
pub const RICE_K_BITS: u8 = 5;

// ── Zigzag mapping ──────────────────────────────────────────────────────────

/// Fold a signed residual onto the non-negative integers.
///
/// Values are interleaved by magnitude so that small residuals of either
/// sign become small unsigned integers:
///
/// ```text
/// 0 → 0, −1 → 1, 1 → 2, −2 → 3, 2 → 4, …
/// ```
///
/// Computed branchlessly as `(n << 1) ^ (n >> 31)`: the arithmetic right
/// shift yields 0 for non-negative `n` and all-ones for negative `n`, and
/// XOR-ing that mask into the doubled value inverts every bit for negative
/// inputs, which lands them on the odd outputs. Defined for every `i32`,
/// including `i32::MIN` (which maps to `u32::MAX`).
#[inline]
pub fn zigzag(n: i32) -> u32 {
    // Sign smeared across the whole word: 0x0000_0000 or 0xFFFF_FFFF.
    let sign_mask = (n >> 31) as u32;
    // Doubling on the unsigned bit pattern is bit-identical to `n << 1`.
    let doubled = (n as u32) << 1;
    doubled ^ sign_mask
}

/// Invert [`zigzag`]: turn an unsigned zigzag value back into the signed
/// residual it encodes.
///
/// Computed as `(z >> 1) ^ -(z & 1)`. The low bit of `z` carries the sign:
/// negating it gives 0 for even `z` and −1 (all-ones) for odd `z`, and
/// XOR-ing that mask into the logically shifted magnitude undoes the
/// interleaving.
#[inline]
pub fn unzigzag(z: u32) -> i32 {
    let magnitude = (z >> 1) as i32;
    // 0 when `z` is even, -1 (all-ones) when `z` is odd.
    let flip = ((z & 1) as i32).wrapping_neg();
    magnitude ^ flip
}

// ── Per-partition k selection ───────────────────────────────────────────────

/// Exact number of bits needed to Rice-code `zigzag_vals` with parameter `k`
/// (codewords only; the per-partition `k` header is not included).
///
/// Every codeword costs a fixed `1 + k` bits (terminator plus remainder)
/// and a data-dependent `v >> k` unary zero-bits, so the total is
/// `N × (1 + k) + Σ (v >> k)`. The quotient sum is the only part that
/// touches the data; it is kept as a plain `u64` reduction so the compiler
/// is free to vectorise the shift-and-accumulate.
///
/// `k` must be below 32 (callers stay within `0..=MAX_RICE_K`).
#[inline]
fn rice_cost(zigzag_vals: &[u32], k: u8) -> usize {
    let quotient_bits: u64 = zigzag_vals.iter().map(|&v| (v >> k) as u64).sum();
    let fixed_bits = zigzag_vals.len() * (1 + k as usize);
    quotient_bits as usize + fixed_bits
}

/// Pick the Rice parameter `k ∈ [0, MAX_RICE_K]` with the lowest total bit
/// cost for `zigzag_vals`, returning `(k, total_cost_bits)`.
///
/// The search starts from a closed-form guess, `floor(log2(mean))`, and
/// then walks one step at a time towards lower cost. This is sound because
/// `rice_cost` is convex in `k`: the fixed part `N(1 + k)` grows linearly,
/// while the amount `Σ(v >> k)` drops by per step never increases as `k`
/// grows. A convex curve has a single basin, so the first `k` at which the
/// cost stops improving is the minimum — typically a handful of
/// evaluations instead of a scan over every `k`.
///
/// Ties resolve to the smallest `k`, so the result is the same `(k, cost)`
/// pair an exhaustive search over `0..=MAX_RICE_K` produces (checked by
/// the `select_k_matches_exhaustive` test).
///
/// An empty slice yields `(0, 0)`; an all-zero slice yields `(0, len)`.
pub fn select_k(zigzag_vals: &[u32]) -> (u8, usize) {
    let count = zigzag_vals.len();
    // Nothing to encode: k = 0 at zero bits.
    if count == 0 {
        return (0, 0);
    }

    let total = zigzag_vals.iter().fold(0u64, |acc, &v| acc + v as u64);
    // All zeros: each codeword is just its terminator bit at k = 0, which
    // cannot be beaten. Also keeps a zero mean away from the log2 seed.
    if total == 0 {
        return (0, count);
    }

    // Seed near the theoretical optimum for a geometric source,
    // floor(log2(mean)), clamped to the legal range. A mean that truncates
    // to zero seeds at k = 0.
    let k_seed = match total / count as u64 {
        0 => 0,
        mean => (63 - mean.leading_zeros() as u8).min(MAX_RICE_K),
    };

    let cost_at = |k: u8| rice_cost(zigzag_vals, k);
    let mut best_k = k_seed;
    let mut best_cost = cost_at(k_seed);

    // Walk downwards first. Equal cost still counts as progress so that a
    // flat minimum resolves to its smallest `k`, as the exhaustive scan
    // would.
    while best_k > 0 {
        let candidate = cost_at(best_k - 1);
        if candidate > best_cost {
            break;
        }
        best_cost = candidate;
        best_k -= 1;
    }

    // If the downward walk moved, convexity says the minimum was on that
    // side and there is nothing to gain above the seed.
    if best_k != k_seed {
        return (best_k, best_cost);
    }

    // Otherwise try upwards, accepting only strict improvements so ties
    // stay on the smaller `k`.
    while best_k < MAX_RICE_K {
        let candidate = cost_at(best_k + 1);
        if candidate >= best_cost {
            break;
        }
        best_cost = candidate;
        best_k += 1;
    }

    (best_k, best_cost)
}

/// Reference implementation of `select_k`: evaluates every `k` in
/// `0..=MAX_RICE_K` and keeps the cheapest, preferring the smallest `k` on
/// ties. Exists only as the oracle for differential tests of the fast
/// search; never used on the hot path.
#[cfg(test)]
fn select_k_exhaustive(zigzag_vals: &[u32]) -> (u8, usize) {
    (0..=MAX_RICE_K).fold((0u8, usize::MAX), |best, k| {
        let cost = rice_cost(zigzag_vals, k);
        // Strict comparison: an equal cost at a larger `k` never displaces
        // the earlier winner.
        if cost < best.1 {
            (k, cost)
        } else {
            best
        }
    })
}

// ── Encoding ────────────────────────────────────────────────────────────────

/// Encode signed `residuals` as a partitioned Rice bitstream and return
/// the bytes.
///
/// The residuals are zigzag-mapped, split into `1 << partition_order`
/// equal-size partitions, and each partition is written as a 5-bit `k`
/// followed by its Rice codewords, with `k` chosen per partition.
///
/// # Preconditions
///
/// - `partition_order ≤ MAX_PARTITION_ORDER` (debug-asserted downstream).
/// - `residuals.len()` must be a multiple of `1 << partition_order`
///   (debug-asserted downstream). The caller — `frame::encode_frame` —
///   guarantees this by only trying partition orders that divide the frame
///   size evenly.
#[cfg(test)]
pub(crate) fn rice_encode(residuals: &[i32], partition_order: u8) -> Vec<u8> {
    // Map the whole buffer once; the partitions are then plain sub-slices
    // of this one allocation rather than a fresh `Vec<u32>` each.
    let zigzag_buf: Vec<u32> = residuals.iter().copied().map(zigzag).collect();
    let mut encoded = Vec::new();
    rice_encode_zigzag_into(&zigzag_buf, partition_order, &mut encoded);
    encoded
}

/// Rice-encode already-zigzagged values, appending the payload to `out`.
///
/// Existing bytes in `out` are kept and the payload is written after
/// them. That lets `frame::encode_frame_into` put the fixed frame header
/// into the buffer first and then hand the same buffer over for the Rice
/// payload, with no intermediate allocation or copy.
///
/// This is the entry point for callers that already hold a zigzag vector
/// (e.g. `frame::encode_frame_into`, which zigzags once for cost
/// estimation); it avoids the second mapping pass `rice_encode` would do.
///
/// # Preconditions
///
/// Both are checked with debug assertions only:
///
/// - `partition_order ≤ MAX_PARTITION_ORDER`.
/// - `zigzag_vals.len()` is a multiple of `1 << partition_order`.
pub fn rice_encode_zigzag_into(zigzag_vals: &[u32], partition_order: u8, out: &mut Vec<u8>) {
    debug_assert!(
        partition_order <= MAX_PARTITION_ORDER,
        "partition_order={partition_order} exceeds MAX_PARTITION_ORDER={MAX_PARTITION_ORDER}"
    );
    let n_partitions = 1usize << partition_order;
    debug_assert_eq!(
        zigzag_vals.len() % n_partitions,
        0,
        "zigzag_vals.len()={} is not a multiple of partition count={}",
        zigzag_vals.len(),
        n_partitions
    );
    let partition_size = zigzag_vals.len() / n_partitions;

    let mut w = BitWriter::new(out);
    // Peel one partition at a time off the front of the remaining input.
    let mut remaining = zigzag_vals;
    for _ in 0..n_partitions {
        let (partition, rest) = remaining.split_at(partition_size);
        remaining = rest;

        let (k, _) = select_k(partition);
        // Partition header (the 5-bit `k`), then the codewords themselves.
        w.write_bits(k as u32, RICE_K_BITS);
        write_rice_partition(&mut w, partition, k);
    }
    w.finish();
}

/// Emit the Rice codewords of one partition using parameter `k`.
///
/// A codeword is `v >> k` zero-bits, a single 1-bit terminator, and then
/// the low `k` bits of `v` written MSB-first. The partition's `k` header is
/// the caller's responsibility.
fn write_rice_partition(w: &mut BitWriter<'_>, zigzag_vals: &[u32], k: u8) {
    // At k = 0 there is no remainder field at all: the mask is zero and the
    // remainder write is skipped entirely.
    let has_remainder = k > 0;
    let low_mask = if has_remainder { (1u32 << k) - 1 } else { 0 };

    for &value in zigzag_vals {
        // Unary quotient: that many zeros, closed by a one.
        let mut zeros_left = value >> k;
        while zeros_left > 0 {
            w.write_bit(false);
            zeros_left -= 1;
        }
        w.write_bit(true);

        // Binary remainder.
        if has_remainder {
            w.write_bits(value & low_mask, k);
        }
    }
}

// ── Decoding ────────────────────────────────────────────────────────────────

/// Decode a partitioned Rice bitstream into a freshly allocated vector of
/// residuals.
///
/// # Parameters
///
/// - `data`: the per-partition `(5-bit k | Rice codewords)` records,
///   concatenated.
/// - `partition_order`: the value the encoder used, as carried in the frame
///   header.
/// - `total_count`: number of residuals expected in total; must be a
///   multiple of `1 << partition_order`.
///
/// # Errors
///
/// - `DecodeError::InvalidParameter` if a partition declares `k` above
///   `MAX_RICE_K`, or if `total_count` is not a multiple of the partition
///   count.
/// - `DecodeError::Truncated` if the bitstream runs out in the middle of a
///   codeword or before every residual has been produced.
#[cfg(test)]
pub(crate) fn rice_decode(
    data: &[u8],
    partition_order: u8,
    total_count: usize,
) -> Result<Vec<i32>, DecodeError> {
    let mut residuals = Vec::with_capacity(total_count);
    rice_decode_into(data, partition_order, total_count, &mut residuals).map(|()| residuals)
}

/// Decode a partitioned Rice bitstream into a caller-provided buffer.
///
/// `out` is cleared first and holds exactly `total_count` residuals on
/// success. On error it is left in an unspecified but valid state
/// (typically a prefix of the expected output).
///
/// # Parameters
///
/// - `data`: the per-partition `(5-bit k | Rice codewords)` records,
///   concatenated.
/// - `partition_order`: the value the encoder used, as carried in the frame
///   header.
/// - `total_count`: number of residuals expected in total; must be a
///   multiple of `1 << partition_order`.
///
/// # Errors
///
/// - `DecodeError::InvalidParameter` if `partition_order` exceeds
///   `MAX_PARTITION_ORDER`, if `total_count` is not a multiple of the
///   partition count, if a partition declares `k` above `MAX_RICE_K`, or if
///   a unary run is too long for the assembled value to fit in a `u32`.
/// - `DecodeError::Truncated` if the bitstream runs out in the middle of a
///   codeword or before every residual has been produced.
pub fn rice_decode_into(
    data: &[u8],
    partition_order: u8,
    total_count: usize,
    out: &mut Vec<i32>,
) -> Result<(), DecodeError> {
    out.clear();

    // Validate the layout parameters before touching the allocator, so a
    // bad header costs nothing and can only ever surface as an error.
    if partition_order > MAX_PARTITION_ORDER {
        return Err(DecodeError::InvalidParameter);
    }
    let n_partitions = 1usize << partition_order;
    if !total_count.is_multiple_of(n_partitions) {
        return Err(DecodeError::InvalidParameter);
    }
    let partition_size = total_count / n_partitions;

    // `total_count` may come from an untrusted header, so it must not size
    // the allocation on its own: `reserve` panics or aborts on an absurd
    // request. Every codeword consumes at least its terminator bit, so the
    // input cannot yield more residuals than it has bits. For well-formed
    // streams the cap is never the smaller value; for oversized counts the
    // loop below simply runs out of input and reports `Truncated`.
    out.reserve(total_count.min(data.len().saturating_mul(8)));

    let mut r = BitReader::new(data);
    for _ in 0..n_partitions {
        // Partition header: the 5-bit Rice parameter.
        let k = r.read_bits(RICE_K_BITS).ok_or(DecodeError::Truncated)? as u8;
        if k > MAX_RICE_K {
            return Err(DecodeError::InvalidParameter);
        }

        // Largest unary quotient that still lets `(q << k) | remainder`
        // fit in a u32. The bound depends on `k`: `u32::MAX` at `k = 0`,
        // down to `2^9 − 1 = 511` at `k = MAX_RICE_K = 23`. Anything above
        // it is a malformed stream, and accepting it would silently drop
        // the high bits of the shifted quotient.
        let q_max: u32 = u32::MAX >> k;

        for _ in 0..partition_size {
            // Unary prefix. Running off the end of the buffer while
            // looking for the terminator is a structural error.
            let q = r.read_unary().ok_or(DecodeError::Truncated)?;
            // Checked after the scan, so the rejection does not depend on
            // how `read_unary` walks the bits. Without it, a hostile stream
            // could supply e.g. `q = 2^26` with `k = 23`, whose shift
            // overflows 32 bits.
            if q > q_max {
                return Err(DecodeError::InvalidParameter);
            }

            // Binary remainder; absent entirely when `k == 0`.
            let remainder = if k > 0 {
                r.read_bits(k).ok_or(DecodeError::Truncated)?
            } else {
                0
            };

            // With `q ≤ u32::MAX >> k` and `remainder < 1 << k` the value
            // fits in a u32 exactly. It is assembled in u64 and asserted
            // anyway so that a future loosening of the `q` bound fails
            // loudly in debug builds instead of truncating.
            let z = ((q as u64) << k) | (remainder as u64);
            debug_assert!(z <= u32::MAX as u64, "q<<k overflow: q={q} k={k}");
            out.push(unzigzag(z as u32));
        }
    }
    Ok(())
}

// ── Encoder cost estimation ─────────────────────────────────────────────────

/// Compute the bit cost of Rice-coding pre-zigzagged `zigzag_vals` at the
/// given `partition_order`, counting the per-partition `k` headers.
///
/// Returns `None` when `partition_order` exceeds `MAX_PARTITION_ORDER` or
/// when the length does not divide evenly into `1 << partition_order`
/// partitions.
///
/// The input is expected to be zigzagged once by the caller and reused for
/// every partition order under consideration: carving partitions out of an
/// existing vector is just slicing, whereas re-mapping per partition order
/// would repeat `O(N)` work each time. `frame::encode_frame` drives this
/// pattern.
///
/// Despite the name the result is exact, not an approximation: given `k`,
/// the Rice bit count is a closed-form function of the values.
pub fn estimate_cost(zigzag_vals: &[u32], partition_order: u8) -> Option<usize> {
    if partition_order > MAX_PARTITION_ORDER {
        return None;
    }
    let n_partitions = 1usize << partition_order;
    let len = zigzag_vals.len();
    if !len.is_multiple_of(n_partitions) {
        return None;
    }

    let partition_size = len / n_partitions;
    // Fixed overhead each partition pays for its `k` field.
    let header_bits = RICE_K_BITS as usize;
    let total_bits = (0..n_partitions)
        .map(|p| {
            let start = p * partition_size;
            let (_, payload_bits) = select_k(&zigzag_vals[start..start + partition_size]);
            header_bits + payload_bits
        })
        .sum::<usize>();
    Some(total_bits)
}

/// Zigzag `residuals` and forward to `estimate_cost`.
///
/// Handy when only a single partition order is being evaluated, so there
/// is no zigzag vector worth sharing — tests and ad-hoc callers. The
/// encoder hot path in `frame::encode_frame` skips this wrapper: it
/// zigzags once per LPC order and calls `estimate_cost` directly.
#[cfg(test)]
pub(crate) fn estimate_cost_from_residuals(
    residuals: &[i32],
    partition_order: u8,
) -> Option<usize> {
    let mapped: Vec<u32> = residuals.iter().copied().map(zigzag).collect();
    estimate_cost(&mapped, partition_order)
}

// ── Tests ───────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;
    use alloc::vec::Vec;

    #[test]
    fn zigzag_roundtrip() {
        // Includes the two extremes: `i32::MIN` maps to `u32::MAX` and
        // `i32::MAX` to `u32::MAX - 1`, so the mapping must survive the
        // full 32-bit range, not just audio-sized values.
        let cases = [
            0,
            1,
            -1,
            2,
            -2,
            100,
            -100,
            i32::MAX / 2,
            i32::MIN / 2,
            i32::MAX,
            i32::MIN,
        ];
        for n in cases {
            assert_eq!(unzigzag(zigzag(n)), n, "zigzag failed for {n}");
        }
    }

    #[test]
    fn zigzag_ordering() {
        assert_eq!(zigzag(0), 0);
        assert_eq!(zigzag(-1), 1);
        assert_eq!(zigzag(1), 2);
        assert_eq!(zigzag(-2), 3);
        assert_eq!(zigzag(2), 4);
    }

    #[test]
    fn rice_roundtrip_single_partition() {
        // Despite the name, this walks every partition order that divides
        // the 960-sample buffer evenly (960 = 2^6 × 15), skipping the rest.
        let residuals: Vec<i32> = (-480..480).collect();
        for po in 0..=MAX_PARTITION_ORDER {
            if !residuals.len().is_multiple_of(1usize << po) {
                continue;
            }
            let bytes = rice_encode(&residuals, po);
            let decoded = rice_decode(&bytes, po, residuals.len()).unwrap();
            assert_eq!(
                decoded, residuals,
                "roundtrip failed at partition_order={po}"
            );
        }
    }

    #[test]
    fn rice_roundtrip_all_partition_orders() {
        // 1024 = 2^10, divisible by all partition orders 0..=7.
        let residuals: Vec<i32> = (0..1024i32).map(|i| ((i * 13 + 7) % 200) - 100).collect();
        for po in 0..=MAX_PARTITION_ORDER {
            let bytes = rice_encode(&residuals, po);
            let decoded = rice_decode(&bytes, po, residuals.len()).unwrap();
            assert_eq!(
                decoded, residuals,
                "roundtrip failed at partition_order={po}"
            );
        }
    }

    #[test]
    fn rice_all_zeros_is_optimal() {
        // 128 zeros at partition_order=0: 5-bit k header + 128 × 1-bit
        // unary terminator = 5 + 128 = 133 bits = 17 bytes (with padding).
        let residuals = vec![0i32; 128];
        let bytes = rice_encode(&residuals, 0);
        assert_eq!(bytes.len(), (5usize + 128).div_ceil(8));
        let decoded = rice_decode(&bytes, 0, residuals.len()).unwrap();
        assert_eq!(decoded, residuals);
    }

    #[test]
    fn partitioned_beats_single_when_activity_varies() {
        // 512 residuals: first half quiet (magnitude ~3), second half loud
        // (magnitude ~1000). One-partition coding has to pick a k that's
        // wrong for half the samples; partitioned coding picks k per half
        // and saves bits.
        let mut residuals = Vec::with_capacity(512);
        for i in 0..256i32 {
            residuals.push((i % 7) - 3);
        }
        for i in 0..256i32 {
            residuals.push(((i * 41) % 2000) - 1000);
        }

        let cost_po0 = estimate_cost_from_residuals(&residuals, 0).unwrap();
        let cost_po1 = estimate_cost_from_residuals(&residuals, 1).unwrap();
        assert!(
            cost_po1 < cost_po0,
            "partitioned should beat single-k on varying activity: po0={cost_po0}, po1={cost_po1}"
        );
    }

    #[test]
    fn rice_decode_rejects_q_shift_overflow() {
        // Adversarial input: k = MAX_RICE_K (23), then a unary prefix
        // whose zero-count exceeds `q_max = u32::MAX >> k = 511`.
        // Without the `q > q_max` bound the decoder would accept
        // arbitrarily large q, and `q << k` would silently truncate the
        // top bits of the u32 assembly — a critical decoder bug class.
        //
        // Bitstream: 5 bits of k, then 512 zero bits (q = 512,
        // one past the cap), then a terminator `1` and a zero
        // remainder so the unary scan completes. The post-scan check
        // `q > q_max` triggers InvalidParameter, independent of whether
        // the scan is bit-by-bit or byte-at-a-time.
        let mut bytes = Vec::new();
        {
            let mut w = BitWriter::new(&mut bytes);
            w.write_bits(MAX_RICE_K as u32, 5);
            for _ in 0..512 {
                w.write_bit(false);
            }
            w.write_bit(true);
            // Zero-valued 23-bit remainder, padded to bitstream end.
            w.write_bits(0, MAX_RICE_K);
            w.finish();
        }
        let result = rice_decode(&bytes, 0, 1);
        assert!(
            matches!(result, Err(DecodeError::InvalidParameter)),
            "adversarial q-overflow must return InvalidParameter, got {:?}",
            result
        );
    }

    #[test]
    fn rice_decode_rejects_oversize_k() {
        // Craft a bitstream with k = 31 (above MAX_RICE_K = 23) in the
        // first 5 bits; decoder must reject.
        let mut bytes = Vec::new();
        {
            let mut w = BitWriter::new(&mut bytes);
            w.write_bits(31, 5);
            w.write_bit(true); // residual value 0 at any k: unary 1-bit
            w.finish();
        }
        let result = rice_decode(&bytes, 0, 1);
        assert!(matches!(result, Err(DecodeError::InvalidParameter)));
    }

    #[test]
    fn rice_decode_rejects_k_at_boundary() {
        // The tightest invalid case: exactly `MAX_RICE_K + 1`. A range
        // check that is loose by one (e.g. `k > MAX_RICE_K + 1`) would
        // still reject the k = 31 stream in the test above but would let
        // k = 24 through; this case pins the exact boundary.
        let mut bytes = Vec::new();
        {
            let mut w = BitWriter::new(&mut bytes);
            w.write_bits(MAX_RICE_K as u32 + 1, 5); // k = 24
            w.write_bit(true);
            w.finish();
        }
        let result = rice_decode(&bytes, 0, 1);
        assert!(
            matches!(result, Err(DecodeError::InvalidParameter)),
            "k = MAX_RICE_K + 1 = {} must be rejected",
            MAX_RICE_K + 1
        );
    }

    #[test]
    fn rice_decode_truncated_returns_error() {
        // Encode 128 zeros then truncate the bitstream.
        let bytes = rice_encode(&vec![0i32; 128], 0);
        let truncated = &bytes[..bytes.len() / 2];
        let result = rice_decode(truncated, 0, 128);
        assert!(matches!(result, Err(DecodeError::Truncated)));
    }

    #[test]
    fn rice_decode_truncated_mid_remainder() {
        // Truncation can land mid-remainder (inside the k-bit binary
        // suffix of a codeword). 64 identical residuals of 1024 zigzag to
        // 2048, for which the encoder selects k = 10: each codeword is
        // 2 unary zeros + terminator + 10 remainder bits = 13 bits, so the
        // payload is 5 + 64 × 13 = 837 bits and the final byte holds the
        // last 5 bits of the last remainder. A non-zero k matters here:
        // k = 0 would skip the remainder path entirely.
        let residuals: Vec<i32> = vec![1024; 64];
        let bytes = rice_encode(&residuals, 0);
        // Drop the last byte — this lands inside the final residual's
        // 10-bit remainder payload, well away from a codeword boundary.
        let cut = &bytes[..bytes.len() - 1];
        let result = rice_decode(cut, 0, residuals.len());
        assert!(
            matches!(result, Err(DecodeError::Truncated)),
            "mid-remainder truncation must return Truncated, got {:?}",
            result
        );
    }

    #[test]
    fn rice_decode_truncated_mid_unary() {
        // Truncation mid-unary-run: the reader is left scanning for a
        // terminator that never arrives.
        //
        // The stream is hand-built rather than produced by `rice_encode`,
        // because the encoder chooses `k` itself: for a lone large residual
        // it selects a large `k`, leaving a unary prefix only a bit or two
        // long, and a cut would then land in the remainder instead. Here
        // `k = 0` is forced and followed by 19 zero bits with no
        // terminator. 5 + 19 = 24 bits is exactly three bytes, so the
        // outcome does not depend on how `finish` pads a partial byte.
        let mut bytes = Vec::new();
        {
            let mut w = BitWriter::new(&mut bytes);
            w.write_bits(0, RICE_K_BITS);
            for _ in 0..19 {
                w.write_bit(false);
            }
            w.finish();
        }
        let result = rice_decode(&bytes, 0, 1);
        assert!(
            matches!(result, Err(DecodeError::Truncated)),
            "mid-unary-run truncation must return Truncated, got {:?}",
            result
        );
    }

    #[test]
    fn rice_decode_rejects_mismatched_partition_size() {
        // partition_order=3 ⇒ 8 partitions, but total_count=7 isn't divisible.
        let bytes = rice_encode(&[0i32; 8], 3);
        let result = rice_decode(&bytes, 3, 7);
        assert!(matches!(result, Err(DecodeError::InvalidParameter)));
    }

    #[test]
    fn select_k_picks_k0_for_all_zeros() {
        let vals = vec![0u32; 128];
        let (k, _) = select_k(&vals);
        assert_eq!(k, 0);
    }

    #[test]
    fn select_k_picks_large_k_for_large_values() {
        // Uniformly large values → k near log2(mean).
        let vals = vec![(1u32 << 20) - 1; 32];
        let (k, _) = select_k(&vals);
        assert!(k >= 16, "expected large k for large values, got {k}");
    }

    #[test]
    fn select_k_matches_exhaustive() {
        // Fast convex-descent must produce identical results to the
        // exhaustive baseline on every partition. Any divergence here
        // indicates either the convexity assumption is wrong (it isn't)
        // or the seed heuristic drops us into the wrong local search
        // direction.
        let test_cases: Vec<Vec<u32>> = vec![
            // All zeros.
            vec![0u32; 32],
            vec![0u32; 1],
            // Single non-zero value among zeros.
            vec![0, 0, 0, 0, 0, 0, 0, 42],
            // Uniform small values.
            vec![1u32; 64],
            vec![3u32; 64],
            vec![7u32; 64],
            // Uniform large values.
            vec![(1u32 << 15) - 1; 32],
            vec![(1u32 << 20) - 1; 32],
            vec![(1u32 << 24) - 1; 16],
            // Mixed small + large (transient + silence).
            {
                let mut v = vec![0u32; 64];
                for (i, slot) in v.iter_mut().take(8).enumerate() {
                    *slot = 1_000_000 + i as u32;
                }
                v
            },
            // Linear ramp (broad distribution).
            (0u32..64).collect(),
            (0u32..256).map(|i| i * 37).collect(),
            // Pseudo-random (from xorshift).
            {
                let mut v = Vec::with_capacity(128);
                let mut state: u32 = 0x9E3779B9;
                for _ in 0..128 {
                    state ^= state << 13;
                    state ^= state >> 17;
                    state ^= state << 5;
                    v.push(state & 0xFFFFF);
                }
                v
            },
            // Boundary around the maximum k.
            vec![1u32 << 23; 16],
            vec![1u32 << 24; 16],
            vec![u32::MAX >> 8; 16],
        ];
        for case in &test_cases {
            let fast = select_k(case);
            let exhaustive = select_k_exhaustive(case);
            assert_eq!(
                fast,
                exhaustive,
                "select_k mismatch for case of len {}: fast={:?} exhaustive={:?}",
                case.len(),
                fast,
                exhaustive
            );
        }
    }

    #[test]
    fn estimate_cost_matches_actual_payload_bits() {
        // `estimate_cost` is supposed to be exact; the actual encoded
        // bitstream bit count (ignoring byte padding) should match.
        let residuals: Vec<i32> = (0..256i32).map(|i| ((i * 7 + 3) % 50) - 25).collect();
        for po in 0..=3 {
            let estimated = estimate_cost_from_residuals(&residuals, po).unwrap();
            let bytes = rice_encode(&residuals, po);
            // Encoded bitstream is padded to a byte boundary. The actual bit
            // count is between (bytes.len()-1)*8 + 1 and bytes.len()*8.
            let actual_max = bytes.len() * 8;
            let actual_min = actual_max.saturating_sub(7);
            assert!(
                estimated >= actual_min && estimated <= actual_max,
                "cost mismatch at po={po}: estimated={estimated}, actual range=[{actual_min},{actual_max}]"
            );
        }
    }
}