// lac 0.1.0 - Docs.rs

//! Frame encode and decode.
//!
//! A LAC frame is a self-contained unit: `header || rice_bitstream`. The
//! header carries the LPC prediction order, the Rice partition order, the
//! coefficient shift, the sample count, and the predictor coefficients in
//! Q(15 − coefficient_shift) fixed point (Q15 when the shift is 0). The Rice
//! bitstream follows immediately with all residuals for the frame, grouped
//! by partition.
//!
//! Frames are independently decodable — no cross-frame state — so a lost or
//! corrupt frame cannot corrupt subsequent decoding. On structural or
//! authentication failure the caller substitutes `frame_sample_count` zeros
//! (silence) for the frame period.

use alloc::vec::Vec;

use crate::lpc::{
    LpcLevels, MAX_COEFFICIENT_SHIFT, compute_residuals_into, lpc_analyze_levels_into,
    lpc_synthesize_into,
};
use crate::rice::{estimate_cost, rice_decode_into, rice_encode_zigzag_into, zigzag};
use crate::{MAX_LPC_ORDER, MAX_PARTITION_ORDER};

/// Sync word at bytes 0-1 of every audio frame header, stored big-endian
/// (so the wire bytes are `0x1A 0xCC`).
///
/// Chosen to be rare in natural audio data: the "1ACC" hex mnemonic
/// produces a value unlikely to arise from raw PCM samples treated as a
/// byte stream. Distinct from LVC's sync word so a cross-typed decoder
/// rejects foreign frames at the first check.
///
/// NOTE(review): an earlier revision of this comment described the value as
/// having "the high bit set", but bit 15 of `0x1ACC` is clear
/// (`0b0001_1010_1100_1100`). Confirm whether the rationale or the constant
/// is the intended one; the constant is what `parse_header` checks.
pub const SYNC_WORD: u16 = 0x1ACC;

/// Structured decode error for the frame parser and Rice decoder.
///
/// On any of these errors the caller must discard the frame and substitute
/// silence — partial state must never propagate into subsequent decodes.
/// Each header-rejection variant carries the offending value so operator
/// telemetry can distinguish "this is not a LAC stream" (`BadSyncWord`)
/// from "one field was corrupted on the wire" (the field-specific
/// variants), without re-reading the failing bytes.
///
/// The type is `Copy` and carries only small integers, so it is cheap to
/// return and compare. It is `#[non_exhaustive]`: external `match`es need a
/// wildcard arm so new variants can be added without a breaking change.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DecodeError {
    /// First two bytes did not match `SYNC_WORD` (`0x1ACC`). Either the
    /// payload is not a LAC frame, or it uses a different wire-format
    /// version (future versions use distinct sync words per `Specification.md` §8).
    BadSyncWord {
        /// The 16-bit big-endian value actually read from `data[0..2]`.
        got: u16,
    },
    /// `prediction_order` field exceeds `MAX_LPC_ORDER` (32).
    InvalidPredictionOrder {
        /// The u8 value read from `data[2]`.
        got: u8,
    },
    /// `partition_order` field exceeds `MAX_PARTITION_ORDER` (7).
    InvalidPartitionOrder {
        /// The u8 value read from `data[3]`.
        got: u8,
    },
    /// `coefficient_shift` field exceeds `MAX_COEFFICIENT_SHIFT` (5).
    InvalidCoefficientShift {
        /// The u8 value read from `data[4]`.
        got: u8,
    },
    /// Verbatim frame (`prediction_order == 0`) carried a non-zero
    /// `coefficient_shift`. The shift field is meaningless without
    /// coefficients and must be zero for the header to round-trip cleanly.
    /// Only reported when the shift itself is within range; an out-of-range
    /// shift is reported as `InvalidCoefficientShift` first.
    CoefficientShiftWithoutOrder {
        /// The non-zero shift value that tripped the check.
        shift: u8,
    },
    /// The bitstream ended before all expected data had been read.
    ///
    /// `parse_header` returns this when `data` is shorter than the 7-byte
    /// fixed header, or shorter than the fixed header plus the
    /// `prediction_order × 2` coefficient bytes the header announces. The
    /// Rice decoder may also return it for a short residual bitstream.
    Truncated,
    /// A parameter value decoded from the bitstream is out of its documented
    /// range — e.g., a per-partition `k` above the allowed maximum, a
    /// `frame_sample_count` of zero, or a partition count that doesn't
    /// evenly divide the frame length.
    ///
    /// Unlike the field-specific header variants this one carries no
    /// payload, so the zero-count and non-divisible-count rejections in
    /// `parse_header` are not distinguishable from each other by value.
    InvalidParameter,
    /// The stream uses a feature this decoder does not implement. Reserved
    /// for future format extensions; no current encoder path emits values
    /// that trigger this.
    Unsupported,
}

impl core::fmt::Display for DecodeError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            DecodeError::BadSyncWord { got } => {
                write!(
                    f,
                    "bad sync word: got {got:#06x}, expected {SYNC_WORD:#06x}"
                )
            }
            DecodeError::InvalidPredictionOrder { got } => {
                write!(f, "prediction_order {got} exceeds max {MAX_LPC_ORDER}")
            }
            DecodeError::InvalidPartitionOrder { got } => {
                write!(f, "partition_order {got} exceeds max {MAX_PARTITION_ORDER}")
            }
            DecodeError::InvalidCoefficientShift { got } => {
                write!(
                    f,
                    "coefficient_shift {got} exceeds max {MAX_COEFFICIENT_SHIFT}"
                )
            }
            DecodeError::CoefficientShiftWithoutOrder { shift } => {
                write!(f, "coefficient_shift is {shift} but prediction_order is 0")
            }
            DecodeError::Truncated => f.write_str("bitstream truncated"),
            DecodeError::InvalidParameter => f.write_str("decoded parameter out of range"),
            DecodeError::Unsupported => f.write_str("unsupported stream feature"),
        }
    }
}

// Marker impl so `DecodeError` composes with `?` and `dyn Error` consumers.
// The body is empty: no variant wraps another error, so the trait's default
// methods are used as-is. `core::error::Error` (rather than `std`) keeps the
// impl usable without the standard library.
impl core::error::Error for DecodeError {}

// ── Frame header ────────────────────────────────────────────────────────────

/// Decoded representation of an audio frame header.
///
/// Wire layout (all multi-byte integer fields are big-endian):
///
/// ```text
///  0- 1   sync_word           u16 BE   = 0x1ACC
///  2      prediction_order    u8       ∈ [0, 32]
///  3      partition_order     u8       ∈ [0, 7]
///  4      coefficient_shift   u8       ∈ [0, 5]
///  5- 6   frame_sample_count  u16 BE   ≥ 1, multiple of 1 << partition_order
///  7+     lpc_coefficients    [i16 BE; prediction_order]
/// ```
///
/// Total fixed-header length = `7 + prediction_order × 2` bytes.
///
/// The sync word is validated by `parse_header` but not stored here: every
/// successfully parsed header has the same one.
///
/// Marked `#[non_exhaustive]` so future wire-format revisions that add
/// fields (e.g. a v2 CRC tag under the versioning rules in spec §8)
/// don't become a breaking change for external callers doing
/// struct-literal construction or exhaustive pattern matches.
/// In-crate code is unaffected.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct AudioFrameHeader {
    /// LPC prediction order. 0 = verbatim (no prediction, residuals equal
    /// samples).
    pub prediction_order: u8,
    /// Rice partition order. The residual stream is split into
    /// `1 << partition_order` equal-size partitions, each with its own Rice
    /// parameter stored at the start of the partition's payload.
    pub partition_order: u8,
    /// Fixed-point scale of `lpc_coefficients`. Coefficients are
    /// interpreted as `Q(15 − coefficient_shift)`. At `shift = 0` the
    /// scale is Q15 (values in `[−1, 1)`); at `shift = 5` the scale is Q10
    /// (values in `[−32, 32)`). Always `0` when `prediction_order == 0`.
    pub coefficient_shift: u8,
    /// Number of audio samples in this frame. Matches the length of the
    /// decoded sample vector. Must be a multiple of
    /// `1 << partition_order` so the residual stream divides evenly, and
    /// must be non-zero: `parse_header` rejects a zero count.
    pub frame_sample_count: u16,
    /// Predictor coefficients in Q(15 − coefficient_shift) format; length
    /// equals `prediction_order`. Synthesis formula:
    /// `x̂[n] = Σ coeff[j] · x[n-j-1]` with right-shift by
    /// `(15 − coefficient_shift)` (plain positive sum; the negation
    /// between analysis and predictor conventions is applied inside the
    /// LPC module at quantisation time).
    pub lpc_coefficients: Vec<i16>,
}

/// Parse an audio frame header from the start of `data`.
///
/// Returns `(header, bytes_consumed)` on success, where `bytes_consumed`
/// is `7 + prediction_order × 2` — the offset at which the Rice bitstream
/// begins.
///
/// # Errors
///
/// Checks run in this order, and the first failure is reported:
///
/// 1. `Truncated` — fewer than 7 bytes of fixed header.
/// 2. `BadSyncWord` — bytes 0-1 are not `SYNC_WORD`.
/// 3. `InvalidPredictionOrder` — `prediction_order > MAX_LPC_ORDER`.
/// 4. `InvalidPartitionOrder` — `partition_order > MAX_PARTITION_ORDER`.
/// 5. `InvalidCoefficientShift` — `coefficient_shift > MAX_COEFFICIENT_SHIFT`.
/// 6. `CoefficientShiftWithoutOrder` — non-zero shift on a verbatim
///    (`prediction_order == 0`) frame.
/// 7. `InvalidParameter` — `frame_sample_count` is zero, or is not a
///    multiple of `1 << partition_order`.
/// 8. `Truncated` — buffer too short for the coefficient array.
pub fn parse_header(data: &[u8]) -> Result<(AudioFrameHeader, usize), DecodeError> {
    // Byte length of the fixed part of the header, before the coefficients.
    const FIXED_HEADER_LEN: usize = 7;

    // Pull the seven fixed bytes apart in one step. A buffer too short to
    // match the pattern is a truncated header.
    let &[
        sync_hi,
        sync_lo,
        prediction_order,
        partition_order,
        coefficient_shift,
        count_hi,
        count_lo,
        ref coefficient_area @ ..,
    ] = data
    else {
        return Err(DecodeError::Truncated);
    };

    let sync = u16::from_be_bytes([sync_hi, sync_lo]);
    if sync != SYNC_WORD {
        return Err(DecodeError::BadSyncWord { got: sync });
    }
    if prediction_order > MAX_LPC_ORDER {
        return Err(DecodeError::InvalidPredictionOrder {
            got: prediction_order,
        });
    }
    if partition_order > MAX_PARTITION_ORDER {
        return Err(DecodeError::InvalidPartitionOrder {
            got: partition_order,
        });
    }
    if coefficient_shift > MAX_COEFFICIENT_SHIFT {
        return Err(DecodeError::InvalidCoefficientShift {
            got: coefficient_shift,
        });
    }
    // A verbatim frame has no coefficients for the shift to scale, so the
    // only value that round-trips is zero.
    if prediction_order == 0 && coefficient_shift != 0 {
        return Err(DecodeError::CoefficientShiftWithoutOrder {
            shift: coefficient_shift,
        });
    }

    let frame_sample_count = u16::from_be_bytes([count_hi, count_lo]);
    // Spec §3.5 requires at least one sample. Zero would sail through the
    // divisibility test below (0 is a multiple of everything), so it has
    // to be rejected explicitly.
    if frame_sample_count == 0 {
        return Err(DecodeError::InvalidParameter);
    }
    // Widening to u32 is ample: the count is at most 65535 and the
    // partition count at most 128.
    let partition_count = 1u32 << partition_order;
    if !u32::from(frame_sample_count).is_multiple_of(partition_count) {
        return Err(DecodeError::InvalidParameter);
    }

    // Two bytes per coefficient must follow the fixed header.
    let coeff_bytes = usize::from(prediction_order) * 2;
    let Some(coefficient_bytes) = coefficient_area.get(..coeff_bytes) else {
        return Err(DecodeError::Truncated);
    };
    let lpc_coefficients: Vec<i16> = coefficient_bytes
        .chunks_exact(2)
        .map(|pair| i16::from_be_bytes([pair[0], pair[1]]))
        .collect();

    let header = AudioFrameHeader {
        prediction_order,
        partition_order,
        coefficient_shift,
        frame_sample_count,
        lpc_coefficients,
    };
    Ok((header, FIXED_HEADER_LEN + coeff_bytes))
}

// ── Public API ──────────────────────────────────────────────────────────────

/// Encode a frame of signed integer PCM samples into a newly allocated,
/// complete frame byte string (`header || rice_bitstream`).
///
/// Allocating convenience form of [`encode_frame_into`]; use that function
/// directly to reuse one output buffer across frames.
///
/// The encoder searches a sparse LPC order grid (see spec §7) with a
/// two-order early-out, crossed with every `partition_order ∈ [0, 7]` that
/// divides the sample count evenly, and emits the
/// `(order, partition_order, shift)` combination that minimises
/// `header_bits + rice_bits`.
///
/// # Panics
///
/// Both preconditions are enforced in **every** build, not just debug:
///
/// - `samples.len()` must lie in `[1, 65535]`. The frame length has to fit
///   the header's `u16` (spec §3.5); anything else has no valid wire
///   representation. Silently truncating `samples.len() as u16` could emit
///   a frame whose `frame_sample_count` is 0, which the decoder would then
///   (correctly) reject — leaving the caller with no signal — so the
///   encoder panics instead.
/// - Every sample must satisfy `|sample| ≤ 2²³ − 1` (spec §1). The
///   autocorrelation's `i64` accumulator overflows on samples wider than
///   ~2²⁴ at typical frame counts, which would yield a structurally valid
///   frame that does not round-trip; the panic replaces that silent
///   corruption with a loud failure. Callers re-feeding decoder output
///   (e.g. an MCU mix pipeline) **must** clamp or validate first; see
///   [`decode_frame`] for the output-magnitude caveat that motivates this.
///   The check costs one linear scan of `samples` per encode (~1 µs on a
///   4096-sample frame, ~2 % of typical frame-encode time).
pub fn encode_frame(samples: &[i32]) -> Vec<u8> {
    let mut frame = Vec::new();
    encode_frame_into(samples, &mut frame);
    frame
}

/// Buffer-reusing form of [`encode_frame`]: the encoded frame is written
/// into the caller's `Vec<u8>`.
///
/// `out` is cleared before writing and keeps its existing capacity, so a
/// hot-path caller can hold one allocation and reuse it for every frame.
/// Designed for the MCU encode fanout and any QUIC-streaming sender that
/// owns a per-channel scratch buffer.
///
/// # Panics
///
/// Same preconditions as [`encode_frame`], enforced with release-active
/// `assert!`s: `samples.len()` must be in `[1, 65535]` and every sample
/// must satisfy `|sample| ≤ 2²³ − 1`.
pub fn encode_frame_into(samples: &[i32], out: &mut Vec<u8>) {
    // Reference encoder's sparse LPC order grid. Spec §7 carries the
    // compliance clause: an exhaustive `0..=32` search is equally valid,
    // and the `sparse_vs_exhaustive` differential verifies the two agree
    // within ~0.5% compression on the corpus.
    const REFERENCE_ORDER_GRID: [u8; 12] = [0, 2, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32];
    // Number of consecutive non-improving orders tolerated before the
    // order search stops.
    const EARLY_OUT_PATIENCE: u8 = 2;

    encode_frame_with_grid(samples, &REFERENCE_ORDER_GRID, EARLY_OUT_PATIENCE, out);
}

/// Cheapest Rice layout for one candidate's zigzag-mapped residuals.
///
/// Evaluates every `partition_order ∈ [0, MAX_PARTITION_ORDER]` and returns
/// `(header_bits + rice_bits, partition_order)` for the smallest total.
/// Ties go to the lowest partition order (tuples compare lexicographically,
/// so `min` picks the smaller `partition_order` among equal totals).
/// Partition orders for which `estimate_cost` returns `None` — the
/// partition count doesn't divide the residual count — are skipped;
/// the result is `None` only if every order was skipped.
fn cheapest_partition_order(zigzag_residuals: &[u32], header_bits: usize) -> Option<(usize, u8)> {
    (0..=MAX_PARTITION_ORDER)
        .filter_map(|po| {
            estimate_cost(zigzag_residuals, po).map(|rice_bits| (header_bits + rice_bits, po))
        })
        .min()
}

/// Encoder entry point parameterised by the LPC order grid and the
/// patience value for the stop-when-stale heuristic. Crate-private:
/// `encode_frame` / `encode_frame_into` are the public API. The
/// `sparse_vs_exhaustive_on_headset_speech` unit test (further down
/// this file) is the only non-production caller; it reaches in from
/// the crate-internal `tests` module.
///
/// The search evaluates each order in `order_grid` (stopping after
/// `patience` consecutive orders that fail to improve the running best),
/// then the four FLAC-style fixed predictors, and writes the cheapest
/// `(order, partition_order, shift)` candidate into `out`. `out` is
/// cleared first; its capacity is retained.
///
/// # Panics
///
/// Frame length must fit in `u16` (spec §3.5): `samples.len() ≤ 65535`,
/// and must be non-zero. Every sample must satisfy `|sample| ≤ 2²³ − 1`
/// (spec §1). The checks here are `assert!` (runtime,
/// release-build-active) rather than `debug_assert!` — silently
/// truncating `samples.len() as u16` would emit a frame whose header's
/// `frame_sample_count = 0` that the decoder then (correctly) rejects,
/// leaving the encoder with no signal that the caller violated the
/// contract.
pub(crate) fn encode_frame_with_grid(
    samples: &[i32],
    order_grid: &[u8],
    patience: u8,
    out: &mut Vec<u8>,
) {
    // Largest sample magnitude accepted on input (spec §1): 2^23 − 1.
    const MAX_ABS_SAMPLE: i32 = (1 << 23) - 1;

    // Empty-coefficient placeholder used for order 0 and the
    // Levinson-invalid fallback. A constant so the search loop can borrow
    // it without ever allocating a `Vec<i16>` for the common case.
    const EMPTY_COEFFS: &[i16] = &[];

    // Fixed predictors (FLAC-style orders 1-4). Each is an
    // integer-valued coefficient vector that quantises exactly at a
    // specific `coefficient_shift`, so the prediction filter carries
    // no Q-format fitting variance. They're evaluated as additional
    // candidates alongside Levinson-Durbin.
    //
    // Layout per entry: `(prediction_order, coefficients, coefficient_shift)`.
    // The stored coefficients are in Q(15 − shift) form, matching
    // spec §3.4:
    //
    //   Order 1:  coeff = [1]            → 1st difference       (Q14)
    //   Order 2:  coeff = [2, -1]        → 2nd difference       (Q13)
    //   Order 3:  coeff = [3, -3, 1]     → 3rd difference       (Q13)
    //   Order 4:  coeff = [4, -6, 4, -1] → 4th difference       (Q12)
    //
    // Coefficients are the FLAC fixed-predictor formulas in the
    // "positive-sum predictor" convention LAC uses (see
    // `lpc::q31_to_qn` for the sign flip from analysis-filter form).
    //
    // Measured impact on the test corpus: neutral to mildly positive
    // on all six files (compression tied or 0.1-0.2 pp smaller than
    // LPC-only; `sparse_vs_exhaustive` gap narrowed from 0.19% to
    // 0.17%). No regression; net win. Most gain would appear on
    // smooth polynomial-ish signals (synthetic test tones, sensor
    // data) where the n-th difference is genuinely tiny.
    const FIXED_PREDICTORS: &[(u8, &[i16], u8)] = &[
        (1, &[16_384], 1),                          // [1]           @ Q14
        (2, &[16_384, -8_192], 2),                  // [2, -1]       @ Q13
        (3, &[24_576, -24_576, 8_192], 2),          // [3, -3, 1]    @ Q13
        (4, &[16_384, -24_576, 16_384, -4_096], 3), // [4, -6, 4, -1] @ Q12
    ];

    assert!(
        samples.len() <= u16::MAX as usize,
        "LAC frame_sample_count {} exceeds u16::MAX ({}) — chunk the input into smaller frames",
        samples.len(),
        u16::MAX
    );
    assert!(!samples.is_empty(), "LAC cannot encode a zero-sample frame");
    // Input-magnitude contract check (spec §1). Runtime-active in
    // release — an out-of-range sample silently overflows the i64
    // autocorrelation accumulator at large frame counts
    // (`|s|² · 65535` exceeds i64 once |s| passes ~2^24). That wraps to
    // garbage coefficients and produces a structurally-valid frame that
    // decodes deterministically but fails round-trip. The attack surface
    // is MCU pipelines that forget to clamp decoder output per spec §6.2
    // before re-encoding; panicking here turns that silent-corruption
    // class into a loud failure instead. Cost is one linear scan of
    // `samples` per encode (~1 µs on a 4096-sample frame), comfortably
    // under the ~60 µs total encode time.
    assert!(
        samples
            .iter()
            .all(|s| (-MAX_ABS_SAMPLE..=MAX_ABS_SAMPLE).contains(s)),
        "LAC encoder input must satisfy |sample| ≤ 2^23 - 1 (spec §1); found out-of-range value"
    );

    // `R[0] == 0` ⇔ every sample is zero. The LPC path divides by `R[0]`
    // inside Levinson-Durbin, so the encoder forces order 0 (verbatim) for
    // all-zero frames to avoid division-by-zero and because verbatim is
    // trivially optimal on silence anyway.
    let r0: i64 = samples.iter().map(|&s| (s as i64) * (s as i64)).sum();

    // Pre-compute LPC coefficients for all orders 1..=MAX_LPC_ORDER in
    // a single Levinson-Durbin pass. `levels_valid = false` means the
    // analysis did not run (all-zero input) or reported failure; the
    // search below then treats every order as coefficient-free.
    // `levels` is stack-allocated — ~2 KB zero-fill once per frame,
    // avoids the dozens of heap allocations a `Vec<Vec<i16>>` analogue
    // would pay.
    let mut levels = LpcLevels::new();
    let levels_valid = r0 != 0 && lpc_analyze_levels_into(samples, MAX_LPC_ORDER, &mut levels);

    // Running winner of the search.
    let mut best_total_bits = usize::MAX;
    let mut best_order = 0u8;
    let mut best_partition_order = 0u8;
    let mut best_shift = 0u8;
    // `Some` only when a fixed predictor is the overall winner; the
    // coefficient slice is then taken from `FIXED_PREDICTORS` instead of
    // `levels`. The fixed-predictor pass runs after the LPC search, so
    // nothing ever needs to reset this back to `None`.
    let mut best_fixed_coeffs: Option<&'static [i16]> = None;

    // Scratch buffers reused across every candidate evaluated below.
    // Each `compute_residuals_into` / zigzag pass clears and refills
    // these rather than allocating a fresh `Vec`. Capacity reservation
    // here covers every subsequent use.
    let mut residuals_buf: Vec<i32> = Vec::with_capacity(samples.len());
    let mut zigzag_buf: Vec<u32> = Vec::with_capacity(samples.len());
    // Winning-candidate zigzag snapshot. Pre-allocated to match
    // `zigzag_buf` capacity so the best-update swap below preserves
    // both sides' capacities and no allocator traffic occurs during
    // the search.
    let mut best_zigzag: Vec<u32> = Vec::with_capacity(samples.len());

    // ── LPC order search ────────────────────────────────────────────────
    //
    // The caller supplies the order grid and the early-out patience. The
    // public `encode_frame` passes the reference sparse grid; the
    // sparse-vs-exhaustive differential test drives an exhaustive
    // `0..=32` grid through this same path.
    let mut stale_orders = 0u8;
    for &order in order_grid {
        // For order 0 or an invalid analysis, the predictor is empty and
        // residuals equal the samples; shift has no effect and is
        // reported as 0. Otherwise *borrow* the coefficient slice from
        // `levels` rather than cloning.
        let (coeffs, shift): (&[i16], u8) = if order == 0 || !levels_valid {
            (EMPTY_COEFFS, 0)
        } else {
            let view = levels.get(order);
            (view.coefficients, view.shift)
        };
        compute_residuals_into(samples, coeffs, shift, &mut residuals_buf);
        // Zigzag once per candidate and reuse the vector across every
        // partition-order evaluation.
        zigzag_buf.clear();
        zigzag_buf.extend(residuals_buf.iter().map(|&r| zigzag(r)));

        // Header bit cost: 7-byte fixed part plus 2 bytes per coefficient.
        let header_bits = (7 + coeffs.len() * 2) * 8;

        let winner = cheapest_partition_order(&zigzag_buf, header_bits)
            .filter(|&(total, _)| total < best_total_bits);
        let improved = winner.is_some();
        if let Some((total, po)) = winner {
            best_total_bits = total;
            best_order = order;
            best_partition_order = po;
            best_shift = shift;
            // Snapshot the winner's zigzag residuals by swapping rather
            // than copying. The swap happens only after the partition
            // search has finished reading `zigzag_buf`; the previous
            // best's data that lands in `zigzag_buf` is cleared before
            // the next candidate is evaluated.
            core::mem::swap(&mut best_zigzag, &mut zigzag_buf);
        }

        // For all-zero input every order is evaluated with the empty
        // predictor, so further orders can only match, never beat, the
        // first one. Stop after a single evaluation.
        if r0 == 0 {
            break;
        }

        // Track how long we've been unable to improve the running best.
        // Over a long-enough stale streak, stop — the cost is climbing
        // and further orders just add header overhead.
        if improved {
            stale_orders = 0;
        } else {
            stale_orders += 1;
            if stale_orders >= patience {
                break;
            }
        }
    }

    // ── Fixed-predictor post-pass ───────────────────────────────────────
    //
    // Run *after* the LPC search so it can't tighten `best_total_bits`
    // early and trip the LPC patience-based early-out, which would cut
    // the main search short and give up orders the LPC winner could
    // still have reached. Running fixed predictors second gives them
    // every chance to replace the LPC winner without interfering with
    // the LPC plateau heuristic.
    //
    // Skipped on all-zero input — every fixed predictor there produces
    // zero residuals, which order-0 verbatim already wins on
    // coefficient-header cost.
    if r0 != 0 {
        for &(fp_order, fp_coeffs, fp_shift) in FIXED_PREDICTORS {
            compute_residuals_into(samples, fp_coeffs, fp_shift, &mut residuals_buf);
            zigzag_buf.clear();
            zigzag_buf.extend(residuals_buf.iter().map(|&r| zigzag(r)));
            let header_bits = (7 + fp_coeffs.len() * 2) * 8;

            let winner = cheapest_partition_order(&zigzag_buf, header_bits)
                .filter(|&(total, _)| total < best_total_bits);
            if let Some((total, po)) = winner {
                best_total_bits = total;
                best_order = fp_order;
                best_partition_order = po;
                best_shift = fp_shift;
                best_fixed_coeffs = Some(fp_coeffs);
                core::mem::swap(&mut best_zigzag, &mut zigzag_buf);
            }
        }
    }

    // Fetch the winner's coefficients by slice. Precedence:
    //   1. Fixed-predictor winner → the static slice baked into
    //      `FIXED_PREDICTORS`. Fixed coefficients are valid even when
    //      Levinson-Durbin couldn't run (they don't depend on
    //      autocorrelation), so this branch dominates the next one.
    //   2. Order 0 or Levinson-invalid → empty slice.
    //   3. LPC winner → borrow into the stack-allocated `levels`.
    let best_coeffs: &[i16] = match best_fixed_coeffs {
        Some(fixed) => fixed,
        None if best_order == 0 || !levels_valid => EMPTY_COEFFS,
        None => levels.get(best_order).coefficients,
    };

    // Header-consistency guard: the wire format carries
    // `prediction_order` coefficients immediately after the header,
    // with the count derived from the order byte. If the search picked
    // a non-zero order but the coefficient slice ended up empty (only
    // possible when `levels_valid == false` AND the caller's order grid
    // excludes 0), emitting the order byte as-is would produce a
    // malformed frame whose decoder then reports `Truncated` because the
    // coefficient bytes it expects don't exist. Override to order 0 so
    // the header matches the actual payload.
    if best_coeffs.is_empty() {
        best_order = 0;
        best_shift = 0;
    }

    // Serialise header bytes directly into the caller's output buffer.
    out.clear();
    out.reserve(7 + best_coeffs.len() * 2);
    out.extend_from_slice(&SYNC_WORD.to_be_bytes());
    out.push(best_order);
    out.push(best_partition_order);
    out.push(best_shift);
    // Lossless cast: the length was asserted to be ≤ u16::MAX above.
    out.extend_from_slice(&(samples.len() as u16).to_be_bytes());
    for &c in best_coeffs {
        out.extend_from_slice(&c.to_be_bytes());
    }
    // Rice bitstream follows immediately, appended straight into `out`
    // so the encode hot path does not allocate or memcpy a separate Rice
    // staging buffer. Reuses the already-computed zigzag vector so the
    // encoder doesn't redo a signed→unsigned pass. The scratch vectors
    // are released when they go out of scope at function exit.
    rice_encode_zigzag_into(&best_zigzag, best_partition_order, out);
}

/// Decode an audio frame into a newly allocated vector of reconstructed
/// samples.
///
/// Allocating convenience form of [`decode_frame_into`]. The frame header
/// is parsed, the residuals are Rice-decoded at the header's
/// `partition_order`, and LPC synthesis recovers the samples.
///
/// On structural failure the caller substitutes `frame_sample_count` zeros
/// for the frame period. Authentication failures are the transport layer's
/// responsibility and occur before this function runs; `data` here is
/// expected to already be plaintext.
///
/// # Output magnitude
///
/// On well-formed bitstreams produced by a compliant encoder from
/// in-contract samples (`|sample| ≤ 2²³ − 1`, spec §1), the decoded
/// output satisfies the same contract. Adversarial bitstreams — those
/// that pass every rejection check in spec §6 but contain crafted
/// coefficients and residuals — may produce output samples of any `i32`
/// value, including values outside `[-(2²³ − 1), 2²³ − 1]`. The
/// synthesis add (spec §3.6) is specified as wrapping `i32`: every byte
/// sequence that parses produces a defined sample vector, and the
/// decoder never panics.
///
/// Callers that re-feed decoder output into [`encode_frame`] (MCU
/// decode → mix → re-encode pipelines, most commonly) must validate
/// or clamp to the input-magnitude contract before re-encoding. The
/// encoder enforces that contract with a release-active `assert!` (see
/// [`encode_frame`]), so out-of-range decoder output fed straight back
/// into the encoder panics rather than being silently mis-encoded.
///
/// # Worst-case CPU on untrusted input
///
/// The decoder's per-codeword unary-run cap `q ≤ u32::MAX >> k`
/// (spec §4.2) prevents any single codeword from consuming unbounded
/// CPU. Total decode work per frame is bounded by the input buffer's
/// byte length — long unary runs require proportional bytes of zero
/// bits to encode — so a sane application-layer cap on the input
/// buffer (e.g. reject frames larger than the negotiated MTU plus a
/// generous sanity margin) also caps decode CPU. This isn't LAC-
/// specific advice, but it's the place to apply it: no internal
/// decoder timer or allocation limit enforces it.
///
/// # Error recovery
///
/// The frame length needed for silence substitution can always be
/// recovered from `data` when the header itself was well-formed. Call
/// [`parse_header`] on the same buffer; the returned
/// [`AudioFrameHeader::frame_sample_count`] is the count to zero-fill.
/// This works for any error raised after the header parse (`Truncated`
/// in the bitstream, `InvalidParameter` during Rice decode) because
/// those errors don't invalidate the already-parsed header.
///
/// When `parse_header` itself errors — `BadSyncWord`, one of the
/// field-range variants, `Truncated` at the header — the length is
/// genuinely unknowable from the frame alone, and the caller must fall
/// back to the session-level default frame size (the container or
/// transport typically carries this out-of-band).
///
/// ```no_run
/// use lac::{decode_frame, parse_header, DecodeError};
/// # fn get_frame() -> Vec<u8> { Vec::new() }
/// # const SESSION_DEFAULT_FRAME: usize = 320;
///
/// let data = get_frame();
/// let samples = match decode_frame(&data) {
///     Ok(s) => s,
///     Err(_) => {
///         let count = parse_header(&data)
///             .map(|(h, _)| h.frame_sample_count as usize)
///             .unwrap_or(SESSION_DEFAULT_FRAME);
///         vec![0i32; count]
///     }
/// };
/// ```
pub fn decode_frame(data: &[u8]) -> Result<Vec<i32>, DecodeError> {
    let mut samples = Vec::new();
    decode_frame_into(data, &mut samples).map(|()| samples)
}

/// Like [`decode_frame`] but writes reconstructed samples into the
/// caller's `Vec<i32>`. The buffer is cleared first — on the error paths
/// too — and its existing capacity is retained so the MCU decode path can
/// reuse one allocation per participant stream across every frame tick.
///
/// # Errors
///
/// Propagates any [`DecodeError`] from [`parse_header`] or from Rice
/// decoding of the residuals. Both happen before synthesis writes to
/// `out`, so on error `out` is empty: a reused buffer never retains the
/// previous frame's samples. Callers must still treat an error as a
/// structural failure and substitute silence.
pub fn decode_frame_into(data: &[u8], out: &mut Vec<i32>) -> Result<(), DecodeError> {
    // Clear before anything can fail. Every fallible step below returns
    // before synthesis touches `out`, so without this an early error would
    // leave the previous frame's samples in a reused buffer.
    out.clear();

    let (header, header_len) = parse_header(data)?;
    // `parse_header` only succeeds when `data` holds at least `header_len`
    // bytes, so this slice cannot go out of bounds.
    let rice_data = &data[header_len..];
    let count = header.frame_sample_count as usize;

    // Scratch residuals buffer: one heap allocation per decode. For a
    // truly alloc-free decode path a caller-owned scratch struct could
    // hold this, but the decode hot path is cheap enough that the
    // extra API surface isn't worth it yet — encode allocations
    // dominated the MCU hot loop, not decode.
    let mut residuals: Vec<i32> = Vec::with_capacity(count);
    rice_decode_into(rice_data, header.partition_order, count, &mut residuals)?;
    lpc_synthesize_into(
        &residuals,
        &header.lpc_coefficients,
        header.coefficient_shift,
        out,
    );
    Ok(())
}

// ── Tests ───────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;
    use alloc::vec::Vec;

    use crate::test_signals::{angular_step_q32, sine_samples as int_sine_samples};

    /// Test-signal shim that keeps the call shape of the former float
    /// `sine_samples`: it takes a frequency and a sample rate, converts
    /// them to an angular step with `angular_step_q32`, and hands
    /// generation of the `n` samples at `amplitude` to the generator in
    /// `crate::test_signals`, so the tests themselves do no float math.
    fn sine_samples(n: usize, freq_hz: u64, sample_rate: u64, amplitude: i32) -> Vec<i32> {
        int_sine_samples(n, angular_step_q32(freq_hz, sample_rate), amplitude)
    }

    /// A 960-sample, 440 Hz sine (48 kHz rate) must survive
    /// encode → decode unchanged.
    #[test]
    fn roundtrip_sine_440hz() {
        let original = sine_samples(960, 440, 48_000, 100_000);
        let frame = encode_frame(&original);
        let restored = decode_frame(&frame).unwrap();
        assert_eq!(restored, original, "sine frame roundtrip failed");
    }

    /// An all-zero frame must round-trip bit-exactly.
    #[test]
    fn roundtrip_silence() {
        let silence = vec![0i32; 960];
        let frame = encode_frame(&silence);
        let restored = decode_frame(&frame).unwrap();
        assert_eq!(restored, silence);
    }

    /// For all-zero input the encoded header must report prediction
    /// order 0. The encoder is expected to skip Levinson-Durbin here,
    /// since dividing by R[0] = 0 would panic.
    #[test]
    fn silence_uses_order_zero() {
        let frame = encode_frame(&vec![0i32; 960]);
        let (parsed, _) = parse_header(&frame).unwrap();
        assert_eq!(parsed.prediction_order, 0);
    }

    /// Round-trip of a short 128-sample frame. 128 = 2^7, so the length
    /// divides evenly for every partition order up to 7.
    #[test]
    fn roundtrip_short_frame() {
        let original = sine_samples(128, 220, 48_000, 50_000);
        let frame = encode_frame(&original);
        let restored = decode_frame(&frame).unwrap();
        assert_eq!(restored, original);
    }

    /// A prime frame length (137) leaves partition_order = 0 as the only
    /// order that divides the frame evenly. The header must report 0 and
    /// the frame must still decode losslessly.
    #[test]
    fn roundtrip_non_power_of_two_length() {
        let original = sine_samples(137, 220, 48_000, 50_000);
        let frame = encode_frame(&original);

        let (parsed, _) = parse_header(&frame).unwrap();
        assert_eq!(parsed.partition_order, 0);

        let restored = decode_frame(&frame).unwrap();
        assert_eq!(restored, original);
    }

    /// Samples alternating between +(2^23 − 1) and −(2^23 − 1), i.e.
    /// near-maximum s24le amplitude with a sign flip every sample, to
    /// stress the overflow-sensitive paths in LPC and Rice coding.
    #[test]
    fn roundtrip_full_scale() {
        let peak: i32 = (1 << 23) - 1;
        let original: Vec<i32> = (0..1024)
            .map(|i| if i % 2 == 0 { peak } else { -peak })
            .collect();
        let frame = encode_frame(&original);
        let restored = decode_frame(&frame).unwrap();
        assert_eq!(restored, original);
    }

    /// The first two bytes of an encoded frame, read big-endian, must
    /// equal `SYNC_WORD`.
    #[test]
    fn sync_word_present() {
        let frame = encode_frame(&sine_samples(1024, 1000, 48_000, 10_000));
        let leading = [frame[0], frame[1]];
        assert_eq!(u16::from_be_bytes(leading), SYNC_WORD);
    }

    /// Overwriting the first sync byte with 0xFF must yield
    /// `BadSyncWord` carrying the corrupted value.
    #[test]
    fn decode_rejects_bad_sync() {
        let mut frame = encode_frame(&sine_samples(960, 440, 48_000, 1_000));
        frame[0] = 0xFF;
        // After the flip the sync word reads 0xFFCC. Matching on the
        // exact value checks that the offending word reaches the caller,
        // so an operator can tell an unknown sync (possibly a v2 peer)
        // from a single corrupted byte.
        let outcome = decode_frame(&frame);
        assert_eq!(outcome, Err(DecodeError::BadSyncWord { got: 0xFFCC }));
    }

    /// A prediction-order byte one above `MAX_LPC_ORDER` must be rejected
    /// with `InvalidPredictionOrder`, echoing the bad value.
    #[test]
    fn decode_rejects_order_above_max() {
        let bad_order = MAX_LPC_ORDER + 1;
        let mut frame = encode_frame(&sine_samples(960, 440, 48_000, 1_000));
        frame[2] = bad_order;
        let outcome = decode_frame(&frame);
        assert_eq!(
            outcome,
            Err(DecodeError::InvalidPredictionOrder { got: bad_order })
        );
    }

    /// A partition-order byte one above `MAX_PARTITION_ORDER` must be
    /// rejected with `InvalidPartitionOrder`, echoing the bad value.
    #[test]
    fn decode_rejects_partition_order_above_max() {
        let bad_order = MAX_PARTITION_ORDER + 1;
        let mut frame = encode_frame(&sine_samples(960, 440, 48_000, 1_000));
        frame[3] = bad_order;
        let outcome = decode_frame(&frame);
        assert_eq!(
            outcome,
            Err(DecodeError::InvalidPartitionOrder { got: bad_order })
        );
    }

    /// A hand-built header whose sample count (7) is not divisible by the
    /// partition count implied by partition_order = 3 (8 partitions) must
    /// be rejected by `parse_header` with `InvalidParameter`.
    #[test]
    fn decode_rejects_mismatched_partition_count() {
        let [sync_hi, sync_lo] = SYNC_WORD.to_be_bytes();
        let [count_hi, count_lo] = 7u16.to_be_bytes();
        let header = vec![
            sync_hi, sync_lo, // sync word
            0,        // prediction_order
            3,        // partition_order = 3 → 8 partitions
            0,        // coefficient_shift
            count_hi, count_lo, // frame_sample_count = 7
        ];
        assert_eq!(parse_header(&header), Err(DecodeError::InvalidParameter));
    }

    /// A `coefficient_shift` byte above `MAX_COEFFICIENT_SHIFT` must be
    /// reported as `InvalidCoefficientShift` with the offending value.
    ///
    /// NOTE(review): the assertion only runs when the encoder picked a
    /// non-zero prediction order for this signal; should it ever choose
    /// order 0 the test passes without checking anything.
    #[test]
    fn decode_rejects_coefficient_shift_above_max() {
        let mut frame = encode_frame(&sine_samples(960, 440, 48_000, 10_000));
        let (parsed, _) = parse_header(&frame).unwrap();
        // An out-of-range shift is only meaningful when coefficients are
        // present: order 0 forces shift 0 by construction and
        // `parse_header` treats that combination separately.
        if parsed.prediction_order == 0 {
            return;
        }

        let bad_shift = MAX_COEFFICIENT_SHIFT + 1;
        frame[4] = bad_shift;
        assert_eq!(
            decode_frame(&frame),
            Err(DecodeError::InvalidCoefficientShift { got: bad_shift })
        );
    }

    /// A verbatim header (prediction_order = 0) that nevertheless carries
    /// coefficient_shift = 3 contradicts spec §3.4, which requires shift 0
    /// for verbatim frames. `parse_header` must report the dedicated
    /// `CoefficientShiftWithoutOrder` error so a corrupted shift can be
    /// told apart from a corrupted order.
    #[test]
    fn decode_rejects_coefficient_shift_without_order() {
        let [sync_hi, sync_lo] = SYNC_WORD.to_be_bytes();
        let [count_hi, count_lo] = 320u16.to_be_bytes();
        let header = vec![
            sync_hi, sync_lo, // sync word
            0,        // prediction_order = 0 (verbatim)
            0,        // partition_order
            3,        // coefficient_shift ≠ 0 — contradiction
            count_hi, count_lo, // frame_sample_count = 320
        ];
        let outcome = parse_header(&header);
        assert_eq!(
            outcome,
            Err(DecodeError::CoefficientShiftWithoutOrder { shift: 3 })
        );
    }

    /// Only the first 6 bytes of a valid frame are supplied; the decoder
    /// must answer `Truncated`.
    #[test]
    fn decode_rejects_truncated_header() {
        let frame = encode_frame(&sine_samples(960, 440, 48_000, 1_000));
        let outcome = decode_frame(&frame[..6]);
        assert_eq!(outcome, Err(DecodeError::Truncated));
    }

    /// Cutting a frame one byte before the end of its coefficient array
    /// must yield `Truncated`.
    ///
    /// NOTE(review): like the shift test, this only asserts when the
    /// encoder chose a non-zero prediction order for the signal.
    #[test]
    fn decode_rejects_truncated_coefficients() {
        let frame = encode_frame(&sine_samples(960, 440, 48_000, 10_000));
        let (parsed, _) = parse_header(&frame).unwrap();
        if parsed.prediction_order == 0 {
            return;
        }

        // 7 bytes precede the coefficients, each coefficient takes two
        // bytes; stop one byte short of the last coefficient's end.
        let coefficients_end = 7 + parsed.prediction_order as usize * 2;
        let short = &frame[..coefficients_end - 1];
        assert_eq!(decode_frame(short), Err(DecodeError::Truncated));
    }

    /// For a pure 440 Hz tone the encoder must settle on a non-zero
    /// prediction order.
    #[test]
    fn higher_order_chosen_for_tonal_signal() {
        let frame = encode_frame(&sine_samples(960, 440, 48_000, 500_000));
        let (parsed, _) = parse_header(&frame).unwrap();
        let order = parsed.prediction_order;
        assert!(
            order > 0,
            "expected non-zero order for tonal signal, got {}",
            order
        );
    }

    /// Round-trips a 1 kHz sine at a spread of frame lengths, including
    /// ones that are not powers of two (120, 480, 960) and therefore
    /// divide evenly for only some partition orders.
    #[test]
    fn roundtrip_various_lengths() {
        for n in [64usize, 120, 256, 480, 960, 1024, 2048, 4096] {
            let original = sine_samples(n, 1000, 48_000, 10_000);
            let restored = decode_frame(&encode_frame(&original)).unwrap();
            assert_eq!(restored, original, "roundtrip failed at n={n}");
        }
    }

    /// Lossless round-trip of a three-segment, drum-hit-like frame: a
    /// quiet lead-in, a large-magnitude burst, then a decaying sine tail.
    /// Signals with this much level variation are where partitioned Rice
    /// coding is meant to pay off; this test asserts only the round-trip,
    /// not which partition order the encoder selects.
    #[test]
    fn roundtrip_transient_burst() {
        let mut burst: Vec<i32> = Vec::with_capacity(1024);

        // Segment 1: 256 small samples cycling through −6..=6.
        burst.extend((0..256i32).map(|i| (i % 13) - 6));

        // Segment 2: 256 samples of large magnitude, starting at −200 000.
        burst.extend((0..256i32).map(|i| ((i * 31) % 400_000) - 200_000));

        // Segment 3: 512-sample sine tail whose amplitude falls linearly
        // from 50 000 toward 0. The angular step is ≈ 0.2 rad/sample
        // (200/6283 ≈ 0.2/(2π)), expressed in Q32 units; everything is
        // integer arithmetic.
        let step = crate::test_signals::angular_step_q32(200, 6283);
        let mut phase: u32 = 0;
        for remaining in (1..=512i32).rev() {
            let decay = 50_000 * remaining / 512;
            let s = crate::test_signals::sin_q15(phase);
            // Scale by the Q15 sine: add half an LSB, then shift down 15.
            let scaled = (s as i64 * decay as i64 + (1 << 14)) >> 15;
            burst.push(scaled as i32);
            phase = phase.wrapping_add(step);
        }

        let frame = encode_frame(&burst);
        let restored = decode_frame(&frame).unwrap();
        assert_eq!(restored, burst);
    }

    /// Spec §3.5 requires frame_sample_count ≥ 1. A count of 0 passes the
    /// partition-divisibility check trivially (0 % anything == 0), so the
    /// parser needs an explicit guard. Both `parse_header` and
    /// `decode_frame` must report `InvalidParameter`.
    #[test]
    fn decode_rejects_zero_sample_count() {
        let [sync_hi, sync_lo] = SYNC_WORD.to_be_bytes();
        let [count_hi, count_lo] = 0u16.to_be_bytes();
        let header = vec![
            sync_hi, sync_lo, // sync word
            0,        // prediction_order
            0,        // partition_order
            0,        // coefficient_shift
            count_hi, count_lo, // frame_sample_count = 0
        ];

        let from_parser = parse_header(&header);
        assert_eq!(
            from_parser,
            Err(DecodeError::InvalidParameter),
            "a zero-sample frame must be rejected"
        );

        let from_decoder = decode_frame(&header);
        assert_eq!(
            from_decoder,
            Err(DecodeError::InvalidParameter),
            "decode_frame must surface the zero-count rejection"
        );
    }

    /// Spec §5.2 allows frame_sample_count = 1. With one sample the only
    /// admissible partition_order is 0, and no LPC order ≥ 1 has any
    /// predecessor to predict from, so the encoder must either choose
    /// order 0 or an order whose warm-up region spans the whole frame.
    /// Checked for zero, small, mid-range, and ± full-scale values.
    #[test]
    fn roundtrip_single_sample() {
        const FULL_SCALE: i32 = (1 << 23) - 1;
        for v in [0i32, 1, -1, 123_456, -FULL_SCALE, FULL_SCALE] {
            let original = vec![v];
            let frame = encode_frame(&original);

            let (parsed, _) = parse_header(&frame).unwrap();
            assert_eq!(parsed.frame_sample_count, 1);
            assert_eq!(parsed.partition_order, 0);

            let restored = decode_frame(&frame).unwrap();
            assert_eq!(restored, original, "single-sample roundtrip failed for v={v}");
        }
    }

    /// Single-sample frames encoded with the order search pinned to 16
    /// and then 32.
    ///
    /// `encode_frame` would naturally choose order 0 here (no coefficient
    /// header to pay for), so this goes through `encode_frame_with_grid`
    /// with a one-entry grid to reach the LPC warm-up path at i = 0. For
    /// any order ≥ 1 there are `min(0, order) = 0` predictor terms, so
    /// `residual[0] = sample[0]`, and the decoder mirrors that logic. The
    /// round-trip must be bit-exact whether the header keeps the forced
    /// order or falls back to 0 (the consistency guard resets the order
    /// when Levinson-Durbin cannot produce valid coefficients, e.g. for
    /// v == 0).
    #[test]
    fn roundtrip_single_sample_forced_high_order() {
        const FULL_SCALE: i32 = (1 << 23) - 1;
        for forced_order in [16u8, 32u8] {
            for v in [0i32, 1, -1, 123_456, FULL_SCALE] {
                let original = vec![v];
                let mut frame = Vec::new();
                encode_frame_with_grid(&original, &[forced_order], u8::MAX, &mut frame);

                let (parsed, _) =
                    parse_header(&frame).expect("forced-order encoder output must parse");
                assert_eq!(parsed.frame_sample_count, 1);

                let restored = decode_frame(&frame).unwrap();
                assert_eq!(
                    restored, original,
                    "single-sample round-trip failed at forced order {forced_order} for v={v}"
                );
            }
        }
    }

    /// Boundary check at 65535 samples, the largest count the u16 header
    /// field can carry. An off-by-one in the encoder's length `assert!`
    /// (the guard against silent truncation) would break this
    /// round-trip. The input is a deterministic ramp so the test needs
    /// no WAV loading and stays fast.
    #[test]
    fn roundtrip_frame_at_u16_max() {
        let frame_len = u16::MAX as usize;
        let original: Vec<i32> = (0..u16::MAX as i32)
            .map(|i| i.wrapping_mul(17) & 0xFFFF)
            .collect();
        assert_eq!(original.len(), frame_len);

        let frame = encode_frame(&original);
        let restored = decode_frame(&frame).unwrap();
        assert_eq!(restored, original);
    }

    /// 65536 samples is one past the u16 limit and must panic at the
    /// public entry point. Letting `samples.len() as u16` wrap to 0 would
    /// instead emit a frame that the decoder rejects for
    /// `frame_sample_count = 0`, giving the caller no hint that the spec
    /// limit was exceeded.
    #[test]
    #[should_panic(expected = "exceeds u16::MAX")]
    fn encode_panics_at_frame_above_u16_max() {
        let oversized = vec![0i32; u16::MAX as usize + 1];
        let _ = encode_frame(&oversized);
    }

    /// The decoder must not panic on a hand-built frame that pairs
    /// maximum-order, extreme-magnitude coefficients with large
    /// residuals.
    ///
    /// Before the fix, the synthesis step `res + pred as i32` overflowed
    /// and panicked in debug builds; the wrapping add restores the
    /// "no panic on any byte sequence" contract in debug and release.
    /// The decode result is deliberately discarded: `Ok` with garbage
    /// samples and `Err(InvalidParameter)` (if `q` crosses the
    /// `u32::MAX >> k` threshold) are both acceptable outcomes.
    #[test]
    fn decode_panic_free_on_adversarial_coefficients_and_residuals() {
        let sample_count = 64u16;

        // Fixed header fields.
        let mut frame = Vec::new();
        frame.extend_from_slice(&SYNC_WORD.to_be_bytes());
        frame.push(32); // prediction_order = 32 (max)
        frame.push(0); // partition_order = 0
        frame.push(5); // coefficient_shift = 5 (Q10 — widest real range)
        frame.extend_from_slice(&sample_count.to_be_bytes());

        // 32 coefficients alternating +32767 / −32768. At Q10 scale that
        // is |real| ≈ 32, so the prediction accumulator can reach about
        // 32 × 32 × 2^23 ≈ 2^33 — enough to overflow i32 once residuals
        // push the reconstructed samples further.
        for c in [i16::MAX, i16::MIN].iter().cycle().take(32) {
            frame.extend_from_slice(&c.to_be_bytes());
        }

        // Rice bitstream: a single partition with a 5-bit `k = 0`, then
        // one codeword per sample. With k = 0 a zigzag value `q` is `q`
        // zero-bits plus a terminating 1-bit; q = 8192 unzigzags to a
        // residual of about 2^12.
        {
            let mut bits = crate::bit_io::BitWriter::new(&mut frame);
            bits.write_bits(0, 5); // k = 0
            for _ in 0..sample_count {
                for _ in 0..8192 {
                    bits.write_bit(false);
                }
                bits.write_bit(true);
            }
            bits.finish();
        }

        // Only "did not panic" is checked. Debug builds
        // (cfg(debug_assertions)) are where the old overflow surfaced.
        let _ = decode_frame(&frame);
    }

    /// Compares the sparse LPC order grid (the production default)
    /// against an exhaustive `0..=32` search on real speech. It lives in
    /// the unit-test module so `encode_frame_with_grid` can remain
    /// `pub(crate)`: the grid parameterisation is internal and not part
    /// of the crate's semver surface.
    ///
    /// For every frame in the first minute of a headset-speech recording
    /// it checks that:
    ///
    /// 1. Both bitstreams decode losslessly.
    /// 2. The sparse path's total byte count stays within a small
    ///    regression budget of the exhaustive result. README claims
    ///    ≤ 0.1 pp on speech; the 0.5 % ceiling used here leaves headroom
    ///    while still catching a real regression.
    ///
    /// The test is skipped (returns early) when the corpus file is
    /// absent.
    #[test]
    fn sparse_vs_exhaustive_on_headset_speech() {
        // The crate is `#![no_std]`, but `cfg(test)` builds link std —
        // test binaries run on the host. A local `extern crate std`
        // makes `std::path` and `eprintln!` reachable from this block
        // without affecting the no_std library code.
        extern crate std;
        use hound::WavReader;
        use std::eprintln;
        use std::path::Path;

        const CORPUS_PATH: &str = "corpus/ES2002a.Headset-0.wav";
        const FRAME_SIZE: usize = 4096;

        let corpus = Path::new(CORPUS_PATH);
        if !corpus.exists() {
            eprintln!("skipping: corpus file not found: {}", corpus.display());
            return;
        }

        // 16 kHz mono AMI recording, read as i32 samples.
        let mut wav = WavReader::open(corpus).expect("open headset wav");
        let all_samples: Vec<i32> = wav
            .samples::<i32>()
            .collect::<Result<Vec<_>, _>>()
            .expect("parse samples");
        // Keep at most one minute at 16 kHz: plenty of frames without
        // turning this into a minutes-long test.
        let one_minute = all_samples.len().min(16_000 * 60);
        let speech = &all_samples[..one_minute];

        // Sparse grid: what `encode_frame` uses in production.
        let sparse_grid: &[u8] = &[0, 2, 4, 6, 8, 10, 12, 16, 20, 24, 28, 32];
        // Exhaustive grid: every integer order 0..=32, patience disabled.
        let exhaustive_grid: Vec<u8> = (0u8..=32).collect();

        let mut sparse_frame = Vec::new();
        let mut exhaustive_frame = Vec::new();
        let mut sparse_bytes = 0usize;
        let mut exhaustive_bytes = 0usize;

        for chunk in speech.chunks(FRAME_SIZE) {
            encode_frame_with_grid(chunk, sparse_grid, 2, &mut sparse_frame);
            encode_frame_with_grid(chunk, &exhaustive_grid, u8::MAX, &mut exhaustive_frame);

            // Losslessness is required of both search strategies.
            assert_eq!(decode_frame(&sparse_frame).unwrap(), chunk);
            assert_eq!(decode_frame(&exhaustive_frame).unwrap(), chunk);

            sparse_bytes += sparse_frame.len();
            exhaustive_bytes += exhaustive_frame.len();
        }

        // The sparse grid is a strict subset of the exhaustive search
        // space, so it should never come out smaller — and the excess
        // should be tiny.
        assert!(
            sparse_bytes >= exhaustive_bytes,
            "sparse smaller than exhaustive? sparse={} exhaustive={}",
            sparse_bytes,
            exhaustive_bytes
        );

        let excess = (sparse_bytes as f64 / exhaustive_bytes as f64) - 1.0;
        let excess_percent = excess * 100.0;
        eprintln!(
            "sparse_vs_exhaustive_on_headset_speech  sparse={}  exhaustive={}  excess={:.2}%",
            sparse_bytes,
            exhaustive_bytes,
            excess_percent
        );
        // 0.5 % budget; a regression that widens the gap beyond it fails
        // here.
        assert!(
            excess < 0.005,
            "sparse grid is {:.2}% larger than exhaustive (budget 0.5%)",
            excess_percent
        );
    }
}