lac 0.1.0

Lo Audio Codec — lossless audio codec with LPC + partitioned Rice coding.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
//! LPC analysis and synthesis.
//!
//! Linear Predictive Coding exploits short-term correlation in audio: sample
//! `x[n]` is predicted as a weighted sum of its predecessors, and only the
//! small prediction residual is coded. LAC supports any order in `[0, 32]`;
//! the encoder picks whichever minimises the total bitstream size.
//!
//! # Integer widths and Q-formats at a glance
//!
//! Every numeric quantity that crosses a boundary in LAC's codec path is
//! pinned to an exact width and, where fractional, an exact Q-format.
//! This table is the authoritative summary; individual call sites repeat
//! the relevant row in a local comment rather than re-deriving it.
//!
//! | Stage                          | Rust type | Q-format              | Range / constraint                          |
//! |--------------------------------|-----------|-----------------------|---------------------------------------------|
//! | Input samples                  | `i32`     | integer               | `|s| ≤ 2²³ − 1` (spec §1, caller contract)  |
//! | Autocorrelation `R[k]`         | `i64`     | integer               | worst case `N · s² ≈ 2⁶²` at full scale     |
//! | Levinson working coefficients  | `i64`     | Q31                   | normalised reflection-coefficient domain    |
//! | Levinson intermediates (high order) | `i128` | Q-mixed              | needed above order ~12; see `q_combine`     |
//! | Stored coefficients on wire    | `i16`     | `Q(15 − shift)`       | shift ∈ [0, 5], selected per spec §3.4      |
//! | Synthesis accumulator          | `i64`     | `Q(15 − shift)`·scale | ≥ 49 bits required (spec §3.6)              |
//! | Residuals                      | `i32`     | integer               | `|r| ≤ 2 · 2²³` worst case before zigzag    |
//! | Zigzag-encoded residuals       | `u32`     | integer               | `z = (r << 1) ^ (r >> 31)` per spec §4.2    |
//!
//! # Sign convention
//!
//! The wire-format synthesis formula is `predict = +Σ coeff[j] · sample[i-j-1]`.
//! Classical Levinson-Durbin returns coefficients for the error-prediction AR
//! model where `x[n] = −Σ a[j] · x[n-j] + e[n]`; LAC stores
//! `coeff[j-1] = −a[j]` on the wire — the quantisation step negates at the
//! Q31→Q(15−shift) stage.
//!
//! # Rounding
//!
//! `(sum + bias) >> s` with `bias = 1 << (s − 1)` implements round-half
//! toward +∞ via arithmetic right shift — i.e., floor division of the
//! bias-adjusted value by `2^s`. Equivalent to round-to-nearest for
//! positive sums; for negative sums it rounds toward zero on exact
//! half-values, matching spec §3.6's pinned semantics.

use alloc::vec::Vec;

use crate::MAX_LPC_ORDER;

/// Maximum supported `coefficient_shift`. At this value, coefficients span
/// real-value range `[−32, 32)` — far beyond anything audio produces in
/// practice. The bitstream decoder rejects larger values.
///
/// `min_shift_for` clamps its result to this bound, so an ill-conditioned
/// recursion emits saturated coefficients rather than an out-of-spec shift.
pub const MAX_COEFFICIENT_SHIFT: u8 = 5;

/// One predictor order's quantised coefficients together with the
/// `coefficient_shift` they were quantised at. Handed out per order by
/// `LpcLevels::get`.
///
/// `coefficients` borrows directly from the flat `LpcLevels` storage, so
/// the encoder's order-search loop can inspect every candidate order
/// without allocating per iteration.
pub struct LpcOrderView<'a> {
    /// Predictor coefficients in Q(15 − shift); the slice length is the
    /// predictor order itself.
    pub coefficients: &'a [i16],
    /// The quantisation shift. The decoder widens the synthesis
    /// right-shift by exactly this amount to undo it.
    pub shift: u8,
}

/// Contiguous, stack-friendly storage for quantised LPC coefficients at
/// every order `1..=MAX_LPC_ORDER`.
///
/// Order `m` owns row `m − 1`: its coefficients live in
/// `flat[(m-1) * MAX_LPC_ORDER .. (m-1) * MAX_LPC_ORDER + m]`, i.e. the
/// first `m` slots of that row. Roughly half the buffer is unused by
/// this triangular layout, but the payoff is that fetching any order is
/// a plain slice borrow with no allocation — unlike the earlier
/// `Vec<Vec<i16>>`-style `LpcOrderResult` API.
///
/// With `MAX_LPC_ORDER = 32` the coefficient array is
/// `32 × 32 × 2 = 2048` bytes plus 32 bytes of per-order shifts —
/// small enough to live on the stack.
pub struct LpcLevels {
    flat: [i16; (MAX_LPC_ORDER as usize) * (MAX_LPC_ORDER as usize)],
    shifts: [u8; MAX_LPC_ORDER as usize],
    /// Highest order covered by the most recent successful
    /// `lpc_analyze_levels_into` run; 0 while the buffer is untouched.
    max_order: u8,
}

impl LpcLevels {
    /// Build a zeroed buffer. This is a ~2 KB stack zero-fill — still
    /// cheaper than the heap churn a Vec-of-Vec design would incur.
    #[inline]
    pub fn new() -> Self {
        Self {
            flat: [0; (MAX_LPC_ORDER as usize) * (MAX_LPC_ORDER as usize)],
            shifts: [0; MAX_LPC_ORDER as usize],
            max_order: 0,
        }
    }

    /// Borrow the coefficients and shift stored for predictor order `m`.
    /// Debug-asserts that `m` is nonzero and no larger than the
    /// `max_order` covered by the last analysis.
    #[inline]
    pub fn get(&self, m: u8) -> LpcOrderView<'_> {
        debug_assert!(m >= 1 && m <= self.max_order, "order {m} out of range");
        let row = m as usize - 1;
        let start = row * (MAX_LPC_ORDER as usize);
        let end = start + m as usize;
        LpcOrderView {
            coefficients: &self.flat[start..end],
            shift: self.shifts[row],
        }
    }
}

impl Default for LpcLevels {
    fn default() -> Self {
        Self::new()
    }
}

/// Compute the biased autocorrelation `R[0..=order]` of `samples`,
/// writing into the caller's buffer. `out.len()` must be at least
/// `order + 1`; exactly the first `order + 1` entries are written.
///
/// `R[k] = Σ samples[i] × samples[i+k]` for `i = 0 .. N-1-k`. A lag `k`
/// that exceeds the sample count produces an empty sum, i.e. `R[k] = 0`.
///
/// # Overflow analysis
///
/// Samples are s24le: `|sample| ≤ 2^23 − 1 < 2^23`, so each product fits
/// in i48, and a sum of at most `N = 65535` such terms stays below
/// `65535 × 2^46 ≈ 2^62 < 2^63`. i64 therefore suffices at every lag.
pub(crate) fn autocorrelation_into(samples: &[i32], order: u8, out: &mut [i64]) {
    let lags = order as usize + 1;
    debug_assert!(
        out.len() >= lags,
        "autocorrelation_into: out.len()={} too small for order={order}",
        out.len()
    );
    for (k, slot) in out.iter_mut().take(lags).enumerate() {
        // Lag-k dot product of the signal with itself, shifted by k:
        // zipping `samples` against `samples[k..]` covers exactly the
        // i = 0 .. N-1-k range (and is empty when k >= N).
        *slot = samples
            .iter()
            .zip(samples.iter().skip(k))
            .map(|(&x, &y)| x as i64 * y as i64)
            .sum();
    }
}

/// Levinson-Durbin recursion producing Q31 analysis filter coefficients for
/// every order from 1 to `order` in a single pass, quantised into `levels`.
///
/// Entry `m-1` of the flat buffer holds the quantised coefficients for
/// predictor order `m`. All working coefficients are in the analysis-filter
/// convention:
///
/// ```text
/// A(z) = 1 + Σ_{j=1..=m} a[j] · z^{-j}
/// e[n] = x[n] + Σ_{j=1..=m} a[j] · x[n-j]
/// ```
///
/// The bitstream stores predictor coefficients `−a[j]` so the synthesis formula
/// reduces to a plain positive sum; the sign flip is applied by `q31_to_qn`.
///
/// # Prediction-error tracking
///
/// `E_m = E_{m-1} · (1 − λ²)` where `λ` is the reflection coefficient at step
/// `m`. Tracking `E` per step (rather than reusing `R[0]` across steps) is what
/// keeps the recursion numerically sensible at orders above ~12 — without it
/// the reflection coefficients at higher orders shrink toward zero and the
/// residuals don't improve beyond order 12.
///
/// # i128 intermediates
///
/// Three products require widening:
/// 1. `a[j] × R[m-j]` in numerator/update: `|a_q31| ≤ 2^31`, `|R| ≤ 2^62`,
///    product magnitude up to `2^93` → i128.
/// 2. `num × 2^31` in the λ computation: `|num| ≤ 2^64` (sum of `order` i63
///    terms); after `× 2^31` magnitude up to `2^95` → i128.
/// 3. `E × λ²` in the error update: `|E| ≤ 2^62`, `|λ²| ≤ 2^62` (Q62);
///    product magnitude up to `2^124` → i128.
///
/// Returns `false` when `R[0] = 0` (all-zero input — the recursion is
/// undefined) or when the prediction error reaches zero before `order` steps
/// (singular autocorrelation matrix — rare, but possible on fully-predictable
/// synthetic inputs such as pure square waves). In the singular case the rows
/// for orders that never completed are zero-filled before `max_order` is set,
/// so callers always observe a fully-populated buffer.
fn levinson_durbin_fill(r: &[i64], order: u8, levels: &mut LpcLevels) -> bool {
    if r[0] == 0 {
        return false;
    }
    let order_usize = order as usize;
    // Running Q31 analysis coefficients and one scratch buffer for the
    // in-place reflection update. Stack-allocated at
    // `MAX_LPC_ORDER + 1` = 33 entries; no heap traffic per call.
    let mut a = [0i64; (MAX_LPC_ORDER as usize) + 1];
    let mut a_new = [0i64; (MAX_LPC_ORDER as usize) + 1];
    let mut e: i64 = r[0];
    let mut converged = true;
    // Highest order whose coefficients have been emitted into `levels`.
    // Used to bound the zero-fill on early bail-out.
    let mut completed: usize = 0;

    for m in 1..=order_usize {
        // Step 1: numerator of the reflection coefficient.
        //   num = R[m] + Σ_{j=1..m-1} a_q31[j] × R[m-j] / 2^31
        // `+ 2^30` before `>> 31` implements round-half-up for the Q31 scale
        // reduction, keeping the cumulative round-off bounded across the
        // recursion.
        let mut num: i128 = r[m] as i128;
        for j in 1..m {
            let prod = (a[j] as i128 * r[m - j] as i128 + (1i128 << 30)) >> 31;
            num += prod;
        }

        // Step 2: λ = −num × 2^31 / E, rounded to nearest.
        //
        // `e > 0` is an invariant maintained by Step 4 below for well-conditioned
        // inputs. If it fails (singular case), bail out; the post-loop fixup
        // zero-fills the remaining orders so the caller can rely on a
        // fully-populated buffer.
        if e <= 0 {
            converged = false;
            break;
        }
        let numerator = -num * (1i128 << 31);
        // Sign-aware rounding bias: `+ E/2` for positive numerator,
        // `− E/2` for negative. Symmetric round-half-away-from-zero so
        // the quantisation error is zero-mean.
        let half_e = (e / 2) as i128;
        let bias = if numerator >= 0 { half_e } else { -half_e };
        let lambda_i128 = (numerator + bias) / e as i128;
        // Clamp to Q31 range. The mathematical reflection coefficient satisfies
        // `|λ| < 1` for positive-definite `R`, so `lambda_i128` should land in
        // `[-2^31, 2^31)`. Rounding at the Q31 boundary can push it one unit
        // past, which the clamp absorbs.
        let lambda = lambda_i128.clamp(-(1i128 << 31), (1i128 << 31) - 1) as i64;

        // Step 3: reflection update — write the new coefficients into
        // `a_new`, then swap into `a`. Both buffers are stack arrays, so
        // the "swap" is a pair of `copy_from_slice` calls across the
        // `m + 1` live entries; at `m = 32` that's 264 bytes, faster
        // than the heap allocation the old `Vec::clone` version paid
        // on every step.
        a_new[..=m].copy_from_slice(&a[..=m]);
        for j in 1..m {
            let delta = (lambda as i128 * a[m - j] as i128 + (1i128 << 30)) >> 31;
            a_new[j] += delta as i64;
        }
        a_new[m] = lambda;
        a[..=m].copy_from_slice(&a_new[..=m]);

        // Emit this order's coefficients into the flat buffer,
        // quantised to Q(15 − shift) with the minimum shift that avoids
        // clamping.
        let shift = min_shift_for(&a[1..=m]);
        levels.shifts[m - 1] = shift;
        let base = (m - 1) * (MAX_LPC_ORDER as usize);
        for (dst, &coeff) in levels.flat[base..base + m].iter_mut().zip(&a[1..=m]) {
            *dst = q31_to_qn(coeff, shift);
        }
        completed = m;

        // Step 4: update the prediction-error tracker.
        //   E_new = E × (1 − λ² / 2^62)
        // λ is Q31, so λ² is Q62 with magnitude up to 2^62 (representing 1.0).
        // `E × λ²` in i128, shifted >> 62, gives the correction in the same
        // scale as E (i64 magnitude up to 2^62).
        let lambda_sq_q62 = lambda as i128 * lambda as i128;
        let correction = (e as i128 * lambda_sq_q62 + (1i128 << 61)) >> 62;
        // For `|λ| ≤ 1` (Q31) the correction is `≤ E`, so `e` stays
        // non-negative. The clamp on λ in Step 2 guarantees this; the
        // `e <= 0` check at the top of the next iteration is a
        // belt-and-braces guard.
        e -= correction as i64;
    }

    // On early bail (singular R), orders `completed + 1 ..= order` were
    // never emitted. Zero-fill their rows and shifts so callers always
    // see a fully-populated buffer matching `max_order` — otherwise a
    // reused `LpcLevels` could expose stale coefficients from a previous
    // analysis through `LpcLevels::get`. A zeroed row behaves as a
    // degenerate all-zero predictor (prediction 0), which is safe for
    // the encoder's order search.
    if !converged {
        for mm in completed + 1..=order_usize {
            let base = (mm - 1) * (MAX_LPC_ORDER as usize);
            levels.flat[base..base + mm].fill(0);
            levels.shifts[mm - 1] = 0;
        }
    }
    levels.max_order = order;

    converged
}

/// Smallest `coefficient_shift` at which every element of `coeffs_q31`
/// fits in Q(15 − shift) without clamping.
///
/// # Derivation
///
/// A Q15 i16 covers real values `[−1, 1)`; Q(15 − s) stretches that to
/// `[−2^s, 2^s)`. With `real = a_q31 / 2^31` the no-clamp condition
/// `|real| < 2^s` becomes `|a_q31| < 2^(31 + s)`, and the minimal `s`
/// over all elements is:
///
/// ```text
/// s = max(0, floor(log2(max_abs)) − 30)
/// ```
///
/// evaluated here as `64 − leading_zeros(max_abs) − 31`. The result is
/// capped at `MAX_COEFFICIENT_SHIFT` so the bitstream stays in spec; a
/// coefficient needing a larger shift signals an ill-conditioned
/// recursion, and the encoder emits it saturated rather than failing.
fn min_shift_for(coeffs_q31: &[i64]) -> u8 {
    // Peak magnitude across the coefficient set; 0 for an empty slice.
    let peak = coeffs_q31
        .iter()
        .fold(0u64, |acc, &c| acc.max(c.unsigned_abs()));
    if peak < (1u64 << 31) {
        // Everything already fits in plain Q15.
        0
    } else {
        let shift = (64u32 - peak.leading_zeros()).saturating_sub(31) as u8;
        shift.min(MAX_COEFFICIENT_SHIFT)
    }
}

/// Quantise a Q31 Levinson-Durbin analysis coefficient down to the
/// Q(15 − shift) predictor coefficient the bitstream carries.
///
/// # Sign convention
///
/// Levinson-Durbin yields analysis coefficients with error filter
/// `1 + Σ a[j] z^{-j}`, i.e. predictor `x̂[n] = −Σ a[j] x[n-j]`. The wire
/// format instead stores `coeff[j] = −a[j]` so synthesis is the plain
/// positive sum `x̂[n] = +Σ coeff[j] x[n-j]`; that negation happens here.
///
/// # Rounding
///
/// Adding `1 << (15 + shift)` before shifting right by `16 + shift`
/// divides by `2^(16 + shift)` with round-half-up. With `shift = 0` this
/// is the familiar Q15 reduction (`+ 2^15` then `>> 16`). Out-of-range
/// results saturate to the i16 limits.
fn q31_to_qn(a_q31: i64, shift: u8) -> i16 {
    let s = shift as u32;
    // Flip to the wire-format sign first, then scale-reduce.
    let negated = -a_q31;
    let rounded = (negated + (1i64 << (15 + s))) >> (16 + s);
    rounded.clamp(i64::from(i16::MIN), i64::from(i16::MAX)) as i16
}

/// Run LPC analysis into a caller-provided `LpcLevels` buffer, filling
/// quantised predictor coefficients and shifts for all orders
/// `1..=max_order` with a single Levinson-Durbin pass.
///
/// Returns `true` on success. Returns `false` for all-zero input
/// (`R[0] = 0`), in which case the caller must fall back to order 0
/// (verbatim) — `levels` then carries nothing beyond its
/// zero-initialised state.
///
/// # Parameters
///
/// `max_order` must lie in `1 ..= MAX_LPC_ORDER` (debug-asserted);
/// order 0 has no coefficients and is not represented here.
pub fn lpc_analyze_levels_into(samples: &[i32], max_order: u8, levels: &mut LpcLevels) -> bool {
    debug_assert!(
        max_order >= 1,
        "max_order must be ≥ 1 for lpc_analyze_levels_into"
    );
    debug_assert!(
        max_order <= MAX_LPC_ORDER,
        "max_order={max_order} exceeds MAX_LPC_ORDER={MAX_LPC_ORDER}"
    );
    // Autocorrelation scratch on the stack (no heap traffic), sized for
    // the maximum supported order plus the R[0] slot.
    let mut autocorr = [0i64; (MAX_LPC_ORDER as usize) + 1];
    autocorrelation_into(samples, max_order, &mut autocorr);
    let lags = max_order as usize + 1;
    levinson_durbin_fill(&autocorr[..lags], max_order, levels)
}

/// Convenience wrapper: quantised predictor coefficients at exactly
/// `order`, paired with the matching `coefficient_shift`.
///
/// Order 0 (verbatim mode) yields `Some((vec![], 0))` — there are no
/// coefficients and the shift is reported as 0 by convention. All-zero
/// input yields `None`. Otherwise Levinson-Durbin runs up to `order`
/// and the final row is copied out of a temporary `LpcLevels`.
///
/// Allocates one `Vec<i16>` of length `order` for the return; hot paths
/// should instead reuse an `LpcLevels` via `lpc_analyze_levels_into`
/// and read results through `LpcLevels::get`.
#[cfg(test)]
pub(crate) fn lpc_analyze(samples: &[i32], order: u8) -> Option<(Vec<i16>, u8)> {
    if order == 0 {
        return Some((Vec::new(), 0));
    }
    let mut levels = LpcLevels::new();
    lpc_analyze_levels_into(samples, order, &mut levels).then(|| {
        let view = levels.get(order);
        (view.coefficients.to_vec(), view.shift)
    })
}

/// Compute LPC prediction residuals for a frame (allocating wrapper).
///
/// `residual[i] = sample[i] − predict(sample[0..i], coeffs, shift)`.
///
/// # Prediction formula
///
/// With `s = 15 − shift` and Q(15 − shift) coefficients:
///
/// ```text
/// predict[i] = (Σ_{j=0..terms-1} coeffs[j] × sample[i-j-1] + (1 << (s-1))) >> s
/// ```
///
/// where `terms = min(i, order)`. The i64 accumulator has ample
/// headroom: at `shift = 0` each product is bounded by
/// `2^15 × 2^23 = 2^38` (32 terms → `2^43`); at
/// `shift = MAX_COEFFICIENT_SHIFT = 5` each product is bounded by
/// `2^43` (32 terms → `2^48`).
///
/// The `+ (1 << (s-1))` bias rounds the shift to nearest — lower
/// residual variance than truncation; at `shift = 0` it is the classic
/// Q15 `+ 16384` (`= 2^14`).
///
/// # Warm-up period
///
/// Indices `0..order-1` have too few predecessors; `terms = min(i,
/// order)` restricts the sum to what exists, and the very first sample
/// (`i = 0`) is predicted as zero.
///
/// Only reachable from outside the crate through the
/// `__internal-for-bench` feature gate — production callers should use
/// `compute_residuals_into` and skip the allocation.
#[cfg(any(test, feature = "__internal-for-bench"))]
pub fn compute_residuals(samples: &[i32], coeffs: &[i16], shift: u8) -> Vec<i32> {
    let mut residuals = Vec::with_capacity(samples.len());
    compute_residuals_into(samples, coeffs, shift, &mut residuals);
    residuals
}

/// Residual computation into a caller-owned buffer — identical output
/// to the allocating `compute_residuals` wrapper, but lets the frame
/// encoder recycle one `Vec` across every order it evaluates. The
/// per-call allocation this removes showed up in profiling: ~100-200 ns
/// per call over ~15 orders per frame adds up to a few percent of
/// total encode time.
///
/// `out` is cleared on entry, so callers need not do it. Capacity is
/// reserved for `samples.len()` elements; the buffer grows at most once
/// and only the first time a larger frame arrives.
pub fn compute_residuals_into(samples: &[i32], coeffs: &[i16], shift: u8, out: &mut Vec<i32>) {
    out.clear();
    out.reserve(samples.len());

    let order = coeffs.len();
    // Verbatim fast path: with `order = 0` there is no prediction and
    // residuals are the samples themselves. This doubles as the
    // mandatory fallback for all-zero frames (Levinson-Durbin cannot
    // run there) and a cheap short-circuit in the encoder's
    // order-search loop.
    if order == 0 {
        out.extend_from_slice(samples);
        return;
    }

    // One up-front pass reverses and widens the coefficients. Reversal
    // turns the prediction's reversed-access dot product into a
    // forward-forward one; widening i16 → i32 lets LLVM lower the
    // mul-accumulate to an i32×i32→i64 `imul` (or auto-vectorised AVX)
    // instead of a slower i64×i64 path. The MAX_LPC_ORDER-sized stack
    // array keeps the heap out of the per-call picture.
    //
    // Benchmarks showed LLVM's autovectoriser handles this
    // forward-forward dot product as well as — at small orders, better
    // than — a hand-written AVX-512 kernel, whose function-call
    // boundaries cost more than the SIMD width returned. So: keep the
    // loop LLVM-friendly and let the optimiser work.
    let mut rev = [0i32; MAX_LPC_ORDER as usize];
    for (dst, &c) in rev.iter_mut().zip(coeffs.iter().rev()) {
        *dst = c as i32;
    }
    let coeffs_rev = &rev[..order];

    // Q(15 − shift) rescaling constants. The accumulated dot product
    // sits in Q(shift_amt) relative to sample units; `>> shift_amt`
    // brings it back, and `bias = 1 << (shift_amt − 1)` adds a half-LSB
    // for round-to-nearest (round-half-up for non-negative sums,
    // away-from-zero after the shift for negative ones). Both are
    // normative — bit-exact decoding depends on them.
    let shift_amt = 15u32 - shift as u32;
    let bias = 1i64 << (shift_amt - 1);
    let n = samples.len();

    // Warm-up region (`i < order`): the window `samples[..i]` is shorter
    // than the coefficient set, so only the *last* `i` entries of
    // `coeffs_rev` participate. Across the whole frame this is just
    // `order × (order − 1) / 2` iterations — not worth vectorising.
    let warm = order.min(n);
    for i in 0..warm {
        // At `i == 0` there are no predecessors: the prediction is
        // defined as exactly 0, NOT `(0 + bias) >> s` (spec §3.6 pins
        // this; applying the formula would yield `bias >> s` at shifts
        // where those differ and break the round-trip).
        let pred: i64 = if i == 0 {
            0
        } else {
            let tail = &coeffs_rev[order - i..];
            let sum: i64 = tail
                .iter()
                .zip(&samples[..i])
                .map(|(&c, &s)| c as i64 * s as i64)
                .sum();
            (sum + bias) >> shift_amt
        };
        out.push(samples[i] - pred as i32);
    }

    // Steady state (`i >= order`): fixed-length inner loop over two
    // forward slices — the shape LLVM vectorises to an AVX2/AVX-512
    // mul-accumulate in release builds.
    for i in order..n {
        let history = &samples[i - order..i];
        let sum: i64 = coeffs_rev
            .iter()
            .zip(history)
            .map(|(&c, &s)| c as i64 * s as i64)
            .sum();
        let pred = (sum + bias) >> shift_amt;
        out.push(samples[i] - pred as i32);
    }
}

/// Reconstruct samples from LPC residuals and Q(15 − shift) predictor
/// coefficients (allocating wrapper).
///
/// `sample[i] = residual[i] + predict(samples[0..i], coeffs, shift)`.
///
/// Prediction is causal — it consumes already-reconstructed samples —
/// which makes the round-trip exact:
/// `lpc_synthesize(compute_residuals(s, c, k), c, k) == s` for any
/// samples `s`, coefficients `c`, and shift `k`. Residuals travel
/// unquantised, so nothing is lost.
///
/// Formula, rounding, and warm-up handling match `compute_residuals`.
#[cfg(test)]
pub(crate) fn lpc_synthesize(residuals: &[i32], coeffs: &[i16], shift: u8) -> Vec<i32> {
    let mut samples = Vec::with_capacity(residuals.len());
    lpc_synthesize_into(residuals, coeffs, shift, &mut samples);
    samples
}

/// Reconstruct samples from residuals into the caller's buffer. `out`
/// is cleared first and filled with `residuals.len()` reconstructed
/// samples. For hot-path use — the MCU decode loop reuses one buffer
/// across every incoming frame.
///
/// Prediction formula, rounding convention, and warm-up handling mirror
/// `compute_residuals_into` in reverse.
pub fn lpc_synthesize_into(residuals: &[i32], coeffs: &[i16], shift: u8, out: &mut Vec<i32>) {
    out.clear();
    out.reserve(residuals.len());
    let order = coeffs.len();
    // Mirror of the encoder-side Q format: coefficients in
    // Q(15 − shift), accumulator shifted back by `shift_amt` with a
    // half-LSB `bias` for round-to-nearest. These two constants are
    // normative (spec §3.6): they must match the encoder bit-for-bit
    // so reconstructed samples equal the originals exactly.
    //
    // Defensive arithmetic: `shift` may be attacker-influenced at this
    // boundary (the comment on the push below already assumes a
    // malicious bitstream), and the decoder is contractually
    // panic-free on every byte sequence (spec §6). A plain
    // `15 - shift` underflows — panicking in debug builds and
    // producing masked-shift garbage in release — whenever
    // `shift > 15`, so saturate instead. For every valid shift
    // (shift_amt ≥ 1) the result is bit-identical to the encoder.
    let shift_amt = 15u32.saturating_sub(u32::from(shift));
    // Half-LSB rounding bias. At shift_amt == 0 there is no fractional
    // part to round and `1 << (shift_amt - 1)` would itself underflow;
    // the correct bias there is 0 (pred = sum exactly).
    let bias = if shift_amt == 0 {
        0i64
    } else {
        1i64 << (shift_amt - 1)
    };
    let samples = out;
    for (i, &res) in residuals.iter().enumerate() {
        let terms = i.min(order);
        // Warm-up: when `terms == 0` (always true at `i == 0`, and
        // possible for `i < order` on short frames), the sum is empty
        // and prediction is defined as 0 — NOT `(0 + bias) >> s`. The
        // formula is not applied in the warm-up region per spec §3.6;
        // this `if` is what enforces that at the synthesis side and
        // must match the encoder's warm-up handling above.
        let pred = if terms == 0 {
            0i64
        } else {
            // Uses already-reconstructed `samples[0..i]`, not the originals.
            // This is what makes the round-trip lossless: encoder and decoder
            // compute predictions from the same reconstructed history.
            let sum: i64 = (0..terms)
                .map(|j| coeffs[j] as i64 * samples[i - j - 1] as i64)
                .sum();
            (sum + bias) >> shift_amt
        };
        // Wrapping add: `pred as i32` is already a wrapping narrow of
        // the i64 accumulator; the final `res + pred` must match that
        // semantics in release AND debug. On well-formed streams the
        // sum stays within the 24-bit sample range and the wrap is
        // never taken — but a malicious bitstream with attacker-chosen
        // residuals can push this arbitrarily, and the decoder is
        // contractually panic-free on every byte sequence (spec §6).
        // Produced output on overflow is "wrong sample, no panic",
        // matching the spec's "substitute silence / discard frame"
        // recovery model at the caller.
        samples.push(res.wrapping_add(pred as i32));
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;
    use alloc::vec::Vec;

    use crate::test_signals::{angular_step_q32, sin_q15, sine_samples as int_sine_samples};

    /// Build `n` samples of a sinusoid completing `freq_cycles` whole
    /// periods over the block, i.e. angular frequency
    /// `2π · freq_cycles / n` rad/sample. Integer-only; layered on the
    /// shared LUT helpers in `crate::test_signals`.
    fn sine_samples(n: usize, freq_cycles: u64, amplitude: i32) -> Vec<i32> {
        int_sine_samples(n, angular_step_q32(freq_cycles, n as u64), amplitude)
    }

    #[test]
    fn residuals_synthesize_roundtrip_all_orders() {
        let signal = sine_samples(960, 4, 50_000);

        // Order 0 first: no coefficients, so there is no prediction —
        // residuals equal the samples and synthesis is the identity.
        // The shift is ignored at order 0 but still part of the call.
        let (c0, s0) = lpc_analyze(&signal, 0).unwrap();
        assert!(c0.is_empty());
        assert_eq!(s0, 0);
        let r0 = compute_residuals(&signal, &c0, s0);
        assert_eq!(r0, signal);
        assert_eq!(lpc_synthesize(&r0, &c0, s0), signal);

        // Every non-zero order up to MAX_LPC_ORDER must reconstruct
        // the input exactly.
        for order in 1..=MAX_LPC_ORDER {
            let (coeffs, shift) = lpc_analyze(&signal, order).unwrap();
            assert_eq!(coeffs.len(), order as usize);
            assert!(shift <= MAX_COEFFICIENT_SHIFT);
            let res = compute_residuals(&signal, &coeffs, shift);
            assert_eq!(
                lpc_synthesize(&res, &coeffs, shift),
                signal,
                "roundtrip failed at order {order}"
            );
        }
    }

    #[test]
    fn autocorrelation_into_impulse() {
        // An impulse `[c, 0, 0, …, 0]` has a closed-form answer:
        // R[0] = c² and every other lag is exactly zero, straight from
        // the definition `R[k] = Σ s[i] · s[i+k]`. Pinning the raw
        // autocorrelation catches numerical drift that round-trip
        // tests would mask — Levinson-Durbin turns a drifted R into
        // self-consistent but wrong coefficients.
        const C: i32 = 1_000_000; // well inside the 24-bit contract
        let order = 8u8;
        let mut samples = vec![0i32; 32];
        samples[0] = C;
        let mut r = vec![0i64; usize::from(order) + 1];
        autocorrelation_into(&samples, order, &mut r);
        assert_eq!(
            r[0],
            i64::from(C) * i64::from(C),
            "R[0] must equal c² exactly"
        );
        for (k, &lag) in r.iter().enumerate().skip(1) {
            assert_eq!(lag, 0, "R[{k}] must be zero for an impulse input");
        }
    }

    #[test]
    fn autocorrelation_into_dc() {
        // DC input `[c; N]` gives R[k] = (N − k) · c², so a miscounted
        // upper bound in the windowed-sum inner loop shows up as a
        // uniform ±c² offset on every lag.
        const C: i32 = 5_000;
        const N: usize = 64;
        let order = 4u8;
        let dc = vec![C; N];
        let mut r = vec![0i64; usize::from(order) + 1];
        autocorrelation_into(&dc, order, &mut r);
        let c_squared = i64::from(C) * i64::from(C);
        for (k, &lag) in r.iter().enumerate() {
            assert_eq!(lag, (N - k) as i64 * c_squared, "R[{k}] wrong for DC input");
        }
    }

    #[test]
    fn lpc_analyze_levels_matches_lpc_analyze() {
        let signal = sine_samples(960, 6, 100_000);
        let mut levels = LpcLevels::new();
        assert!(lpc_analyze_levels_into(&signal, MAX_LPC_ORDER, &mut levels));
        // Each per-order view must agree with a standalone analysis at
        // that order.
        for order in 1..=MAX_LPC_ORDER {
            let (expected_coeffs, expected_shift) = lpc_analyze(&signal, order).unwrap();
            let view = levels.get(order);
            assert_eq!(
                view.coefficients,
                &expected_coeffs[..],
                "coefficient mismatch at order {order}"
            );
            assert_eq!(view.shift, expected_shift, "shift mismatch at order {order}");
        }
    }

    #[test]
    fn all_zero_frame_returns_none() {
        let silence = vec![0i32; 960];
        assert!(lpc_analyze(&silence, 4).is_none());
        let mut levels = LpcLevels::new();
        assert!(!lpc_analyze_levels_into(&silence, 16, &mut levels));
    }

    #[test]
    fn order_zero_returns_empty_coeffs() {
        let signal = sine_samples(960, 4, 1000);
        let (coeffs, shift) = lpc_analyze(&signal, 0).unwrap();
        assert!(coeffs.is_empty());
        assert_eq!(shift, 0);
    }

    #[test]
    fn high_order_24_stays_non_trivial() {
        // Without the E_m tracker, coefficients at order ≥ ~16 collapse
        // toward zero: `num / R[0]` shrinks every step while R[0] stays
        // fixed. Drive a signal complex enough (three incommensurate
        // sinusoids) that order-8 LPC cannot fully capture it, then
        // check that order-24 coefficients are non-trivial and beat
        // order 8 on residual energy.
        //
        // Q32 steps approximate 0.11, 0.27, 0.43 rad/sample (e.g.
        // 0.11 · 2³² / 2π ≈ 75_143_389). Exact values are irrelevant;
        // the frequencies only need to be mutually unrelated.
        let n = 1024usize;
        let steps: [u32; 3] = [75_143_389, 184_443_047, 293_742_706];
        let amps: [i64; 3] = [40_000, 20_000, 10_000];
        let mut phases = [0u32; 3];
        let mut samples: Vec<i32> = Vec::with_capacity(n);
        for _ in 0..n {
            let mut acc = 0i64;
            for ch in 0..3 {
                // Per-component round-to-nearest in Q15, then sum.
                acc += (sin_q15(phases[ch]) as i64 * amps[ch] + (1 << 14)) >> 15;
                phases[ch] = phases[ch].wrapping_add(steps[ch]);
            }
            samples.push(acc as i32);
        }

        let energy = |coeffs: &[i16], shift: u8| -> u64 {
            compute_residuals(&samples, coeffs, shift)
                .iter()
                .map(|&r| (r as i64 * r as i64) as u64)
                .sum()
        };

        let (c24, s24) = lpc_analyze(&samples, 24).unwrap();
        assert!(
            c24.iter().any(|&c| c != 0),
            "order-24 coefficients collapsed to zero"
        );

        let (c8, s8) = lpc_analyze(&samples, 8).unwrap();
        let e8 = energy(&c8, s8);
        let e24 = energy(&c24, s24);
        assert!(
            e24 < e8,
            "high-order residual energy should be smaller: e8={e8}, e24={e24}"
        );
    }

    #[test]
    fn dc_signal_residuals_are_small() {
        let dc = vec![10_000i32; 960];

        // The biased autocorrelation `R[k] = (N-k)·c²` (not `N·c²`)
        // slightly underestimates the reflection coefficient versus the
        // theoretical optimum, so residuals are small but non-zero.
        let (coeffs, shift) = lpc_analyze(&dc, 4).unwrap();
        let residuals = compute_residuals(&dc, &coeffs, shift);
        let max_residual = residuals[4..].iter().map(|r| r.abs()).max().unwrap_or(0);
        assert!(max_residual <= 20, "DC residuals too large: {max_residual}");
    }

    #[test]
    fn order_32_roundtrip() {
        // 13.5 cycles across 4096 samples, expressed as 27 cycles
        // across 8192 so the integer-only step helper applies directly.
        let samples = int_sine_samples(4096, angular_step_q32(27, 8192), 1_000_000);
        let (coeffs, shift) = lpc_analyze(&samples, 32).unwrap();
        assert_eq!(coeffs.len(), 32);
        let residuals = compute_residuals(&samples, &coeffs, shift);
        assert_eq!(lpc_synthesize(&residuals, &coeffs, shift), samples);
    }

    #[test]
    fn bass_sine_selects_nonzero_shift() {
        // 50 Hz at 48 kHz: step = (50 · 2³²) / 48000 ≈ 4_474_091 per
        // sample, ω ≈ 0.00654 rad/sample, cos ≈ 0.99998, so the optimal
        // `a[1] ≈ −1.9999`. Q15 tops out below 1.0 in magnitude terms
        // here, so representing this needs at least Q14 (shift = 1).
        // Guards `min_shift_for` and the encoder's willingness to widen
        // the Q range instead of clamping.
        let samples = int_sine_samples(4096, angular_step_q32(50, 48_000), 1_000_000);
        let (_coeffs, shift) = lpc_analyze(&samples, 4).unwrap();
        assert!(
            shift >= 1,
            "low-frequency sine should force shift ≥ 1, got {shift}"
        );
    }

    #[test]
    fn min_shift_for_examples() {
        // |c| = 2^31 encodes the real value 1.0 — one past Q15's upper
        // bound of 1 − 2^-15 — so the shift must widen to 1; one below
        // that still fits at shift 0.
        assert_eq!(min_shift_for(&[(1i64 << 31) - 1]), 0);
        assert_eq!(min_shift_for(&[1i64 << 31]), 1);
        assert_eq!(min_shift_for(&[-(1i64 << 31)]), 1);
        assert_eq!(min_shift_for(&[(1i64 << 32) - 1]), 1);
        assert_eq!(min_shift_for(&[1i64 << 32]), 2);
        assert_eq!(min_shift_for(&[0, 0, 0]), 0);
        // Mixed input: the largest magnitude dictates the result.
        assert_eq!(min_shift_for(&[0, 1 << 31, -(1 << 31) + 1]), 1);
    }
}