neutts 0.1.1 - Docs.rs

//! NeuCodec decoder — pure-Rust CPU inference from safetensors weights.
//!
//! ## Architecture (XCodec2-based)
//!
//! ```text
//!  codes [T]  ──►  FSQ lookup  ──►  fc_post_a  ──►  VocosBackbone  ──►  ISTFTHead  ──►  audio
//! (int, 0..65535)  [T, 2048]      [T, 1024]        [T, 1024]                          [T*hop]
//! ```
//!
//! **VocosBackbone**: Conv1d(k=7) → 2×ResnetBlock → 12×TransformerBlock (RoPE) → 2×ResnetBlock → LayerNorm
//!
//! **ISTFTHead**: Linear(1024 → n_fft+2) → split mag/phase → ISTFT
//!
//! ## Setup (one-time)
//!
//! ```sh
//! python scripts/convert_weights.py   # download + extract decoder weights to safetensors
//! ```
//!
//! Weights are then loaded at runtime from `models/neucodec_decoder.safetensors`.

use std::path::{Path, PathBuf};

use anyhow::{bail, Context, Result};
use ndarray::{s, Array1, Array2, Array3, ArrayView1, ArrayView2, ArrayView3};
use rustfft::{num_complex::Complex, FftPlanner};
use safetensors::SafeTensors;

// ─── Public constants ─────────────────────────────────────────────────────────

/// Sample rate of the decoder output (24 kHz).
pub const SAMPLE_RATE: u32 = 24_000;

/// Sample rate the encoder expects as input (16 kHz).
pub const ENCODER_SAMPLE_RATE: u32 = 16_000;

/// Decoder audio samples per speech token — assuming 50 tokens/s at 24 kHz.
/// The actual value is detected from the weight shapes at load time.
pub const SAMPLES_PER_TOKEN: usize = 480;

/// Encoder audio samples consumed per speech token (16 000 / 50 = 320).
pub const ENCODER_SAMPLES_PER_TOKEN: usize = 320;

/// Default reference audio length for the encoder: 10 s × 16 000 Hz.
pub const ENCODER_DEFAULT_INPUT_SAMPLES: usize = 16_000 * 10;

/// True when the `wgpu` Cargo feature is enabled.
/// When enabled, `NeuCodecDecoder` uses the Burn wgpu backend (GPU) with
/// automatic fallback to Burn NdArray (CPU) and then raw ndarray.
pub fn wgpu_feature_enabled() -> bool {
    cfg!(feature = "wgpu")
}

// ─── FSQ constants ────────────────────────────────────────────────────────────

/// FSQ levels for NeuCodec: 8 dimensions × 4 levels → 4^8 = 65 536 codes.
pub(crate) const FSQ_LEVELS: [i32; 8] = [4, 4, 4, 4, 4, 4, 4, 4];

/// Cumulative products of FSQ_LEVELS: used to decompose an integer code.
/// basis[j] = product(FSQ_LEVELS[0..j])
pub(crate) const FSQ_BASIS: [i32; 8] = [1, 4, 16, 64, 256, 1_024, 4_096, 16_384];

// ─── Tensor helpers ───────────────────────────────────────────────────────────

fn load_f32(st: &SafeTensors<'_>, name: &str) -> Result<Vec<f32>> {
    let view = st
        .tensor(name)
        .with_context(|| format!("Missing weight: {name}"))?;
    let raw = view.data();
    use safetensors::tensor::Dtype;
    Ok(match view.dtype() {
        Dtype::F32 => {
            // Fast path: the bytes are already little-endian f32.  On LE
            // hosts (x86, ARM) we can reinterpret directly with no per-byte
            // work — essentially a single memcpy via the Vec allocation.
            assert!(raw.len() % 4 == 0, "F32 tensor byte length not divisible by 4");
            let n = raw.len() / 4;
            let mut out = Vec::with_capacity(n);
            // SAFETY: raw is valid, aligned to u8 (no alignment requirement
            // for the source), and we write exactly `n` f32 values.
            #[cfg(target_endian = "little")]
            {
                // SAFETY: f32 and u8 have no padding/invalid-bit patterns for
                // this cast; we own `out` and set its length immediately after.
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        raw.as_ptr(),
                        out.as_mut_ptr() as *mut u8,
                        raw.len(),
                    );
                    out.set_len(n);
                }
            }
            #[cfg(not(target_endian = "little"))]
            {
                out.extend(raw.chunks_exact(4).map(|b| {
                    f32::from_le_bytes([b[0], b[1], b[2], b[3]])
                }));
            }
            out
        }
        Dtype::BF16 => raw
            .chunks_exact(2)
            .map(|b| {
                let bits = u16::from_le_bytes([b[0], b[1]]);
                f32::from_bits((bits as u32) << 16)
            })
            .collect(),
        dt => bail!("Tensor {name}: unsupported dtype {dt:?} (expected F32 or BF16)"),
    })
}

fn shape_of(st: &SafeTensors<'_>, name: &str) -> Result<Vec<usize>> {
    Ok(st
        .tensor(name)
        .with_context(|| format!("Missing weight: {name}"))?
        .shape()
        .to_vec())
}

fn as1d(data: Vec<f32>, n: usize) -> Array1<f32> {
    Array1::from_shape_vec(n, data).expect("1-D shape mismatch")
}

fn as2d(data: Vec<f32>, rows: usize, cols: usize) -> Array2<f32> {
    Array2::from_shape_vec((rows, cols), data).expect("2-D shape mismatch")
}

fn as3d(data: Vec<f32>, d0: usize, d1: usize, d2: usize) -> Array3<f32> {
    Array3::from_shape_vec((d0, d1, d2), data).expect("3-D shape mismatch")
}

// ─── Math primitives ──────────────────────────────────────────────────────────

/// Linear layer: `out = x @ w.T + b`
///
/// * `x`: \[T, in_dim\]
/// * `w`: \[out_dim, in_dim\]  (PyTorch row-major convention)
/// * `b`: \[out_dim\]  (optional)
/// * returns: \[T, out_dim\]
fn linear(x: ArrayView2<f32>, w: ArrayView2<f32>, b: Option<ArrayView1<f32>>) -> Array2<f32> {
    let mut out = x.dot(&w.t()); // [T, out_dim]
    if let Some(b) = b {
        out += &b;
    }
    out
}

/// Conv1d with same-length output (zero-padded).
///
/// * `x`: \[c_in, T\]
/// * `w`: \[c_out, c_in, k\]
/// * `b`: \[c_out\]  (optional)
/// * returns: \[c_out, T\]
fn conv1d(
    x: ArrayView2<f32>,
    w: ArrayView3<f32>,
    b: Option<ArrayView1<f32>>,
    pad: usize,
) -> Array2<f32> {
    let (c_in, t) = (x.shape()[0], x.shape()[1]);
    let (c_out, _, k) = (w.shape()[0], w.shape()[1], w.shape()[2]);

    // im2col: build [T, c_in × k] column matrix
    let mut col = Array2::<f32>::zeros((t, c_in * k));
    for ti in 0..t {
        for ci in 0..c_in {
            for ki in 0..k {
                let src = ti + ki;
                if src >= pad && src < t + pad {
                    col[[ti, ci * k + ki]] = x[[ci, src - pad]];
                }
                // else zero-pad (already zeroed)
            }
        }
    }

    // weight: [c_out, c_in × k]
    let w2 = w.into_shape_with_order((c_out, c_in * k)).expect("conv1d reshape");

    // out_t = col @ w2.T  →  [T, c_out]  then transpose to [c_out, T]
    let out_t = col.dot(&w2.t());
    let mut out = out_t.t().to_owned(); // [c_out, T]

    if let Some(b) = b {
        // Broadcast b [c_out] over [c_out, T] — one ndarray op, no manual loop.
        use ndarray::Axis;
        out += &b.view().insert_axis(Axis(1));
    }
    out
}

/// GroupNorm: `affine=True`, over input \[C, T\].
/// Normalises over (group_size × T) elements per group.
///
/// Uses an iterator-based variance computation to avoid the temporary
/// array that `block.mapv(|v| (v - mean).powi(2))` would allocate.
fn group_norm(
    x: ArrayView2<f32>,
    n_groups: usize,
    w: ArrayView1<f32>,
    b: ArrayView1<f32>,
    eps: f32,
) -> Array2<f32> {
    let (c, t) = (x.shape()[0], x.shape()[1]);
    let group_size = c / n_groups;
    let n = (group_size * t) as f32;
    let mut out = Array2::<f32>::zeros((c, t));

    for g in 0..n_groups {
        let c_start = g * group_size;
        let c_end   = c_start + group_size;
        let block   = x.slice(s![c_start..c_end, ..]);

        // Mean — no temporary allocation
        let mean = block.iter().sum::<f32>() / n;
        // Variance — single pass, no temporary allocation
        let var  = block.iter().map(|&v| { let d = v - mean; d * d }).sum::<f32>() / n;
        let inv_std = 1.0 / (var + eps).sqrt();

        for ci in c_start..c_end {
            let scale = inv_std * w[ci];
            let shift = b[ci];
            for ti in 0..t {
                out[[ci, ti]] = (x[[ci, ti]] - mean) * scale + shift;
            }
        }
    }
    out
}

/// LayerNorm over the last axis of \[T, C\].
///
/// Uses iterator sums to avoid the temporary arrays that `row.mapv(…).sum()`
/// would allocate for each of the T rows.
fn layer_norm(
    x: ArrayView2<f32>,
    w: ArrayView1<f32>,
    b: ArrayView1<f32>,
    eps: f32,
) -> Array2<f32> {
    let (t, c) = (x.shape()[0], x.shape()[1]);
    let c_f = c as f32;
    let mut out = Array2::<f32>::zeros((t, c));
    for ti in 0..t {
        let row = x.slice(s![ti, ..]);
        let mean = row.iter().sum::<f32>() / c_f;
        let var  = row.iter().map(|&v| { let d = v - mean; d * d }).sum::<f32>() / c_f;
        let inv_std = 1.0 / (var + eps).sqrt();
        for ci in 0..c {
            out[[ti, ci]] = (x[[ti, ci]] - mean) * inv_std * w[ci] + b[ci];
        }
    }
    out
}

/// RMSNorm over the last axis of \[T, C\].
///
/// Uses an iterator sum to avoid the temporary array that `row.mapv(|v|
/// v*v).sum()` would allocate for each of the T rows.
fn rms_norm(x: ArrayView2<f32>, w: ArrayView1<f32>, eps: f32) -> Array2<f32> {
    let (t, c) = (x.shape()[0], x.shape()[1]);
    let c_f = c as f32;
    let mut out = Array2::<f32>::zeros((t, c));
    for ti in 0..t {
        let row = x.slice(s![ti, ..]);
        let ms    = row.iter().map(|&v| v * v).sum::<f32>() / c_f;
        let scale = 1.0 / (ms + eps).sqrt();
        for ci in 0..c {
            out[[ti, ci]] = x[[ti, ci]] * scale * w[ci];
        }
    }
    out
}

/// SiLU (swish): `x * σ(x)`.
#[inline(always)]
fn silu(x: f32) -> f32 {
    x / (1.0 + (-x).exp())
}

/// Row-wise softmax (in-place) over \[T\].
fn softmax_inplace(x: &mut [f32]) {
    let max = x.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let mut sum = 0.0f32;
    x.iter_mut().for_each(|v| {
        *v = (*v - max).exp();
        sum += *v;
    });
    x.iter_mut().for_each(|v| *v /= sum);
}

// ─── FSQ decode ───────────────────────────────────────────────────────────────

/// Decode integer FSQ codes → continuous embeddings.
///
/// For each code (0..65535):
/// 1. Decompose into 8 base-4 digits using `FSQ_BASIS`.
/// 2. Scale each digit d ∈ {0,1,2,3} to {−1, −⅓, ⅓, 1} via `(d/1.5) - 1`.
/// 3. Apply the `project_out` linear layer (8 → 2048).
///
/// Returns \[T, fsq_out_dim\].
fn fsq_decode(
    codes: &[i32],
    proj_w: ArrayView2<f32>, // [fsq_out_dim, 8]
    proj_b: ArrayView1<f32>, // [fsq_out_dim]
) -> Array2<f32> {
    let t = codes.len();
    let _out_dim = proj_w.shape()[0];

    // Build [T, 8] matrix of scaled FSQ digits
    let mut digits = Array2::<f32>::zeros((t, FSQ_BASIS.len()));
    for (i, &code) in codes.iter().enumerate() {
        for (j, (&basis, &levels)) in FSQ_BASIS.iter().zip(FSQ_LEVELS.iter()).enumerate() {
            let d = (code / basis) % levels;
            // Scale from {0,1,…,L-1} to {-1, -1/3, 1/3, 1} for L=4
            // Formula: (d / ((L-1)/2)) - 1  =  (d / 1.5) - 1
            digits[[i, j]] = d as f32 / 1.5 - 1.0;
        }
    }

    // project_out: [T, 8] @ [8, out_dim] + [out_dim]
    linear(digits.view(), proj_w, Some(proj_b))
}

// ─── RoPE sin/cos dispatch ────────────────────────────────────────────────────

/// Compute `(sin(x), cos(x))` for use in Rotary Positional Embedding.
///
/// The implementation is selected at compile time by the active feature flag:
///
/// | Feature     | Implementation                          | Max abs. error |
/// |-------------|-----------------------------------------|----------------|
/// | `fast`      | degree-7/6 Horner polynomial + f32 RR   | ~1 × 10⁻⁴     |
/// | `precise`   | `f32::sin_cos()` — correctly rounded    | ~1 × 10⁻⁷     |
/// | *(neither)* | same as `fast` (default)                | ~1 × 10⁻⁴     |
///
/// ### Fast-mode notes
///
/// The polynomial path avoids transcendental function calls entirely: sin and
/// cos are each evaluated with 6 fused multiply-adds (Horner's method).  On
/// platforms where hardware `sin`/`cos` instructions are slow or absent this
/// can be 6–12× faster per value.
///
/// Range reduction to \[−π, π\] uses a single `f32` round-multiply.  For large
/// angles — RoPE dimensions with position ≈ 2 047 and the highest frequency
/// (`inv_freq = 1.0`) give θ ≈ 2 047 rad — floating-point cancellation in the
/// reduction introduces O(2⁻²³ · |θ|) extra absolute error before the
/// polynomial.  At the worst case this is ≈ 2 × 10⁻⁴ rad, which is well
/// within perceptual threshold for speech synthesis.
///
/// Both this function and the Burn GPU path in `codec_burn::load_weights` use
/// the same dispatch, so precomputed RoPE tables and runtime CPU evaluations
/// are always produced by the same algorithm.
#[cfg(not(feature = "precise"))]
#[inline(always)]
pub(crate) fn rope_sin_cos(x: f32) -> (f32, f32) {
    use std::f32::consts::TAU;
    // Range-reduce to [−π, π] with a single round() multiply.
    let x = x - TAU * (x * (1.0 / TAU)).round();
    let x2 = x * x;
    // Horner-form degree-7 sin: x(1 + x²(−1/6 + x²(1/120 − x²/5040)))
    let s = x  * (1.0 + x2 * (-1.0/6.0   + x2 * (1.0/120.0  - x2 * (1.0/5040.0))));
    // Horner-form degree-6 cos: 1 + x²(−1/2 + x²(1/24 − x²/720))
    let c = 1.0 + x2 * (-0.5             + x2 * (1.0/24.0    - x2 * (1.0/720.0)));
    (s, c)
}

#[cfg(feature = "precise")]
#[inline(always)]
pub(crate) fn rope_sin_cos(x: f32) -> (f32, f32) {
    x.sin_cos()
}

// ─── Rotary positional embedding ──────────────────────────────────────────────

/// Apply split-half RoPE (torchtune convention) to `x` in-place.
///
/// * `x`: \[T, n_heads, head_dim\]
///
/// The outer loop order is `(position, freq_index, head)` so each
/// `sin_cos()` result is computed **once per (position, freq)** and reused
/// across all heads — previously it was computed once per (position, freq,
/// head), allocating a fresh `Vec<f32>` for each position.
fn apply_rope(x: &mut Array3<f32>) {
    let (t, n_heads, head_dim) = (x.shape()[0], x.shape()[1], x.shape()[2]);
    let half = head_dim / 2;

    // Inverse frequencies — only `half` f32 values, computed once.
    let inv_freqs: Vec<f32> = (0..half)
        .map(|i| 1.0_f32 / 10_000_f32.powf(2.0 * i as f32 / head_dim as f32))
        .collect();

    for p in 0..t {
        let p_f = p as f32;
        for i in 0..half {
            // Dispatch to rope_sin_cos(): polynomial (fast, default) or
            // stdlib sin_cos (precise feature).
            let (s, c) = rope_sin_cos(p_f * inv_freqs[i]);
            // Apply the same rotation to every head — no per-head recompute.
            for h in 0..n_heads {
                let x1 = x[[p, h, i]];
                let x2 = x[[p, h, i + half]];
                x[[p, h, i]]        = x1 * c - x2 * s;
                x[[p, h, i + half]] = x1 * s + x2 * c;
            }
        }
    }
}

// ─── Transformer components ───────────────────────────────────────────────────

pub(crate) struct TransformerWeights {
    pub(crate) att_norm_w: Array1<f32>,  // RMSNorm  [D]
    pub(crate) c_attn_w: Array2<f32>,    // Linear   [3D, D]  (no bias)
    pub(crate) c_proj_w: Array2<f32>,    // Linear   [D, D]   (no bias)
    pub(crate) ffn_norm_w: Array1<f32>,  // RMSNorm  [D]
    pub(crate) fc1_w: Array2<f32>,       // Linear   [4D, D]  (no bias)
    pub(crate) fc2_w: Array2<f32>,       // Linear   [D, 4D]  (no bias)
}

/// Single Transformer block (RMSNorm → Attention → RMSNorm → MLP), residual.
///
/// * `x`: \[T, D\]  (modified in-place conceptually; returns new array)
fn transformer_block(x: ArrayView2<f32>, w: &TransformerWeights, n_heads: usize) -> Array2<f32> {
    let (t, d) = (x.shape()[0], x.shape()[1]);
    let head_dim = d / n_heads;

    // ── Attention sub-layer ───────────────────────────────────────────────────
    let normed = rms_norm(x, w.att_norm_w.view(), 1e-6);
    // qkv: [T, 3D]  (no bias)
    let qkv = linear(normed.view(), w.c_attn_w.view(), None);

    // Split into Q, K, V each [T, D]
    let q_flat = qkv.slice(s![.., 0..d]).to_owned();
    let k_flat = qkv.slice(s![.., d..2 * d]).to_owned();
    let v_flat = qkv.slice(s![.., 2 * d..]).to_owned();

    // Reshape to [T, n_heads, head_dim]
    let mut q = q_flat
        .into_shape_with_order((t, n_heads, head_dim))
        .expect("q reshape");
    let mut k = k_flat
        .into_shape_with_order((t, n_heads, head_dim))
        .expect("k reshape");
    let v = v_flat
        .into_shape_with_order((t, n_heads, head_dim))
        .expect("v reshape");

    apply_rope(&mut q);
    apply_rope(&mut k);

    // Scaled dot-product attention per head
    let scale = (head_dim as f32).sqrt().recip();
    // attn_out: [T, n_heads, head_dim]
    let mut attn_out = Array3::<f32>::zeros((t, n_heads, head_dim));

    for h in 0..n_heads {
        let qh = q.slice(s![.., h, ..]).to_owned(); // [T, head_dim]
        let kh = k.slice(s![.., h, ..]).to_owned();
        let vh = v.slice(s![.., h, ..]).to_owned();

        // scores = qh @ kh.T * scale  →  [T, T]
        let mut scores = qh.dot(&kh.t());
        scores.mapv_inplace(|v| v * scale);

        // softmax over last dim (per query row)
        for ti in 0..t {
            softmax_inplace(scores.slice_mut(s![ti, ..]).as_slice_mut().unwrap());
        }

        // weighted_v = scores @ vh  →  [T, head_dim]
        let wv = scores.dot(&vh);
        attn_out.slice_mut(s![.., h, ..]).assign(&wv);
    }

    // Reshape [T, n_heads, head_dim] → [T, D]
    let attn_flat = attn_out
        .into_shape_with_order((t, d))
        .expect("attn out reshape");

    // Project: c_proj (no bias)
    let attn_proj = linear(attn_flat.view(), w.c_proj_w.view(), None);

    // Residual
    let x_attn = &x + &attn_proj;

    // ── MLP sub-layer ─────────────────────────────────────────────────────────
    let normed2 = rms_norm(x_attn.view(), w.ffn_norm_w.view(), 1e-6);
    let h1 = linear(normed2.view(), w.fc1_w.view(), None);
    let h1_act = h1.mapv(silu);
    let h2 = linear(h1_act.view(), w.fc2_w.view(), None);

    &x_attn + &h2
}

// ─── ResnetBlock ─────────────────────────────────────────────────────────────

pub(crate) struct ResnetBlockWeights {
    pub(crate) norm1_w: Array1<f32>, // GroupNorm [C]
    pub(crate) norm1_b: Array1<f32>,
    pub(crate) conv1_w: Array3<f32>, // Conv1d [C, C, 3]
    pub(crate) conv1_b: Array1<f32>,
    pub(crate) norm2_w: Array1<f32>,
    pub(crate) norm2_b: Array1<f32>,
    pub(crate) conv2_w: Array3<f32>, // Conv1d [C, C, 3]
    pub(crate) conv2_b: Array1<f32>,
}

/// ResnetBlock: GroupNorm → swish → Conv1d(k=3) → GroupNorm → swish → Conv1d(k=3) + residual.
///
/// * `x`: \[C, T\]  (channels-first)
fn resnet_block(x: ArrayView2<f32>, w: &ResnetBlockWeights) -> Array2<f32> {
    // norm1 → swish → conv1
    let h = group_norm(x, 32, w.norm1_w.view(), w.norm1_b.view(), 1e-6);
    let h = h.mapv(silu);
    let h = conv1d(h.view(), w.conv1_w.view(), Some(w.conv1_b.view()), 1);

    // norm2 → swish → (dropout=no-op at inference) → conv2
    let h = group_norm(h.view(), 32, w.norm2_w.view(), w.norm2_b.view(), 1e-6);
    let h = h.mapv(silu);
    let h = conv1d(h.view(), w.conv2_w.view(), Some(w.conv2_b.view()), 1);

    // residual (in_channels == out_channels so no projection)
    &x + &h
}

// ─── ISTFT ────────────────────────────────────────────────────────────────────

/// Inverse STFT matching PyTorch `torch.istft(..., center=True)`.
///
/// * `mag`: \[n_fft/2+1, T\]  **log**-magnitudes (the model head outputs log-mag)
/// * `phase`: \[n_fft/2+1, T\] phase angles in radians
/// * `hop`: hop length (= n_fft / 4)
/// * `window`: Hann window \[n_fft\]
/// * returns: waveform of exactly `T × hop` samples
///
/// ### Two bugs this function previously had (now fixed)
///
/// 1. **Clamp-before-exp** — the original code did `mag.min(1e2).exp()`, which
///    caps the *log*-magnitude at 100 (meaning `exp(100) ≈ 2.7e43` for large
///    bins).  The correct Python behaviour is `exp(mag).clamp(max=1e2)` — clamp
///    the *linear* magnitude to 100.  Large log-magnitude bins (common for
///    loud/low-frequency speech) therefore blew up, drowning out high-frequency
///    content and causing muffled output.
///
/// 2. **Wrong center trim** — PyTorch's `center=True` removes `n_fft/2` samples
///    from the **start** of the OLA buffer and then takes exactly `T*hop`
///    samples.  The old code instead removed `(n_fft-hop)/2` from **both ends**,
///    which is a 240-sample temporal offset (at 24 kHz with hop=480) and
///    includes partially-overlapped edge frames with poor reconstruction quality.
///
/// `pub(crate)` so the Burn decoder in `codec_burn.rs` can call it after
/// pulling the head output back from the device.
pub(crate) fn istft_burn(
    mag: ArrayView2<f32>,
    phase: ArrayView2<f32>,
    hop: usize,
    window: &[f32],
    ifft: &dyn rustfft::Fft<f32>,
) -> Vec<f32> {
    let n_bins = mag.shape()[0]; // n_fft/2 + 1
    let n_frames = mag.shape()[1];
    let n_fft = (n_bins - 1) * 2;
    debug_assert_eq!(n_fft, window.len());
    debug_assert_eq!(hop, n_fft / 4);

    // Output buffer length before trimming
    let out_size = (n_frames - 1) * hop + n_fft;
    let mut y   = vec![0.0f32; out_size];
    let mut env = vec![0.0f32; out_size];

    let mut buf = vec![Complex::<f32>::default(); n_fft];

    for ti in 0..n_frames {
        // Build the complex spectrum from log-magnitude + phase angle.
        //
        // FIX 1: exp() first, then clamp — matching PyTorch's
        //   `mag = torch.exp(mag).clamp(max=1e2)`
        // The old `.min(1e2).exp()` capped the log-magnitude at 100, which
        // effectively allowed linear magnitudes up to exp(100) ≈ 2.7e43.
        for fi in 0..n_bins {
            let m = mag[[fi, ti]].exp().min(1e2); // ← fixed: clamp linear mag
            let p = phase[[fi, ti]];
            buf[fi] = Complex::new(m * p.cos(), m * p.sin());
        }
        // Hermitian symmetry for real IFFT output
        for fi in 1..n_bins - 1 {
            buf[n_fft - fi] = buf[fi].conj();
        }

        // Inverse FFT (rustfft is unnormalized — we divide by n_fft below)
        ifft.process(&mut buf);

        // Normalize + apply synthesis window, then overlap-add
        let norm = n_fft as f32;
        let offset = ti * hop;
        for i in 0..n_fft {
            let sample = buf[i].re / norm * window[i];
            y[offset + i]   += sample;
            env[offset + i] += window[i] * window[i];
        }
    }

    // Weighted overlap-add normalization
    for i in 0..out_size {
        if env[i] > 1e-11 {
            y[i] /= env[i];
        }
    }

    // FIX 2: match PyTorch center=True — trim n_fft/2 from the START only,
    // then take exactly T*hop samples.
    //
    // Old code: y[(n_fft-hop)/2 .. out_size-(n_fft-hop)/2]
    //   → 240-sample temporal offset + includes edge frames with 1-2 overlaps.
    // Correct:  y[n_fft/2 .. n_fft/2 + T*hop]
    //   → first fully-overlapped sample (≥4 frames) through end of signal.
    let start  = n_fft / 2;
    let length = n_frames * hop;
    y[start..start + length].to_vec()
}

/// Hann window of length `n`.
fn hann_window(n: usize) -> Vec<f32> {
    (0..n)
        .map(|i| 0.5 * (1.0 - (2.0 * std::f32::consts::PI * i as f32 / n as f32).cos()))
        .collect()
}

// ─── Decoder weights ──────────────────────────────────────────────────────────

pub(crate) struct DecoderWeights {
    // FSQ
    pub(crate) fsq_proj_w: Array2<f32>, // [2048, 8]
    pub(crate) fsq_proj_b: Array1<f32>, // [2048]

    // fc_post_a: Linear(2048, 1024)
    pub(crate) fc_post_a_w: Array2<f32>, // [1024, 2048]
    pub(crate) fc_post_a_b: Array1<f32>, // [1024]

    // backbone.embed: Conv1d(1024, 1024, k=7, pad=3)
    pub(crate) embed_w: Array3<f32>, // [1024, 1024, 7]
    pub(crate) embed_b: Array1<f32>, // [1024]

    // backbone.prior_net (2 ResnetBlocks)
    pub(crate) prior_net: Vec<ResnetBlockWeights>,

    // backbone.transformers (N TransformerBlocks)
    pub(crate) transformers: Vec<TransformerWeights>,

    // backbone.final_layer_norm: LayerNorm [D]
    pub(crate) final_norm_w: Array1<f32>,
    pub(crate) final_norm_b: Array1<f32>,

    // backbone.post_net (2 ResnetBlocks)
    pub(crate) post_net: Vec<ResnetBlockWeights>,

    // head.out: Linear(D, n_fft+2)
    pub(crate) head_w: Array2<f32>, // [n_fft+2, 1024]
    pub(crate) head_b: Array1<f32>, // [n_fft+2]

    // Hann window
    pub(crate) window: Vec<f32>, // [n_fft]

    // Detected hyper-parameters
    pub(crate) hidden_dim: usize,
    pub(crate) hop_length: usize,
    pub(crate) depth: usize,
    pub(crate) n_heads: usize,

    // Cached IFFT plan — created once at load time so the plan cache is not
    // discarded between decode() calls.
    pub(crate) ifft_plan: std::sync::Arc<dyn rustfft::Fft<f32>>,
}

fn load_resnet_block(st: &SafeTensors<'_>, prefix: &str, c: usize) -> Result<ResnetBlockWeights> {
    Ok(ResnetBlockWeights {
        norm1_w: as1d(load_f32(st, &format!("{prefix}.norm1.weight"))?, c),
        norm1_b: as1d(load_f32(st, &format!("{prefix}.norm1.bias"))?, c),
        conv1_w: as3d(load_f32(st, &format!("{prefix}.conv1.weight"))?, c, c, 3),
        conv1_b: as1d(load_f32(st, &format!("{prefix}.conv1.bias"))?, c),
        norm2_w: as1d(load_f32(st, &format!("{prefix}.norm2.weight"))?, c),
        norm2_b: as1d(load_f32(st, &format!("{prefix}.norm2.bias"))?, c),
        conv2_w: as3d(load_f32(st, &format!("{prefix}.conv2.weight"))?, c, c, 3),
        conv2_b: as1d(load_f32(st, &format!("{prefix}.conv2.bias"))?, c),
    })
}

fn load_transformer(st: &SafeTensors<'_>, prefix: &str, d: usize) -> Result<TransformerWeights> {
    Ok(TransformerWeights {
        att_norm_w: as1d(load_f32(st, &format!("{prefix}.att_norm.weight"))?, d),
        c_attn_w: as2d(load_f32(st, &format!("{prefix}.att.c_attn.weight"))?, 3 * d, d),
        c_proj_w: as2d(load_f32(st, &format!("{prefix}.att.c_proj.weight"))?, d, d),
        ffn_norm_w: as1d(load_f32(st, &format!("{prefix}.ffn_norm.weight"))?, d),
        fc1_w: as2d(load_f32(st, &format!("{prefix}.mlp.fc1.weight"))?, 4 * d, d),
        fc2_w: as2d(load_f32(st, &format!("{prefix}.mlp.fc2.weight"))?, d, 4 * d),
    })
}

fn load_decoder_weights(
    st: &SafeTensors<'_>,
    user_meta: &Option<std::collections::HashMap<String, String>>,
) -> Result<DecoderWeights> {
    // ── Auto-detect hyper-parameters from weight shapes ───────────────────────
    let embed_shape = shape_of(st, "generator.backbone.embed.weight")?;
    let hidden_dim = embed_shape[0]; // c_out

    let head_shape = shape_of(st, "generator.head.out.weight")?;
    let out_dim = head_shape[0];   // n_fft + 2
    let hop_length = (out_dim - 2) / 4;

    // Count transformer blocks by probing for weight keys
    let depth = (0..64)
        .take_while(|&i| {
            st.tensor(&format!(
                "generator.backbone.transformers.{i}.att_norm.weight"
            ))
            .is_ok()
        })
        .count();

    if depth == 0 {
        bail!("No transformer blocks found — is the safetensors file correct?");
    }

    // n_heads: read from safetensors __metadata__ if present, otherwise default to 16
    let n_heads: usize = user_meta
        .as_ref()
        .and_then(|m| m.get("n_heads"))
        .and_then(|s| s.parse().ok())
        .unwrap_or(16);

    // FSQ codebook projection
    // Try the nested key first (older exports), fall back to the flat key
    let fsq_proj_key = if st.tensor("generator.quantizer.fsqs.0.project_out.weight").is_ok() {
        "generator.quantizer.fsqs.0.project_out.weight"
    } else {
        "generator.quantizer.project_out.weight"
    };
    let fsq_bias_key = if st.tensor("generator.quantizer.fsqs.0.project_out.bias").is_ok() {
        "generator.quantizer.fsqs.0.project_out.bias"
    } else {
        "generator.quantizer.project_out.bias"
    };

    let fsq_shape = shape_of(st, fsq_proj_key)?;
    let fsq_out_dim = fsq_shape[0]; // 2048
    let fsq_in_dim = fsq_shape[1];  // 8

    let fsq_proj_w = as2d(
        load_f32(st, fsq_proj_key)?,
        fsq_out_dim,
        fsq_in_dim,
    );
    let fsq_proj_b = as1d(
        load_f32(st, fsq_bias_key)?,
        fsq_out_dim,
    );

    // fc_post_a: [1024, 2048]
    let fc_post_a_w = as2d(
        load_f32(st, "fc_post_a.weight")?,
        hidden_dim,
        fsq_out_dim,
    );
    let fc_post_a_b = as1d(load_f32(st, "fc_post_a.bias")?, hidden_dim);

    // backbone.embed Conv1d
    let embed_k = embed_shape[2];
    let embed_w = as3d(
        load_f32(st, "generator.backbone.embed.weight")?,
        hidden_dim,
        hidden_dim,
        embed_k,
    );
    let embed_b = as1d(
        load_f32(st, "generator.backbone.embed.bias")?,
        hidden_dim,
    );

    // prior_net (2 ResnetBlocks)
    let prior_net = (0..2)
        .map(|i| {
            load_resnet_block(
                st,
                &format!("generator.backbone.prior_net.{i}"),
                hidden_dim,
            )
        })
        .collect::<Result<Vec<_>>>()?;

    // transformers
    let transformers = (0..depth)
        .map(|i| {
            load_transformer(
                st,
                &format!("generator.backbone.transformers.{i}"),
                hidden_dim,
            )
        })
        .collect::<Result<Vec<_>>>()?;

    // final_layer_norm
    let final_norm_w = as1d(
        load_f32(st, "generator.backbone.final_layer_norm.weight")?,
        hidden_dim,
    );
    let final_norm_b = as1d(
        load_f32(st, "generator.backbone.final_layer_norm.bias")?,
        hidden_dim,
    );

    // post_net (2 ResnetBlocks)
    let post_net = (0..2)
        .map(|i| {
            load_resnet_block(
                st,
                &format!("generator.backbone.post_net.{i}"),
                hidden_dim,
            )
        })
        .collect::<Result<Vec<_>>>()?;

    // head.out
    let n_fft = hop_length * 4;
    let head_w = as2d(
        load_f32(st, "generator.head.out.weight")?,
        out_dim,
        hidden_dim,
    );
    let head_b = as1d(load_f32(st, "generator.head.out.bias")?, out_dim);

    // Hann window: try to load from safetensors; compute as fallback
    let window = if st.tensor("generator.head.istft.window").is_ok() {
        load_f32(st, "generator.head.istft.window")?
    } else {
        hann_window(n_fft)
    };

    // Build the IFFT plan once at load time — the FftPlanner caches plans
    // internally, but recreating the planner on each decode() call would
    // silently discard that cache and re-plan from scratch every time.
    let ifft_plan = {
        let mut planner = FftPlanner::<f32>::new();
        planner.plan_fft_inverse(n_fft)
    };

    Ok(DecoderWeights {
        fsq_proj_w,
        fsq_proj_b,
        fc_post_a_w,
        fc_post_a_b,
        embed_w,
        embed_b,
        prior_net,
        transformers,
        final_norm_w,
        final_norm_b,
        post_net,
        head_w,
        head_b,
        window,
        hidden_dim,
        hop_length,
        depth,
        n_heads,
        ifft_plan,
    })
}

// ─── Decoder forward pass ─────────────────────────────────────────────────────

fn decode_forward(codes: &[i32], w: &DecoderWeights) -> Vec<f32> {
    let hop = w.hop_length;
    let n_fft = hop * 4;
    let embed_k = w.embed_w.shape()[2];
    let embed_pad = embed_k / 2;

    // 1. FSQ decode: [T] → [T, fsq_out_dim]
    let emb = fsq_decode(codes, w.fsq_proj_w.view(), w.fsq_proj_b.view());

    // 2. fc_post_a: [T, fsq_out_dim] → [T, hidden_dim]
    let x = linear(emb.view(), w.fc_post_a_w.view(), Some(w.fc_post_a_b.view()));

    // 3. backbone.embed Conv1d: [hidden_dim, T]
    let x_ct = x.t().to_owned(); // [hidden_dim, T]
    let x_ct = conv1d(
        x_ct.view(),
        w.embed_w.view(),
        Some(w.embed_b.view()),
        embed_pad,
    );

    // 4. prior_net (ResnetBlocks, channels-first)
    let x_ct = w
        .prior_net
        .iter()
        .fold(x_ct, |acc, rw| resnet_block(acc.view(), rw));

    // 5. Transformers (sequence-first)
    let x_tc = x_ct.t().to_owned(); // [T, hidden_dim]
    let x_tc = w
        .transformers
        .iter()
        .fold(x_tc, |acc, tw| transformer_block(acc.view(), tw, w.n_heads));

    // 6. post_net (channels-first)
    let x_ct = x_tc.t().to_owned(); // [hidden_dim, T]
    let x_ct = w
        .post_net
        .iter()
        .fold(x_ct, |acc, rw| resnet_block(acc.view(), rw));

    // 7. final_layer_norm (sequence-first)
    let x_tc = x_ct.t().to_owned(); // [T, hidden_dim]
    let x_tc = layer_norm(
        x_tc.view(),
        w.final_norm_w.view(),
        w.final_norm_b.view(),
        1e-6,
    );

    // 8. head.out: [T, hidden_dim] → [T, n_fft+2]
    let x_pred = linear(x_tc.view(), w.head_w.view(), Some(w.head_b.view()));

    // 9. Transpose → [n_fft+2, T], split mag and phase
    let x_pred_ct = x_pred.t().to_owned(); // [n_fft+2, T]
    let half = (n_fft / 2) + 1; // n_bins = 641 for n_fft=1280, 961 for n_fft=1920
    let mag = x_pred_ct.slice(s![0..half, ..]).to_owned();
    let phase = x_pred_ct.slice(s![half.., ..]).to_owned();

    // 10. ISTFT — use the pre-built plan from DecoderWeights
    istft_burn(mag.view(), phase.view(), hop, &w.window, w.ifft_plan.as_ref())
}

// ─── Public API ───────────────────────────────────────────────────────────────

/// Default path for the decoder safetensors weight file.
fn default_decoder_path() -> PathBuf {
    PathBuf::from("models/neucodec_decoder.safetensors")
}

/// NeuCodec decoder: converts speech token IDs to a 24 kHz audio waveform.
///
/// ## Setup
///
/// Obtain the weights once with:
/// ```sh
/// python scripts/convert_weights.py
/// ```
/// Then at runtime:
/// ```rust,ignore
/// let dec = NeuCodecDecoder::new()?;
/// let audio = dec.decode(&codes)?;
/// ```
///
/// ## Backend selection
///
/// When built with `--features wgpu`, the decoder automatically selects the
/// best available backend on the **first call to [`decode`]** (lazy init):
///
/// | Priority | Backend                   | When used                          |
/// |----------|---------------------------|------------------------------------|
/// | 1        | Burn wgpu (GPU)           | Metal / Vulkan / DX12 adapter found|
/// | 2        | Burn NdArray (CPU)        | No GPU adapter available           |
/// | 3        | Raw ndarray (CPU)         | Burn init failed entirely          |
///
/// The Burn backend is initialised **eagerly** at [`from_file`](Self::from_file)
/// time so the GPU upload cost is part of model loading, not synthesis latency.
pub struct NeuCodecDecoder {
    weights: DecoderWeights,
    path:    PathBuf,

    /// Eagerly-initialised Burn backend.
    ///
    /// * `Ready(Some(_))` — Burn backend available and ready.
    /// * `Ready(None)`    — Burn init was attempted but failed; falls through
    ///                      to raw ndarray.
    ///
    /// The `Mutex` provides interior mutability so that `decode(&self)` can
    /// be called without `&mut self`.
    /// `Mutex<T>: Send + Sync` when `T: Send`, so `NeuCodecDecoder` remains
    /// `Send + Sync`.
    #[cfg(feature = "wgpu")]
    burn_decoder: std::sync::Mutex<LazyBurnDecoder>,
}

#[cfg(feature = "wgpu")]
enum LazyBurnDecoder {
    /// Burn backend is available and ready.
    Ready(Option<Box<dyn crate::codec_burn::BurnDecoder + Send>>),
}

impl NeuCodecDecoder {
    /// Load from the default path (`models/neucodec_decoder.safetensors`).
    pub fn new() -> Result<Self> {
        Self::from_file(&default_decoder_path())
    }

    /// Load from an explicit file path.
    pub fn from_file(path: &Path) -> Result<Self> {
        if !path.exists() {
            bail!(
                "NeuCodec decoder weights not found: {}\n\
                 \n\
                 Run the one-time conversion to generate them:\n\
                 \n\
                 \tpython scripts/convert_weights.py\n\
                 \n\
                 Or set a custom path with NeuCodecDecoder::from_file().",
                path.display()
            );
        }

        // Memory-map the file so the OS pages in tensor data on demand instead
        // of reading all 840 MB into a heap Vec<u8> upfront.  This halves peak
        // RAM usage during loading and avoids a large malloc + full-file copy.
        let file = std::fs::File::open(path)
            .with_context(|| format!("Failed to open {}", path.display()))?;
        // SAFETY: we do not mutate the mapping, and we hold `mmap` for the
        // full lifetime of `st` (both are dropped at the end of this block
        // after `load_decoder_weights` has copied all tensors into ndarray).
        let mmap = unsafe {
            memmap2::Mmap::map(&file)
                .with_context(|| format!("Failed to mmap {}", path.display()))?
        };
        let bytes: &[u8] = &mmap;

        // Read user-defined metadata (n_heads, depth, etc.) from the file header
        let (_, file_meta) = SafeTensors::read_metadata(bytes)
            .with_context(|| format!("Failed to parse safetensors header: {}", path.display()))?;
        let user_meta = file_meta.metadata().clone();

        let st = SafeTensors::deserialize(bytes)
            .with_context(|| format!("Failed to parse safetensors: {}", path.display()))?;

        let weights = load_decoder_weights(&st, &user_meta)
            .with_context(|| format!("Failed to load decoder weights from {}", path.display()))?;

        // `st` and `mmap` are dropped here — all tensor data is now owned by
        // the ndarray arrays inside `weights`.
        drop(st);
        drop(mmap);

        println!(
            "NeuCodec decoder: hidden={}, depth={}, heads={}, hop={} ({} samples/token = {} tokens/s)",
            weights.hidden_dim,
            weights.depth,
            weights.n_heads,
            weights.hop_length,
            weights.hop_length,
            SAMPLE_RATE as usize / weights.hop_length,
        );

        // ── Eagerly initialise the Burn GPU/CPU backend ───────────────────────
        //
        // Initialising here (at load time) rather than lazily on the first
        // decode() call moves the ~1-2 s GPU upload cost out of synthesis
        // latency and into model loading, which is a better user experience:
        // the "loaded in X s" number is honest, and "synth took Y s" reflects
        // only the actual forward pass.
        #[cfg(feature = "wgpu")]
        let burn_decoder = {
            let t0 = std::time::Instant::now();
            let dec = crate::codec_burn::make_burn_decoder(&weights);
            println!(
                "NeuCodec: {} backend ready in {:.2} s",
                dec.as_ref().map_or("cpu (ndarray)", |b| b.backend_name()),
                t0.elapsed().as_secs_f32(),
            );
            std::sync::Mutex::new(LazyBurnDecoder::Ready(dec))
        };

        Ok(Self {
            weights,
            path: path.to_path_buf(),
            #[cfg(feature = "wgpu")]
            burn_decoder,
        })
    }

    /// Decode speech token IDs to a 24 kHz audio waveform.
    ///
    /// * `codes` — integer token IDs in `0..=65535` (NeuCodec FSQ range).
    ///   Out-of-range values are rejected with an error rather than silently
    ///   producing garbage digits from the FSQ decomposition.
    /// * returns — `Vec<f32>` of `codes.len() × hop_length` samples.
    pub fn decode(&self, codes: &[i32]) -> Result<Vec<f32>> {
        if codes.is_empty() {
            return Ok(Vec::new());
        }

        // Validate before touching any weights — an out-of-range code would
        // silently produce wrong FSQ digits (e.g. a negative modulo result).
        for (i, &code) in codes.iter().enumerate() {
            if !(0..=65535).contains(&code) {
                anyhow::bail!(
                    "Speech token at index {i} is out of range: {code} \
                     (NeuCodec FSQ codes must be in 0..=65535)"
                );
            }
        }

        // ── Prefer Burn-accelerated path (wgpu GPU or NdArray CPU via Burn) ──
        //
        // The backend is always Ready here: it is initialised eagerly in
        // from_file() so there is no lazy-init stall inside synthesis.
        #[cfg(feature = "wgpu")]
        {
            let state = self.burn_decoder.lock().unwrap();
            if let LazyBurnDecoder::Ready(Some(ref bd)) = *state {
                return bd.decode(codes);
            }
        }

        // ── Fallback: raw ndarray CPU decoder ─────────────────────────────────
        Ok(decode_forward(codes, &self.weights))
    }

    /// Name of the active inference backend.
    ///
    /// | Return value            | Condition                                  |
    /// |-------------------------|--------------------------------------------|
    /// | `"burn/wgpu (GPU)"`     | `wgpu` feature + GPU adapter found         |
    /// | `"burn/ndarray (CPU)"`  | `wgpu` feature + no GPU (Burn NdArray)     |
    /// | `"cpu (ndarray)"`       | raw ndarray fallback (no `wgpu` feature)   |
    pub fn backend_name(&self) -> &str {
        #[cfg(feature = "wgpu")]
        {
            let state = self.burn_decoder.lock().unwrap();
            return match &*state {
                LazyBurnDecoder::Ready(Some(bd)) => bd.backend_name(),
                LazyBurnDecoder::Ready(None)     => "cpu (ndarray)",
            };
        }
        #[cfg(not(feature = "wgpu"))]
        "cpu (ndarray)"
    }

    /// Alias for [`from_file`](Self::from_file) — load from an explicit path.
    pub fn load(path: &Path) -> Result<Self> {
        Self::from_file(path)
    }

    /// Path from which the decoder was loaded.
    pub fn weights_path(&self) -> &Path {
        &self.path
    }

    /// Detected `hop_length` (audio samples per speech token).
    pub fn hop_length(&self) -> usize {
        self.weights.hop_length
    }
}

// ─── Encoder (stub) ───────────────────────────────────────────────────────────

/// NeuCodec encoder: converts a 16 kHz audio waveform to speech token IDs.
///
/// **Note**: The full NeuCodec encoder requires Wav2Vec2BertModel (~600 MB)
/// as a semantic feature extractor.  Encoder support is not yet implemented
/// in this pure-Rust build.
///
/// For reference audio encoding, use the Python `neucodec` package:
/// ```python
/// from neucodec import NeuCodec
/// model = NeuCodec.from_pretrained("neuphonic/neucodec")
/// codes = model.encode_code(waveform)   # → i32 array
/// ```
/// Then save the codes as a `.npy` file and pass via `--ref-codes` to the
/// synthesis examples.
pub struct NeuCodecEncoder;

impl NeuCodecEncoder {
    /// Always returns an error — encoder not yet implemented.
    pub fn new() -> Result<Self> {
        bail!(
            "The NeuCodec encoder is not yet implemented in the pure-Rust build.\n\
             \n\
             To encode reference audio, use the Python neucodec package:\n\
             \n\
             \tpip install neucodec huggingface_hub\n\
             \tpython scripts/encode_reference.py --audio reference.wav --out ref.npy\n\
             \n\
             Then pass the .npy file via --ref-codes to the synthesis examples."
        )
    }

    /// Always returns an error — encoder not yet implemented.
    pub fn load(_path: &Path) -> Result<Self> {
        Self::new()
    }

    /// Encode a WAV file to speech token IDs (not implemented).
    pub fn encode_wav(&self, _path: &Path) -> Result<Vec<i32>> {
        bail!("Encoder not implemented — see NeuCodecEncoder docs")
    }

    /// Backend name.
    pub fn backend_name(&self) -> &str {
        "not available"
    }
}

// ─── Resample helper ──────────────────────────────────────────────────────────

/// Naive linear resampler: changes sample rate of `samples` from `from_hz` to `to_hz`.
pub fn resample(samples: &[f32], from_hz: u32, to_hz: u32) -> Vec<f32> {
    if from_hz == to_hz {
        return samples.to_vec();
    }
    let ratio = from_hz as f64 / to_hz as f64;
    let out_len = (samples.len() as f64 / ratio).ceil() as usize;
    (0..out_len)
        .map(|i| {
            let src = i as f64 * ratio;
            let lo = src.floor() as usize;
            let hi = (lo + 1).min(samples.len() - 1);
            let frac = (src - lo as f64) as f32;
            samples[lo] * (1.0 - frac) + samples[hi] * frac
        })
        .collect()
}

// ─── Unit tests ───────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_fsq_decode_shape() {
        // Minimal project_out: 4-dim output, 8-dim input
        let w = Array2::ones((4, 8));
        let b = Array1::zeros(4);
        let codes = vec![0i32, 1, 2, 65535];
        let out = fsq_decode(&codes, w.view(), b.view());
        assert_eq!(out.shape(), &[4, 4]);
    }

    #[test]
    fn test_fsq_code_0() {
        // Code 0 → all digits 0 → all scaled to -1.0
        // project_out identity (8×8)
        let w = Array2::eye(8);
        let b = Array1::zeros(8);
        let out = fsq_decode(&[0], w.view(), b.view());
        for v in out.iter() {
            assert!((*v + 1.0).abs() < 1e-5, "expected -1.0, got {v}");
        }
    }

    #[test]
    fn test_fsq_code_max() {
        // Code 65535 = 4^8 - 1 → all digits 3 → all scaled to 1.0
        let w = Array2::eye(8);
        let b = Array1::zeros(8);
        let out = fsq_decode(&[65535], w.view(), b.view());
        for v in out.iter() {
            assert!((*v - 1.0).abs() < 1e-5, "expected 1.0, got {v}");
        }
    }

    #[test]
    fn test_linear_shape() {
        let x = Array2::ones((5, 3));
        let w = Array2::ones((7, 3));
        let b = Array1::zeros(7);
        let out = linear(x.view(), w.view(), Some(b.view()));
        assert_eq!(out.shape(), &[5, 7]);
    }

    #[test]
    fn test_conv1d_same_length() {
        let c_in = 4;
        let c_out = 8;
        let t = 16;
        let k = 3;
        let x = Array2::ones((c_in, t));
        let w = Array3::ones((c_out, c_in, k));
        let b = Array1::zeros(c_out);
        let out = conv1d(x.view(), w.view(), Some(b.view()), 1);
        assert_eq!(out.shape(), &[c_out, t]); // same length
    }

    #[test]
    fn test_group_norm_shape() {
        let c = 64;
        let t = 10;
        let x = Array2::ones((c, t));
        let w = Array1::ones(c);
        let b = Array1::zeros(c);
        let out = group_norm(x.view(), 4, w.view(), b.view(), 1e-6);
        assert_eq!(out.shape(), &[c, t]);
        // All-ones input → mean 1, var 0 → norm 0*w + b = 0
        for &v in out.iter() {
            assert!(v.abs() < 1e-4, "expected ~0 after group_norm of all-ones, got {v}");
        }
    }

    #[test]
    fn test_layer_norm_shape() {
        let t = 5;
        let c = 32;
        let x = Array2::from_elem((t, c), 2.0f32);
        let w = Array1::ones(c);
        let b = Array1::zeros(c);
        let out = layer_norm(x.view(), w.view(), b.view(), 1e-6);
        assert_eq!(out.shape(), &[t, c]);
        // Constant input → LayerNorm output is 0
        for &v in out.iter() {
            assert!(v.abs() < 1e-4, "expected ~0, got {v}");
        }
    }

    #[test]
    fn test_rms_norm_shape() {
        let t = 3;
        let c = 8;
        let x = Array2::ones((t, c));
        let w = Array1::ones(c);
        let out = rms_norm(x.view(), w.view(), 1e-6);
        assert_eq!(out.shape(), &[t, c]);
        // RMSNorm of all-ones → 1/rms(1) * 1 = 1
        for &v in out.iter() {
            assert!((v - 1.0).abs() < 1e-4, "expected 1.0, got {v}");
        }
    }

    #[test]
    fn test_rope_shape_preserved() {
        let t = 4;
        let n_heads = 2;
        let head_dim = 8;
        let mut x = Array3::ones((t, n_heads, head_dim));
        apply_rope(&mut x);
        assert_eq!(x.shape(), &[t, n_heads, head_dim]);
    }

    #[test]
    fn test_hann_window() {
        let w = hann_window(4);
        assert_eq!(w.len(), 4);
        // Hann window: w[0] = 0, w[n/2] = 1, w[n] = 0
        assert!(w[0].abs() < 1e-6);
        assert!((w[2] - 1.0).abs() < 1e-6);
    }

    fn make_ifft(n_fft: usize) -> std::sync::Arc<dyn rustfft::Fft<f32>> {
        FftPlanner::<f32>::new().plan_fft_inverse(n_fft)
    }

    #[test]
    fn test_istft_length() {
        let hop = 4;
        let n_fft = 16; // hop * 4
        let t = 10;
        let n_bins = n_fft / 2 + 1; // 9
        // Zero mag → exp(0)=1 magnitude, zero phase → cos(0)=1, sin(0)=0
        let mag   = Array2::zeros((n_bins, t));
        let phase = Array2::zeros((n_bins, t));
        let win   = hann_window(n_fft);
        let ifft  = make_ifft(n_fft);
        let audio = istft_burn(mag.view(), phase.view(), hop, &win, ifft.as_ref());
        // center=True: output is exactly T*hop samples
        assert_eq!(audio.len(), t * hop, "expected {} samples, got {}", t * hop, audio.len());
    }

    #[test]
    fn test_istft_clamp_does_not_blow_up() {
        // Log-magnitudes well above ln(100)≈4.6 must be clamped to 100 (linear),
        // not allowed to reach exp(large) ≈ infinity.
        let hop   = 4;
        let n_fft = 16;
        let t     = 4;
        let n_bins = n_fft / 2 + 1;
        // All log-magnitudes = 50 (would give exp(50) ≈ 5e21 without the fix)
        let mag   = Array2::from_elem((n_bins, t), 50.0f32);
        let phase = Array2::zeros((n_bins, t));
        let win   = hann_window(n_fft);
        let ifft  = make_ifft(n_fft);
        let audio = istft_burn(mag.view(), phase.view(), hop, &win, ifft.as_ref());
        // All samples must be finite and ≤ some reasonable bound (the clamp
        // limits linear magnitude to 1e2, so waveform values should be bounded)
        for &s in &audio {
            assert!(s.is_finite(), "sample is not finite: {s}");
            assert!(s.abs() < 1e6,  "sample magnitude suspiciously large: {s}");
        }
    }

    #[test]
    fn test_wgpu_feature_fn() {
        // Just verify it compiles and returns a bool
        let _ = wgpu_feature_enabled();
    }

    #[test]
    fn test_resample_identity() {
        let s: Vec<f32> = (0..100).map(|i| i as f32).collect();
        let r = resample(&s, 16_000, 16_000);
        assert_eq!(r, s);
    }
}