car-inference 0.14.0

//! Native LTX-2.3 video generation backend for Apple Silicon via mlx-rs.
//!
//! Implements the FULL LTX-2.3 transformer + 3D causal VAE architecture using
//! weights from `dgrauet/ltx-2.3-mlx-q4`. Transformer blocks are 4-bit quantized
//! (group_size=64); connector, VAE decoder, vocoder, and audio VAE are unquantized
//! BF16.
//!
//! Supports all generation modes: text-to-video, image-to-video, audio-video, and
//! video extension/retake.

use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::SystemTime;

use mlx_rs::module::{Module, Param};
use mlx_rs::nn;
use mlx_rs::ops;
use mlx_rs::ops::indexing::IndexOp;
use mlx_rs::Array;
use tracing::{info, warn};

use super::image_io::load_rgb_image;
use super::mlx::{build_qlinear, QLinear, QuantConfig};
use crate::tasks::generate_video::{GenerateVideoRequest, GenerateVideoResult, VideoMode};
use crate::InferenceError;

// ─── LTX Config ───────────────────────────────────────────────────────────

/// Configuration for the LTX-2.3 model (video + audio).
///
/// The `Default` impl provides the values this backend uses for the 4-bit
/// quantized checkpoint. Fields whose consumers are not visible in this part
/// of the file are documented with hedged notes.
#[derive(Debug, Clone)]
pub struct LtxConfig {
    /// Width of the video token stream (4096 by default).
    /// NOTE(review): presumably equals `num_heads * head_dim` — confirm.
    pub hidden_dim: usize,
    /// Head count passed to the video self-attention and video text
    /// cross-attention of every transformer block.
    pub num_heads: usize,
    /// Per-head width passed to the same video attention modules.
    pub head_dim: usize,
    /// Number of transformer blocks (48 by default).
    pub num_layers: usize,
    /// Text-conditioning width (4096 by default).
    /// NOTE(review): not referenced in the visible part of the file — confirm usage.
    pub cross_attention_dim: usize,
    /// Latent channel count on the input side (128 by default).
    /// NOTE(review): not referenced in the visible part of the file — confirm usage.
    pub in_channels: usize,
    /// Latent channel count on the output side (128 by default).
    /// NOTE(review): not referenced in the visible part of the file — confirm usage.
    pub out_channels: usize,
    /// Epsilon handed to the Q/K RMS norms of every attention module.
    pub norm_eps: f32,
    /// Quantization layout forwarded to `build_qlinear` for the
    /// transformer-block linears.
    /// NOTE(review): `None` presumably selects unquantized weights — confirm
    /// in `build_qlinear`.
    pub quant: Option<QuantConfig>,
    // Audio dimensions
    /// Width of the audio token stream (2048 by default).
    pub audio_hidden_dim: usize,
    /// Head count passed to the audio attention modules and to both
    /// audio/video cross-attention modules.
    pub audio_heads: usize,
    /// Per-head width passed to those same modules.
    pub audio_head_dim: usize,
}

impl Default for LtxConfig {
    /// Returns the configuration assumed for the 4-bit (group size 64)
    /// LTX-2.3 checkpoint.
    fn default() -> Self {
        // Quantization layout of the transformer-block linears.
        let quant = QuantConfig {
            group_size: 64,
            bits: 4,
        };

        Self {
            // Video stream.
            num_layers: 48,
            num_heads: 32,
            head_dim: 128,
            hidden_dim: 4096,
            cross_attention_dim: 4096,
            // Latent channels.
            in_channels: 128,
            out_channels: 128,
            // Numerics.
            norm_eps: 1e-6,
            quant: Some(quant),
            // Audio stream.
            audio_heads: 32,
            audio_head_dim: 64,
            audio_hidden_dim: 2048,
        }
    }
}

// ─── Parity Tensor Dumping ────────────────────────────────────────────────

/// Save a tensor as raw little-endian f32 alongside a `.meta` sidecar so the
/// Python `tools/parity/diff_ltx.py` can compare byte-wise. Activated per-run
/// via `CAR_DUMP_LTX_STAGE=<dir>`. Mirrors `dump_flux_stage` in mlx_flux.rs.
fn dump_ltx_stage(name: &str, t: &Array) {
    let Ok(dir) = std::env::var("CAR_DUMP_LTX_STAGE") else {
        return;
    };
    let _ = std::fs::create_dir_all(&dir);
    let Ok(t_f32) = t.as_dtype(mlx_rs::Dtype::Float32) else {
        return;
    };
    let _ = mlx_rs::transforms::eval([&t_f32]);
    let shape = t_f32.shape().to_vec();
    let data: &[f32] = t_f32.as_slice();
    let bin_path = format!("{dir}/{name}.bin");
    let meta_path = format!("{dir}/{name}.meta");
    let bytes: Vec<u8> = data.iter().flat_map(|v| v.to_le_bytes()).collect();
    let _ = std::fs::write(&bin_path, &bytes);
    let _ = std::fs::write(&meta_path, format!("{shape:?}\n"));
}

/// Dump `t` under `name` the first time that name is seen in this process and
/// ignore every later call with the same name. This makes it safe to call
/// from inside a loop: later iterations cannot overwrite the first dump.
///
/// The name is recorded before `dump_ltx_stage` runs, so it counts as seen
/// even when that call ends up writing nothing.
fn dump_ltx_stage_first_call(name: &str, t: &Array) {
    use std::collections::HashSet;
    use std::sync::Mutex;

    // Process-wide registry of names that were already dumped.
    static SEEN: Mutex<Option<HashSet<String>>> = Mutex::new(None);

    // Keep the lock only for the bookkeeping, not for the file I/O.
    let is_first = {
        let mut guard = SEEN.lock().unwrap();
        let names = guard.get_or_insert_with(HashSet::new);
        let inserted = names.insert(name.to_string());
        inserted
    };

    if is_first {
        dump_ltx_stage(name, t);
    }
}

// ─── Helpers ──────────────────────────────────────────────────────────────

/// Look up `key` in `tensors` and return a clone of the array.
///
/// # Errors
/// Returns `InferenceError::InferenceFailed("missing tensor: <key>")` when
/// the key is absent.
fn get_tensor(tensors: &HashMap<String, Array>, key: &str) -> Result<Array, InferenceError> {
    match tensors.get(key) {
        Some(found) => Ok(found.clone()),
        None => Err(InferenceError::InferenceFailed(format!(
            "missing tensor: {key}"
        ))),
    }
}

/// Assemble an `nn::Linear` from unquantized (BF16/F32) tensors stored under
/// `{prefix}.weight` and, when present, `{prefix}.bias`.
///
/// # Errors
/// Fails when the weight tensor is missing; a missing bias is not an error.
fn build_dense_linear(
    tensors: &HashMap<String, Array>,
    prefix: &str,
) -> Result<nn::Linear, InferenceError> {
    let weight_key = format!("{prefix}.weight");
    let bias_key = format!("{prefix}.bias");

    // The weight is mandatory; the bias is taken only if the checkpoint has one.
    let weight = Param::new(get_tensor(tensors, &weight_key)?);
    let bias = Param::new(tensors.get(&bias_key).cloned());

    Ok(nn::Linear { weight, bias })
}

/// Read every LTX weight file in `model_dir` and merge the tensors into one
/// name → array map.
///
/// LTX-2.3 ships its weights in separate files:
/// - transformer-distilled.safetensors — transformer blocks (quantized)
/// - connector.safetensors — text embedding connector (BF16)
/// - vae_decoder.safetensors — 3D causal VAE decoder (BF16)
/// - audio_vae.safetensors — audio VAE weights (BF16)
/// - vocoder.safetensors — vocoder weights for audio output (BF16)
/// - vae_encoder.safetensors — optional; loaded when present to enable
///   extra modes
///
/// Files are read in the order listed; when two files carry the same tensor
/// name, the later one replaces the earlier entry.
///
/// # Errors
/// Fails when a required file does not exist or when any existing file
/// cannot be loaded.
fn load_ltx_tensors(model_dir: &Path) -> Result<HashMap<String, Array>, InferenceError> {
    // (file name, required?) in load order: required files first, then optional ones.
    const WEIGHT_FILES: [(&str, bool); 6] = [
        ("transformer-distilled.safetensors", true),
        ("connector.safetensors", true),
        ("vae_decoder.safetensors", true),
        ("audio_vae.safetensors", true),
        ("vocoder.safetensors", true),
        ("vae_encoder.safetensors", false),
    ];

    let mut merged = HashMap::new();
    for &(filename, required) in &WEIGHT_FILES {
        let path = model_dir.join(filename);
        if !path.exists() {
            if required {
                return Err(InferenceError::InferenceFailed(format!(
                    "missing weight file: {}",
                    path.display()
                )));
            }
            // Optional file not shipped with this checkpoint.
            continue;
        }

        let loaded = Array::load_safetensors(&path)
            .map_err(|e| InferenceError::InferenceFailed(format!("load {filename}: {e}")))?;
        merged.extend(loaded);
    }
    Ok(merged)
}

// ─── RMSNorm ─────────────────────────────────────────────────────────────

/// Root-mean-square normalization over the last axis with a learned scale.
struct RmsNorm {
    /// Scale multiplied onto the normalized input (broadcast against it).
    weight: Array,
    /// Constant added to the mean of squares before the reciprocal square root.
    eps: f32,
}

impl RmsNorm {
    /// Computes `x * rsqrt(mean(x², last axis) + eps) * weight`.
    ///
    /// The mean keeps the reduced axis, so the factor broadcasts back over `x`.
    fn forward(&self, x: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Mean of squares along the trailing axis (dimension kept for broadcasting).
        let mean_sq = ops::multiply(x, x)?.mean_axes(&[-1], true)?;
        // Reciprocal RMS, stabilised by eps.
        let inv_rms = ops::rsqrt(&ops::add(&mean_sq, &Array::from_f32(self.eps))?)?;
        let unit = ops::multiply(x, &inv_rms)?;
        ops::multiply(&unit, &self.weight)
    }
}

/// Create an [`RmsNorm`] whose scale is the tensor stored at `{prefix}.weight`.
///
/// # Errors
/// Fails when that tensor is missing from `tensors`.
fn build_rms_norm(
    tensors: &HashMap<String, Array>,
    prefix: &str,
    eps: f32,
) -> Result<RmsNorm, InferenceError> {
    let weight_key = format!("{prefix}.weight");
    get_tensor(tensors, &weight_key).map(|weight| RmsNorm { weight, eps })
}

// ─── Gated Attention ──────────────────────────────────────────────────────

/// Self-attention with QK RMS norms and per-head gating.
///
/// Queries, keys and values are all projected from the same input. The
/// attention output of each head is scaled by `2 * sigmoid(to_gate_logits(x))`
/// before the output projection.
struct GatedSelfAttention {
    /// Query projection.
    to_q: QLinear,
    /// Key projection.
    to_k: QLinear,
    /// Value projection.
    to_v: QLinear,
    /// Output projection applied to the gated, head-merged attention result.
    to_out: QLinear,
    /// RMS norm applied to the projected queries before the head split.
    norm_q: RmsNorm,
    /// RMS norm applied to the projected keys before the head split.
    norm_k: RmsNorm,
    /// Projection producing one gate logit per head and token.
    to_gate_logits: QLinear,
    /// Number of attention heads.
    num_heads: usize,
    /// Width of each head; also sets the softmax scale `1 / sqrt(head_dim)`.
    head_dim: usize,
}

impl GatedSelfAttention {
    /// Load from quantized transformer block weights.
    /// Uses `{prefix}.to_out` key pattern for the output projection.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        quant: Option<&QuantConfig>,
        num_heads: usize,
        head_dim: usize,
        eps: f32,
    ) -> Result<Self, InferenceError> {
        Ok(Self {
            to_q: build_qlinear(tensors, &format!("{prefix}.to_q"), quant)?,
            to_k: build_qlinear(tensors, &format!("{prefix}.to_k"), quant)?,
            to_v: build_qlinear(tensors, &format!("{prefix}.to_v"), quant)?,
            to_out: build_qlinear(tensors, &format!("{prefix}.to_out"), quant)?,
            norm_q: build_rms_norm(tensors, &format!("{prefix}.q_norm"), eps)?,
            norm_k: build_rms_norm(tensors, &format!("{prefix}.k_norm"), eps)?,
            to_gate_logits: build_qlinear(tensors, &format!("{prefix}.to_gate_logits"), quant)?,
            num_heads,
            head_dim,
        })
    }

    fn forward(
        &mut self,
        x: &Array,
        rope_freqs: Option<(&Array, &Array)>,
    ) -> Result<Array, mlx_rs::error::Exception> {
        let shape = x.shape();
        let (batch, seq_len) = (shape[0] as usize, shape[1] as usize);
        let nh = self.num_heads as i32;
        let hd = self.head_dim as i32;
        let hidden = (self.num_heads * self.head_dim) as i32;

        // 1. Q/K/V projections
        let q = self.to_q.forward(x)?;
        let k = self.to_k.forward(x)?;
        let v = self.to_v.forward(x)?;

        // 2. Apply QK RMS norms on the flat inner_dim axis BEFORE head split
        // (norm weight shape is [inner_dim=num_heads*head_dim], upstream matches).
        let q = self.norm_q.forward(&q)?;
        let k = self.norm_k.forward(&k)?;

        // 3. Reshape to [batch, num_heads, seq_len, head_dim]
        let reshape_head = |t: Array| -> Result<Array, mlx_rs::error::Exception> {
            let r = ops::reshape(&t, &[batch as i32, seq_len as i32, nh, hd])?;
            ops::transpose_axes(&r, &[0, 2, 1, 3])
        };
        let mut q = reshape_head(q)?;
        let mut k = reshape_head(k)?;
        let v = reshape_head(v)?;

        // 4. Apply SPLIT RoPE if caller provided cos/sin freqs.
        // Upstream LTXModel config default: rope_type="split" → the DiT's
        // self-attention uses apply_rope_split (attention.py:117).
        if let Some((cos_f, sin_f)) = rope_freqs {
            q = apply_split_rope(&q, cos_f, sin_f)?;
            k = apply_split_rope(&k, cos_f, sin_f)?;
        }

        // 5. Scaled dot-product attention — MLX's fused SDPA runs softmax in
        // f32 regardless of input dtype, matching upstream attention.py:127.
        // A manual matmul + softmax_axis chain does the softmax in bf16 when
        // inputs are bf16, drifting the attention weights by a tiny amount
        // that compounds across 48 blocks.
        let scale = 1.0_f32 / (self.head_dim as f32).sqrt();
        let attn_out = mlx_rs::fast::scaled_dot_product_attention(&q, &k, &v, scale, None)?;

        // Reshape back to [batch, seq_len, num_heads, head_dim]
        let attn_out = ops::transpose_axes(&attn_out, &[0, 2, 1, 3])?;

        // 5. Gate via sigmoid(to_gate_logits(x)) per head
        // gate_logits: [batch, seq, num_heads]
        let gate_logits = self.to_gate_logits.forward(x)?;
        // Upstream: `gate = 2 * sigmoid(logits)`. Zero-init logits then give
        // identity gate (2 * 0.5 = 1.0). Forgetting the factor 2 halves every
        // attention output and compounds hard across 48 blocks.
        let two = Array::from_f32(2.0);
        let gate = ops::multiply(&ops::sigmoid(&gate_logits)?, &two)?;
        // Expand gate to [batch, seq, num_heads, 1] for broadcasting
        let gate = ops::reshape(&gate, &[batch as i32, seq_len as i32, nh, 1])?;
        // Apply per-head gating: attn_out is [batch, seq, num_heads, head_dim]
        let gated = ops::multiply(&attn_out, &gate)?;

        // 6. Reshape to [batch, seq, hidden_dim] and output projection
        let gated = ops::reshape(&gated, &[batch as i32, seq_len as i32, hidden])?;
        self.to_out.forward(&gated)
    }
}

/// Cross-attention with QK RMS norms and per-head gating.
///
/// Queries are projected from the input `x`, keys and values from a separate
/// `context`. The per-head gate `2 * sigmoid(to_gate_logits(x))` is derived
/// from the query-side input.
struct GatedCrossAttention {
    /// Query projection (applied to `x`).
    to_q: QLinear,
    /// Key projection (applied to `context`).
    to_k: QLinear,
    /// Value projection (applied to `context`).
    to_v: QLinear,
    /// Output projection applied to the gated, head-merged attention result.
    to_out: QLinear,
    /// RMS norm applied to the projected queries before the head split.
    norm_q: RmsNorm,
    /// RMS norm applied to the projected keys before the head split.
    norm_k: RmsNorm,
    /// Projection producing one gate logit per head and query token.
    to_gate_logits: QLinear,
    /// Number of attention heads.
    num_heads: usize,
    /// Width of each head; also sets the softmax scale `1 / sqrt(head_dim)`.
    head_dim: usize,
}

impl GatedCrossAttention {
    /// Load from quantized transformer block weights.
    /// Uses `{prefix}.to_out` key pattern for the output projection.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        quant: Option<&QuantConfig>,
        num_heads: usize,
        head_dim: usize,
        eps: f32,
    ) -> Result<Self, InferenceError> {
        Ok(Self {
            to_q: build_qlinear(tensors, &format!("{prefix}.to_q"), quant)?,
            to_k: build_qlinear(tensors, &format!("{prefix}.to_k"), quant)?,
            to_v: build_qlinear(tensors, &format!("{prefix}.to_v"), quant)?,
            to_out: build_qlinear(tensors, &format!("{prefix}.to_out"), quant)?,
            norm_q: build_rms_norm(tensors, &format!("{prefix}.q_norm"), eps)?,
            norm_k: build_rms_norm(tensors, &format!("{prefix}.k_norm"), eps)?,
            to_gate_logits: build_qlinear(tensors, &format!("{prefix}.to_gate_logits"), quant)?,
            num_heads,
            head_dim,
        })
    }

    fn forward(
        &mut self,
        x: &Array,
        context: &Array,
        rope_q: Option<(&Array, &Array)>,
        rope_k: Option<(&Array, &Array)>,
    ) -> Result<Array, mlx_rs::error::Exception> {
        let x_shape = x.shape();
        let (batch, x_seq) = (x_shape[0] as usize, x_shape[1] as usize);
        let ctx_seq = context.shape()[1] as usize;
        let nh = self.num_heads as i32;
        let hd = self.head_dim as i32;
        let hidden = (self.num_heads * self.head_dim) as i32;

        // 1. Q from x, K/V from context
        let q = self.to_q.forward(x)?;
        let k = self.to_k.forward(context)?;
        let v = self.to_v.forward(context)?;

        // 2. QK RMS norms on flat inner_dim axis BEFORE head split.
        let q = self.norm_q.forward(&q)?;
        let k = self.norm_k.forward(&k)?;

        // 3. Reshape to [batch, num_heads, seq, head_dim]
        let mut q = ops::transpose_axes(
            &ops::reshape(&q, &[batch as i32, x_seq as i32, nh, hd])?,
            &[0, 2, 1, 3],
        )?;
        let mut k = ops::transpose_axes(
            &ops::reshape(&k, &[batch as i32, ctx_seq as i32, nh, hd])?,
            &[0, 2, 1, 3],
        )?;
        let v = ops::transpose_axes(
            &ops::reshape(&v, &[batch as i32, ctx_seq as i32, nh, hd])?,
            &[0, 2, 1, 3],
        )?;

        // 3a. Apply SPLIT RoPE per side (upstream av_cross_attn path).
        // Video-text CA passes None on both sides — matches upstream (no RoPE
        // on text tokens). AV cross-attn passes the 1D-temporal rope freqs for
        // the query's modality (rope_q) and the key's modality (rope_k).
        if let Some((cos_f, sin_f)) = rope_q {
            q = apply_split_rope(&q, cos_f, sin_f)?;
        }
        if let Some((cos_f, sin_f)) = rope_k {
            k = apply_split_rope(&k, cos_f, sin_f)?;
        }

        // Scaled dot-product attention — fused SDPA runs softmax in f32,
        // matching upstream. See the matching comment in GatedSelfAttention.
        let scale = 1.0_f32 / (self.head_dim as f32).sqrt();
        let attn_out = mlx_rs::fast::scaled_dot_product_attention(&q, &k, &v, scale, None)?;

        // Reshape back to [batch, x_seq, num_heads, head_dim]
        let attn_out = ops::transpose_axes(&attn_out, &[0, 2, 1, 3])?;

        // 3. Per-head gating via to_gate_logits(x)
        let gate_logits = self.to_gate_logits.forward(x)?;
        // Upstream: `gate = 2 * sigmoid(logits)`. Zero-init logits then give
        // identity gate (2 * 0.5 = 1.0). Forgetting the factor 2 halves every
        // attention output and compounds hard across 48 blocks.
        let two = Array::from_f32(2.0);
        let gate = ops::multiply(&ops::sigmoid(&gate_logits)?, &two)?;
        let gate = ops::reshape(&gate, &[batch as i32, x_seq as i32, nh, 1])?;
        let gated = ops::multiply(&attn_out, &gate)?;

        // 4. Reshape and output projection
        let gated = ops::reshape(&gated, &[batch as i32, x_seq as i32, hidden])?;
        self.to_out.forward(&gated)
    }
}

// ─── Transformer Block FFN ────────────────────────────────────────────────

/// Feed-forward network for the LTX transformer blocks.
///
/// Despite the type name, this is NOT a GeGLU: it is a plain activated MLP
/// (`proj_in` → tanh-approximated GELU → `proj_out`).
///
/// Weight shapes in the real checkpoint: `proj_in [16384, 4096]`,
/// `proj_out [4096, 16384]`, i.e. a 4× expansion with no gate half. A previous
/// revision of this code split `proj_in`'s output in half and multiplied the
/// halves, which matched neither the weight shapes nor the upstream
/// architecture.
struct GeGluFfn {
    /// Expansion projection applied to the input.
    proj_in: QLinear,
    /// Contraction projection applied after the activation.
    proj_out: QLinear,
}

impl GeGluFfn {
    /// Builds the feed-forward module from `{prefix}.proj_in` and
    /// `{prefix}.proj_out`, read through `build_qlinear` with `quant`.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        quant: Option<&QuantConfig>,
    ) -> Result<Self, InferenceError> {
        let proj_in = build_qlinear(tensors, &format!("{prefix}.proj_in"), quant)?;
        let proj_out = build_qlinear(tensors, &format!("{prefix}.proj_out"), quant)?;
        Ok(Self { proj_in, proj_out })
    }

    /// Applies `proj_out(gelu_approximate(proj_in(x)))`.
    fn forward(&mut self, x: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Standard MLP: expand, activate, contract. No GeGLU split.
        let expanded = self.proj_in.forward(x)?;

        // The tanh-based approximation is deliberate: per the original notes
        // upstream (ltx-2-mlx) uses `gelu_approx`, and the gap to the exact
        // erf-based GELU compounds across the block stack.
        let activated = nn::gelu_approximate(&expanded)?;

        self.proj_out.forward(&activated)
    }
}

// ─── Transformer Block ────────────────────────────────────────────────────

/// One LTX transformer block with FULL video + audio pathways.
///
/// Each of the 48 blocks contains:
///
/// Video pathway:
/// - `attn1`: Video self-attention (4096 dim, 32 heads, 128 head_dim)
/// - `attn2`: Video cross-attention for text conditioning
/// - `ff`: Video feed-forward (`GeGluFfn`, which is a plain GELU MLP despite
///   its name)
/// - `scale_shift_table`: [9, 4096] AdaLN modulation
/// - `prompt_scale_shift_table`: [2, 4096]
///
/// Audio pathway:
/// - `audio_attn1`: Audio self-attention (2048 dim, 32 heads, 64 head_dim)
/// - `audio_attn2`: Audio cross-attention for text conditioning
/// - `audio_ff`: Audio feed-forward (same `GeGluFfn` type)
/// - `audio_scale_shift_table`: [9, 2048]
/// - `audio_prompt_scale_shift_table`: [2, 2048]
///
/// Audio-video cross-attention:
/// - `audio_to_video_attn` and `video_to_audio_attn`: cross-modal attention,
///   both loaded with the audio head layout (32 heads × 64 head_dim)
/// - `scale_shift_table_a2v_ca_audio`: [5, 2048]
/// - `scale_shift_table_a2v_ca_video`: [5, 4096]
///
/// The dimensions above are the checkpoint shapes recorded by the original
/// author; the struct itself does not enforce them.
struct LtxTransformerBlock {
    // Video pathway
    /// Video self-attention.
    attn1: GatedSelfAttention,
    /// Video ← text cross-attention.
    attn2: GatedCrossAttention,
    /// Video feed-forward.
    ff: GeGluFfn,
    /// AdaLN scale/shift table: [9, hidden_dim]
    scale_shift_table: Array,
    /// Prompt AdaLN: [2, hidden_dim]
    prompt_scale_shift_table: Array,

    // Audio pathway
    /// Audio self-attention.
    audio_attn1: GatedSelfAttention,
    /// Audio ← text cross-attention.
    audio_attn2: GatedCrossAttention,
    /// Audio feed-forward.
    audio_ff: GeGluFfn,
    /// Audio AdaLN scale/shift table: [9, audio_hidden_dim]
    audio_scale_shift_table: Array,
    /// Audio prompt AdaLN: [2, audio_hidden_dim]
    audio_prompt_scale_shift_table: Array,

    // Audio-video cross-attention
    /// Cross-modal attention whose output is projected to the video width
    /// (inner dim 2048, out 4096 per the original notes).
    /// NOTE(review): the earlier comments described BOTH cross-modal modules
    /// as "audio queries attend to video context"; which modality supplies Q
    /// and which supplies K/V is not visible in this part of the file —
    /// confirm against `forward`.
    audio_to_video_attn: GatedCrossAttention,
    /// Cross-modal attention whose output stays at the audio width
    /// (inner dim 2048, out 2048 per the original notes).
    /// NOTE(review): same caveat as above about the Q vs K/V modality.
    video_to_audio_attn: GatedCrossAttention,
    /// Audio-to-video cross-attention modulation (audio side): [5, 2048]
    scale_shift_table_a2v_ca_audio: Array,
    /// Audio-to-video cross-attention modulation (video side): [5, 4096]
    scale_shift_table_a2v_ca_video: Array,
}

impl LtxTransformerBlock {
    /// Loads every sub-module and modulation table of one transformer block
    /// from the tensors stored under `prefix`.
    ///
    /// Video attention uses `config.num_heads` × `config.head_dim`; audio
    /// attention and both audio/video cross-attention modules use
    /// `config.audio_heads` × `config.audio_head_dim`. All quantized linears
    /// share `config.quant`, all Q/K norms share `config.norm_eps`.
    ///
    /// # Errors
    /// Returns the first error raised while resolving a tensor; components
    /// are resolved in the order they appear below.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        config: &LtxConfig,
    ) -> Result<Self, InferenceError> {
        let quant = config.quant.as_ref();
        let eps = config.norm_eps;
        let (v_heads, v_head_dim) = (config.num_heads, config.head_dim);
        let (a_heads, a_head_dim) = (config.audio_heads, config.audio_head_dim);
        let key = |leaf: &str| format!("{prefix}.{leaf}");

        // ── Video pathway ──
        let attn1 =
            GatedSelfAttention::load(tensors, &key("attn1"), quant, v_heads, v_head_dim, eps)?;
        let attn2 =
            GatedCrossAttention::load(tensors, &key("attn2"), quant, v_heads, v_head_dim, eps)?;
        let ff = GeGluFfn::load(tensors, &key("ff"), quant)?;
        let scale_shift_table = get_tensor(tensors, &key("scale_shift_table"))?;
        let prompt_scale_shift_table = get_tensor(tensors, &key("prompt_scale_shift_table"))?;

        // ── Audio pathway ──
        let audio_attn1 = GatedSelfAttention::load(
            tensors,
            &key("audio_attn1"),
            quant,
            a_heads,
            a_head_dim,
            eps,
        )?;
        let audio_attn2 = GatedCrossAttention::load(
            tensors,
            &key("audio_attn2"),
            quant,
            a_heads,
            a_head_dim,
            eps,
        )?;
        let audio_ff = GeGluFfn::load(tensors, &key("audio_ff"), quant)?;
        let audio_scale_shift_table = get_tensor(tensors, &key("audio_scale_shift_table"))?;
        let audio_prompt_scale_shift_table =
            get_tensor(tensors, &key("audio_prompt_scale_shift_table"))?;

        // ── Audio-video cross-attention ──
        // Both cross-modal modules run with the audio head layout.
        let audio_to_video_attn = GatedCrossAttention::load(
            tensors,
            &key("audio_to_video_attn"),
            quant,
            a_heads,
            a_head_dim,
            eps,
        )?;
        let video_to_audio_attn = GatedCrossAttention::load(
            tensors,
            &key("video_to_audio_attn"),
            quant,
            a_heads,
            a_head_dim,
            eps,
        )?;
        let scale_shift_table_a2v_ca_audio =
            get_tensor(tensors, &key("scale_shift_table_a2v_ca_audio"))?;
        let scale_shift_table_a2v_ca_video =
            get_tensor(tensors, &key("scale_shift_table_a2v_ca_video"))?;

        Ok(Self {
            attn1,
            attn2,
            ff,
            scale_shift_table,
            prompt_scale_shift_table,
            audio_attn1,
            audio_attn2,
            audio_ff,
            audio_scale_shift_table,
            audio_prompt_scale_shift_table,
            audio_to_video_attn,
            video_to_audio_attn,
            scale_shift_table_a2v_ca_audio,
            scale_shift_table_a2v_ca_video,
        })
    }

    fn forward(
        &mut self,
        x: &Array,
        context: &Array,
        timestep_emb: &Array,
        audio: Option<&Array>,
        audio_context: Option<&Array>,
        audio_timestep_emb: Option<&Array>,
        video_rope_freqs: Option<(&Array, &Array)>,
        audio_rope_freqs: Option<(&Array, &Array)>,
        video_cross_rope_freqs: Option<(&Array, &Array)>,
        audio_cross_rope_freqs: Option<(&Array, &Array)>,
        // Additional timestep-dependent AdaLN outputs. Upstream computes each
        // of these via a separate `AdaLayerNormSingle` module (prompt_adaln,
        // av_ca_video_scale_shift_adaln, …) and ADDS them to the per-block
        // scale_shift tables before applying. These were loaded but unused,
        // so prompt + AV cross-attn modulations were missing their
        // timestep-dependent component entirely.
        video_prompt_emb: &Array, // [B, 2 * vdim] — prompt scale/shift (video side)
        audio_prompt_emb: Option<&Array>, // [B, 2 * adim]
        av_ca_video_emb: &Array,  // [B, 4 * vdim] — AV cross-attn video scale/shift
        av_ca_audio_emb: Option<&Array>, // [B, 4 * adim]
        av_ca_a2v_gate_emb: &Array, // [B, vdim] — A->V gate
        av_ca_v2a_gate_emb: Option<&Array>, // [B, adim]
    ) -> Result<(Array, Option<Array>), mlx_rs::error::Exception> {
        // Helper: AdaLN modulate = rms_norm(x) * (1 + scale) + shift. Used on
        // the residual-stream activations (video/audio hidden states).
        let adaln_modulate =
            |x: &Array, scale: &Array, shift: &Array| -> Result<Array, mlx_rs::error::Exception> {
                let eps = Array::from_f32(1e-6);
                let var = ops::multiply(x, x)?.mean_axes(&[-1], true)?;
                let inv = ops::rsqrt(&ops::add(&var, &eps)?)?;
                let normed = ops::multiply(x, &inv)?;
                let one = Array::from_f32(1.0);
                let scaled = ops::multiply(&normed, &ops::add(&one, scale)?)?;
                ops::add(&scaled, shift)
            };

        // Helper: raw affine modulate = x * (1 + scale) + shift (no RMS-norm).
        // Used on text embeddings in the cross-attention prompt modulation —
        // upstream doesn't normalize text embeds, just scales and shifts.
        let adaln_modulate_raw =
            |x: &Array, scale: &Array, shift: &Array| -> Result<Array, mlx_rs::error::Exception> {
                let one = Array::from_f32(1.0);
                let scaled = ops::multiply(x, &ops::add(&one, scale)?)?;
                ops::add(&scaled, shift)
            };

        // Helper: extract row from a scale_shift_table (no timestep), returning [1, 1, hidden].
        let extract_mod = |table: &Array, idx: i32| -> Result<Array, mlx_rs::error::Exception> {
            let row = table.index((idx, ..)); // [hidden]
            ops::reshape(&row, &[1, 1, row.shape()[0]])
        };

        // Helper: identity-pass for shapes that are already [*, 1, *] or [1, hidden].
        // Used for pre-computed pieces that don't require further reshaping.
        let unsqueeze = |a: &Array| -> Result<Array, mlx_rs::error::Exception> {
            let s = a.shape();
            if s.len() == 1 {
                ops::reshape(a, &[1, 1, s[0]])
            } else if s.len() == 2 {
                ops::reshape(a, &[s[0], 1, s[1]])
            } else {
                Ok(a.clone())
            }
        };

        // Helper: chunk an AdaLN-Single output by param index `i` and add the
        // matching row of `table` ([N, hidden]). Two input shapes supported:
        //
        // - **uniform timestep** `[B, N*hidden]` → returns `[B, 1, hidden]`,
        //   broadcasts against `[B, seq, hidden]` (t2v / audio-video path).
        // - **per-token timestep** `[B, T, N*hidden]` → returns `[B, T, hidden]`,
        //   aligns position-wise with `[B, seq=T, hidden]` (i2v with denoise
        //   mask — conditioned tokens get t=0, others get scheduler sigma).
        // Generic N-param chunk: handles the `(num_params × hidden)` adaln
        // outputs when the caller's table may have MORE rows than num_params
        // (e.g. AV table has 5 rows but the scale/shift adaln has 4 params,
        // with row 4 used by a separate gate adaln).
        let ada_chunk_n = |table: &Array,
                           t_emb: &Array,
                           idx: i32,
                           num_params: i32|
         -> Result<Array, mlx_rs::error::Exception> {
            let hidden = table.shape()[1];
            let t_shape = t_emb.shape();
            let row = table.index((idx, ..)); // [hidden]
            match t_shape.len() {
                2 => {
                    let b = t_shape[0];
                    let reshaped = ops::reshape(t_emb, &[b, num_params, hidden])?;
                    let slice_i = reshaped.index((.., idx..idx + 1, ..)); // [B, 1, hidden]
                    let row_bcast = ops::reshape(&row, &[1, 1, hidden])?;
                    ops::add(&slice_i, &row_bcast)
                }
                3 => {
                    let (b, t) = (t_shape[0], t_shape[1]);
                    let reshaped = ops::reshape(t_emb, &[b, t, num_params, hidden])?;
                    let slice_i = reshaped.index((.., .., idx..idx + 1, ..)); // [B, T, 1, h]
                    let collapsed = ops::reshape(&slice_i, &[b, t, hidden])?; // [B, T, h]
                    let row_bcast = ops::reshape(&row, &[1, 1, hidden])?;
                    ops::add(&collapsed, &row_bcast)
                }
                other => Err(mlx_rs::error::Exception::custom(format!(
                    "ada_chunk: unsupported timestep embedding rank {other}"
                ))),
            }
        };

        // Convenience wrapper: use the full table row count as num_params.
        // Correct for tables where the adaln output matches the table's row
        // count exactly (e.g. scale_shift_table is 9 rows, adaln emits 9 params).
        let ada_chunk =
            |table: &Array, t_emb: &Array, idx: i32| -> Result<Array, mlx_rs::error::Exception> {
                ada_chunk_n(table, t_emb, idx, table.shape()[0])
            };

        // ── Video pathway ──
        //
        // Upstream BasicAVTransformerBlock AdaLN ordering (9 params):
        //   [0-2] self-attn: shift, scale, gate
        //   [3-5] feed-forward: shift, scale, gate
        //   [6-8] text cross-attn: shift, scale, gate
        // A prior revision of this code had FF ↔ CA swapped, which was a major
        // source of divergence from the upstream reference.
        let shift_sa = ada_chunk(&self.scale_shift_table, timestep_emb, 0)?;
        let scale_sa = ada_chunk(&self.scale_shift_table, timestep_emb, 1)?;
        let gate_sa = ada_chunk(&self.scale_shift_table, timestep_emb, 2)?;
        let shift_ff = ada_chunk(&self.scale_shift_table, timestep_emb, 3)?;
        let scale_ff = ada_chunk(&self.scale_shift_table, timestep_emb, 4)?;
        let gate_ff = ada_chunk(&self.scale_shift_table, timestep_emb, 5)?;
        let shift_ca = ada_chunk(&self.scale_shift_table, timestep_emb, 6)?;
        let scale_ca = ada_chunk(&self.scale_shift_table, timestep_emb, 7)?;
        let gate_ca = ada_chunk(&self.scale_shift_table, timestep_emb, 8)?;

        // 1. Video self-attention: x + gate * attn(rms(x)*(1+scale)+shift)
        let x_mod = adaln_modulate(x, &scale_sa, &shift_sa)?;
        let attn_out = self.attn1.forward(&x_mod, video_rope_freqs)?;
        let mut x_vid = ops::add(x, &ops::multiply(&gate_sa, &attn_out)?)?;

        // 2. Video text cross-attention — modulate TEXT embeds with
        // prompt_scale_shift_table [2, vdim] PLUS per-timestep adaln from
        // prompt_adaln_single (video_prompt_emb has shape [B, 2*vdim]).
        // Upstream `_unpack_adaln` reshapes video_prompt_emb to (B, 2, vdim)
        // and ADDS the table's [:2] rows to it before using.
        //
        // Prior code used only the static table rows, dropping the timestep
        // component entirely — that silently removed per-step prompt
        // modulation and contributed to block-by-block drift.
        let prompt_shift_v = ada_chunk(&self.prompt_scale_shift_table, video_prompt_emb, 0)?;
        let prompt_scale_v = ada_chunk(&self.prompt_scale_shift_table, video_prompt_emb, 1)?;
        let text_scaled_v = adaln_modulate_raw(context, &prompt_scale_v, &prompt_shift_v)?;
        let x_mod = adaln_modulate(&x_vid, &scale_ca, &shift_ca)?;
        // Video-text cross-attention has NO RoPE (upstream `attn2(x, enc)`
        // omits positional encoding on both text tokens and video queries).
        let ca_out = self.attn2.forward(&x_mod, &text_scaled_v, None, None)?;
        x_vid = ops::add(&x_vid, &ops::multiply(&gate_ca, &ca_out)?)?;

        // ── Audio pathway (only when audio latents are provided) ──

        let x_audio_out = if let (Some(aud), Some(aud_ctx), Some(aud_temb)) =
            (audio, audio_context, audio_timestep_emb)
        {
            // Audio AdaLN params in upstream order: [0-2] SA, [3-5] FF, [6-8] CA.
            let a_shift_sa = ada_chunk(&self.audio_scale_shift_table, aud_temb, 0)?;
            let a_scale_sa = ada_chunk(&self.audio_scale_shift_table, aud_temb, 1)?;
            let a_gate_sa = ada_chunk(&self.audio_scale_shift_table, aud_temb, 2)?;
            let a_shift_ff = ada_chunk(&self.audio_scale_shift_table, aud_temb, 3)?;
            let a_scale_ff = ada_chunk(&self.audio_scale_shift_table, aud_temb, 4)?;
            let a_gate_ff = ada_chunk(&self.audio_scale_shift_table, aud_temb, 5)?;
            let a_shift_ca = ada_chunk(&self.audio_scale_shift_table, aud_temb, 6)?;
            let a_scale_ca = ada_chunk(&self.audio_scale_shift_table, aud_temb, 7)?;
            let a_gate_ca = ada_chunk(&self.audio_scale_shift_table, aud_temb, 8)?;

            // 3. Audio self-attention.
            let a_mod = adaln_modulate(aud, &a_scale_sa, &a_shift_sa)?;
            let a_attn_out = self.audio_attn1.forward(&a_mod, audio_rope_freqs)?;
            let mut x_aud = ops::add(aud, &ops::multiply(&a_gate_sa, &a_attn_out)?)?;

            // 4. Audio text cross-attention — same pattern as video: add
            // per-timestep adaln (audio_prompt_emb) to audio_prompt table.
            let a_prompt_emb =
                audio_prompt_emb.expect("audio_prompt_emb required when audio is active");
            let a_prompt_shift = ada_chunk(&self.audio_prompt_scale_shift_table, a_prompt_emb, 0)?;
            let a_prompt_scale = ada_chunk(&self.audio_prompt_scale_shift_table, a_prompt_emb, 1)?;
            let text_scaled_a = adaln_modulate_raw(aud_ctx, &a_prompt_scale, &a_prompt_shift)?;
            let a_mod = adaln_modulate(&x_aud, &a_scale_ca, &a_shift_ca)?;
            // Audio-text CA also has NO RoPE (same as video-text).
            let a_ca_out = self
                .audio_attn2
                .forward(&a_mod, &text_scaled_a, None, None)?;
            x_aud = ops::add(&x_aud, &ops::multiply(&a_gate_ca, &a_ca_out)?)?;

            // 5-6. Audio-Video cross-modal attention.
            // Upstream: norm both modalities ONCE (pre-A2V), reuse for both
            // A2V and V2A. Apply modulation with separate scale/shift for
            // each direction. Table layout [5, dim]:
            //   [0]=scale_a2v, [1]=shift_a2v, [2]=scale_v2a, [3]=shift_v2a, [4]=gate
            // A prior revision used entirely wrong indices here.
            let video_norm3 = rms_norm_parameterless(&x_vid, 1e-6)?;
            let audio_norm3 = rms_norm_parameterless(&x_aud, 1e-6)?;

            // Upstream `_unpack_adaln(av_ca_video_params, scale_shift_table_a2v_ca_video, 4, vdim)`
            // reshapes the (B, 4*vdim) adaln output to (B, 4, vdim) and ADDS
            // the first 4 rows of the table. The 5th row (gate) gets its own
            // adaln input via `av_ca_a2v_gate_emb` (shape [B, vdim]).
            //
            // Table layout [5, dim]:
            //   [0]=scale_a2v, [1]=shift_a2v, [2]=scale_v2a, [3]=shift_v2a, [4]=gate
            // AV table has 5 rows; the scale/shift adaln has 4 params. Pass
            // num_params=4 so the t_emb reshape matches (table row 4 is the
            // separate gate added further down).
            let av_v_scale_a2v =
                ada_chunk_n(&self.scale_shift_table_a2v_ca_video, av_ca_video_emb, 0, 4)?;
            let av_v_shift_a2v =
                ada_chunk_n(&self.scale_shift_table_a2v_ca_video, av_ca_video_emb, 1, 4)?;
            let av_v_scale_v2a =
                ada_chunk_n(&self.scale_shift_table_a2v_ca_video, av_ca_video_emb, 2, 4)?;
            let av_v_shift_v2a =
                ada_chunk_n(&self.scale_shift_table_a2v_ca_video, av_ca_video_emb, 3, 4)?;
            // Gate: adaln output has ONE param, add to row 4 of the table.
            // Use ada_chunk against a single-param table view by building a
            // [1, vdim] slice (row 4) and adding the full gate adaln output.
            let av_v_gate_a2v = {
                let row4 = self.scale_shift_table_a2v_ca_video.index((4, ..));
                let row4 = ops::reshape(&row4, &[1, 1, row4.shape()[0]])?;
                // av_ca_a2v_gate_emb shape [B, vdim] → [B, 1, vdim]
                let b_emb = if av_ca_a2v_gate_emb.shape().len() == 2 {
                    let s = av_ca_a2v_gate_emb.shape();
                    ops::reshape(av_ca_a2v_gate_emb, &[s[0], 1, s[1]])?
                } else {
                    av_ca_a2v_gate_emb.clone()
                };
                ops::add(&b_emb, &row4)?
            };

            let a_gate_emb =
                av_ca_v2a_gate_emb.expect("av_ca_v2a_gate_emb required when audio is active");
            let av_a_scale_a2v = ada_chunk_n(
                &self.scale_shift_table_a2v_ca_audio,
                av_ca_audio_emb.expect("av_ca_audio_emb required when audio is active"),
                0,
                4,
            )?;
            let av_a_shift_a2v = ada_chunk_n(
                &self.scale_shift_table_a2v_ca_audio,
                av_ca_audio_emb.unwrap(),
                1,
                4,
            )?;
            let av_a_scale_v2a = ada_chunk_n(
                &self.scale_shift_table_a2v_ca_audio,
                av_ca_audio_emb.unwrap(),
                2,
                4,
            )?;
            let av_a_shift_v2a = ada_chunk_n(
                &self.scale_shift_table_a2v_ca_audio,
                av_ca_audio_emb.unwrap(),
                3,
                4,
            )?;
            let av_a_gate_v2a = {
                let row4 = self.scale_shift_table_a2v_ca_audio.index((4, ..));
                let row4 = ops::reshape(&row4, &[1, 1, row4.shape()[0]])?;
                let b_emb = if a_gate_emb.shape().len() == 2 {
                    let s = a_gate_emb.shape();
                    ops::reshape(a_gate_emb, &[s[0], 1, s[1]])?
                } else {
                    a_gate_emb.clone()
                };
                ops::add(&b_emb, &row4)?
            };

            // A2V: Q from video, KV from audio.
            let video_q_a2v = adaln_modulate_raw(&video_norm3, &av_v_scale_a2v, &av_v_shift_a2v)?;
            let audio_kv_a2v = adaln_modulate_raw(&audio_norm3, &av_a_scale_a2v, &av_a_shift_a2v)?;
            // A2V applies 1D-temporal RoPE: rope_q = video_cross (over video
            // tokens), rope_k = audio_cross (over audio tokens).
            let a2v_out = self.audio_to_video_attn.forward(
                &video_q_a2v,
                &audio_kv_a2v,
                video_cross_rope_freqs,
                audio_cross_rope_freqs,
            )?;
            x_vid = ops::add(&x_vid, &ops::multiply(&av_v_gate_a2v, &a2v_out)?)?;

            // V2A: Q from audio, KV from video (reuse pre-A2V norms).
            let audio_q_v2a = adaln_modulate_raw(&audio_norm3, &av_a_scale_v2a, &av_a_shift_v2a)?;
            let video_kv_v2a = adaln_modulate_raw(&video_norm3, &av_v_scale_v2a, &av_v_shift_v2a)?;
            // V2A: rope_q = audio_cross, rope_k = video_cross.
            let v2a_out = self.video_to_audio_attn.forward(
                &audio_q_v2a,
                &video_kv_v2a,
                audio_cross_rope_freqs,
                video_cross_rope_freqs,
            )?;
            x_aud = ops::add(&x_aud, &ops::multiply(&av_a_gate_v2a, &v2a_out)?)?;

            // 8. Audio FF (after AV cross-attn, matching upstream ordering).
            let a_mod = adaln_modulate(&x_aud, &a_scale_ff, &a_shift_ff)?;
            let a_ff_out = self.audio_ff.forward(&a_mod)?;
            x_aud = ops::add(&x_aud, &ops::multiply(&a_gate_ff, &a_ff_out)?)?;

            Some(x_aud)
        } else {
            None
        };

        // 7. Video FF — upstream applies FF LAST, after AV cross-attn.
        let x_mod = adaln_modulate(&x_vid, &scale_ff, &shift_ff)?;
        let ff_out = self.ff.forward(&x_mod)?;
        x_vid = ops::add(&x_vid, &ops::multiply(&gate_ff, &ff_out)?)?;

        Ok((x_vid, x_audio_out))
    }
}

// ─── Timestep Embedding ───────────────────────────────────────────────────

/// AdaLN-Single timestep embedding: MLP(SiLU) that produces per-block modulation.
///
/// `forward` feeds a sinusoidal embedding of the (×1000-scaled) timestep through
/// `emb_timestep_linear1 -> SiLU -> emb_timestep_linear2 -> SiLU -> linear` and
/// returns both the final projection and the raw `emb_timestep_linear2` output.
struct AdaLnSingle {
    /// First timestep-embedder layer, loaded from
    /// `{prefix}.emb.timestep_embedder.linear1`.
    emb_timestep_linear1: QLinear,
    /// Second timestep-embedder layer, loaded from
    /// `{prefix}.emb.timestep_embedder.linear2`.
    emb_timestep_linear2: QLinear,
    /// Output projection applied after the last SiLU, loaded from `{prefix}.linear`.
    linear: QLinear,
}

impl AdaLnSingle {
    /// Assembles the embedder from the checkpoint tensors rooted at `prefix`.
    ///
    /// Three layers are resolved through `build_qlinear`, each receiving
    /// `quant` unchanged:
    /// `{prefix}.emb.timestep_embedder.linear1`,
    /// `{prefix}.emb.timestep_embedder.linear2` and `{prefix}.linear`.
    ///
    /// # Errors
    /// Propagates the first `build_qlinear` failure, in the order listed above.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        quant: Option<&QuantConfig>,
    ) -> Result<Self, InferenceError> {
        let emb_timestep_linear1 = build_qlinear(
            tensors,
            &format!("{prefix}.emb.timestep_embedder.linear1"),
            quant,
        )?;
        let emb_timestep_linear2 = build_qlinear(
            tensors,
            &format!("{prefix}.emb.timestep_embedder.linear2"),
            quant,
        )?;
        let linear = build_qlinear(tensors, &format!("{prefix}.linear"), quant)?;

        Ok(Self {
            emb_timestep_linear1,
            emb_timestep_linear2,
            linear,
        })
    }

    /// Computes the modulation parameters for `timestep`.
    ///
    /// Returns `(params, embedded)` where `embedded` is the output of
    /// `emb_timestep_linear2` *before* the second SiLU and `params` is
    /// `linear(silu(embedded))`. Upstream `AdaLayerNormSingle` hands back both
    /// so the final output block can form
    ///   `scale_shift_values = scale_shift_table + embedded_timestep`.
    ///
    /// # Errors
    /// Propagates any exception raised by the underlying MLX operations.
    fn forward(&mut self, timestep: &Array) -> Result<(Array, Array), mlx_rs::error::Exception> {
        // Upstream model.py:132 multiplies the timestep by
        // `config.timestep_scale_multiplier` (= 1000) ahead of
        // `get_timestep_embedding`. The scheduler here emits sigmas in [0, 1],
        // so skipping the multiplier would put the sinusoid in the wrong
        // frequency band and hand every block out-of-distribution conditioning.
        let scaled_t = ops::multiply(timestep, &Array::from_f32(1000.0))?;
        let sinusoid = timestep_embedding_tensor(&scaled_t, 256)?;

        let hidden = self.emb_timestep_linear1.forward(&sinusoid)?;
        let embedded = self.emb_timestep_linear2.forward(&nn::silu(&hidden)?)?;
        let params = self.linear.forward(&nn::silu(&embedded)?)?;

        Ok((params, embedded))
    }
}

/// AdaLN-Single variant with wider linear output.
/// Used for audio-video cross-attention modulation where the linear projects
/// to a wider dimension (e.g., 4096->16384 or 2048->8192).
///
/// The layer stack and checkpoint key layout are the same as `AdaLnSingle`;
/// the difference visible in this file is that `forward` returns only the
/// final projection, not the intermediate embedder output.
struct AdaLnSingleWide {
    /// First timestep-embedder layer, loaded from
    /// `{prefix}.emb.timestep_embedder.linear1`.
    emb_timestep_linear1: QLinear,
    /// Second timestep-embedder layer, loaded from
    /// `{prefix}.emb.timestep_embedder.linear2`.
    emb_timestep_linear2: QLinear,
    /// Output projection (the "wide" one), loaded from `{prefix}.linear`.
    linear: QLinear,
}

impl AdaLnSingleWide {
    /// Resolves the three layers stored under `prefix` via `build_qlinear`.
    ///
    /// Keys used: `{prefix}.emb.timestep_embedder.linear1`,
    /// `{prefix}.emb.timestep_embedder.linear2`, `{prefix}.linear`.
    ///
    /// # Errors
    /// Returns the first error reported by `build_qlinear`.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        quant: Option<&QuantConfig>,
    ) -> Result<Self, InferenceError> {
        let first = build_qlinear(
            tensors,
            &format!("{prefix}.emb.timestep_embedder.linear1"),
            quant,
        )?;
        let second = build_qlinear(
            tensors,
            &format!("{prefix}.emb.timestep_embedder.linear2"),
            quant,
        )?;
        let projection = build_qlinear(tensors, &format!("{prefix}.linear"), quant)?;

        Ok(Self {
            emb_timestep_linear1: first,
            emb_timestep_linear2: second,
            linear: projection,
        })
    }

    /// Produces the wide modulation vector for `timestep`.
    ///
    /// Pipeline: scale by 1000 → sinusoidal embedding → `linear1` → SiLU →
    /// `linear2` → SiLU → `linear`. Only the final projection is returned.
    ///
    /// # Errors
    /// Propagates any exception raised by the underlying MLX operations.
    fn forward(&mut self, timestep: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Upstream multiplies by `timestep_scale_multiplier` (= 1000) before
        // taking the sinusoidal embedding; mirror that here.
        let scaled_t = ops::multiply(timestep, &Array::from_f32(1000.0))?;
        let sinusoid = timestep_embedding_tensor(&scaled_t, 256)?;

        // Timestep-embedder MLP.
        let inner = nn::silu(&self.emb_timestep_linear1.forward(&sinusoid)?)?;
        let embedded = self.emb_timestep_linear2.forward(&inner)?;

        // Final activation and the wide output projection.
        let activated = nn::silu(&embedded)?;
        self.linear.forward(&activated)
    }
}

/// Prompt-level AdaLN-Single (produces 2*hidden_dim modulation).
///
/// Same three-layer stack and checkpoint key layout as `AdaLnSingle`;
/// `forward` returns only the final projection.
struct PromptAdaLnSingle {
    /// First timestep-embedder layer, loaded from
    /// `{prefix}.emb.timestep_embedder.linear1`.
    emb_timestep_linear1: QLinear,
    /// Second timestep-embedder layer, loaded from
    /// `{prefix}.emb.timestep_embedder.linear2`.
    emb_timestep_linear2: QLinear,
    /// Output projection applied after the last SiLU, loaded from `{prefix}.linear`.
    linear: QLinear,
}

impl PromptAdaLnSingle {
    /// Loads the prompt-level AdaLN layers found under `prefix`.
    ///
    /// `build_qlinear` is called for
    /// `{prefix}.emb.timestep_embedder.linear1`,
    /// `{prefix}.emb.timestep_embedder.linear2` and `{prefix}.linear`, in that
    /// order, each with the caller's `quant` setting.
    ///
    /// # Errors
    /// Stops at, and returns, the first `build_qlinear` failure.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        quant: Option<&QuantConfig>,
    ) -> Result<Self, InferenceError> {
        let emb_timestep_linear1 = build_qlinear(
            tensors,
            &format!("{prefix}.emb.timestep_embedder.linear1"),
            quant,
        )?;
        let emb_timestep_linear2 = build_qlinear(
            tensors,
            &format!("{prefix}.emb.timestep_embedder.linear2"),
            quant,
        )?;
        let linear = build_qlinear(tensors, &format!("{prefix}.linear"), quant)?;

        Ok(Self {
            emb_timestep_linear1,
            emb_timestep_linear2,
            linear,
        })
    }

    /// Computes the prompt cross-attention modulation for `timestep`.
    ///
    /// Sinusoidal encoding → timestep MLP → final linear. Per the original
    /// author's note the output is `[batch, 2*hidden_dim]` (shift + scale for
    /// prompt CA).
    ///
    /// # Errors
    /// Propagates any exception raised by the underlying MLX operations.
    fn forward(&mut self, timestep: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Upstream applies the `* 1000` scale before the sinusoidal embedding.
        let thousand = Array::from_f32(1000.0);
        let sinusoid = timestep_embedding_tensor(&ops::multiply(timestep, &thousand)?, 256)?;

        let first = self.emb_timestep_linear1.forward(&sinusoid)?;
        let second = self.emb_timestep_linear2.forward(&nn::silu(&first)?)?;

        self.linear.forward(&nn::silu(&second)?)
    }
}

// ─── LTX Transformer ─────────────────────────────────────────────────────

/// The full LTX-2.3 transformer with video + audio pathways.
///
/// Every tensor is read from under the `transformer.` checkpoint prefix.
/// Fields typed `QLinear` (and the AdaLN helpers) are built through
/// `build_qlinear` with the configured quantization; fields typed
/// `nn::Linear` are built through `build_dense_linear`.
struct LtxTransformer {
    // ── Video top-level ──
    /// Patchify projection: Linear(128 -> 4096)
    patchify_proj: QLinear,
    /// Output projection: Linear(4096 -> 128)
    proj_out: QLinear,
    /// Video timestep modulation
    adaln_single: AdaLnSingle,
    /// Video prompt-level modulation
    prompt_adaln_single: PromptAdaLnSingle,
    /// Final video scale/shift table: [2, 4096]
    /// (row 0 is read as shift, row 1 as scale by the final output block).
    scale_shift_table: Array,

    // ── Audio top-level ──
    /// Audio patchify projection: Linear(128 -> 2048)
    audio_patchify_proj: nn::Linear,
    /// Audio output projection: Linear(2048 -> 128)
    audio_proj_out: nn::Linear,
    /// Audio timestep modulation
    audio_adaln_single: AdaLnSingle,
    /// Audio prompt-level modulation
    audio_prompt_adaln_single: PromptAdaLnSingle,
    /// Final audio scale/shift table: [2, 2048]
    /// (row 0 is read as shift, row 1 as scale by the final output block).
    audio_scale_shift_table: Array,

    // ── Audio-video cross-attention AdaLN singles ──
    /// Audio-to-video gate modulation (4096 dim)
    av_ca_a2v_gate_adaln_single: AdaLnSingle,
    /// Video-to-audio gate modulation (2048 dim)
    av_ca_v2a_gate_adaln_single: AdaLnSingle,
    /// Audio-to-video cross-attn video scale/shift modulation.
    /// Wider linear: 4096->16384 (4*4096).
    av_ca_video_scale_shift_adaln_single: AdaLnSingleWide,
    /// Audio-to-video cross-attn audio scale/shift modulation.
    /// Wider linear: 2048->8192 (4*2048).
    av_ca_audio_scale_shift_adaln_single: AdaLnSingleWide,

    // ── Transformer blocks ──
    /// Transformer blocks (video + audio pathways); `load` creates
    /// `config.num_layers` of them (48 in the default config).
    blocks: Vec<LtxTransformerBlock>,
}

impl LtxTransformer {
    /// Builds the transformer from checkpoint tensors.
    ///
    /// All keys are looked up under the `transformer.` prefix. Video-side
    /// projections and every AdaLN helper honour `config.quant`; the audio
    /// patchify / output projections are loaded as dense linears.
    /// `config.num_layers` blocks are loaded from
    /// `transformer.transformer_blocks.{i}`.
    ///
    /// # Errors
    /// Returns the first error raised while resolving any tensor or block.
    fn load(tensors: &HashMap<String, Array>, config: &LtxConfig) -> Result<Self, InferenceError> {
        let quant = config.quant.as_ref();
        // All transformer tensors live under the `transformer.*` top-level prefix
        // in the real checkpoint (e.g. `transformer.patchify_proj.weight`).
        let pfx = "transformer";

        // Video top-level
        let patchify_proj = build_qlinear(tensors, &format!("{pfx}.patchify_proj"), quant)?;
        let proj_out = build_qlinear(tensors, &format!("{pfx}.proj_out"), quant)?;
        let adaln_single = AdaLnSingle::load(tensors, &format!("{pfx}.adaln_single"), quant)?;
        let prompt_adaln_single =
            PromptAdaLnSingle::load(tensors, &format!("{pfx}.prompt_adaln_single"), quant)?;
        let scale_shift_table = get_tensor(tensors, &format!("{pfx}.scale_shift_table"))?;

        // Audio top-level (dense/unquantized)
        let audio_patchify_proj =
            build_dense_linear(tensors, &format!("{pfx}.audio_patchify_proj"))?;
        let audio_proj_out = build_dense_linear(tensors, &format!("{pfx}.audio_proj_out"))?;
        let audio_adaln_single =
            AdaLnSingle::load(tensors, &format!("{pfx}.audio_adaln_single"), quant)?;
        let audio_prompt_adaln_single =
            PromptAdaLnSingle::load(tensors, &format!("{pfx}.audio_prompt_adaln_single"), quant)?;
        let audio_scale_shift_table =
            get_tensor(tensors, &format!("{pfx}.audio_scale_shift_table"))?;

        // Audio-video cross-attention AdaLN singles
        let av_ca_a2v_gate_adaln_single = AdaLnSingle::load(
            tensors,
            &format!("{pfx}.av_ca_a2v_gate_adaln_single"),
            quant,
        )?;
        let av_ca_v2a_gate_adaln_single = AdaLnSingle::load(
            tensors,
            &format!("{pfx}.av_ca_v2a_gate_adaln_single"),
            quant,
        )?;
        let av_ca_video_scale_shift_adaln_single = AdaLnSingleWide::load(
            tensors,
            &format!("{pfx}.av_ca_video_scale_shift_adaln_single"),
            quant,
        )?;
        let av_ca_audio_scale_shift_adaln_single = AdaLnSingleWide::load(
            tensors,
            &format!("{pfx}.av_ca_audio_scale_shift_adaln_single"),
            quant,
        )?;

        // Transformer blocks live at `transformer.transformer_blocks.{i}`.
        // Collecting into a `Result` stops at the first block that fails to load.
        let blocks = (0..config.num_layers)
            .map(|i| {
                LtxTransformerBlock::load(
                    tensors,
                    &format!("{pfx}.transformer_blocks.{i}"),
                    config,
                )
            })
            .collect::<Result<Vec<_>, _>>()?;

        Ok(Self {
            patchify_proj,
            proj_out,
            adaln_single,
            prompt_adaln_single,
            scale_shift_table,
            audio_patchify_proj,
            audio_proj_out,
            audio_adaln_single,
            audio_prompt_adaln_single,
            audio_scale_shift_table,
            av_ca_a2v_gate_adaln_single,
            av_ca_v2a_gate_adaln_single,
            av_ca_video_scale_shift_adaln_single,
            av_ca_audio_scale_shift_adaln_single,
            blocks,
        })
    }

    /// Run a single denoising step.
    ///
    /// - `latents`: patchified video latents [batch, num_patches, in_channels]
    /// - `text_embed`: text conditioning [batch, seq_len, hidden_dim]
    /// - `timestep`: scalar timestep, or per-video-token timestep for anchored i2v
    /// - `global_timestep`: scalar timestep for prompt/audio/AV modulation
    /// - `audio_latents`: optional audio latents [batch, audio_patches, audio_in_channels]
    /// - `audio_text_embed`: optional audio text conditioning [batch, seq_len, audio_hidden_dim]
    /// - `rope`: precomputed rotary frequencies for the video, audio and
    ///   cross-modal attention paths
    ///
    /// Returns `(video_out, audio_out)`: the projected video prediction and,
    /// when the audio pathway stayed active through every block, the projected
    /// audio prediction.
    ///
    /// # Errors
    /// Propagates any MLX exception from the underlying operations. Also
    /// returns a custom exception when an embedded timestep has a rank other
    /// than 2 or 3, or when audio hidden state exists without a matching
    /// embedded audio timestep.
    fn forward(
        &mut self,
        latents: &Array,
        text_embed: &Array,
        timestep: &Array,
        global_timestep: &Array,
        audio_latents: Option<&Array>,
        audio_text_embed: Option<&Array>,
        rope: &RopeBundle,
    ) -> Result<(Array, Option<Array>), mlx_rs::error::Exception> {
        let video_rope_freqs = rope.video_pair();
        let audio_rope_freqs = rope.audio_pair();
        let video_cross_rope_freqs = rope.video_cross_pair();
        let audio_cross_rope_freqs = rope.audio_cross_pair();

        // Cast inputs to bf16 to match weight dtype — upstream `LTXModel.__call__`
        // (model.py:212-217) casts video_latent, audio_latent, text_embeds, and
        // timestep to bf16 BEFORE the patchify projection. Without this, f32
        // accumulation through 48 blocks drifts the patchify output by ~O(10)
        // at the element level, propagating catastrophic divergence downstream.
        dump_ltx_stage_first_call("latents_input", latents);
        let latents = latents.as_dtype(mlx_rs::Dtype::Bfloat16)?;
        dump_ltx_stage_first_call("latents_bf16", &latents);
        let text_embed = text_embed.as_dtype(mlx_rs::Dtype::Bfloat16)?;
        let timestep = timestep.as_dtype(mlx_rs::Dtype::Bfloat16)?;
        let global_timestep = global_timestep.as_dtype(mlx_rs::Dtype::Bfloat16)?;

        // 1. Patchify: project latents into transformer hidden dim
        let mut hidden = self.patchify_proj.forward(&latents)?;
        // Parity dump: patchify output (first forward only).
        dump_ltx_stage_first_call("patchify_out", &hidden);

        // 2. Compute timestep conditioning via AdaLN-Single. Returns
        //    `(params, embedded_timestep)`; the latter feeds the final
        //    output block per upstream `_output_block`.
        let (timestep_emb, video_embedded_ts) = self.adaln_single.forward(&timestep)?;
        dump_ltx_stage_first_call("timestep_adaln_params", &timestep_emb);
        dump_ltx_stage_first_call("timestep_embedded", &video_embedded_ts);

        // Audio patchify and timestep conditioning (when audio is provided)
        let audio_latents_bf = audio_latents
            .map(|a| a.as_dtype(mlx_rs::Dtype::Bfloat16))
            .transpose()?;
        let audio_text_embed_bf = audio_text_embed
            .map(|a| a.as_dtype(mlx_rs::Dtype::Bfloat16))
            .transpose()?;
        let mut audio_hidden = if let Some(aud_lat) = audio_latents_bf.as_ref() {
            Some(self.audio_patchify_proj.forward(aud_lat)?)
        } else {
            None
        };
        let (audio_timestep_emb, audio_embedded_ts) = if audio_latents.is_some() {
            let (p, e) = self.audio_adaln_single.forward(&global_timestep)?;
            (Some(p), Some(e))
        } else {
            (None, None)
        };

        // Additional top-level AdaLN outputs that every block consumes.
        // These were loaded but never invoked in prior revisions — the block
        // forward silently used only the static scale_shift tables, dropping
        // the per-timestep prompt and AV cross-attn modulations entirely.
        //
        // Upstream (model.py:244-263):
        //   video_prompt_emb, _  = self.prompt_adaln_single(t_emb)
        //   av_ca_video_emb, _   = self.av_ca_video_scale_shift_adaln_single(t_emb)
        //   av_ca_a2v_gate_emb,_ = self.av_ca_a2v_gate_adaln_single(t_emb_av_gate)
        //   (same for audio variants)
        //
        // For LTX-2.3-q4: av_ca_timestep_scale_multiplier == timestep_scale_multiplier,
        // so `t_emb_av_gate == t_emb`. We collapse them to one timestep here.
        let video_prompt_emb = self.prompt_adaln_single.forward(&global_timestep)?;
        let av_ca_video_emb = self
            .av_ca_video_scale_shift_adaln_single
            .forward(&global_timestep)?;
        // AV gate adaln produces [B, dim] — single param (just a gate).
        // `av_ca_a2v_gate_adaln_single` is an AdaLnSingle that returns
        // `(params, embedded)`; we want params only.
        let (av_ca_a2v_gate_emb, _) = self.av_ca_a2v_gate_adaln_single.forward(&global_timestep)?;
        let (audio_prompt_emb, av_ca_audio_emb, av_ca_v2a_gate_emb): (
            Option<Array>,
            Option<Array>,
            Option<Array>,
        ) = if audio_latents.is_some() {
            let ap = self.audio_prompt_adaln_single.forward(&global_timestep)?;
            let av_aa = self
                .av_ca_audio_scale_shift_adaln_single
                .forward(&global_timestep)?;
            let (av_v2a, _) = self.av_ca_v2a_gate_adaln_single.forward(&global_timestep)?;
            (Some(ap), Some(av_aa), Some(av_v2a))
        } else {
            (None, None, None)
        };

        // 3. Run through all transformer blocks
        let total_blocks = self.blocks.len();
        let mid_idx = total_blocks / 2; // 24 for num_layers=48
        // `saturating_sub` keeps an empty block list from underflowing `usize`;
        // the loop below simply does not run in that case.
        let last_idx = total_blocks.saturating_sub(1); // 47 for num_layers=48
        for (i, block) in self.blocks.iter_mut().enumerate() {
            let (vid_out, aud_out) = block.forward(
                &hidden,
                &text_embed,
                &timestep_emb,
                audio_hidden.as_ref(),
                audio_text_embed_bf.as_ref(),
                audio_timestep_emb.as_ref(),
                video_rope_freqs,
                audio_rope_freqs,
                video_cross_rope_freqs,
                audio_cross_rope_freqs,
                &video_prompt_emb,
                audio_prompt_emb.as_ref(),
                &av_ca_video_emb,
                av_ca_audio_emb.as_ref(),
                &av_ca_a2v_gate_emb,
                av_ca_v2a_gate_emb.as_ref(),
            )?;
            hidden = vid_out;
            audio_hidden = aud_out;

            // Parity dumps (first step only): a fixed sample of early/middle
            // blocks under `blockNN_out`, plus the midpoint and final block
            // under their own names. Block 0 is already covered by the fixed
            // sample, so it is not dumped a second time.
            if matches!(i, 0 | 2 | 5 | 10 | 15 | 20 | 30 | 40) {
                dump_ltx_stage_first_call(&format!("block{i:02}_out"), &hidden);
            }
            if i != 0 {
                if i == mid_idx {
                    dump_ltx_stage_first_call(&format!("block_mid{mid_idx:02}_out"), &hidden);
                } else if i == last_idx {
                    dump_ltx_stage_first_call(&format!("block_last{last_idx:02}_out"), &hidden);
                }
            }
        }

        // 4. Final output block — upstream `_output_block`:
        //    scale_shift = table[None, None, :, :] + embedded_ts[:, :, None, :]
        //    shift = scale_shift[:, :, 0, :]; scale = scale_shift[:, :, 1, :]
        //    x = LayerNorm(x) * (1 + scale) + shift
        //    return proj_out(x)
        //
        // Broadcast shapes:
        //   scalar timestep → embedded_ts shape (B, dim),  broadcast to (B, 1, dim).
        //   per-token (i2v) → embedded_ts shape (B, N, dim), pairs 1:1 with tokens.
        let compute_final_mod =
            |table: &Array, embedded_ts: &Array| -> Result<Array, mlx_rs::error::Exception> {
                let t_shape = embedded_ts.shape();
                let dim = table.shape()[1];
                let (b, n_or_1) = match t_shape.len() {
                    2 => (t_shape[0], 1),
                    3 => (t_shape[0], t_shape[1]),
                    other => {
                        return Err(mlx_rs::error::Exception::custom(format!(
                            "embedded_ts unexpected rank {other}"
                        )))
                    }
                };
                let embedded_exp = ops::reshape(embedded_ts, &[b, n_or_1, 1, dim])?;
                let table_exp = ops::reshape(table, &[1, 1, 2, dim])?;
                ops::add(&table_exp, &embedded_exp) // (B, N_or_1, 2, D)
            };

        // Video output
        let v_scale_shift = compute_final_mod(&self.scale_shift_table, &video_embedded_ts)?;
        let v_shift = v_scale_shift.index((.., .., 0, ..));
        let v_scale = v_scale_shift.index((.., .., 1, ..));
        let v_ln = ltx_layer_norm_parameterless(&hidden, 1e-6)?;
        let one = Array::from_f32(1.0);
        let v_mod = ops::add(&ops::multiply(&v_ln, &ops::add(&one, &v_scale)?)?, &v_shift)?;
        let video_out = self.proj_out.forward(&v_mod)?;
        // Parity dump: final video output (pre-unpatchify, first step only).
        dump_ltx_stage_first_call("final_output_video", &video_out);

        // 5. Audio final modulation and output projection (when audio is active)
        let audio_out = if let Some(aud_h) = audio_hidden {
            let aud_emb = audio_embedded_ts.as_ref().ok_or_else(|| {
                mlx_rs::error::Exception::custom(
                    "audio_embedded_ts must be set when audio_hidden is".to_string(),
                )
            })?;
            let a_scale_shift = compute_final_mod(&self.audio_scale_shift_table, aud_emb)?;
            let a_shift = a_scale_shift.index((.., .., 0, ..));
            let a_scale = a_scale_shift.index((.., .., 1, ..));
            let a_ln = ltx_layer_norm_parameterless(&aud_h, 1e-6)?;
            let a_mod = ops::add(&ops::multiply(&a_ln, &ops::add(&one, &a_scale)?)?, &a_shift)?;
            Some(self.audio_proj_out.forward(&a_mod)?)
        } else {
            None
        };

        Ok((video_out, audio_out))
    }
}

// ─── Text Embedding Connector ─────────────────────────────────────────────

/// Connector transformer-1D block — self-attention with gated per-head output
/// and a plain MLP feed-forward. Mirrors Lightricks' `_BasicTransformerBlock1D`
/// (see `packages/ltx-core/src/ltx_core/text_encoders/gemma/embeddings_connector.py`).
///
/// Both norms are parameterless RMS-norm (`rms_norm(x) = x * rsqrt(mean(x²) + eps)`).
/// RoPE is deliberately skipped here — upstream allows `pe=None` and the swift-diffusion
/// port documents it as safe to skip for the connector.
///
/// All linears are dense (`build_dense_linear`); the q/k norm weights and the
/// gate projection are optional and only populated when the checkpoint
/// contains the corresponding tensors.
struct ConnectorBlock {
    /// Number of attention heads (32 for 4096-dim video, 32 for 2048-dim audio).
    num_heads: usize,
    /// Per-head dim (128 for video, 64 for audio).
    head_dim: usize,
    /// Query projection, loaded from `{prefix}.attn1.to_q`.
    attn_to_q: nn::Linear,
    /// Key projection, loaded from `{prefix}.attn1.to_k`.
    attn_to_k: nn::Linear,
    /// Value projection, loaded from `{prefix}.attn1.to_v`.
    attn_to_v: nn::Linear,
    /// Attention output projection, loaded from `{prefix}.attn1.to_out.0`.
    attn_to_out: nn::Linear,
    /// Optional query-norm weight (`{prefix}.attn1.q_norm.weight`); `None` when absent.
    attn_q_norm: Option<Array>,
    /// Optional key-norm weight (`{prefix}.attn1.k_norm.weight`); `None` when absent.
    attn_k_norm: Option<Array>,
    /// Per-head gate projection `query_dim -> num_heads`. Gate = 2·sigmoid(logits).
    /// `None` when `{prefix}.attn1.to_gate_logits.weight` is not in the checkpoint.
    attn_to_gate_logits: Option<nn::Linear>,
    /// Feed-forward input projection, loaded from `{prefix}.ff.net.0.proj`.
    ff_proj_in: nn::Linear,
    /// Feed-forward output projection, loaded from `{prefix}.ff.net.2`.
    ff_proj_out: nn::Linear,
}

impl ConnectorBlock {
    /// Assemble one connector block from the checkpoint tensors under `prefix`.
    ///
    /// The four attention projections and the two feed-forward projections are
    /// required. The Q/K norm weights and the per-head gate projection are
    /// optional and become `None` when their tensors are missing.
    ///
    /// # Errors
    /// Returns whatever `build_dense_linear` reports for a required projection
    /// (or for the gate projection when its weight key is present).
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        num_heads: usize,
        head_dim: usize,
    ) -> Result<Self, InferenceError> {
        // Optional gate: only attempt the build when the weight tensor exists.
        let gate_key = format!("{prefix}.attn1.to_gate_logits");
        let attn_to_gate_logits = match tensors.get(&format!("{gate_key}.weight")) {
            Some(_) => Some(build_dense_linear(tensors, &gate_key)?),
            None => None,
        };

        // Required attention projections.
        let attn_to_q = build_dense_linear(tensors, &format!("{prefix}.attn1.to_q"))?;
        let attn_to_k = build_dense_linear(tensors, &format!("{prefix}.attn1.to_k"))?;
        let attn_to_v = build_dense_linear(tensors, &format!("{prefix}.attn1.to_v"))?;
        let attn_to_out = build_dense_linear(tensors, &format!("{prefix}.attn1.to_out.0"))?;

        // Optional Q/K norm weights.
        let attn_q_norm = tensors
            .get(&format!("{prefix}.attn1.q_norm.weight"))
            .cloned();
        let attn_k_norm = tensors
            .get(&format!("{prefix}.attn1.k_norm.weight"))
            .cloned();

        // Required feed-forward projections.
        let ff_proj_in = build_dense_linear(tensors, &format!("{prefix}.ff.net.0.proj"))?;
        let ff_proj_out = build_dense_linear(tensors, &format!("{prefix}.ff.net.2"))?;

        Ok(Self {
            num_heads,
            head_dim,
            attn_to_q,
            attn_to_k,
            attn_to_v,
            attn_to_out,
            attn_q_norm,
            attn_k_norm,
            attn_to_gate_logits,
            ff_proj_in,
            ff_proj_out,
        })
    }

    /// Run one connector block over `x` and return a tensor with the same layout.
    ///
    /// `x` is indexed as `[batch, seq_len, width]`. Stages, in order:
    /// parameterless RMS-norm → Q/K/V projections (plus optional weighted Q/K
    /// RMS-norm) → per-head SPLIT RoPE on Q and K → scaled softmax attention →
    /// optional per-head `2·sigmoid` gate → output projection → residual →
    /// parameterless RMS-norm → tanh-GELU MLP → residual.
    ///
    /// # Errors
    /// Propagates any MLX exception raised by the underlying ops (for example a
    /// projected width that is not `num_heads * head_dim`).
    ///
    /// # Panics
    /// Panics if `x` has fewer than two axes.
    fn forward(&mut self, x: &Array) -> Result<Array, mlx_rs::error::Exception> {
        let shape = x.shape();
        let batch = shape[0];
        let seq_len = shape[1];
        let nh = self.num_heads as i32;
        let hd = self.head_dim as i32;

        // ── Pre-attention parameterless RMS-norm ──
        let normed = rms_norm_parameterless(x, 1e-6)?;

        // ── Self-attention with optional per-head output gating ──
        let mut q = self.attn_to_q.forward(&normed)?;
        let mut k = self.attn_to_k.forward(&normed)?;
        let v = self.attn_to_v.forward(&normed)?;

        // Q/K RMS-norm on the full inner_dim axis (upstream applies before head split).
        if let Some(ref w) = self.attn_q_norm {
            q = apply_weighted_rms(&q, w, 1e-6)?;
        }
        if let Some(ref w) = self.attn_k_norm {
            k = apply_weighted_rms(&k, w, 1e-6)?;
        }

        // [B, T, H*D] → [B, H, T, D]
        let reshape_head = |t: &Array| -> Result<Array, mlx_rs::error::Exception> {
            let r = ops::reshape(t, &[batch, seq_len, nh, hd])?;
            ops::transpose_axes(&r, &[0, 2, 1, 3])
        };
        let mut qh = reshape_head(&q)?;
        let mut kh = reshape_head(&k)?;
        let vh = reshape_head(&v)?;

        // SPLIT-type RoPE applied per-head. Upstream's Embeddings1DConnector
        // uses `precompute_rope_freqs(rope_type="split", theta=10000)` over
        // 1D positions with `max_pos=4096`.
        let inner_dim = self.num_heads * self.head_dim;
        let (cos_f, sin_f) =
            precompute_split_rope(seq_len, self.num_heads, inner_dim, 10_000.0, 4096.0)?;
        qh = apply_split_rope(&qh, &cos_f, &sin_f)?;
        kh = apply_split_rope(&kh, &cos_f, &sin_f)?;

        // Scaled dot-product attention: softmax(Q·Kᵀ / sqrt(head_dim)) · V.
        let scale = Array::from_f32(1.0 / (self.head_dim as f32).sqrt());
        let scores = ops::multiply(
            &ops::matmul(&qh, &ops::transpose_axes(&kh, &[0, 1, 3, 2])?)?,
            &scale,
        )?;
        let attn = ops::softmax_axis(&scores, -1, None)?;
        let attn_out = ops::matmul(&attn, &vh)?;
        // [B, H, T, D] → [B, T, H, D]
        let attn_out = ops::transpose_axes(&attn_out, &[0, 2, 1, 3])?;

        // Per-head gate: 2·sigmoid(to_gate_logits(normed)) applied to (B, T, H, D).
        let gated = if let Some(ref mut gate) = self.attn_to_gate_logits {
            let gate_logits = gate.forward(&normed)?; // [B, T, H]
            let two = Array::from_f32(2.0);
            let gates = ops::multiply(&ops::sigmoid(&gate_logits)?, &two)?;
            let gates = ops::expand_dims(&gates, -1)?; // [B, T, H, 1]
            ops::multiply(&attn_out, &gates)?
        } else {
            attn_out
        };

        // Merge heads → [B, T, H*D]. The merged width is the attention inner
        // dim by construction; deriving it from the input width would only be
        // correct when the two happen to coincide.
        let merged = ops::reshape(&gated, &[batch, seq_len, nh * hd])?;
        let attn_out = self.attn_to_out.forward(&merged)?;

        // Residual after attention.
        let x_post_attn = ops::add(x, &attn_out)?;

        // ── Pre-FF parameterless RMS-norm + plain MLP with tanh-GELU ──
        let normed = rms_norm_parameterless(&x_post_attn, 1e-6)?;
        let h = self.ff_proj_in.forward(&normed)?;
        // Upstream (ltx-2-mlx) uses `gelu_approx` (tanh-based) not the precise
        // erf-based GELU. The difference compounds across the stacked
        // connector blocks.
        let activated = nn::gelu_approximate(&h)?;
        let ff_out = self.ff_proj_out.forward(&activated)?;

        // Residual connection
        ops::add(&x_post_attn, &ff_out)
    }
}

/// Build the SPLIT-type RoPE cos/sin tables used by the connector attention.
///
/// Ports `generate_freq_grid` + `compute_freqs` + `precompute_rope_freqs`
/// (`rope_type="split"`) from `ltx-2-mlx/model/transformer/rope.py`.
///
/// * `seq_len` — number of 1D positions (`0..seq_len`).
/// * `num_heads`, `inner_dim` — attention layout; `inner_dim / 2` frequencies
///   are generated and then divided evenly among the heads.
/// * `theta` — base of the log-spaced frequency ladder
///   (`theta^linspace(0, 1, n) * π/2`).
/// * `max_pos` — divisor that maps a position to a fraction, which is then
///   rescaled with `* 2 - 1`.
///
/// Returns `(cos_freqs, sin_freqs)`, each shaped
/// `[1, num_heads, seq_len, inner_dim / (2 * num_heads)]`.
fn precompute_split_rope(
    seq_len: i32,
    num_heads: usize,
    inner_dim: usize,
    theta: f32,
    max_pos: f32,
) -> Result<(Array, Array), mlx_rs::error::Exception> {
    // One frequency per rotated pair.
    let num_freqs = inner_dim / 2;

    // Frequency ladder: theta ** linspace(0, 1, num_freqs) * (pi / 2).
    let ladder_span = num_freqs.saturating_sub(1).max(1) as f32;
    let mut ladder = Vec::with_capacity(num_freqs);
    for step in 0..num_freqs {
        let t = step as f32 / ladder_span;
        ladder.push(theta.powf(t) * std::f32::consts::FRAC_PI_2);
    }
    let ladder = Array::from_slice(&ladder, &[num_freqs as i32]);

    // Position grid: p / max_pos, then rescaled with `* 2 - 1`.
    let mut grid = Vec::with_capacity(seq_len.max(0) as usize);
    for p in 0..seq_len {
        grid.push((p as f32 / max_pos) * 2.0 - 1.0);
    }
    let grid = Array::from_slice(&grid, &[1, seq_len, 1]);

    // Broadcast product → angles of shape [1, seq_len, num_freqs].
    let angles = ops::multiply(&grid, &ladder)?;
    let cos_all = ops::cos(&angles)?;
    let sin_all = ops::sin(&angles)?;

    // [1, seq_len, num_freqs] → [1, seq_len, H, half] → [1, H, seq_len, half].
    let half_head = (inner_dim / (2 * num_heads)) as i32;
    let split_shape = [1, seq_len, num_heads as i32, half_head];
    let cos_split = ops::reshape(&cos_all, &split_shape)?;
    let sin_split = ops::reshape(&sin_all, &split_shape)?;
    let cos_heads = ops::transpose_axes(&cos_split, &[0, 2, 1, 3])?;
    let sin_heads = ops::transpose_axes(&sin_split, &[0, 2, 1, 3])?;
    Ok((cos_heads, sin_heads))
}

/// Build SPLIT-type RoPE cos/sin tables for the main DiT transformer.
///
/// Ports `precompute_rope_freqs(rope_type="split")` from
/// `ltx-2-mlx/model/transformer/rope.py`. Upstream LTXModel config default
/// is `rope_type="split"` (see model.py:57), and attention.py:117 picks
/// `apply_rope_split` accordingly. Used for video/audio self-attention
/// (3D / 1D positions) and AV cross-attention (1D temporal positions).
///
/// * `positions` — axis-major `[axis][token]`, one `Vec` per position dim
///   (three for video self-attention, one for the 1D cases).
/// * `max_pos` — per-axis divisor; its length fixes the number of axes.
/// * `theta` — base of the log-spaced frequency ladder.
///
/// Each token row holds `num_heads * head_dim / 2` angles. Any slots not
/// covered by `num_freqs * num_axes` are left at angle zero at the front of
/// the row (so cos = 1, sin = 0 there).
///
/// Returns `(cos, sin)`, each shaped `[1, num_heads, num_tokens, head_dim/2]`.
///
/// # Panics
/// Panics when `positions.len() != max_pos.len()`.
fn precompute_split_rope_multiaxis(
    positions: &[Vec<f32>],
    num_heads: usize,
    head_dim: usize,
    theta: f32,
    max_pos: &[f32],
) -> Result<(Array, Array), mlx_rs::error::Exception> {
    let num_pos_dims = max_pos.len();
    assert!(
        positions.len() == num_pos_dims,
        "positions axis count mismatch"
    );
    let inner_dim = num_heads * head_dim;
    let num_freqs = inner_dim / (2 * num_pos_dims);
    let row_len = inner_dim / 2; // angles per token after padding
    let pad = row_len - num_freqs * num_pos_dims; // leading zero-angle slots

    // Frequency ladder: theta ** linspace(0, 1, num_freqs) * (pi / 2).
    let ladder_span = num_freqs.saturating_sub(1).max(1) as f32;
    let mut ladder = Vec::with_capacity(num_freqs);
    for step in 0..num_freqs {
        let t = step as f32 / ladder_span;
        ladder.push(theta.powf(t) * std::f32::consts::FRAC_PI_2);
    }

    // Upstream compute_freqs: (B, N, num_pos_dims, num_freqs) then
    //   transpose(0,1,3,2).reshape(-1, num_freqs * num_pos_dims)
    // so within a token row the frequency index is slow and the axis is fast.
    let num_tokens = positions[0].len();
    let mut angles = vec![0.0f32; num_tokens * row_len];
    for tok in 0..num_tokens {
        let row_start = tok * row_len + pad;
        for (f, &freq) in ladder.iter().enumerate() {
            for (d, (axis, &limit)) in positions.iter().zip(max_pos).enumerate() {
                let frac = (axis[tok] / limit) * 2.0 - 1.0;
                angles[row_start + f * num_pos_dims + d] = frac * freq;
            }
        }
    }
    let angles = Array::from_slice(&angles, &[1, num_tokens as i32, row_len as i32]);

    let cos_all = ops::cos(&angles)?;
    let sin_all = ops::sin(&angles)?;

    // [1, N, H * half] → [1, N, H, half] → [1, H, N, half].
    let split_shape = [
        1,
        num_tokens as i32,
        num_heads as i32,
        (head_dim / 2) as i32,
    ];
    let cos_split = ops::reshape(&cos_all, &split_shape)?;
    let sin_split = ops::reshape(&sin_all, &split_shape)?;
    let cos_heads = ops::transpose_axes(&cos_split, &[0, 2, 1, 3])?;
    let sin_heads = ops::transpose_axes(&sin_split, &[0, 2, 1, 3])?;
    Ok((cos_heads, sin_heads))
}

/// Build INTERLEAVED-type RoPE cos/sin tables.
///
/// Ports `precompute_rope_freqs(rope_type="interleaved")` from
/// `ltx-2-mlx/model/transformer/rope.py`. Kept for any caller that opts
/// into interleaved (none in the current DiT path — upstream default is
/// split — but the primitive may still be useful for tools).
///
/// * `positions` — axis-major `[axis][token]`, one `Vec` per position dim
///   (three for video self-attention, one for AV cross-attention).
/// * `max_pos` — per-axis divisor; its length fixes the number of axes.
/// * `theta` — base of the log-spaced frequency ladder.
///
/// Every angle is duplicated so both members of a rotated pair share it, and
/// the row is front-padded with cos = 1 / sin = 0 up to `num_heads * head_dim`.
///
/// Returns `(cos, sin)`, each shaped `[1, num_heads, num_tokens, head_dim]`.
///
/// # Panics
/// Panics when `positions.len() != max_pos.len()`.
#[allow(dead_code)]
fn precompute_interleaved_rope(
    positions: &[Vec<f32>],
    num_heads: usize,
    head_dim: usize,
    theta: f32,
    max_pos: &[f32],
) -> Result<(Array, Array), mlx_rs::error::Exception> {
    let num_pos_dims = max_pos.len();
    assert!(
        positions.len() == num_pos_dims,
        "positions axis count mismatch"
    );
    let inner_dim = num_heads * head_dim;
    let num_freqs = inner_dim / (2 * num_pos_dims);

    // Frequency ladder (same generator as the SPLIT variant).
    let ladder_span = num_freqs.saturating_sub(1).max(1) as f32;
    let mut ladder = Vec::with_capacity(num_freqs);
    for step in 0..num_freqs {
        let t = step as f32 / ladder_span;
        ladder.push(theta.powf(t) * std::f32::consts::FRAC_PI_2);
    }

    // Per token and axis: frac = (position / max_pos) * 2 - 1, multiplied by
    // every ladder entry. Upstream layout is
    // `scaled.transpose(0, 1, 3, 2).reshape(..., num_freqs * num_pos_dims)`,
    // i.e. frequency index slow, axis fast within each token row.
    let num_tokens = positions[0].len();
    let row_len = num_freqs * num_pos_dims;
    let mut angles = vec![0.0f32; num_tokens * row_len];
    for tok in 0..num_tokens {
        let row_start = tok * row_len;
        for (d, (axis, &limit)) in positions.iter().zip(max_pos).enumerate() {
            let frac = (axis[tok] / limit) * 2.0 - 1.0;
            for (f, &freq) in ladder.iter().enumerate() {
                angles[row_start + f * num_pos_dims + d] = frac * freq;
            }
        }
    }
    let angles = Array::from_slice(&angles, &[1, num_tokens as i32, row_len as i32]);

    // Upstream uses mx.repeat(x, 2, axis=-1): each element appears twice in a
    // row, giving a last axis of 2 * num_freqs * num_pos_dims.
    let cos_f = ops::repeat_axis::<f32>(ops::cos(&angles)?, 2, -1)?;
    let sin_f = ops::repeat_axis::<f32>(ops::sin(&angles)?, 2, -1)?;

    // Front-pad up to inner_dim: ones for cos, zeros for sin.
    let covered = 2 * num_freqs * num_pos_dims;
    let pad = (inner_dim - covered) as i32;
    let (cos_f, sin_f) = if pad > 0 {
        let ones = Array::ones::<f32>(&[1, num_tokens as i32, pad])?;
        let padded_cos = ops::concatenate_axis(&[&ones, &cos_f], -1)?;
        let zeros = Array::zeros::<f32>(&[1, num_tokens as i32, pad])?;
        let padded_sin = ops::concatenate_axis(&[&zeros, &sin_f], -1)?;
        (padded_cos, padded_sin)
    } else {
        (cos_f, sin_f)
    };

    // [1, N, H * D] → [1, N, H, D] → [1, H, N, D].
    let split_shape = [1, num_tokens as i32, num_heads as i32, head_dim as i32];
    let cos_split = ops::reshape(&cos_f, &split_shape)?;
    let sin_split = ops::reshape(&sin_f, &split_shape)?;
    let cos_heads = ops::transpose_axes(&cos_split, &[0, 2, 1, 3])?;
    let sin_heads = ops::transpose_axes(&sin_split, &[0, 2, 1, 3])?;
    Ok((cos_heads, sin_heads))
}

/// Rotate `x` (`[B, H, N, D]`) with INTERLEAVED RoPE.
///
/// Adjacent elements form a pair that rotates together:
/// `out[2i] = x[2i]*cos - x[2i+1]*sin` and
/// `out[2i+1] = x[2i+1]*cos + x[2i]*sin`.
/// `cos_f` / `sin_f` must broadcast against `x`.
#[allow(dead_code)]
fn apply_interleaved_rope(
    x: &Array,
    cos_f: &Array,
    sin_f: &Array,
) -> Result<Array, mlx_rs::error::Exception> {
    let dims = x.shape();
    let rank = dims.len();

    // View the last axis as pairs: [..., D] → [..., D/2, 2].
    let mut pair_dims: Vec<i32> = Vec::with_capacity(rank + 1);
    pair_dims.extend_from_slice(&dims[..rank - 1]);
    pair_dims.push(dims[rank - 1] / 2);
    pair_dims.push(2);
    let pairs = ops::reshape(x, &pair_dims)?;

    // First / second member of every pair, each keeping a trailing axis of 1.
    let even = pairs.index((.., .., .., .., 0..1));
    let odd = pairs.index((.., .., .., .., 1..2));

    // Partner tensor (-odd, even), flattened back to the layout of `x`.
    let neg_odd = ops::negative(&odd)?;
    let partner_pairs = ops::concatenate_axis(&[&neg_odd, &even], -1)?;
    let partner = ops::reshape(&partner_pairs, dims)?;

    let kept = ops::multiply(x, cos_f)?;
    let turned = ops::multiply(&partner, sin_f)?;
    ops::add(&kept, &turned)
}

/// Rotate a per-head tensor `x` (`[B, H, N, D]`) with SPLIT-type RoPE.
///
/// The last axis is cut into a lower half `x1` and an upper half `x2`; the
/// result is `concat([x1*cos - x2*sin, x1*sin + x2*cos], axis=-1)`.
/// `cos_f` / `sin_f` must broadcast against the `D/2`-wide halves.
fn apply_split_rope(
    x: &Array,
    cos_f: &Array,
    sin_f: &Array,
) -> Result<Array, mlx_rs::error::Exception> {
    let dims = x.shape();
    let mid = dims[dims.len() - 1] / 2;
    let lower = x.index((.., .., .., ..mid));
    let upper = x.index((.., .., .., mid..));

    // First output half: x1*cos - x2*sin.
    let lower_cos = ops::multiply(&lower, cos_f)?;
    let upper_sin = ops::multiply(&upper, sin_f)?;
    let first_half = ops::subtract(&lower_cos, &upper_sin)?;

    // Second output half: x1*sin + x2*cos.
    let lower_sin = ops::multiply(&lower, sin_f)?;
    let upper_cos = ops::multiply(&upper, cos_f)?;
    let second_half = ops::add(&lower_sin, &upper_cos)?;

    ops::concatenate_axis(&[&first_half, &second_half], -1)
}

// ─── RoPE Position Helpers ────────────────────────────────────────────────

/// VAE / transformer scale factors (must match upstream `utils/positions.py`).
///
/// Multiplier from latent-frame index to pixel-frame index when
/// `build_video_positions_3d` computes temporal midpoints.
const VIDEO_TEMPORAL_SCALE: f32 = 8.0;
/// Pixel stride per latent row/column; `build_video_positions_3d` places each
/// cell midpoint at `i * VIDEO_SPATIAL_SCALE + VIDEO_SPATIAL_SCALE / 2`.
const VIDEO_SPATIAL_SCALE: f32 = 32.0;
/// Multiplier from audio-token index to the underlying frame index in
/// `build_audio_positions_1d` (presumably spectrogram frames per latent
/// token — confirm against upstream).
const AUDIO_DOWNSAMPLE_FACTOR: f32 = 4.0;
/// Numerator of the frame-index → seconds conversion in
/// `build_audio_positions_1d` (presumably samples per frame — confirm).
const AUDIO_HOP_LENGTH: f32 = 160.0;
/// Denominator of the frame-index → seconds conversion in
/// `build_audio_positions_1d` (presumably samples per second — confirm).
const AUDIO_SAMPLE_RATE: f32 = 16000.0;

/// Build per-token 3D video positions `[t_seconds, h_pixel, w_pixel]`.
///
/// The result is axis-major — `[[t...], [h...], [w...]]` — with tokens
/// flattened in `(frame, row, column)` order, i.e. index
/// `fi*H*W + hi*W + wi`. This is the layout the RoPE precompute helpers
/// (`precompute_split_rope_multiaxis`, `precompute_interleaved_rope`) take.
///
/// Matches upstream `compute_video_positions` in
/// `ltx-core-mlx/utils/positions.py`: latent frame `i` covers pixel frames
/// `[max(0, (i-1)*8+1), i*8+1]` and the midpoint divided by `fps` is the
/// temporal coordinate. Spatial coordinates are the cell midpoints
/// `i*32 + 16`.
fn build_video_positions_3d(
    latent_f: i32,
    latent_h: i32,
    latent_w: i32,
    fps: f32,
) -> [Vec<f32>; 3] {
    let frames = latent_f as usize;
    let rows = latent_h as usize;
    let cols = latent_w as usize;
    let total = frames * rows * cols;

    // Causal pixel-frame boundary for a (possibly fractional) latent index.
    let frame_edge = |k: f32| (k * VIDEO_TEMPORAL_SCALE + 1.0 - VIDEO_TEMPORAL_SCALE).max(0.0);
    // Pixel midpoint of a latent row/column.
    let cell_mid = |k: usize| k as f32 * VIDEO_SPATIAL_SCALE + VIDEO_SPATIAL_SCALE / 2.0;

    let mut t_axis = Vec::with_capacity(total);
    let mut y_axis = Vec::with_capacity(total);
    let mut x_axis = Vec::with_capacity(total);
    for fi in 0..frames {
        let idx = fi as f32;
        let t_mid = (frame_edge(idx) + frame_edge(idx + 1.0)) / 2.0 / fps;
        for hi in 0..rows {
            let y_mid = cell_mid(hi);
            for wi in 0..cols {
                t_axis.push(t_mid);
                y_axis.push(y_mid);
                x_axis.push(cell_mid(wi));
            }
        }
    }
    [t_axis, y_axis, x_axis]
}

/// Build 1D audio positions (seconds) using causal token midpoints.
///
/// Token `i` spans the frame range
/// `[max(0, (i-1)*F + 1), max(0, i*F + 1)]` with `F = AUDIO_DOWNSAMPLE_FACTOR`.
/// Each boundary is converted to seconds via
/// `* AUDIO_HOP_LENGTH / AUDIO_SAMPLE_RATE`, and the midpoint of the two is
/// the token position. Matches upstream `compute_audio_positions`.
///
/// Returns a single-axis array so it can be passed straight to the RoPE
/// precompute helpers.
fn build_audio_positions_1d(num_tokens: i32) -> [Vec<f32>; 1] {
    // Boundary (in seconds) for a possibly fractional token index.
    let edge_seconds = |k: f32| {
        (k * AUDIO_DOWNSAMPLE_FACTOR + 1.0 - AUDIO_DOWNSAMPLE_FACTOR).max(0.0) * AUDIO_HOP_LENGTH
            / AUDIO_SAMPLE_RATE
    };

    let count = num_tokens as usize;
    let mut mids = Vec::with_capacity(count);
    for i in 0..count {
        let idx = i as f32;
        mids.push((edge_seconds(idx) + edge_seconds(idx + 1.0)) / 2.0);
    }
    [mids]
}

/// Precomputed RoPE tensors for a single forward pass. All entries are
/// `Option<(cos, sin)>`; `Default` leaves every entry `None`.
/// `RopeBundle::build` always fills `video` and `video_cross`, and fills the
/// two audio entries only when `audio_seq_len > 0` (e.g. they stay `None`
/// for t2v without audio).
///
/// `build` produces every pair with `precompute_split_rope_multiaxis`, so the
/// last axis of each tensor is half the head dim (SPLIT layout).
#[derive(Default, Clone)]
pub struct RopeBundle {
    /// Video self-attention: 3D RoPE over `[t, h, w]`,
    /// shape `[1, video_heads, F*H*W, video_head_dim / 2]`.
    pub video: Option<(Array, Array)>,
    /// Audio self-attention: 1D RoPE over audio time seconds,
    /// shape `[1, audio_heads, T, audio_head_dim / 2]`.
    pub audio: Option<(Array, Array)>,
    /// A2V / V2A cross-attention video side: 1D temporal positions only,
    /// shape `[1, av_cross_heads, F*H*W, av_cross_head_dim / 2]`.
    pub video_cross: Option<(Array, Array)>,
    /// A2V / V2A cross-attention audio side: 1D temporal positions only,
    /// shape `[1, av_cross_heads, T, av_cross_head_dim / 2]`.
    pub audio_cross: Option<(Array, Array)>,
}

impl RopeBundle {
    /// Precompute every RoPE table needed for one latent layout.
    ///
    /// `video` and `video_cross` are always produced. `audio` and
    /// `audio_cross` are produced only when `audio_seq_len > 0`; pass `0` to
    /// disable audio.
    ///
    /// Upstream defaults: `positional_embedding_max_pos = (20, 2048, 2048)`,
    /// `audio_positional_embedding_max_pos = (20,)`, `rope_theta = 10000`,
    /// `av_cross_num_heads = 32`, `av_cross_head_dim = 64`.
    ///
    /// # Errors
    /// Propagates any MLX exception from the RoPE precompute helper.
    pub fn build(
        latent_f: i32,
        latent_h: i32,
        latent_w: i32,
        audio_seq_len: i32,
        fps: f32,
        video_num_heads: usize,
        video_head_dim: usize,
        audio_num_heads: usize,
        audio_head_dim: usize,
    ) -> Result<Self, mlx_rs::error::Exception> {
        const THETA: f32 = 10000.0;
        const AV_CROSS_HEADS: usize = 32;
        const AV_CROSS_HEAD_DIM: usize = 64;
        const MAX_POS_3D: [f32; 3] = [20.0, 2048.0, 2048.0];
        const MAX_POS_1D: [f32; 1] = [20.0];

        // Shared recipe for both cross-attention tables (1D positions only).
        let cross_rope = |axis: &[Vec<f32>]| {
            precompute_split_rope_multiaxis(
                axis,
                AV_CROSS_HEADS,
                AV_CROSS_HEAD_DIM,
                THETA,
                &MAX_POS_1D,
            )
        };

        // Video self-attention: full 3D positions.
        let video_pos = build_video_positions_3d(latent_f, latent_h, latent_w, fps);
        let video = precompute_split_rope_multiaxis(
            &video_pos,
            video_num_heads,
            video_head_dim,
            THETA,
            &MAX_POS_3D,
        )?;

        // Video side of cross-attention: temporal axis only.
        let temporal_only = [video_pos[0].clone()];
        let video_cross = cross_rope(&temporal_only[..])?;

        // Audio tables exist only when there are audio tokens.
        let mut audio = None;
        let mut audio_cross = None;
        if audio_seq_len > 0 {
            let audio_pos = build_audio_positions_1d(audio_seq_len);
            audio = Some(precompute_split_rope_multiaxis(
                &audio_pos,
                audio_num_heads,
                audio_head_dim,
                THETA,
                &MAX_POS_1D,
            )?);
            audio_cross = Some(cross_rope(&audio_pos[..])?);
        }

        Ok(Self {
            video: Some(video),
            audio,
            video_cross: Some(video_cross),
            audio_cross,
        })
    }

    /// Borrowed `(cos, sin)` for video self-attention, if present.
    fn video_pair(&self) -> Option<(&Array, &Array)> {
        let (cos, sin) = self.video.as_ref()?;
        Some((cos, sin))
    }
    /// Borrowed `(cos, sin)` for audio self-attention, if present.
    fn audio_pair(&self) -> Option<(&Array, &Array)> {
        let (cos, sin) = self.audio.as_ref()?;
        Some((cos, sin))
    }
    /// Borrowed `(cos, sin)` for the video side of AV cross-attention, if present.
    fn video_cross_pair(&self) -> Option<(&Array, &Array)> {
        let (cos, sin) = self.video_cross.as_ref()?;
        Some((cos, sin))
    }
    /// Borrowed `(cos, sin)` for the audio side of AV cross-attention, if present.
    fn audio_cross_pair(&self) -> Option<(&Array, &Array)> {
        let (cos, sin) = self.audio_cross.as_ref()?;
        Some((cos, sin))
    }
}

/// Parameterless LayerNorm over the last axis via MLX's fused kernel.
/// Matches upstream `_output_block` which uses
/// `mx.fast.layer_norm(x, weight=None, bias=None, eps=self.config.norm_eps)`.
/// The hand-rolled `(x - mean) * rsqrt(var + eps)` chain did a slightly
/// different bf16 reduction order and drifted ~1% at the last dim.
///
/// * `x` — tensor to normalise.
/// * `eps` — stabiliser forwarded unchanged to the fused kernel.
///
/// No weight or bias is supplied (both are passed as `None`). Any MLX
/// exception from the kernel is returned to the caller.
fn ltx_layer_norm_parameterless(x: &Array, eps: f32) -> Result<Array, mlx_rs::error::Exception> {
    mlx_rs::fast::layer_norm(x, None, None, eps)
}

/// Parameterless RMS-norm over the last axis. Uses MLX's fused
/// `fast::rms_norm` kernel with a ones-weight to match upstream
/// `BasicAVTransformerBlock._rms_norm` exactly — a hand-rolled
/// `x * rsqrt(mean(x²)+eps)` chain does a different bf16 reduction order
/// and drifts small deviations that compound across 48 blocks.
fn rms_norm_parameterless(x: &Array, eps: f32) -> Result<Array, mlx_rs::error::Exception> {
    let last = x.shape()[x.shape().len() - 1];
    // Cache ones weights per last-axis size so we don't rebuild every call.
    use std::cell::RefCell;
    use std::collections::HashMap;
    thread_local! {
        static ONES_CACHE: RefCell<HashMap<i32, &'static Array>> = RefCell::new(HashMap::new());
    }
    let weight = ONES_CACHE.with(|cell| {
        if let Some(w) = cell.borrow().get(&last) {
            return *w;
        }
        let ones = Array::ones::<f32>(&[last]).expect("ones");
        let leaked: &'static Array = Box::leak(Box::new(ones));
        cell.borrow_mut().insert(last, leaked);
        leaked
    });
    mlx_rs::fast::rms_norm(x, weight, eps)
}

/// Weighted RMS-norm over the last axis.
///
/// Normalises `x` with the parameterless RMS-norm and then multiplies by
/// `weight`, which broadcasts along the last axis:
/// `(x * rsqrt(mean(x²) + eps)) * weight`.
fn apply_weighted_rms(
    x: &Array,
    weight: &Array,
    eps: f32,
) -> Result<Array, mlx_rs::error::Exception> {
    rms_norm_parameterless(x, eps).and_then(|unit| ops::multiply(&unit, weight))
}

/// Text embedding connector that maps text-encoder features into transformer
/// conditioning space. `forward` documents its input as a per-token Gemma 3
/// hidden-state stack of width 188160.
/// Includes both video and audio projection pathways; each pathway is an
/// aggregate projection followed by a stack of `ConnectorBlock`s, with
/// optional learnable registers that stand in for padded positions.
struct TextEmbeddingConnector {
    // ── Video connector ──
    /// Projects concatenated text embeddings to video hidden dim.
    /// Linear(188160 -> 4096)
    video_aggregate_embed: nn::Linear,
    /// 8 connector transformer blocks with learnable registers.
    video_blocks: Vec<ConnectorBlock>,
    /// Learnable register tokens: [128, 4096]
    /// `None` when the checkpoint has no
    /// `connector.video_embeddings_connector.learnable_registers` tensor.
    video_registers: Option<Array>,

    // ── Audio connector ──
    /// Projects concatenated text embeddings to audio hidden dim.
    /// Linear(188160 -> 2048)
    audio_aggregate_embed: nn::Linear,
    /// 8 connector transformer blocks with learnable registers (2048 dim).
    audio_blocks: Vec<ConnectorBlock>,
    /// Learnable register tokens: [128, 2048]
    /// `None` when the checkpoint has no
    /// `connector.audio_embeddings_connector.learnable_registers` tensor.
    audio_registers: Option<Array>,
}

impl TextEmbeddingConnector {
    fn load(tensors: &HashMap<String, Array>) -> Result<Self, InferenceError> {
        // All connector tensors live under `connector.*` in the real checkpoint.
        // Block paths include a `transformer_1d_blocks.{i}` segment, and the
        // learnable register buffer is stored as `learnable_registers` (not
        // `registers` as a prior draft assumed).
        let pfx = "connector";

        // Video connector
        let video_aggregate_embed = build_dense_linear(
            tensors,
            &format!("{pfx}.text_embedding_projection.video_aggregate_embed"),
        )?;

        // Video connector: 32 heads × 128 dim_head = 4096 inner_dim.
        let mut video_blocks = Vec::with_capacity(8);
        for i in 0..8 {
            video_blocks.push(ConnectorBlock::load(
                tensors,
                &format!("{pfx}.video_embeddings_connector.transformer_1d_blocks.{i}"),
                32,
                128,
            )?);
        }

        let video_registers = tensors
            .get(&format!(
                "{pfx}.video_embeddings_connector.learnable_registers"
            ))
            .cloned();

        // Audio connector
        let audio_aggregate_embed = build_dense_linear(
            tensors,
            &format!("{pfx}.text_embedding_projection.audio_aggregate_embed"),
        )?;

        // Audio connector: 32 heads × 64 dim_head = 2048 inner_dim.
        let mut audio_blocks = Vec::with_capacity(8);
        for i in 0..8 {
            audio_blocks.push(ConnectorBlock::load(
                tensors,
                &format!("{pfx}.audio_embeddings_connector.transformer_1d_blocks.{i}"),
                32,
                64,
            )?);
        }

        let audio_registers = tensors
            .get(&format!(
                "{pfx}.audio_embeddings_connector.learnable_registers"
            ))
            .cloned();

        Ok(Self {
            video_aggregate_embed,
            video_blocks,
            video_registers,
            audio_aggregate_embed,
            audio_blocks,
            audio_registers,
        })
    }

    /// Project and refine text embeddings for video and audio conditioning.
    ///
    /// `text_embeddings` has shape `[B, T, 188160]` — per-token Gemma 3
    /// hidden-state stack. For all-zero inputs (Gemma not loaded), we
    /// short-circuit to tiled learnable registers instead of running the
    /// aggregate_embed bias through attention — matches upstream's
    /// "replace padded positions with registers" semantics when every
    /// position is effectively padded.
    ///
    /// `n_valid` is the number of real (non-padding) tokens. It is clamped to
    /// the sequence length, and the real tokens are taken from the END of the
    /// sequence (the input is treated as left-padded).
    ///
    /// Returns `(video, audio)`: each pathway's output after its connector
    /// blocks and a final parameterless RMS-norm. The sequence length of both
    /// outputs equals that of `text_embeddings`.
    ///
    /// # Errors
    /// Propagates any MLX exception raised by the projections, the register
    /// blending ops, or the connector blocks.
    fn forward(
        &mut self,
        text_embeddings: &Array,
        n_valid: usize,
    ) -> Result<(Array, Array), mlx_rs::error::Exception> {
        let batch = text_embeddings.shape()[0];
        // If `text_embeddings` is the all-zero placeholder (no Gemma encoder
        // loaded), tile the learnable registers as the input sequence so the
        // connector still produces coherent conditioning. Otherwise run the
        // real aggregate_embed projection on the Gemma-derived features.
        let use_registers_only = is_all_zero(text_embeddings);

        // Upstream (TextEmbeddingProjection) rescales the concatenated hidden
        // states by sqrt(target_dim / embedding_dim) before the linear. For
        // Gemma 3 12B hidden_dim=3840, that's sqrt(4096/3840)≈1.033 for video
        // and sqrt(2048/3840)≈0.730 for audio. Important for numerical parity
        // even though the factor is small on the video side.
        let embedding_dim = 3840_f32;
        let video_scale = Array::from_f32((4096.0_f32 / embedding_dim).sqrt());
        let audio_scale = Array::from_f32((2048.0_f32 / embedding_dim).sqrt());

        // One pathway (video or audio): project or tile registers, blend in
        // registers for padded positions, run the block stack, then apply the
        // final RMS-norm. The mutable pieces are passed as arguments so the
        // same closure can serve both pathways.
        let mut run_pass_through = |aggregate: &mut nn::Linear,
                                    blocks: &mut [ConnectorBlock],
                                    regs: Option<&Array>,
                                    scale: &Array,
                                    dump_tag: &str|
         -> Result<Array, mlx_rs::error::Exception> {
            // Upstream `Embeddings1DConnector._replace_padding_with_registers`:
            // text_embeddings are LEFT-padded (pad at positions 0..pad_count,
            // real tokens at pad_count..seq_len). The connector rearranges:
            //
            //   output[0..n_valid]         = proj(real_tokens)  (moved to front)
            //   output[n_valid..seq_len]   = tiled learnable_registers
            //
            // so the connector sequence length STAYS at seq_len (1024) — it
            // does NOT grow by num_registers. A prior revision appended
            // registers, producing a 1152-token sequence that doesn't match
            // what the transformer was trained on.
            let projected = if use_registers_only {
                // Pure-register path: Gemma absent — return just tiled registers.
                match regs {
                    Some(regs) => {
                        let rs = regs.shape();
                        let num_registers = rs[0];
                        let dim = rs[1];
                        let r3 = ops::reshape(regs, &[1, num_registers, dim])?;
                        let seq_len = text_embeddings.shape()[1];
                        // Ceiling division: enough whole copies to cover seq_len,
                        // trimmed back to exactly seq_len below.
                        let tiles = (seq_len + num_registers - 1) / num_registers;
                        let tiled = ops::tile(&r3, &[batch, tiles, 1])?;
                        tiled.index((.., 0..seq_len, ..))
                    }
                    None => {
                        // No registers in the checkpoint: fall back to the
                        // plain scaled projection of the (all-zero) input.
                        let rescaled = ops::multiply(text_embeddings, scale)?;
                        aggregate.forward(&rescaled)?
                    }
                }
            } else {
                let rescaled = ops::multiply(text_embeddings, scale)?;
                let proj = aggregate.forward(&rescaled)?;
                // Parity dump: post aggregate_embed projection (pre register replace).
                dump_ltx_stage_first_call(&format!("connector_proj_{dump_tag}"), &proj);

                match regs {
                    Some(regs) => {
                        let proj_shape = proj.shape();
                        let seq_len = proj_shape[1];
                        let dim = proj_shape[2];
                        let rs = regs.shape();
                        let num_registers = rs[0];

                        // Build tiled registers along sequence: (B, seq_len, dim).
                        let r3 = ops::reshape(regs, &[1, num_registers, dim])?;
                        let tiles = (seq_len + num_registers - 1) / num_registers;
                        let tiled = ops::tile(&r3, &[batch, tiles, 1])?;
                        let tiled = tiled.index((.., 0..seq_len, ..));

                        // Move valid tokens to the front.
                        // Left-padded input: valid tokens live at positions
                        // [pad_count..seq_len]. We slice them out and zero-pad
                        // the tail to keep shape (B, seq_len, dim).
                        let n_valid_i32 = n_valid.min(seq_len as usize) as i32;
                        let pad_count = seq_len - n_valid_i32;
                        let adjusted = if n_valid_i32 == 0 {
                            // No real tokens — whole sequence is registers.
                            tiled.clone()
                        } else {
                            let valid = proj.index((.., pad_count..seq_len, ..));
                            if n_valid_i32 == seq_len {
                                valid
                            } else {
                                // [valid_tokens, zeros] of length seq_len
                                let zeros =
                                    Array::zeros::<f32>(&[batch, seq_len - n_valid_i32, dim])?;
                                ops::concatenate_axis(&[&valid, &zeros], 1)?
                            }
                        };

                        // Blend: valid tokens in first n_valid positions, registers in the rest.
                        // flipped = [1]*n_valid + [0]*(seq_len - n_valid), broadcast to (B, seq_len, 1).
                        let mask_vals: Vec<f32> = (0..seq_len)
                            .map(|i| if i < n_valid_i32 { 1.0 } else { 0.0 })
                            .collect();
                        let flipped = Array::from_slice(&mask_vals, &[1, seq_len, 1]);
                        let inv_flipped = ops::subtract(&Array::from_f32(1.0), &flipped)?;

                        let a_part = ops::multiply(&flipped, &adjusted)?;
                        let b_part = ops::multiply(&inv_flipped, &tiled)?;
                        ops::add(&a_part, &b_part)?
                    }
                    None => proj,
                }
            };
            // Refine through every connector block in order.
            let mut h = projected;
            for block in blocks.iter_mut() {
                h = block.forward(&h)?;
            }
            let out = rms_norm_parameterless(&h, 1e-6)?;
            // Parity dump: final connector output (post-RMSNorm).
            dump_ltx_stage_first_call(&format!("connector_out_{dump_tag}"), &out);
            Ok(out)
        };

        let video_h = run_pass_through(
            &mut self.video_aggregate_embed,
            &mut self.video_blocks,
            self.video_registers.as_ref(),
            &video_scale,
            "video",
        )?;
        let audio_h = run_pass_through(
            &mut self.audio_aggregate_embed,
            &mut self.audio_blocks,
            self.audio_registers.as_ref(),
            &audio_scale,
            "audio",
        )?;
        Ok((video_h, audio_h))
    }
}

/// Cheap check for the "Gemma encoder absent" fallback: evaluate the tensor
/// and test whether the mean of its absolute values is exactly zero.
///
/// The mean of non-negative values is zero only when every element is zero,
/// so this acts as an all-zero test. Every failure along the way (op error,
/// eval error, empty result) is reported as `false`, i.e. "not known to be
/// all zero". Only called once per generate() so the eval cost is negligible.
fn is_all_zero(x: &Array) -> bool {
    let Ok(absx) = ops::abs(x) else {
        return false;
    };
    // Collapse to a scalar by averaging away the leading axis once per dim.
    let rank = absx.shape().len();
    let mut reduced = absx;
    for _ in 0..rank {
        let Ok(r) = reduced.mean_axes(&[0], false) else {
            return false;
        };
        reduced = r;
    }
    // The typed read-back below asks for f32 storage. Cast the (scalar)
    // result first so that bf16/f16 inputs are handled too instead of
    // tripping a dtype mismatch inside `as_slice`.
    let Ok(reduced) = reduced.as_dtype(mlx_rs::Dtype::Float32) else {
        return false;
    };
    if mlx_rs::transforms::eval([&reduced]).is_err() {
        return false;
    }
    let slice: &[f32] = reduced.as_slice();
    slice.first().is_some_and(|v| *v == 0.0)
}

// ─── VAE Decoder (3D Causal) ──────────────────────────────────────────────

/// Apply a single 3D causal convolution decomposed into per-frame 2D spatial
/// convolutions with temporal accumulation.
///
/// For a kernel with temporal size kT the clip is extended by (kT-1) frames
/// on the left (past) and none on the right (future). The pad frames are
/// copies of the FIRST input frame (replicate padding), not zeros. Spatial
/// padding is symmetric, (kH/2, kW/2), and is handed to `conv2d`.
///
/// - `x`: input tensor in BTHWC format `[B, T, H, W, C_in]`
/// - `weight`: 3D conv weight `[C_out, kT, kH, kW, C_in]`
/// - `bias`: optional bias `[C_out]`
///
/// Returns output in BTHWC format `[B, T, H_out, W_out, C_out]`. With
/// stride 1 and odd kH/kW the symmetric padding keeps `H_out == H` and
/// `W_out == W`.
///
/// # Panics
///
/// Panics if the temporal kernel size (`weight` dim 1) is zero, or if either
/// tensor has fewer than five dimensions.
fn conv3d_causal(
    x: &Array,
    weight: &Array,
    bias: Option<&Array>,
) -> Result<Array, mlx_rs::error::Exception> {
    let x_shape = x.shape();
    let (batch, t_in, h_in, w_in, _c_in) =
        (x_shape[0], x_shape[1], x_shape[2], x_shape[3], x_shape[4]);

    let w_shape = weight.shape();
    let (c_out, kt, kh, kw, c_in_w) = (w_shape[0], w_shape[1], w_shape[2], w_shape[3], w_shape[4]);

    let pad_t = kt - 1; // causal: all temporal padding on left

    // Causal temporal padding: upstream Conv3dBlock replicates the FIRST
    // frame `kt - 1` times rather than zero-padding. Prepending zero frames
    // makes every causal Conv3d in the VAE (90+ calls) start from an
    // impossible "black frames" context instead of extending the clip
    // naturally — the decode output comes out as noise even when fed
    // upstream's own post-denoise latent.
    let x_padded = if pad_t > 0 {
        let first = x.index((.., 0..1, .., .., ..));
        let first_rep = ops::repeat_axis::<f32>(first, pad_t, 1)?;
        ops::concatenate_axis(&[&first_rep, x], 1)?
    } else {
        x.clone()
    };

    // Loop-invariant values: spatial padding and the flattened batch size.
    let spatial_pad = (kh / 2, kw / 2);
    let bt = batch * t_in;

    // For each output frame t_out in 0..T, accumulate across temporal kernel:
    //   out[:, t_out] = sum_{tk=0}^{kT-1} conv2d(x_padded[:, t_out+tk], weight[:, tk])
    // All frames of one temporal tap are batched into a single conv2d call.
    let mut accum: Option<Array> = None;
    for tk in 0..kt {
        // Temporal weight slice: weight[:, tk, :, :, :] -> [C_out, kH, kW, C_in]
        let w_slice = weight.index((.., tk, .., .., ..));

        // Frames seen by this tap: output frame t_out reads padded frame
        // (t_out + tk), so the window is [tk, tk + T) along the time axis.
        let frames_slice = x_padded.index((.., tk..(tk + t_in), .., .., ..));

        // Reshape [B, T, H, W, C] -> [B*T, H, W, C] for 2D conv
        let flat = ops::reshape(&frames_slice, &[bt, h_in, w_in, c_in_w])?;

        let conv_out = ops::conv2d(
            &flat,
            &w_slice,
            (1, 1),      // stride
            spatial_pad, // padding
            None::<(i32, i32)>,
            None::<i32>,
        )?;

        accum = Some(match accum {
            Some(a) => ops::add(&a, &conv_out)?,
            None => conv_out,
        });
    }

    let mut result =
        accum.expect("conv3d_causal: temporal kernel size (weight dim 1) must be >= 1");

    // Add bias: [C_out] broadcast over [B*T, H_out, W_out, C_out]
    if let Some(b) = bias {
        let b_reshaped = ops::reshape(b, &[1, 1, 1, c_out])?;
        result = ops::add(&result, &b_reshaped)?;
    }

    // Reshape back to [B, T, H_out, W_out, C_out]
    let h_out = result.shape()[1];
    let w_out = result.shape()[2];
    ops::reshape(&result, &[batch, t_in, h_out, w_out, c_out])
}

/// Parameter-free pixel normalization over the channel (last) dimension.
///
/// Mathematically `x / sqrt(mean(x^2, dim=-1, keepdim=True) + eps)` with
/// `eps = 1e-8`.
///
/// The LTX CausalVideoAutoencoder uses this in place of GroupNorm.
fn pixel_norm(x: &Array) -> Result<Array, mlx_rs::error::Exception> {
    // Delegates to the shared parameterless RMS-norm helper rather than a
    // hand-rolled `x * rsqrt(mean(x²) + eps)` chain. Upstream uses the fused
    // `mx.fast.rms_norm(x, weight=None, eps=1e-8)` Metal kernel; the manual
    // chain is the same math but reduces in a different bf16 order, and over
    // the 90+ pixel_norm calls in the decoder those 1-ulp differences add up
    // to a qualitatively different output.
    let eps = 1e-8;
    rms_norm_parameterless(x, eps)
}

/// 3D pixel shuffle (depth-to-space): move packed channel factors out into
/// the temporal and spatial axes.
///
/// Input: `[B, T, H, W, C * factor_t * factor_h * factor_w]`
/// Output: `[B, T*factor_t, H*factor_h, W*factor_w, C]`
///
/// The packed channel axis is interpreted as `(C, ft, fh, fw)`, slowest to
/// fastest.
fn pixel_shuffle_3d(
    x: &Array,
    factor_t: i32,
    factor_h: i32,
    factor_w: i32,
) -> Result<Array, mlx_rs::error::Exception> {
    let dims = x.shape();
    let (batch, frames, height, width, packed_channels) =
        (dims[0], dims[1], dims[2], dims[3], dims[4]);
    let channels = packed_channels / (factor_t * factor_h * factor_w);

    // Split the channel axis: [B, T, H, W, C, ft, fh, fw]
    let split = ops::reshape(
        x,
        &[
            batch, frames, height, width, channels, factor_t, factor_h, factor_w,
        ],
    )?;

    // Put each factor next to the axis it expands: [B, T, ft, H, fh, W, fw, C]
    let interleaved = ops::transpose_axes(&split, &[0, 1, 5, 2, 6, 3, 7, 4])?;

    // Merge the pairs: [B, T*ft, H*fh, W*fw, C]
    ops::reshape(
        &interleaved,
        &[
            batch,
            frames * factor_t,
            height * factor_h,
            width * factor_w,
            channels,
        ],
    )
}

/// Inverse 3D pixel shuffle (space-to-depth): fold temporal/spatial factors
/// into the channel axis.
///
/// Input: `[B, T*factor_t, H*factor_h, W*factor_w, C]`
/// Output: `[B, T, H, W, C * factor_t * factor_h * factor_w]`
///
/// The resulting channel axis is laid out as `(C, ft, fh, fw)`, slowest to
/// fastest — the exact inverse of `pixel_shuffle_3d`.
fn pixel_unshuffle_3d(
    x: &Array,
    factor_t: i32,
    factor_h: i32,
    factor_w: i32,
) -> Result<Array, mlx_rs::error::Exception> {
    let dims = x.shape();
    let (batch, channels) = (dims[0], dims[4]);
    let frames = dims[1] / factor_t;
    let height = dims[2] / factor_h;
    let width = dims[3] / factor_w;

    // Split each expanded axis: [B, T, ft, H, fh, W, fw, C]
    let split = ops::reshape(
        x,
        &[
            batch, frames, factor_t, height, factor_h, width, factor_w, channels,
        ],
    )?;
    // Gather the factors behind the channels: [B, T, H, W, C, ft, fh, fw]
    let gathered = ops::transpose_axes(&split, &[0, 1, 3, 5, 7, 2, 4, 6])?;
    // Fold them in: [B, T, H, W, C * ft * fh * fw]
    ops::reshape(
        &gathered,
        &[
            batch,
            frames,
            height,
            width,
            channels * factor_t * factor_h * factor_w,
        ],
    )
}

/// A ResNet3d block for the 3D causal VAE decoder.
///
/// Forward pass: `x + conv2(pixel_norm(silu(conv1(pixel_norm(silu(x))))))`
///
/// All convolutions are 3D causal with kernel 3x3x3.
struct VaeResNet3dBlock {
    conv1_weight: Array,
    conv1_bias: Option<Array>,
    conv2_weight: Array,
    conv2_bias: Option<Array>,
}

impl VaeResNet3dBlock {
    fn load(tensors: &HashMap<String, Array>, prefix: &str) -> Result<Self, InferenceError> {
        Ok(Self {
            conv1_weight: get_tensor(tensors, &format!("{prefix}.conv1.conv.weight"))?,
            conv1_bias: tensors.get(&format!("{prefix}.conv1.conv.bias")).cloned(),
            conv2_weight: get_tensor(tensors, &format!("{prefix}.conv2.conv.weight"))?,
            conv2_bias: tensors.get(&format!("{prefix}.conv2.conv.bias")).cloned(),
        })
    }

    /// Forward: pixel_norm -> SiLU -> conv3d_causal -> pixel_norm -> SiLU -> conv3d_causal + residual
    fn forward(&self, x: &Array) -> Result<Array, mlx_rs::error::Exception> {
        let h = pixel_norm(x)?;
        let h = nn::silu(&h)?;
        let h = conv3d_causal(&h, &self.conv1_weight, self.conv1_bias.as_ref())?;
        let h = pixel_norm(&h)?;
        let h = nn::silu(&h)?;
        let h = conv3d_causal(&h, &self.conv2_weight, self.conv2_bias.as_ref())?;
        ops::add(x, &h)
    }
}

/// An upsample block that applies Conv3d followed by 3D pixel shuffle.
///
/// The convolution expands channels by a factor corresponding to the spatial/temporal
/// upsample, and the pixel shuffle rearranges them into larger dimensions.
struct VaeUpsampleBlock {
    conv_weight: Array,
    conv_bias: Option<Array>,
    /// Pixel shuffle factors (temporal, height, width).
    shuffle_factors: (i32, i32, i32),
}

impl VaeUpsampleBlock {
    /// Load the block's convolution from `{prefix}.conv.conv.weight` (required)
    /// and `{prefix}.conv.conv.bias` (optional), and record the shuffle
    /// factors to apply after it.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        shuffle_factors: (i32, i32, i32),
    ) -> Result<Self, InferenceError> {
        Ok(Self {
            conv_weight: get_tensor(tensors, &format!("{prefix}.conv.conv.weight"))?,
            conv_bias: tensors.get(&format!("{prefix}.conv.conv.bias")).cloned(),
            shuffle_factors,
        })
    }

    /// Forward: conv3d_causal -> pixel_shuffle_3d -> drop-first-frame when tf>1
    ///
    /// Upstream `VideoDecoder.decode` in ltx-core-mlx video_vae.py:200-210:
    ///   x = block(x)
    ///   if i % 2 == 1:  # odd indices = DepthToSpaceUpsample
    ///       sf, tf = self._upsample_config[upsample_idx]
    ///       x = pixel_shuffle_3d(x, spatial_factor=sf, temporal_factor=tf)
    ///       if tf > 1:
    ///           x = x[:, 1:, :, :, :]
    ///   upsample_idx += 1
    ///
    /// The drop-first-frame is ALWAYS applied when `tf > 1` (unconditional on
    /// causal mode). Without it, each 2× temporal upsample leaves a duplicated
    /// pad frame at position 0 that never gets removed, and the final pixel
    /// output is misaligned with the expected temporal grid — visible as a
    /// regular checkerboard when the misalignment interacts with the spatial
    /// unpatchify_spatial(patch_size=4) step.
    ///
    /// As a consequence a temporal factor of 2 maps `T` input frames to
    /// `2*T - 1` output frames.
    fn forward(&self, x: &Array) -> Result<Array, mlx_rs::error::Exception> {
        let h = conv3d_causal(x, &self.conv_weight, self.conv_bias.as_ref())?;
        let (ft, fh, fw) = self.shuffle_factors;
        let shuffled = pixel_shuffle_3d(&h, ft, fh, fw)?;
        if ft <= 1 {
            return Ok(shuffled);
        }
        // Drop frame 0 along the temporal axis (BTHWC layout, axis=1).
        // Indexing already yields an `Array`, so it is wrapped in `Ok`
        // directly; no cast is needed just to obtain a `Result`.
        let t_total = shuffled.shape()[1];
        Ok(shuffled.index((.., 1..t_total, .., .., ..)))
    }
}

/// 3D Causal VAE decoder for the LTX video model.
///
/// Architecture (from embedded_config.json `decoder_blocks`):
///
/// ```text
/// conv_in: Conv3d(128 -> 1024, 3x3x3)
/// up_blocks.0: ResNet3d x2 (1024 channels)
/// up_blocks.1: Upsample Conv3d(1024 -> 4096) + pixel_shuffle(2,2,2) -> 512ch, 2x all dims
/// up_blocks.2: ResNet3d x2 (512 channels)
/// up_blocks.3: Upsample Conv3d(512 -> 4096) + pixel_shuffle(2,2,2) -> 512ch, 2x all dims
/// up_blocks.4: ResNet3d x4 (512 channels)
/// up_blocks.5: Upsample Conv3d(512 -> 512) + pixel_shuffle(2,1,1) -> 256ch, 2x temporal
/// up_blocks.6: ResNet3d x6 (256 channels)
/// up_blocks.7: Upsample Conv3d(256 -> 512) + pixel_shuffle(1,2,2) -> 128ch, 2x spatial
/// up_blocks.8: ResNet3d x4 (128 channels)
/// conv_out: Conv3d(128 -> 48, 3x3x3)  [48 = 3 RGB * patch_size^2 with patch_size=4]
/// ```
///
/// Total upsample: 8x spatial (2*2*1*2), 8x temporal (2*2*2*1).
/// Pixel denormalization at the end restores original value range.
struct VaeDecoder3D {
    /// conv_in: Conv3d(128 -> 1024, kernel=3x3x3)
    conv_in_weight: Array,
    conv_in_bias: Option<Array>,
    /// conv_out: Conv3d(128 -> 48, kernel=3x3x3)
    conv_out_weight: Array,
    conv_out_bias: Option<Array>,
    /// Per-channel mean for latent denormalization.
    per_channel_mean: Option<Array>,
    /// Per-channel std for latent denormalization.
    per_channel_std: Option<Array>,

    // ── Structured decoder blocks ──
    /// up_blocks.0: 2 ResNet3d blocks at 1024 channels
    res_blocks_0: Vec<VaeResNet3dBlock>,
    /// up_blocks.1: upsample 1024->4096 with pixel_shuffle(2,2,2) -> 512ch
    upsample_1: VaeUpsampleBlock,
    /// up_blocks.2: 2 ResNet3d blocks at 512 channels
    res_blocks_2: Vec<VaeResNet3dBlock>,
    /// up_blocks.3: upsample 512->4096 with pixel_shuffle(2,2,2) -> 512ch
    upsample_3: VaeUpsampleBlock,
    /// up_blocks.4: 4 ResNet3d blocks at 512 channels
    res_blocks_4: Vec<VaeResNet3dBlock>,
    /// up_blocks.5: upsample 512->512 with pixel_shuffle(2,1,1) -> 256ch
    upsample_5: VaeUpsampleBlock,
    /// up_blocks.6: 6 ResNet3d blocks at 256 channels
    res_blocks_6: Vec<VaeResNet3dBlock>,
    /// up_blocks.7: upsample 256->512 with pixel_shuffle(1,2,2) -> 128ch
    upsample_7: VaeUpsampleBlock,
    /// up_blocks.8: 4 ResNet3d blocks at 128 channels
    res_blocks_8: Vec<VaeResNet3dBlock>,
}

impl VaeDecoder3D {
    fn load(tensors: &HashMap<String, Array>) -> Result<Self, InferenceError> {
        let pfx = "vae_decoder";

        let conv_in_weight = get_tensor(tensors, &format!("{pfx}.conv_in.conv.weight"))?;
        let conv_in_bias = tensors.get(&format!("{pfx}.conv_in.conv.bias")).cloned();
        let conv_out_weight = get_tensor(tensors, &format!("{pfx}.conv_out.conv.weight"))?;
        let conv_out_bias = tensors.get(&format!("{pfx}.conv_out.conv.bias")).cloned();

        let per_channel_mean = tensors
            .get(&format!("{pfx}.per_channel_statistics.mean"))
            .cloned();
        let per_channel_std = tensors
            .get(&format!("{pfx}.per_channel_statistics.std"))
            .cloned();

        // Load ResNet3d blocks
        let load_res_blocks =
            |block_idx: usize, count: usize| -> Result<Vec<VaeResNet3dBlock>, InferenceError> {
                let mut blocks = Vec::with_capacity(count);
                for i in 0..count {
                    blocks.push(VaeResNet3dBlock::load(
                        tensors,
                        &format!("{pfx}.up_blocks.{block_idx}.res_blocks.{i}"),
                    )?);
                }
                Ok(blocks)
            };

        let res_blocks_0 = load_res_blocks(0, 2)?;
        let upsample_1 = VaeUpsampleBlock::load(tensors, &format!("{pfx}.up_blocks.1"), (2, 2, 2))?;
        let res_blocks_2 = load_res_blocks(2, 2)?;
        let upsample_3 = VaeUpsampleBlock::load(tensors, &format!("{pfx}.up_blocks.3"), (2, 2, 2))?;
        let res_blocks_4 = load_res_blocks(4, 4)?;
        let upsample_5 = VaeUpsampleBlock::load(tensors, &format!("{pfx}.up_blocks.5"), (2, 1, 1))?;
        let res_blocks_6 = load_res_blocks(6, 6)?;
        let upsample_7 = VaeUpsampleBlock::load(tensors, &format!("{pfx}.up_blocks.7"), (1, 2, 2))?;
        let res_blocks_8 = load_res_blocks(8, 4)?;

        Ok(Self {
            conv_in_weight,
            conv_in_bias,
            conv_out_weight,
            conv_out_bias,
            per_channel_mean,
            per_channel_std,
            res_blocks_0,
            upsample_1,
            res_blocks_2,
            upsample_3,
            res_blocks_4,
            upsample_5,
            res_blocks_6,
            upsample_7,
            res_blocks_8,
        })
    }

    /// Decode latents to pixel-space video frames.
    ///
    /// Input: `[B, C, T, H, W]` where C=128 (latent channels).
    ///
    /// Pipeline:
    /// 1. Denormalize latents using per-channel statistics
    /// 2. conv_in (3D causal, 128->1024)
    /// 3. up_blocks.0: 2x ResNet3d at 1024ch
    /// 4. up_blocks.1: upsample (2,2,2) -> 512ch, 2x all dims
    /// 5. up_blocks.2: 2x ResNet3d at 512ch
    /// 6. up_blocks.3: upsample (2,2,2) -> 512ch, 2x all dims
    /// 7. up_blocks.4: 4x ResNet3d at 512ch
    /// 8. up_blocks.5: upsample (2,1,1) -> 256ch, 2x temporal
    /// 9. up_blocks.6: 6x ResNet3d at 256ch
    /// 10. up_blocks.7: upsample (1,2,2) -> 128ch, 2x spatial
    /// 11. up_blocks.8: 4x ResNet3d at 128ch
    /// 12. pixel_norm + SiLU + conv_out (3D causal, 128->48)
    /// 13. Unpack 48 channels -> RGB via patch_size=4 spatial unpacking
    /// 14. Clamp to [0, 1]
    ///
    /// Output: `[B, T_out, H_out, W_out, 3]` with values in [0, 1].
    fn decode(&self, latents: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // latents shape: [B, C, T, H, W] where C=128
        let shape = latents.shape();
        let batch = shape[0];

        // Convert from BCTHW to BTWHC (channels-last for MLX conv2d)
        let x = ops::transpose_axes(latents, &[0, 2, 3, 4, 1])?; // [B, T, H, W, C]

        // 1. Denormalize latents: x = x * std + mean (reverse the encoder normalization)
        let x = if let (Some(ref mean), Some(ref std_)) =
            (&self.per_channel_mean, &self.per_channel_std)
        {
            let mean = ops::reshape(mean, &[1, 1, 1, 1, mean.shape()[0]])?;
            let std_ = ops::reshape(std_, &[1, 1, 1, 1, std_.shape()[0]])?;
            ops::add(&ops::multiply(&x, &std_)?, &mean)?
        } else {
            x
        };

        // 2. conv_in: Conv3d(128 -> 1024, 3x3x3)
        let mut h = conv3d_causal(&x, &self.conv_in_weight, self.conv_in_bias.as_ref())?;

        // 3. up_blocks.0: 2x ResNet3d at 1024 channels
        for block in &self.res_blocks_0 {
            h = block.forward(&h)?;
        }

        // 4. up_blocks.1: upsample (2,2,2) -> 512ch
        h = self.upsample_1.forward(&h)?;

        // 5. up_blocks.2: 2x ResNet3d at 512 channels
        for block in &self.res_blocks_2 {
            h = block.forward(&h)?;
        }

        // 6. up_blocks.3: upsample (2,2,2) -> 512ch
        h = self.upsample_3.forward(&h)?;

        // 7. up_blocks.4: 4x ResNet3d at 512 channels
        for block in &self.res_blocks_4 {
            h = block.forward(&h)?;
        }

        // 8. up_blocks.5: upsample (2,1,1) -> 256ch, 2x temporal
        h = self.upsample_5.forward(&h)?;

        // 9. up_blocks.6: 6x ResNet3d at 256 channels
        for block in &self.res_blocks_6 {
            h = block.forward(&h)?;
        }

        // 10. up_blocks.7: upsample (1,2,2) -> 128ch, 2x spatial
        h = self.upsample_7.forward(&h)?;

        // 11. up_blocks.8: 4x ResNet3d at 128 channels
        for block in &self.res_blocks_8 {
            h = block.forward(&h)?;
        }

        // 12. Final pixel_norm + SiLU + conv_out
        let h = pixel_norm(&h)?;
        let h = nn::silu(&h)?;
        let h = conv3d_causal(&h, &self.conv_out_weight, self.conv_out_bias.as_ref())?;
        // h: [B, T_up, H_up, W_up, 48]

        // 13. Unpack 48 channels into RGB frames via spatial patch unpacking.
        // conv_out produces 48 = 3 * 4 * 4 channels. Upstream
        // `unpatchify_spatial` splits them as `(C, r_W, q_H)` — WIDTH
        // subpixels first, HEIGHT subpixels second. Swapping the order
        // (as a prior revision did) produces a visible checkerboard grid
        // because adjacent H/W subpixels get scrambled. The ltx-2-mlx docs
        // explicitly warn: "r (width) comes BEFORE q (height). Using
        // pixel_shuffle_3d for unpatchify swaps H/W sub-pixels and causes
        // checkerboard artifacts."
        let h_shape = h.shape();
        let (t_up, h_up, w_up) = (h_shape[1], h_shape[2], h_shape[3]);
        let ps: i32 = 4;
        // Split last axis into (C, r_W, q_H): axes (B, T, H, W, C, r_W, q_H).
        let out = ops::reshape(&h, &[batch, t_up, h_up, w_up, 3, ps, ps])?;
        // Transpose to (B, T, H, q_H, W, r_W, C) so the final reshape
        // correctly interleaves the H subpixels within H and W subpixels within W.
        let out = ops::transpose_axes(&out, &[0, 1, 2, 6, 3, 5, 4])?;
        // Merge spatial → (B, T, H*ps, W*ps, C).
        let out = ops::reshape(&out, &[batch, t_up, h_up * ps, w_up * ps, 3])?;

        // Upstream VAE outputs pixels in [-1, 1]. Convert to [0, 1] for the
        // caller (who scales to uint8): `(x + 1) * 0.5`, then clip to handle
        // slight overshoot from quantized matmul.
        let half = Array::from_f32(0.5);
        let one_a = Array::from_f32(1.0);
        let scaled = ops::multiply(&ops::add(&out, &one_a)?, &half)?;
        let zero = Array::from_f32(0.0);
        let out = ops::clip(&scaled, (&zero, &one_a))?;

        Ok(out)
    }
}

/// A downsample block: Conv3d followed by a 3D pixel unshuffle.
///
/// Mirror image of `VaeUpsampleBlock`. The convolution shrinks the channel
/// count, then the unshuffle folds the chosen temporal/spatial factors into
/// the channel axis, so the output carries the channel count expected by the
/// next encoder stage.
struct VaeDownsampleBlock {
    conv_weight: Array,
    conv_bias: Option<Array>,
    /// Pixel unshuffle factors (temporal, height, width).
    unshuffle_factors: (i32, i32, i32),
}

impl VaeDownsampleBlock {
    /// Load the convolution from `{prefix}.conv.conv.weight` (required) and
    /// `{prefix}.conv.conv.bias` (optional), remembering the unshuffle
    /// factors to apply after it.
    fn load(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        unshuffle_factors: (i32, i32, i32),
    ) -> Result<Self, InferenceError> {
        let conv_weight = get_tensor(tensors, &format!("{prefix}.conv.conv.weight"))?;
        let conv_bias = tensors.get(&format!("{prefix}.conv.conv.bias")).cloned();
        Ok(Self {
            conv_weight,
            conv_bias,
            unshuffle_factors,
        })
    }

    /// Forward: conv3d_causal -> pixel_unshuffle_3d
    fn forward(&self, x: &Array) -> Result<Array, mlx_rs::error::Exception> {
        let (factor_t, factor_h, factor_w) = self.unshuffle_factors;
        let convolved = conv3d_causal(x, &self.conv_weight, self.conv_bias.as_ref())?;
        pixel_unshuffle_3d(&convolved, factor_t, factor_h, factor_w)
    }
}

/// 3D Causal VAE encoder — symmetric inverse of `VaeDecoder3D`.
///
/// Architecture (from embedded_config.json `encoder_blocks`, mirrored):
///
/// ```text
/// pack input: pixel_unshuffle(1, 4, 4) on RGB  → 48 channels packed
/// conv_in: Conv3d(48 → 128, 3×3×3)
/// down_blocks.0: ResNet3d ×4  (128 channels)
/// down_blocks.1: Down Conv3d(128 →  64) + pixel_unshuffle(1, 2, 2) → 256ch, 2× spatial
/// down_blocks.2: ResNet3d ×6  (256 channels)
/// down_blocks.3: Down Conv3d(256 → 256) + pixel_unshuffle(2, 1, 1) → 512ch, 2× temporal
/// down_blocks.4: ResNet3d ×4  (512 channels)
/// down_blocks.5: Down Conv3d(512 → 128) + pixel_unshuffle(2, 2, 2) → 1024ch, 2× all
/// down_blocks.6: ResNet3d ×2  (1024 channels)
/// down_blocks.7: Down Conv3d(1024 → 128) + pixel_unshuffle(2, 2, 2) → 1024ch, 2× all
/// down_blocks.8: ResNet3d ×2  (1024 channels)
/// conv_out: Conv3d(1024 → 129, 3×3×3)  [take first 128 deterministic channels]
/// normalize: z = (x - mean_of_means) / std_of_means   (per channel)
/// ```
///
/// Block-level downsample: 8× spatial (pre-pack), 8× temporal.
/// Pixel → latent ratio including the 4×4 patch pack: 32× spatial, 8× temporal.
struct VaeEncoder3D {
    conv_in_weight: Array,
    conv_in_bias: Option<Array>,
    conv_out_weight: Array,
    conv_out_bias: Option<Array>,
    mean_of_means: Option<Array>,
    std_of_means: Option<Array>,

    res_blocks_0: Vec<VaeResNet3dBlock>,
    downsample_1: VaeDownsampleBlock,
    res_blocks_2: Vec<VaeResNet3dBlock>,
    downsample_3: VaeDownsampleBlock,
    res_blocks_4: Vec<VaeResNet3dBlock>,
    downsample_5: VaeDownsampleBlock,
    res_blocks_6: Vec<VaeResNet3dBlock>,
    downsample_7: VaeDownsampleBlock,
    res_blocks_8: Vec<VaeResNet3dBlock>,
}

impl VaeEncoder3D {
    fn load(tensors: &HashMap<String, Array>) -> Result<Self, InferenceError> {
        let pfx = "vae_encoder";

        let conv_in_weight = get_tensor(tensors, &format!("{pfx}.conv_in.conv.weight"))?;
        let conv_in_bias = tensors.get(&format!("{pfx}.conv_in.conv.bias")).cloned();
        let conv_out_weight = get_tensor(tensors, &format!("{pfx}.conv_out.conv.weight"))?;
        let conv_out_bias = tensors.get(&format!("{pfx}.conv_out.conv.bias")).cloned();

        let mean_of_means = tensors
            .get(&format!("{pfx}.per_channel_statistics._mean_of_means"))
            .cloned();
        let std_of_means = tensors
            .get(&format!("{pfx}.per_channel_statistics._std_of_means"))
            .cloned();

        let load_res_blocks =
            |block_idx: usize, count: usize| -> Result<Vec<VaeResNet3dBlock>, InferenceError> {
                let mut blocks = Vec::with_capacity(count);
                for i in 0..count {
                    blocks.push(VaeResNet3dBlock::load(
                        tensors,
                        &format!("{pfx}.down_blocks.{block_idx}.res_blocks.{i}"),
                    )?);
                }
                Ok(blocks)
            };

        let res_blocks_0 = load_res_blocks(0, 4)?;
        let downsample_1 =
            VaeDownsampleBlock::load(tensors, &format!("{pfx}.down_blocks.1"), (1, 2, 2))?;
        let res_blocks_2 = load_res_blocks(2, 6)?;
        let downsample_3 =
            VaeDownsampleBlock::load(tensors, &format!("{pfx}.down_blocks.3"), (2, 1, 1))?;
        let res_blocks_4 = load_res_blocks(4, 4)?;
        let downsample_5 =
            VaeDownsampleBlock::load(tensors, &format!("{pfx}.down_blocks.5"), (2, 2, 2))?;
        let res_blocks_6 = load_res_blocks(6, 2)?;
        let downsample_7 =
            VaeDownsampleBlock::load(tensors, &format!("{pfx}.down_blocks.7"), (2, 2, 2))?;
        let res_blocks_8 = load_res_blocks(8, 2)?;

        Ok(Self {
            conv_in_weight,
            conv_in_bias,
            conv_out_weight,
            conv_out_bias,
            mean_of_means,
            std_of_means,
            res_blocks_0,
            downsample_1,
            res_blocks_2,
            downsample_3,
            res_blocks_4,
            downsample_5,
            res_blocks_6,
            downsample_7,
            res_blocks_8,
        })
    }

    /// Encode pixel-space frames to the latent representation consumed by the LTX transformer.
    ///
    /// Input: `[B, T, H, W, 3]` with f32 values in `[0, 1]`.
    /// `T`, `H`, `W` must be divisible by 8, 32, 32 respectively so the downsampling
    /// pyramid produces integer dimensions.
    ///
    /// Output: `[B, C=128, T/8, H/32, W/32]` (BCTHW) — already normalized with the
    /// per-channel statistics consumed by the decoder.
    fn encode(&self, rgb: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Pack RGB 4×4 patches into 48 channels.
        // [B, T, H, W, 3] → [B, T, H/4, W/4, 48]
        let packed = pixel_unshuffle_3d(rgb, 1, 4, 4)?;

        let mut h = conv3d_causal(&packed, &self.conv_in_weight, self.conv_in_bias.as_ref())?;

        for block in &self.res_blocks_0 {
            h = block.forward(&h)?;
        }
        h = self.downsample_1.forward(&h)?;
        for block in &self.res_blocks_2 {
            h = block.forward(&h)?;
        }
        h = self.downsample_3.forward(&h)?;
        for block in &self.res_blocks_4 {
            h = block.forward(&h)?;
        }
        h = self.downsample_5.forward(&h)?;
        for block in &self.res_blocks_6 {
            h = block.forward(&h)?;
        }
        h = self.downsample_7.forward(&h)?;
        for block in &self.res_blocks_8 {
            h = block.forward(&h)?;
        }

        // Final norm + SiLU + conv_out (1024 → 129)
        let h = pixel_norm(&h)?;
        let h = nn::silu(&h)?;
        let h = conv3d_causal(&h, &self.conv_out_weight, self.conv_out_bias.as_ref())?;

        // Take first 128 channels (deterministic means); drop the auxiliary
        // 129th (logvar channel, unused at inference).
        // h shape: [B, T, H, W, 129] — slice last axis to 128.
        let latent = h.index((.., .., .., .., ..128));

        // Upstream `VideoEncoder.normalize_latent` applies
        //   (x - mean_of_means) / std_of_means
        // on the NTHWC tensor before transposing to BCTHW. The transformer
        // was trained on this normalized space, so the anchor / denoising
        // inputs must match it exactly. An earlier revision skipped this
        // because the stats looked large enough to "blow up" the output;
        // that was wrong — magnitude mismatch was exactly what made i2v
        // anchors bleed through as washed-out frames.
        let latent = if let (Some(m), Some(s)) = (&self.mean_of_means, &self.std_of_means) {
            let mean = ops::reshape(m, &[1, 1, 1, 1, -1])?;
            let std = ops::reshape(s, &[1, 1, 1, 1, -1])?;
            ops::divide(&ops::subtract(&latent, &mean)?, &std)?
        } else {
            latent
        };

        // Convert NTHWC → BCTHW to match what the transformer consumes.
        ops::transpose_axes(&latent, &[0, 4, 1, 2, 3])
    }
}

// ─── Audio VAE ────────────────────────────────────────────────────────────

/// 2D convolution for NHWC tensors with an optional additive bias (audio VAE).
///
/// Runs `conv2d` with the given `stride` and `padding`, then adds `bias`
/// to the result when one is supplied.
fn audio_conv2d_forward(
    input: &Array,
    weight: &Array,
    bias: Option<&Array>,
    stride: (i32, i32),
    padding: (i32, i32),
) -> Result<Array, mlx_rs::error::Exception> {
    let convolved = ops::conv2d(
        input,
        weight,
        stride,
        padding,
        None::<(i32, i32)>,
        None::<i32>,
    )?;
    match bias {
        Some(b) => ops::add(&convolved, b),
        None => Ok(convolved),
    }
}

/// Nearest-neighbor 2x upsample of an NHWC tensor along H and W (audio VAE).
///
/// Each row, then each column, is duplicated in place by inserting a unit
/// axis, concatenating the tensor with itself along it, and merging that
/// axis back into its neighbor.
fn audio_upsample_2x(x: &Array) -> Result<Array, mlx_rs::error::Exception> {
    let dims = x.shape();
    let (batch, rows, cols, channels) = (dims[0], dims[1], dims[2], dims[3]);

    // Rows: [B, H, 1, W, C] -> [B, H, 2, W, C] -> [B, 2H, W, C]
    let row_axis = ops::reshape(x, &[batch, rows, 1, cols, channels])?;
    let row_pairs = ops::concatenate_axis(&[&row_axis, &row_axis], 2)?;
    let tall = ops::reshape(&row_pairs, &[batch, rows * 2, cols, channels])?;

    // Columns: [B, 2H, W, 1, C] -> [B, 2H, W, 2, C] -> [B, 2H, 2W, C]
    let col_axis = ops::reshape(&tall, &[batch, rows * 2, cols, 1, channels])?;
    let col_pairs = ops::concatenate_axis(&[&col_axis, &col_axis], 3)?;
    ops::reshape(&col_pairs, &[batch, rows * 2, cols * 2, channels])
}

/// ResNet block for the audio VAE decoder.
///
/// Computes `skip(x) + conv2(SiLU(conv1(SiLU(x))))`, where `skip` is the
/// identity unless a 1x1 `nin_shortcut` convolution was loaded (used when
/// input and output channel counts differ).
struct AudioResNetBlock {
    conv1_weight: Array,
    conv1_bias: Option<Array>,
    conv2_weight: Array,
    conv2_bias: Option<Array>,
    /// Optional 1x1 convolution for channel mismatch on the skip path.
    nin_shortcut_weight: Option<Array>,
    nin_shortcut_bias: Option<Array>,
}

impl AudioResNetBlock {
    /// Load the block stored under `prefix`.
    ///
    /// `conv1` and `conv2` weights are required. Their biases and the whole
    /// `nin_shortcut` convolution are optional and become `None` when absent.
    fn load(tensors: &HashMap<String, Array>, prefix: &str) -> Result<Self, InferenceError> {
        let required = |name: &str| get_tensor(tensors, &format!("{prefix}.{name}"));
        let optional = |name: &str| tensors.get(&format!("{prefix}.{name}")).cloned();

        Ok(Self {
            conv1_weight: required("conv1.conv.weight")?,
            conv1_bias: optional("conv1.conv.bias"),
            conv2_weight: required("conv2.conv.weight")?,
            conv2_bias: optional("conv2.conv.bias"),
            nin_shortcut_weight: optional("nin_shortcut.conv.weight"),
            nin_shortcut_bias: optional("nin_shortcut.conv.bias"),
        })
    }

    /// Forward: two `SiLU -> conv (stride 1, padding 1)` stages on the main
    /// path, added to the (optionally projected) input.
    fn forward(&self, x: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Main-path convolution: stride 1, padding 1.
        let conv_padded = |input: &Array, weight: &Array, bias: Option<&Array>| {
            audio_conv2d_forward(input, weight, bias, (1, 1), (1, 1))
        };

        let activated = nn::silu(x)?;
        let hidden = conv_padded(&activated, &self.conv1_weight, self.conv1_bias.as_ref())?;
        let activated = nn::silu(&hidden)?;
        let residual = conv_padded(&activated, &self.conv2_weight, self.conv2_bias.as_ref())?;

        // Skip path: unpadded projection when a shortcut exists, else identity.
        let skip = match &self.nin_shortcut_weight {
            Some(shortcut) => audio_conv2d_forward(
                x,
                shortcut,
                self.nin_shortcut_bias.as_ref(),
                (1, 1),
                (0, 0),
            )?,
            None => x.clone(),
        };

        ops::add(&skip, &residual)
    }
}

/// Up-sampling block for the audio VAE decoder.
///
/// Holds a sequence of `AudioResNetBlock`s followed by an optional 2x
/// nearest-neighbor upsample + convolution.
struct AudioUpBlock {
    /// ResNet blocks, applied in order by `forward`.
    blocks: Vec<AudioResNetBlock>,
    /// Kernel of the convolution that follows `audio_upsample_2x`. When this
    /// is `None`, `forward` skips the upsample step entirely.
    upsample_conv_weight: Option<Array>,
    /// Optional bias for the upsample convolution.
    upsample_conv_bias: Option<Array>,
}

impl AudioUpBlock {
    /// Apply every ResNet block in order, then — only when an upsample kernel
    /// was loaded — `audio_upsample_2x` followed by the upsample convolution.
    fn forward(&self, x: &Array) -> Result<Array, mlx_rs::error::Exception> {
        let refined = self
            .blocks
            .iter()
            .try_fold(x.clone(), |state, block| block.forward(&state))?;

        match self.upsample_conv_weight.as_ref() {
            Some(kernel) => {
                let enlarged = audio_upsample_2x(&refined)?;
                let smoothed = audio_conv2d_forward(
                    &enlarged,
                    kernel,
                    self.upsample_conv_bias.as_ref(),
                    (1, 1),
                    (1, 1),
                )?;
                Ok(smoothed)
            }
            None => Ok(refined),
        }
    }
}

/// Audio VAE decoder for decoding audio latents to mel spectrograms.
///
/// Operates on 2D mel-spectrogram latents using standard Conv2d layers.
/// Weight tensors use the `audio_vae.decoder.*` prefix and are in MLX
/// channels-last format `[C_out, kH, kW, C_in]`.
///
/// Decoder pipeline (as run by `decode`):
///   NCHW→NHWC → conv_in → mid_block (2 ResNet blocks) → up_blocks (reversed)
///   → SiLU → conv_out → NHWC→NCHW
///
/// The per-channel denormalization statistics are only *stored* here;
/// `decode` does not apply them — callers do, on the patched tokens.
struct AudioVae {
    /// Kernel of the input convolution.
    conv_in_weight: Array,
    /// Optional bias of the input convolution.
    conv_in_bias: Option<Array>,
    /// First ResNet block of the mid stage (`mid.block_1`).
    mid_block_1: AudioResNetBlock,
    /// Second ResNet block of the mid stage (`mid.block_2`).
    mid_block_2: AudioResNetBlock,
    /// Up blocks in weight-file order (highest resolution last).
    /// Processed in reverse during decoding.
    up_blocks: Vec<AudioUpBlock>,
    /// Kernel of the output convolution.
    conv_out_weight: Array,
    /// Optional bias of the output convolution.
    conv_out_bias: Option<Array>,
    /// Per-channel mean for token denormalization, shape [128].
    /// Applied by callers to the 128-dim patched audio tokens BEFORE unpatchifying.
    denorm_mean: Array,
    /// Per-channel std for token denormalization, shape [128].
    denorm_std: Array,
}

impl AudioVae {
    fn load(tensors: &HashMap<String, Array>) -> Result<Self, InferenceError> {
        let pfx = "audio_vae.decoder";

        let conv_in_weight = get_tensor(tensors, &format!("{pfx}.conv_in.conv.weight"))?;
        let conv_in_bias = tensors.get(&format!("{pfx}.conv_in.conv.bias")).cloned();
        let conv_out_weight = get_tensor(tensors, &format!("{pfx}.conv_out.conv.weight"))?;
        let conv_out_bias = tensors.get(&format!("{pfx}.conv_out.conv.bias")).cloned();

        // Mid block: two ResNet blocks
        let mid_block_1 = AudioResNetBlock::load(tensors, &format!("{pfx}.mid.block_1"))?;
        let mid_block_2 = AudioResNetBlock::load(tensors, &format!("{pfx}.mid.block_2"))?;

        // Discover up blocks by scanning tensor keys for `audio_vae.decoder.up.N`
        let mut num_up_blocks = 0usize;
        for key in tensors.keys() {
            if let Some(rest) = key.strip_prefix(&format!("{pfx}.up.")) {
                if let Some(idx_str) = rest.split('.').next() {
                    if let Ok(idx) = idx_str.parse::<usize>() {
                        num_up_blocks = num_up_blocks.max(idx + 1);
                    }
                }
            }
        }

        let mut up_blocks = Vec::with_capacity(num_up_blocks);
        for i in 0..num_up_blocks {
            let bpfx = format!("{pfx}.up.{i}");

            // Discover how many resnet blocks in this up block
            let mut num_blocks = 0usize;
            for key in tensors.keys() {
                if let Some(rest) = key.strip_prefix(&format!("{bpfx}.block.")) {
                    if let Some(idx_str) = rest.split('.').next() {
                        if let Ok(idx) = idx_str.parse::<usize>() {
                            num_blocks = num_blocks.max(idx + 1);
                        }
                    }
                }
            }

            let mut blocks = Vec::with_capacity(num_blocks);
            for b in 0..num_blocks {
                blocks.push(AudioResNetBlock::load(
                    tensors,
                    &format!("{bpfx}.block.{b}"),
                )?);
            }

            let upsample_conv_weight = tensors
                .get(&format!("{bpfx}.upsample.conv.conv.weight"))
                .cloned();
            let upsample_conv_bias = tensors
                .get(&format!("{bpfx}.upsample.conv.conv.bias"))
                .cloned();

            up_blocks.push(AudioUpBlock {
                blocks,
                upsample_conv_weight,
                upsample_conv_bias,
            });
        }

        // Per-channel token statistics — required. A checkpoint that has the
        // decoder weights but lacks these is malformed; fail loudly at load.
        let denorm_mean = get_tensor(tensors, "audio_vae.per_channel_statistics._mean_of_means")?;
        let denorm_std = get_tensor(tensors, "audio_vae.per_channel_statistics._std_of_means")?;

        Ok(Self {
            conv_in_weight,
            conv_in_bias,
            mid_block_1,
            mid_block_2,
            up_blocks,
            conv_out_weight,
            conv_out_bias,
            denorm_mean,
            denorm_std,
        })
    }

    /// Decode audio VAE latents to a log-mel spectrogram.
    ///
    /// Input: `[B, C=8, T, F=16]` (NCHW VAE latents after the transformer
    /// output has been un-normalized and un-patchified upstream).
    /// Output: `[B, 2, T', 64]` (NCHW stereo log-mel, 64 mel bins).
    ///
    /// Per-channel statistics `[128]` apply to the *patchified* token tensor
    /// (`[B, T, 128]`), not to the post-conv_out spectrogram — that
    /// normalization is performed by the caller before unpatchifying to the
    /// shape this function consumes.
    fn decode(&self, latents: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Convert from NCHW to NHWC for MLX conv2d
        let x = ops::transpose_axes(latents, &[0, 2, 3, 1])?;

        let mut h = audio_conv2d_forward(
            &x,
            &self.conv_in_weight,
            self.conv_in_bias.as_ref(),
            (1, 1),
            (1, 1),
        )?;

        h = self.mid_block_1.forward(&h)?;
        h = self.mid_block_2.forward(&h)?;

        for block in self.up_blocks.iter().rev() {
            h = block.forward(&h)?;
        }

        h = nn::silu(&h)?;
        h = audio_conv2d_forward(
            &h,
            &self.conv_out_weight,
            self.conv_out_bias.as_ref(),
            (1, 1),
            (1, 1),
        )?;

        // Convert back to NCHW: [B, H, W, C] -> [B, C, H, W]
        ops::transpose_axes(&h, &[0, 3, 1, 2])
    }
}

// ─── Vocoder ──────────────────────────────────────────────────────────────

/// Channels-last 1D convolution with an optional bias.
///
/// `input` is `[B, T, C_in]` and `weight` is `[C_out, K, C_in]` (the MLX
/// channels-last layout for 1D kernels). `stride`, `padding` and `dilation`
/// are forwarded unchanged to `ops::conv1d`; no grouping is requested.
/// When `bias` is given (`[C_out]`) it is broadcast-added to the result.
fn vocoder_conv1d(
    input: &Array,
    weight: &Array,
    bias: Option<&Array>,
    stride: i32,
    padding: i32,
    dilation: i32,
) -> Result<Array, mlx_rs::error::Exception> {
    let convolved = ops::conv1d(input, weight, stride, padding, dilation, None::<i32>)?;
    match bias {
        // [C_out] broadcasts across [B, T_out, C_out].
        Some(offset) => ops::add(&convolved, offset),
        None => Ok(convolved),
    }
}

/// Channels-last 1D transposed convolution with an optional bias.
///
/// `input` is `[B, T, C_in]` and `weight` is `[C_out, K, C_in]` (MLX
/// channels-last convention). Only `stride` and `padding` are set; the three
/// remaining optional arguments of `ops::conv_transpose1d` are left at their
/// defaults. When `bias` is given it is broadcast-added to the result.
fn vocoder_conv_transpose1d(
    input: &Array,
    weight: &Array,
    bias: Option<&Array>,
    stride: i32,
    padding: i32,
) -> Result<Array, mlx_rs::error::Exception> {
    let expanded = ops::conv_transpose1d(
        input,
        weight,
        stride,
        padding,
        None::<i32>,
        None::<i32>,
        None::<i32>,
    )?;
    match bias {
        Some(offset) => ops::add(&expanded, offset),
        None => Ok(expanded),
    }
}

/// Snake activation: `x + scale * sin^2(alpha * x)`.
///
/// `alpha` is a learned per-channel parameter (`[C]`, broadcast over
/// `[B, T, C]`). The scale on the periodic term is `beta` when it is
/// supplied and `1 / alpha` otherwise.
///
/// NOTE(review): the reference BigVGAN `SnakeBeta` divides the sine term by
/// beta (and can store alpha/beta in log scale), whereas this function
/// multiplies by `beta` exactly as loaded — confirm the checkpoint stores the
/// parameters in the form this code expects.
fn snake_activation(
    x: &Array,
    alpha: &Array,
    beta: Option<&Array>,
) -> Result<Array, mlx_rs::error::Exception> {
    let phase = ops::multiply(x, alpha)?;
    let sine = ops::sin(&phase)?;
    let sine_squared = ops::multiply(&sine, &sine)?;

    let periodic = match beta {
        Some(gain) => ops::multiply(&sine_squared, gain)?,
        None => ops::divide(&sine_squared, alpha)?,
    };

    ops::add(x, &periodic)
}

/// A single residual block in the BigVGAN vocoder.
///
/// Each block contains multiple pairs of (activation, conv) layers with
/// residual connections. The dilated convolutions capture multi-scale features.
struct VocoderResBlock {
    /// One entry per (act1, conv1, act2, conv2) group, applied in order.
    /// Each entry runs act1 -> conv1 (dilated) -> act2 -> conv2 and then adds
    /// the entry's own input back (residual skip).
    layers: Vec<VocoderResLayer>,
}

/// Parameters of one residual step inside a `VocoderResBlock`.
///
/// Consumed by `VocoderResBlock::forward` as
/// snake(alpha1, beta1) -> conv1 -> snake(alpha2, beta2) -> conv2 -> + input.
struct VocoderResLayer {
    /// Snake `alpha` for the activation in front of `conv1`.
    alpha1: Array,
    /// Optional snake `beta` for the activation in front of `conv1`.
    beta1: Option<Array>,
    /// Kernel of the first (dilated) convolution; axis 1 is the kernel size.
    conv1_weight: Array,
    /// Optional bias of the first convolution.
    conv1_bias: Option<Array>,
    /// Dilation of the first convolution; also scales its padding.
    conv1_dilation: i32,
    /// Snake `alpha` for the activation in front of `conv2`.
    alpha2: Array,
    /// Optional snake `beta` for the activation in front of `conv2`.
    beta2: Option<Array>,
    /// Kernel of the second convolution (run with dilation 1).
    conv2_weight: Array,
    /// Optional bias of the second convolution.
    conv2_bias: Option<Array>,
}

impl VocoderResBlock {
    /// Run every layer in sequence, each one as a residual update:
    /// `state = conv2(snake(conv1(snake(state)))) + state`.
    ///
    /// Both convolutions use stride 1. Padding is derived from the kernel
    /// size (axis 1 of the weight): `dilation * (k - 1) / 2` for the dilated
    /// first convolution and `(k - 1) / 2` for the second.
    fn forward(&self, input: &Array) -> Result<Array, mlx_rs::error::Exception> {
        let mut state = input.clone();

        for layer in self.layers.iter() {
            // First half: snake activation, then the dilated convolution.
            let activated = snake_activation(&state, &layer.alpha1, layer.beta1.as_ref())?;
            let kernel_a = layer.conv1_weight.shape()[1];
            let dilated = vocoder_conv1d(
                &activated,
                &layer.conv1_weight,
                layer.conv1_bias.as_ref(),
                1,
                layer.conv1_dilation * (kernel_a - 1) / 2,
                layer.conv1_dilation,
            )?;

            // Second half: snake activation, then the plain convolution.
            let reactivated = snake_activation(&dilated, &layer.alpha2, layer.beta2.as_ref())?;
            let kernel_b = layer.conv2_weight.shape()[1];
            let branch = vocoder_conv1d(
                &reactivated,
                &layer.conv2_weight,
                layer.conv2_bias.as_ref(),
                1,
                (kernel_b - 1) / 2,
                1,
            )?;

            // Residual skip around the whole layer.
            state = ops::add(&branch, &state)?;
        }

        Ok(state)
    }
}

/// BigVGAN-style generator: conv_pre -> (upsample + resblocks)* -> act_post -> conv_post.
struct VocoderGenerator {
    /// Kernel of the input convolution; axis 1 is the kernel size.
    conv_pre_weight: Array,
    /// Optional bias of the input convolution.
    conv_pre_bias: Option<Array>,
    /// Upsample stages: (weight, bias, stride) for transposed conv1d.
    ups: Vec<(Array, Option<Array>, i32)>,
    /// Residual blocks grouped by upsample stage.
    /// resblock_groups[i] contains the resblocks applied after ups[i];
    /// their outputs are averaged. A missing or empty group is skipped.
    resblock_groups: Vec<Vec<VocoderResBlock>>,
    /// Snake `alpha` of the final activation.
    act_post_alpha: Array,
    /// Optional snake `beta` of the final activation.
    act_post_beta: Option<Array>,
    /// Kernel of the output convolution; axis 1 is the kernel size.
    conv_post_weight: Array,
    /// Optional bias of the output convolution.
    conv_post_bias: Option<Array>,
}

impl VocoderGenerator {
    /// Run the generator on a channels-last `[B, T, C_in]` input.
    ///
    /// Stages: `conv_pre`, then for each upsample stage a transposed
    /// convolution followed by the mean over that stage's residual blocks
    /// (all fed the same upsampled tensor), then the final snake activation,
    /// `conv_post`, and `tanh` to bound the waveform to `[-1, 1]`.
    fn forward(&self, input: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Padding for the stride-1, dilation-1 convolutions: (kernel - 1) / 2.
        let centered_padding = |weight: &Array| (weight.shape()[1] - 1) / 2;

        let mut hidden = vocoder_conv1d(
            input,
            &self.conv_pre_weight,
            self.conv_pre_bias.as_ref(),
            1,
            centered_padding(&self.conv_pre_weight),
            1,
        )?;

        for (stage, (kernel, bias, stride)) in self.ups.iter().enumerate() {
            // Transposed-conv padding: (kernel_size - stride) / 2.
            let padding = (kernel.shape()[1] - *stride) / 2;
            hidden = vocoder_conv_transpose1d(&hidden, kernel, bias.as_ref(), *stride, padding)?;

            // Average the outputs of this stage's resblocks; a stage with no
            // (or an empty) group leaves the tensor untouched.
            let group = self
                .resblock_groups
                .get(stage)
                .and_then(|blocks| blocks.split_first());
            if let Some((head, tail)) = group {
                let mut total = head.forward(&hidden)?;
                for resblock in tail {
                    let contribution = resblock.forward(&hidden)?;
                    total = ops::add(&total, &contribution)?;
                }
                let count = Array::from_f32((tail.len() + 1) as f32);
                hidden = ops::divide(&total, &count)?;
            }
        }

        // Output head: snake activation, projection, then tanh.
        let activated = snake_activation(&hidden, &self.act_post_alpha, self.act_post_beta.as_ref())?;
        let projected = vocoder_conv1d(
            &activated,
            &self.conv_post_weight,
            self.conv_post_bias.as_ref(),
            1,
            centered_padding(&self.conv_post_weight),
            1,
        )?;
        ops::tanh(&projected)
    }
}

/// BigVGAN-style vocoder for converting audio VAE latents to waveform.
///
/// Architecture: main generator path with transposed-conv upsampling and
/// snake-activation residual blocks. Produces stereo audio (2 output channels).
struct Vocoder {
    /// The main generator, loaded from tensors under the `vocoder` prefix.
    generator: VocoderGenerator,
}

impl Vocoder {
    fn load(tensors: &HashMap<String, Array>) -> Result<Self, InferenceError> {
        let prefix = "vocoder";
        let gen = Self::load_generator(tensors, prefix)?;
        Ok(Self { generator: gen })
    }

    /// Discover and load a generator (main or BWE) from tensor keys.
    fn load_generator(
        tensors: &HashMap<String, Array>,
        prefix: &str,
    ) -> Result<VocoderGenerator, InferenceError> {
        // conv_pre
        let conv_pre_weight = get_tensor(tensors, &format!("{prefix}.conv_pre.weight"))?;
        let conv_pre_bias = tensors.get(&format!("{prefix}.conv_pre.bias")).cloned();

        // Discover upsample stages by scanning keys for ups.N.weight
        let mut num_ups = 0usize;
        loop {
            let key = format!("{prefix}.ups.{num_ups}.weight");
            if tensors.contains_key(&key) {
                num_ups += 1;
            } else {
                break;
            }
        }
        info!(num_ups, prefix, "vocoder: discovered upsample stages");

        let mut ups = Vec::with_capacity(num_ups);
        for i in 0..num_ups {
            let w = get_tensor(tensors, &format!("{prefix}.ups.{i}.weight"))?;
            let b = tensors.get(&format!("{prefix}.ups.{i}.bias")).cloned();
            // Infer stride from kernel size: BigVGAN uses stride = kernel_size / 2
            // but we derive it from the weight shape change between stages.
            // Common pattern: k=11 -> stride=5 or 6, k=12 -> stride=6.
            // Use kernel_size / 2 as a reasonable default; the exact stride can be
            // determined from the channel reduction pattern.
            let k = w.shape()[1];
            // BigVGAN upsample: stride = kernel_size / 2 (rounded)
            let stride = k / 2;
            if stride < 1 {
                return Err(InferenceError::InferenceFailed(format!(
                    "vocoder ups.{i} has invalid kernel size {k}"
                )));
            }
            ups.push((w, b, stride));
        }

        // Discover resblocks by scanning keys for resblocks.N.convs1.0.weight
        let mut num_resblocks = 0usize;
        loop {
            let key = format!("{prefix}.resblocks.{num_resblocks}.convs1.0.weight");
            if tensors.contains_key(&key) {
                num_resblocks += 1;
            } else {
                break;
            }
        }
        info!(num_resblocks, prefix, "vocoder: discovered resblocks");

        // Group resblocks by upsample stage.
        // BigVGAN interleaves: after each upsample, apply `num_kernels` resblocks.
        // num_kernels = num_resblocks / num_ups (typically 3 resblocks per stage).
        let blocks_per_stage = if num_ups > 0 && num_resblocks > 0 {
            num_resblocks / num_ups
        } else {
            num_resblocks
        };

        let mut resblock_groups: Vec<Vec<VocoderResBlock>> = Vec::new();
        let mut rb_idx = 0usize;
        for _stage in 0..num_ups {
            let mut stage_blocks = Vec::new();
            for _ in 0..blocks_per_stage {
                if rb_idx >= num_resblocks {
                    break;
                }
                let block = Self::load_resblock(tensors, prefix, rb_idx)?;
                stage_blocks.push(block);
                rb_idx += 1;
            }
            resblock_groups.push(stage_blocks);
        }
        // Any remaining resblocks get appended to the last stage
        if rb_idx < num_resblocks && !resblock_groups.is_empty() {
            let last = resblock_groups.last_mut().unwrap();
            while rb_idx < num_resblocks {
                let block = Self::load_resblock(tensors, prefix, rb_idx)?;
                last.push(block);
                rb_idx += 1;
            }
        }

        // act_post
        let act_post_alpha = get_tensor(tensors, &format!("{prefix}.act_post.act.alpha"))?;
        let act_post_beta = tensors.get(&format!("{prefix}.act_post.act.beta")).cloned();

        // conv_post
        let conv_post_weight = get_tensor(tensors, &format!("{prefix}.conv_post.weight"))?;
        let conv_post_bias = tensors.get(&format!("{prefix}.conv_post.bias")).cloned();

        Ok(VocoderGenerator {
            conv_pre_weight,
            conv_pre_bias,
            ups,
            resblock_groups,
            act_post_alpha,
            act_post_beta,
            conv_post_weight,
            conv_post_bias,
        })
    }

    /// Load a single residual block from tensors.
    fn load_resblock(
        tensors: &HashMap<String, Array>,
        prefix: &str,
        idx: usize,
    ) -> Result<VocoderResBlock, InferenceError> {
        // Discover number of conv layers in this resblock
        let mut num_layers = 0usize;
        loop {
            let key = format!("{prefix}.resblocks.{idx}.convs1.{num_layers}.weight");
            if tensors.contains_key(&key) {
                num_layers += 1;
            } else {
                break;
            }
        }

        // BigVGAN dilations pattern: [1, 3, 5] for the 3 layers in each resblock
        let dilations = [1, 3, 5, 7, 11, 13];

        let mut layers = Vec::with_capacity(num_layers);
        for l in 0..num_layers {
            let alpha1 = get_tensor(
                tensors,
                &format!("{prefix}.resblocks.{idx}.acts1.{l}.act.alpha"),
            )
            .or_else(|_| {
                // Fallback: if no per-layer activation, use ones
                Array::ones::<f32>(&[1]).map_err(|e| InferenceError::InferenceFailed(e.to_string()))
            })?;
            let beta1 = tensors
                .get(&format!("{prefix}.resblocks.{idx}.acts1.{l}.act.beta"))
                .cloned();
            let conv1_weight = get_tensor(
                tensors,
                &format!("{prefix}.resblocks.{idx}.convs1.{l}.weight"),
            )?;
            let conv1_bias = tensors
                .get(&format!("{prefix}.resblocks.{idx}.convs1.{l}.bias"))
                .cloned();

            let alpha2 = get_tensor(
                tensors,
                &format!("{prefix}.resblocks.{idx}.acts2.{l}.act.alpha"),
            )
            .or_else(|_| {
                Array::ones::<f32>(&[1]).map_err(|e| InferenceError::InferenceFailed(e.to_string()))
            })?;
            let beta2 = tensors
                .get(&format!("{prefix}.resblocks.{idx}.acts2.{l}.act.beta"))
                .cloned();
            let conv2_weight = get_tensor(
                tensors,
                &format!("{prefix}.resblocks.{idx}.convs2.{l}.weight"),
            )?;
            let conv2_bias = tensors
                .get(&format!("{prefix}.resblocks.{idx}.convs2.{l}.bias"))
                .cloned();

            let dilation = if l < dilations.len() { dilations[l] } else { 1 };

            layers.push(VocoderResLayer {
                alpha1,
                beta1,
                conv1_weight,
                conv1_bias,
                conv1_dilation: dilation,
                alpha2,
                beta2,
                conv2_weight,
                conv2_bias,
            });
        }

        Ok(VocoderResBlock { layers })
    }

    /// Convert audio VAE latent features to waveform.
    ///
    /// Input: `[B, C, T]` (channels-first from audio VAE output).
    /// Output: `[B, 2, T_wav]` (stereo waveform, channels-first).
    fn forward(&self, audio: &Array) -> Result<Array, mlx_rs::error::Exception> {
        let shape = audio.shape();
        if shape.len() < 2 {
            return Array::zeros::<f32>(&[1, 2, 1]);
        }

        // Convert from channels-first [B, C, T] to channels-last [B, T, C] for MLX conv1d
        let x = if shape.len() == 3 {
            ops::transpose_axes(audio, &[0, 2, 1])?
        } else {
            // [C, T] -> [1, T, C]
            let expanded = ops::expand_dims(audio, 0)?;
            ops::transpose_axes(&expanded, &[0, 2, 1])?
        };

        // Run main generator
        let wav = self.generator.forward(&x)?;

        // Convert back to channels-first [B, T_wav, C_out] -> [B, C_out, T_wav]
        ops::transpose_axes(&wav, &[0, 2, 1])
    }
}

// ─── Rectified Flow Scheduler ─────────────────────────────────────────────

/// Rectified flow scheduler for LTX-2.3.
///
/// LTX uses rectified flow with a sigma schedule. Upstream exposes:
///   - DISTILLED_SIGMAS: 9-entry baked schedule for the 8-step distilled model
///   - ltx2_schedule(): dynamic token-count-dependent schedule for full model
///
/// `timesteps` holds `num_inference_steps + 1` sigmas (includes terminal 0.0),
/// so step_idx → (sigmas[i], sigmas[i+1]) pairs drive the Euler update
///   d = (x - x0) / sigma ≡ v (since x0 = x - sigma * v)
///   x_next = x + (sigma_next - sigma) * v
pub struct RectifiedFlowScheduler {
    /// Number of denoising steps, stored exactly as passed to the constructor.
    pub num_inference_steps: usize,
    /// Sigma schedule, in the order the steps consume it. `step` indexes
    /// this directly with its `step_index` argument.
    pub timesteps: Vec<f32>,
}

/// Upstream 8-step distilled sigma schedule (9 entries incl. terminal 0.0).
/// Source: ltx-pipelines-mlx/scheduler.py DISTILLED_SIGMAS.
///
/// Used verbatim by the `RectifiedFlowScheduler` constructors whenever
/// exactly 8 inference steps are requested.
const DISTILLED_SIGMAS_8: [f32; 9] = [
    1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0,
];

/// Token count at which the sigma shift equals `base_shift`.
const BASE_SHIFT_ANCHOR: f32 = 1024.0;
/// Token count at which the sigma shift equals `max_shift`.
const MAX_SHIFT_ANCHOR: f32 = 4096.0;

/// Dynamic schedule for non-distilled models. Port of ltx2_schedule().
///
/// `steps` is the number of denoising steps and `num_tokens` the number of
/// video latent tokens (it controls the sigma shift, linearly interpolated /
/// extrapolated between the two anchors above).
///
/// Returns `steps + 1` sigmas that start at `1.0` and end with an exact
/// terminal `0.0`. The non-zero entries are first shifted with
/// `e^s / (e^s + (1/sigma - 1))` and then stretched so that the last
/// non-zero sigma lands on `terminal` (0.1).
///
/// Degenerate inputs:
/// - `steps == 0` returns `[1.0]` (a one-point 1→0 linspace). Without the
///   guard the linear spacing would compute `0 / 0` and yield `[NaN]`.
/// - `steps == 1` returns `[1.0, 0.0]`; the stretch is skipped because its
///   scale factor would be zero.
pub fn ltx2_schedule(steps: usize, num_tokens: usize) -> Vec<f32> {
    let max_shift = 2.05f32;
    let base_shift = 0.95f32;
    let terminal = 0.1f32;

    // A zero-step schedule has a single point; bail out before dividing by
    // `steps` below.
    if steps == 0 {
        return vec![1.0];
    }

    // Linearly spaced from 1.0 to 0.0 with (steps+1) points.
    let mut sigmas: Vec<f32> = (0..=steps)
        .map(|i| 1.0 - (i as f32 / steps as f32))
        .collect();

    // Straight line through (BASE_SHIFT_ANCHOR, base_shift) and
    // (MAX_SHIFT_ANCHOR, max_shift), evaluated at `num_tokens`.
    let mm = (max_shift - base_shift) / (MAX_SHIFT_ANCHOR - BASE_SHIFT_ANCHOR);
    let b = base_shift - mm * BASE_SHIFT_ANCHOR;
    let sigma_shift = num_tokens as f32 * mm + b;
    let exp_s = sigma_shift.exp();

    // Shift non-zero sigmas; avoid 1/0 for the terminal zero entry.
    for s in sigmas.iter_mut() {
        if *s != 0.0 {
            *s = exp_s / (exp_s + (1.0 / *s - 1.0));
        }
    }

    // Stretch so the last non-zero sigma maps to `terminal`.
    let last_nonzero_idx = sigmas
        .iter()
        .rposition(|&s| s != 0.0)
        .unwrap_or(sigmas.len() - 1);
    let one_minus_last = 1.0 - sigmas[last_nonzero_idx];
    let scale = one_minus_last / (1.0 - terminal);
    // scale == 0 means the last non-zero sigma is exactly 1.0 (single-step
    // schedule); stretching would divide by zero, so leave the values as-is.
    if scale != 0.0 {
        for s in sigmas.iter_mut() {
            if *s != 0.0 {
                *s = 1.0 - (1.0 - *s) / scale;
            }
        }
    }
    sigmas
}

impl RectifiedFlowScheduler {
    /// Build scheduler with `num_inference_steps` denoising steps.
    ///
    /// `num_tokens` is the video latent token count (for the dynamic
    /// schedule). When exactly 8 steps are requested the baked distilled
    /// schedule (`DISTILLED_SIGMAS_8`) is used and `num_tokens` is ignored;
    /// any other step count goes through `ltx2_schedule`.
    pub fn new_with_tokens(num_inference_steps: usize, num_tokens: usize) -> Self {
        let timesteps = match num_inference_steps {
            8 => DISTILLED_SIGMAS_8.to_vec(),
            steps => ltx2_schedule(steps, num_tokens),
        };
        Self {
            num_inference_steps,
            timesteps,
        }
    }

    /// Legacy constructor for callers that do not know the token count.
    ///
    /// Equivalent to `new_with_tokens(num_inference_steps, MAX_SHIFT_ANCHOR)`:
    /// the distilled schedule at 8 steps, otherwise the dynamic
    /// `ltx2_schedule` evaluated at the default anchor token count (this is
    /// *not* a plain linear 1→0 schedule). Prefer `new_with_tokens` when you
    /// know the token count.
    pub fn new(num_inference_steps: usize) -> Self {
        Self::new_with_tokens(num_inference_steps, MAX_SHIFT_ANCHOR as usize)
    }

    /// Perform one rectified flow step using sigma-based Euler update.
    ///
    /// Upstream (samplers.py `euler_step`):
    ///   x0 = x - sigma * v   (X0Model wrapper)
    ///   d  = (x - x0) / sigma = v
    ///   x_next = x + (sigma_next - sigma) * d
    ///
    /// Which reduces to `x + dt * v` with `dt = sigma_next - sigma`.
    /// The sigma *schedule* (not this formula) is what matters for parity.
    ///
    /// `sigma` is `timesteps[step_index]`; `sigma_next` is the following
    /// entry, or `0.0` when `step_index` is the last entry.
    ///
    /// # Panics
    /// Panics if `step_index >= self.timesteps.len()`.
    pub fn step(
        &self,
        velocity: &Array,
        step_index: usize,
        sample: &Array,
    ) -> Result<Array, mlx_rs::error::Exception> {
        let sigma = self.timesteps[step_index];
        let sigma_next = self
            .timesteps
            .get(step_index + 1)
            .copied()
            .unwrap_or(0.0);
        let dt = Array::from_f32(sigma_next - sigma);
        ops::add(sample, &ops::multiply(velocity, &dt)?)
    }

    /// Create initial random noise latents.
    ///
    /// Matches upstream `create_initial_state` in latent_cond.py:
    ///   mx.random.seed(seed)
    ///   noise = mx.random.normal(shape).astype(mx.bfloat16)
    ///
    /// The global MLX RNG is seeded (rather than passing an explicit key to
    /// `normal`) on purpose: an explicit `random::key(seed)` is derived
    /// differently from the seed-global path and produced different noise
    /// for the same seed, breaking parity from token 0 onward. As a side
    /// effect this call resets the process-wide MLX random state.
    pub fn init_noise(&self, shape: &[i32], seed: u64) -> Result<Array, mlx_rs::error::Exception> {
        mlx_rs::random::seed(seed)?;
        let noise = mlx_rs::random::normal::<f32>(shape, None, None, None)?;
        noise.as_dtype(mlx_rs::Dtype::Bfloat16)
    }
}

// ─── Sinusoidal Timestep Encoding ─────────────────────────────────────────

/// Sinusoidal embedding of a single scalar timestep, returned as `[1, dim]`.
///
/// Mirrors `timestep_embedding_tensor`: max_period = 10000 and the layout is
/// `[cos, sin]` — the first `dim / 2` entries are cosines, the next `dim / 2`
/// are sines. When `dim` is odd the final entry is left at zero.
fn timestep_embedding(timestep: f32, dim: usize) -> Result<Array, mlx_rs::error::Exception> {
    let half = dim / 2;
    let ln_max_period = 10_000_f32.ln();

    let mut values = vec![0.0f32; dim];
    {
        // Front `half` slots take the cosines, the following `half` the sines.
        let (cos_part, sin_part) = values.split_at_mut(half);
        for (i, (cos_slot, sin_slot)) in cos_part.iter_mut().zip(sin_part.iter_mut()).enumerate()
        {
            let freq = (-(i as f32) / half as f32 * ln_max_period).exp();
            let angle = timestep * freq;
            *cos_slot = angle.cos();
            *sin_slot = angle.sin();
        }
    }

    Ok(Array::from_slice(&values, &[1, dim as i32]))
}

/// Sinusoidal timestep embedding for a tensor of timesteps.
///
/// Takes `timestep` of shape `[...]` and returns `[..., dim]`. This is the
/// variant used for per-token timesteps in i2v / video-extension modes,
/// where conditioned positions stay at t=0 while the un-conditioned ones
/// carry the full scheduler sigma.
///
/// Follows upstream `get_timestep_embedding(flip_sin_to_cos=True,
/// max_period=10000)`:
/// - frequencies are `exp(-ln(10000) * i / half)` for `i in 0..half`
///   (an earlier revision used `LN_2 * 10`, i.e. base 1024 — the wrong band);
/// - the output is laid out `[cos, sin]`, the order the consumers' learned
///   weights expect.
fn timestep_embedding_tensor(
    timestep: &Array,
    dim: usize,
) -> Result<Array, mlx_rs::error::Exception> {
    let half = dim / 2;
    let ln_max_period = 10_000_f32.ln();

    let mut table = Vec::with_capacity(half);
    for i in 0..half {
        table.push((-(i as f32) / half as f32 * ln_max_period).exp());
    }
    let frequencies = Array::from_slice(&table, &[half as i32]);

    // [...] -> [..., 1], then broadcast against [half] to get [..., half].
    let column = ops::expand_dims(timestep, -1)?;
    let angles = ops::multiply(&column, &frequencies)?;

    let sines = ops::sin(&angles)?;
    let cosines = ops::cos(&angles)?;
    // Cosines first, sines second, joined on the last axis -> [..., dim].
    ops::concatenate_axis(&[&cosines, &sines], -1)
}

// ─── LtxBackend ───────────────────────────────────────────────────────────

/// LTX-2.3 generation backend for Apple Silicon via MLX.
///
/// Supports all generation modes:
/// - **Text-to-video (t2v)**: Generate video from text prompt
/// - **Image-to-video (i2v)**: Generate video from text + reference image
/// - **Audio-video**: Generate synchronized audio+video from text
/// - **Video extension/retake**: Extend or regenerate video segments
pub struct LtxBackend {
    /// Transformer with video + audio pathways.
    transformer: LtxTransformer,
    /// Text embedding connector (video + audio).
    connector: TextEmbeddingConnector,
    /// 3D causal VAE decoder.
    vae: VaeDecoder3D,
    /// Optional VAE encoder for image-to-video / extension modes.
    /// Only present when `vae_encoder.safetensors` is available in the checkpoint.
    vae_encoder: Option<VaeEncoder3D>,
    /// Audio VAE decoder.
    audio_vae: AudioVae,
    /// Vocoder producing the output waveform.
    vocoder: Vocoder,
    /// Model configuration the components were loaded with.
    config: LtxConfig,
    /// Optional T5 text encoder (shared from Flux model weights).
    t5_encoder: Option<super::mlx_flux::T5TextEncoder>,
    /// Optional T5 tokenizer.
    t5_tokenizer: Option<tokenizers::Tokenizer>,
    /// Optional Gemma 3 12B text encoder — the model LTX-2.3 was actually
    /// trained with. When present, produces the [1, T, 188160] per-token
    /// feature tensor that feeds the connector's aggregate_embed layer.
    gemma_encoder: Option<super::mlx_gemma3::Gemma3TextEncoder>,
}

// SAFETY: LtxBackend is only accessed through RwLock in InferenceEngine.
// NOTE(review): a RwLock still hands out concurrent shared references to
// readers, so `Sync` also relies on every field being safe to read from
// several threads at once — confirm this holds for the MLX handles stored here.
unsafe impl Send for LtxBackend {}
unsafe impl Sync for LtxBackend {}

impl LtxBackend {
    /// Load the FULL LTX-2.3 model from a directory containing safetensors weights.
    ///
    /// Required weight files:
    /// - `transformer-distilled.safetensors` — transformer blocks (4-bit quantized)
    /// - `connector.safetensors` — text embedding connector (BF16)
    /// - `vae_decoder.safetensors` — 3D causal VAE decoder (BF16)
    /// - `audio_vae.safetensors` — audio VAE (BF16)
    /// - `vocoder.safetensors` — vocoder for audio output (BF16)
    pub fn load(model_dir: &Path) -> Result<Self, InferenceError> {
        let config = LtxConfig::default();

        info!(
            hidden = config.hidden_dim,
            layers = config.num_layers,
            heads = config.num_heads,
            audio_hidden = config.audio_hidden_dim,
            audio_heads = config.audio_heads,
            "loading FULL LTX-2.3 model via MLX (video + audio)"
        );

        // Set default device
        #[cfg(feature = "mlx-metal")]
        let default_device = mlx_rs::Device::gpu();
        #[cfg(not(feature = "mlx-metal"))]
        let default_device = mlx_rs::Device::cpu();

        match std::env::var("CAR_MLX_DEVICE").ok().as_deref() {
            Some("cpu") => mlx_rs::Device::set_default(&mlx_rs::Device::cpu()),
            #[cfg(feature = "mlx-metal")]
            Some("gpu") => mlx_rs::Device::set_default(&mlx_rs::Device::gpu()),
            _ => mlx_rs::Device::set_default(&default_device),
        }

        info!("loading safetensors weights for LTX-2.3 (all modalities)");
        let tensors = load_ltx_tensors(model_dir)?;
        info!(tensors = tensors.len(), "LTX tensors loaded");

        let transformer = LtxTransformer::load(&tensors, &config)?;
        info!("LTX transformer loaded (48 blocks, video + audio pathways)");

        let connector = TextEmbeddingConnector::load(&tensors)?;
        info!("text embedding connector loaded (video + audio)");

        let vae = VaeDecoder3D::load(&tensors)?;
        info!("3D causal VAE decoder loaded");

        // Encoder is optional: `vae_encoder.safetensors` may not be present in
        // every checkpoint. Distinguish "file absent" (None, i2v will error at
        // call time) from "file present but malformed" (propagate — silent
        // swallow would make i2v failures impossible to debug).
        let vae_encoder = if model_dir.join("vae_encoder.safetensors").exists() {
            let enc = VaeEncoder3D::load(&tensors)?;
            info!("3D causal VAE encoder loaded (i2v available)");
            Some(enc)
        } else {
            None
        };

        let audio_vae = AudioVae::load(&tensors)?;
        info!("audio VAE loaded");

        let vocoder = Vocoder::load(&tensors)?;
        info!("vocoder loaded");

        // Try to load T5 encoder from Flux model (shares T5-XXL weights).
        // Look for Flux model in the same parent directory or HuggingFace cache.
        let (t5_encoder, t5_tokenizer) = Self::try_load_t5(model_dir);

        // Try to load Gemma 3 12B text encoder — the real LTX-2.3 text encoder.
        // Absence falls back to zero embeddings (unconditional-ish generation).
        let gemma_encoder = match super::mlx_gemma3::Gemma3TextEncoder::try_load_default() {
            Ok(Some(enc)) => {
                info!("Gemma 3 12B text encoder loaded (text conditioning active)");
                Some(enc)
            }
            Ok(None) => {
                tracing::warn!(
                    "Gemma 3 12B cache not found — text prompt will be ignored. \
                     Run `huggingface-cli download mlx-community/gemma-3-12b-it-4bit` \
                     to enable text conditioning."
                );
                None
            }
            Err(e) => {
                tracing::warn!(error = %e, "Gemma 3 12B load failed — running unconditional");
                None
            }
        };

        info!("FULL LTX-2.3 model loaded successfully (all generation modes)");
        Ok(Self {
            transformer,
            connector,
            vae,
            vae_encoder,
            audio_vae,
            vocoder,
            config,
            t5_encoder,
            t5_tokenizer,
            gemma_encoder,
        })
    }

    /// Best-effort load of the T5-XXL encoder (and its tokenizer) from a Flux
    /// snapshot in the HuggingFace cache.
    ///
    /// Nothing here is fatal; every failure is logged and downgraded:
    /// - no Flux snapshot, unreadable tensors, or encoder load failure → `(None, None)`
    /// - encoder loaded but tokenizer missing or unparsable → `(Some(encoder), None)`
    ///
    /// The `_model_dir` argument is currently unused.
    fn try_load_t5(
        _model_dir: &Path,
    ) -> (
        Option<super::mlx_flux::T5TextEncoder>,
        Option<tokenizers::Tokenizer>,
    ) {
        // Locate the Flux snapshot that carries the T5 weights.
        let Some(flux_dir) = latest_huggingface_repo_snapshot("mlx-community/Flux-1.lite-8B-MLX-Q4")
        else {
            info!("Flux model not found in HF cache — T5 encoder unavailable for LTX");
            return (None, None);
        };

        // Read the snapshot's tensors (T5 lives under Flux's text_encoder_2).
        info!(flux_dir = %flux_dir.display(), "loading T5-XXL from Flux model for LTX text conditioning");
        let loaded_tensors = super::mlx::load_all_tensors(&flux_dir)
            .map_err(|e| tracing::warn!("failed to load Flux tensors for T5: {e}"))
            .ok();
        let Some(t5_tensors) = loaded_tensors else {
            return (None, None);
        };

        // Build the encoder using Flux's default configuration.
        let flux_config = super::mlx_flux::FluxConfig::default();
        let loaded_encoder = super::mlx_flux::T5TextEncoder::load(&t5_tensors, &flux_config)
            .map_err(|e| tracing::warn!("failed to load T5 encoder from Flux: {e}"))
            .ok();
        let Some(encoder) = loaded_encoder else {
            return (None, None);
        };
        info!("T5-XXL encoder loaded from Flux model");

        // The tokenizer is optional on top of the encoder.
        let tok_path = flux_dir.join("tokenizer_2").join("tokenizer.json");
        if !tok_path.exists() {
            info!("T5 tokenizer not found at {}", tok_path.display());
            return (Some(encoder), None);
        }
        let tokenizer = tokenizers::Tokenizer::from_file(&tok_path)
            .map_err(|e| tracing::warn!("failed to load T5 tokenizer: {e}"))
            .ok();

        (Some(encoder), tokenizer)
    }

    /// Generate video (and optionally audio) from a text prompt.
    ///
    /// Supported generation modes:
    /// - **Text-to-video (t2v)**: Standard text-prompted video generation
    /// - **Image-to-video (i2v)**: Video generation conditioned on a reference image
    /// - **Audio-video**: Synchronized audio+video generation with cross-attention
    /// - **Video extension/retake**: Extend existing video or regenerate segments
    pub fn generate(
        &mut self,
        req: &GenerateVideoRequest,
    ) -> Result<GenerateVideoResult, InferenceError> {
        req.validate().map_err(InferenceError::InferenceFailed)?;

        let width = req.width.unwrap_or(768);
        let height = req.height.unwrap_or(512);
        let num_frames = req.num_frames.unwrap_or(41);
        let steps = req.steps.unwrap_or(20) as usize;
        let output_fps = req.fps.unwrap_or(24);
        let guidance_scale = req.guidance.unwrap_or(3.0);
        // Audio gets a gentler default CFG: video CFG ranges (3-8) over-guide the
        // audio branch into pumping / robotic timbre. Callers can override via
        // `audio_guidance`; the fallback caps at 3.0 so users don't silently
        // inherit a high video-only CFG.
        let audio_guidance_scale = req
            .audio_guidance
            .unwrap_or_else(|| guidance_scale.min(3.0));
        let seed = req.seed.unwrap_or(42);

        let _map_err = |e: mlx_rs::error::Exception| InferenceError::InferenceFailed(e.to_string());

        info!(
            prompt = %req.prompt,
            width,
            height,
            num_frames,
            steps,
            guidance = guidance_scale,
            seed,
            "generating video with LTX-2.3 (full model)"
        );

        let output_path = req
            .output_path
            .clone()
            .unwrap_or_else(|| "output.mp4".to_string());

        let map_err = |e: mlx_rs::error::Exception| InferenceError::InferenceFailed(e.to_string());

        // 1. Build the per-token feature grid that feeds the connector's
        // `{video,audio}_aggregate_embed` (Linear(188160 → {4096,2048})).
        //
        // Upstream uses Gemma 3 12B: 48 decoder layers + 1 embedding = 49
        // hidden states, each [B, T, 3840]. They are stacked, per-token
        // RMS-normed over the 3840 axis, and flattened to [B, T, 188160].
        // The connector then prepends 128 learnable registers and runs 8
        // transformer-1D blocks.
        //
        // When the Gemma encoder isn't loaded (cache absent), we feed zeros
        // with the correct shape — the 128 learnable registers carry enough
        // model-learned bias to produce coherent unconditional video.
        let aggregate_dim: i32 = 188160;
        // Upstream ltx-2-mlx (`GemmaFeaturesExtractorV2`) always pads to 1024
        // tokens. Running with 64 landed the Gemma output in the wrong shape
        // against the downstream connector (which expects 1024-token context)
        // and the observed symptom was a checkerboard-pixel video output.
        let gemma_seq_default: i32 = 1024;
        let (text_embed, n_valid) = if let Some(ref mut gemma) = self.gemma_encoder {
            info!(prompt = %req.prompt, "encoding prompt via Gemma 3 12B");
            gemma.encode_for_ltx(&req.prompt, gemma_seq_default as usize)?
        } else {
            if !req.prompt.is_empty() {
                tracing::warn!(
                    prompt = %req.prompt,
                    "Gemma 3 encoder unavailable — text prompt ignored"
                );
            }
            (
                Array::zeros::<f32>(&[1, gemma_seq_default, aggregate_dim]).map_err(map_err)?,
                0usize,
            )
        };

        // 2. Pass through TextEmbeddingConnector to get video + audio conditioning
        let (video_cond, audio_cond) = self
            .connector
            .forward(&text_embed, n_valid)
            .map_err(map_err)?;

        info!(
            video_cond_shape = ?video_cond.shape(),
            audio_cond_shape = ?audio_cond.shape(),
            "text conditioning computed via connector"
        );

        // 3. Compute latent dimensions.
        // Upstream (LTX video VAE docstring): `H' = H/32, W' = W/32,
        //  F' = 1 + (F-1)/8`. The 32x spatial factor is block-level 8x
        // downsample × the 4x4 patch-pack at conv_in. Previously I used 8x
        // spatial and frames/8 temporal — decoder then expanded 32x and 8x,
        // giving outputs 4x too big spatially and off-by-one temporally.
        let spatial_downsample: i32 = 32;
        let effective_h = (height as i32 / spatial_downsample) * spatial_downsample;
        let effective_w = (width as i32 / spatial_downsample) * spatial_downsample;
        let latent_h = effective_h / spatial_downsample;
        let latent_w = effective_w / spatial_downsample;

        // Temporal: causal packing means the first frame is special.
        // Upstream: F' = 1 + (F-1)/8; round requested frames down to 1 + 8*k.
        let nf = num_frames as i32;
        let latent_t = 1 + (nf - 1) / 8;
        let effective_t = 1 + (latent_t - 1) * 8;
        if effective_h != height as i32 || effective_w != width as i32 || effective_t != nf {
            tracing::warn!(
                requested_h = height, effective_h,
                requested_w = width, effective_w,
                requested_frames = nf, effective_frames = effective_t,
                "dimensions adjusted to upstream constraints (H,W divisible by 32; frames = 1 + 8k)"
            );
        }
        let latent_c = self.config.in_channels as i32; // 128

        // Flatten spatial+temporal into sequence for transformer:
        // num_patches = latent_t * latent_h * latent_w
        let num_patches = latent_t * latent_h * latent_w;

        info!(
            latent_t,
            latent_h, latent_w, latent_c, num_patches, "computed latent dimensions"
        );

        // 4. Optional i2v / extension conditioning: encode reference frame(s) into latent
        // space and produce an anchor latent we re-inject into the first temporal slice.
        //
        // Layout reminder: the sequence latent has shape [1, num_patches, 128] where
        // patches are ordered (t, h, w) — so the first `latent_h * latent_w` rows
        // correspond to t=0. We clamp those rows to the encoded reference each step.
        let anchor_latent = self.compute_anchor_latent(req, latent_h, latent_w)?;
        let anchor_patches = (latent_h * latent_w) as usize;

        // Audio pathway (joint synthesis) for AudioVideo mode.
        // Upstream formula: audio_latent_frames = round(num_frames / fps * 25)
        // 25 = sample_rate / hop_length / latent_downsample = 16000 / 160 / 4.
        // Latent channels = 128 (8 VAE channels × 16 latent mel bins, flattened via patchify).
        // LTX-2.3 is a joint audio+video model: the transformer's AV
        // cross-modal attention runs even for "t2v only" generation in the
        // upstream pipeline (TextToVideoPipeline.generate produces audio
        // alongside video unconditionally). Previously we gated this on
        // `VideoMode::AudioVideo`, which skipped AV cross-attention entirely
        // for plain T2v — producing a completely different velocity estimate
        // per step. Always run the joint path; the caller can choose not to
        // emit the audio track downstream.
        let audio_mode = true;
        let _ = req.effective_mode();
        let audio_fps = output_fps as f32;
        let audio_frames = if audio_mode {
            (num_frames as f32 / audio_fps * 25.0).round() as i32
        } else {
            0
        };
        let audio_latent_c: i32 = 128;
        // Upstream ti2vid_one_stage uses `create_initial_state(audio_shape, seed + 1)`
        // for the audio noise. Previously we XOR'd with a golden-ratio constant
        // to decorrelate the streams — good intuition, wrong constant vs
        // reference. Match upstream exactly so AV cross-modal attention sees
        // the same audio prior.
        let audio_seed = seed.wrapping_add(1);

        // 5. Initialize noise latents via RectifiedFlowScheduler.
        // num_patches drives the dynamic schedule's token-count shift for
        // non-distilled runs; the 8-step path uses DISTILLED_SIGMAS verbatim.
        let scheduler = RectifiedFlowScheduler::new_with_tokens(steps, num_patches as usize);
        let mut latents = scheduler
            .init_noise(&[1, num_patches, latent_c], seed)
            .map_err(map_err)?;

        let mode = req.effective_mode();
        if mode == VideoMode::AudioRefVideo {
            // AudioRefVideo on the native backend currently does NOT
            // condition video on the audio bytes — the LTX audio VAE
            // encoder hasn't been ported, and the earlier RMS-envelope
            // synthesis attempt produced artifacts (#123). The
            // request shape is kept so downstream tooling can name
            // "this clip pairs with that song segment," but the
            // generated frames are pure text-to-video.
            //
            // Loud warn (not info) per #130 — callers were getting
            // confused by the API name implying conditioning, so the
            // log line surfaces the actual capability every call.
            let audio_path = req.audio_path.as_deref().ok_or_else(|| {
                InferenceError::InferenceFailed("audio_ref_video requires audio_path".to_string())
            })?;
            tracing::warn!(
                audio_path,
                "audio_ref_video: audio_path is passthrough-only on this backend — \
                 recorded on the request for downstream muxing, NOT used for \
                 video conditioning. Real conditioning is tracked at \
                 Parslee-ai/car#130; the CLI exposes this path via `--audio-mux` \
                 (#183)."
            );
        }

        let mut audio_latents = if audio_mode {
            Some(
                scheduler
                    .init_noise(&[1, audio_frames, audio_latent_c], audio_seed)
                    .map_err(map_err)?,
            )
        } else {
            None
        };

        if audio_mode {
            info!(
                audio_frames,
                audio_latent_c,
                mode = ?mode,
                "joint audio+video pathway enabled"
            );
        }

        // If we have an anchor, overwrite the first temporal slice so the denoising
        // starts from the reference rather than pure noise.
        if let Some(ref anchor) = anchor_latent {
            latents =
                splice_first_temporal_slice(&latents, anchor, anchor_patches).map_err(map_err)?;
        }

        info!(steps, "starting rectified flow denoising loop");

        // Build RoPE tensors once — positions are fixed across denoising steps.
        // max_pos defaults are upstream's: video [20, 2048, 2048] (temporal
        // seconds × height × width), audio [20] (seconds), cross [20] (seconds).
        let rope = RopeBundle::build(
            latent_t,
            latent_h,
            latent_w,
            if audio_mode { audio_frames } else { 0 },
            audio_fps,
            self.config.num_heads,
            self.config.head_dim,
            self.config.audio_heads,
            self.config.audio_head_dim,
        )
        .map_err(map_err)?;

        // CFG conditioning inputs are shape-invariant across steps — build once.
        let cfg_enabled = guidance_scale > 1.0;
        let video_scale = Array::from_f32(guidance_scale);
        let audio_scale = Array::from_f32(audio_guidance_scale);
        let null_cond = if cfg_enabled {
            Some(Array::zeros::<f32>(video_cond.shape()).map_err(map_err)?)
        } else {
            None
        };
        let null_audio_cond = if cfg_enabled && audio_mode {
            Some(Array::zeros::<f32>(audio_cond.shape()).map_err(map_err)?)
        } else {
            None
        };

        // Per-token denoise mask for i2v: anchor positions stay at t=0 (clean),
        // other positions get the full scheduler timestep. Rank-2 `[1, num_patches]`
        // so `timestep_embedding_tensor` promotes it to `[1, num_patches, dim]`
        // and ada_chunk sees a rank-3 tensor.
        let denoise_mask: Option<Array> = if anchor_latent.is_some() {
            let mut mask_vals = vec![1.0f32; num_patches as usize];
            for i in 0..anchor_patches {
                mask_vals[i] = 0.0;
            }
            Some(Array::from_slice(&mask_vals, &[1, num_patches]))
        } else {
            None
        };

        // 6. Denoising loop (rectified flow)
        for step_idx in 0..steps {
            let t = scheduler.timesteps[step_idx];
            let global_timestep = ops::reshape(&Array::from_f32(t), &[1]).map_err(map_err)?;
            // Build the timestep tensor: rank-1 `[1]` for uniform, rank-3
            // `[1, num_patches, 1]` for i2v so conditioned positions see
            // t=0 while others see the scheduler sigma.
            let timestep = if let Some(ref mask) = denoise_mask {
                let t_scalar = Array::from_f32(t);
                ops::multiply(mask, &t_scalar).map_err(map_err)?
            } else {
                ops::reshape(&Array::from_f32(t), &[1]).map_err(map_err)?
            };

            // Predict velocity: v = transformer(latents, text_cond, timestep).
            // Audio latents, when active, are denoised jointly with video.
            let (velocity, audio_velocity) = self
                .transformer
                .forward(
                    &latents,
                    &video_cond,
                    &timestep,
                    &global_timestep,
                    audio_latents.as_ref(),
                    if audio_mode { Some(&audio_cond) } else { None },
                    &rope,
                )
                .map_err(map_err)?;

            // Classifier-free guidance. Video and audio get independent scales
            // (audio uses a gentler default to avoid over-guidance artifacts).
            let (velocity, audio_velocity) = if cfg_enabled {
                let (uncond_velocity, uncond_audio_velocity) = self
                    .transformer
                    .forward(
                        &latents,
                        null_cond.as_ref().unwrap(),
                        &timestep,
                        &global_timestep,
                        audio_latents.as_ref(),
                        null_audio_cond.as_ref(),
                        &rope,
                    )
                    .map_err(map_err)?;

                let diff = ops::subtract(&velocity, &uncond_velocity).map_err(map_err)?;
                let scaled_diff = ops::multiply(&diff, &video_scale).map_err(map_err)?;
                let v = ops::add(&uncond_velocity, &scaled_diff).map_err(map_err)?;

                let av = match (audio_velocity, uncond_audio_velocity) {
                    (Some(a_cond), Some(a_uncond)) => {
                        let a_diff = ops::subtract(&a_cond, &a_uncond).map_err(map_err)?;
                        let a_scaled = ops::multiply(&a_diff, &audio_scale).map_err(map_err)?;
                        Some(ops::add(&a_uncond, &a_scaled).map_err(map_err)?)
                    }
                    _ => None,
                };
                (v, av)
            } else {
                (velocity, audio_velocity)
            };

            // Rectified flow step for video.
            latents = scheduler
                .step(&velocity, step_idx, &latents)
                .map_err(map_err)?;

            // With per-token timesteps the transformer sees t=0 at anchor
            // positions and produces near-zero velocity there, so the
            // scheduler step barely moves those tokens. Still, numeric drift
            // accumulates over steps — re-snap to the exact anchor so
            // conditioned positions stay bit-identical to the clean latent
            // (matches upstream's post-step
            //   out = denoised * mask + clean * (1 - mask)  blend).
            if let Some(ref anchor) = anchor_latent {
                latents = splice_first_temporal_slice(&latents, anchor, anchor_patches)
                    .map_err(map_err)?;
            }

            // Rectified flow step for audio.
            if let (Some(a_lat), Some(a_vel)) = (audio_latents.as_ref(), audio_velocity.as_ref()) {
                audio_latents = Some(scheduler.step(a_vel, step_idx, a_lat).map_err(map_err)?);
            }

            // Force materialization per step. Without this, MLX accumulates
            // the full multi-step graph (48 blocks × steps × CFG × 2 passes)
            // before the final `eval` at MP4 write, which OOMs at any
            // meaningful resolution. Evaluating incrementally keeps the
            // retained graph to a single step.
            let mut to_eval: Vec<&Array> = vec![&latents];
            if let Some(ref a) = audio_latents {
                to_eval.push(a);
            }
            mlx_rs::transforms::eval(to_eval).map_err(map_err)?;

            if step_idx % 5 == 0 || step_idx == steps - 1 {
                info!(step = step_idx + 1, total = steps, t, "denoising step");
            }
        }

        info!("denoising complete, decoding latents to video frames");

        // Debug path: load a reference latent from safetensors instead of
        // using ours. Isolates "VAE decode + unpatchify" correctness from
        // the denoising chain. Set `CAR_LTX_REF_LATENT=/path/to/ref.safetensors`
        // (key = "latent") to enable.
        if let Ok(path) = std::env::var("CAR_LTX_REF_LATENT") {
            info!(%path, "overriding latents with reference safetensors");
            let loaded = Array::load_safetensors(&path).map_err(|e| {
                InferenceError::InferenceFailed(format!("load ref latent {path}: {e}"))
            })?;
            if let Some(r) = loaded.get("latent") {
                latents = r.as_dtype(latents.dtype()).map_err(map_err)?;
            } else {
                return Err(InferenceError::InferenceFailed(format!(
                    "ref latent safetensors missing 'latent' key: {path}"
                )));
            }
        }

        // Parity dump: final denoised latent (post-denoise, pre-unpatchify). Lets us
        // diff against upstream's `final_video_latent.npy` to separate the
        // "velocity at step 0" parity (already CLOSE) from the "after 8
        // compound steps" parity (where small errors amplify).
        if let Ok(dir) = std::env::var("CAR_DUMP_LTX_STAGE") {
            let _ = std::fs::create_dir_all(&dir);
            if let Ok(l_f32) = latents.as_dtype(mlx_rs::Dtype::Float32) {
                let _ = mlx_rs::transforms::eval([&l_f32]);
                let shape = l_f32.shape().to_vec();
                let data: &[f32] = l_f32.as_slice();
                let bytes: Vec<u8> = data.iter().flat_map(|v| v.to_le_bytes()).collect();
                let _ = std::fs::write(format!("{dir}/final_video_latent_patched.bin"), &bytes);
                let _ = std::fs::write(
                    format!("{dir}/final_video_latent_patched.meta"),
                    format!("{shape:?}\n"),
                );
            }
        }

        // 6. Reshape latents from [B, T*H*W, C] to [B, C, T, H, W] for VAE decode.
        // Must go through channels-last intermediate since reshape preserves memory layout.
        let latents_5d = ops::reshape(&latents, &[1, latent_t, latent_h, latent_w, latent_c])
            .map_err(map_err)?;
        let latents_5d = ops::transpose_axes(&latents_5d, &[0, 4, 1, 2, 3]).map_err(map_err)?;

        // Decode video latents via 3D causal VAE
        let video_frames = self.vae.decode(&latents_5d).map_err(map_err)?;
        info!(frames_shape = ?video_frames.shape(), "video frames decoded");

        // 7. Save frames as raw pixel data or encode to MP4 via ffmpeg
        // video_frames: [B, T_out, H, W, 3] with values in [0, 1]
        let frame_shape = video_frames.shape();
        let total_frames = frame_shape[1];
        let out_h = frame_shape[2];
        let out_w = frame_shape[3];

        // Convert to uint8: [0,1] -> [0,255]
        let scale_255 = Array::from_f32(255.0);
        let pixels_u8 = ops::multiply(&video_frames, &scale_255).map_err(map_err)?;
        mlx_rs::transforms::eval([&pixels_u8]).map_err(map_err)?;

        // Write raw frames to a temporary file, then use ffmpeg to encode MP4
        let tmp_dir = std::env::temp_dir().join(format!("ltx_frames_{seed}"));
        std::fs::create_dir_all(&tmp_dir)
            .map_err(|e| InferenceError::InferenceFailed(format!("mkdir: {e}")))?;

        // Save frames as raw RGB binary for ffmpeg piped input
        let raw_path = tmp_dir.join("frames.raw");
        let pixel_data: Vec<f32> = pixels_u8.as_slice::<f32>().to_vec();
        let pixel_bytes: Vec<u8> = pixel_data
            .iter()
            .map(|&v| v.clamp(0.0, 255.0) as u8)
            .collect();
        std::fs::write(&raw_path, &pixel_bytes)
            .map_err(|e| InferenceError::InferenceFailed(format!("write frames: {e}")))?;

        // Encode to MP4 via ffmpeg (container encoding, not inference)
        let ffmpeg_status = std::process::Command::new("ffmpeg")
            .args([
                "-y",
                "-f",
                "rawvideo",
                "-pix_fmt",
                "rgb24",
                "-s",
                &format!("{out_w}x{out_h}"),
                "-r",
                &output_fps.to_string(),
                "-i",
                raw_path.to_str().unwrap_or("frames.raw"),
                "-c:v",
                "libx264",
                "-pix_fmt",
                "yuv420p",
                "-frames:v",
                &total_frames.to_string(),
                &output_path,
            ])
            .output();

        // Clean up temp files
        let _ = std::fs::remove_dir_all(&tmp_dir);

        match ffmpeg_status {
            Ok(output) if output.status.success() => {
                info!(path = %output_path, frames = total_frames, "MP4 encoded successfully");
            }
            Ok(output) => {
                let stderr = String::from_utf8_lossy(&output.stderr);
                return Err(InferenceError::InferenceFailed(format!(
                    "ffmpeg failed: {stderr}"
                )));
            }
            Err(e) => {
                return Err(InferenceError::InferenceFailed(format!(
                    "ffmpeg not found or failed to execute: {e}. \
                     Install ffmpeg to encode video output."
                )));
            }
        }

        // 9. Joint text-to-audio+video synthesis: decode generated audio and
        // mux into the output MP4. AudioRefVideo uses audio as input
        // conditioning only; downstream assembly muxes the original song
        // audio, so leave this clip video-only.
        if mode == VideoMode::AudioVideo {
            if let Some(a_lat) = audio_latents {
                info!("decoding audio latents → mel spectrogram → waveform");
                let decode_result = self.decode_audio_latents(&a_lat).map_err(|e| {
                    InferenceError::InferenceFailed(format!("decode_audio_latents: {e}"))
                });
                let waveform = match decode_result {
                    Ok(w) => w,
                    Err(e) => {
                        let _ = std::fs::remove_file(&output_path);
                        return Err(e);
                    }
                };
                let sample_rate = 16000u32;
                if let Err(e) = mux_audio_into_mp4(&output_path, &waveform, sample_rate) {
                    let _ = std::fs::remove_file(&output_path);
                    return Err(e);
                }
                info!(sample_rate, "audio track muxed into MP4");
            }
        }

        Ok(GenerateVideoResult {
            video_path: output_path,
            media_type: "video/mp4".to_string(),
            model_used: Some("ltx-2.3-mlx-q4".to_string()),
        })
    }

    /// Convert the transformer's audio token latents `[1, T, 128]` into a
    /// waveform.
    ///
    /// Stages: denormalize with the audio VAE statistics → unpatchify into
    /// the VAE latent layout → `AudioVae::decode` → regroup the mel output
    /// into vocoder channels → `Vocoder::forward`.
    ///
    /// Returns whatever the vocoder produces; per the shape notes below that
    /// is a stereo waveform `[B, 2, T_wav]` (f32, roughly in [-1, 1]) —
    /// NOTE(review): confirm against `Vocoder::forward`.
    ///
    /// # Errors
    ///
    /// Propagates any MLX exception raised by the tensor ops or sub-models.
    fn decode_audio_latents(&self, tokens: &Array) -> Result<Array, mlx_rs::error::Exception> {
        // Stage 1 — undo latent normalization: tokens * std + mean, with both
        // statistics reshaped to [1, 1, C] so they broadcast over the last axis.
        let stat_mean = &self.audio_vae.denorm_mean;
        let stat_std = &self.audio_vae.denorm_std;
        let shift = ops::reshape(stat_mean, &[1, 1, stat_mean.shape()[0]])?;
        let gain = ops::reshape(stat_std, &[1, 1, stat_std.shape()[0]])?;
        let rescaled = ops::multiply(tokens, &gain)?;
        let denormalized = ops::add(&rescaled, &shift)?;

        // Stage 2 — unpatchify "b t (c f) -> b c t f" with c = 8, f = 16.
        let vae_channels: i32 = 8;
        let latent_bins: i32 = 16;
        let token_dims = denormalized.shape();
        let batch = token_dims[0];
        let frames = token_dims[1];
        let split = ops::reshape(&denormalized, &[batch, frames, vae_channels, latent_bins])?;
        let vae_latent = ops::transpose_axes(&split, &[0, 2, 1, 3])?; // [B, 8, T, 16]

        // Stage 3 — audio VAE decoder → stereo log-mel [B, 2, T', 64].
        let mel = self.audio_vae.decode(&vae_latent)?;

        // Stage 4 — the vocoder wants [B, 128, T'] with channels laid out as
        // (stereo outer, mel inner), i.e. upstream's
        // `rearrange("b s c t -> b (s c) t")`. The mel tensor is
        // [B, s=2, t=T', c=64], so flattening it directly would interleave
        // time into the channel axis. Swap the last two axes first, then
        // merge stereo and mel bins.
        let mel_dims = mel.shape();
        debug_assert_eq!(mel_dims[1], 2, "audio VAE must output stereo mel");
        let mel_batch = mel_dims[0];
        let mel_time = mel_dims[2];
        let mel_bins = mel_dims[3];
        let time_last = ops::transpose_axes(&mel, &[0, 1, 3, 2])?; // [B, 2, 64, T']
        let vocoder_input = ops::reshape(&time_last, &[mel_batch, 2 * mel_bins, mel_time])?; // [B, 128, T']

        // Stage 5 — vocoder → stereo waveform [B, 2, T_wav].
        self.vocoder.forward(&vocoder_input)
    }

    /// Encode the i2v reference frame into a patch-ordered latent slab shaped
    /// `[1, latent_h * latent_w, C]` (C is the encoder's channel count),
    /// suitable for splicing into the first temporal slice of the denoising
    /// latents.
    ///
    /// `latent_h` / `latent_w` are the spatial size of the denoising latent
    /// grid; the reference image is loaded at 32× that size in pixels.
    ///
    /// Returns `Ok(None)` for the modes that carry no image anchor (t2v,
    /// audio-video, audio-ref-video).
    ///
    /// # Errors
    ///
    /// * `InferenceError::UnsupportedMode` for extend and retake, which are
    ///   accepted on the request surface but not wired in this backend.
    /// * `InferenceError::InferenceFailed` when i2v is requested without an
    ///   `image_path`, when the VAE encoder is not loaded, when the latent grid
    ///   cannot be converted to pixel dimensions, when an array op or the
    ///   encoder fails, or when the encoder output does not match the
    ///   requested latent grid.
    /// * Whatever `load_rgb_image` returns on failure is propagated unchanged.
    fn compute_anchor_latent(
        &self,
        req: &GenerateVideoRequest,
        latent_h: i32,
        latent_w: i32,
    ) -> Result<Option<Array>, InferenceError> {
        let image_path = match req.effective_mode() {
            // AudioRefVideo never reaches this function with an image
            // anchor — validation (#130) rejects the combination. So
            // it falls in with the no-anchor modes here.
            VideoMode::T2v | VideoMode::AudioVideo | VideoMode::AudioRefVideo => {
                return Ok(None);
            }
            VideoMode::I2v => req.image_path.as_deref().ok_or_else(|| {
                InferenceError::InferenceFailed("i2v mode requires image_path".to_string())
            })?,
            VideoMode::Extend => {
                return Err(InferenceError::UnsupportedMode {
                    mode: "extend",
                    backend: "native-mlx-ltx",
                    reason: "accepted on the request surface but not yet wired; \
                         video extension requires VAE-encoding the input clip and \
                         splicing its latents into the denoising schedule",
                });
            }
            VideoMode::Retake => {
                return Err(InferenceError::UnsupportedMode {
                    mode: "retake",
                    backend: "native-mlx-ltx",
                    reason: "accepted on the request surface but not yet wired; \
                         retake requires partial-frame-range masked diffusion on \
                         VAE-encoded input latents",
                });
            }
        };

        let encoder = self.vae_encoder.as_ref().ok_or_else(|| {
            InferenceError::InferenceFailed(
                "image-to-video requires the VAE encoder, which is not available \
                 in this checkpoint (missing vae_encoder.safetensors)"
                    .to_string(),
            )
        })?;

        // Pixel dimensions the encoder needs to produce a latent matching the
        // denoising slab: latent_{h,w} = pixel_{h,w} / 32 (4×4 patch pack + 8× blocks).
        // The conversion is checked so a negative or overflowing latent size is
        // reported instead of wrapping into a huge `u32`.
        let latent_to_pixels = |cells: i32| -> Option<u32> {
            cells
                .checked_mul(32)
                .filter(|px| *px >= 0)
                .map(|px| px as u32)
        };
        let (pixel_w, pixel_h) = match (latent_to_pixels(latent_w), latent_to_pixels(latent_h)) {
            (Some(w), Some(h)) => (w, h),
            _ => {
                return Err(InferenceError::InferenceFailed(format!(
                    "invalid latent grid {latent_h}x{latent_w} for i2v reference frame"
                )));
            }
        };

        info!(
            path = image_path,
            pixel_w, pixel_h, "encoding i2v reference frame"
        );

        let img = load_rgb_image(std::path::Path::new(image_path), pixel_w, pixel_h)?;
        // The trained encoder expects input in [-1, 1] (upstream
        // `normalize_latent(x) = x/127.5 - 1.0`). `load_rgb_image` gives us
        // [0, 1]; shift to [-1, 1] before encoding so the output magnitude
        // matches the unit-variance latent space the transformer denoises in.
        let img = {
            let two = Array::from_f32(2.0);
            let one = Array::from_f32(1.0);
            ops::subtract(
                &ops::multiply(&img, &two)
                    .map_err(|e| InferenceError::InferenceFailed(format!("scale img: {e}")))?,
                &one,
            )
            .map_err(|e| InferenceError::InferenceFailed(format!("shift img: {e}")))?
        };
        // Tile the single reference frame 8× along T so the causal VAE
        // produces one latent temporal position after 8× downsample.
        // [1, H, W, 3] → [1, 1, H, W, 3] → tile along T → [1, 8, H, W, 3]
        let s = img.shape();
        let frames = ops::reshape(&img, &[s[0], 1, s[1], s[2], s[3]])
            .and_then(|x| ops::tile(&x, &[1, 8, 1, 1, 1]))
            .map_err(|e| InferenceError::InferenceFailed(format!("tile ref frame: {e}")))?;

        let latent = encoder
            .encode(&frames)
            .map_err(|e| InferenceError::InferenceFailed(format!("vae_encoder.encode: {e}")))?;
        // Expected latent layout: [1, C, latent_t_enc, latent_h, latent_w].
        // Validate it at runtime (not only in debug builds): the result is
        // reshaped below using the encoder's own spatial size, so a mismatch
        // with the requested grid would otherwise yield an anchor with the
        // wrong patch count instead of an error.
        let shape = latent.shape();
        if shape.len() != 5 || shape[3] != latent_h || shape[4] != latent_w {
            return Err(InferenceError::InferenceFailed(format!(
                "vae_encoder.encode returned latent shape {shape:?}, expected \
                 [1, C, T, {latent_h}, {latent_w}]"
            )));
        }
        let c = shape[1];
        let lh = shape[3];
        let lw = shape[4];

        // Take the first temporal slice (t=0): latent[:, :, 0, :, :] -> [1, C, 1, H, W]
        let first_t = latent.index((.., .., 0..1, .., ..));
        // Convert BCTHW → BTHWC → [1, H*W, C] matching the patch layout.
        let nhwc = ops::transpose_axes(&first_t, &[0, 2, 3, 4, 1])
            .map_err(|e| InferenceError::InferenceFailed(format!("anchor transpose: {e}")))?;
        let seq = ops::reshape(&nhwc, &[1, lh * lw, c])
            .map_err(|e| InferenceError::InferenceFailed(format!("anchor reshape: {e}")))?;

        Ok(Some(seq))
    }
}

/// Locate the Hugging Face hub cache directory.
///
/// Resolution order:
/// 1. `$HF_HOME/hub` when `HF_HOME` is set to valid Unicode;
/// 2. otherwise `$HOME/.cache/huggingface/hub`;
/// 3. otherwise `./.cache/huggingface/hub` when `HOME` cannot be read either.
///
/// The path is only computed; nothing is created or checked on disk.
fn huggingface_cache_root() -> PathBuf {
    let hf_home = match std::env::var("HF_HOME") {
        Ok(explicit) => PathBuf::from(explicit),
        Err(_) => {
            let user_home = match std::env::var("HOME") {
                Ok(home) => PathBuf::from(home),
                Err(_) => PathBuf::from("."),
            };
            user_home.join(".cache").join("huggingface")
        }
    };
    hf_home.join("hub")
}

/// Directory of `repo_id` inside the Hugging Face hub cache.
///
/// The folder name is `models--` followed by the repo id with every `/`
/// replaced by `--` (e.g. `org/name` → `models--org--name`), joined onto
/// [`huggingface_cache_root`]. The directory is not required to exist.
fn huggingface_repo_dir(repo_id: &str) -> PathBuf {
    let mut folder = String::from("models");
    for segment in repo_id.split('/') {
        folder.push_str("--");
        folder.push_str(segment);
    }
    huggingface_cache_root().join(folder)
}

/// Resolve a named ref of a cached Hugging Face repo to its snapshot directory.
///
/// Reads `<repo_dir>/refs/<name>`, trims the contents, and treats the result
/// as the snapshot id. Returns `Some(<repo_dir>/snapshots/<id>)` only when that
/// path is an existing directory; returns `None` when the ref file cannot be
/// read, is blank, or points at a snapshot that is not on disk.
fn resolve_huggingface_ref_snapshot(repo_dir: &Path, name: &str) -> Option<PathBuf> {
    let ref_file = repo_dir.join("refs").join(name);
    let contents = std::fs::read_to_string(ref_file).ok()?;

    let commit = contents.trim();
    if commit.is_empty() {
        return None;
    }

    Some(repo_dir.join("snapshots").join(commit)).filter(|dir| dir.is_dir())
}

/// Pick the snapshot directory to use for a cached Hugging Face repo.
///
/// The snapshot referenced by `refs/main` wins when it resolves. Otherwise the
/// sub-directories of `<repo>/snapshots` are ranked by modification time (an
/// unreadable timestamp counts as the Unix epoch, ties fall back to path
/// order) and the greatest one is returned. Yields `None` when the snapshots
/// directory cannot be read or holds no directories.
fn latest_huggingface_repo_snapshot(repo_id: &str) -> Option<PathBuf> {
    let repo_dir = huggingface_repo_dir(repo_id);

    resolve_huggingface_ref_snapshot(&repo_dir, "main").or_else(|| {
        std::fs::read_dir(repo_dir.join("snapshots"))
            .ok()?
            .filter_map(Result::ok)
            .map(|entry| entry.path())
            .filter(|path| path.is_dir())
            .map(|path| {
                let stamp = match path.metadata().and_then(|meta| meta.modified()) {
                    Ok(time) => time,
                    Err(_) => SystemTime::UNIX_EPOCH,
                };
                (stamp, path)
            })
            // Same element a full sort + pop would select.
            .max()
            .map(|(_, path)| path)
    })
}

/// Replace the leading `anchor_patches` entries along axis 1 of a
/// patch-ordered latent (`[1, num_patches, C]`) with `anchor`
/// (`[1, anchor_patches, C]`), i.e. overwrite its first temporal slice.
///
/// The result is `anchor` concatenated with everything in `latents` from
/// index `anchor_patches` onward. `anchor_patches <= num_patches` is the
/// caller's responsibility — an internal invariant that follows from both
/// tensors being derived from the same latent shape computation — and is only
/// checked in debug builds.
fn splice_first_temporal_slice(
    latents: &Array,
    anchor: &Array,
    anchor_patches: usize,
) -> Result<Array, mlx_rs::error::Exception> {
    debug_assert!(anchor_patches <= latents.shape()[1] as usize);

    // Everything after the anchored region is kept as-is.
    let keep_from = anchor_patches as i32;
    let remainder = latents.index((.., keep_from.., ..));

    let pieces = [anchor, &remainder];
    ops::concatenate_axis(&pieces, 1)
}

/// Write a stereo waveform as raw float32 PCM, then use ffmpeg to mux it into
/// an existing MP4 as an AAC audio track.
///
/// The samples are sanitized (non-finite → 0, then clamped to [-1, 1]),
/// written to a temporary little-endian f32 file, and passed to `ffmpeg`
/// together with `video_path`. ffmpeg copies the video stream, encodes the
/// audio as 192k AAC, stops at the shorter stream (`-shortest`), and writes
/// `{video_path}.with_audio.mp4`, which then replaces `video_path`. The
/// temporary PCM file is removed whether or not ffmpeg succeeds.
///
/// `waveform` is `[1, 2, N]` or `[2, N]` f32 in approximately [-1, 1].
/// `sample_rate` is the rate declared to ffmpeg for the PCM input.
///
/// # Errors
///
/// Returns `InferenceError::InferenceFailed` when the waveform is not rank 2
/// or 3, is not stereo, an array op fails, the PCM file cannot be written,
/// ffmpeg cannot be spawned or exits unsuccessfully (its stderr is included),
/// or the muxed file cannot be renamed over `video_path`.
fn mux_audio_into_mp4(
    video_path: &str,
    waveform: &Array,
    sample_rate: u32,
) -> Result<(), InferenceError> {
    // Normalize to [2, N]
    let w_shape = waveform.shape();
    let stereo = match w_shape.len() {
        2 => waveform.clone(),
        3 => ops::reshape(waveform, &[w_shape[1], w_shape[2]])
            .map_err(|e| InferenceError::InferenceFailed(format!("waveform squeeze: {e}")))?,
        other => {
            return Err(InferenceError::InferenceFailed(format!(
                "expected waveform rank 2 or 3, got {other}"
            )))
        }
    };
    let s = stereo.shape();
    if s[0] != 2 {
        return Err(InferenceError::InferenceFailed(format!(
            "expected stereo waveform, got {} channels",
            s[0]
        )));
    }
    let n_samples = s[1];

    // Interleave channels [2, N] → [N, 2] then flatten row-major.
    let interleaved = ops::transpose_axes(&stereo, &[1, 0])
        .map_err(|e| InferenceError::InferenceFailed(format!("interleave: {e}")))?;
    mlx_rs::transforms::eval([&interleaved])
        .map_err(|e| InferenceError::InferenceFailed(format!("eval waveform: {e}")))?;
    // NOTE(review): this assumes `as_slice` exposes the transposed array in
    // logical row-major order and that the waveform dtype is f32. If mlx-rs
    // hands back the underlying buffer of a strided view, the samples would be
    // planar rather than interleaved — confirm against the mlx-rs docs.
    let raw: &[f32] = interleaved.as_slice::<f32>();

    // Single-pass write into a preallocated byte buffer. NaN/Inf from
    // unconditioned / numerically-iffy inference would crash the AAC encoder;
    // coerce them to 0 before clamp.
    let mut bytes: Vec<u8> = Vec::with_capacity(raw.len() * std::mem::size_of::<f32>());
    for &v in raw {
        let clean = if v.is_finite() {
            v.clamp(-1.0, 1.0)
        } else {
            0.0
        };
        bytes.extend_from_slice(&clean.to_le_bytes());
    }

    // Process id + timestamp keeps concurrent processes from clobbering each
    // other's scratch file even if the clock read fails (stamp falls back to 0).
    let stamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_nanos())
        .unwrap_or(0);
    let pcm_path =
        std::env::temp_dir().join(format!("ltx_audio_{}_{stamp}.pcm", std::process::id()));
    if let Err(e) = std::fs::write(&pcm_path, &bytes) {
        // Don't leave a partially written scratch file behind.
        let _ = std::fs::remove_file(&pcm_path);
        return Err(InferenceError::InferenceFailed(format!("write pcm: {e}")));
    }
    // The samples are on disk now; release the in-memory copy before ffmpeg runs.
    drop(bytes);

    // ffmpeg: re-mux video + newly generated audio into a temp output, then
    // replace the original file. Paths are passed as OS strings so a non-UTF-8
    // temp directory is handed to ffmpeg intact instead of as an empty argument.
    let tmp_out = format!("{video_path}.with_audio.mp4");
    let output = std::process::Command::new("ffmpeg")
        .arg("-y")
        .arg("-i")
        .arg(video_path)
        .args(["-f", "f32le", "-ar"])
        .arg(sample_rate.to_string())
        .args(["-ac", "2", "-i"])
        .arg(&pcm_path)
        .args(["-c:v", "copy", "-c:a", "aac", "-b:a", "192k", "-shortest"])
        .arg(&tmp_out)
        .output();

    let _ = std::fs::remove_file(&pcm_path);

    match output {
        Ok(o) if o.status.success() => {
            if let Err(e) = std::fs::rename(&tmp_out, video_path) {
                let _ = std::fs::remove_file(&tmp_out);
                return Err(InferenceError::InferenceFailed(format!("replace mp4: {e}")));
            }
            info!(samples = n_samples, sample_rate, "muxed audio into mp4");
            Ok(())
        }
        Ok(o) => {
            let _ = std::fs::remove_file(&tmp_out);
            Err(InferenceError::InferenceFailed(format!(
                "ffmpeg audio mux failed: {}",
                String::from_utf8_lossy(&o.stderr)
            )))
        }
        Err(e) => {
            let _ = std::fs::remove_file(&tmp_out);
            Err(InferenceError::InferenceFailed(format!(
                "ffmpeg audio mux spawn failed: {e}"
            )))
        }
    }
}