// rlx-mlx 0.2.8 - Docs.rs

// RLX — versatile ML compiler + runtime.
// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

//! Lower an `rlx_ir::Graph` into a chain of MLX `Array` handles.
//!
//! Strategy is "fresh graph per run": every call rebuilds the MLX
//! graph from scratch using current input/param data. Simpler than
//! holding a persistent graph + replaceable placeholders, and MLX's
//! own trace cache amortizes the per-build cost. A future pass can
//! switch to `mlx::compile`-style placeholder bindings if we need
//! to drop the per-run construction overhead.

use std::collections::{HashMap, HashSet};

use rlx_ir::RegionPrologue;
use rlx_ir::op::{
    Activation, BinaryOp, ChainOperand, ChainStep, CmpOp, MaskKind, ReduceOp, ScaleMode, SteKind,
    TransformStep,
};
use rlx_ir::shape::{Dim, DimBinding, Shape};
use rlx_ir::{DType, Graph, NodeId, Op};

use crate::array::{Array, MlxError, async_eval, eval};
use crate::ffi::{MlxMask, MlxReduce, MlxUnary};
use crate::ops;

/// Selects when (and how) the MLX graph built by this module is evaluated.
///
/// The variants differ only in evaluation strategy; `Lazy` is the value
/// produced by `MlxMode::default()`.
// NOTE(review): the code that consumes this mode is outside this section —
// the per-variant descriptions below are the authoritative contract.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum MlxMode {
    /// Eval after every op. Slower but useful for debugging — failures
    /// surface at the offending op rather than at the final eval.
    Eager,
    /// Build the full graph, eval all outputs in one shot. Default.
    /// Lets MLX's optimizer schedule the whole DAG.
    #[default]
    Lazy,
    /// Build the full graph and `async_eval` the outputs, but don't
    /// wait for completion. Used by `commit_no_wait` to amortize sync
    /// latency across pipelined runs.
    AsyncCommit,
    /// Compile the graph once via `mlx::compile` and replay the
    /// optimized trace on every subsequent `run()`. First call pays
    /// the trace cost; subsequent calls skip the per-run rebuild.
    Compiled,
}

/// What kind of host-side data each leaf node needs. Built once at
/// compile time; re-used at run time to materialize MLX leaves in the
/// same order across calls (essential for the mlx::compile path —
/// position determines which placeholder the compiled trace expects).
///
/// Values are produced by `leaf_order` / `compile_leaf_order`, always
/// paired with the `NodeId` of the leaf they describe.
#[derive(Debug, Clone)]
pub enum LeafKey {
    /// An `Op::Input` leaf; carries a copy of the input's name.
    Input(String),
    /// An `Op::Param` leaf; carries a copy of the param's name.
    Param(String),
    /// An `Op::Constant` leaf. Carries no payload: the node id is
    /// implicit from the `NodeId` it is paired with in `leaf_order`.
    Constant, // node id is implicit from leaf_order's NodeId
}

/// List every leaf of `graph` with the kind of host data it needs.
///
/// Nodes are visited in the order `graph.nodes()` yields them (the
/// graph's topo / declaration order). Each `Op::Input`, `Op::Param` and
/// `Op::Constant` node contributes exactly one `(NodeId, LeafKey)` pair;
/// all other ops are skipped. Leaves that share a name are *not* merged
/// here — every node gets its own entry.
pub fn leaf_order(graph: &Graph) -> Vec<(NodeId, LeafKey)> {
    let mut leaves = Vec::new();
    for node in graph.nodes() {
        // Classify first, then record; anything that is not a leaf is
        // skipped outright.
        let key = match &node.op {
            Op::Input { name } => LeafKey::Input(name.clone()),
            Op::Param { name } => LeafKey::Param(name.clone()),
            Op::Constant { .. } => LeafKey::Constant,
            _ => continue,
        };
        leaves.push((node.id, key));
    }
    leaves
}

/// Positional compile/run inputs for the `mlx::compile` path.
///
/// Yields one slot for the first node of each distinct `Op::Input` name,
/// one slot for the first node of each distinct `Op::Param` name, and one
/// slot for every `Op::Constant` node. Input names and param names are
/// tracked independently of each other.
///
/// Graph builders often call `g.param("shared", …)` once per block,
/// producing many `NodeId`s with the same name; feeding each as a
/// separate mlx::compile leaf misbinds inputs on replay.
pub fn compile_leaf_order(graph: &Graph) -> Vec<(NodeId, LeafKey)> {
    let mut input_names = HashSet::new();
    let mut param_names = HashSet::new();
    let mut slots = Vec::new();
    for node in graph.nodes() {
        let key = match &node.op {
            Op::Input { name } => {
                // `insert` returns false when the name was already seen.
                if !input_names.insert(name.clone()) {
                    continue;
                }
                LeafKey::Input(name.clone())
            }
            Op::Param { name } => {
                if !param_names.insert(name.clone()) {
                    continue;
                }
                LeafKey::Param(name.clone())
            }
            // Constants are never de-duplicated.
            Op::Constant { .. } => LeafKey::Constant,
            _ => continue,
        };
        slots.push((node.id, key));
    }
    slots
}

/// If `graph` contains an op whose MLX lowering eagerly evaluates a
/// tensor on the host (`to_f32` / `to_bytes`), return a short label
/// for the first offender. MLX's `mlx::compile` callback forbids
/// host eval; entering Compiled mode on such a graph triggers the
/// `[eval] Attempting to eval an array during function
/// transformations…` panic. Backends should check this up front and
/// fall back to Lazy.
pub fn first_host_eval_op(graph: &Graph) -> Option<&'static str> {
    for node in graph.nodes() {
        match &node.op {
            Op::DequantMatMul { scheme }
                if (scheme.is_gguf() || matches!(scheme, rlx_ir::QuantScheme::Nvfp4Block)) =>
            {
                return Some("DequantMatMul[GGUF|NVFP4] (host dequant)");
            }
            Op::DequantGroupedMatMul { scheme } if scheme.is_gguf() => {
                return Some("DequantGroupedMatMul[GGUF] (host dequant)");
            }
            Op::GaussianSplatRender { .. } => return Some("GaussianSplatRender (host kernel)"),
            Op::GaussianSplatRenderBackward { .. } => {
                return Some("GaussianSplatRenderBackward (host kernel)");
            }
            Op::LogMel | Op::LogMelBackward => return Some("LogMel (host filterbank)"),
            Op::WelchPeaks { .. } => return Some("WelchPeaks (host PSD top-K)"),
            Op::Custom { .. } => return Some("Custom (host kernel)"),
            Op::RngNormal { .. } | Op::RngUniform { .. } => {
                return Some("RngNormal/RngUniform (host fill)");
            }
            _ => {}
        }
    }
    None
}

/// Fan canonical leaf arrays out to every duplicate Input/Param `NodeId`.
///
/// `env` is expected to hold an array for each canonical leaf reported by
/// `compile_leaf_order`. Every Input/Param node of `graph` that is not yet
/// in `env` receives a `clone_handle` of the array bound to the canonical
/// node with the same name. Nodes already present in `env` are left
/// untouched, and non-leaf nodes are ignored.
///
/// # Errors
/// Returns `MlxError` when an unbound Input/Param has no canonical node or
/// that canonical node is itself missing from `env`, when an
/// `Op::Constant` node is not bound (constants are never fanned out), or
/// when `clone_handle` fails.
pub fn expand_leaf_env(
    graph: &Graph,
    mut env: HashMap<NodeId, Array>,
) -> Result<HashMap<NodeId, Array>, MlxError> {
    // name → canonical NodeId, kept separately for inputs and params.
    let mut canon_input: HashMap<String, NodeId> = HashMap::new();
    let mut canon_param: HashMap<String, NodeId> = HashMap::new();
    for (id, key) in compile_leaf_order(graph) {
        let (table, name) = match key {
            LeafKey::Input(name) => (&mut canon_input, name),
            LeafKey::Param(name) => (&mut canon_param, name),
            LeafKey::Constant => continue,
        };
        table.insert(name, id);
    }

    for node in graph.nodes() {
        if env.contains_key(&node.id) {
            continue;
        }
        // Inputs and params are handled identically apart from the lookup
        // table and the word used in error messages.
        let (kind, name, table) = match &node.op {
            Op::Input { name } => ("input", name, &canon_input),
            Op::Param { name } => ("param", name, &canon_param),
            Op::Constant { .. } => {
                return Err(MlxError(format!(
                    "expand_leaf_env: constant leaf {:?} not bound",
                    node.id
                )));
            }
            _ => continue,
        };
        let canon = *table.get(name).ok_or_else(|| {
            MlxError(format!("expand_leaf_env: missing canonical {kind} '{name}'"))
        })?;
        let handle = env
            .get(&canon)
            .ok_or_else(|| {
                MlxError(format!("expand_leaf_env: canonical {kind} '{name}' unbound"))
            })?
            .clone_handle()?;
        env.insert(node.id, handle);
    }
    Ok(env)
}

/// Reconcile a host f32 buffer with the element count of a leaf's shape.
///
/// * `data.len()` equals the product of `shape` — returns a copy of `data`.
/// * `data` holds exactly one value and the product is greater than one —
///   returns that value repeated `product` times. This covers the case
///   where vmap left a shared `[1]` binding but the lifted node is
///   `[B, …]`.
/// * anything else — returns an `MlxError` naming the leaf (`name`), the
///   host length, the shape and its product.
///
/// An empty `shape` has product 1.
pub(crate) fn broadcast_leaf_data(
    name: &str,
    data: &[f32],
    shape: &[usize],
) -> Result<Vec<f32>, MlxError> {
    let product = shape.iter().product::<usize>();
    match data {
        // Exact fit is checked first so a one-element buffer for a
        // one-element shape is copied, not "broadcast".
        _ if data.len() == product => Ok(data.to_vec()),
        [scalar] if product > 1 => Ok(vec![*scalar; product]),
        _ => Err(MlxError(format!(
            "leaf '{name}': host len {} != shape {shape:?} product {product}",
            data.len()
        ))),
    }
}

/// Build the MLX leaf array for node `id` of `graph`.
///
/// Data sources, in order of preference:
/// * `Op::Input` — a handle from `gpu_inputs` (when supplied and it holds
///   the name), then typed bytes from `inputs_typed`, then the f32 buffer
///   in `inputs`.
/// * `Op::Param` — typed bytes from `params_typed`, then the f32 buffer in
///   `params`.
/// * `Op::Constant` — the bytes embedded in the node.
///
/// The typed path uses `Array::from_bytes` for zero-widen F16/BF16 / I32
/// leaves and requires the entry's dtype to equal the node's dtype. f32
/// host buffers go through `broadcast_leaf_data` before being wrapped.
///
/// # Errors
/// Returns `MlxError` when a named input/param is absent from every map,
/// when a typed entry's dtype differs from the graph's, when the host
/// buffer length cannot be matched to the shape, or when `id` is not an
/// Input/Param/Constant node. Errors from the `Array` constructors are
/// passed through.
// NOTE(review): every dim goes through `unwrap_static()` before the op is
// inspected — presumably this panics on a dynamic dim; confirm callers
// only pass fully static graphs.
pub fn build_leaf_for(
    graph: &Graph,
    id: NodeId,
    params: &HashMap<String, Vec<f32>>,
    inputs: &HashMap<String, Vec<f32>>,
    params_typed: &HashMap<String, (Vec<u8>, DType)>,
    inputs_typed: &HashMap<String, (Vec<u8>, DType)>,
    gpu_inputs: Option<&HashMap<String, Array>>,
) -> Result<Array, MlxError> {
    let node = graph.node(id);
    let shape: Vec<usize> = node
        .shape
        .dims()
        .iter()
        .map(|d| d.unwrap_static())
        .collect();
    let dtype = node.shape.dtype();

    match &node.op {
        Op::Input { name } => {
            // A caller-supplied array handle wins over any host data.
            if let Some(resident) = gpu_inputs.and_then(|bound| bound.get(name)) {
                return resident.clone_handle();
            }
            match inputs_typed.get(name) {
                Some((_, dt)) if *dt != dtype => Err(MlxError(format!(
                    "typed input '{name}' dtype {dt:?} doesn't match graph's {dtype:?}"
                ))),
                Some((bytes, _)) => Array::from_bytes(bytes, &shape, dtype),
                None => {
                    let host = inputs
                        .get(name)
                        .ok_or_else(|| MlxError(format!("missing input '{name}'")))?;
                    let host = broadcast_leaf_data(name, host, &shape)?;
                    Array::from_f32_slice(&host, &shape, dtype)
                }
            }
        }
        Op::Param { name } => match params_typed.get(name) {
            Some((_, dt)) if *dt != dtype => Err(MlxError(format!(
                "typed param '{name}' dtype {dt:?} doesn't match graph's {dtype:?}"
            ))),
            Some((bytes, _)) => Array::from_bytes(bytes, &shape, dtype),
            None => {
                let host = params
                    .get(name)
                    .ok_or_else(|| MlxError(format!("missing param '{name}'")))?;

                // Optional zero-copy path, opt-in via
                // `RLX_MLX_PARAM_VIEW=1`: wrap the caller-owned f32 buffer
                // as a view instead of copying it. Only considered for F32
                // leaves whose host length already equals the shape
                // product (no broadcast needed).
                //
                // This used to be the default. Holding an MLX Array as a
                // *view* over a host Vec across multiple `compiled.run()`
                // calls broke parity on Gemma 4 E2B QAT — the same prefill
                // graph re-invoked with a new `input_ids[5]` returned
                // stale logits (the value MLX computed on the previous
                // call, i.e. position-5 of the original prompt) while a
                // freshly compiled executable returned the correct value.
                // MLX's internal compile-trace cache appears to retain a
                // reference to the view past the eval barrier, so the
                // copying path below is the default; callers that have
                // proven their lifetime story can re-enable the view.
                let use_view = dtype == DType::F32
                    && host.len() == shape.iter().product::<usize>()
                    && std::env::var("RLX_MLX_PARAM_VIEW").as_deref() == Ok("1");
                if use_view {
                    // SAFETY: the buffer lives in `MlxExecutable::params`
                    // (a HashMap of Vec<f32>) which outlives the Array,
                    // and the executable syncs MLX evaluation in
                    // `run_internal` before returning, so `set_param`
                    // mutations happen after MLX is done with the Array.
                    // That reasoning covers `set_param`-driven mutation
                    // only — not the trace-cache retention described
                    // above — which is why this path is opt-in.
                    return unsafe { Array::from_f32_slice_view(host, &shape) };
                }

                let host = broadcast_leaf_data(name, host, &shape)?;
                Array::from_f32_slice(&host, &shape, dtype)
            }
        },
        Op::Constant { data } => {
            // Constants are little-endian raw bytes in the node's dtype.
            // Every dtype rlx-ir declares has a native MLX counterpart, so
            // `from_bytes` handles the typed read directly. F32 is decoded
            // on the host first because that matches the prior behavior
            // bit-for-bit; trailing bytes beyond a multiple of four are
            // ignored by the decode.
            if dtype == DType::F32 {
                let floats: Vec<f32> = (0..data.len() / 4)
                    .map(|i| {
                        let at = i * 4;
                        f32::from_le_bytes([data[at], data[at + 1], data[at + 2], data[at + 3]])
                    })
                    .collect();
                Array::from_f32_slice(&floats, &shape, dtype)
            } else {
                Array::from_bytes(data, &shape, dtype)
            }
        }
        other => Err(MlxError(format!("build_leaf called on non-leaf {other:?}"))),
    }
}

/// Lower a sub-graph (then/else branch of `Op::If`, or body/cond of
/// `Op::While`). Captures bind positionally: the i-th `Op::Input` in
/// the sub-graph (in topo order) is bound to `captures[i]`. Params
/// look up in the parent's `params` / `params_typed` by name. Every
/// leaf array gets a fresh `clone_handle` so the parent's ownership
/// is undisturbed.
pub fn lower_subgraph(
    sub: &Graph,
    captures: &[&Array],
    parent_params: &HashMap<String, Vec<f32>>,
    parent_params_typed: &HashMap<String, (Vec<u8>, DType)>,
    rng: rlx_ir::RngOptions,
) -> Result<Vec<Array>, MlxError> {
    let mut sub_env: HashMap<NodeId, Array> = HashMap::with_capacity(sub.nodes().len());
    let mut canon_param: HashMap<String, NodeId> = HashMap::new();

    let mut input_idx = 0;
    for node in sub.nodes() {
        match &node.op {
            Op::Input { name } => {
                if input_idx >= captures.len() {
                    return Err(MlxError(format!(
                        "sub-graph has more Op::Input nodes than parent supplied \
                         captures (input #{input_idx} = {name:?})"
                    )));
                }
                sub_env.insert(node.id, captures[input_idx].clone_handle()?);
                input_idx += 1;
            }
            Op::Param { name } => {
                if let Some(&canon_id) = canon_param.get(name) {
                    let arr = sub_env.get(&canon_id).ok_or_else(|| {
                        MlxError(format!("sub-graph canonical param '{name}' missing"))
                    })?;
                    sub_env.insert(node.id, arr.clone_handle()?);
                    continue;
                }
                let leaf = if let Some((bytes, dt)) = parent_params_typed.get(name) {
                    let shape: Vec<usize> = node
                        .shape
                        .dims()
                        .iter()
                        .map(|d| d.unwrap_static())
                        .collect();
                    Array::from_bytes(bytes, &shape, *dt)?
                } else if let Some(data) = parent_params.get(name) {
                    let shape: Vec<usize> = node
                        .shape
                        .dims()
                        .iter()
                        .map(|d| d.unwrap_static())
                        .collect();
                    let dtype = node.shape.dtype();
                    Array::from_f32_slice(data, &shape, dtype)?
                } else {
                    return Err(MlxError(format!(
                        "sub-graph param '{name}' not found in parent's param maps"
                    )));
                };
                canon_param.insert(name.clone(), node.id);
                sub_env.insert(node.id, leaf);
            }
            Op::Constant { data } => {
                let shape: Vec<usize> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static())
                    .collect();
                let dtype = node.shape.dtype();
                let leaf = match dtype {
                    DType::F32 => {
                        let n = data.len() / 4;
                        let mut buf = Vec::with_capacity(n);
                        for i in 0..n {
                            let bytes = [
                                data[i * 4],
                                data[i * 4 + 1],
                                data[i * 4 + 2],
                                data[i * 4 + 3],
                            ];
                            buf.push(f32::from_le_bytes(bytes));
                        }
                        Array::from_f32_slice(&buf, &shape, dtype)?
                    }
                    _ => Array::from_bytes(data, &shape, dtype)?,
                };
                sub_env.insert(node.id, leaf);
            }
            _ => {} // non-leaf: handled by lower_with_env
        }
    }

    if input_idx < captures.len() {
        // More captures than the sub-graph used. Not necessarily an
        // error — extra captures may have been provided "in case" —
        // but worth a debug-friendly note. For now silently allow.
    }

    lower_with_env(sub, sub_env, parent_params, parent_params_typed, rng)
}

/// Walk `graph` with `env` already populated for every leaf node
/// (Input/Param/Constant). Internal nodes are dispatched to ops::* in
/// topological order; the resulting Array is inserted into `env`.
/// Returns the arrays for `graph.outputs`.
///
/// The eval semantics are the caller's responsibility — this function
/// only constructs the symbolic chain. `params` / `params_typed` are
/// the parent-scope param maps; they're needed only for ops that
/// recurse into sub-graphs (Op::If, Op::While) — sub-graph leaves
/// look them up by name. Pass empty maps for trace contexts that
/// don't see sub-graphs.
pub fn lower_with_env(
    graph: &Graph,
    mut env: HashMap<NodeId, Array>,
    params: &HashMap<String, Vec<f32>>,
    params_typed: &HashMap<String, (Vec<u8>, DType)>,
    rng: rlx_ir::RngOptions,
) -> Result<Vec<Array>, MlxError> {
    let debug_eval = std::env::var("RLX_MLX_DEBUG_EVAL").is_ok();
    if debug_eval {
        eprintln!("rlx-mlx: lower_with_env {} nodes", graph.nodes().len());
    }
    for node in graph.nodes() {
        let id = node.id;
        if env.contains_key(&id) {
            // Pre-populated leaf — already bound by the caller.
            continue;
        }
        if !node.shape.dims().iter().all(|d| d.is_static()) {
            return Err(MlxError(format!(
                "MLX backend: dynamic shapes not yet supported (node {:?})",
                node.id
            )));
        }

        let arr = match &node.op {
            // Leaves should have been pre-bound by the caller; if we
            // see one here it means env was incomplete.
            Op::Input { .. } | Op::Param { .. } | Op::Constant { .. } => {
                return Err(MlxError(format!(
                    "lower_with_env: leaf node {id:?} not bound in env"
                )));
            }

            Op::MatMul => {
                let a = lookup(&env, node.inputs[0])?;
                let b = lookup(&env, node.inputs[1])?;
                let graph_a = node_input_shape(graph, node.inputs[0]);
                let graph_out = node_input_shape(graph, node.id);
                let a = flatten_matmul_lhs_if_needed(a, &graph_a, &graph_out)?;
                ops::matmul(&a, b).map_err(|e| {
                    let name = node.name.as_deref().unwrap_or("?");
                    MlxError(format!(
                        "MatMul {name}: {e} (lhs={:?}, rhs={:?})",
                        a.shape(),
                        b.shape()
                    ))
                })?
            }
            // Dense linear solve. MLX's linalg::solve handles the
            // rank-2 single-system case directly. For rlx's
            // `Op::BatchedDenseSolve` (A: [B, n, n], b: [B, n] →
            // x: [B, n]) we adapt to MLX's multi-RHS convention:
            // MLX treats a rank-2 `b` as `[n, k]` (k right-hand
            // sides), not `[B, n]`. So we reshape b to `[B, n, 1]`
            // before the solve and squeeze the trailing 1 back off
            // afterwards. Same shim entry point covers both ops.
            // Dtype must be f32 or f64 (validated by MLX upstream).
            //
            // Caveat: the C++ shim pins this to MLX's CPU stream because
            // MLX-GPU linalg::solve isn't implemented yet upstream. Op
            // still lives in the lazy graph (no host roundtrip; fuses
            // with surrounding ops on either side), but the LU runs on
            // CPU LAPACK. When MLX adds a Metal solve, the shim's stream
            // pin can be dropped — no change here.
            Op::DenseSolve => {
                let a = lookup(&env, node.inputs[0])?;
                let b = lookup(&env, node.inputs[1])?;
                ops::solve(a, b)?
            }
            Op::BatchedDenseSolve => {
                let a = lookup(&env, node.inputs[0])?;
                let b = lookup(&env, node.inputs[1])?;
                let b_shape: Vec<i32> = node_input_shape(graph, node.inputs[1]);
                let n = if b_shape.len() >= 2 {
                    b_shape[1] as usize
                } else {
                    0
                };
                let dtype = node.shape.dtype();

                // Custom Metal LU+solve kernel — runs on the Apple GPU,
                // dispatches one threadgroup per batch element. Bound by
                // threadgroup memory at f32: NMAX² + NMAX ≤ 32 KB ⇒
                // n ≤ 90. Falls back to MLX-CPU `linalg::solve` outside
                // the supported envelope (n > 90, or non-f32 dtype).
                if dtype == DType::F32 && n > 0 && n <= 90 {
                    static REGISTER_KERNELS: std::sync::Once = std::sync::Once::new();
                    REGISTER_KERNELS.call_once(crate::batched_lu_kernel::register);

                    if let Some(kernel) =
                        crate::op_registry::lookup_mlx_kernel(crate::batched_lu_kernel::KERNEL_NAME)
                    {
                        let out_shape = node.shape.clone();
                        // Errors here propagate as a backend failure.
                        // Don't silently fall back — that would mask
                        // bugs in the kernel, which is worse than a
                        // loud error since the fallback exists for
                        // numerical/capability reasons, not for kernel
                        // correctness regressions.
                        kernel.execute(&[a, b], &out_shape, &[])?
                    } else {
                        // Registry returned None — should be
                        // impossible after call_once, but stay safe.
                        let mut shape_b1 = b_shape.clone();
                        shape_b1.push(1);
                        let b_un = ops::reshape(b, &shape_b1)?;
                        let solved = ops::solve(a, &b_un)?;
                        ops::reshape(&solved, &b_shape)?
                    }
                } else {
                    // Fallback path: MLX's linalg::solve on the CPU
                    // stream. MLX expects rank-3 b for batched solve
                    // (multi-RHS form), so reshape [B,n] ↔ [B,n,1].
                    let mut shape_b1 = b_shape.clone();
                    shape_b1.push(1);
                    let b_un = ops::reshape(b, &shape_b1)?;
                    let solved = ops::solve(a, &b_un)?;
                    ops::reshape(&solved, &b_shape)?
                }
            }
            Op::DotGeneral {
                lhs_contracting,
                rhs_contracting,
                lhs_batch,
                rhs_batch,
            } => {
                // General case: permute each operand into [batch...,
                // outer..., contracting...] (or [batch..., contracting...,
                // outer...] for rhs), reshape to [B, M, K] / [B, K, N],
                // run a batched matmul, reshape back to the declared
                // output shape. The canonical 2D pattern (no batch,
                // contract lhs[1] × rhs[0]) reduces to a plain MatMul
                // through this same code path.
                let lhs = lookup(&env, node.inputs[0])?;
                let rhs = lookup(&env, node.inputs[1])?;
                let lhs_shape = node_input_shape(graph, node.inputs[0]);
                let rhs_shape = node_input_shape(graph, node.inputs[1]);

                // Compute "outer" axes (everything that's not batch and
                // not contracting) for each operand.
                let lhs_outer: Vec<usize> = (0..lhs_shape.len())
                    .filter(|i| !lhs_batch.contains(i) && !lhs_contracting.contains(i))
                    .collect();
                let rhs_outer: Vec<usize> = (0..rhs_shape.len())
                    .filter(|i| !rhs_batch.contains(i) && !rhs_contracting.contains(i))
                    .collect();

                // Permutations: lhs → [batch..., outer..., contracting...];
                // rhs → [batch..., contracting..., outer...].
                let mut lhs_perm: Vec<i32> = Vec::with_capacity(lhs_shape.len());
                for &b in lhs_batch {
                    lhs_perm.push(b as i32);
                }
                for &o in &lhs_outer {
                    lhs_perm.push(o as i32);
                }
                for &c in lhs_contracting {
                    lhs_perm.push(c as i32);
                }

                let mut rhs_perm: Vec<i32> = Vec::with_capacity(rhs_shape.len());
                for &b in rhs_batch {
                    rhs_perm.push(b as i32);
                }
                for &c in rhs_contracting {
                    rhs_perm.push(c as i32);
                }
                for &o in &rhs_outer {
                    rhs_perm.push(o as i32);
                }

                let lhs_p = ops::transpose(lhs, &lhs_perm)?;
                let rhs_p = ops::transpose(rhs, &rhs_perm)?;

                // Compute B/M/K/N. Batch dims must match between lhs and
                // rhs by definition of DotGeneral.
                let dim_prod = |shape: &[i32], idxs: &[usize]| -> i32 {
                    idxs.iter().map(|&i| shape[i]).product::<i32>().max(1)
                };
                let big_b = dim_prod(&lhs_shape, lhs_batch);
                let big_m = dim_prod(&lhs_shape, &lhs_outer);
                let big_k = dim_prod(&lhs_shape, lhs_contracting);
                let big_n = dim_prod(&rhs_shape, &rhs_outer);

                let lhs_3d = ops::reshape(&lhs_p, &[big_b, big_m, big_k])?;
                let rhs_3d = ops::reshape(&rhs_p, &[big_b, big_k, big_n])?;

                // Batched matmul. MLX's matmul supports rank-3 batched
                // matmul natively.
                let mm = ops::matmul(&lhs_3d, &rhs_3d)?;

                // Reshape back to the declared output shape so downstream
                // consumers see exactly what the IR's shape inference
                // promised.
                let out_shape: Vec<i32> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static() as i32)
                    .collect();
                ops::reshape(&mm, &out_shape)?
            }
            Op::Binary(bop) => {
                let a = lookup(&env, node.inputs[0])?;
                let b = lookup(&env, node.inputs[1])?;
                let (a, b) = mlx_align_rank3_seq_pair(a, b)?;
                match bop {
                    BinaryOp::Add => ops::add(&a, &b)?,
                    BinaryOp::Mul => ops::mul(&a, &b)?,
                    BinaryOp::Sub => ops::sub(&a, &b)?,
                    BinaryOp::Div => ops::div(&a, &b)?,
                    BinaryOp::Max => ops::max(&a, &b)?,
                    BinaryOp::Min => ops::min(&a, &b)?,
                    BinaryOp::Pow => ops::pow(&a, &b)?,
                }
            }
            Op::Compare(cop) => {
                let a = lookup(&env, node.inputs[0])?;
                let b = lookup(&env, node.inputs[1])?;
                let (a, b) = mlx_align_rank3_seq_pair(a, b)?;
                match cop {
                    CmpOp::Eq => ops::eq(&a, &b)?,
                    CmpOp::Ne => ops::ne(&a, &b)?,
                    CmpOp::Lt => ops::lt(&a, &b)?,
                    CmpOp::Le => ops::le(&a, &b)?,
                    CmpOp::Gt => ops::gt(&a, &b)?,
                    CmpOp::Ge => ops::ge(&a, &b)?,
                }
            }
            Op::Where => {
                let c = lookup(&env, node.inputs[0])?;
                let x = lookup(&env, node.inputs[1])?;
                let y = lookup(&env, node.inputs[2])?;
                let (c, x) = mlx_align_rank3_seq_pair(c, x)?;
                let (x, y) = mlx_align_rank3_seq_pair(&x, y)?;
                ops::select(&c, &x, &y)?
            }
            Op::TransformRegion { steps, .. } => {
                let mut cur = lookup(&env, node.inputs[0])?.clone_handle()?;
                for step in steps {
                    match step {
                        TransformStep::ResizeNearest2x(_) => {
                            cur = ops::resize_nearest_2x_nchw(&cur)?;
                        }
                    }
                }
                cur
            }
            Op::BatchElementwiseRegion {
                chain,
                num_batch_inputs,
                scalar_input_mask: _,
                input_modulus: _,
                prologue,
                prologue_input: _,
            } => {
                let n = *num_batch_inputs as usize;
                if node.inputs.len() != n {
                    return Err(MlxError(format!(
                        "BatchElementwiseRegion: declared {n} batch inputs but node has {}",
                        node.inputs.len()
                    )));
                }
                let mut slices = Vec::with_capacity(n);
                for &in_id in &node.inputs {
                    slices.push(eval_elementwise_region_on_inputs(
                        &env,
                        std::slice::from_ref(&in_id),
                        chain,
                        *prologue,
                    )?);
                }
                let refs: Vec<&Array> = slices.iter().collect();
                ops::concat(&refs, 0)?
            }
            Op::ElementwiseRegion {
                chain,
                num_inputs,
                scalar_input_mask: _,
                input_modulus: _,
                prologue,
                prologue_input: _,
            } => {
                // PLAN L2: native MLX lowering — see `eval_elementwise_region_on_inputs`.
                // `scalar_input_mask` / `input_modulus` are for interpreted GPU kernels.
                let n_in = *num_inputs as usize;
                let arity_ok = node.inputs.len() == n_in;
                if !arity_ok {
                    return Err(MlxError(format!(
                        "ElementwiseRegion: declared {n_in} inputs but node has {}",
                        node.inputs.len()
                    )));
                }
                // Whole input list goes through the chain in a single call.
                eval_elementwise_region_on_inputs(&env, &node.inputs, chain, *prologue)?
            }
            Op::Activation(act) => {
                // GELU / GELU-approx / SiLU go through their own `ops` entry
                // points; everything else dispatches through `ops::unary`.
                let input = lookup(&env, node.inputs[0])?;
                let activated = match act {
                    Activation::Gelu => ops::gelu(input),
                    Activation::GeluApprox => ops::gelu_approx(input),
                    Activation::Silu => ops::silu(input),
                    Activation::Relu => ops::unary(input, MlxUnary::Relu),
                    Activation::Sigmoid => ops::unary(input, MlxUnary::Sigmoid),
                    Activation::Tanh => ops::unary(input, MlxUnary::Tanh),
                    Activation::Exp => ops::unary(input, MlxUnary::Exp),
                    Activation::Log => ops::unary(input, MlxUnary::Log),
                    Activation::Sqrt => ops::unary(input, MlxUnary::Sqrt),
                    Activation::Rsqrt => ops::unary(input, MlxUnary::Rsqrt),
                    Activation::Neg => ops::unary(input, MlxUnary::Neg),
                    Activation::Abs => ops::unary(input, MlxUnary::Abs),
                    Activation::Round => ops::unary(input, MlxUnary::Round),
                    Activation::Sin => ops::unary(input, MlxUnary::Sin),
                    Activation::Cos => ops::unary(input, MlxUnary::Cos),
                    Activation::Tan => ops::unary(input, MlxUnary::Tan),
                    Activation::Atan => ops::unary(input, MlxUnary::Atan),
                };
                activated?
            }
            Op::Cast { to } => {
                // Convert the sole input to the requested dtype.
                let source = lookup(&env, node.inputs[0])?;
                let target_dtype = *to;
                ops::cast(source, target_dtype)?
            }
            Op::Softmax { axis } => {
                // Softmax of the sole input along the IR-specified axis.
                let logits = lookup(&env, node.inputs[0])?;
                let along = *axis;
                ops::softmax(logits, along)?
            }
            Op::LayerNorm { eps, .. } => {
                // Inputs: [x, gamma, (beta)]. The bias is optional and only
                // present when the node carries a third input.
                let input = lookup(&env, node.inputs[0])?;
                let gamma = mlx_norm_scale_1d(lookup(&env, node.inputs[1])?)?;
                let beta = match node.inputs.get(2) {
                    Some(&beta_id) => Some(mlx_norm_scale_1d(lookup(&env, beta_id)?)?),
                    None => None,
                };
                ops::layer_norm(input, &gamma, beta.as_ref(), *eps)?
            }
            Op::Reshape { new_shape } => {
                // The IR target shape is reconciled against the runtime shape
                // by `mlx_fix_reshape_shape` before the reshape is issued.
                let source = lookup(&env, node.inputs[0])?;
                let runtime_dims = source.shape()?;
                let target = mlx_fix_reshape_shape(&runtime_dims, new_shape);
                ops::reshape(source, &target)?
            }
            Op::Transpose { perm } => {
                // `ops::transpose` takes the permutation as i32 axes.
                let source = lookup(&env, node.inputs[0])?;
                let mut axes: Vec<i32> = Vec::with_capacity(perm.len());
                axes.extend(perm.iter().map(|&axis| axis as i32));
                ops::transpose(source, &axes)?
            }
            Op::ResizeNearest2x => {
                // Delegates to the NCHW nearest-neighbour 2x helper.
                let image = lookup(&env, node.inputs[0])?;
                ops::resize_nearest_2x_nchw(image)?
            }
            Op::Narrow { axis, start, len } => {
                // Slice `[start, start + len)` along one axis and keep every
                // other axis whole. The IR axis is remapped because the
                // runtime rank can differ from the graph rank.
                let source = lookup(&env, node.inputs[0])?;
                let ir_rank = node_input_shape(graph, node.inputs[0]).len();
                let dims: Vec<i32> = source.shape()?.iter().map(|&d| d as i32).collect();
                let axis_rt = map_graph_axis_to_runtime(*axis, ir_rank, dims.len());
                let mut begin = vec![0i32; dims.len()];
                let mut end = dims;
                begin[axis_rt] = *start as i32;
                end[axis_rt] = (*start + *len) as i32;
                ops::slice(source, &begin, &end)?
            }
            Op::Concat { axis } => {
                // Resolve every operand, let `mlx_align_concat_inputs`
                // reconcile them, then concatenate along the IR axis.
                let mut operands: Vec<&Array> = Vec::with_capacity(node.inputs.len());
                for &id in node.inputs.iter() {
                    operands.push(lookup(&env, id)?);
                }
                let aligned = mlx_align_concat_inputs(&operands, *axis)?;
                let aligned_refs: Vec<&Array> = aligned.iter().collect();
                ops::concat(&aligned_refs, *axis as i32)?
            }
            Op::Expand { .. } => {
                // All expansion logic lives in `mlx_expand`; this arm only
                // resolves the source array.
                let source_id = node.inputs[0];
                let source = lookup(&env, source_id)?;
                mlx_expand(graph, source_id, node, source)?
            }
            Op::Gather { axis } => {
                // Inputs: [table, indices]. Indices pass through
                // `mlx_indices_i64` before the take.
                let table = lookup(&env, node.inputs[0])?;
                let indices = mlx_indices_i64(lookup(&env, node.inputs[1])?)?;
                ops::take(table, &indices, *axis as i32)?
            }
            Op::Reduce {
                op: rop,
                axes,
                keep_dim,
            } => {
                // One-to-one translation of the IR reduction kind to the FFI enum.
                let mlx_kind = match rop {
                    ReduceOp::Sum => MlxReduce::Sum,
                    ReduceOp::Mean => MlxReduce::Mean,
                    ReduceOp::Max => MlxReduce::Max,
                    ReduceOp::Min => MlxReduce::Min,
                    ReduceOp::Prod => MlxReduce::Prod,
                };
                let reduce_axes: Vec<i32> = axes.iter().map(|&axis| axis as i32).collect();
                let source = lookup(&env, node.inputs[0])?;
                ops::reduce(source, mlx_kind, &reduce_axes, *keep_dim)?
            }
            Op::Cumsum { axis, exclusive } => {
                // Cumulative sum along `axis`; `exclusive` is forwarded as-is.
                let source = lookup(&env, node.inputs[0])?;
                let (along, is_exclusive) = (*axis, *exclusive);
                ops::cumsum(source, along, is_exclusive)?
            }
            Op::Fft { inverse, norm } => {
                // The normalization mode crosses the FFI boundary as its tag.
                let signal = lookup(&env, node.inputs[0])?;
                let norm_tag = norm.tag();
                ops::fft(signal, *inverse, norm_tag)?
            }
            Op::LogMel => {
                // Host fallback: read both inputs back as f32, run the shared
                // `rlx_ir::audio` reference kernel, and upload the result.
                let spectrum = lookup(&env, node.inputs[0])?.to_f32()?;
                let mel_filters = lookup(&env, node.inputs[1])?.to_f32()?;
                let spectrum_shape = graph.node(node.inputs[0]).shape.clone();
                let filter_shape = graph.node(node.inputs[1]).shape.clone();
                let meta = rlx_ir::audio::log_mel_meta(&spectrum_shape, &filter_shape)
                    .map_err(MlxError)?;
                let mut mel = vec![0f32; meta.outer * meta.n_mels];
                rlx_ir::audio::log_mel_block_f32(
                    &spectrum,
                    &mel_filters,
                    meta.outer,
                    meta.n_fft,
                    meta.n_bins,
                    meta.n_mels,
                    &mut mel,
                );
                // The output shape comes from the node's declared (static) dims.
                let mut result_dims: Vec<usize> = Vec::new();
                for dim in node.shape.dims().iter() {
                    result_dims.push(dim.unwrap_static());
                }
                Array::from_f32_slice(&mel, &result_dims, DType::F32)?
            }
            Op::LogMelBackward => {
                // Host fallback for the log-mel VJP: inputs are
                // [spectrum, filters, upstream gradient], all read back as f32.
                let spectrum = lookup(&env, node.inputs[0])?.to_f32()?;
                let mel_filters = lookup(&env, node.inputs[1])?.to_f32()?;
                let upstream = lookup(&env, node.inputs[2])?.to_f32()?;
                let spectrum_shape = graph.node(node.inputs[0]).shape.clone();
                let filter_shape = graph.node(node.inputs[1]).shape.clone();
                let meta = rlx_ir::audio::log_mel_meta(&spectrum_shape, &filter_shape)
                    .map_err(MlxError)?;
                let mut grad_spectrum = vec![0f32; meta.outer * meta.n_fft * 2];
                rlx_ir::audio::log_mel_block_vjp(
                    &spectrum,
                    &mel_filters,
                    &upstream,
                    meta.outer,
                    meta.n_fft,
                    meta.n_bins,
                    meta.n_mels,
                    &mut grad_spectrum,
                );
                // The output shape comes from the node's declared (static) dims.
                let mut result_dims: Vec<usize> = Vec::new();
                for dim in node.shape.dims().iter() {
                    result_dims.push(dim.unwrap_static());
                }
                Array::from_f32_slice(&grad_spectrum, &result_dims, DType::F32)?
            }
            Op::WelchPeaks { k, n_segments } => {
                // Host fallback: read the spectrum back as f32 and run the
                // shared `rlx_ir::audio` Welch-peaks kernel.
                let spectrum = lookup(&env, node.inputs[0])?.to_f32()?;
                let spectrum_shape = graph.node(node.inputs[0]).shape.clone();
                let meta = rlx_ir::audio::welch_peaks_meta(&spectrum_shape, *k, *n_segments)
                    .map_err(MlxError)?;
                let mut peaks = vec![0f32; meta.welch_batch * meta.k * 2];
                rlx_ir::audio::welch_peaks_block_f32(
                    &spectrum,
                    meta.welch_batch,
                    meta.n_fft,
                    meta.n_segments,
                    meta.k,
                    &mut peaks,
                );
                // The output shape comes from the node's declared (static) dims.
                let mut result_dims: Vec<usize> = Vec::new();
                for dim in node.shape.dims().iter() {
                    result_dims.push(dim.unwrap_static());
                }
                Array::from_f32_slice(&peaks, &result_dims, DType::F32)?
            }
            Op::RmsNorm { eps, .. } => {
                // Inputs: [x, gamma]; gamma passes through `mlx_norm_scale_1d`.
                let input = lookup(&env, node.inputs[0])?;
                let gamma = mlx_norm_scale_1d(lookup(&env, node.inputs[1])?)?;
                ops::rms_norm(input, &gamma, *eps)?
            }
            Op::Attention {
                num_heads,
                head_dim,
                mask_kind,
                score_scale,
                attn_logit_softcap: _,
            } => {
                // Scaled dot-product attention over inputs [Q, K, V, (mask)].
                // NOTE(review): `attn_logit_softcap` is ignored by this
                // lowering — confirm no graph routed to MLX relies on it.
                //
                // MLX's fast::scaled_dot_product_attention expects Q/K/V
                // as rank-4 [B, H, S, D]. rlx callers may hand us either
                // that or rank-3 [B, S, H*D] (the un-split BERT-style
                // post-projection layout). For rank-3 we reshape +
                // transpose into [B, H, S, D] and back.
                let q_in = lookup(&env, node.inputs[0])?;
                let k_in = lookup(&env, node.inputs[1])?;
                let v_in = lookup(&env, node.inputs[2])?;
                let q_shape = node_input_shape(graph, node.inputs[0]);
                let k_shape = node_input_shape(graph, node.inputs[1]);

                let nh = *num_heads as i32;
                let hd = *head_dim as i32;
                // Respect `score_scale` when the IR specifies one — Gemma 4
                // sets `Some(1.0)` because Q is per-head RMS-normed before
                // attention, so the standard `1/sqrt(head_dim)` factor
                // crushes the scores (E2B head_dim=256 → 16× too small).
                // Without this the SWA attention output drifts from CPU and
                // every E2B greedy step diverges from HF on MLX.
                let scale = score_scale.unwrap_or_else(|| 1.0 / (hd as f32).sqrt());

                let q_ir = graph.node(node.inputs[0]).shape.clone();
                let k_ir = graph.node(node.inputs[1]).shape.clone();
                let geom = rlx_ir::attention_geom(&q_ir, &k_ir, *num_heads, *head_dim);
                // Rank-4 input that is not already [B, H, S, D]: the output
                // must be transposed back to the caller's layout at the end.
                let bshd_rank4 = q_shape.len() == 4 && !geom.bhsd;

                let to_bhsd = |t: &Array, sh: &[i32]| -> Result<Array, MlxError> {
                    if sh.len() == 4 {
                        if sh[1] == nh {
                            return t.clone_handle();
                        }
                        // [B, S, H, D] → [B, H, S, D]
                        let t = ops::transpose(t, &[0, 2, 1, 3])?;
                        // Materialize: mlx::compile elides transpose views otherwise
                        // (same issue as conv NHWC→NCHW in conv_compile_mode_repro).
                        return ops::contiguous(&t);
                    }
                    // [B, S, H*D] → [B, S, H, D] → [B, H, S, D]
                    let b = sh[0];
                    let s = sh[1];
                    let r = ops::reshape(t, &[b, s, nh, hd])?;
                    let t = ops::transpose(&r, &[0, 2, 1, 3])?;
                    ops::contiguous(&t)
                };
                // Sequence length of an input, using the same layout decision
                // as `to_bhsd`: a rank-4 shape whose axis 1 is not the head
                // count is [B, S, H, D], so S sits at axis 1. Otherwise
                // ([B, H, S, D] or rank-3 [B, S, H*D]) S is the
                // second-to-last axis.
                let seq_len = |sh: &[i32]| -> i32 {
                    if sh.len() == 4 && sh[1] != nh {
                        sh[1]
                    } else {
                        sh[sh.len() - 2]
                    }
                };
                let q = to_bhsd(q_in, &q_shape)?;
                let k = to_bhsd(k_in, &k_shape)?;
                let v = to_bhsd(v_in, &node_input_shape(graph, node.inputs[2]))?;

                // Mask must promote to Q/output dtype — MLX's SDPA
                // rejects an f32 mask when Q is f16/bf16. AutoMixed
                // promotes Q/K/V but masks aren't tagged in the
                // precision pass, so cast at the dispatch site.
                let q_dtype = graph.node(node.inputs[0]).shape.dtype();

                // Reshape an arbitrary-rank mask into a 4-D shape SDPA
                // can broadcast against [B, H, S_q, S_k]:
                //   rank 2 [B, S]          → [B, 1, 1, S]
                //   rank 3 [B, S_q, S_k]   → [B, 1, S_q, S_k]
                //   rank 4 [...]           → pass through
                let normalize_mask = |m: &Array, m_shape: &[i32]| -> Result<Array, MlxError> {
                    match m_shape.len() {
                        2 => ops::reshape(m, &[m_shape[0], 1, 1, m_shape[1]]),
                        3 => ops::reshape(m, &[m_shape[0], 1, m_shape[1], m_shape[2]]),
                        _ => m.clone_handle(),
                    }
                };

                // FFI mask kind plus the materialized mask array, if any.
                let (mask_kind_ffi, mask_owned) = match mask_kind {
                    MaskKind::None => (MlxMask::None, None),
                    MaskKind::Causal => (MlxMask::Causal, None),
                    MaskKind::Custom => {
                        // MLX SDPA adds the mask additively to scores. The
                        // burnembed BERT graph (and the CPU/Metal/wgpu
                        // backends) interpret MaskKind::Custom as a *binary*
                        // multiplicative mask (1 = valid, 0 = padding).
                        // Convert here so MLX matches the rest of the
                        // workspace: additive = (mask - 1) * 1e9 → 0 when
                        // valid, -1e9 when padded.
                        let m = lookup(&env, node.inputs[3])?;
                        let m_shape = node_input_shape(graph, node.inputs[3]);
                        let one = Array::from_f32_slice(&[1.0], &[1], q_dtype)?;
                        let scl = Array::from_f32_slice(&[1.0e9], &[1], q_dtype)?;
                        let m_cast = if q_dtype != DType::F32 {
                            ops::cast(m, q_dtype)?
                        } else {
                            m.clone_handle()?
                        };
                        let shifted = ops::sub(&m_cast, &one)?;
                        let additive = ops::mul(&shifted, &scl)?;
                        (
                            MlxMask::Custom,
                            Some(normalize_mask(&additive, &m_shape)?),
                        )
                    }
                    MaskKind::SlidingWindow(window) => {
                        // Take S_q / S_k from the layout-aware helper. Reading
                        // `shape[len - 2]` directly would pick the head axis
                        // for rank-4 [B, S, H, D] inputs.
                        let s_q = seq_len(&q_shape);
                        let s_k = seq_len(&k_shape);
                        let m = build_sliding_window_mask(s_q, s_k, *window as i32)?;
                        // build_sliding_window_mask returns rank-2; normalize.
                        let m4 = ops::reshape(&m, &[1, 1, s_q, s_k])?;
                        let m4 = if q_dtype != DType::F32 {
                            ops::cast(&m4, q_dtype)?
                        } else {
                            m4
                        };
                        (MlxMask::Custom, Some(m4))
                    }
                    MaskKind::Bias => {
                        // Bias mask = raw additive bias tensor on the 4th input. Pass
                        // through unmodified — MLX SDPA already adds it to scores.
                        let m = lookup(&env, node.inputs[3])?;
                        let m_shape = node_input_shape(graph, node.inputs[3]);
                        let m_cast = if q_dtype != DType::F32 {
                            ops::cast(m, q_dtype)?
                        } else {
                            m.clone_handle()?
                        };
                        (
                            MlxMask::Custom,
                            Some(normalize_mask(&m_cast, &m_shape)?),
                        )
                    }
                };
                let m_ref: Option<&Array> = mask_owned.as_ref();
                // `RLX_MLX_SDPA_REFERENCE` switches to the reference attention
                // path instead of `ops::attention`.
                let attn_out = if rlx_ir::env::flag("RLX_MLX_SDPA_REFERENCE") {
                    ops::attention_reference_bhsd(&q, &k, &v, scale, m_ref)?
                } else {
                    ops::attention(&q, &k, &v, scale, mask_kind_ffi, m_ref)?
                };

                // Restore the caller's layout.
                if q_shape.len() == 3 {
                    // [B, H, S, D] → [B, S, H, D] → [B, S, H*D]
                    let b = q_shape[0];
                    let s = q_shape[1];
                    let bsd = ops::transpose(&attn_out, &[0, 2, 1, 3])?;
                    ops::reshape(&bsd, &[b, s, nh * hd])?
                } else if bshd_rank4 {
                    let t = ops::transpose(&attn_out, &[0, 2, 1, 3])?;
                    ops::contiguous(&t)?
                } else {
                    attn_out
                }
            }

            // ── Fused ops produced by the optimizer's fusion passes ──
            //
            // We compose these from primitives MLX already understands;
            // the fused IR variant exists mainly to keep CPU/Metal
            // happy. Behaviour matches the CPU executor's reference.
            Op::FusedMatMulBiasAct { activation } => {
                // Inputs: [a, w, bias]. Computes act(a @ w + bias), where the
                // activation is optional.
                let lhs = lookup(&env, node.inputs[0])?;
                let weight = lookup(&env, node.inputs[1])?;
                let bias = lookup(&env, node.inputs[2])?;
                let product = ops::matmul(lhs, weight)?;
                let biased = mlx_add_aligned(&product, bias)?;
                let activated = match activation {
                    None => Ok(biased),
                    Some(Activation::Gelu) => ops::gelu(&biased),
                    Some(Activation::GeluApprox) => ops::gelu_approx(&biased),
                    Some(Activation::Silu) => ops::silu(&biased),
                    Some(Activation::Relu) => ops::unary(&biased, MlxUnary::Relu),
                    Some(Activation::Sigmoid) => ops::unary(&biased, MlxUnary::Sigmoid),
                    Some(Activation::Tanh) => ops::unary(&biased, MlxUnary::Tanh),
                    Some(Activation::Exp) => ops::unary(&biased, MlxUnary::Exp),
                    Some(Activation::Log) => ops::unary(&biased, MlxUnary::Log),
                    Some(Activation::Sqrt) => ops::unary(&biased, MlxUnary::Sqrt),
                    Some(Activation::Rsqrt) => ops::unary(&biased, MlxUnary::Rsqrt),
                    Some(Activation::Neg) => ops::unary(&biased, MlxUnary::Neg),
                    Some(Activation::Abs) => ops::unary(&biased, MlxUnary::Abs),
                    Some(Activation::Round) => ops::unary(&biased, MlxUnary::Round),
                    Some(Activation::Sin) => ops::unary(&biased, MlxUnary::Sin),
                    Some(Activation::Cos) => ops::unary(&biased, MlxUnary::Cos),
                    Some(Activation::Tan) => ops::unary(&biased, MlxUnary::Tan),
                    Some(Activation::Atan) => ops::unary(&biased, MlxUnary::Atan),
                };
                activated?
            }
            Op::FusedResidualLN { has_bias, eps } => {
                // Inputs: [x, residual, (bias), gamma, beta]. The optional
                // bias shifts the gamma/beta positions by one.
                let input = lookup(&env, node.inputs[0])?;
                let residual = lookup(&env, node.inputs[1])?;
                let mut acc = mlx_add_aligned(input, residual)?;
                if *has_bias {
                    let bias = lookup(&env, node.inputs[2])?;
                    acc = mlx_add_aligned(&acc, bias)?;
                }
                let gamma_idx = if *has_bias { 3 } else { 2 };
                let beta_idx = gamma_idx + 1;
                let gamma = mlx_norm_scale_1d(lookup(&env, node.inputs[gamma_idx])?)?;
                let beta = mlx_norm_scale_1d(lookup(&env, node.inputs[beta_idx])?)?;
                ops::layer_norm(&acc, &gamma, Some(&beta), *eps)?
            }
            Op::FusedResidualRmsNorm { has_bias, eps } => {
                // Inputs: [x, residual, (bias), gamma]. The optional bias
                // shifts the gamma position by one.
                let input = lookup(&env, node.inputs[0])?;
                let residual = lookup(&env, node.inputs[1])?;
                let mut acc = mlx_add_aligned(input, residual)?;
                if *has_bias {
                    let bias = lookup(&env, node.inputs[2])?;
                    acc = mlx_add_aligned(&acc, bias)?;
                }
                let gamma_idx = if *has_bias { 3 } else { 2 };
                let gamma = mlx_norm_scale_1d(lookup(&env, node.inputs[gamma_idx])?)?;
                ops::rms_norm(&acc, &gamma, *eps)?
            }
            Op::Rope { head_dim, n_rot } => {
                // Rotary position embedding over inputs [x, cos, sin].
                //
                // The first `n_rot` features of each head are rotated as
                // half-split pairs (first half against second half); any
                // remaining features pass through untouched. `cos` / `sin`
                // must be rank-2 tables indexed by sequence position.
                let x = lookup(&env, node.inputs[0])?;
                let cos = lookup(&env, node.inputs[1])?;
                let sin = lookup(&env, node.inputs[2])?;

                let graph_x = node_input_shape(graph, node.inputs[0]);
                let x_shape = runtime_shape_or_graph(x, &graph_x)?;
                let cos_runtime = cos.shape().unwrap_or_default();
                if cos_runtime.len() != 2 {
                    return Err(MlxError(format!(
                        "Rope: cos must be rank-2 [seq, half], got rank-{} shape={cos_runtime:?} (graph x={x_shape:?}, n_rot={n_rot})",
                        cos_runtime.len()
                    )));
                }
                let n = x_shape.len();
                if n < 2 {
                    return Err(MlxError("Rope: x must be rank ≥ 2".into()));
                }
                // A zero head_dim passes the evenness check below and would
                // then panic on the `last / head_dim` division; reject it here.
                if *head_dim == 0 {
                    return Err(MlxError("Rope: head_dim must be non-zero".into()));
                }
                if head_dim % 2 != 0 {
                    return Err(MlxError(format!("Rope: head_dim {head_dim} must be even")));
                }
                if *n_rot > *head_dim || !n_rot.is_multiple_of(2) {
                    return Err(MlxError(format!(
                        "Rope: n_rot={n_rot} must be even and <= head_dim={head_dim}"
                    )));
                }
                let hd = *head_dim as i32;
                // After the validation above, nr <= hd always holds.
                let nr = *n_rot as i32;
                let rot_half = nr / 2;

                let last = *x_shape.last().unwrap() as usize;
                if last < *n_rot {
                    return Err(MlxError(format!("Rope: x last dim {last} < n_rot {n_rot}")));
                }
                let heads_in_last = (last / *head_dim) as i32;
                // Last axis packs several whole heads (`H*D`) on a rank ≥ 3 input.
                let multi_head_packed =
                    heads_in_last > 1 && last.is_multiple_of(*head_dim) && n >= 3;
                // Last axis is not a whole number of heads: rotate the leading
                // `n_rot` features and pass the remainder through.
                let has_tail = !last.is_multiple_of(*head_dim);

                // Row count of the cos table; rank 2 was verified above, so
                // index 0 exists. Hoisted so `rotate` need not re-query it.
                let cos_rows = cos_runtime[0] as i32;

                // Rotate `x_rot` (shape `rot_shape`, last axis = 2 * pairs),
                // broadcasting the cos/sin tables along `seq_axis`.
                let rotate = |x_rot: &Array,
                              rot_shape: &[i32],
                              seq_axis: usize,
                              pairs: i32|
                 -> Result<Array, MlxError> {
                    let rn = rot_shape.len();
                    let seq_v = rot_shape[seq_axis];
                    // Never slice past the table; keep at least one row.
                    let seq_cos = seq_v.min(cos_rows.max(1));
                    let cos_seq = ops::slice(cos, &[0, 0], &[seq_cos, pairs])?;
                    let sin_seq = ops::slice(sin, &[0, 0], &[seq_cos, pairs])?;
                    let mut bshape = vec![1i32; rn];
                    bshape[seq_axis] = seq_cos;
                    bshape[rn - 1] = pairs;
                    let cos_b = ops::reshape(&cos_seq, &bshape)?;
                    let sin_b = ops::reshape(&sin_seq, &bshape)?;
                    // x1 = first half, x2 = second half of the rotated span.
                    let mut x1_stop = rot_shape.to_vec();
                    x1_stop[rn - 1] = pairs;
                    let x1 = ops::slice(x_rot, &vec![0i32; rn], &x1_stop)?;
                    let mut x2_start = vec![0i32; rn];
                    x2_start[rn - 1] = pairs;
                    let x2 = ops::slice(x_rot, &x2_start, rot_shape)?;
                    // y1 = x1*cos - x2*sin ; y2 = x2*cos + x1*sin
                    let x1_cos = ops::mul(&x1, &cos_b)?;
                    let x2_sin = ops::mul(&x2, &sin_b)?;
                    let x2_cos = ops::mul(&x2, &cos_b)?;
                    let x1_sin = ops::mul(&x1, &sin_b)?;
                    let y1 = ops::sub(&x1_cos, &x2_sin)?;
                    let y2 = ops::add(&x2_cos, &x1_sin)?;
                    ops::concat(&[&y1, &y2], (rn - 1) as i32)
                };

                if has_tail {
                    let mut rot_stop = x_shape.clone();
                    rot_stop[n - 1] = nr;
                    let rot = ops::slice(x, &vec![0i32; n], &rot_stop)?;
                    let mut tail_start = vec![0i32; n];
                    tail_start[n - 1] = nr;
                    let tail = ops::slice(x, &tail_start, &x_shape)?;
                    // The rotated span has the same shape as the slice stop.
                    let y_rot = rotate(&rot, &rot_stop, n - 2, rot_half)?;
                    ops::concat(&[&y_rot, &tail], (n - 1) as i32)?
                } else if multi_head_packed {
                    let mut split_shape = x_shape.clone();
                    split_shape[n - 1] = heads_in_last;
                    split_shape.push(hd);
                    // `Op::Rope`'s seq axis is `n-2` (original rank). For packed rank-3 callers
                    // (`[B, S, H*D]`), reshape gives `[B, S, H, D]` but we need `[B, H, S, D]`
                    // so that `seq_axis = n-1` (after adding the hd axis) points at `S`.
                    let x_split = ops::reshape(x, &split_shape)?;
                    // Swapping two axes is its own inverse, so the same
                    // permutation also restores the layout afterwards.
                    let mut perm: Vec<i32> = (0..(n as i32 + 1)).collect();
                    perm.swap(n - 1, n - 2);
                    let x_split = ops::transpose(&x_split, &perm)?;
                    split_shape.swap(n - 1, n - 2);
                    let y_head = if nr < hd {
                        // Partial rotation: rotate the first `nr` features of
                        // each head and re-attach the pass-through remainder.
                        let mut rot_stop = split_shape.clone();
                        rot_stop[n] = nr;
                        let rot = ops::slice(&x_split, &vec![0i32; n + 1], &rot_stop)?;
                        let mut pass_start = vec![0i32; n + 1];
                        pass_start[n] = nr;
                        let pass = ops::slice(&x_split, &pass_start, &split_shape)?;
                        let y_rot = rotate(&rot, &rot_stop, n - 1, rot_half)?;
                        ops::concat(&[&y_rot, &pass], n as i32)?
                    } else {
                        rotate(&x_split, &split_shape, n - 1, rot_half)?
                    };
                    // Transpose back to `[... , S, H, D]` then reshape to the
                    // original packed layout.
                    let y_bshd = ops::transpose(&y_head, &perm)?;
                    ops::reshape(&y_bshd, &x_shape)?
                } else if nr < hd {
                    let mut rot_stop = x_shape.clone();
                    rot_stop[n - 1] = nr;
                    let rot = ops::slice(x, &vec![0i32; n], &rot_stop)?;
                    let mut pass_start = vec![0i32; n];
                    pass_start[n - 1] = nr;
                    let pass = ops::slice(x, &pass_start, &x_shape)?;
                    let y_rot = rotate(&rot, &rot_stop, n - 2, rot_half)?;
                    ops::concat(&[&y_rot, &pass], (n - 1) as i32)?
                } else {
                    rotate(x, &x_shape, n - 2, rot_half)?
                }
            }
            Op::Conv {
                kernel_size,
                stride,
                padding,
                dilation,
                groups,
            } => {
                // rlx convention: NCHW (or NCL / NCDHW) inputs +
                // [C_out, C_in/g, ...spatial] weights.
                // MLX expects channels-last (NHWC, NLC, NDHWC) and
                // weight [C_out, ...spatial, C_in/g]. We transpose
                // around the call. A future pass could keep
                // activations in channels-last across consecutive
                // convs to amortize the conversion.
                let in_shape = node_input_shape(graph, node.inputs[0]);
                let x = lookup(&env, node.inputs[0])?;
                let w = lookup(&env, node.inputs[1])?;
                // Per-axis accessors; a missing entry falls back to the
                // neutral value (stride 1, padding 0, dilation 1).
                let s = |i: usize| stride.get(i).copied().unwrap_or(1) as i32;
                let p = |i: usize| padding.get(i).copied().unwrap_or(0) as i32;
                let d = |i: usize| dilation.get(i).copied().unwrap_or(1) as i32;
                let g = *groups as i32;

                match (kernel_size.len(), in_shape.len()) {
                    (1, 3) => {
                        // NCL → NLC: perm [0, 2, 1]; weight [Co, Ci, kL]
                        // → [Co, kL, Ci]: perm [0, 2, 1]
                        let x_nlc = ops::transpose(x, &[0, 2, 1])?;
                        let w_mlx = ops::transpose(w, &[0, 2, 1])?;
                        let y_nlc = ops::conv1d(&x_nlc, &w_mlx, s(0), p(0), d(0), g)?;
                        ops::transpose(&y_nlc, &[0, 2, 1])?
                    }
                    (2, 4) if in_shape[2] == 1 || in_shape[3] == 1 => {
                        // 1D conv expressed as 2D NCHW with a unit spatial axis
                        // (rlx lowers ONNX 1D convs as `[N,C,1,L]`/`[N,C,L,1]` with
                        // the length-axis kernel/stride/pad at index 0). Applying a
                        // 2D conv would run the kernel over the singleton axis, so
                        // collapse to NCL and use conv1d over the real length, then
                        // reshape back to the declared 4D output shape.
                        let n = in_shape[0];
                        let ci = in_shape[1];
                        let length = if in_shape[2] == 1 {
                            in_shape[3]
                        } else {
                            in_shape[2]
                        };
                        let wsh = w.shape()?; // [Co, Ci/g, kh, kw]
                        let co = wsh[0] as i32;
                        let cig = wsh[1] as i32;
                        // The larger of the two kernel axes is taken as the 1D length.
                        let k = wsh[2].max(wsh[3]) as i32;
                        let x_ncl = ops::reshape(x, &[n, ci, length])?;
                        let w_ncl = ops::reshape(w, &[co, cig, k])?;
                        let x_nlc = ops::transpose(&x_ncl, &[0, 2, 1])?;
                        let w_mlx = ops::transpose(&w_ncl, &[0, 2, 1])?;
                        let y_nlc = ops::conv1d(&x_nlc, &w_mlx, s(0), p(0), d(0), g)?;
                        let y_ncl = ops::transpose(&y_nlc, &[0, 2, 1])?; // [N, Co, Lo]
                        // Reshape to the importer's declared 4D output shape (it places
                        // the length axis in W: `[N,Co,1,Lo]`) so downstream ops that
                        // rely on the declared layout line up. MLX's conv length may
                        // differ from the importer's by a few samples (padding
                        // rounding); trim the length axis to the declared length first.
                        let out_dims: Vec<i32> = node
                            .shape
                            .dims()
                            .iter()
                            .map(|d| d.unwrap_static() as i32)
                            .collect();
                        // Declared element count divided by N*Co gives the
                        // declared length; `.max(1)` guards the division.
                        let target_len = out_dims.iter().product::<i32>() / (n * co).max(1);
                        let cur = y_ncl.shape()?;
                        let y_ncl = if cur.get(2).copied().unwrap_or(0) as i32 > target_len {
                            let mut stop: Vec<i32> = cur.iter().map(|&d| d as i32).collect();
                            stop[2] = target_len;
                            ops::slice(&y_ncl, &[0, 0, 0], &stop)?
                        } else {
                            y_ncl
                        };
                        ops::reshape(&y_ncl, &out_dims)?
                    }
                    (2, 4) => {
                        // NCHW → NHWC: perm [0, 2, 3, 1] (same for the weight);
                        // the result goes back with perm [0, 3, 1, 2].
                        let x_nhwc = ops::transpose(x, &[0, 2, 3, 1])?;
                        let w_mlx = ops::transpose(w, &[0, 2, 3, 1])?;
                        let y_nhwc = ops::conv2d(
                            &x_nhwc,
                            &w_mlx,
                            (s(0), s(1)),
                            (p(0), p(1)),
                            (d(0), d(1)),
                            g,
                        )?;
                        ops::transpose(&y_nhwc, &[0, 3, 1, 2])?
                    }
                    (3, 5) => {
                        // NCDHW → NDHWC: perm [0, 2, 3, 4, 1]
                        let x_nd = ops::transpose(x, &[0, 2, 3, 4, 1])?;
                        let w_mlx = ops::transpose(w, &[0, 2, 3, 4, 1])?;
                        let y_nd = ops::conv3d(
                            &x_nd,
                            &w_mlx,
                            (s(0), s(1), s(2)),
                            (p(0), p(1), p(2)),
                            (d(0), d(1), d(2)),
                            g,
                        )?;
                        ops::transpose(&y_nd, &[0, 4, 1, 2, 3])?
                    }
                    (k, n) => {
                        return Err(MlxError(format!(
                            "Conv: kernel rank {k} with input rank {n} \
                         not supported (use 1D/2D/3D NCHW)"
                        )));
                    }
                }
            }
            // LayerNorm2d: normalize an NCHW tensor across its channel axis at
            // every (n, h, w) position, then apply the 1-D scale (input 1) and
            // shift (input 2).
            //
            // Errors if the input is not rank 4.
            Op::LayerNorm2d { eps } => {
                let x = lookup(&env, node.inputs[0])?;
                let g = mlx_norm_scale_1d(lookup(&env, node.inputs[1])?)?;
                let b = mlx_norm_scale_1d(lookup(&env, node.inputs[2])?)?;
                let shape = x.shape()?;
                if shape.len() != 4 {
                    return Err(MlxError(
                        "LayerNorm2d on MLX: expects NCHW rank-4 input".into(),
                    ));
                }
                let n = shape[0];
                let c = shape[1];
                let h = shape[2];
                let w = shape[3];
                // The flattened rows handed to `layer_norm` must each hold the C
                // channel values of one pixel. Reshaping NCHW directly to
                // `[N*H*W, C]` would instead group C consecutive *spatial*
                // samples per row, so move channels last (NHWC) before
                // flattening and undo the permutation afterwards.
                let x_nhwc = ops::transpose(x, &[0, 2, 3, 1])?;
                let flat = ops::reshape(&x_nhwc, &[(n * h * w) as i32, c as i32])?;
                let y = ops::layer_norm(&flat, &g, Some(&b), *eps)?;
                let y_nhwc = ops::reshape(&y, &[n as i32, h as i32, w as i32, c as i32])?;
                ops::transpose(&y_nhwc, &[0, 3, 1, 2])?
            }
            // ConvTranspose2d over NCHW tensors. Inputs: [x, weight].
            //
            // Both paths express the transposed conv as `ops::conv_general`
            // with input dilation = stride, unit stride and `flip = true`.
            // With low padding `dilation*(k-1) - pad` the output length is
            // `(in-1)*stride + 1 - pad + pad_hi`, so the high padding
            // `dilation*(k-1) - pad + output_padding` yields the standard
            // transposed-conv output size.
            //
            // Errors if the kernel is not 2-D or any tensor is not rank 4.
            Op::ConvTranspose2d {
                kernel_size,
                stride,
                padding,
                dilation,
                output_padding,
                groups,
            } => {
                if kernel_size.len() != 2 {
                    return Err(MlxError("ConvTranspose2d on MLX: 2D NCHW only".into()));
                }
                let x = lookup(&env, node.inputs[0])?;
                let w = lookup(&env, node.inputs[1])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                let w_shape = node_input_shape(graph, node.inputs[1]);
                let out_shape: Vec<i32> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static() as i32)
                    .collect();
                if x_shape.len() != 4 || w_shape.len() != 4 || out_shape.len() != 4 {
                    return Err(MlxError(
                        "ConvTranspose2d on MLX: rank-4 NCHW tensors only".into(),
                    ));
                }
                let g = *groups as i32;
                let c_in = x_shape[1];
                let c_out = out_shape[1];
                let h = x_shape[2];
                let w_in = x_shape[3];
                let h_out = out_shape[2];
                let w_out = out_shape[3];
                let kh = w_shape[2];
                let kw = w_shape[3];
                let c_in_per_g = c_in / g;
                let c_out_per_g = c_out / g;
                // Per-axis attribute accessors; a missing entry falls back to the
                // neutral value (stride/dilation 1, padding/output_padding 0).
                let s = |i: usize| stride.get(i).copied().unwrap_or(1) as i32;
                let p = |i: usize| padding.get(i).copied().unwrap_or(0) as i32;
                let d = |i: usize| dilation.get(i).copied().unwrap_or(1) as i32;
                let opad = |i: usize| output_padding.get(i).copied().unwrap_or(0) as i32;
                // 1D transposed conv lowered as 2D NCHW with a unit spatial axis
                // (rlx places the length-axis params at index 0 and the kernel as
                // `[k,1]`, but the length may sit in W). Running a 2D conv would
                // apply the kernel over the singleton axis (→ empty output), so do a
                // genuine 1D transposed conv over the real length axis. Mirrors the
                // forward `Conv` 1D handling.
                //
                // The 1D path flattens the input to `[N, C, len]`, which only
                // holds when one spatial axis of the input is really 1. A true
                // 2D input with a `1×k`, `k×1` or `1×1` kernel must stay on the
                // 2D path, so require a unit input axis as well as a unit
                // kernel axis.
                let onnx_1d = (kh == 1 || kw == 1) && (h == 1 || w_in == 1);
                if onnx_1d {
                    let in_len = if h == 1 { w_in } else { h };
                    let out_len = if h_out == 1 { w_out } else { h_out };
                    let k = kh.max(kw);
                    // Symmetric `dilation*(k-1) - pad_orig` on each side, plus
                    // `output_padding` on the high side (see the arm header).
                    let pad_lo1 = d(0) * (k - 1) - p(0);
                    let pad_hi1 = d(0) * (k - 1) - p(0) + opad(0);
                    // NCHW(with unit axis) → NCL → NLC for MLX channels-last conv.
                    let x_ncl = ops::reshape(x, &[x_shape[0], c_in, in_len])?;
                    let x_nlc = ops::transpose(&x_ncl, &[0, 2, 1])?;
                    // Transposed-conv weight [C_in, C_out/g, k] → MLX channels-last
                    // [C_out/g, k, C_in] (output-first, input-last, matching the 1D
                    // conv_general weight layout).
                    let w_ncl = ops::reshape(w, &[c_in, c_out_per_g, k])?;
                    let w_mlx = ops::transpose(&w_ncl, &[1, 2, 0])?;
                    // Grouped + strided: insert the stride zeros explicitly via
                    // `inflate_spatial_1d` and run with unit input dilation
                    // instead of asking conv_general to dilate the input.
                    let needs_inflate = g > 1 && s(0) > 1;
                    let (x_in1, in_dil): (Array, [i32; 1]) = if needs_inflate {
                        let inflated = inflate_spatial_1d(&x_nlc, s(0) as usize)?;
                        (inflated, [1])
                    } else {
                        (x_nlc.clone_handle()?, [s(0)])
                    };
                    let raw = ops::conv_general(
                        &x_in1,
                        &w_mlx,
                        &[1],
                        &[pad_lo1],
                        &[pad_hi1],
                        &[d(0)],
                        &in_dil,
                        g,
                        true,
                    )?;
                    // conv_general may overshoot the target length by a few samples
                    // (padding/output_padding rounding); trim the NLC length axis to
                    // exactly the declared output length.
                    let cur: Vec<i32> = raw.shape()?.iter().map(|&d| d as i32).collect();
                    let adjusted = if cur.get(1).copied().unwrap_or(0) > out_len {
                        let start = vec![0i32; cur.len()];
                        let mut stop = cur.clone();
                        stop[1] = out_len;
                        ops::slice(&raw, &start, &stop)?
                    } else {
                        raw
                    };
                    // NLC [N, Lo, Cout] → NCL → declared NCHW out_shape.
                    let y_ncl = ops::transpose(&adjusted, &[0, 2, 1])?;
                    ops::reshape(&y_ncl, &out_shape)?
                } else {
                    let pad_lo: Vec<i32> = vec![d(0) * (kh - 1) - p(0), d(1) * (kw - 1) - p(1)];
                    // Same rule as the 1D path. The previous
                    // `in-1 - s*(out-1) + p + opad` had the input and output
                    // extents swapped; it only matched this value for stride-1
                    // same-size cases and became strongly negative when
                    // upsampling.
                    let pad_hi: Vec<i32> = vec![
                        d(0) * (kh - 1) - p(0) + opad(0),
                        d(1) * (kw - 1) - p(1) + opad(1),
                    ];
                    let x_nhwc = ops::transpose(x, &[0, 2, 3, 1])?;
                    let needs_inflate = g > 1 && (s(0) > 1 || s(1) > 1);
                    let (x_input, conv_input_dilation): (Array, [i32; 2]) = if needs_inflate {
                        let inflated = inflate_spatial_2d(&x_nhwc, s(0) as usize, s(1) as usize)?;
                        (inflated, [1, 1])
                    } else {
                        (x_nhwc.clone_handle()?, [s(0), s(1)])
                    };
                    // Weight [C_in, C_out/g, kH, kW] → MLX [C_in, kH, kW, C_out/g]
                    // NOTE(review): the 1D path above puts C_out/g first and C_in
                    // last; confirm which layout `ops::conv_general` expects here.
                    let w_t = if g == 1 {
                        ops::transpose(w, &[0, 2, 3, 1])?
                    } else {
                        let split = ops::reshape(w, &[g, c_in_per_g, c_out_per_g, kh, kw])?;
                        let perm = ops::transpose(&split, &[0, 1, 3, 4, 2])?;
                        ops::reshape(&perm, &[c_in, kh, kw, c_out_per_g])?
                    };
                    let raw = ops::conv_general(
                        &x_input,
                        &w_t,
                        &[1, 1],
                        &pad_lo,
                        &pad_hi,
                        &[d(0), d(1)],
                        &conv_input_dilation,
                        g,
                        true,
                    )?;
                    // A negative pad (padding larger than the dilated kernel
                    // reach) is applied as a crop of the NHWC result: advance the
                    // start by `-pad_lo` and pull the stop in by `-pad_hi`.
                    let needs_slice = pad_lo.iter().chain(pad_hi.iter()).any(|&p| p < 0);
                    let adjusted = if needs_slice {
                        let cur: Vec<i32> = raw.shape()?.iter().map(|&d| d as i32).collect();
                        let mut start = vec![0i32; cur.len()];
                        let mut stop = cur.clone();
                        for i in 0..2 {
                            if pad_lo[i] < 0 {
                                start[1 + i] = -pad_lo[i];
                            }
                            if pad_hi[i] < 0 {
                                stop[1 + i] += pad_hi[i];
                            }
                        }
                        ops::slice(&raw, &start, &stop)?
                    } else {
                        raw
                    };
                    ops::transpose(&adjusted, &[0, 3, 1, 2])?
                }
            }
            // TopK: f32-encoded indices of the `k` largest values along the
            // last axis.
            //
            // `argpartition` with `kth = size - k` moves the k largest entries
            // into the trailing k slots of the last axis; those slots are then
            // sliced out. The entries are left in the order argpartition
            // produces, i.e. they are NOT sorted by value.
            //
            // Errors if the input is rank 0 or `k` exceeds the last dimension.
            Op::TopK { k } => {
                let x = lookup(&env, node.inputs[0])?;
                let dims = node_input_shape(graph, node.inputs[0]);
                let last_size = match dims.last() {
                    Some(&extent) => extent,
                    None => return Err(MlxError("TopK: input must be rank ≥ 1".into())),
                };
                let axis = dims.len() - 1;
                let wanted = *k as i32;
                if wanted > last_size {
                    return Err(MlxError(format!("TopK: k={k} > last_dim={last_size}")));
                }
                let kth = last_size - wanted;
                let partitioned = ops::argpartition(x, kth, axis as i32)?;
                // Window covering every leading axis in full and only the
                // trailing `k` slots of the last axis.
                let begin: Vec<i32> = (0..dims.len())
                    .map(|i| if i == axis { kth } else { 0 })
                    .collect();
                let top = ops::slice(&partitioned, &begin, &dims)?;
                // rlx encodes indices as f32 at the I/O boundary.
                ops::cast(&top, DType::F32)?
            }
            // ScatterAdd. Inputs: [updates, indices]. The result has the node's
            // declared shape; rlx semantics is "initial output is zero,
            // accumulate updates by indices". MLX's scatter_add writes onto a
            // base array, so it is fed an F32 zero base of that shape.
            Op::ScatterAdd => {
                let updates = lookup(&env, node.inputs[0])?;
                let indices_in = mlx_indices_i64(lookup(&env, node.inputs[1])?)?;
                // Keep the shape in `usize` throughout: an `i32` product would
                // overflow for outputs with 2^31 or more elements, and the
                // zero-base constructor takes `usize` dims anyway.
                let out_shape: Vec<usize> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static())
                    .collect();
                // Build the zero base directly at the target shape. The earlier
                // `broadcast_to(sub(updates, updates), out_shape)` only worked
                // when `updates.shape[0]` equaled `out_shape[0]` — false when
                // the gradient comes from a Gather whose index set is denser
                // than the source table (e.g. ScatterAdd 240→30 in routing AD).
                let n_elem: usize = out_shape.iter().product();
                let zeros = vec![0.0_f32; n_elem];
                let zero_target = Array::from_f32_slice(&zeros, &out_shape, DType::F32)?;
                let upd_shape = node_input_shape(graph, node.inputs[0]);
                let idx_shape = node_input_shape(graph, node.inputs[1]);
                // Gather axis-0 VJP: updates `[n_edges, d]`, indices `[n_edges]` → table `[n, d]`.
                // MLX scatter expects index rank to match the scattered array rank,
                // so a 1-D index vector gets a trailing unit axis.
                let indices = if upd_shape.len() > 1 && idx_shape.len() == 1 {
                    ops::reshape(&indices_in, &[idx_shape[0], 1])?
                } else {
                    indices_in
                };
                // Multi-dimensional updates use the along-axis variant on axis 0;
                // 1-D updates use the plain scatter_add.
                if upd_shape.len() > 1 {
                    ops::scatter_add_axis(&zero_target, &indices, updates, 0)?
                } else {
                    ops::scatter_add(&zero_target, &indices, updates, 0)?
                }
            }
            // GroupedMatMul: forwards its three operands, in IR order
            // [input, weight, expert_idx], to `ops::gather_mm`.
            Op::GroupedMatMul => {
                let activations = lookup(&env, node.inputs[0])?;
                let expert_weights = lookup(&env, node.inputs[1])?;
                let expert_idx = lookup(&env, node.inputs[2])?;
                ops::gather_mm(activations, expert_weights, expert_idx)?
            }
            // DequantGroupedMatMul. Inputs: [x, quantized weights, expert_idx].
            //
            // Runs on the host: x, the raw weight bytes and the expert indices
            // are read back, `rlx_cpu::gguf_matmul::gguf_grouped_matmul_bt`
            // fills an `m × n` f32 buffer, and the buffer is wrapped as an
            // F32 array of the node's declared shape.
            //
            // Errors for non-GGUF schemes and for a rank-0 output.
            Op::DequantGroupedMatMul { scheme } => {
                if !scheme.is_gguf() {
                    return Err(MlxError(
                        "DequantGroupedMatMul: only GGUF K-quants supported".into(),
                    ));
                }
                let x = lookup(&env, node.inputs[0])?;
                let wq = lookup(&env, node.inputs[1])?;
                let idx = lookup(&env, node.inputs[2])?;
                let out_shape: Vec<usize> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static())
                    .collect();
                // n is the trailing output dim; m is every remaining output
                // element divided by n (same derivation as `DequantMatMul`).
                // Taking `out_shape[len - 2]` instead would panic below rank 2
                // and, above rank 2, leave `m * n` smaller than the element
                // count `out_shape` describes.
                let n = match out_shape.last() {
                    Some(&cols) => cols,
                    None => {
                        return Err(MlxError(
                            "DequantGroupedMatMul: output must be rank ≥ 1".into(),
                        ));
                    }
                };
                let m = out_shape.iter().product::<usize>() / n.max(1);
                let x_f32 = x.to_f32()?;
                let k = x_f32.len() / m.max(1);
                let w_bytes = wq.to_bytes()?;
                let idx_f32 = idx.to_f32()?;
                // One expert's weight slab is `k * n` elements stored as GGUF
                // blocks; the expert count is inferred from the total byte length.
                let block_elems = scheme.gguf_block_size() as usize;
                let block_bytes = scheme.gguf_block_bytes() as usize;
                let slab_bytes = (k * n) / block_elems * block_bytes;
                let num_experts = w_bytes.len() / slab_bytes.max(1);
                let mut out_host = vec![0f32; m * n];
                rlx_cpu::gguf_matmul::gguf_grouped_matmul_bt(
                    &x_f32,
                    &w_bytes,
                    &idx_f32,
                    &mut out_host,
                    m,
                    k,
                    n,
                    num_experts,
                    *scheme,
                );
                Array::from_f32_slice(&out_host, &out_shape, DType::F32)?
            }
            // DequantMatMul: matmul against a quantized weight. Dispatch by scheme:
            //   * GGUF       — dequantize once to an f32 `[k, n]` array (cached
            //                  per Param name) and use `ops::matmul`; the env
            //                  var `RLX_MLX_GGUF_HOST_FALLBACK=1` selects the
            //                  host kernel instead.
            //   * Nvfp4Block — host kernel `dequant_matmul_nvfp4`.
            //   * Int8Block / Int8BlockAsym — host kernel `dequant_matmul_int8`.
            //   * otherwise  — `ops::quantized_matmul`.
            // In the host paths, n is the last output dim, m = output elements / n
            // and k = x elements / m.
            Op::DequantMatMul { scheme } => {
                if scheme.is_gguf() {
                    let x = lookup(&env, node.inputs[0])?;
                    let wq = lookup(&env, node.inputs[1])?;
                    let n = node.shape.dim(node.shape.rank() - 1).unwrap_static();
                    let total = node.shape.num_elements().unwrap();
                    let m = total / n.max(1);
                    let x_total = graph.node(node.inputs[0]).shape.num_elements().unwrap();
                    let k = x_total / m.max(1);
                    // The naive host loop in `rlx_cpu::gguf_matmul::gguf_matmul_bt`
                    // measures 100×+ slower than MLX's native matmul. Dequant
                    // once to f32 here and let MLX's tuned sgemm pick up from
                    // there. GGUF Q4K stores weights as `[n, k]` row-major;
                    // transpose to `[k, n]` so `x @ w_t == [m, n]`. Off-switch
                    // `RLX_MLX_GGUF_HOST_FALLBACK=1` reverts to the host kernel.
                    let use_host_fallback =
                        std::env::var("RLX_MLX_GGUF_HOST_FALLBACK").as_deref() == Ok("1");
                    if use_host_fallback {
                        let w_bytes = wq.to_bytes()?;
                        let mut out_host = vec![0f32; m * n];
                        rlx_cpu::gguf_matmul::gguf_matmul_bt(
                            &x.to_f32()?,
                            &w_bytes,
                            &mut out_host,
                            m,
                            k,
                            n,
                            *scheme,
                        );
                        let out_shape: Vec<usize> = node
                            .shape
                            .dims()
                            .iter()
                            .map(|d| d.unwrap_static())
                            .collect();
                        Array::from_f32_slice(&out_host, &out_shape, DType::F32)?
                    } else {
                        // Cache the dequanted+transposed [k, n] f32 Array per
                        // Param name so subsequent decode steps reuse it
                        // instead of paying the Q4K → f32 cost every dispatch.
                        // Without the cache, dequant of all 48 layers'
                        // weights inflates the first decode step from ~ms to
                        // ~170s on Gemma 4 12B Q4_K_M. Cache survives across
                        // generate() calls because the Param bytes are stable.
                        let w_node = graph.node(node.inputs[1]);
                        let cache_key = match &w_node.op {
                            rlx_ir::Op::Param { name } => Some(format!("{name}#kn")),
                            _ => None,
                        };
                        let cached = match &cache_key {
                            Some(key) => mlx_dequant_cache_get(key)?,
                            None => None,
                        };
                        let w_kn = match cached {
                            Some(arr) => arr,
                            None => {
                                // Read the quantized bytes back only when they
                                // are actually needed; on a cache hit the
                                // readback is skipped entirely.
                                let w_bytes = wq.to_bytes()?;
                                let arr = build_dequanted_kn(&w_bytes, k, n, scheme)?;
                                if let Some(key) = cache_key {
                                    let to_store = arr.clone_handle()?;
                                    mlx_dequant_cache_put(key, to_store);
                                }
                                arr
                            }
                        };
                        ops::matmul(x, &w_kn)?
                    }
                } else if matches!(scheme, rlx_ir::QuantScheme::Nvfp4Block) {
                    // Inputs: [x, packed weights, block scales, global scale].
                    let x = lookup(&env, node.inputs[0])?;
                    let wq = lookup(&env, node.inputs[1])?;
                    let sc = lookup(&env, node.inputs[2])?;
                    let gs_arr = lookup(&env, node.inputs[3])?;
                    let n = node.shape.dim(node.shape.rank() - 1).unwrap_static();
                    let total = node.shape.num_elements().unwrap();
                    let m = total / n.max(1);
                    let x_total = graph.node(node.inputs[0]).shape.num_elements().unwrap();
                    let k = x_total / m.max(1);
                    let xs = x.to_f32()?;
                    let w_bytes = wq.to_bytes()?;
                    let scale_bytes = sc.to_bytes()?;
                    let global_scale = gs_arr.to_f32()?[0];
                    let mut out_host = vec![0f32; m * n];
                    rlx_cpu::thunk::dequant_matmul_nvfp4(
                        &xs,
                        &w_bytes,
                        &scale_bytes,
                        global_scale,
                        &mut out_host,
                        m,
                        k,
                        n,
                    );
                    let out_shape: Vec<usize> = node
                        .shape
                        .dims()
                        .iter()
                        .map(|d| d.unwrap_static())
                        .collect();
                    Array::from_f32_slice(&out_host, &out_shape, DType::F32)?
                } else if matches!(
                    scheme,
                    rlx_ir::QuantScheme::Int8Block { .. }
                        | rlx_ir::QuantScheme::Int8BlockAsym { .. }
                ) {
                    // Inputs: [x, int8 weights, scales, zero points]. Zero points
                    // are only read back for the asymmetric scheme.
                    let x = lookup(&env, node.inputs[0])?;
                    let wq = lookup(&env, node.inputs[1])?;
                    let sc = lookup(&env, node.inputs[2])?;
                    let zp = lookup(&env, node.inputs[3])?;
                    let n = node.shape.dim(node.shape.rank() - 1).unwrap_static();
                    let total = node.shape.num_elements().unwrap();
                    let m = total / n.max(1);
                    let x_total = graph.node(node.inputs[0]).shape.num_elements().unwrap();
                    let k = x_total / m.max(1);
                    let block_size = match scheme {
                        rlx_ir::QuantScheme::Int8Block { block_size }
                        | rlx_ir::QuantScheme::Int8BlockAsym { block_size } => *block_size,
                        _ => unreachable!(),
                    };
                    let asym = matches!(scheme, rlx_ir::QuantScheme::Int8BlockAsym { .. });
                    let xs = x.to_f32()?;
                    let w_raw = wq.to_bytes()?;
                    // SAFETY: `u8` and `i8` have identical size and alignment and
                    // every bit pattern is valid for both; the slice borrows
                    // `w_raw`, which stays alive (and unmodified) until after the
                    // kernel call below.
                    let w_bytes = unsafe {
                        std::slice::from_raw_parts(w_raw.as_ptr() as *const i8, w_raw.len())
                    };
                    let scales = sc.to_f32()?;
                    let zps = if asym { zp.to_f32()? } else { Vec::new() };
                    let mut out_host = vec![0f32; m * n];
                    rlx_cpu::thunk::dequant_matmul_int8(
                        &xs,
                        w_bytes,
                        &scales,
                        &zps,
                        &mut out_host,
                        m,
                        k,
                        n,
                        block_size as usize,
                        asym,
                    );
                    let out_shape: Vec<usize> = node
                        .shape
                        .dims()
                        .iter()
                        .map(|d| d.unwrap_static())
                        .collect();
                    Array::from_f32_slice(&out_host, &out_shape, DType::F32)?
                } else {
                    // Inputs: [x, w_q, scale, zp]. Map to MLX's
                    // quantized_matmul. The bit-width and group-size come
                    // from the rlx QuantScheme.
                    let x = lookup(&env, node.inputs[0])?;
                    let wq = lookup(&env, node.inputs[1])?;
                    let s = lookup(&env, node.inputs[2])?;
                    let zp = lookup(&env, node.inputs[3])?;
                    let (bits, gs) = quant_scheme_to_mlx(scheme)?;
                    ops::quantized_matmul(x, wq, s, Some(zp), /*transpose=*/ true, gs, bits)?
                }
            }
            // LoraMatMul. Inputs: [x, W, A, B].
            //   out = x @ W + scale * (x @ A) @ B
            Op::LoraMatMul { scale } => {
                let x = lookup(&env, node.inputs[0])?;
                let w = lookup(&env, node.inputs[1])?;
                let a = lookup(&env, node.inputs[2])?;
                let b = lookup(&env, node.inputs[3])?;
                let base = ops::matmul(x, w)?;
                let xa = ops::matmul(x, a)?;
                let xab = ops::matmul(&xa, b)?;
                // Scale via in-graph mul against a one-element array. Build it in
                // the input's dtype rather than always F32, so a half-precision
                // graph is not promoted to F32 by the multiply.
                let x_dtype = graph.node(node.inputs[0]).shape.dtype();
                let s = Array::from_f32_slice(&[*scale], &[1], x_dtype)?;
                let scaled = ops::mul(&xab, &s)?;
                ops::add(&base, &scaled)?
            }
            Op::FusedTransformerLayer {
                num_heads,
                head_dim,
                intermediate_size: _,
                eps1,
                eps2,
                activation,
                has_bias,
            } => {
                // Standard BERT-style post-norm transformer layer.
                // Inputs (per IR doc):
                //   hidden, qkv_w, qkv_b, out_w, out_b,
                //   ln1_g, ln1_b, fc1_w, fc1_b, fc2_w, fc2_b,
                //   ln2_g, ln2_b, mask
                //
                // Wiring:
                //   attn_out = attention_block(hidden, qkv_w, [qkv_b],
                //                              out_w, [out_b], mask)
                //   h1       = layer_norm(hidden + attn_out, ln1_g, ln1_b, eps1)
                //   ffn      = activation(h1 @ fc1_w [+ fc1_b])
                //   ffn_out  = ffn @ fc2_w [+ fc2_b]
                //   h2       = layer_norm(h1 + ffn_out, ln2_g, ln2_b, eps2)
                // Index map. has_bias gates every bias input (including
                // the two LayerNorm betas, per Op::num_inputs above):
                //   has_bias=true  → 14 inputs (full BERT layout)
                //   has_bias=false → 8 inputs (no biases at all)
                let (
                    hidden,
                    qkv_w,
                    qkv_b,
                    out_w,
                    out_b,
                    ln1_g,
                    ln1_b,
                    fc1_w,
                    fc1_b,
                    fc2_w,
                    fc2_b,
                    ln2_g,
                    ln2_b,
                    mask,
                ) = if *has_bias {
                    (
                        lookup(&env, node.inputs[0])?,
                        lookup(&env, node.inputs[1])?,
                        Some(lookup(&env, node.inputs[2])?),
                        lookup(&env, node.inputs[3])?,
                        Some(lookup(&env, node.inputs[4])?),
                        lookup(&env, node.inputs[5])?,
                        Some(lookup(&env, node.inputs[6])?),
                        lookup(&env, node.inputs[7])?,
                        Some(lookup(&env, node.inputs[8])?),
                        lookup(&env, node.inputs[9])?,
                        Some(lookup(&env, node.inputs[10])?),
                        lookup(&env, node.inputs[11])?,
                        Some(lookup(&env, node.inputs[12])?),
                        lookup(&env, node.inputs[13])?,
                    )
                } else {
                    (
                        lookup(&env, node.inputs[0])?,
                        lookup(&env, node.inputs[1])?,
                        None,
                        lookup(&env, node.inputs[2])?,
                        None,
                        lookup(&env, node.inputs[3])?,
                        None,
                        lookup(&env, node.inputs[4])?,
                        None,
                        lookup(&env, node.inputs[5])?,
                        None,
                        lookup(&env, node.inputs[6])?,
                        None,
                        lookup(&env, node.inputs[7])?,
                    )
                };

                let h_shape = node_input_shape(graph, node.inputs[0]);
                let batch = h_shape[0];
                let seq = h_shape[1];
                let nh = *num_heads as i32;
                let hd = *head_dim as i32;
                let inner = nh * hd;

                // Optional-bias add helper: idempotent when bias is None.
                let maybe_add = |x: Array, b: Option<&Array>| -> Result<Array, MlxError> {
                    match b {
                        Some(b) => ops::add(&x, b),
                        None => Ok(x),
                    }
                };

                // --- Attention block ---
                let qkv = ops::matmul(hidden, qkv_w)?;
                let qkv = maybe_add(qkv, qkv_b)?;
                let q = ops::slice(&qkv, &[0, 0, 0], &[batch, seq, inner])?;
                let k = ops::slice(&qkv, &[0, 0, inner], &[batch, seq, 2 * inner])?;
                let v = ops::slice(&qkv, &[0, 0, 2 * inner], &[batch, seq, 3 * inner])?;
                // Materialize the transpose with `ops::contiguous` (MLX's
                // `compile` elides transpose views — same fix as Op::Attention
                // at lower.rs:851/858 and Op::FusedAttentionBlock above).
                let to_h = |t: Array| -> Result<Array, MlxError> {
                    let r = ops::reshape(&t, &[batch, seq, nh, hd])?;
                    let t = ops::transpose(&r, &[0, 2, 1, 3])?;
                    ops::contiguous(&t)
                };
                let q = to_h(q)?;
                let k = to_h(k)?;
                let v = to_h(v)?;
                let scale = 1.0 / (hd as f32).sqrt();

                // Convert the BERT-style binary mask `[B, S]` (1.0 valid,
                // 0.0 padding) → additive (`(mask - 1) * 1e9`) and reshape
                // to `[B, 1, 1, S]` so it broadcasts over heads + query
                // positions in SDPA. Same handling as the unfused
                // `Op::Attention` path and the standalone
                // `Op::FusedAttentionBlock` above.
                let h_dtype = graph.node(node.inputs[0]).shape.dtype();
                let mask_idx = if *has_bias { 13 } else { 7 };
                let m_shape = node_input_shape(graph, node.inputs[mask_idx]);
                let mask_cast = if h_dtype != DType::F32 {
                    ops::cast(mask, h_dtype)?
                } else {
                    mask.clone_handle()?
                };
                let one = Array::from_f32_slice(&[1.0], &[1], h_dtype)?;
                let scl = Array::from_f32_slice(&[1.0e9], &[1], h_dtype)?;
                let shifted = ops::sub(&mask_cast, &one)?;
                let additive = ops::mul(&shifted, &scl)?;
                let additive_4d = match m_shape.len() {
                    2 => ops::reshape(&additive, &[m_shape[0], 1, 1, m_shape[1]])?,
                    3 => ops::reshape(&additive, &[m_shape[0], 1, m_shape[1], m_shape[2]])?,
                    _ => additive,
                };
                let attn = ops::attention(
                    &q,
                    &k,
                    &v,
                    scale,
                    crate::ffi::MlxMask::Custom,
                    Some(&additive_4d),
                )?;
                let attn = ops::transpose(&attn, &[0, 2, 1, 3])?;
                let attn = ops::reshape(&attn, &[batch, seq, inner])?;
                let attn_out = ops::matmul(&attn, out_w)?;
                let attn_out = maybe_add(attn_out, out_b)?;

                // --- Residual + LayerNorm 1 ---
                let pre1 = ops::add(hidden, &attn_out)?;
                let ln1_g_n = mlx_norm_scale_1d(ln1_g)?;
                let ln1_b_n = ln1_b.map(mlx_norm_scale_1d).transpose()?;
                let h1 = ops::layer_norm(&pre1, &ln1_g_n, ln1_b_n.as_ref(), *eps1)?;

                // --- FFN: activation(h1 @ fc1_w [+ fc1_b]) @ fc2_w [+ fc2_b] ---
                let ffn1 = ops::matmul(&h1, fc1_w)?;
                let ffn1 = maybe_add(ffn1, fc1_b)?;
                let ffn1 = match activation {
                    Activation::Gelu => ops::gelu(&ffn1)?,
                    Activation::GeluApprox => ops::gelu_approx(&ffn1)?,
                    Activation::Silu => ops::silu(&ffn1)?,
                    Activation::Relu => ops::unary(&ffn1, MlxUnary::Relu)?,
                    Activation::Sigmoid => ops::unary(&ffn1, MlxUnary::Sigmoid)?,
                    Activation::Tanh => ops::unary(&ffn1, MlxUnary::Tanh)?,
                    Activation::Exp => ops::unary(&ffn1, MlxUnary::Exp)?,
                    Activation::Log => ops::unary(&ffn1, MlxUnary::Log)?,
                    Activation::Sqrt => ops::unary(&ffn1, MlxUnary::Sqrt)?,
                    Activation::Rsqrt => ops::unary(&ffn1, MlxUnary::Rsqrt)?,
                    Activation::Neg => ops::unary(&ffn1, MlxUnary::Neg)?,
                    Activation::Abs => ops::unary(&ffn1, MlxUnary::Abs)?,
                    Activation::Round => ops::unary(&ffn1, MlxUnary::Round)?,
                    Activation::Sin => ops::unary(&ffn1, MlxUnary::Sin)?,
                    Activation::Cos => ops::unary(&ffn1, MlxUnary::Cos)?,
                    Activation::Tan => ops::unary(&ffn1, MlxUnary::Tan)?,
                    Activation::Atan => ops::unary(&ffn1, MlxUnary::Atan)?,
                };
                let ffn2 = ops::matmul(&ffn1, fc2_w)?;
                let ffn_out = maybe_add(ffn2, fc2_b)?;

                // --- Residual + LayerNorm 2 ---
                let pre2 = ops::add(&h1, &ffn_out)?;
                let ln2_g_n = mlx_norm_scale_1d(ln2_g)?;
                let ln2_b_n = ln2_b.map(mlx_norm_scale_1d).transpose()?;
                ops::layer_norm(&pre2, &ln2_g_n, ln2_b_n.as_ref(), *eps2)?
            }
            Op::FusedAttentionBlock {
                num_heads,
                head_dim,
                has_bias,
                has_rope,
            } => {
                // Fused self-attention block, composed from MLX ops as:
                //   QKV proj → split → reshape → transpose →
                //   [Rope on Q, K] → SDPA → transpose back → reshape →
                //   out proj. Custom mask kind (mask is always input #3).
                //
                // Inputs (in order):
                //   hidden, qkv_w, out_w, mask,
                //   [qkv_b, out_b]       if has_bias,
                //   [rope_cos, rope_sin] if has_rope
                //
                // Errors: hidden not rank-3, an odd head_dim or a too-short /
                // non-rank-2 cos table when rope is enabled, plus anything
                // the underlying ops report.
                let h_idx = 0;
                let qkv_w_idx = 1;
                let out_w_idx = 2;
                let mask_idx = 3;
                // Optional operands follow the four fixed ones. Modelling
                // them as `Option` means an absent operand can never be
                // indexed by accident.
                let bias_idx: Option<(usize, usize)> = (*has_bias).then_some((4, 5));
                let rope_base: usize = if *has_bias { 6 } else { 4 };
                let rope_idx: Option<(usize, usize)> =
                    (*has_rope).then_some((rope_base, rope_base + 1));

                let hidden = lookup(&env, node.inputs[h_idx])?;
                let qkv_w = lookup(&env, node.inputs[qkv_w_idx])?;
                let out_w = lookup(&env, node.inputs[out_w_idx])?;
                let mask = lookup(&env, node.inputs[mask_idx])?;

                let h_shape = node_input_shape(graph, node.inputs[h_idx]);
                if h_shape.len() != 3 {
                    return Err(MlxError(format!(
                        "FusedAttentionBlock: hidden must be rank-3 [B, S, H], got {}",
                        h_shape.len()
                    )));
                }
                let (batch, seq) = runtime_bsh_dims(hidden, &h_shape)?;
                let nh = *num_heads as i32;
                let hd = *head_dim as i32;
                let inner = nh * hd;

                // 1. qkv = matmul(hidden, qkv_w) [+ qkv_b]
                let qkv = ops::matmul(hidden, qkv_w)?;
                let qkv = match bias_idx {
                    Some((qkv_b_idx, _)) => {
                        let qkv_b = lookup(&env, node.inputs[qkv_b_idx])?;
                        ops::add(&qkv, qkv_b)?
                    }
                    None => qkv,
                };

                // 2. split into Q, K, V along last axis (each [B, S, inner])
                let q = ops::slice(&qkv, &[0, 0, 0], &[batch, seq, inner])?;
                let k = ops::slice(&qkv, &[0, 0, inner], &[batch, seq, 2 * inner])?;
                let v = ops::slice(&qkv, &[0, 0, 2 * inner], &[batch, seq, 3 * inner])?;

                // 3. reshape to [B, S, H, D] then transpose to [B, H, S, D].
                // Materialize the transposed view with `ops::contiguous`: MLX's
                // `compile` elides bare transpose views, and SDPA needs a real
                // contiguous buffer (same materialization the unfused
                // `Op::Attention` lowering performs).
                let to_h = |t: Array| -> Result<Array, MlxError> {
                    let r = ops::reshape(&t, &[batch, seq, nh, hd])?;
                    let t = ops::transpose(&r, &[0, 2, 1, 3])?;
                    ops::contiguous(&t)
                };
                let mut q = to_h(q)?;
                let mut k = to_h(k)?;
                let v_h = to_h(v)?;

                // 4. Rope on Q and K if requested (full-dim: the rotated
                // width equals head_dim, the last axis of [B, H, S, D]).
                if let Some((cos_idx, sin_idx)) = rope_idx {
                    let cos = lookup(&env, node.inputs[cos_idx])?;
                    let sin = lookup(&env, node.inputs[sin_idx])?;

                    // The rotation pairs lane i with lane i + hd/2, so an
                    // odd head_dim leaves one lane unpaired.
                    if hd % 2 != 0 {
                        return Err(MlxError(format!(
                            "FusedAttentionBlock rope: head_dim {hd} must be even"
                        )));
                    }
                    let half = hd / 2;

                    // Propagate a failed shape query instead of turning it
                    // into a misleading "rank-0" complaint.
                    let cos_shape = cos.shape()?;
                    if cos_shape.len() != 2 {
                        return Err(MlxError(format!(
                            "FusedAttentionBlock rope: cos must be rank-2, got rank-{} shape={cos_shape:?}",
                            cos_shape.len()
                        )));
                    }
                    // A table shorter than the sequence cannot rotate every
                    // position; truncating Q/K to the table length would
                    // only fail later with an opaque shape error.
                    let cos_rows = cos_shape[0] as i32;
                    if cos_rows < seq {
                        return Err(MlxError(format!(
                            "FusedAttentionBlock rope: cos table has {cos_rows} rows \
                             but sequence length is {seq}"
                        )));
                    }

                    // Table slices are identical for Q and K: build the
                    // broadcastable [1, 1, S, D/2] views once.
                    let bshape = [1, 1, seq, half];
                    let cos_seq = ops::slice(cos, &[0, 0], &[seq, half])?;
                    let sin_seq = ops::slice(sin, &[0, 0], &[seq, half])?;
                    let cos_b = ops::reshape(&cos_seq, &bshape)?;
                    let sin_b = ops::reshape(&sin_seq, &bshape)?;

                    // y1 = x1*cos - x2*sin ; y2 = x2*cos + x1*sin, where
                    // x1 / x2 are the lower / upper halves of the last axis.
                    let do_rope = |x: &Array| -> Result<Array, MlxError> {
                        let x1 = ops::slice(x, &[0, 0, 0, 0], &[batch, nh, seq, half])?;
                        let x2 = ops::slice(x, &[0, 0, 0, half], &[batch, nh, seq, hd])?;
                        let y1 = ops::sub(&ops::mul(&x1, &cos_b)?, &ops::mul(&x2, &sin_b)?)?;
                        let y2 = ops::add(&ops::mul(&x2, &cos_b)?, &ops::mul(&x1, &sin_b)?)?;
                        ops::concat(&[&y1, &y2], 3)
                    };
                    q = do_rope(&q)?;
                    k = do_rope(&k)?;
                }

                // 5. SDPA with custom mask.
                //
                // The mask on input #3 is the BERT-style binary mask
                // `[B, S]` (1.0 = valid, 0.0 = padding). MLX's SDPA adds the
                // mask *additively* to scores, so we must convert
                // binary → additive (matching the unfused `Op::Attention`
                // lowering):
                //     additive = (mask - 1) * 1e9
                //   → 0 for valid positions, -1e9 for padding.
                //
                // We also reshape the [B, S] mask to [B, 1, 1, S] so it
                // broadcasts across the head and query axes against the
                // [B, H, S_q, S_k] score tensor. A rank-3 mask becomes
                // [B, 1, d1, d2]; any other rank is passed through as-is.
                //
                // NOTE(review): 1e9 exceeds the F16 range — confirm how
                // `Array::from_f32_slice` converts it when the hidden dtype
                // is F16 (an infinite scale would turn 0 * scale into NaN).
                let scale = 1.0 / (hd as f32).sqrt();
                let q_dtype = graph.node(node.inputs[h_idx]).shape.dtype();
                let m_shape = node_input_shape(graph, node.inputs[mask_idx]);
                let mask_cast = if q_dtype != DType::F32 {
                    ops::cast(mask, q_dtype)?
                } else {
                    mask.clone_handle()?
                };
                let one = Array::from_f32_slice(&[1.0], &[1], q_dtype)?;
                let scl = Array::from_f32_slice(&[1.0e9], &[1], q_dtype)?;
                let shifted = ops::sub(&mask_cast, &one)?;
                let additive = ops::mul(&shifted, &scl)?;
                let additive_4d = match m_shape.len() {
                    2 => ops::reshape(&additive, &[m_shape[0], 1, 1, m_shape[1]])?,
                    3 => ops::reshape(&additive, &[m_shape[0], 1, m_shape[1], m_shape[2]])?,
                    _ => additive,
                };
                let attn_out = ops::attention(
                    &q,
                    &k,
                    &v_h,
                    scale,
                    crate::ffi::MlxMask::Custom,
                    Some(&additive_4d),
                )?;

                // 6. transpose back [B, H, S, D] → [B, S, H, D] → reshape [B, S, H*D]
                let attn_out = ops::transpose(&attn_out, &[0, 2, 1, 3])?;
                let attn_out = ops::reshape(&attn_out, &[batch, seq, inner])?;

                // 7. out projection [+ out_b]
                let y = ops::matmul(&attn_out, out_w)?;
                match bias_idx {
                    Some((_, out_b_idx)) => {
                        let out_b = lookup(&env, node.inputs[out_b_idx])?;
                        ops::add(&y, out_b)?
                    }
                    None => y,
                }
            }
            Op::FusedSwiGLU { cast_to, .. } => {
                // SwiGLU over a packed tensor: the last axis holds two
                // halves; the result is first_half * silu(second_half),
                // optionally cast to `cast_to`.
                let src = lookup(&env, node.inputs[0])?;
                let in_shape = node_input_shape(graph, node.inputs[0]);
                let Some(&last) = in_shape.last() else {
                    return Err(MlxError("FusedSwiGLU: input is rank-0".into()));
                };
                if last % 2 != 0 {
                    return Err(MlxError(format!(
                        "FusedSwiGLU: last dim {last} must be even"
                    )));
                }
                let half = last / 2;
                let rank = in_shape.len();
                let axis = rank - 1;

                // "up" half: [0, half) on the last axis, full range elsewhere.
                let origin = vec![0i32; rank];
                let mut up_end = in_shape.clone();
                up_end[axis] = half;
                // "gate" half: [half, last) on the last axis.
                let mut gate_begin = origin.clone();
                gate_begin[axis] = half;

                let up = ops::slice(src, &origin, &up_end)?;
                let gate = ops::slice(src, &gate_begin, &in_shape)?;
                let activated = ops::silu(&gate)?;
                let product = ops::mul(&up, &activated)?;

                // NOTE(review): the cast is skipped when `cast_to` equals
                // this node's declared dtype — confirm the declared dtype is
                // the pre-cast one, otherwise the cast can never fire.
                match cast_to {
                    Some(dt) if *dt != node.shape.dtype() => ops::cast(&product, *dt)?,
                    _ => product,
                }
            }

            Op::If {
                then_branch,
                else_branch,
            } => {
                // Both branches are lowered inline against the same
                // captures (the parent's inputs after the predicate) and
                // the result is picked per element with
                // `ops::select(pred, then_out, else_out)`.
                let Some((&pred_id, capture_ids)) = node.inputs.split_first() else {
                    return Err(MlxError("If: missing predicate input".into()));
                };
                let pred = lookup(&env, pred_id)?;
                let mut captures: Vec<&Array> = Vec::with_capacity(capture_ids.len());
                for &id in capture_ids {
                    captures.push(lookup(&env, id)?);
                }

                let then_outs = lower_subgraph(then_branch, &captures, params, params_typed, rng)?;
                let else_outs = lower_subgraph(else_branch, &captures, params, params_typed, rng)?;

                // Exactly one output per branch is required.
                match (then_outs.as_slice(), else_outs.as_slice()) {
                    ([then_out], [else_out]) => ops::select(pred, then_out, else_out)?,
                    _ => {
                        return Err(MlxError(format!(
                            "If: each branch must produce exactly 1 output \
                             (then={}, else={})",
                            then_outs.len(),
                            else_outs.len()
                        )));
                    }
                }
            }
            Op::While {
                cond,
                body,
                max_iterations,
            } => {
                // Bounded unroll. Every iteration lowers `cond` and `body`
                // against the current loop-carried state, then keeps the
                // body's result only while the loop is still active
                // (select(active, body_out, carried)), so the state freezes
                // once `cond` has evaluated false. Without a static bound
                // there is nothing to unroll, so a missing
                // `max_iterations` is reported as an error and callers can
                // fall back to host-side looping.
                let Some(max_iter) = *max_iterations else {
                    return Err(MlxError(
                        "While: max_iterations required for unrolled \
                         lowering — MLX has no runtime loop primitive"
                            .into(),
                    ));
                };

                // Seed the loop state with handles to the node's inputs.
                let mut carried: Vec<Array> = Vec::with_capacity(node.inputs.len());
                for id in node.inputs.iter().copied() {
                    let initial = lookup(&env, id)?;
                    carried.push(initial.clone_handle()?);
                }

                // 1.0 while iterating; multiplied by each cond result, so it
                // drops to 0.0 permanently once a cond says "stop".
                let mut active = Array::from_f32_slice(&[1.0], &[1], DType::F32)?;

                for _ in 0..max_iter {
                    let state: Vec<&Array> = carried.iter().collect();

                    let cond_outs = lower_subgraph(cond, &state, params, params_typed, rng)?;
                    let [cond_val] = cond_outs.as_slice() else {
                        return Err(MlxError(format!(
                            "While: cond sub-graph must produce 1 output \
                             (got {})",
                            cond_outs.len()
                        )));
                    };
                    // active &= cond, expressed as an f32 multiply.
                    let cond_as_f32 = ops::cast(cond_val, DType::F32)?;
                    active = ops::mul(&active, &cond_as_f32)?;

                    let body_outs = lower_subgraph(body, &state, params, params_typed, rng)?;
                    if body_outs.len() != carried.len() {
                        return Err(MlxError(format!(
                            "While: body produced {} outputs but {} loop-carried \
                             values were expected",
                            body_outs.len(),
                            carried.len()
                        )));
                    }

                    let gate = ops::cast(&active, DType::Bool)?;
                    let mut updated = Vec::with_capacity(carried.len());
                    for (fresh, previous) in body_outs.iter().zip(&carried) {
                        updated.push(ops::select(&gate, fresh, previous)?);
                    }
                    carried = updated;
                }

                // Op::While is a single-output node by IR convention, so
                // only the first carried value is returned. Multi-output
                // While would need a separate variant or a tuple-typed
                // output node — neither exists today.
                carried
                    .into_iter()
                    .next()
                    .ok_or_else(|| MlxError("While: no loop-carried values".into()))?
            }
            Op::Sample {
                top_k,
                top_p,
                temperature,
                seed,
            } => {
                // Token sampling: temperature scaling → optional top-k
                // filter → optional top-p (nucleus) filter → categorical
                // draw along the last axis. Filtered-out positions are set
                // to -∞ so they receive zero probability.
                let logits = lookup(&env, node.inputs[0])?;

                // Apply temperature (skipped when it is ~1.0).
                // NOTE(review): a temperature of 0 yields an infinite
                // scale here — confirm callers handle greedy decoding
                // before reaching this op.
                let scaled_owned: Option<Array> = if (*temperature - 1.0).abs() <= 1e-6 {
                    None
                } else {
                    let inv_t = 1.0 / *temperature;
                    let s = Array::from_f32_slice(&[inv_t], &[1], DType::F32)?;
                    Some(ops::mul(logits, &s)?)
                };
                let scaled: &Array = scaled_owned.as_ref().unwrap_or(logits);

                let in_shape = node_input_shape(graph, node.inputs[0]);
                let last_axis = if in_shape.is_empty() {
                    -1
                } else {
                    (in_shape.len() - 1) as i32
                };
                let neg_inf = Array::from_f32_slice(&[f32::NEG_INFINITY], &[1], DType::F32)?;

                // top_k filter: keep only the top-k logits, mask the
                // rest to -∞. The threshold is the k-th largest value,
                // taken as the *minimum* of the k selected values: MLX's
                // top-k does not promise any ordering of its result, so
                // reading a fixed position would be unreliable. The
                // keep-dim reduction leaves a size-1 last axis that
                // broadcasts against `scaled`.
                let topk_owned: Option<Array> =
                    if *top_k > 0 && (*top_k as i32) < *in_shape.last().unwrap_or(&i32::MAX) {
                        let k = *top_k as i32;
                        let topk = ops::topk_values(scaled, k, last_axis)?;
                        let threshold = ops::reduce(
                            &topk,
                            MlxReduce::Min,
                            &[last_axis],
                            /*keep_dim=*/ true,
                        )?;
                        let mask = ops::ge(scaled, &threshold)?;
                        Some(ops::select(&mask, scaled, &neg_inf)?)
                    } else {
                        None
                    };
                let after_topk: &Array = topk_owned.as_ref().unwrap_or(scaled);

                // top_p (nucleus) filter, applied when 0 < top_p and
                // top_p is not ~1.0. Algorithm:
                //   1. p = softmax(logits)
                //   2. sort_desc(p) via -sort(-p)
                //   3. exclusive cumsum over sorted_p
                //   4. nucleus = (exclusive_cumsum < top_p)
                //   5. threshold_p = min(sorted_p where nucleus, +inf
                //      where not) — smallest probability still in
                //      the nucleus
                //   6. mask = p >= threshold_p   (broadcast back to
                //      original positions)
                //   7. logits' = where(mask, logits, -inf)
                let topp_owned: Option<Array> = if (*top_p - 1.0).abs() > 1e-6 && *top_p > 0.0 {
                    let p = ops::softmax(after_topk, last_axis)?;
                    let neg_p = ops::unary(&p, MlxUnary::Neg)?;
                    let neg_sorted = ops::sort(&neg_p, last_axis)?;
                    let sorted_p = ops::unary(&neg_sorted, MlxUnary::Neg)?;

                    // Exclusive cumsum: cumsum_excl[i] = sum of first i
                    // entries (so the first entry's cumsum is 0).
                    let cumsum_excl = ops::cumsum(&sorted_p, last_axis, /*exclusive=*/ true)?;
                    let p_thresh = Array::from_f32_slice(&[*top_p], &[1], DType::F32)?;
                    let nucleus = ops::lt(&cumsum_excl, &p_thresh)?;

                    let pos_inf = Array::from_f32_slice(&[f32::INFINITY], &[1], DType::F32)?;
                    let masked_sorted = ops::select(&nucleus, &sorted_p, &pos_inf)?;
                    let threshold_p = ops::reduce(
                        &masked_sorted,
                        MlxReduce::Min,
                        &[last_axis],
                        /*keep_dim=*/ true,
                    )?;

                    let mask_orig = ops::ge(&p, &threshold_p)?;
                    Some(ops::select(&mask_orig, after_topk, &neg_inf)?)
                } else {
                    None
                };
                let final_logits: &Array = topp_owned.as_ref().unwrap_or(after_topk);

                // categorical samples one int32 per row. rlx encodes
                // ids as f32 at the I/O boundary.
                let ids = ops::categorical(final_logits, last_axis, *seed)?;
                ops::cast(&ids, DType::F32)?
            }

            Op::RngNormal {
                mean,
                scale,
                key,
                op_seed,
            } => {
                // Values are produced on the host by
                // `rlx_ir::fill_normal_like` (driven by `mean`, `scale`,
                // the shared `rng` and this op's `key` / `op_seed`) and
                // then uploaded as an array with the node's shape and dtype.
                // The buffer is empty when the element count is unavailable.
                let count = node.shape.num_elements().unwrap_or(0);
                let mut samples = vec![0f32; count];
                rlx_ir::fill_normal_like(&mut samples, *mean, *scale, rng, *key, *op_seed);

                // Every dim goes through `unwrap_static` — presumably this
                // requires a fully static shape; TODO confirm.
                let mut extents: Vec<usize> = Vec::new();
                for dim in node.shape.dims().iter() {
                    extents.push(dim.unwrap_static());
                }
                Array::from_f32_slice(&samples, &extents, node.shape.dtype())?
            }
            Op::RngUniform {
                low,
                high,
                key,
                op_seed,
            } => {
                // Values are produced on the host by
                // `rlx_ir::fill_uniform_like` (driven by `low`, `high`,
                // the shared `rng` and this op's `key` / `op_seed`) and
                // then uploaded as an array with the node's shape and dtype.
                // The buffer is empty when the element count is unavailable.
                let count = node.shape.num_elements().unwrap_or(0);
                let mut samples = vec![0f32; count];
                rlx_ir::fill_uniform_like(&mut samples, *low, *high, rng, *key, *op_seed);

                // Every dim goes through `unwrap_static` — presumably this
                // requires a fully static shape; TODO confirm.
                let mut extents: Vec<usize> = Vec::new();
                for dim in node.shape.dims().iter() {
                    extents.push(dim.unwrap_static());
                }
                Array::from_f32_slice(&samples, &extents, node.shape.dtype())?
            }

            // ── Ops without a direct MLX primitive ───────────────
            //
            // The fallback `other` arm below catches anything we
            // haven't enumerated. The arms that follow here (`Pool`,
            // `Scan`, `SelectiveScan`) are lowered by composing simpler
            // MLX ops; any op that stays unsupported deserves a specific
            // pointer to *why* it's absent so users don't waste
            // time hunting for an off-by-one.
            Op::Pool {
                kind,
                kernel_size,
                stride,
                padding,
            } => {
                // N-D channels-first pool composed from strided-slice
                // + reduction. For each multi-index in the kernel grid
                // we extract the window-positioned slice with the
                // kernel's stride, then merge with the pool's
                // reduction op. Avg-pool divides the running sum by
                // kernel volume (padding cells count toward the
                // average); prod multiplies windows together.
                //
                // Missing stride entries default to 1, missing padding
                // entries to 0.
                //
                // Errors: rank mismatch, unsupported `kind`, non-positive
                // stride, kernel larger than the padded input, or an
                // empty kernel.
                let in_shape = node_input_shape(graph, node.inputs[0]);
                let spatial = kernel_size.len();
                // Input layout: [N, C, ...spatial]. Need rank = 2 + spatial.
                if in_shape.len() != 2 + spatial {
                    return Err(MlxError(format!(
                        "Pool: kernel rank {spatial} requires input rank \
                         {} (channels-first), got {}",
                        2 + spatial,
                        in_shape.len()
                    )));
                }
                if !matches!(
                    kind,
                    ReduceOp::Max | ReduceOp::Min | ReduceOp::Sum | ReduceOp::Mean | ReduceOp::Prod
                ) {
                    return Err(MlxError(format!("Pool: kind {kind:?} not supported")));
                }
                let x = lookup(&env, node.inputs[0])?;
                let ks: Vec<i32> = kernel_size.iter().map(|&k| k as i32).collect();
                let ss: Vec<i32> = (0..spatial)
                    .map(|i| stride.get(i).copied().unwrap_or(1) as i32)
                    .collect();
                let ps: Vec<i32> = (0..spatial)
                    .map(|i| padding.get(i).copied().unwrap_or(0) as i32)
                    .collect();

                // The output-size formula divides by the stride, so a zero
                // stride must be rejected here rather than panic below.
                if let Some(axis) = ss.iter().position(|&s| s <= 0) {
                    return Err(MlxError(format!(
                        "Pool: stride must be positive, got {} on spatial axis {axis}",
                        ss[axis]
                    )));
                }

                // Pad if requested. Max/Min/Prod use neutral elements;
                // sum/avg use 0.
                let pad_value = match kind {
                    ReduceOp::Max => f32::NEG_INFINITY,
                    ReduceOp::Min => f32::INFINITY,
                    ReduceOp::Prod => 1.0,
                    _ => 0.0,
                };
                let needs_pad = ps.iter().any(|&p| p > 0);
                let x_padded_owned;
                let x_padded: &Array = if needs_pad {
                    let mut low = vec![0i32; in_shape.len()];
                    let mut high = vec![0i32; in_shape.len()];
                    low[2..].copy_from_slice(&ps);
                    high[2..].copy_from_slice(&ps);
                    x_padded_owned = ops::pad(x, &low, &high, pad_value)?;
                    &x_padded_owned
                } else {
                    x
                };

                // Output spatial dims: floor((in + 2p - k) / s) + 1. A
                // negative numerator means the kernel does not fit even
                // once; truncating division would otherwise round it
                // toward zero and report a bogus non-empty output.
                let mut out_spatial = Vec::with_capacity(spatial);
                for i in 0..spatial {
                    let span = in_shape[2 + i] + 2 * ps[i] - ks[i];
                    if span < 0 {
                        return Err(MlxError(format!(
                            "Pool: kernel size {} exceeds padded input extent {} \
                             on spatial axis {i}",
                            ks[i],
                            in_shape[2 + i] + 2 * ps[i]
                        )));
                    }
                    out_spatial.push(span / ss[i] + 1);
                }

                // Slice strides are the same for every kernel offset:
                // 1 on batch/channel, the pool stride on spatial axes.
                let mut strides = vec![1i32; in_shape.len()];
                strides[2..].copy_from_slice(&ss);

                // Iterate kernel multi-index lexicographically.
                let kvol: i64 = ks.iter().map(|&v| v as i64).product();
                let mut acc: Option<Array> = None;
                for k_lin in 0..kvol {
                    // Decode the linear index into per-axis kernel offsets
                    // (last spatial axis varies fastest).
                    let mut k_idx = vec![0i32; spatial];
                    let mut rem = k_lin;
                    for i in (0..spatial).rev() {
                        k_idx[i] = (rem % ks[i] as i64) as i32;
                        rem /= ks[i] as i64;
                    }
                    let mut start = vec![0i32; in_shape.len()];
                    let mut stop = vec![0i32; in_shape.len()];
                    stop[0] = in_shape[0]; // batch
                    stop[1] = in_shape[1]; // channels
                    for i in 0..spatial {
                        start[2 + i] = k_idx[i];
                        // One past the last sampled element. This selects
                        // exactly out_spatial[i] elements and never
                        // exceeds the padded extent, so the slice does not
                        // depend on out-of-range stops being clamped.
                        stop[2 + i] = k_idx[i] + ss[i] * (out_spatial[i] - 1) + 1;
                    }
                    let win = ops::slice_strided(x_padded, &start, &stop, &strides)?;
                    acc = Some(match (acc, kind) {
                        (None, _) => win,
                        (Some(a), ReduceOp::Max) => ops::max(&a, &win)?,
                        (Some(a), ReduceOp::Min) => ops::min(&a, &win)?,
                        (Some(a), ReduceOp::Prod) => ops::mul(&a, &win)?,
                        (Some(a), _) => ops::add(&a, &win)?,
                    });
                }
                // A zero-sized kernel axis makes kvol 0 and leaves `acc` unset.
                let acc = acc.ok_or_else(|| MlxError("Pool: empty kernel".into()))?;

                if matches!(kind, ReduceOp::Mean) {
                    let count = kvol as f32;
                    let s = Array::from_f32_slice(&[1.0 / count], &[1], DType::F32)?;
                    ops::mul(&acc, &s)?
                } else {
                    acc
                }
            }
            Op::Scan {
                body,
                length,
                save_trajectory,
                num_xs,
                num_bcast,
                num_checkpoints: _,
            } => {
                // Generic loop-unrolled scan. MLX has no native scan
                // primitive, so we walk t = 0..length, lower the body once
                // per iter with the previous step's carry as the first
                // capture, and (if save_trajectory) collect each step's
                // carry into a stacked `[length, *carry]` tensor.
                //
                // Inputs layout (per Op::Scan IR doc):
                //   [init, bcast_0..bcast_{B-1}, x_t_0..x_t_{X-1}]
                // The body's Op::Inputs in declaration order are:
                //   [carry, bcast_0..bcast_{B-1}, x_at_t_0..x_at_t_{X-1}]
                //
                // For static `length`, the unrolled trace lives in
                // MLX's lazy graph and gets compiled once on first
                // dispatch.
                //
                // Result: the final carry, or the stacked trajectory when
                // `save_trajectory` is set.
                // NOTE(review): with `save_trajectory` and length == 0 the
                // concat below receives no operands — confirm that
                // `ops::concat` handles an empty list.
                let init = lookup(&env, node.inputs[0])?;
                let bcasts: Vec<&Array> = (0..*num_bcast as usize)
                    .map(|i| lookup(&env, node.inputs[1 + i]))
                    .collect::<Result<Vec<_>, _>>()?;
                let xs: Vec<&Array> = (0..*num_xs as usize)
                    .map(|i| lookup(&env, node.inputs[1 + *num_bcast as usize + i]))
                    .collect::<Result<Vec<_>, _>>()?;

                // Shape of one trajectory row: the carry shape with a
                // leading length-1 axis, so rows concat into [length, *carry].
                let carry_shape: Vec<i32> = init.shape()?.iter().map(|d| *d as i32).collect();
                let mut traj_row_shape: Vec<i32> = Vec::with_capacity(1 + carry_shape.len());
                traj_row_shape.push(1);
                traj_row_shape.extend_from_slice(&carry_shape);

                // Per-step shape of every scanned input (its shape minus
                // the leading time axis). These do not change across
                // iterations, so query them once up front.
                let mut xs_plan: Vec<(&Array, Vec<i32>)> = Vec::with_capacity(xs.len());
                for x in &xs {
                    let x_shape = x.shape()?;
                    if x_shape.is_empty() {
                        return Err(MlxError(
                            "Op::Scan: scanned input must have a leading length axis, \
                             got rank-0"
                                .into(),
                        ));
                    }
                    let per_step: Vec<i32> = x_shape[1..].iter().map(|d| *d as i32).collect();
                    xs_plan.push((*x, per_step));
                }

                let mut carry: Array = init.clone_handle()?;
                let mut traj_slices: Vec<Array> = if *save_trajectory {
                    Vec::with_capacity(*length as usize)
                } else {
                    Vec::new()
                };

                for t in 0..(*length as i32) {
                    // Build per-iter captures: carry, bcasts, xs[t].
                    let mut captures: Vec<Array> =
                        Vec::with_capacity(1 + bcasts.len() + xs_plan.len());
                    captures.push(carry.clone_handle()?);
                    for b in &bcasts {
                        captures.push(b.clone_handle()?);
                    }
                    for (x, per_step) in &xs_plan {
                        // x has shape [length, *per_step]. Slice axis-0
                        // row t and drop that axis to feed the body.
                        let mut start = vec![0i32; 1 + per_step.len()];
                        start[0] = t;
                        let mut stop: Vec<i32> = Vec::with_capacity(1 + per_step.len());
                        stop.push(t + 1);
                        stop.extend_from_slice(per_step);
                        let row = ops::slice(x, &start, &stop)?;
                        captures.push(ops::reshape(&row, per_step)?);
                    }
                    let capture_refs: Vec<&Array> = captures.iter().collect();

                    // First body output is the next carry; any further
                    // outputs are ignored.
                    carry = lower_subgraph(body, &capture_refs, params, params_typed, rng)?
                        .into_iter()
                        .next()
                        .ok_or_else(|| MlxError("Op::Scan: body produced no outputs".into()))?;

                    if *save_trajectory {
                        traj_slices.push(ops::reshape(&carry, &traj_row_shape)?);
                    }
                }

                if *save_trajectory {
                    let refs: Vec<&Array> = traj_slices.iter().collect();
                    ops::concat(&refs, 0)?
                } else {
                    carry
                }
            }
            Op::SelectiveScan { state_size } => {
                // Mamba SSM step. MLX has no native scan primitive,
                // so we compose by unrolling the time loop into seq
                // many op chains. Acceptable for static-shape graphs
                // (which all our graphs are); mlx::compile then caches
                // the unrolled trace so per-call cost is amortized.
                //
                // Inputs (per the IR doc):
                //   x [b, s, h]      f32 input
                //   delta [b, s, h]  f32 step size
                //   a [h, n]         f32 transition matrix
                //   b [b, s, n]      f32 input projection
                //   c [b, s, n]      f32 output projection
                // Output [b, s, h], state h [b, h, n] init to zero.
                //
                // Errors: returns `MlxError` when x is not rank-3 or when
                // its sequence dimension is not strictly positive.
                let x = lookup(&env, node.inputs[0])?;
                let delta = lookup(&env, node.inputs[1])?;
                let a = lookup(&env, node.inputs[2])?;
                let b_in = lookup(&env, node.inputs[3])?;
                let c_in = lookup(&env, node.inputs[4])?;

                let x_shape = node_input_shape(graph, node.inputs[0]);
                if x_shape.len() != 3 {
                    return Err(MlxError(format!(
                        "SelectiveScan: x must be rank-3 [B, S, H], got rank {}",
                        x_shape.len()
                    )));
                }
                let batch = x_shape[0];
                let seq = x_shape[1];
                let hidden = x_shape[2];
                let n = *state_size as i32;

                // The loop below is unrolled `seq` times and its per-step
                // outputs are concatenated. A zero length would hand
                // `concat` an empty list, and a negative length would wrap
                // to a huge `usize` in `Vec::with_capacity`; reject both
                // with a descriptive error instead.
                if seq <= 0 {
                    return Err(MlxError(format!(
                        "SelectiveScan: sequence length must be positive, got {seq}"
                    )));
                }

                // State: [B, H, N]. Initialize from a zero scalar
                // broadcast to the target shape; broadcast_to gives
                // a strided view, but we follow with a multiply later
                // so it materializes.
                let zero = Array::from_f32_slice(&[0.0], &[1], DType::F32)?;
                let mut state = ops::broadcast_to(&zero, &[batch, hidden, n])?;

                let mut ys: Vec<Array> = Vec::with_capacity(seq as usize);
                for t in 0..seq {
                    // Slice time-step t out of every [B, S, *] operand and
                    // reshape so the pieces broadcast against [B, H, N].
                    let dt = ops::slice(delta, &[0, t, 0], &[batch, t + 1, hidden])?;
                    let dt = ops::reshape(&dt, &[batch, hidden, 1])?; // [B, H, 1]
                    let xt = ops::slice(x, &[0, t, 0], &[batch, t + 1, hidden])?;
                    let xt = ops::reshape(&xt, &[batch, hidden, 1])?; // [B, H, 1]
                    let bt = ops::slice(b_in, &[0, t, 0], &[batch, t + 1, n])?;
                    let bt = ops::reshape(&bt, &[batch, 1, n])?; // [B, 1, N]
                    let ct = ops::slice(c_in, &[0, t, 0], &[batch, t + 1, n])?;
                    let ct = ops::reshape(&ct, &[batch, 1, n])?; // [B, 1, N]

                    // exp(delta * A): a is [H, N], dt is [B, H, 1].
                    // Their product broadcasts to [B, H, N].
                    let delta_a = ops::mul(&dt, a)?;
                    let exp_delta_a = ops::unary(&delta_a, MlxUnary::Exp)?;

                    // delta * B[t] * x[t]: dt [B, H, 1], bt [B, 1, N],
                    // xt [B, H, 1] → product [B, H, N].
                    let dt_b = ops::mul(&dt, &bt)?; // [B, H, N]
                    let delta_bx = ops::mul(&dt_b, &xt)?; // [B, H, N]

                    // Recurrence: state = exp(δA) * state + δBx
                    let damped = ops::mul(&exp_delta_a, &state)?;
                    state = ops::add(&damped, &delta_bx)?;

                    // y[t] = sum_n( C[t] * state ) along axis 2 → [B, H]
                    let c_state = ops::mul(&ct, &state)?; // [B, H, N]
                    let yt = ops::reduce(&c_state, MlxReduce::Sum, &[2], /*keep_dim=*/ false)?;
                    // Reshape to [B, 1, H] so we can concat into [B, S, H].
                    let yt = ops::reshape(&yt, &[batch, 1, hidden])?;
                    ys.push(yt);
                }

                // Stitch the per-step outputs back together along time.
                let refs: Vec<&Array> = ys.iter().collect();
                ops::concat(&refs, 1)?
            }
            Op::GatedDeltaNet {
                state_size,
                carry_state,
            } => {
                // Operands are read positionally from the node's inputs;
                // the computation itself is delegated to
                // `lower_gated_delta_net`.
                let query = lookup(&env, node.inputs[0])?;
                let key = lookup(&env, node.inputs[1])?;
                let value = lookup(&env, node.inputs[2])?;
                let gate = lookup(&env, node.inputs[3])?;
                let beta = lookup(&env, node.inputs[4])?;
                // Input 5 (the incoming state) is only looked up when the
                // op is flagged as carrying state.
                let prior_state = match *carry_state {
                    true => Some(lookup(&env, node.inputs[5])?),
                    false => None,
                };
                let q_shape = node_input_shape(graph, node.inputs[0]);
                let (out, state_wb) = lower_gated_delta_net(
                    query,
                    key,
                    value,
                    gate,
                    beta,
                    *state_size,
                    prior_state,
                    q_shape,
                )?;
                // When a new state came back, store it under the state
                // input's id — presumably so later reads of that id see
                // the updated state (confirm against the env's consumers).
                if let (true, Some(next_state)) = (*carry_state, state_wb) {
                    env.insert(node.inputs[5], next_state);
                }
                out
            }

            // ── Tier 1 autodiff backward ops ─────────────────────────
            // Composed from existing MLX primitives so MLX can run the
            // gradient graph emitted by `rlx_opt::autodiff::grad_with_loss`.
            // Formulas mirror `rlx-cpu/src/thunk.rs` (the reference).
            Op::ReluBackward => {
                // dx = dy where x > 0, otherwise 0.
                let input = lookup(&env, node.inputs[0])?;
                let upstream = lookup(&env, node.inputs[1])?;
                // One-element zero constant in the node's dtype; it serves
                // both as the comparison threshold and as the fill value.
                let zeros = Array::from_f32_slice(&[0.0], &[1], node.shape.dtype())?;
                let is_positive = ops::gt(input, &zeros)?;
                ops::select(&is_positive, upstream, &zeros)?
            }

            Op::ActivationBackward { kind } => {
                // Activation gradient: input 0 is the forward input x,
                // input 1 the upstream gradient dy. The per-kind formula
                // is assembled by `activation_backward_compose`.
                let (pre_act, upstream) = (
                    lookup(&env, node.inputs[0])?,
                    lookup(&env, node.inputs[1])?,
                );
                activation_backward_compose(pre_act, upstream, *kind, node.shape.dtype())?
            }

            Op::SoftmaxCrossEntropyWithLogits => {
                // logits: [N, C], labels: [N] (f32-encoded indices).
                // loss[n] = lse(logits[n]) - logits[n, labels[n]].
                //
                // Errors: returns `MlxError` when logits is not rank-2
                // (instead of panicking on the shape indexing below).
                let logits = lookup(&env, node.inputs[0])?;
                let labels = lookup(&env, node.inputs[1])?;
                let logits_shape = node_input_shape(graph, node.inputs[0]);
                if logits_shape.len() != 2 {
                    return Err(MlxError(format!(
                        "SoftmaxCrossEntropyWithLogits: logits must be rank-2 [N, C], got rank {}",
                        logits_shape.len()
                    )));
                }
                let n = logits_shape[0];
                let c = logits_shape[1];
                let dtype = node.shape.dtype();

                // Numerically-stable logsumexp along axis 1:
                //   lse = max + log(sum(exp(logits - max))).
                let m = ops::reduce(logits, MlxReduce::Max, &[1], /*keep_dim=*/ true)?;
                let shifted = ops::sub(logits, &m)?;
                let exp_d = ops::unary(&shifted, MlxUnary::Exp)?;
                let sum_exp = ops::reduce(&exp_d, MlxReduce::Sum, &[1], /*keep_dim=*/ false)?;
                let log_sum = ops::unary(&sum_exp, MlxUnary::Log)?;
                // `m` kept its reduced axis ([N, 1]); flatten to [N] so it
                // lines up with `log_sum`.
                let m_squeezed = ops::reshape(&m, &[n])?;
                let lse = ops::add(&m_squeezed, &log_sum)?;

                // logits[label] via one-hot mask.
                let oh = one_hot_2d(labels, n as usize, c as usize, dtype)?;
                let masked = ops::mul(logits, &oh)?;
                let logit_at_label =
                    ops::reduce(&masked, MlxReduce::Sum, &[1], /*keep_dim=*/ false)?;

                ops::sub(&lse, &logit_at_label)?
            }

            Op::SoftmaxCrossEntropyBackward => {
                // dlogits[n, c] = (softmax(logits)[n, c] - one_hot(labels)[n, c]) * d_loss[n].
                //
                // Inputs: 0 = logits [N, C], 1 = labels, 2 = d_loss.
                // Errors: returns `MlxError` when logits is not rank-2
                // (instead of panicking on the shape indexing below).
                let logits = lookup(&env, node.inputs[0])?;
                let labels = lookup(&env, node.inputs[1])?;
                let d_loss = lookup(&env, node.inputs[2])?;
                let logits_shape = node_input_shape(graph, node.inputs[0]);
                if logits_shape.len() != 2 {
                    return Err(MlxError(format!(
                        "SoftmaxCrossEntropyBackward: logits must be rank-2 [N, C], got rank {}",
                        logits_shape.len()
                    )));
                }
                let n = logits_shape[0];
                let c = logits_shape[1];
                let dtype = node.shape.dtype();

                let sm = ops::softmax(logits, 1)?;
                let oh = one_hot_2d(labels, n as usize, c as usize, dtype)?;
                let diff = ops::sub(&sm, &oh)?;
                // Reshape d_loss to a column [N, 1] so it scales every
                // class entry of its row.
                let d_loss_2d = ops::reshape(d_loss, &[n, 1])?;
                ops::mul(&diff, &d_loss_2d)?
            }

            Op::LayerNormBackwardInput { eps, axis: _ } => {
                // axis = -1 only (per IR docstring).
                // dx = inv_std · (sy − mean(sy) − x̂ · mean(sy · x̂))
                // where sy = dy · γ, x̂ = (x − μ) · inv_std.
                //
                // Inputs: 0 = x, 1 = γ, 2 = dy.
                // Errors: returns `MlxError` for a rank-0 x, where there
                // is no last axis to normalize over.
                let x = lookup(&env, node.inputs[0])?;
                let gamma = lookup(&env, node.inputs[1])?;
                let dy = lookup(&env, node.inputs[2])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                // `len() - 1` would underflow on an empty shape.
                if x_shape.is_empty() {
                    return Err(MlxError(
                        "LayerNormBackwardInput on MLX: x must have rank >= 1".into(),
                    ));
                }
                let last = (x_shape.len() - 1) as i32;
                let dtype = node.shape.dtype();
                let eps_arr = Array::from_f32_slice(&[*eps], &[1], dtype)?;

                // Recompute the forward statistics (μ, σ², x̂) from x; every
                // reduction keeps the reduced axis for broadcasting.
                let mean = ops::reduce(x, MlxReduce::Mean, &[last], true)?;
                let diff = ops::sub(x, &mean)?;
                let diff_sq = ops::mul(&diff, &diff)?;
                let var = ops::reduce(&diff_sq, MlxReduce::Mean, &[last], true)?;
                let var_eps = ops::add(&var, &eps_arr)?;
                let inv_std = ops::unary(&var_eps, MlxUnary::Rsqrt)?;
                let xhat = ops::mul(&diff, &inv_std)?;
                // sy = dy · γ, plus the two row means the formula needs.
                let sy = ops::mul(dy, gamma)?;
                let m_sy = ops::reduce(&sy, MlxReduce::Mean, &[last], true)?;
                let sy_xh = ops::mul(&sy, &xhat)?;
                let m_sxh = ops::reduce(&sy_xh, MlxReduce::Mean, &[last], true)?;
                let term1 = ops::sub(&sy, &m_sy)?;
                let term2 = ops::mul(&xhat, &m_sxh)?;
                let inner = ops::sub(&term1, &term2)?;
                ops::mul(&inv_std, &inner)?
            }

            Op::FakeQuantize {
                bits,
                axis,
                ste: _,
                scale_mode,
            } => {
                // Forward fake-quantization:
                //   y = clamp(round(x / s), -q_max, q_max) · s
                // with the per-channel scale `s` chosen by `scale_mode`.
                // `ste` is ignored here: it only shapes the backward pass.
                let input = lookup(&env, node.inputs[0])?;
                let in_dims = node_input_shape(graph, node.inputs[0]);
                let out_dtype = node.shape.dtype();
                let levels = fq_q_max(*bits)?;

                let step = match scale_mode {
                    ScaleMode::EMA { .. } => {
                        return Err(MlxError(
                            "Op::FakeQuantize with ScaleMode::EMA not yet \
                             supported on MLX (the running scale state \
                             update needs side-effect plumbing the lazy \
                             trace doesn't expose). Use ScaleMode::PerBatch \
                             for QAT training or ScaleMode::Fixed for \
                             pre-calibrated inference."
                                .into(),
                        ));
                    }
                    // Pre-calibrated scale supplied as the second input.
                    ScaleMode::Fixed => {
                        let calibrated = lookup(&env, node.inputs[1])?;
                        fq_scale_from_state(calibrated, &in_dims, *axis, out_dtype)?
                    }
                    // Scale derived from the current batch of `x`.
                    ScaleMode::PerBatch => {
                        fq_scale_perbatch(input, &in_dims, *axis, levels, out_dtype)?
                    }
                };
                fq_quantize_dequantize(input, &step, levels, out_dtype)?
            }

            Op::FakeQuantizeBackward { bits, axis, ste } => {
                // Straight-through-estimator gradient for fake-quantize.
                // Mirrors the CPU thunk (`rlx-cpu/src/thunk.rs:4239`): the
                // scale is always recomputed PerBatch from the current `x`,
                // however the forward pass obtained its scale.
                let input = lookup(&env, node.inputs[0])?;
                let upstream = lookup(&env, node.inputs[1])?;
                let in_dims = node_input_shape(graph, node.inputs[0]);
                let out_dtype = node.shape.dtype();
                let levels = fq_q_max(*bits)?;
                let step = fq_scale_perbatch(input, &in_dims, *axis, levels, out_dtype)?;

                // One-element constants shared by the STE variants below.
                let levels_arr = Array::from_f32_slice(&[levels], &[1], out_dtype)?;
                let unit = Array::from_f32_slice(&[1.0], &[1], out_dtype)?;
                let nil = Array::from_f32_slice(&[0.0], &[1], out_dtype)?;

                // Clipping bound q_max·s, built only by the variants that
                // actually use it.
                let clip_bound = || ops::mul(&step, &levels_arr);

                match ste {
                    // dx = dy
                    SteKind::Identity => upstream.clone_handle()?,
                    // dx = where(|x| ≤ q_max·s, dy, 0)
                    SteKind::ClippedIdentity => {
                        let limit = clip_bound()?;
                        let magnitude = ops::unary(input, MlxUnary::Abs)?;
                        let inside = ops::le(&magnitude, &limit)?;
                        ops::select(&inside, upstream, &nil)?
                    }
                    // dx = dy · (1 − tanh²(x/s))
                    SteKind::Tanh => {
                        let normalized = ops::div(input, &step)?;
                        let th = ops::unary(&normalized, MlxUnary::Tanh)?;
                        let th_sq = ops::mul(&th, &th)?;
                        let slope = ops::sub(&unit, &th_sq)?;
                        ops::mul(upstream, &slope)?
                    }
                    // dx = dy · max(0, 1 − |x/(q_max·s)|)
                    SteKind::HardTanh => {
                        let limit = clip_bound()?;
                        let normalized = ops::div(input, &limit)?;
                        let magnitude = ops::unary(&normalized, MlxUnary::Abs)?;
                        let headroom = ops::sub(&unit, &magnitude)?;
                        let slope = ops::max(&headroom, &nil)?;
                        ops::mul(upstream, &slope)?
                    }
                }
            }

            Op::MaxPool2dBackward {
                kernel_size,
                stride,
                padding,
            } => {
                // Max-pool gradient.
                //   x  : [N, C, H, W]          forward input
                //   dy : [N, C, H_out, W_out]  upstream gradient
                //   dx : [N, C, H, W]          result
                let pool_args_are_2d = [kernel_size.len(), stride.len(), padding.len()]
                    .iter()
                    .all(|&len| len == 2);
                if !pool_args_are_2d {
                    return Err(MlxError("MaxPool2dBackward on MLX: 2D pool only".into()));
                }
                let input = lookup(&env, node.inputs[0])?;
                let upstream = lookup(&env, node.inputs[1])?;
                let in_dims = node_input_shape(graph, node.inputs[0]);
                let up_dims = node_input_shape(graph, node.inputs[1]);
                if !(in_dims.len() == 4 && up_dims.len() == 4) {
                    return Err(MlxError(
                        "MaxPool2dBackward on MLX: 2D pool expects rank-4 tensors".into(),
                    ));
                }
                let (n, cc, h, w) = (in_dims[0], in_dims[1], in_dims[2], in_dims[3]);
                let (h_out, w_out) = (up_dims[2], up_dims[3]);
                let (kh, kw) = (kernel_size[0] as i32, kernel_size[1] as i32);
                let (sh, sw) = (stride[0] as i32, stride[1] as i32);
                let (ph, pw) = (padding[0] as i32, padding[1] as i32);

                // Custom Metal kernel: each thread owns one output position,
                // finds the argmax inside its window and atomically adds
                // into dx. Overlapping windows (stride < kernel) and
                // padding > 0 go through the same path. ~5–10× faster than
                // composing primitives on shapes where MLX's
                // `scatter_add_axis` is the bottleneck.
                ops::maxpool2d_backward_metal(
                    input, upstream, n, cc, h, w, h_out, w_out, kh, kw, sh, sw, ph, pw,
                )?
            }

            Op::Conv2dBackwardInput {
                kernel_size,
                stride,
                padding,
                dilation,
                groups,
            } => {
                // Reverse-mode conv-grad-w.r.t.-input. Translates the
                // forward conv parameters into the `conv_general`
                // arguments MLX itself uses inside its built-in vjp
                // (see vendor/mlx/mlx/primitives.cpp `Convolution::vjp`).
                //
                // Inputs: 0 = dy (NCHW), 1 = forward weight
                // [C_out, C_in/g, kH, kW]; the node's own shape is dx.
                // Errors: returns `MlxError` for non-2D kernels, non-rank-4
                // operands, a non-positive group count, or channel counts
                // the group count does not divide.
                if kernel_size.len() != 2 {
                    return Err(MlxError("Conv2dBackwardInput on MLX: 2D conv only".into()));
                }
                let dy = lookup(&env, node.inputs[0])?;
                let w = lookup(&env, node.inputs[1])?;
                let dy_shape = node_input_shape(graph, node.inputs[0]);
                let w_shape = node_input_shape(graph, node.inputs[1]);
                let dx_shape: Vec<i32> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static() as i32)
                    .collect();
                if dy_shape.len() != 4 || w_shape.len() != 4 || dx_shape.len() != 4 {
                    return Err(MlxError(
                        "Conv2dBackwardInput on MLX: 2D conv expects rank-4 tensors".into(),
                    ));
                }

                let g = *groups as i32;
                // Guard before the `%` below: a zero divisor would panic
                // rather than surface as an `MlxError`.
                if g <= 0 {
                    return Err(MlxError(format!(
                        "Conv2dBackwardInput: groups must be >= 1, got {g}"
                    )));
                }
                let c_in = dx_shape[1];
                let c_out = dy_shape[1];
                if c_in % g != 0 || c_out % g != 0 {
                    return Err(MlxError(format!(
                        "Conv2dBackwardInput: groups ({g}) must divide \
                         C_in ({c_in}) and C_out ({c_out})"
                    )));
                }
                let c_in_per_g = c_in / g;
                let c_out_per_g = c_out / g;
                let h = dx_shape[2];
                let w_in = dx_shape[3];
                let h_out = dy_shape[2];
                let w_out = dy_shape[3];
                let kh = w_shape[2];
                let kw = w_shape[3];
                // Per-axis accessors; a missing entry falls back to the
                // neutral value (stride 1, padding 0, dilation 1).
                let s = |i: usize| stride.get(i).copied().unwrap_or(1) as i32;
                let p = |i: usize| padding.get(i).copied().unwrap_or(0) as i32;
                let d = |i: usize| dilation.get(i).copied().unwrap_or(1) as i32;

                // Per MLX vjp (vendor/mlx/mlx/primitives.cpp):
                //   wt_size       = 1 + D·(K−1)
                //   padding_lo[i] = wt_size − P_orig − 1     = D·(K−1) − P
                //   in_size       = H,   out_size = 1 + S·(H_out − 1)
                //   padding_hi[i] = in_size − out_size + P
                let pad_lo: Vec<i32> = vec![d(0) * (kh - 1) - p(0), d(1) * (kw - 1) - p(1)];
                let pad_hi: Vec<i32> = vec![
                    h - 1 - s(0) * (h_out - 1) + p(0),
                    w_in - 1 - s(1) * (w_out - 1) + p(1),
                ];

                // dy: rlx NCHW → MLX NHWC.
                let dy_nhwc = ops::transpose(dy, &[0, 2, 3, 1])?;

                // MLX limitation: `conv_general` with both `groups > 1` and
                // `input_dilation > 1` produces incorrect output (the
                // grouped path doesn't compose with the dilated-input
                // path; tests/autodiff_conv_parity.rs::*_groups_*_stride2
                // proves it). Workaround: when both kick in, materialize
                // the input dilation by reshape+pad+reshape (zero-inflate
                // dy along each spatial axis) and call conv_general with
                // `input_dilation=[1,1]`.
                let needs_inflate = g > 1 && (s(0) > 1 || s(1) > 1);
                let (dy_input, conv_input_dilation): (Array, [i32; 2]) = if needs_inflate {
                    let inflated = inflate_spatial_2d(&dy_nhwc, s(0) as usize, s(1) as usize)?;
                    (inflated, [1, 1])
                } else {
                    (dy_nhwc.clone_handle()?, [s(0), s(1)])
                };

                // Weight transform — translates MLX vjp's `group_transpose(wt, 0, 1, -1)`.
                //   groups=1: rlx [C_out, C_in, kH, kW] → [C_in, kH, kW, C_out]
                //             via the single perm [1, 2, 3, 0].
                //   groups>1: split C_out by group via reshape, swap C_out/g
                //             with C_in/g, then flatten (groups, C_in/g) → C_in:
                //               [C_out, C_in/g, kH, kW]
                //             → [g, C_out/g, C_in/g, kH, kW]   (reshape)
                //             → [g, C_in/g, kH, kW, C_out/g]   (perm 0,2,3,4,1)
                //             → [C_in, kH, kW, C_out/g]        (reshape)
                let w_t = if g == 1 {
                    ops::transpose(w, &[1, 2, 3, 0])?
                } else {
                    let split = ops::reshape(w, &[g, c_out_per_g, c_in_per_g, kh, kw])?;
                    let perm = ops::transpose(&split, &[0, 2, 3, 4, 1])?;
                    ops::reshape(&perm, &[c_in, kh, kw, c_out_per_g])?
                };

                let raw = ops::conv_general(
                    &dy_input,
                    &w_t,
                    /* stride          = */ &[1, 1],
                    /* padding_lo      = */ &pad_lo,
                    /* padding_hi      = */ &pad_hi,
                    /* kernel_dilation = */ &[d(0), d(1)],
                    /* input_dilation  = */ &conv_input_dilation,
                    /* groups          = */ g,
                    /* flip            = */ true,
                )?;

                // Negative-padding fixup: MLX's `conv_general` accepts
                // negative padding by *over-producing* and we slice the
                // overshoot off (matches MLX vjp's own behavior).
                let needs_slice = pad_lo.iter().chain(pad_hi.iter()).any(|&p| p < 0);
                let adjusted = if needs_slice {
                    let cur: Vec<i32> = raw.shape()?.iter().map(|&d| d as i32).collect();
                    let mut start = vec![0i32; cur.len()];
                    let mut stop = cur.clone();
                    // `raw` is NHWC, so spatial axis i lives at index 1 + i.
                    for i in 0..2 {
                        if pad_lo[i] < 0 {
                            start[1 + i] = -pad_lo[i];
                        }
                        if pad_hi[i] < 0 {
                            stop[1 + i] += pad_hi[i];
                        }
                    }
                    ops::slice(&raw, &start, &stop)?
                } else {
                    raw
                };

                // NHWC → NCHW for the rlx-side consumer.
                // `contiguous` materializes the strided view; without
                // it `mc::compile` elides the transpose and the readback
                // ends up in NHWC layout (compile-mode bug repro:
                // `tests/conv_compile_mode_repro.rs`).
                let nchw = ops::transpose(&adjusted, &[0, 3, 1, 2])?;
                ops::contiguous(&nchw)?
            }

            Op::Conv2dBackwardWeight {
                kernel_size,
                stride,
                padding,
                dilation,
                groups,
            } => {
                // Reverse-mode conv-grad-w.r.t.-weight, expressed as a
                // `conv_general` call with the same argument mapping the
                // MLX vjp uses.
                //
                // Inputs: 0 = x (NCHW), 1 = dy (NCHW); the node's own shape
                // is dw.
                // Errors: returns `MlxError` for non-2D kernels, non-rank-4
                // operands, a non-positive group count, or channel counts
                // the group count does not divide.
                if kernel_size.len() != 2 {
                    return Err(MlxError("Conv2dBackwardWeight on MLX: 2D conv only".into()));
                }
                let x = lookup(&env, node.inputs[0])?;
                let dy = lookup(&env, node.inputs[1])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                let dy_shape = node_input_shape(graph, node.inputs[1]);
                let dw_shape: Vec<i32> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static() as i32)
                    .collect();
                if x_shape.len() != 4 || dy_shape.len() != 4 || dw_shape.len() != 4 {
                    return Err(MlxError(
                        "Conv2dBackwardWeight on MLX: 2D conv expects rank-4 tensors".into(),
                    ));
                }
                let g = *groups as i32;
                // Guard before the `%` below: a zero divisor would panic
                // rather than surface as an `MlxError`.
                if g <= 0 {
                    return Err(MlxError(format!(
                        "Conv2dBackwardWeight: groups must be >= 1, got {g}"
                    )));
                }
                let n_batch = x_shape[0];
                let c_in = x_shape[1];
                let c_out = dy_shape[1];
                if c_in % g != 0 || c_out % g != 0 {
                    return Err(MlxError(format!(
                        "Conv2dBackwardWeight: groups ({g}) must divide \
                         C_in ({c_in}) and C_out ({c_out})"
                    )));
                }
                let c_in_per_g = c_in / g;
                let h = x_shape[2];
                let w_in = x_shape[3];
                let h_out = dy_shape[2];
                let w_out = dy_shape[3];
                let kh = dw_shape[2];
                let kw = dw_shape[3];
                // Per-axis accessors; a missing entry falls back to the
                // neutral value (stride 1, padding 0, dilation 1).
                let s = |i: usize| stride.get(i).copied().unwrap_or(1) as i32;
                let p = |i: usize| padding.get(i).copied().unwrap_or(0) as i32;
                let d = |i: usize| dilation.get(i).copied().unwrap_or(1) as i32;

                // Per MLX vjp:
                //   padding_lo[i] = P
                //   padding_hi[i] = (S·(H_out−1) + 1) − H + (D·(K−1) + 1) − P − 1
                let pad_lo: Vec<i32> = vec![p(0), p(1)];
                let pad_hi: Vec<i32> = vec![
                    s(0) * (h_out - 1) + 1 - h + d(0) * (kh - 1) + 1 - p(0) - 1,
                    s(1) * (w_out - 1) + 1 - w_in + d(1) * (kw - 1) + 1 - p(1) - 1,
                ];

                // dy: rlx NCHW → swapaxes(NHWC, 0, -1) =
                //   [C_out, H_out, W_out, N]  via transpose [1, 2, 3, 0].
                let cotan_trans = ops::transpose(dy, &[1, 2, 3, 0])?;

                // x transform — translates MLX vjp's `group_transpose(in, -1, 0, -1)`.
                //   groups=1: rlx [N, C_in, H, W] → [C_in, H, W, N]
                //             via the single perm [1, 2, 3, 0].
                //   groups>1: split C_in by group, swap N and C_in/g, then
                //             flatten (g, N) → (g·N):
                //               [N, C_in, H, W]
                //             → [N, g, C_in/g, H, W]            (reshape)
                //             → [C_in/g, H, W, g, N]            (perm 2,3,4,1,0)
                //             → [C_in/g, H, W, g·N]             (reshape)
                let in_trans = if g == 1 {
                    ops::transpose(x, &[1, 2, 3, 0])?
                } else {
                    let split = ops::reshape(x, &[n_batch, g, c_in_per_g, h, w_in])?;
                    let perm = ops::transpose(&split, &[2, 3, 4, 1, 0])?;
                    ops::reshape(&perm, &[c_in_per_g, h, w_in, g * n_batch])?
                };

                // Forward stride and dilation swap roles here: the forward
                // dilation becomes this conv's stride and the forward
                // stride becomes its kernel dilation.
                let grad_trans = ops::conv_general(
                    &in_trans,
                    &cotan_trans,
                    /* stride          = */ &[d(0), d(1)],
                    /* padding_lo      = */ &pad_lo,
                    /* padding_hi      = */ &pad_hi,
                    /* kernel_dilation = */ &[s(0), s(1)],
                    /* input_dilation  = */ &[1, 1],
                    /* groups          = */ g,
                    /* flip            = */ false,
                )?;
                // grad_trans: [C_in/g, kH, kW, C_out] (C_in/g == C_in when
                // groups = 1). rlx layout wants [C_out, C_in/g, kH, kW]
                // → perm [3, 0, 1, 2]. As with backward-input,
                // `contiguous` is required to defeat `mc::compile`'s
                // strided-view elision.
                let dw = ops::transpose(&grad_trans, &[3, 0, 1, 2])?;
                ops::contiguous(&dw)?
            }

            Op::LayerNormBackwardGamma { eps, axis: _ } => {
                // axis = -1 only. dgamma = sum_over_outer(dy · x̂).
                //
                // Inputs: 0 = x, 1 = dy.
                // Errors: returns `MlxError` for a rank-0 x, where there
                // is no last axis to normalize over.
                let x = lookup(&env, node.inputs[0])?;
                let dy = lookup(&env, node.inputs[1])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                // `len() - 1` would underflow on an empty shape.
                if x_shape.is_empty() {
                    return Err(MlxError(
                        "LayerNormBackwardGamma on MLX: x must have rank >= 1".into(),
                    ));
                }
                let last = (x_shape.len() - 1) as i32;
                let dtype = node.shape.dtype();
                let eps_arr = Array::from_f32_slice(&[*eps], &[1], dtype)?;

                // Recompute x̂ = (x − μ) · rsqrt(σ² + eps) over the last axis.
                let mean = ops::reduce(x, MlxReduce::Mean, &[last], true)?;
                let diff = ops::sub(x, &mean)?;
                let diff_sq = ops::mul(&diff, &diff)?;
                let var = ops::reduce(&diff_sq, MlxReduce::Mean, &[last], true)?;
                let var_eps = ops::add(&var, &eps_arr)?;
                let inv_std = ops::unary(&var_eps, MlxUnary::Rsqrt)?;
                let xhat = ops::mul(&diff, &inv_std)?;
                let prod = ops::mul(dy, &xhat)?;

                if last == 0 {
                    // Rank-1 input: there are no outer axes to sum over.
                    prod
                } else {
                    // Sum over every axis except the last.
                    let reduce_axes: Vec<i32> = (0..last).collect();
                    let summed = ops::reduce(
                        &prod,
                        MlxReduce::Sum,
                        &reduce_axes,
                        /*keep_dim=*/ false,
                    )?;
                    // Reshape only when the reduced shape differs from the
                    // shape the IR node declares for dgamma.
                    let want: Vec<i32> = node
                        .shape
                        .dims()
                        .iter()
                        .map(|d| d.unwrap_static() as i32)
                        .collect();
                    let got = summed.shape()?;
                    let got_i32: Vec<i32> = got.iter().map(|&d| d as i32).collect();
                    if got_i32 == want {
                        summed
                    } else {
                        ops::reshape(&summed, &want)?
                    }
                }
            }

            Op::AttentionBackward {
                num_heads,
                head_dim,
                mask_kind,
                wrt,
            } => {
                // Attention gradient. Inputs 0..=3 are q, k, v and the
                // upstream gradient dy; input 4 (mask / bias) is only read
                // for `MaskKind::Custom` and `MaskKind::Bias`. The gradient
                // itself is computed by `attention_backward_bhsd`, which is
                // handed `wrt` (presumably the operand to differentiate
                // against — confirm against that helper).
                let q_in = lookup(&env, node.inputs[0])?;
                let k_in = lookup(&env, node.inputs[1])?;
                let v_in = lookup(&env, node.inputs[2])?;
                let dy_in = lookup(&env, node.inputs[3])?;
                let q_shape = node_input_shape(graph, node.inputs[0]);
                let k_shape = node_input_shape(graph, node.inputs[1]);
                let nh = *num_heads as i32;
                let hd = *head_dim as i32;
                // Rank-3 q means heads are packed in the last axis, so the
                // result has to be folded back to rank 3 at the end.
                let need_split = q_shape.len() == 3;
                // Rank-4 operands pass through untouched; anything else is
                // treated as [B, S, nh·hd] and split into [B, nh, S, hd].
                let to_bhsd = |t: &Array, sh: &[i32]| -> Result<Array, MlxError> {
                    if sh.len() == 4 {
                        return t.clone_handle();
                    }
                    let b = sh[0];
                    let s = sh[1];
                    let r = ops::reshape(t, &[b, s, nh, hd])?;
                    ops::transpose(&r, &[0, 2, 1, 3])
                };
                let q = to_bhsd(q_in, &q_shape)?;
                let k = to_bhsd(k_in, &k_shape)?;
                let v = to_bhsd(v_in, &node_input_shape(graph, node.inputs[2]))?;
                let dy = to_bhsd(dy_in, &node_input_shape(graph, node.inputs[3]))?;
                let q_dtype = graph.node(node.inputs[0]).shape.dtype();
                // Lift a mask to rank 4 by inserting singleton axes:
                //   [a, b]    → [a, 1, 1, b]
                //   [a, b, c] → [a, 1, b, c]
                // Any other rank is passed through unchanged.
                let normalize_mask = |m: &Array, m_shape: &[i32]| -> Result<Array, MlxError> {
                    match m_shape.len() {
                        2 => ops::reshape(m, &[m_shape[0], 1, 1, m_shape[1]]),
                        3 => ops::reshape(m, &[m_shape[0], 1, m_shape[1], m_shape[2]]),
                        _ => m.clone_handle(),
                    }
                };
                let (mask_additive, window) = match mask_kind {
                    MaskKind::Custom => {
                        // Turn a 0/1 mask into an additive one:
                        // (m − 1) · 1e9 is 0 where m == 1 and −1e9 where
                        // m == 0.
                        // NOTE(review): the cast is skipped when q is f32,
                        // which assumes the mask arrives as f32 — confirm.
                        // NOTE(review): 1.0e9 exceeds the f16 range
                        // (max ≈ 65504); if q_dtype can be f16, confirm the
                        // constant does not become inf (0 · inf would give
                        // NaN at unmasked positions).
                        let m = lookup(&env, node.inputs[4])?;
                        let m_shape = node_input_shape(graph, node.inputs[4]);
                        let one = Array::from_f32_slice(&[1.0], &[1], q_dtype)?;
                        let scl = Array::from_f32_slice(&[1.0e9], &[1], q_dtype)?;
                        let m_cast = if q_dtype != DType::F32 {
                            ops::cast(m, q_dtype)?
                        } else {
                            m.clone_handle()?
                        };
                        let shifted = ops::sub(&m_cast, &one)?;
                        let additive = ops::mul(&shifted, &scl)?;
                        (Some(normalize_mask(&additive, &m_shape)?), 0usize)
                    }
                    MaskKind::Bias => {
                        // The bias is already additive: only cast to q's
                        // dtype and lift to rank 4.
                        let m = lookup(&env, node.inputs[4])?;
                        let m_shape = node_input_shape(graph, node.inputs[4]);
                        let m_cast = if q_dtype != DType::F32 {
                            ops::cast(m, q_dtype)?
                        } else {
                            m.clone_handle()?
                        };
                        (Some(normalize_mask(&m_cast, &m_shape)?), 0usize)
                    }
                    MaskKind::SlidingWindow(w) => (None, *w),
                    // Remaining kinds carry no mask tensor and no window.
                    _ => (None, 0usize),
                };
                let mask_ref = mask_additive.as_ref();
                let grad = crate::attention_bwd::attention_backward_bhsd(
                    *wrt, &q, &k, &v, &dy, hd, *mask_kind, mask_ref, window,
                )?;
                if need_split {
                    // Fold heads back: [B, nh, S, hd] → [B, S, nh, hd]
                    // → [B, S, nh·hd]. B and S are read from the gradient
                    // itself rather than from q's shape, so a k/v gradient
                    // whose sequence length differs from q's is reshaped
                    // with its own dimensions.
                    let bsd = ops::transpose(&grad, &[0, 2, 1, 3])?;
                    let bsd_shape = bsd.shape()?;
                    let b = bsd_shape[0] as i32;
                    let s = bsd_shape[1] as i32;
                    ops::reshape(&bsd, &[b, s, nh * hd])?
                } else {
                    grad
                }
            }

            Op::RmsNormBackwardInput { eps, axis: _ } => {
                let x = lookup(&env, node.inputs[0])?;
                let gamma = lookup(&env, node.inputs[1])?;
                let _beta = lookup(&env, node.inputs[2])?;
                let dy = lookup(&env, node.inputs[3])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                let last = (x_shape.len() - 1) as i32;
                let dtype = node.shape.dtype();
                let eps_arr = Array::from_f32_slice(&[*eps], &[1], dtype)?;

                let x_sq = ops::mul(x, x)?;
                let mean_sq = ops::reduce(&x_sq, MlxReduce::Mean, &[last], true)?;
                let var_eps = ops::add(&mean_sq, &eps_arr)?;
                let inv_r = ops::unary(&var_eps, MlxUnary::Rsqrt)?;
                let inv_r3 = ops::mul(&inv_r, &ops::mul(&inv_r, &inv_r)?)?;
                let dy_g = ops::mul(dy, gamma)?;
                let dy_gx = ops::mul(&dy_g, x)?;
                let dot = ops::reduce(&dy_gx, MlxReduce::Mean, &[last], true)?;
                let x_dot = ops::mul(x, &dot)?;
                let term = ops::sub(&dy_g, &ops::mul(&x_dot, &inv_r3)?)?;
                ops::mul(&inv_r, &term)?
            }

            Op::RmsNormBackwardGamma { eps, axis: _ } => {
                let x = lookup(&env, node.inputs[0])?;
                let _gamma = lookup(&env, node.inputs[1])?;
                let _beta = lookup(&env, node.inputs[2])?;
                let dy = lookup(&env, node.inputs[3])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                let last = (x_shape.len() - 1) as i32;
                let dtype = node.shape.dtype();
                let eps_arr = Array::from_f32_slice(&[*eps], &[1], dtype)?;

                let x_sq = ops::mul(x, x)?;
                let mean_sq = ops::reduce(&x_sq, MlxReduce::Mean, &[last], true)?;
                let var_eps = ops::add(&mean_sq, &eps_arr)?;
                let inv_r = ops::unary(&var_eps, MlxUnary::Rsqrt)?;
                let prod = ops::mul(dy, &ops::mul(x, &inv_r)?)?;

                if last == 0 {
                    prod
                } else {
                    let reduce_axes: Vec<i32> = (0..last).collect();
                    let summed = ops::reduce(
                        &prod,
                        MlxReduce::Sum,
                        &reduce_axes,
                        /*keep_dim=*/ false,
                    )?;
                    let want: Vec<i32> = node
                        .shape
                        .dims()
                        .iter()
                        .map(|d| d.unwrap_static() as i32)
                        .collect();
                    let got = summed.shape()?;
                    let got_i32: Vec<i32> = got.iter().map(|&d| d as i32).collect();
                    if got_i32 == want {
                        summed
                    } else {
                        ops::reshape(&summed, &want)?
                    }
                }
            }

            Op::RmsNormBackwardBeta { axis: _, .. } => {
                let dy = lookup(&env, node.inputs[3])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                let last = (x_shape.len() - 1) as i32;
                if last == 0 {
                    dy.clone_handle()?
                } else {
                    let reduce_axes: Vec<i32> = (0..last).collect();
                    let summed =
                        ops::reduce(dy, MlxReduce::Sum, &reduce_axes, /*keep_dim=*/ false)?;
                    let want: Vec<i32> = node
                        .shape
                        .dims()
                        .iter()
                        .map(|d| d.unwrap_static() as i32)
                        .collect();
                    let got = summed.shape()?;
                    let got_i32: Vec<i32> = got.iter().map(|&d| d as i32).collect();
                    if got_i32 == want {
                        summed
                    } else {
                        ops::reshape(&summed, &want)?
                    }
                }
            }

            Op::GroupNormBackwardInput { num_groups, eps } => {
                let x = lookup(&env, node.inputs[0])?;
                let gamma = lookup(&env, node.inputs[1])?;
                let dy = lookup(&env, node.inputs[3])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                let dtype = node.shape.dtype();
                let n = x_shape[0];
                let c = x_shape[1];
                let h = x_shape[2];
                let w = x_shape[3];
                let g = *num_groups as i32;
                let cpg = c / g;
                let inner = cpg * h * w;
                let x5 = ops::reshape(x, &[n, g, cpg, h, w])?;
                let dy5 = ops::reshape(dy, &[n, g, cpg, h, w])?;
                let x3 = ops::reshape(&x5, &[n, g, inner])?;
                let dy3 = ops::reshape(&dy5, &[n, g, inner])?;
                let gamma_g = ops::reshape(gamma, &[1, g, cpg, 1])?;
                let gamma_b = ops::broadcast_to(&gamma_g, &[n, g, cpg, h * w])?;
                let gamma_flat = ops::reshape(&gamma_b, &[n, g, inner])?;
                let eps_arr = Array::from_f32_slice(&[*eps], &[1], dtype)?;
                let mean = ops::reduce(&x3, MlxReduce::Mean, &[2], true)?;
                let x_c = ops::sub(&x3, &mean)?;
                let x_sq = ops::mul(&x_c, &x_c)?;
                let var = ops::reduce(&x_sq, MlxReduce::Mean, &[2], true)?;
                let var_eps = ops::add(&var, &eps_arr)?;
                let inv_std = ops::unary(&var_eps, MlxUnary::Rsqrt)?;
                let x_hat = ops::mul(&x_c, &inv_std)?;
                let dy_g = ops::mul(&dy3, &gamma_flat)?;
                let m_sy = ops::reduce(&dy_g, MlxReduce::Mean, &[2], true)?;
                let dy_gxh = ops::mul(&dy_g, &x_hat)?;
                let m_sxh = ops::reduce(&dy_gxh, MlxReduce::Mean, &[2], true)?;
                let term = ops::sub(&dy_g, &ops::add(&m_sy, &ops::mul(&x_hat, &m_sxh)?)?)?;
                let dx3 = ops::mul(&inv_std, &term)?;
                let dx5 = ops::reshape(&dx3, &[n, g, cpg, h, w])?;
                ops::reshape(&dx5, &[n, c, h, w])?
            }

            Op::GroupNormBackwardGamma { num_groups, eps } => {
                let x = lookup(&env, node.inputs[0])?;
                let dy = lookup(&env, node.inputs[1])?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                let n = x_shape[0];
                let c = x_shape[1];
                let h = x_shape[2];
                let w = x_shape[3];
                let g = *num_groups as i32;
                let cpg = c / g;
                let inner = cpg * h * w;
                let dtype = node.shape.dtype();
                let eps_arr = Array::from_f32_slice(&[*eps], &[1], dtype)?;
                let x5 = ops::reshape(x, &[n, g, cpg, h, w])?;
                let x3 = ops::reshape(&x5, &[n, g, inner])?;
                let x_sq = ops::mul(&x3, &x3)?;
                let mean_sq = ops::reduce(&x_sq, MlxReduce::Mean, &[2], true)?;
                let mean = ops::reduce(&x3, MlxReduce::Mean, &[2], true)?;
                let mean_sq2 = ops::mul(&mean, &mean)?;
                let var = ops::sub(&mean_sq, &mean_sq2)?;
                let var_eps = ops::add(&var, &eps_arr)?;
                let inv_std = ops::unary(&var_eps, MlxUnary::Rsqrt)?;
                let x_hat3 = ops::mul(&ops::sub(&x3, &mean)?, &inv_std)?;
                let x_hat = ops::reshape(&x_hat3, &[n, c, h, w])?;
                let prod = ops::mul(dy, &x_hat)?;
                let summed = ops::reduce(&prod, MlxReduce::Sum, &[0, 2, 3], false)?;
                let want: Vec<i32> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static() as i32)
                    .collect();
                let got = summed.shape()?;
                let got_i32: Vec<i32> = got.iter().map(|&d| d as i32).collect();
                if got_i32 == want {
                    summed
                } else {
                    ops::reshape(&summed, &want)?
                }
            }

            Op::GroupNormBackwardBeta {
                num_groups: _,
                eps: _,
            } => {
                let dy = lookup(&env, node.inputs[1])?;
                let summed = ops::reduce(dy, MlxReduce::Sum, &[0, 2, 3], false)?;
                let want: Vec<i32> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static() as i32)
                    .collect();
                let got = summed.shape()?;
                let got_i32: Vec<i32> = got.iter().map(|&d| d as i32).collect();
                if got_i32 == want {
                    summed
                } else {
                    ops::reshape(&summed, &want)?
                }
            }

            Op::CumsumBackward { axis, exclusive } => {
                let dy = lookup(&env, node.inputs[0])?;
                let axis_pos = if *axis < 0 {
                    node_input_shape(graph, node.inputs[0]).len() as i32 + *axis
                } else {
                    *axis
                };
                let total = ops::reduce(dy, MlxReduce::Sum, &[axis_pos], true)?;
                if *exclusive {
                    let inc = ops::cumsum(dy, axis_pos, false)?;
                    ops::sub(&total, &inc)?
                } else {
                    let pref = ops::cumsum(dy, axis_pos, true)?;
                    ops::sub(&total, &pref)?
                }
            }

            Op::GatherBackward { axis } => {
                let dy = lookup(&env, node.inputs[0])?;
                let indices_in = lookup(&env, node.inputs[1])?.clone_handle()?;
                let out_shape: Vec<i32> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static() as i32)
                    .collect();
                let axis_pos = if *axis < 0 {
                    out_shape.len() as i32 + *axis
                } else {
                    *axis
                };
                let dy_shape = node_input_shape(graph, node.inputs[0]);
                let idx_shape = node_input_shape(graph, node.inputs[1]);
                let n_elem: usize = out_shape.iter().product::<i32>() as usize;
                let zeros = vec![0.0_f32; n_elem];
                let out_shape_usize: Vec<usize> = out_shape.iter().map(|d| *d as usize).collect();
                let zero_target =
                    crate::array::Array::from_f32_slice(&zeros, &out_shape_usize, DType::F32)?;
                let indices = if dy_shape.len() > 1 && idx_shape.len() == 1 {
                    ops::reshape(&indices_in, &[idx_shape[0], 1])?
                } else {
                    indices_in
                };
                ops::scatter_add_axis(&zero_target, &indices, dy, axis_pos)?
            }

            Op::RopeBackward { head_dim, n_rot } => {
                // Backward = forward rotation with negated sin (NeoX).
                let dy = lookup(&env, node.inputs[0])?;
                let cos = lookup(&env, node.inputs[1])?;
                let sin = lookup(&env, node.inputs[2])?;
                let neg_one = Array::from_f32_slice(&[-1.0], &[1], node.shape.dtype())?;
                let sin_neg = ops::mul(sin, &neg_one)?;
                let x_shape = node_input_shape(graph, node.inputs[0]);
                let n = x_shape.len();
                let hd = *head_dim as i32;
                let nr = *n_rot as i32;
                let rot_half = nr / 2;
                if n < 2 {
                    return Err(MlxError("RopeBackward: dy must be rank ≥ 2".into()));
                }
                let rotate = |x_rot: &Array,
                              rot_shape: &[i32],
                              seq_axis: usize,
                              pairs: i32|
                 -> Result<Array, MlxError> {
                    let rn = rot_shape.len();
                    let seq_v = rot_shape[seq_axis];
                    let cos_seq = ops::slice(cos, &[0, 0], &[seq_v, pairs])?;
                    let sin_seq = ops::slice(&sin_neg, &[0, 0], &[seq_v, pairs])?;
                    let mut bshape = vec![1i32; rn];
                    bshape[seq_axis] = seq_v;
                    bshape[rn - 1] = pairs;
                    let cos_b = ops::reshape(&cos_seq, &bshape)?;
                    let sin_b = ops::reshape(&sin_seq, &bshape)?;
                    let mut x1_stop = rot_shape.to_vec();
                    x1_stop[rn - 1] = pairs;
                    let x1 = ops::slice(x_rot, &vec![0i32; rn], &x1_stop)?;
                    let mut x2_start = vec![0i32; rn];
                    x2_start[rn - 1] = pairs;
                    let x2 = ops::slice(x_rot, &x2_start, rot_shape)?;
                    let x1_cos = ops::mul(&x1, &cos_b)?;
                    let x2_sin = ops::mul(&x2, &sin_b)?;
                    let x2_cos = ops::mul(&x2, &cos_b)?;
                    let x1_sin = ops::mul(&x1, &sin_b)?;
                    let y1 = ops::sub(&x1_cos, &x2_sin)?;
                    let y2 = ops::add(&x2_cos, &x1_sin)?;
                    ops::concat(&[&y1, &y2], (rn - 1) as i32)
                };
                let last = *x_shape.last().unwrap();
                if last < nr {
                    return Err(MlxError(format!(
                        "RopeBackward: last dim {last} < n_rot {n_rot}"
                    )));
                }
                let mut rot_stop = x_shape.clone();
                rot_stop[n - 1] = nr.min(hd);
                let rot = ops::slice(dy, &vec![0i32; n], &rot_stop)?;
                let rotated = rotate(&rot, &rot_stop, n - 2, rot_half)?;
                if last == nr.min(hd) {
                    rotated
                } else {
                    let mut tail_start = vec![0i32; n];
                    tail_start[n - 1] = nr.min(hd);
                    let tail = ops::slice(dy, &tail_start, &x_shape)?;
                    ops::concat(&[&rotated, &tail], (n - 1) as i32)?
                }
            }

            Op::GaussianSplatRender {
                width,
                height,
                tile_size,
                radius_scale,
                alpha_cutoff,
                max_splat_steps,
                transmittance_threshold,
                max_list_entries,
            } => {
                let positions = lookup(&env, node.inputs[0])?.to_f32()?;
                let scales = lookup(&env, node.inputs[1])?.to_f32()?;
                let rotations = lookup(&env, node.inputs[2])?.to_f32()?;
                let opacities = lookup(&env, node.inputs[3])?.to_f32()?;
                let colors = lookup(&env, node.inputs[4])?.to_f32()?;
                let sh_coeffs = lookup(&env, node.inputs[5])?.to_f32()?;
                let meta = lookup(&env, node.inputs[6])?.to_f32()?;
                let out_host = crate::splat::render_host_slices(
                    &positions,
                    &scales,
                    &rotations,
                    &opacities,
                    &colors,
                    &sh_coeffs,
                    &meta,
                    *width,
                    *height,
                    *tile_size,
                    *radius_scale,
                    *alpha_cutoff,
                    *max_splat_steps,
                    *transmittance_threshold,
                    *max_list_entries,
                );
                let out_shape: Vec<usize> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static())
                    .collect();
                Array::from_f32_slice(&out_host, &out_shape, DType::F32)?
            }

            Op::GaussianSplatRenderBackward {
                width,
                height,
                tile_size,
                radius_scale,
                alpha_cutoff,
                max_splat_steps,
                transmittance_threshold,
                max_list_entries,
                loss_grad_clip,
                sh_band,
                max_anisotropy,
            } => {
                let positions = lookup(&env, node.inputs[0])?.to_f32()?;
                let scales = lookup(&env, node.inputs[1])?.to_f32()?;
                let rotations = lookup(&env, node.inputs[2])?.to_f32()?;
                let opacities = lookup(&env, node.inputs[3])?.to_f32()?;
                let colors = lookup(&env, node.inputs[4])?.to_f32()?;
                let sh_coeffs = lookup(&env, node.inputs[5])?.to_f32()?;
                let meta = lookup(&env, node.inputs[6])?.to_f32()?;
                let d_loss = lookup(&env, node.inputs[7])?.to_f32()?;
                let packed = crate::splat::backward_host_slices(
                    &positions,
                    &scales,
                    &rotations,
                    &opacities,
                    &colors,
                    &sh_coeffs,
                    &meta,
                    &d_loss,
                    *width,
                    *height,
                    *tile_size,
                    *radius_scale,
                    *alpha_cutoff,
                    *max_splat_steps,
                    *transmittance_threshold,
                    *max_list_entries,
                    *loss_grad_clip,
                    *sh_band,
                    *max_anisotropy,
                );
                let out_shape: Vec<usize> = node
                    .shape
                    .dims()
                    .iter()
                    .map(|d| d.unwrap_static())
                    .collect();
                Array::from_f32_slice(&packed, &out_shape, DType::F32)?
            }

            Op::Custom { name, attrs, .. } => {
                // Dispatch through the registered MlxKernel. Each
                // input is looked up as an MLX Array (already
                // computed by earlier iterations); the kernel
                // produces a fresh Array for this node, which feeds
                // any consumers downstream. The kernel is free to
                // compose existing MLX `Array` ops (staying in the
                // lazy graph for `mlx::compile`'s benefit) or to
                // call into `mlx::fast::metal_kernel` for raw MSL.
                let kernel = crate::op_registry::lookup_mlx_kernel(name).ok_or_else(|| {
                    MlxError(format!(
                        "rlx-mlx: no MlxKernel registered for \
                         Op::Custom('{name}'). Either register one \
                         via rlx_mlx::op_registry::register_mlx_kernel \
                         or pin this graph to Device::Cpu."
                    ))
                })?;
                let in_refs: Vec<&Array> = node
                    .inputs
                    .iter()
                    .map(|&in_id| lookup(&env, in_id))
                    .collect::<Result<Vec<_>, _>>()?;
                kernel.execute(&in_refs, &node.shape, attrs)?
            }

            // Identity-forward op used by the GRL (Gradient Reverse Layer)
            // in adversarial training. Forward value matches the input; the
            // gradient pass treats it as a stop. MLX's compiled trace only
            // sees the forward, so we lower it to a no-op clone.
            Op::StopGradient => {
                let x = lookup(&env, node.inputs[0])?;
                x.clone_handle()?
            }

            other => {
                return unsupported(format!("{other:?}"));
            }
        };

        env.insert(id, arr);
        if debug_eval {
            let label = node
                .name
                .as_deref()
                .map(|n| format!("{n} ({id:?})"))
                .unwrap_or_else(|| format!("{id:?}"));
            if let Some(a) = env.get(&id) {
                eval(&[a]).map_err(|e| MlxError(format!("eval at {label}: {e}")))?;
            }
        }
    }

    // Look outputs up by reference — `graph.outputs` may legitimately
    // contain duplicate NodeIds (e.g. when a vmap'd graph has the same
    // tangent output reused across multiple slots), so removing on
    // first hit would break the second occurrence with a phantom
    // "not lowered" error. The Array clones here are MLX handle
    // clones (Arc-like), not data copies.
    let mut outs = Vec::with_capacity(graph.outputs.len());
    for &out_id in &graph.outputs {
        let arr = env
            .get(&out_id)
            .ok_or_else(|| MlxError(format!("output node {out_id:?} was not lowered")))?
            .clone_handle()?;
        outs.push(arr);
    }
    Ok(outs)
}

/// Build the MLX graph and return the array handles for the graph's
/// declared outputs (in `graph.outputs` order).
///
/// Host-data variant: leaves are constructed from f32 input/param
/// buffers. The compile path uses [`lower_with_env`] directly with a
/// pre-built leaf binding instead.
pub fn lower_and_run(
    graph: &Graph,
    params: &HashMap<String, Vec<f32>>,
    inputs: &HashMap<String, Vec<f32>>,
    mode: MlxMode,
) -> Result<Vec<Array>, MlxError> {
    // PLAN L3: coarse Perfetto span around the whole MLX lower+eval
    // pass. MLX is lazy (graph build → eval); per-node spans would
    // measure build time, not GPU compute. One span per run() is the
    // honest cross-backend marker for an MLX execution.
    let _perf = rlx_ir::perfetto::TraceSpan::new("lower_and_run", "mlx");
    lower_and_run_typed(
        graph,
        params,
        &HashMap::new(),
        inputs,
        &HashMap::new(),
        mode,
    )
}

/// Typed-buffer counterpart of `lower_and_run`.
///
/// In addition to the f32 maps it takes parallel typed maps. A name
/// present in `params_typed` / `inputs_typed` has its raw bytes bound
/// directly via `Array::from_bytes`, skipping the f32 round-trip.
/// Callers that only have f32 data reach this function through
/// `lower_and_run`, which passes empty typed maps.
///
/// Dynamic shapes (`Dim::Dynamic`) are supported on this path: the
/// delegate [`lower_and_run_typed_with_extent`] infers symbol→size
/// bindings from the actual data length of each Input, rebuilds the
/// graph with bound shapes, and lowers the concretized version. MLX's
/// per-shape trace caching keeps repeated calls at a given shape
/// efficient.
///
/// # Errors
///
/// Propagates any [`MlxError`] returned by
/// [`lower_and_run_typed_with_extent`].
pub fn lower_and_run_typed(
    graph: &Graph,
    params: &HashMap<String, Vec<f32>>,
    params_typed: &HashMap<String, (Vec<u8>, DType)>,
    inputs: &HashMap<String, Vec<f32>>,
    inputs_typed: &HashMap<String, (Vec<u8>, DType)>,
    mode: MlxMode,
) -> Result<Vec<Array>, MlxError> {
    // Defaults for the knobs this entry point does not expose: no
    // active-extent hint, no `gpu_inputs` map, default RNG options.
    let active_extent: Option<(usize, usize)> = None;
    let gpu_inputs: Option<&HashMap<String, Array>> = None;
    let rng = rlx_ir::RngOptions::default();

    lower_and_run_typed_with_extent(
        graph,
        params,
        params_typed,
        inputs,
        inputs_typed,
        mode,
        active_extent,
        gpu_inputs,
        rng,
    )
}

/// Variant of [`lower_and_run_typed`] honoring a PLAN L1 active-extent
/// hint (`Some((actual, upper))`). When set AND the graph passes
/// [`is_safe_for_active_extent`], every input leaf whose outer dim
/// equals `upper` is sliced along axis 0 to `actual` before
/// composition. MLX's lazy eval propagates the smaller shapes through
/// the rest of the trace, so most ops just produce smaller outputs
/// naturally — no per-op kernel scaling needed. Falls back to the full
/// extent when the hint is `None`, when `actual >= upper`, or when the
/// graph contains an unsafe op.
///
/// `gpu_inputs` and `rng` are forwarded unchanged to leaf construction
/// and to [`lower_with_env`] respectively.
///
/// Returns one array handle per entry of `graph.outputs`, in order.
/// How (and whether) the outputs have been evaluated on return depends
/// on `mode`; see the `match` at the end of the body.
///
/// # Errors
///
/// Returns an [`MlxError`] when dynamic-dim binding, leaf construction,
/// leaf expansion, slicing, lowering, or evaluation fails, or when an
/// `Input` leaf selected for active-extent slicing is absent from the
/// leaf environment.
pub fn lower_and_run_typed_with_extent(
    graph: &Graph,
    params: &HashMap<String, Vec<f32>>,
    params_typed: &HashMap<String, (Vec<u8>, DType)>,
    inputs: &HashMap<String, Vec<f32>>,
    inputs_typed: &HashMap<String, (Vec<u8>, DType)>,
    mode: MlxMode,
    active_extent: Option<(usize, usize)>,
    gpu_inputs: Option<&HashMap<String, Array>>,
    rng: rlx_ir::RngOptions,
) -> Result<Vec<Array>, MlxError> {
    // Resolve dynamic dims if any. The graph as-given may have
    // Dim::Dynamic entries in Input shapes (and propagated through
    // inferred internal shapes). We gather concrete bindings from the
    // supplied data and rebuild the graph with every shape bound.
    // `resolved_owner` is declared up front so the rebuilt graph
    // outlives the shadowing `graph` reference.
    let resolved_owner;
    let graph: &Graph = if has_dynamic_dims(graph) {
        let binding = collect_bindings(graph, inputs, inputs_typed)?;
        resolved_owner = resolve_graph(graph, &binding);
        &resolved_owner
    } else {
        graph
    };

    // Bind every leaf reported by `compile_leaf_order`, then let
    // `expand_leaf_env` post-process the leaf environment.
    let order = compile_leaf_order(graph);
    let mut env: HashMap<NodeId, Array> = HashMap::with_capacity(graph.nodes().len());
    for (id, _key) in &order {
        env.insert(
            *id,
            build_leaf_for(
                graph,
                *id,
                params,
                inputs,
                params_typed,
                inputs_typed,
                gpu_inputs,
            )?,
        );
    }
    env = expand_leaf_env(graph, env)?;

    // PLAN L1 active-extent: when hinted + safe, slice each Input leaf
    // along axis 0 from `upper` to `actual`. Only Input leaves get
    // sliced — Param/Constant tensors don't carry a batch dim that
    // matches the bucket axis. MLX's lazy graph propagates the smaller
    // shapes naturally through downstream element-wise / reduction-on-
    // inner / matmul ops.
    if let Some((actual, upper)) = active_extent
        && actual < upper
        && is_safe_for_active_extent(graph, upper)
    {
        for (id, _key) in &order {
            let node = graph.node(*id);
            if !matches!(node.op, Op::Input { .. }) {
                continue;
            }
            let dims = node.shape.dims();
            if dims.is_empty() {
                continue;
            }
            // Only leaves whose static outer dim is exactly the bucket
            // size take part; everything else is left untouched.
            let outer = match dims[0] {
                Dim::Static(d) => d,
                _ => continue,
            };
            if outer != upper {
                continue;
            }
            // Report a missing leaf as an error instead of panicking:
            // the env has been through `expand_leaf_env`, so the key's
            // presence is not guaranteed by anything in this function.
            let leaf = env.get(id).ok_or_else(|| {
                MlxError(format!("active-extent: input leaf {id:?} missing from leaf env"))
            })?;
            // Slice [0, actual) on axis 0 and the full range elsewhere.
            let start = vec![0i32; dims.len()];
            let mut stop: Vec<i32> = dims.iter().map(|d| d.unwrap_static() as i32).collect();
            stop[0] = actual as i32;
            let sliced = ops::slice(leaf, &start, &stop)?;
            env.insert(*id, sliced);
        }
    }

    // Eager mode wants per-op eval for debugging; the env-walker's
    // construction is pure (no eval), so we trigger it here against
    // outputs after lowering. For interleaved per-op eval we'd need
    // a separate walker variant — currently no caller asks for that.
    let outs = lower_with_env(graph, env, params, params_typed, rng)?;

    let refs: Vec<&Array> = outs.iter().collect();
    match mode {
        MlxMode::Eager => {
            // Eval outputs one at a time. Functionally equivalent to
            // per-op eval since outputs are dependency roots; only
            // the failure-localization aspect is weaker.
            for o in &outs {
                eval(&[o])?;
            }
        }
        MlxMode::Lazy => {
            // Outputs are evaluated individually here as well, so that
            // a failure can be reported with the output's index and
            // its node name (or its id when the node is unnamed).
            for (i, o) in refs.iter().enumerate() {
                let oid = graph.outputs.get(i).copied();
                let name = oid
                    .and_then(|id| graph.node(id).name.clone())
                    .unwrap_or_else(|| format!("{oid:?}"));
                eval(&[*o]).map_err(|e| MlxError(format!("eval output[{i}] {name}: {e}")))?;
            }
        }
        MlxMode::AsyncCommit => {
            // Hand all outputs to `async_eval` in one call.
            async_eval(&refs)?;
        }
        MlxMode::Compiled => {
            // Compiled mode shouldn't reach this code path —
            // backend.rs dispatches to run_compiled before calling
            // here. If we did get here it means the host-data path
            // was used, so just eval normally (correct, just misses
            // the trace-cache benefit).
            eval(&refs)?;
        }
    }

    Ok(outs)
}

/// PLAN L1 — true when the graph is safe for active-extent dispatch
/// at the given `upper` extent. Conservative: rejects ops that either
/// (a) hardcode the outer dim in their parameters
/// (`Op::Reshape { new_shape }` / `Op::Expand { target_shape }` / etc.
/// when those shapes mention `upper`), (b) operate along axis 0
/// (`Op::Reduce` / `Op::Cumsum` / `Op::Concat` / `Op::Narrow` with
/// axis 0; `Op::Transpose` whose perm permutes axis 0), or (c) have
/// outer-dim semantics that can't be honored by simply slicing the
/// input (`Op::Gather` / `Op::ScatterAdd` / `Op::Sample` / `Op::TopK`
/// / `Op::SelectiveScan` / `Op::GroupedMatMul` / `Op::Pool` /
/// `Op::Conv` / `Op::FusedTransformerLayer` / sub-graph control flow).
pub fn is_safe_for_active_extent(graph: &Graph, upper: usize) -> bool {
    let upper_i64 = upper as i64;
    for node in graph.nodes() {
        match &node.op {
            // Leaves & element-wise ops: always safe (slicing inputs
            // produces correctly-sized intermediates via lazy eval).
            Op::Input { .. } | Op::Param { .. } | Op::Constant { .. } => {}
            Op::Activation(_)
            | Op::Cast { .. }
            | Op::Binary(_)
            | Op::Compare(_)
            | Op::Where
            | Op::ElementwiseRegion { .. }
            | Op::BatchElementwiseRegion { .. }
            | Op::TransformRegion { .. } => {}
            // Per-row normalizations: operate on inner axes, batch is
            // pass-through. Safe.
            Op::Softmax { axis: _ }
            | Op::LayerNorm { .. }
            | Op::LayerNorm2d { .. }
            | Op::GroupNorm { .. }
            | Op::RmsNorm { .. }
            | Op::ResizeNearest2x => {}
            // Rope / Attention / matmul: batch in outer dim, computation
            // on inner axes. Safe by construction.
            Op::Rope { .. }
            | Op::Attention { .. }
            | Op::MatMul
            | Op::DotGeneral { .. }
            | Op::FusedMatMulBiasAct { .. }
            | Op::FusedSwiGLU { .. }
            | Op::FusedResidualLN { .. }
            | Op::FusedResidualRmsNorm { .. }
            | Op::FusedAttentionBlock { .. } => {}
            // DequantMatMul / LoraMatMul follow MatMul's batch-outer
            // contract.
            Op::DequantMatMul { .. } | Op::LoraMatMul { .. } => {}
            // Real INT8 ops: not lowered on MLX yet — train/quantize
            // on CPU, run inference there. Reject so the dispatch
            // surfaces a clear error.
            Op::QMatMul { .. } | Op::QConv2d { .. } => return false,
            // Reduce / Cumsum: safe iff the operation doesn't touch
            // axis 0.
            Op::Reduce { axes, .. } => {
                if axes.contains(&0) {
                    return false;
                }
            }
            Op::Cumsum { axis, .. } => {
                if *axis == 0 {
                    return false;
                }
            }
            // Concat: safe iff axis != 0 (concatenating along the batch
            // axis would mix batches across the slice boundary).
            Op::Concat { axis } => {
                if *axis == 0 {
                    return false;
                }
            }
            // Narrow on axis 0 changes the bucket itself — unsafe.
            Op::Narrow { axis, .. } => {
                if *axis == 0 {
                    return false;
                }
            }
            // Transpose is safe iff perm[0] == 0 (axis 0 stays put;
            // inner axes can permute freely).
            Op::Transpose { perm } => {
                if perm.first().copied() != Some(0) {
                    return false;
                }
            }
            // Reshape / Expand: reject if their target shape mentions
            // `upper` — that hardcoded dim won't survive the slice.
            Op::Reshape { new_shape } => {
                if new_shape.contains(&upper_i64) {
                    return false;
                }
            }
            Op::Expand { target_shape } => {
                if target_shape.contains(&upper_i64) {
                    return false;
                }
            }
            // Gather operates on axis 0 of its lookup table; the
            // batch contract isn't compatible with bucket slicing.
            Op::Gather { .. } => return false,
            // Conservatively unsafe — these have batch-touching
            // semantics (or sub-graph leaves) that the slice trick
            // doesn't handle.
            Op::ScatterAdd
            | Op::Sample { .. }
            | Op::RngNormal { .. }
            | Op::RngUniform { .. }
            | Op::TopK { .. }
            | Op::SelectiveScan { .. }
            | Op::GatedDeltaNet { .. }
            | Op::GroupedMatMul
            | Op::Pool { .. }
            | Op::Conv { .. }
            | Op::ConvTranspose2d { .. }
            | Op::FusedTransformerLayer { .. }
            | Op::DenseSolve
            | Op::Custom { .. }
            | Op::If { .. }
            | Op::While { .. } => return false,
            // Quantization: not lowered on MLX yet — train/quantize on
            // CPU, run inference on the dequantized fp32/fp16 path.
            Op::Quantize { .. }
            | Op::Dequantize { .. }
            | Op::FakeQuantize { .. }
            | Op::FakeQuantizeBackward { .. }
            | Op::FakeQuantizeLSQ { .. }
            | Op::FakeQuantizeLSQBackwardX { .. }
            | Op::FakeQuantizeLSQBackwardScale { .. } => return false,
            // Backward / training ops: active-extent dispatch is an
            // inference-only batch-bucketing optimization, so the safe
            // default for any training-graph node is `false` regardless
            // of whether MLX can lower it. Tier 1 (Relu/Activation/SCE/
            // LayerNorm/RmsNorm/Rope/Cumsum/Gather backward) DOES lower
            // on MLX — see `lower_with_env`.
            Op::ReluBackward
            | Op::ActivationBackward { .. }
            | Op::MaxPool2dBackward { .. }
            | Op::Conv2dBackwardInput { .. }
            | Op::Conv2dBackwardWeight { .. }
            | Op::SoftmaxCrossEntropyWithLogits
            | Op::SoftmaxCrossEntropyBackward
            | Op::LayerNormBackwardInput { .. }
            | Op::LayerNormBackwardGamma { .. }
            | Op::RmsNormBackwardInput { .. }
            | Op::RmsNormBackwardGamma { .. }
            | Op::RmsNormBackwardBeta { .. }
            | Op::RopeBackward { .. }
            | Op::CumsumBackward { .. }
            | Op::GatherBackward { .. }
            | Op::GroupNormBackwardInput { .. }
            | Op::GroupNormBackwardGamma { .. }
            | Op::GroupNormBackwardBeta { .. } => return false,
            Op::Scan { .. }
            | Op::ScanBackward { .. }
            | Op::ScanBackwardXs { .. }
            | Op::BatchedDenseSolve => return false,
            // CustomFn is opaque to active-extent analysis — the body
            // graph may have arbitrary internal structure. Fall back
            // to full extent for graphs that contain them. (Op::Custom
            // is already rejected in the conservatively-unsafe arm.)
            Op::CustomFn { .. } => return false,
            // FFT lowered natively via `mlx::fft::fft` FFI shim.
            Op::Fft { .. } => return true,
            // C64 ops are CPU-only today; pin to Device::Cpu.
            Op::ComplexNormSq | Op::ComplexNormSqBackward | Op::Conjugate => return false,
            _ => return false,
        }
    }
    true
}

/// Reports whether at least one node shape carries a non-static dim.
/// Stops at the first hit, so fully-static graphs can skip the
/// resolve step cheaply.
fn has_dynamic_dims(graph: &Graph) -> bool {
    for node in graph.nodes().iter() {
        for dim in node.shape.dims().iter() {
            if !dim.is_static() {
                return true;
            }
        }
    }
    false
}

/// Derive a concrete size for every `Dim::Dynamic` symbol from the
/// data supplied for the graph's `Op::Input` nodes.
///
/// For each input the element count is taken from `inputs_typed`
/// (byte length / dtype size) when present, otherwise from `inputs`;
/// inputs with no data are skipped so the leaf-build step can report
/// the missing input itself. An input with exactly one dynamic dim
/// yields `nelems / product(static dims)` for that symbol.
///
/// # Errors
/// - typed byte length not a multiple of the dtype size (or size 0),
/// - static dim product overflows `usize` or is zero,
/// - more than one dynamic dim on a single input (one element count
///   cannot be split across several unknowns),
/// - element count not divisible by the static product,
/// - two inputs imply different sizes for the same symbol.
fn collect_bindings(
    graph: &Graph,
    inputs: &HashMap<String, Vec<f32>>,
    inputs_typed: &HashMap<String, (Vec<u8>, DType)>,
) -> Result<DimBinding, MlxError> {
    let mut resolved = DimBinding::new();
    for node in graph.nodes() {
        let Op::Input { name } = &node.op else {
            continue;
        };

        // How many elements were actually supplied (typed data wins).
        let supplied = match (inputs_typed.get(name), inputs.get(name)) {
            (Some((bytes, dt)), _) => {
                let width = dt.size_bytes();
                if width == 0 || bytes.len() % width != 0 {
                    return Err(MlxError(format!(
                        "Input '{name}': typed bytes len {} not aligned to dtype size",
                        bytes.len()
                    )));
                }
                bytes.len() / width
            }
            (None, Some(data)) => data.len(),
            // Nothing supplied yet: leave the diagnostic to the
            // leaf-build step.
            (None, None) => continue,
        };

        // Multiply up the static dims and remember the single dynamic
        // symbol, bailing out on a second one.
        let mut fixed: usize = 1;
        let mut symbol: Option<u32> = None;
        for dim in node.shape.dims().iter() {
            match dim {
                Dim::Static(extent) => {
                    fixed = match fixed.checked_mul(*extent) {
                        Some(product) => product,
                        None => {
                            return Err(MlxError(format!(
                                "Input '{name}': static dim product overflow"
                            )));
                        }
                    };
                }
                Dim::Dynamic(sym) if symbol.is_none() => symbol = Some(*sym),
                Dim::Dynamic(_) => {
                    return Err(MlxError(format!(
                        "Input '{name}' has multiple dynamic dims; \
                         explicit DimBinding required"
                    )));
                }
            }
        }

        // Fully static input: nothing to infer.
        let Some(sym) = symbol else {
            continue;
        };
        if fixed == 0 {
            return Err(MlxError(format!(
                "Input '{name}': can't infer dynamic dim against zero \
                 static product"
            )));
        }
        if supplied % fixed != 0 {
            return Err(MlxError(format!(
                "Input '{name}': nelems {n_elems} not divisible by \
                 static dim product {static_prod}",
                n_elems = supplied,
                static_prod = fixed
            )));
        }

        let inferred = supplied / fixed;
        match resolved.get(sym) {
            Some(prev) if prev != inferred => {
                return Err(MlxError(format!(
                    "Dynamic dim ?{sym}: inconsistent values across \
                     inputs ({prev} vs {dim_size})",
                    dim_size = inferred
                )));
            }
            // Already bound to the same size.
            Some(_) => {}
            None => {
                resolved.set(sym, inferred);
            }
        }
    }
    Ok(resolved)
}

/// Produce a copy of `graph` in which every node shape has been bound
/// against `binding`. Nodes are re-added through `Graph::add_node` in
/// their original order, which is what keeps the NodeIds lined up with
/// the source graph; the output list is carried over unchanged.
fn resolve_graph(graph: &Graph, binding: &DimBinding) -> Graph {
    let mut resolved = Graph::new(&graph.name);
    graph.nodes().iter().for_each(|node| {
        let concrete: Shape = node.shape.bind(binding);
        // Same declaration order in → same NodeIds out.
        resolved.add_node(node.op.clone(), node.inputs.clone(), concrete);
    });
    resolved.set_outputs(graph.outputs.clone());
    resolved
}

/// Host-side construction of the additive `[seq_q, seq_k]` mask used
/// for sliding-window attention. An entry is 0 when the key position
/// is at or before the query position and no further back than
/// `window`; every other entry is -inf. Kept in f32 because the mask
/// is added to the pre-softmax scores by MLX SDPA.
fn build_sliding_window_mask(s_q: i32, s_k: i32, window: i32) -> Result<Array, MlxError> {
    let rows = s_q as usize;
    let cols = s_k as usize;
    let span = window as i64;

    // Start fully masked, then open the causal band in one flat pass.
    let mut mask = vec![f32::NEG_INFINITY; rows * cols];
    for (flat, slot) in mask.iter_mut().enumerate() {
        // `cols` is non-zero whenever the buffer has any element.
        let q = (flat / cols) as i64;
        let k = (flat % cols) as i64;
        let causal = k <= q;
        if causal && (q - k) <= span {
            *slot = 0.0;
        }
    }
    Array::from_f32_slice(&mask, &[rows, cols], DType::F32)
}

/// Translate a block-quantization scheme into the `(bits, group_size)`
/// pair for MLX quantized matmul. Only the Int8 / Int8-asymmetric /
/// Int4 block schemes are accepted; anything else is an error.
fn quant_scheme_to_mlx(scheme: &rlx_ir::QuantScheme) -> Result<(i32, i32), MlxError> {
    use rlx_ir::QuantScheme as Q;
    let bits = scheme.bits_per_element() as i32;
    let group_size = match scheme {
        Q::Int8Block { block_size } => Some(*block_size as i32),
        Q::Int8BlockAsym { block_size } => Some(*block_size as i32),
        Q::Int4Block { block_size } => Some(*block_size as i32),
        _ => None,
    };
    match group_size {
        Some(gs) => Ok((bits, gs)),
        None => Err(MlxError(format!(
            "MLX quantized_matmul: unsupported scheme {other:?}",
            other = scheme
        ))),
    }
}

// ── GGUF dequant cache ──────────────────────────────────────────────────
//
// Each generate() call drives a fresh lower_and_run_typed traversal, so
// the Op::DequantMatMul branch above would otherwise re-dequant the full
// Q4K weight tensor on every dispatch. We key the cache by Param name —
// stable across compilations of the same model — and store the already-
// transposed `[k, n]` Array so the matmul gets MLX's tuned f32 matmul
// without per-call setup.
//
// The cache lives for the process lifetime; Arrays are reference-counted
// (cloning is free). Off-switch: `RLX_MLX_DEQUANT_CACHE_DISABLE=1`.
use std::sync::Mutex;
use std::sync::OnceLock;

/// True only when `RLX_MLX_DEQUANT_CACHE_DISABLE` is set to exactly
/// `"1"`; unset, unreadable, or any other value leaves the cache on.
fn dequant_cache_disabled() -> bool {
    std::env::var("RLX_MLX_DEQUANT_CACHE_DISABLE").map_or(false, |flag| flag == "1")
}

/// Process-wide map from cache key to cached `Array`, created empty on
/// first access and shared behind a mutex thereafter.
fn dequant_cache() -> &'static Mutex<HashMap<String, Array>> {
    static STORE: OnceLock<Mutex<HashMap<String, Array>>> = OnceLock::new();
    STORE.get_or_init(Mutex::default)
}

/// Look up `key` in the dequant cache and hand back a cloned handle.
///
/// Returns `Ok(None)` when the cache is disabled, when the mutex is
/// poisoned, or when there is no entry. The only error surfaced is a
/// failure to clone the cached handle.
fn mlx_dequant_cache_get(key: &str) -> Result<Option<Array>, MlxError> {
    if dequant_cache_disabled() {
        return Ok(None);
    }
    // A poisoned lock is treated as a cache miss, not a failure.
    let Ok(entries) = dequant_cache().lock() else {
        return Ok(None);
    };
    entries
        .get(key)
        .map(|cached| cached.clone_handle())
        .transpose()
}

/// Store `arr` under `key`, replacing any previous entry. Best-effort:
/// does nothing when the cache is disabled or its mutex is poisoned.
fn mlx_dequant_cache_put(key: String, arr: Array) {
    if dequant_cache_disabled() {
        return;
    }
    match dequant_cache().lock() {
        Ok(mut entries) => {
            entries.insert(key, arr);
        }
        // Poisoned lock: skip caching rather than fail the run.
        Err(_) => {}
    }
}

/// Dequantize GGUF weight bytes to f32 and return them as a `[k, n]`
/// array (built as `[n, k]`, then transposed).
///
/// For `GgufQ4K` the dequant is capped at the number of elements the
/// supplied bytes can actually hold; every other scheme goes through
/// `dequant_gguf_weight` with the full `k * n` count. A short result is
/// zero-padded up to `k * n` before the array is built.
fn build_dequanted_kn(
    w_bytes: &[u8],
    k: usize,
    n: usize,
    scheme: &rlx_ir::QuantScheme,
) -> Result<Array, MlxError> {
    let bytes_per_block = scheme.gguf_block_bytes() as usize;
    let elems_per_block = scheme.gguf_block_size() as usize;
    // Elements representable by the whole blocks present in `w_bytes`.
    let available = (w_bytes.len() / bytes_per_block) * elems_per_block;
    let wanted = k * n;

    let mut weights = if matches!(scheme, rlx_ir::QuantScheme::GgufQ4K) {
        rlx_gguf::dequant_q4_k(w_bytes, wanted.min(available))
            .map_err(|e| MlxError(format!("GGUF Q4_K dequant: {e}")))?
    } else {
        dequant_gguf_weight(w_bytes, k, n, scheme)?
    };

    // Zero-fill any tail the decoder did not produce.
    if weights.len() < wanted {
        weights.resize(wanted, 0.0);
    }

    let nk = Array::from_f32_slice(&weights, &[n, k], DType::F32)?;
    ops::transpose(&nk, &[1, 0])
}

fn dequant_gguf_weight(
    w_bytes: &[u8],
    k: usize,
    n: usize,
    scheme: &rlx_ir::QuantScheme,
) -> Result<Vec<f32>, MlxError> {
    use rlx_ir::QuantScheme as Q;
    let elems = k * n;
    match scheme {
        Q::GgufQ4K => rlx_gguf::dequant_q4_k(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF Q4_K dequant: {e}"))),
        Q::GgufQ5K => rlx_gguf::dequant_q5_k(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF Q5_K dequant: {e}"))),
        Q::GgufQ6K => rlx_gguf::dequant_q6_k(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF Q6_K dequant: {e}"))),
        Q::GgufQ8K => rlx_gguf::dequant_q8_k(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF Q8_K dequant: {e}"))),
        Q::GgufQ2K => rlx_gguf::dequant_q2_k(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF Q2_K dequant: {e}"))),
        Q::GgufQ3K => rlx_gguf::dequant_q3_k(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF Q3_K dequant: {e}"))),
        Q::GgufQ4_0 => rlx_gguf::dequant_q4_0(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF Q4_0 dequant: {e}"))),
        Q::GgufQ8_0 => rlx_gguf::dequant_q8_0(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF Q8_0 dequant: {e}"))),
        Q::GgufIQ4NL => rlx_gguf::iq_dequant::dequant_iq4_nl(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ4_NL dequant: {e}"))),
        Q::GgufIQ4XS => rlx_gguf::iq_dequant::dequant_iq4_xs(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ4_XS dequant: {e}"))),
        Q::GgufIQ2XXS => rlx_gguf::iq_dequant::dequant_iq2_xxs(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ2_XXS dequant: {e}"))),
        Q::GgufIQ2XS => rlx_gguf::iq_dequant::dequant_iq2_xs(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ2_XS dequant: {e}"))),
        Q::GgufIQ2S => rlx_gguf::iq_dequant::dequant_iq2_s(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ2_S dequant: {e}"))),
        Q::GgufIQ3XXS => rlx_gguf::iq_dequant::dequant_iq3_xxs(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ3_XXS dequant: {e}"))),
        Q::GgufIQ3S => rlx_gguf::iq_dequant::dequant_iq3_s(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ3_S dequant: {e}"))),
        Q::GgufIQ1S => rlx_gguf::iq_dequant::dequant_iq1_s(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ1_S dequant: {e}"))),
        Q::GgufIQ1M => rlx_gguf::iq_dequant::dequant_iq1_m(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF IQ1_M dequant: {e}"))),
        Q::GgufTQ1_0 => rlx_gguf::tq_dequant::dequant_tq1_0(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF TQ1_0 dequant: {e}"))),
        Q::GgufTQ2_0 => rlx_gguf::tq_dequant::dequant_tq2_0(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF TQ2_0 dequant: {e}"))),
        Q::GgufMXFP4 => rlx_gguf::mx_dequant::dequant_mxfp4(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF MXFP4 dequant: {e}"))),
        Q::GgufNVFP4 => rlx_gguf::mx_dequant::dequant_nvfp4(w_bytes, elems)
            .map_err(|e| MlxError(format!("GGUF NVFP4 dequant: {e}"))),
        other => Err(MlxError(format!(
            "MLX DequantMatMul: unsupported GGUF scheme {other:?}"
        ))),
    }
}

/// Lower `Op::GatedDeltaNet` by unrolling the time loop into MLX
/// primitives (same strategy as [`Op::SelectiveScan`]).
///
/// When `state_in` is `Some`, threads recurrent state in/out (written
/// back by the caller to the state input node).
///
/// `q_shape` must have exactly four entries, read as
/// `[batch, seq, heads, n]`, and its last entry must equal
/// `state_size`. `q`, `k` and `v` are sliced as rank-4 arrays with that
/// layout; `g_in` and `beta` are sliced as rank-3 `[batch, seq, heads]`.
///
/// Per time step `t` the emitted ops are, in order:
/// 1. `state *= exp(g_t)`,
/// 2. `delta = beta_t * (v_t - k_t @ state)`,
/// 3. `state += k_t (as a column) * delta (as a row)`,
/// 4. `y_t = (q_t @ state) * 1/sqrt(n)`.
///
/// Returns the per-step outputs concatenated along axis 1, plus the
/// final state when (and only when) `state_in` was provided.
///
/// # Errors
/// Fails if `q_shape` is not rank 4, if `state_size` differs from the
/// last `q_shape` entry, or if any underlying MLX op fails.
fn lower_gated_delta_net(
    q: &Array,
    k: &Array,
    v: &Array,
    g_in: &Array,
    beta: &Array,
    state_size: usize,
    state_in: Option<&Array>,
    q_shape: Vec<i32>,
) -> Result<(Array, Option<Array>), MlxError> {
    if q_shape.len() != 4 {
        return Err(MlxError(format!(
            "GatedDeltaNet: q must be rank-4 [B, S, H, N], got rank {}",
            q_shape.len()
        )));
    }
    let batch = q_shape[0];
    let seq = q_shape[1];
    let heads = q_shape[2];
    let n = state_size as i32;
    if n != q_shape[3] {
        return Err(MlxError(format!(
            "GatedDeltaNet: state_size={state_size} != q last dim {}",
            q_shape[3]
        )));
    }
    // Batch and head axes are merged for the batched matmuls below.
    let bh = batch * heads;

    // Start from the caller's state, or from zeros broadcast to
    // `[batch, heads, n, n]` when no state is threaded through.
    let mut state = if let Some(s0) = state_in {
        s0.clone_handle()?
    } else {
        let zero = Array::from_f32_slice(&[0.0], &[1], DType::F32)?;
        ops::broadcast_to(&zero, &[batch, heads, n, n])?
    };

    // Output scale 1/sqrt(n), held as a one-element array.
    let scale = 1.0f32 / (n as f32).sqrt();
    let scale_arr = Array::from_f32_slice(&[scale], &[1], DType::F32)?;

    let mut ys: Vec<Array> = Vec::with_capacity(seq as usize);
    for t in 0..seq {
        // Single-step windows `[t, t + 1)` along axis 1 of every operand.
        let qt = ops::slice(q, &[0, t, 0, 0], &[batch, t + 1, heads, n])?;
        let kt = ops::slice(k, &[0, t, 0, 0], &[batch, t + 1, heads, n])?;
        let vt = ops::slice(v, &[0, t, 0, 0], &[batch, t + 1, heads, n])?;
        let gt = ops::slice(g_in, &[0, t, 0], &[batch, t + 1, heads])?;
        let beta_t = ops::slice(beta, &[0, t, 0], &[batch, t + 1, heads])?;

        // Decay: multiply the state by exp(g_t), with the gate reshaped
        // to `[batch, heads, 1, 1]` so it applies per (batch, head).
        let gt = ops::reshape(&gt, &[batch, heads, 1, 1])?;
        let beta_bh = ops::reshape(&beta_t, &[bh, 1, 1])?;
        let exp_g = ops::unary(&gt, MlxUnary::Exp)?;
        state = ops::mul(&state, &exp_g)?;

        let state_bh = ops::reshape(&state, &[bh, n, n])?;
        let kt_bh = ops::reshape(&kt, &[bh, 1, n])?;
        let vt_bh = ops::reshape(&vt, &[bh, 1, n])?;

        // Delta term: beta_t * (v_t - k_t @ state).
        let mut sk = ops::matmul(&kt_bh, &state_bh)?;
        sk = ops::sub(&vt_bh, &sk)?;
        sk = ops::mul(&sk, &beta_bh)?;

        // Rank-1 update: `[bh, n, 1] * [bh, 1, n]` (presumably broadcast
        // to `[bh, n, n]` by `ops::mul`), added back onto the state.
        let kt_col = ops::reshape(&kt, &[bh, n, 1])?;
        let sk_row = ops::reshape(&sk, &[bh, 1, n])?;
        let outer = ops::mul(&kt_col, &sk_row)?;
        state = ops::add(&state, &ops::reshape(&outer, &[batch, heads, n, n])?)?;

        // Read-out from the updated state: (q_t @ state) * scale, laid
        // out as `[batch, 1, heads, n]` for the concat along axis 1.
        let state_bh = ops::reshape(&state, &[bh, n, n])?;
        let qt_bh = ops::reshape(&qt, &[bh, 1, n])?;
        let mut out_t = ops::matmul(&qt_bh, &state_bh)?;
        out_t = ops::mul(&out_t, &scale_arr)?;
        out_t = ops::reshape(&out_t, &[batch, 1, heads, n])?;
        ys.push(out_t);
    }

    // Stitch the per-step outputs back together along axis 1.
    let refs: Vec<&Array> = ys.iter().collect();
    let out = ops::concat(&refs, 1)?;
    // The final state is only returned when the caller supplied one.
    Ok((out, state_in.map(|_| state)))
}

/// Graph-recorded shape of node `id` as `i32` extents. Every dim is
/// read via `unwrap_static`.
fn node_input_shape(graph: &Graph, id: NodeId) -> Vec<i32> {
    let node = graph.node(id);
    let mut extents = Vec::new();
    for dim in node.shape.dims().iter() {
        extents.push(dim.unwrap_static() as i32);
    }
    extents
}

/// ONNX `Expand`: broadcast input to the **output node's** shape (not the
/// op's `target_shape` hint, which can be a lower-rank broadcast template).
/// Mirrors CPU/Metal leading-1 padding + stride-0 broadcast semantics.
///
/// Steps:
/// 1. Pick the input dims: the runtime shape of `x`, except that the
///    graph shape of `input_id` is used when the runtime shape is empty
///    and the graph shape is not.
/// 2. When input and output ranks match, replace each output dim that
///    disagrees with the input (both > 1) by `mlx_pick_seq_dim` of the two.
/// 3. Left-pad the input dims with 1s up to the output rank.
/// 4. Per axis: slice `x` down when its dim exceeds a non-unit output
///    dim; otherwise reject dims that differ and are both non-unit.
/// 5. Reshape to the padded dims (if padding was added) and broadcast.
///
/// # Errors
/// Returns `"Expand: incompatible dim …"` for a non-broadcastable axis,
/// and propagates any MLX op failure.
fn mlx_expand(
    graph: &Graph,
    input_id: NodeId,
    out_node: &rlx_ir::Node,
    x: &Array,
) -> Result<Array, MlxError> {
    let x_rt: Vec<i32> = x.shape()?.iter().map(|&d| d as i32).collect();
    let mut out_graph = node_input_shape(graph, out_node.id);
    let in_graph = node_input_shape(graph, input_id);
    // The graph shape is only reached when the runtime shape is empty
    // while the graph shape is not; any non-empty runtime shape wins.
    let in_dims = if x_rt.len() == in_graph.len() || !x_rt.is_empty() {
        x_rt.clone()
    } else {
        in_graph.clone()
    };

    // Align padded compile seq (512) with active runtime seq on output shape.
    if in_dims.len() == out_graph.len() {
        for i in 0..in_dims.len() {
            if in_dims[i] > 1 && out_graph[i] > 1 && in_dims[i] != out_graph[i] {
                out_graph[i] = mlx_pick_seq_dim(in_dims[i] as usize, out_graph[i] as usize) as i32;
            }
        }
    }

    // Left-pad with 1s so the input dims line up with the output rank.
    let pad = out_graph.len().saturating_sub(in_dims.len());
    let mut padded: Vec<i32> = vec![1; pad];
    padded.extend_from_slice(&in_dims);

    let mut x_adj = x.clone_handle()?;
    for i in 0..padded.len().min(out_graph.len()) {
        if padded[i] > out_graph[i] && out_graph[i] > 1 {
            // Input is longer than a non-unit output dim: cut it down.
            // NOTE(review): the slice only happens when the array's rank
            // equals `padded.len()`. If leading padding was added and
            // `x` was not re-ranked, this is skipped, `padded[i]` stays
            // larger than `out_graph[i]`, and no error is raised here —
            // confirm that case cannot occur or is caught downstream.
            let rt = x_adj.shape()?;
            if rt.len() == padded.len() {
                let start = vec![0i32; rt.len()];
                let mut stop: Vec<i32> = rt.iter().map(|&d| d as i32).collect();
                stop[i] = out_graph[i];
                x_adj = ops::slice(&x_adj, &start, &stop)?;
                padded[i] = out_graph[i];
            }
        } else if padded[i] != out_graph[i] && padded[i] != 1 && out_graph[i] != 1 {
            // Differing dims where neither side is 1 cannot broadcast.
            return Err(MlxError(format!(
                "Expand: incompatible dim {i} (in={in_d}, out={out_d})",
                in_d = padded[i],
                out_d = out_graph[i]
            )));
        }
    }
    // Materialize the leading 1s before broadcasting, when any were added.
    let x = if pad > 0 {
        ops::reshape(&x_adj, &padded)?
    } else {
        x_adj
    };
    ops::broadcast_to(&x, &out_graph)
}

/// Element-wise add. When both operands are rank 3 they are first run
/// through `mlx_align_rank3_seq_pair` (seq/layout alignment for Kitten
/// tensors); any other rank combination is added as-is.
fn mlx_add_aligned(a: &Array, b: &Array) -> Result<Array, MlxError> {
    let both_rank3 = a.shape()?.len() == 3 && b.shape()?.len() == 3;
    if !both_rank3 {
        return ops::add(a, b);
    }
    let (lhs, rhs) = mlx_align_rank3_seq_pair(a, b)?;
    ops::add(&lhs, &rhs)
}

/// Choose the common seq length for two mismatched extents.
///
/// If the larger extent exceeds 128 it is taken to be a padded compile
/// table (e.g. 512) and the smaller extent wins. Otherwise both are
/// compile-scale counts (e.g. 12 vs 14) and the larger one wins.
fn mlx_pick_seq_dim(a: usize, b: usize) -> usize {
    let larger = a.max(b);
    let smaller = a.min(b);
    if larger > 128 { smaller } else { larger }
}

/// Reconcile a compile-time reshape target with the runtime element
/// count of the input.
///
/// Dims of `target` that are `<= 0` count as 1 when multiplying. If the
/// target already holds exactly the input's element count it is
/// returned unchanged. Otherwise one dim is re-derived as
/// `input_elems / product(other dims)`:
/// first trying only dims larger than 128 (padded mel/time axes such as
/// 514), then trying every dim in order. If nothing fits, the original
/// target is returned as-is.
fn mlx_fix_reshape_shape(in_shape: &[usize], target: &[i64]) -> Vec<i32> {
    /// Element count with non-positive dims treated as 1.
    fn numel(dims: &[i32]) -> i64 {
        dims.iter().map(|&d| i64::from(d.max(1))).product::<i64>()
    }

    /// Same as `numel`, leaving out position `skip`; never below 1.
    fn numel_without(dims: &[i32], skip: usize) -> i64 {
        dims.iter()
            .enumerate()
            .filter(|&(j, _)| j != skip)
            .map(|(_, &d)| i64::from(d.max(1)))
            .product::<i64>()
            .max(1)
    }

    let wanted: Vec<i32> = target.iter().map(|&d| d as i32).collect();
    let have: i64 = in_shape.iter().map(|&d| d as i64).product::<i64>();
    if numel(&wanted) == have {
        return wanted;
    }

    // Pass 1: only large (> 128) dims are candidates; the working copy
    // keeps each attempted substitution, exactly like the in-place edit.
    let mut patched = wanted.clone();
    for i in 0..patched.len() {
        if patched[i] <= 128 {
            continue;
        }
        let others = numel_without(&patched, i);
        if others <= 0 || have % others != 0 {
            continue;
        }
        patched[i] = (have / others) as i32;
        if numel(&patched) == have {
            return patched;
        }
    }

    // Pass 2: any dim may be re-derived; each attempt is made on a
    // scratch copy so failed attempts leave `patched` alone.
    for i in 0..patched.len() {
        let others = numel_without(&patched, i);
        if others <= 0 || have % others != 0 {
            continue;
        }
        let mut candidate = patched.clone();
        candidate[i] = (have / others) as i32;
        if numel(&candidate) == have {
            return candidate;
        }
    }

    wanted
}

/// Keep the first `len` entries along axis 1 of a rank-3 array
/// (`[1,S,C]`, or a transposed `[S,1,C]`). Arrays that are not rank 3,
/// or whose axis 1 is already `<= len`, come back as a cloned handle.
fn mlx_narrow_axis1(arr: &Array, len: usize) -> Result<Array, MlxError> {
    let dims = arr.shape()?;
    let needs_cut = dims.len() == 3 && dims[1] > len;
    if !needs_cut {
        return arr.clone_handle();
    }
    let begin = [0i32; 3];
    let end = [dims[0] as i32, len as i32, dims[2] as i32];
    ops::slice(arr, &begin, &end)
}

/// Bring a rank-3 array into batch-major layout.
///
/// Only arrays with a non-unit leading axis and a unit middle axis are
/// re-laid-out: feature-first `[H,1,L]` (H > 128, L <= 128) becomes
/// `[1,L,H]` via two transposes, and any other `[S,1,C]` becomes
/// `[1,S,C]`. Everything else is returned as a cloned handle.
fn mlx_batch_major_rank3(arr: &Array) -> Result<Array, MlxError> {
    let dims = arr.shape()?;
    let relayout = dims.len() == 3 && dims[0] != 1 && dims[1] == 1;
    if !relayout {
        return arr.clone_handle();
    }
    if dims[0] > 128 && dims[2] <= 128 {
        // [H,1,L] → [L,1,H] → [1,L,H]
        let reversed = ops::transpose(arr, &[2, 1, 0])?;
        ops::transpose(&reversed, &[1, 0, 2])
    } else {
        // [S,1,C] → [1,S,C]
        ops::transpose(arr, &[1, 0, 2])
    }
}

/// Known channel widths in Kitten — do not treat as sequence when paired
/// with a small leading count (e.g. LSTM `num_directions=2`).
///
/// Consulted by `mlx_rank3_seq_axis` (checked against the last axis) and
/// by `mlx_looks_like_channel_vs_small` (checked against the larger of
/// the two extents being compared).
const MLX_KITTEN_CHANNEL_DIMS: &[usize] = &[128, 256, 512, 768, 1024];

/// Guess which axis of a `[1, ?, ?]` shape carries token/time.
///
/// Returns `None` for anything that is not rank 3 with a leading 1, and
/// for shapes where both trailing axes exceed 128 without the last one
/// being a known channel width that differs from the middle axis.
fn mlx_rank3_seq_axis(s: &[usize]) -> Option<usize> {
    let &[1, mid, last] = s else {
        return None;
    };
    // BERT-style `[1, S, 128|256|512]` (padded seq on axis 1).
    if MLX_KITTEN_CHANNEL_DIMS.contains(&last) && mid != last {
        return Some(1);
    }
    let mid_small = mid <= 128;
    let last_small = last <= 128;
    match (mid_small, last_small) {
        // Long axis 1 next to a unit trailing axis.
        (false, _) if last == 1 => Some(1),
        // Short axis 1, wide trailing axis: seq on axis 1.
        (true, false) => Some(1),
        // Wide axis 1, short trailing axis: seq on axis 2.
        (false, true) => Some(2),
        // Both short: default to axis 1.
        (true, true) => Some(1),
        // Both wide: no confident answer.
        (false, false) => None,
    }
}

/// Keep the first `len` entries along `axis` of a rank-3 array. Arrays
/// of another rank, or already `<= len` on that axis, are returned as a
/// cloned handle.
fn mlx_narrow_rank3_axis(arr: &Array, axis: usize, len: usize) -> Result<Array, MlxError> {
    let dims = arr.shape()?;
    if dims.len() == 3 && dims[axis] > len {
        let begin = [0i32; 3];
        let mut end = [dims[0] as i32, dims[1] as i32, dims[2] as i32];
        end[axis] = len as i32;
        return ops::slice(arr, &begin, &end);
    }
    arr.clone_handle()
}

/// True when the larger extent is a known Kitten channel width and the
/// smaller one is a structural count rather than a token row count.
fn mlx_looks_like_channel_vs_small(a1: usize, b1: usize) -> bool {
    // LSTM `num_directions`, harmonic counts — not token rows.
    const STRUCTURAL_SMALL: &[usize] = &[1, 2, 4, 9];
    let wide = a1.max(b1);
    let narrow = a1.min(b1);
    STRUCTURAL_SMALL.contains(&narrow) && MLX_KITTEN_CHANNEL_DIMS.contains(&wide)
}

/// Make two rank-3 operands broadcast-compatible along their seq axis.
///
/// Both are first normalized to batch-major (`[S,1,C]` → `[1,S,C]`).
/// If they then share a seq axis but disagree on its length, the longer
/// one(s) are narrowed to the length chosen by `mlx_pick_seq_dim`
/// (e.g. a padded 512-row compile table cut to the active runtime seq).
/// In every other situation the (normalized) operands are returned
/// without narrowing.
fn mlx_align_rank3_seq_pair(a: &Array, b: &Array) -> Result<(Array, Array), MlxError> {
    let (rank_a, rank_b) = (a.shape()?.len(), b.shape()?.len());
    if rank_a != 3 || rank_b != 3 {
        return Ok((a.clone_handle()?, b.clone_handle()?));
    }

    let lhs = mlx_batch_major_rank3(a)?;
    let rhs = mlx_batch_major_rank3(b)?;
    let lhs_dims = lhs.shape()?;
    let rhs_dims = rhs.shape()?;
    if lhs_dims[0] != 1 || rhs_dims[0] != 1 {
        return Ok((lhs, rhs));
    }

    // Both operands need an identifiable seq axis.
    let axes = mlx_rank3_seq_axis(&lhs_dims).zip(mlx_rank3_seq_axis(&rhs_dims));
    let Some((axis_l, axis_r)) = axes else {
        return Ok((lhs, rhs));
    };

    let len_l = lhs_dims[axis_l];
    let len_r = rhs_dims[axis_r];
    let leave_alone = axis_l != axis_r
        || len_l == len_r
        // A length-1 axis is an ordinary NumPy-style broadcast (say a
        // per-feature bias `[1,1,C]` against `[1,S,C]`); padded compile
        // tables are always longer than 1. Narrowing the other operand
        // down to 1 would destroy a real sequence, so MLX is left to
        // broadcast it.
        || len_l == 1
        || len_r == 1
        || mlx_looks_like_channel_vs_small(len_l, len_r);
    if leave_alone {
        return Ok((lhs, rhs));
    }

    let seq = mlx_pick_seq_dim(len_l, len_r);
    let lhs = mlx_narrow_rank3_axis(&lhs, axis_l, seq)?;
    let rhs = mlx_narrow_rank3_axis(&rhs, axis_r, seq)?;
    Ok((lhs, rhs))
}

/// Equalize the seq axis of rank-3 concat inputs that mix a padded
/// compile-time seq with the active runtime seq.
///
/// The common seq length is folded over the inputs with
/// `mlx_pick_seq_dim`; inputs longer than it are narrowed. The axis
/// being concatenated is deliberately never considered or narrowed:
/// sizes along it are meant to differ (e.g. pooling `[1,1,C]` ++ tokens
/// `[1,N,C]` on axis 1 gives `[1,1+N,C]`), and trimming it would
/// silently drop the data being joined.
fn mlx_align_concat_inputs(inputs: &[&Array], axis: usize) -> Result<Vec<Array>, MlxError> {
    let mut aligned: Vec<Array> = Vec::with_capacity(inputs.len());
    for src in inputs {
        aligned.push(src.clone_handle()?);
    }

    // Pass 1: agree on a seq length across all eligible inputs.
    let mut common: Option<usize> = None;
    for arr in &aligned {
        let dims = arr.shape()?;
        if dims.len() != 3 {
            continue;
        }
        // Seq sits on axis 1 for batch-leading `[1,S,C]`, on axis 0 for
        // `[S,1,C]`; other layouts contribute nothing.
        let located = if dims[0] == 1 {
            Some((1usize, dims[1]))
        } else if dims[1] == 1 {
            Some((0usize, dims[0]))
        } else {
            None
        };
        match located {
            Some((seq_axis, len)) if seq_axis != axis => {
                common = Some(match common {
                    Some(prev) => mlx_pick_seq_dim(prev, len),
                    None => len,
                });
            }
            // Unknown layout, or seq axis is the concat axis: skip.
            _ => {}
        }
    }
    let Some(seq) = common else {
        return Ok(aligned);
    };

    // Pass 2: trim inputs whose seq axis is longer than the agreed one.
    for slot in aligned.iter_mut() {
        let dims = slot.shape()?;
        if dims.len() != 3 {
            continue;
        }
        let trim_batch_leading = dims[0] == 1 && dims[1] > seq && axis != 1;
        let trim_seq_leading = dims[1] == 1 && dims[0] > seq && axis != 0;
        if trim_batch_leading {
            *slot = mlx_narrow_axis1(slot, seq)?;
        } else if trim_seq_leading {
            // Move seq onto axis 1 first, then trim there.
            let swapped = ops::transpose(slot, &[1, 0, 2])?;
            *slot = mlx_narrow_axis1(&swapped, seq)?;
        }
    }
    Ok(aligned)
}

/// Convert gather/scatter indices to contiguous I64 without leaving the
/// lazy graph (bundle params and TopK outputs often arrive as F32 at the
/// MLX lazy boundary).
fn mlx_indices_i64(idx: &Array) -> Result<Array, MlxError> {
    // The conversion has to stay lazy. Reading the indices back on the host
    // (a `to_bytes()` round-trip) forces an eval, which `mlx::compile`
    // rejects ("Attempting to eval an array during function
    // transformations…"); that crashed Compiled mode for every graph with a
    // dynamic-index `Op::Gather`, such as token-embedding lookups.
    // Indices hold integer values, so a truncating f32→i64 cast gives the
    // same result as the earlier host-side rounding.
    let as_i64 = ops::cast(idx, DType::I64)?;
    ops::contiguous(&as_i64)
}

/// Collapse a norm scale to the 1-D vector MLX `layer_norm` / `rms_norm`
/// expect. Graph params may arrive broadcast-expanded to a higher rank
/// (e.g. `[1,1,C]`).
///
/// Rank-1 and rank-0 inputs are returned as a fresh handle; anything else
/// is reshaped to `[last_dim]`.
///
/// NOTE(review): a `[1,S,C]` weight with `S > 1` holds `S*C` elements, so the
/// reshape to `[C]` presumably fails for that layout — confirm whether
/// callers ever pass it.
fn mlx_norm_scale_1d(w: &Array) -> Result<Array, MlxError> {
    let dims = w.shape()?;
    match dims.last() {
        Some(&channels) if dims.len() != 1 => ops::reshape(w, &[channels as i32]),
        // Already 1-D, or a scalar with no trailing dim to keep.
        _ => w.clone_handle(),
    }
}

/// Pick the shape to trust for `arr`: its runtime shape when the rank agrees
/// with the graph's, otherwise the graph shape unchanged.
///
/// Dynamic seq specialization can leave stale static dims in the IR, so the
/// runtime dims win whenever the two are rank-compatible.
fn runtime_shape_or_graph(arr: &Array, graph_shape: &[i32]) -> Result<Vec<i32>, MlxError> {
    let runtime = arr.shape()?;
    if runtime.len() != graph_shape.len() {
        return Ok(graph_shape.to_vec());
    }
    let mut dims = Vec::with_capacity(runtime.len());
    for &d in runtime.iter() {
        dims.push(d as i32);
    }
    Ok(dims)
}

/// Extract `(batch, seq)` for a `[B,S,H]` hidden state.
///
/// The runtime shape is used when it is rank 3 (graph dims can lag dynamic
/// specialization); otherwise a rank-3 graph shape is used. If neither is
/// rank 3 an error naming both shapes is returned.
fn runtime_bsh_dims(hidden: &Array, graph_h: &[i32]) -> Result<(i32, i32), MlxError> {
    let rt = hidden.shape()?;
    if rt.len() == 3 {
        return Ok((rt[0] as i32, rt[1] as i32));
    }
    match graph_h {
        [batch, seq, _] => Ok((*batch, *seq)),
        _ => Err(MlxError(format!(
            "runtime_bsh_dims: expected rank-3 hidden, got runtime {rt:?} graph {graph_h:?}"
        ))),
    }
}

/// Drop a leftover unit batch dim from a matmul LHS.
///
/// The graph may have flattened leading dims (e.g. to `[batch*seq, K]`)
/// while the MLX array still has shape `[1, batch*seq, K]`. When the runtime
/// shape is exactly `[1, ..graph_a]`, the array is reshaped to `graph_a`;
/// in every other situation a fresh handle to `a` is returned unchanged.
///
/// The rewrite is only considered when `graph_a` has rank >= 2 and
/// `graph_out` has the same rank as `graph_a`.
fn flatten_matmul_lhs_if_needed(
    a: &Array,
    graph_a: &[i32],
    graph_out: &[i32],
) -> Result<Array, MlxError> {
    let ranks_compatible = graph_a.len() >= 2 && graph_out.len() == graph_a.len();
    if !ranks_compatible {
        return a.clone_handle();
    }
    let runtime = a.shape()?;
    // Exactly one extra leading dim, and it must be 1.
    let has_unit_batch = runtime.len() == graph_a.len() + 1 && runtime[0] == 1;
    if !has_unit_batch {
        return a.clone_handle();
    }
    let trailing_equal = runtime
        .iter()
        .skip(1)
        .zip(graph_a)
        .all(|(&rt_dim, &graph_dim)| rt_dim == graph_dim as usize);
    if trailing_equal {
        ops::reshape(a, graph_a)
    } else {
        a.clone_handle()
    }
}

/// Translate a graph-space axis into the runtime array's axis numbering.
///
/// When MLX kept extra leading dims (runtime rank > graph rank) the axis is
/// shifted right by the rank difference; otherwise it is returned as is.
fn map_graph_axis_to_runtime(axis: usize, graph_rank: usize, runtime_rank: usize) -> usize {
    // Zero whenever the runtime rank does not exceed the graph rank.
    let extra_leading = runtime_rank.saturating_sub(graph_rank);
    axis + extra_leading
}

/// Evaluate an [`Op::ElementwiseRegion`] (or one batch slice) on MLX arrays.
fn eval_elementwise_region_on_inputs(
    env: &HashMap<NodeId, Array>,
    node_inputs: &[NodeId],
    chain: &[ChainStep],
    prologue: RegionPrologue,
) -> Result<Array, MlxError> {
    let mut input0_up: Option<Array> = None;
    if prologue == RegionPrologue::ResizeNearest2x {
        let x = lookup(env, node_inputs[0])?;
        input0_up = Some(ops::resize_nearest_2x_nchw(x)?);
    }
    let mut steps: Vec<Array> = Vec::with_capacity(chain.len());
    for step in chain {
        let arr = match step {
            ChainStep::Activation(act, x_op) => {
                let x =
                    resolve_region_operand(*x_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                match act {
                    Activation::Gelu => ops::gelu(x)?,
                    Activation::GeluApprox => ops::gelu_approx(x)?,
                    Activation::Silu => ops::silu(x)?,
                    Activation::Relu => ops::unary(x, MlxUnary::Relu)?,
                    Activation::Sigmoid => ops::unary(x, MlxUnary::Sigmoid)?,
                    Activation::Tanh => ops::unary(x, MlxUnary::Tanh)?,
                    Activation::Exp => ops::unary(x, MlxUnary::Exp)?,
                    Activation::Log => ops::unary(x, MlxUnary::Log)?,
                    Activation::Sqrt => ops::unary(x, MlxUnary::Sqrt)?,
                    Activation::Rsqrt => ops::unary(x, MlxUnary::Rsqrt)?,
                    Activation::Neg => ops::unary(x, MlxUnary::Neg)?,
                    Activation::Abs => ops::unary(x, MlxUnary::Abs)?,
                    Activation::Round => ops::unary(x, MlxUnary::Round)?,
                    Activation::Sin => ops::unary(x, MlxUnary::Sin)?,
                    Activation::Cos => ops::unary(x, MlxUnary::Cos)?,
                    Activation::Tan => ops::unary(x, MlxUnary::Tan)?,
                    Activation::Atan => ops::unary(x, MlxUnary::Atan)?,
                }
            }
            ChainStep::Cast(to, x_op) => {
                let x =
                    resolve_region_operand(*x_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                ops::cast(x, *to)?
            }
            ChainStep::Binary(bop, l_op, r_op) => {
                let a =
                    resolve_region_operand(*l_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                let b =
                    resolve_region_operand(*r_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                match bop {
                    BinaryOp::Add => ops::add(a, b)?,
                    BinaryOp::Mul => ops::mul(a, b)?,
                    BinaryOp::Sub => ops::sub(a, b)?,
                    BinaryOp::Div => ops::div(a, b)?,
                    BinaryOp::Max => ops::max(a, b)?,
                    BinaryOp::Min => ops::min(a, b)?,
                    BinaryOp::Pow => ops::pow(a, b)?,
                }
            }
            ChainStep::Compare(cop, l_op, r_op) => {
                let a =
                    resolve_region_operand(*l_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                let b =
                    resolve_region_operand(*r_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                match cop {
                    CmpOp::Eq => ops::eq(a, b)?,
                    CmpOp::Ne => ops::ne(a, b)?,
                    CmpOp::Lt => ops::lt(a, b)?,
                    CmpOp::Le => ops::le(a, b)?,
                    CmpOp::Gt => ops::gt(a, b)?,
                    CmpOp::Ge => ops::ge(a, b)?,
                }
            }
            ChainStep::Where(c_op, t_op, f_op) => {
                let c =
                    resolve_region_operand(*c_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                let t =
                    resolve_region_operand(*t_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                let f =
                    resolve_region_operand(*f_op, node_inputs, input0_up.as_ref(), env, &steps)?;
                ops::select(c, t, f)?
            }
        };
        steps.push(arr);
    }
    steps
        .pop()
        .ok_or_else(|| MlxError("ElementwiseRegion: empty chain has no output".into()))
}

/// Resolve one chain operand to the array it denotes.
///
/// * `Input(i)` — the array bound in `env` to `node_inputs[i]`, except that
///   `Input(0)` yields `input0_up` whenever one is supplied.
/// * `Step(i)` — the `i`-th entry of `steps`, i.e. an already produced step.
///
/// Returns an error when the index is out of range or the input node has no
/// entry in `env`.
fn resolve_region_operand<'a>(
    op: ChainOperand,
    node_inputs: &[NodeId],
    input0_up: Option<&'a Array>,
    env: &'a HashMap<NodeId, Array>,
    steps: &'a [Array],
) -> Result<&'a Array, MlxError> {
    match op {
        ChainOperand::Step(i) => {
            let i = i as usize;
            match steps.get(i) {
                Some(produced) => Ok(produced),
                None => Err(MlxError(format!(
                    "ElementwiseRegion: ChainOperand::Step({i}) \
                     references step not yet produced (have {} steps)",
                    steps.len()
                ))),
            }
        }
        ChainOperand::Input(i) => {
            let i = i as usize;
            // A supplied override replaces input 0 only.
            if let (0, Some(up)) = (i, input0_up) {
                return Ok(up);
            }
            let Some(&id) = node_inputs.get(i) else {
                return Err(MlxError(format!(
                    "ElementwiseRegion: ChainOperand::Input({i}) out of range"
                )));
            };
            match env.get(&id) {
                Some(arr) => Ok(arr),
                None => Err(MlxError(format!(
                    "ElementwiseRegion: missing input node for Input({i})"
                ))),
            }
        }
    }
}

/// Fetch the array recorded in `env` for node `id`, or fail with an error
/// stating that the node was referenced before being lowered.
fn lookup(env: &HashMap<NodeId, Array>, id: NodeId) -> Result<&Array, MlxError> {
    match env.get(&id) {
        Some(arr) => Ok(arr),
        None => Err(MlxError(format!(
            "node {id:?} referenced before being lowered"
        ))),
    }
}

/// Build the `Err` returned for an op this backend cannot lower; `what`
/// names the op. Generic in `T` so it fits any `Result`-returning call site.
fn unsupported<T>(what: String) -> Result<T, MlxError> {
    let message = format!("MLX backend: unsupported op {what}");
    Err(MlxError(message))
}

/// Insert zeros between the spatial samples of a rank-4 NHWC array.
///
/// With factors (`sh`, `sw`) the result has shape
/// `[N, (H − 1)·sh + 1, (W − 1)·sw + 1, C]`: the original values sit at the
/// strided positions and everything in between is zero. Factors of `(1, 1)`
/// return a fresh handle to `a`; a non-rank-4 input is an error.
///
/// This exists as a workaround for an MLX `conv_general` limitation: with
/// `groups > 1` AND `input_dilation > 1` the kernel produces incorrect
/// output. Materializing the input dilation here lets the downstream
/// `conv_general` run with `input_dilation=[1,1]`.
fn inflate_spatial_2d(a: &Array, sh: usize, sw: usize) -> Result<Array, MlxError> {
    if (sh, sw) == (1, 1) {
        return a.clone_handle();
    }
    let dims = a.shape()?;
    if dims.len() != 4 {
        return Err(MlxError(format!(
            "inflate_spatial_2d: expected rank-4 NHWC, got rank {}",
            dims.len()
        )));
    }
    let (n, h, w, c) = (
        dims[0] as i32,
        dims[1] as i32,
        dims[2] as i32,
        dims[3] as i32,
    );

    let mut out = a.clone_handle()?;
    if sh > 1 {
        let stride = sh as i32;
        // Give every row its own unit axis, grow that axis to `sh` with
        // trailing zeros, then fold it back into H. The last row's padding
        // is surplus, so it is sliced off to leave (H-1)*sh + 1 rows.
        let split = ops::reshape(&out, &[n, h, 1, w, c])?;
        let zero_filled = ops::pad(&split, &[0, 0, 0, 0, 0], &[0, 0, stride - 1, 0, 0], 0.0)?;
        let folded = ops::reshape(&zero_filled, &[n, h * stride, w, c])?;
        let inflated_h = (h - 1) * stride + 1;
        out = ops::slice(&folded, &[0, 0, 0, 0], &[n, inflated_h, w, c])?;
    }
    if sw > 1 {
        let stride = sw as i32;
        // H may already have been inflated above; read it back.
        let rows = out.shape()?[1] as i32;
        // Same trick on the W axis.
        let split = ops::reshape(&out, &[n, rows, w, 1, c])?;
        let zero_filled = ops::pad(&split, &[0, 0, 0, 0, 0], &[0, 0, 0, stride - 1, 0], 0.0)?;
        let folded = ops::reshape(&zero_filled, &[n, rows, w * stride, c])?;
        let inflated_w = (w - 1) * stride + 1;
        out = ops::slice(&folded, &[0, 0, 0, 0], &[n, rows, inflated_w, c])?;
    }
    Ok(out)
}

/// Insert `s-1` zeros between consecutive elements along the spatial axis
/// of a rank-3 NLC array, producing length `(L-1)*s + 1`.
///
/// `s == 1` returns a fresh handle to `a`; a non-rank-3 input is an error.
fn inflate_spatial_1d(a: &Array, s: usize) -> Result<Array, MlxError> {
    if s == 1 {
        return a.clone_handle();
    }
    let dims = a.shape()?;
    if dims.len() != 3 {
        return Err(MlxError(format!(
            "inflate_spatial_1d: expected rank-3 NLC, got rank {}",
            dims.len()
        )));
    }
    let (n, l, c) = (dims[0] as i32, dims[1] as i32, dims[2] as i32);
    let stride = s as i32;
    // Unit axis per element → zero-pad it to `s` → fold back into L → drop
    // the padding that trails the final element.
    let split = ops::reshape(a, &[n, l, 1, c])?;
    let zero_filled = ops::pad(&split, &[0, 0, 0, 0], &[0, 0, stride - 1, 0], 0.0)?;
    let folded = ops::reshape(&zero_filled, &[n, l * stride, c])?;
    let inflated_l = (l - 1) * stride + 1;
    ops::slice(&folded, &[0, 0, 0], &[n, inflated_l, c])
}

/// Quantization range `q_max` for a supported bit width.
///
/// Accepts `bits` ∈ {8, 4, 2} and yields 127, 7 and 1 respectively; any
/// other width is rejected with an error.
fn fq_q_max(bits: u8) -> Result<f32, MlxError> {
    if matches!(bits, 2 | 4 | 8) {
        // Largest magnitude of a signed `bits`-wide integer: 2^(bits-1) - 1.
        let q_max = (1u32 << (bits - 1)) - 1;
        Ok(q_max as f32)
    } else {
        Err(MlxError(format!(
            "FakeQuantize: unsupported bits {n}",
            n = bits
        )))
    }
}

/// Compute a PerBatch-style scale `max(|x|) / q_max` from the data itself.
///
/// With `axis = Some(c)` the max is reduced over every axis except `c`
/// (one scale per channel); with `None` it is reduced over all axes. The
/// reduce keeps dims, so the result broadcasts against `x`. The scale is
/// floored at `1e-12` so that dividing by it never blows up.
fn fq_scale_perbatch(
    x: &Array,
    x_shape: &[i32],
    axis: Option<usize>,
    q_max: f32,
    dtype: DType,
) -> Result<Array, MlxError> {
    let magnitude = ops::unary(x, MlxUnary::Abs)?;
    let rank = x_shape.len() as i32;
    // Every axis is reduced except the channel axis, when one is given.
    let reduce_axes: Vec<i32> = (0..rank)
        .filter(|&i| axis.map_or(true, |c| i != c as i32))
        .collect();
    let peak = ops::reduce(
        &magnitude,
        MlxReduce::Max,
        &reduce_axes,
        /*keep_dim=*/ true,
    )?;
    let divisor = Array::from_f32_slice(&[q_max], &[1], dtype)?;
    let raw_scale = ops::div(&peak, &divisor)?;
    let floor = Array::from_f32_slice(&[1e-12], &[1], dtype)?;
    ops::max(&raw_scale, &floor)
}

/// Build a broadcast-shaped scale tensor from a 1-D `state` (shape `[C]`
/// for per-channel; `[1]` for per-tensor) so it broadcasts against `x`.
fn fq_scale_from_state(
    state: &Array,
    x_shape: &[i32],
    axis: Option<usize>,
    dtype: DType,
) -> Result<Array, MlxError> {
    let eps = Array::from_f32_slice(&[1e-12], &[1], dtype)?;
    let clamped = ops::max(state, &eps)?;
    match axis {
        None => Ok(clamped),
        Some(c) => {
            let state_dim = state.shape()?;
            let dim_c = state_dim.first().copied().unwrap_or(1) as i32;
            let mut bcast: Vec<i32> = vec![1; x_shape.len()];
            bcast[c] = dim_c;
            ops::reshape(&clamped, &bcast)
        }
    }
}

/// Quantize-then-dequantize tail shared by every `Op::FakeQuantize` scale
/// mode: `clamp(round(x / scale), -q_max, q_max) * scale`.
///
/// The formula does not depend on which `scale_mode` produced `scale`.
fn fq_quantize_dequantize(
    x: &Array,
    scale: &Array,
    q_max: f32,
    dtype: DType,
) -> Result<Array, MlxError> {
    let quantized = ops::unary(&ops::div(x, scale)?, MlxUnary::Round)?;
    let lower = Array::from_f32_slice(&[-q_max], &[1], dtype)?;
    let upper = Array::from_f32_slice(&[q_max], &[1], dtype)?;
    // Clamp to [-q_max, q_max]: raise to the lower bound, then cap.
    let bounded_below = ops::max(&quantized, &lower)?;
    let bounded = ops::min(&bounded_below, &upper)?;
    ops::mul(&bounded, scale)
}

/// One-hot encode f32-valued integer labels into an `[N, C]` array of
/// `dtype`: entry `[n, c]` is `1.0` when `labels[n] == c` and `0.0`
/// otherwise.
///
/// Built by comparing the labels, reshaped to `[N, 1]`, against the class
/// indices `0..C`, reshaped to `[1, C]`, and casting the boolean mask.
fn one_hot_2d(labels: &Array, n: usize, c: usize, dtype: DType) -> Result<Array, MlxError> {
    let mut class_ids: Vec<f32> = Vec::with_capacity(c);
    for class in 0..c {
        class_ids.push(class as f32);
    }
    let classes = Array::from_f32_slice(&class_ids, &[c], dtype)?;
    let class_row = ops::reshape(&classes, &[1, c as i32])?;
    let label_col = ops::reshape(labels, &[n as i32, 1])?;
    let hits = ops::eq(&label_col, &class_row)?;
    ops::cast(&hits, dtype)
}

/// Closed-form derivative of every `Activation` kind. Mirrors
/// `rlx-cpu/src/thunk.rs::activation_backward_kernel`.
fn activation_backward_compose(
    x: &Array,
    dy: &Array,
    kind: Activation,
    dtype: DType,
) -> Result<Array, MlxError> {
    use Activation::*;
    match kind {
        Relu => {
            let zero = Array::from_f32_slice(&[0.0], &[1], dtype)?;
            let mask = ops::gt(x, &zero)?;
            ops::select(&mask, dy, &zero)
        }
        Sigmoid => {
            // dy · σ(x) · (1 − σ(x))
            let s = ops::unary(x, MlxUnary::Sigmoid)?;
            let one = Array::from_f32_slice(&[1.0], &[1], dtype)?;
            let one_minus_s = ops::sub(&one, &s)?;
            let s_compl = ops::mul(&s, &one_minus_s)?;
            ops::mul(dy, &s_compl)
        }
        Tanh => {
            // dy · (1 − tanh²(x))
            let t = ops::unary(x, MlxUnary::Tanh)?;
            let t_sq = ops::mul(&t, &t)?;
            let one = Array::from_f32_slice(&[1.0], &[1], dtype)?;
            let factor = ops::sub(&one, &t_sq)?;
            ops::mul(dy, &factor)
        }
        Silu => {
            // dy · σ(x) · (1 + x · (1 − σ(x)))
            let s = ops::unary(x, MlxUnary::Sigmoid)?;
            let one = Array::from_f32_slice(&[1.0], &[1], dtype)?;
            let one_minus_s = ops::sub(&one, &s)?;
            let x_times = ops::mul(x, &one_minus_s)?;
            let inner = ops::add(&one, &x_times)?;
            let factor = ops::mul(&s, &inner)?;
            ops::mul(dy, &factor)
        }
        Gelu => {
            // dy · (½(1 + erf(x/√2)) + x · φ(x)),  φ(x) = exp(−x²/2)/√(2π)
            const INV_SQRT2: f32 = std::f32::consts::FRAC_1_SQRT_2;
            const INV_SQRT_2PI: f32 = 0.398_942_3;
            let inv_sqrt2 = Array::from_f32_slice(&[INV_SQRT2], &[1], dtype)?;
            let inv_sqrt_2pi = Array::from_f32_slice(&[INV_SQRT_2PI], &[1], dtype)?;
            let half = Array::from_f32_slice(&[0.5], &[1], dtype)?;
            let one = Array::from_f32_slice(&[1.0], &[1], dtype)?;
            let neg_half = Array::from_f32_slice(&[-0.5], &[1], dtype)?;

            let x_sc = ops::mul(x, &inv_sqrt2)?;
            let erf_v = ops::unary(&x_sc, MlxUnary::Erf)?;
            let phi_inner = ops::add(&one, &erf_v)?;
            let phi = ops::mul(&half, &phi_inner)?;
            let x_sq = ops::mul(x, x)?;
            let arg = ops::mul(&x_sq, &neg_half)?;
            let pdf_e = ops::unary(&arg, MlxUnary::Exp)?;
            let pdf = ops::mul(&pdf_e, &inv_sqrt_2pi)?;
            let x_pdf = ops::mul(x, &pdf)?;
            let deriv = ops::add(&phi, &x_pdf)?;
            ops::mul(dy, &deriv)
        }
        GeluApprox => {
            // y = ½ x (1 + tanh(c (x + a x³))), c = √(2/π), a = 0.044715
            // dy/dx = ½(1+t) + ½ x (1−t²) · c (1 + 3 a x²)
            const C: f32 = 0.797_884_6;
            const A: f32 = 0.044_715;
            let half = Array::from_f32_slice(&[0.5], &[1], dtype)?;
            let one = Array::from_f32_slice(&[1.0], &[1], dtype)?;
            let c_arr = Array::from_f32_slice(&[C], &[1], dtype)?;
            let a_arr = Array::from_f32_slice(&[A], &[1], dtype)?;
            let three_a = Array::from_f32_slice(&[3.0 * A], &[1], dtype)?;

            let x_sq = ops::mul(x, x)?;
            let x_cu = ops::mul(&x_sq, x)?;
            let a_x_cu = ops::mul(&a_arr, &x_cu)?;
            let inner_sum = ops::add(x, &a_x_cu)?;
            let inner = ops::mul(&c_arr, &inner_sum)?;
            let t = ops::unary(&inner, MlxUnary::Tanh)?;
            let one_plus_t = ops::add(&one, &t)?;
            let term1 = ops::mul(&half, &one_plus_t)?;
            let t_sq = ops::mul(&t, &t)?;
            let one_minus_t_sq = ops::sub(&one, &t_sq)?;
            let three_a_x_sq = ops::mul(&three_a, &x_sq)?;
            let one_plus_3ax2 = ops::add(&one, &three_a_x_sq)?;
            let dinner = ops::mul(&c_arr, &one_plus_3ax2)?;
            let half_x = ops::mul(&half, x)?;
            let part2_a = ops::mul(&half_x, &one_minus_t_sq)?;
            let term2 = ops::mul(&part2_a, &dinner)?;
            let deriv = ops::add(&term1, &term2)?;
            ops::mul(dy, &deriv)
        }
        Exp => {
            let ex = ops::unary(x, MlxUnary::Exp)?;
            ops::mul(dy, &ex)
        }
        Log => ops::div(dy, x),
        Sqrt => {
            // 0.5 · dy / √x; zero where √x ≤ 0.
            let s = ops::unary(x, MlxUnary::Sqrt)?;
            let zero = Array::from_f32_slice(&[0.0], &[1], dtype)?;
            let half = Array::from_f32_slice(&[0.5], &[1], dtype)?;
            let mask = ops::gt(&s, &zero)?;
            let half_dy = ops::mul(&half, dy)?;
            let raw = ops::div(&half_dy, &s)?;
            ops::select(&mask, &raw, &zero)
        }
        Rsqrt => {
            // −0.5 · dy / (x · √x); zero where √x ≤ 0.
            let s = ops::unary(x, MlxUnary::Sqrt)?;
            let zero = Array::from_f32_slice(&[0.0], &[1], dtype)?;
            let neg_half = Array::from_f32_slice(&[-0.5], &[1], dtype)?;
            let mask = ops::gt(&s, &zero)?;
            let denom = ops::mul(x, &s)?;
            let neg_half_dy = ops::mul(&neg_half, dy)?;
            let raw = ops::div(&neg_half_dy, &denom)?;
            ops::select(&mask, &raw, &zero)
        }
        Neg => ops::unary(dy, MlxUnary::Neg),
        Abs => {
            // sign(x) · dy. CPU reference uses 0 at x=0 (not ±0).
            let zero = Array::from_f32_slice(&[0.0], &[1], dtype)?;
            let one = Array::from_f32_slice(&[1.0], &[1], dtype)?;
            let neg_one = Array::from_f32_slice(&[-1.0], &[1], dtype)?;
            let pos = ops::gt(x, &zero)?;
            let neg = ops::lt(x, &zero)?;
            let inner = ops::select(&neg, &neg_one, &zero)?;
            let sign = ops::select(&pos, &one, &inner)?;
            ops::mul(&sign, dy)
        }
        Round => {
            // STE: pretend Round was identity (zero-grad almost everywhere
            // means the optimizer can't learn through it without this).
            dy.clone_handle()
        }
        Sin => {
            // d/dx sin(x) = cos(x) · upstream.
            let c = ops::unary(x, MlxUnary::Cos)?;
            ops::mul(&c, dy)
        }
        Cos => {
            // d/dx cos(x) = −sin(x) · upstream.
            let s = ops::unary(x, MlxUnary::Sin)?;
            let neg_s = ops::unary(&s, MlxUnary::Neg)?;
            ops::mul(&neg_s, dy)
        }
        Tan => {
            // dy · (1 + tan²(x))
            let t = ops::unary(x, MlxUnary::Tan)?;
            let t2 = ops::mul(&t, &t)?;
            let one = Array::from_f32_slice(&[1.0], &[1], dtype)?;
            let sec2 = ops::add(&one, &t2)?;
            ops::mul(dy, &sec2)
        }
        Atan => {
            // dy · (1 / (1 + x²))
            let x2 = ops::mul(x, x)?;
            let one = Array::from_f32_slice(&[1.0], &[1], dtype)?;
            let denom = ops::add(&one, &x2)?;
            ops::div(dy, &denom)
        }
    }
}