mlx-native 0.9.0

//! GGML block-format expert-routed (MoE) quantized matrix-vector multiply dispatch.
//!
//! Encodes a GPU compute command that performs, for each (token, expert-slot):
//!   expert_id = ids[token * top_k + slot]
//!   output[token*top_k + slot][col] = sum_k(dequant(weight[expert_id][col][k]) * input[token][k])
//!
//! This is the _id variant of quantized_matmul_ggml: same GGML block dequantization
//! but with per-token expert selection via an ids buffer, enabling fused MoE dispatch.
//!
//! Derived from candle-metal-kernels (Apache-2.0) kernel_mul_mv_id template
//! and mlx-native's quantized_matmul_ggml kernels.

use crate::buffer::MlxBuffer;
use crate::device::MlxDevice;
use crate::dtypes::DType;
use crate::encoder::{CapturedOpKind, CommandEncoder, DispatchRecord, KernelArg, as_bytes};
use crate::env_flags::{cached_env_default_true, cached_env_eq_one};
use std::sync::atomic::AtomicI8;

// ADR-029 iter-175 Step 1an: cached hot-path env-flag gates for dispatch_id_mv.
// Uncached env::var costs ~70 ns/call (H-N microbench); cached AtomicI8 ~2 ns.
// dispatch_id_mv is called ~150 times/token; 2 env reads × 150 = 300 × 68 ns
// = 20 µs/tok ≈ 0.2% wall just from these 2 sites.
//
// NOTE(review): the initial value -1 presumably means "env not read yet" —
// confirm against the helpers in `crate::env_flags`.

// Cache cell for `HF2Q_Q6K_ID_MV_NR2`, read via `cached_env_default_true`.
static CACHED_Q6K_ID_MV_NR2: AtomicI8 = AtomicI8::new(-1);
// Cache cell for `HF2Q_Q8_0_ID_MV_NR2`, read via `cached_env_eq_one`.
static CACHED_Q8_0_ID_MV_NR2: AtomicI8 = AtomicI8::new(-1);
use crate::error::{MlxError, Result};
use crate::kernel_registry::KernelRegistry;
use crate::ops::quantized_matmul_ggml::GgmlType;

// ---- GPU params struct ----

/// GPU-side params struct — must match the Metal shader's `GgmlMatvecIdParams`.
///
/// The struct is handed to the kernel as raw bytes (`as_bytes(&gpu_params)`,
/// binding slot 4), so field order and widths are part of the contract with
/// the shader.  With `#[repr(C)]` the layout is seven `i64` (56 bytes), four
/// `u32` (16 bytes) and a trailing `i64` (8 bytes): 80 bytes, no implicit
/// padding.
#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct GgmlMatvecIdGpuParams {
    ne00: i64,           // K (input dimension)
    ne01: i64,           // N (output columns per expert)
    ne02: i64,           // 1 (unused)
    ne10: i64,           // K (same value as ne00)
    ne12: i64,           // 1 (unused)
    ne0: i64,            // N (output stride)
    ne1: i64,            // total output rows = n_tokens * top_k
    r2: u32,             // 1 (always set to 1 by the dispatchers in this file)
    r3: u32,             // 1 (always set to 1 by the dispatchers in this file)
    top_k: u32,          // experts per token
    n_tokens: u32,       // number of input tokens
    expert_stride: i64,  // bytes between expert weight slices
}

// ---- Public types ----

/// Parameters describing the expert-routed GGML quantized matmul dimensions.
///
/// The public entry points reject a zero `n_tokens`, `top_k`, `n`, `k` or
/// `n_experts`, a `k` that is not a multiple of the block size of
/// `ggml_type`, and an `expert_stride` smaller than one expert's packed
/// weight slice.
#[derive(Debug, Clone, Copy)]
pub struct GgmlQuantizedMatmulIdParams {
    /// Number of input tokens.  Must be > 0.
    pub n_tokens: u32,
    /// Number of experts each token is routed to (top-k).  Must be > 0.
    pub top_k: u32,
    /// Number of output columns per expert (weight rows).  Must be > 0.
    pub n: u32,
    /// Input dimension (weight cols before quantization).
    /// Must be > 0 and divisible by the GGML block QK value.
    pub k: u32,
    /// Total number of experts in the stacked weight buffer.  Must be > 0.
    pub n_experts: u32,
    /// Byte stride between expert weight slices in the stacked buffer.
    /// Must be at least `n * (k / QK) * block_bytes` for `ggml_type`.
    pub expert_stride: u64,
    /// GGML quantization type.
    pub ggml_type: GgmlType,
}

impl GgmlType {
    /// Metal kernel function name for the mat-vec `_id` variant.
    fn id_kernel_name(self) -> &'static str {
        match self {
            GgmlType::Q4_0 => "kernel_mul_mv_id_q4_0_f32",
            GgmlType::Q8_0 => "kernel_mul_mv_id_q8_0_f32",
            // ADR-013 P7 — Q4_K mv_id ported from llama.cpp
            // (ggml-metal.metal:10349) for dwq46/dwq48 MoE expert weights.
            GgmlType::Q4_K => "kernel_mul_mv_id_q4_K_f32",
            GgmlType::Q5_K => "kernel_mul_mv_id_q5_K_f32",
            GgmlType::Q6_K => "kernel_mul_mv_id_q6_K_f32",
            // ADR-022 Phase 1 P1.5 — Q5_1 / IQ4_NL mv_id ports.
            GgmlType::Q5_1 => "kernel_mul_mv_id_q5_1_f32",
            GgmlType::IQ4_NL => "kernel_mul_mv_id_iq4_nl_f32",
            GgmlType::F32 | GgmlType::F16 | GgmlType::I16 => "unsupported",
        }
    }

    /// Metal kernel function name for the mat-mat `_id` variant (ADR-011
    /// Phase 3 Wave P3a port of llama.cpp's `kernel_mul_mm_id_<q>_f32`).
    fn id_mm_kernel_name(self) -> &'static str {
        match self {
            GgmlType::Q4_0 => "kernel_mul_mm_id_q4_0_f32",
            GgmlType::Q8_0 => "kernel_mul_mm_id_q8_0_f32",
            // ADR-022 Phase 2 — Q5_K mm_id ported.
            GgmlType::Q5_K => "kernel_mul_mm_id_q5_K_f32",
            GgmlType::Q6_K => "kernel_mul_mm_id_q6_K_f32",
            // ADR-013 P16 — Q4_K mm_id ported (port of llama.cpp
            // `kernel_mul_mm_id_q4_K_f32` at ggml-metal.metal:10169).
            GgmlType::Q4_K => "kernel_mul_mm_id_q4_K_f32",
            // ADR-022 Phase 1 P1.6 — Q5_1 / IQ4_NL mm_id ported.
            GgmlType::Q5_1 => "kernel_mul_mm_id_q5_1_f32",
            GgmlType::IQ4_NL => "kernel_mul_mm_id_iq4_nl_f32",
            GgmlType::F32 | GgmlType::F16 | GgmlType::I16 => "unsupported",
        }
    }

    /// Tensor-API variant of the mm_id kernel (ADR-011 Phase 3 Wave
    /// P3b-tensor).
    fn id_mm_tensor_kernel_name(self) -> &'static str {
        match self {
            GgmlType::Q4_0 => "kernel_mul_mm_id_q4_0_tensor_f32",
            GgmlType::Q8_0 => "kernel_mul_mm_id_q8_0_tensor_f32",
            // ADR-022 Phase 2 — Q5_K mm_id_tensor ported.
            GgmlType::Q5_K => "kernel_mul_mm_id_q5_K_tensor_f32",
            GgmlType::Q6_K => "kernel_mul_mm_id_q6_K_tensor_f32",
            // ADR-013 P16 — Q4_K tensor-API mm_id ported.
            GgmlType::Q4_K => "kernel_mul_mm_id_q4_K_tensor_f32",
            // ADR-022 Phase 1 P1.6 — Q5_1 / IQ4_NL tensor-API mm_id ported.
            GgmlType::Q5_1 => "kernel_mul_mm_id_q5_1_tensor_f32",
            GgmlType::IQ4_NL => "kernel_mul_mm_id_iq4_nl_tensor_f32",
            GgmlType::F32 | GgmlType::F16 | GgmlType::I16 => "unsupported",
        }
    }
}

/// Process-wide cache for the mm_id tensor-API availability probe.  Kept
/// separate from the dense-mm probe in quantized_matmul_ggml.rs because
/// these are distinct shader files; whichever runs first pays its own
/// compile cost.
static TENSOR_MM_ID_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();

/// Reports whether the tensor-API mm_id kernels can be used.
///
/// The answer is computed on the first call and reused afterwards:
/// `false` when `HF2Q_DISABLE_TENSOR_MM_ID` is set, otherwise whether the
/// `kernel_mul_mm_id_q4_0_tensor_f32` pipeline can be obtained from the
/// registry.  Setting `MLX_LOG_TENSOR_PROBE` prints the outcome to stderr.
fn probe_tensor_mm_id(registry: &mut KernelRegistry, device: &MlxDevice) -> bool {
    let run_probe = || -> bool {
        let verbose = std::env::var("MLX_LOG_TENSOR_PROBE").is_ok();

        // ADR-013 P17 — env-gate to force the simdgroup variant for A/B
        // bench. llama.cpp does NOT use a tensor-API path for K-quants
        // (only f32/f16/bf16), so for Q4_K/Q5_K/Q6_K the tensor variant
        // is our addition and may be slower than the simdgroup MMA path.
        let forced_off = std::env::var("HF2Q_DISABLE_TENSOR_MM_ID").is_ok();
        if forced_off {
            if verbose {
                eprintln!("[mlx-native] tensor_mm_id: DISABLED via HF2Q_DISABLE_TENSOR_MM_ID");
            }
            return false;
        }

        let available = registry
            .get_pipeline("kernel_mul_mm_id_q4_0_tensor_f32", device.metal_device())
            .is_ok();
        if verbose {
            let verdict = if available {
                "OK (using tensor variant for MoE)"
            } else {
                "FAILED (falling back to simdgroup MMA)"
            };
            eprintln!("[mlx-native] tensor_mm_id probe: {}", verdict);
        }
        available
    };

    *TENSOR_MM_ID_AVAILABLE.get_or_init(run_probe)
}

/// Encode an expert-routed GGML quantized matrix-vector multiply.
///
/// Weight buffer contains raw GGML blocks stacked as `[n_experts, N, packed_K]`,
/// with consecutive experts `params.expert_stride` bytes apart.
/// Input is f32 `[n_tokens, K]`, output is f32 `[n_tokens * top_k, N]`.
/// The `ids` buffer `[n_tokens * top_k]` u32 selects which expert to use for
/// each (token, slot) pair.
///
/// # Arguments
///
/// * `encoder`  -- Command encoder to record the dispatch into.
/// * `registry` -- Kernel registry (compiles shader on first call).
/// * `device`   -- Metal device.
/// * `input`    -- f32 input buffer, shape `[n_tokens, K]`.
/// * `weight`   -- Stacked GGML block weight buffer, `[n_experts, N, packed_K]`.
/// * `ids`      -- u32 expert index buffer, shape `[n_tokens * top_k]`.
/// * `output`   -- f32 output buffer, shape `[n_tokens * top_k, N]`.
/// * `params`   -- Dimensions and quantization parameters.
///
/// # Errors
///
/// Returns `MlxError::InvalidArgument` if:
/// - K is not divisible by the GGML block QK value
/// - `expert_stride` is smaller than one expert's packed weight slice
/// - Buffer sizes don't match expected dimensions (the weight buffer must
///   hold `(n_experts - 1) * expert_stride + per_expert_bytes` bytes)
/// - Any dimension is zero
///
/// Errors from the selected dispatch path are propagated unchanged.
#[allow(clippy::too_many_arguments)]
pub fn quantized_matmul_id_ggml(
    encoder: &mut CommandEncoder,
    registry: &mut KernelRegistry,
    device: &MlxDevice,
    input: &MlxBuffer,
    weight: &MlxBuffer,
    ids: &MlxBuffer,
    output: &MlxBuffer,
    params: &GgmlQuantizedMatmulIdParams,
) -> Result<()> {
    let qk = params.ggml_type.block_values();
    let block_bytes = params.ggml_type.block_bytes();

    // --- Validate dimensions ---
    if params.n_tokens == 0 || params.k == 0 || params.n == 0 {
        return Err(MlxError::InvalidArgument(
            "quantized_matmul_id_ggml: n_tokens, K, and N must all be > 0".into(),
        ));
    }
    if params.top_k == 0 {
        return Err(MlxError::InvalidArgument(
            "quantized_matmul_id_ggml: top_k must be > 0".into(),
        ));
    }
    if params.n_experts == 0 {
        return Err(MlxError::InvalidArgument(
            "quantized_matmul_id_ggml: n_experts must be > 0".into(),
        ));
    }
    if params.k % qk != 0 {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml: K ({}) must be divisible by block QK ({})",
            params.k, qk
        )));
    }

    // --- Validate buffer sizes ---
    let expected_input_bytes =
        (params.n_tokens as usize) * (params.k as usize) * DType::F32.size_of();
    if input.byte_len() < expected_input_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml: input buffer too small: expected {} bytes for [{} x {}] f32, got {}",
            expected_input_bytes, params.n_tokens, params.k, input.byte_len()
        )));
    }

    let blocks_per_row = params.k / qk;
    let per_expert_bytes =
        (params.n as usize) * (blocks_per_row as usize) * (block_bytes as usize);

    // Validate expert_stride is sane
    if params.expert_stride < per_expert_bytes as u64 {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml: expert_stride ({}) < per_expert_bytes ({})",
            params.expert_stride, per_expert_bytes
        )));
    }

    // Experts sit `expert_stride` bytes apart, so the last expert's slice
    // ends at `(n_experts - 1) * expert_stride + per_expert_bytes`.  Sizing
    // the check with `per_expert_bytes * n_experts` would under-count
    // whenever the stride is padded.  Saturating arithmetic turns an
    // overflowing requirement into a guaranteed rejection.
    // (`n_experts - 1` cannot underflow: n_experts > 0 was checked above.)
    let required_weight_bytes = u64::from(params.n_experts - 1)
        .saturating_mul(params.expert_stride)
        .saturating_add(per_expert_bytes as u64);
    if (weight.byte_len() as u64) < required_weight_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml: weight buffer too small: expected {} bytes for {} experts (expert_stride {}), got {}",
            required_weight_bytes, params.n_experts, params.expert_stride, weight.byte_len()
        )));
    }

    let total_rows = (params.n_tokens as usize) * (params.top_k as usize);
    let expected_ids_bytes = total_rows * DType::U32.size_of();
    if ids.byte_len() < expected_ids_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml: ids buffer too small: expected {} bytes for [{} * {}] u32, got {}",
            expected_ids_bytes, params.n_tokens, params.top_k, ids.byte_len()
        )));
    }

    let expected_output_bytes = total_rows * (params.n as usize) * DType::F32.size_of();
    if output.byte_len() < expected_output_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml: output buffer too small: expected {} bytes for [{} x {}] f32, got {}",
            expected_output_bytes, total_rows, params.n, output.byte_len()
        )));
    }

    // ADR-011 Phase 3 — route on n_tokens threshold.
    //
    // At prefill with n_tokens above `mm_id_routing_threshold()`, dispatch
    // the two-stage mm_id kernels (map0 + mm).  Each expert's weight tile
    // is staged once to threadgroup shmem per 32-row block of that expert's
    // routed tokens — identical to the dense mm dispatcher's win but
    // per-expert.
    //
    // P3b-tensor.2 — extended to top_k=1 (Gemma 4's MoE down call).
    // Without this the down call's 19,640-row matmul falls back to mv_id
    // and re-reads each expert's weights once per row — ~50% of prefill
    // wall time burnt on weight re-reads.  Today we have ne20_1 and
    // ne20_8 instantiations; other top_k values still fall back to mv_id.
    //
    // Falls back to the mv_id path for:
    //   * decode / short prompts (n_tokens <= mm_id_routing_threshold())
    //   * top_k values without a map0 instantiation
    //   * K < 32 (mm tile requires NK=32)
    // ADR-013 P16 — Q4_K mm_id ported; eligible for the prefill route.
    // ADR-022 Phase 2 — Q5_K mm_id ported; the Q5_K bypass at this site
    // (and at the pooled entry below) was retained until iter-19's port
    // closed the gap.
    if params.n_tokens > mm_id_routing_threshold()
        && (params.top_k == 1 || params.top_k == 8)
        && params.k >= 32
    {
        // ADR-022 AC-4: env-gated trace so operators can confirm mm_id
        // engages on prefill. `HF2Q_LOG_MM_ID_ROUTE=1` enables the line.
        if std::env::var("HF2Q_LOG_MM_ID_ROUTE").is_ok() {
            eprintln!(
                "[mlx-native adr-022 AC-4] dispatch_id_mm engaged: type={:?} \
                 n_tokens={} top_k={} k={} n={} n_experts={}",
                params.ggml_type,
                params.n_tokens,
                params.top_k,
                params.k,
                params.n,
                params.n_experts,
            );
        }
        return dispatch_id_mm(
            encoder, registry, device, input, weight, ids, output, params,
        );
    }

    dispatch_id_mv(encoder, registry, device, input, weight, ids, output, params)
}

/// Same contract as `quantized_matmul_id_ggml`, but takes caller-owned
/// `IdMmScratch` so batched-prefill dispatches avoid the per-call
/// `MTLDevice.newBufferWithLength:` allocations the auto entry point
/// incurs (ADR-011 Phase 3 Wave P3b — "scratch pooling").
///
/// When the dispatch routes to the mv_id path (decode / top_k not in
/// {1, 8} / K < 32), the scratch is not touched — it is only used on the
/// mm_id path.  Callers may over-size the scratch once per prefill and
/// share it across every mm_id call in the forward pass.
///
/// # Errors
///
/// Returns `MlxError::InvalidArgument` if:
/// - any of `n_tokens`, `top_k`, `n`, `k`, `n_experts` is zero
/// - K is not divisible by the GGML block QK value
/// - `expert_stride` is smaller than one expert's packed weight slice
/// - a buffer is too small for the given dimensions (the weight buffer must
///   hold `(n_experts - 1) * expert_stride + per_expert_bytes` bytes)
///
/// Errors from the selected dispatch path are propagated unchanged.
#[allow(clippy::too_many_arguments)]
pub fn quantized_matmul_id_ggml_pooled(
    encoder: &mut CommandEncoder,
    registry: &mut KernelRegistry,
    device: &MlxDevice,
    input: &MlxBuffer,
    weight: &MlxBuffer,
    ids: &MlxBuffer,
    output: &MlxBuffer,
    scratch: &mut IdMmScratch,
    params: &GgmlQuantizedMatmulIdParams,
) -> Result<()> {
    // Mirror the validation + routing logic from `quantized_matmul_id_ggml`
    // so the pooled path has identical correctness invariants.  (We keep
    // the two entry points separate rather than extracting a shared inner
    // because the scratch is only relevant on the mm_id branch — lifting
    // scratch into the mv branch would add unused parameters.)
    let qk = params.ggml_type.block_values();
    let block_bytes = params.ggml_type.block_bytes();

    if params.n_tokens == 0 || params.k == 0 || params.n == 0 {
        return Err(MlxError::InvalidArgument(
            "quantized_matmul_id_ggml_pooled: n_tokens, K, and N must all be > 0".into(),
        ));
    }
    if params.top_k == 0 {
        return Err(MlxError::InvalidArgument(
            "quantized_matmul_id_ggml_pooled: top_k must be > 0".into(),
        ));
    }
    if params.n_experts == 0 {
        return Err(MlxError::InvalidArgument(
            "quantized_matmul_id_ggml_pooled: n_experts must be > 0".into(),
        ));
    }
    if params.k % qk != 0 {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml_pooled: K ({}) must be divisible by block QK ({})",
            params.k, qk
        )));
    }

    let expected_input_bytes =
        (params.n_tokens as usize) * (params.k as usize) * DType::F32.size_of();
    if input.byte_len() < expected_input_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml_pooled: input buffer too small: expected {} bytes for [{} x {}] f32, got {}",
            expected_input_bytes, params.n_tokens, params.k, input.byte_len()
        )));
    }

    let blocks_per_row = params.k / qk;
    let per_expert_bytes =
        (params.n as usize) * (blocks_per_row as usize) * (block_bytes as usize);

    if params.expert_stride < per_expert_bytes as u64 {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml_pooled: expert_stride ({}) < per_expert_bytes ({})",
            params.expert_stride, per_expert_bytes
        )));
    }

    // Experts sit `expert_stride` bytes apart, so the last expert's slice
    // ends at `(n_experts - 1) * expert_stride + per_expert_bytes`.  Sizing
    // the check with `per_expert_bytes * n_experts` would under-count
    // whenever the stride is padded.  Saturating arithmetic turns an
    // overflowing requirement into a guaranteed rejection.
    // (`n_experts - 1` cannot underflow: n_experts > 0 was checked above.)
    let required_weight_bytes = u64::from(params.n_experts - 1)
        .saturating_mul(params.expert_stride)
        .saturating_add(per_expert_bytes as u64);
    if (weight.byte_len() as u64) < required_weight_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml_pooled: weight buffer too small: expected {} bytes for {} experts (expert_stride {}), got {}",
            required_weight_bytes, params.n_experts, params.expert_stride, weight.byte_len()
        )));
    }

    let total_rows = (params.n_tokens as usize) * (params.top_k as usize);
    let expected_ids_bytes = total_rows * DType::U32.size_of();
    if ids.byte_len() < expected_ids_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml_pooled: ids buffer too small: expected {} bytes for [{} * {}] u32, got {}",
            expected_ids_bytes, params.n_tokens, params.top_k, ids.byte_len()
        )));
    }

    let expected_output_bytes = total_rows * (params.n as usize) * DType::F32.size_of();
    if output.byte_len() < expected_output_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_ggml_pooled: output buffer too small: expected {} bytes for [{} x {}] f32, got {}",
            expected_output_bytes, total_rows, params.n, output.byte_len()
        )));
    }

    // P3b-tensor.2 — accept top_k ∈ {1, 8} (Gemma 4's MoE down/gate_up).
    // ADR-013 P16 — Q4_K mm_id ported; eligible for the prefill route.
    // ADR-022 Phase 2 — Q5_K mm_id ported; the previous Q5_K bypass here
    // is retired (kernels live in id_mm.metal + id_mm_tensor.metal).
    // ADR-022 AC-4: env-gated trace (HF2Q_LOG_MM_ID_ROUTE=1) confirms mm_id
    // engagement on the qwen35 prefill path which goes through this pooled
    // entry, not the auto entry above.
    if params.n_tokens > mm_id_routing_threshold()
        && (params.top_k == 1 || params.top_k == 8)
        && params.k >= 32
    {
        if std::env::var("HF2Q_LOG_MM_ID_ROUTE").is_ok() {
            eprintln!(
                "[mlx-native adr-022 AC-4 pooled] dispatch_id_mm_pooled engaged: \
                 type={:?} n_tokens={} top_k={} k={} n={} n_experts={}",
                params.ggml_type,
                params.n_tokens,
                params.top_k,
                params.k,
                params.n,
                params.n_experts,
            );
        }
        return dispatch_id_mm_pooled(
            encoder, registry, device, input, weight, ids, output,
            scratch, params,
        );
    }

    dispatch_id_mv(encoder, registry, device, input, weight, ids, output, params)
}

/// Default `n_tokens` threshold for switching from the mv_id kernel to the
/// mm_id kernel.  The public entry points route to mm_id only when
/// `n_tokens` is strictly greater than the effective threshold.
///
/// ADR-013 P17 — bumped 8 → 32 to match llama.cpp's `ne21_mm_id_min = 32`
/// in `ggml-metal-ops.cpp:2312` (the earlier value of 8 mirrored
/// `ne11_mm_min = 8` at `ggml-metal-ops.cpp:2046`). Below 32 tokens, mv_id
/// is faster than mm_id (the mm tile-reuse setup overhead doesn't amortize
/// at small n_tokens).
///
/// The effective value can be overridden at runtime through the
/// `HF2Q_MM_ID_ROUTING_THRESHOLD` environment variable.
pub const MM_ID_ROUTING_THRESHOLD: u32 = 32;

/// Effective mm_id routing threshold, resolved once per process.
///
/// ADR-013 P19 H11 (2026-05-01) — `HF2Q_MM_ID_ROUTING_THRESHOLD` overrides
/// the compile-time `MM_ID_ROUTING_THRESHOLD` at runtime. A very large value
/// (e.g. `99999`) pushes every dispatch onto the `mv_id` route, which
/// falsifies / confirms the "mm_id setup overhead is dominant at pp ≤ 256"
/// hypothesis without recompiling. The variable is read on the first call
/// and the result is kept in a `OnceLock`; a missing or unparseable value
/// yields the compile-time const.
fn mm_id_routing_threshold() -> u32 {
    static CACHED: std::sync::OnceLock<u32> = std::sync::OnceLock::new();

    /// Reads the override from the environment, logging it when
    /// `MLX_LOG_TENSOR_PROBE` is set.
    fn resolve_from_env() -> u32 {
        let parsed = match std::env::var("HF2Q_MM_ID_ROUTING_THRESHOLD") {
            Ok(raw) => raw.parse::<u32>().ok(),
            Err(_) => None,
        };
        match parsed {
            Some(v) => {
                if std::env::var("MLX_LOG_TENSOR_PROBE").is_ok() {
                    eprintln!(
                        "[mlx-native] mm_id_routing_threshold: OVERRIDE via HF2Q_MM_ID_ROUTING_THRESHOLD={v} (default {})",
                        MM_ID_ROUTING_THRESHOLD
                    );
                }
                v
            }
            None => MM_ID_ROUTING_THRESHOLD,
        }
    }

    *CACHED.get_or_init(resolve_from_env)
}

/// Matrix-vector `_id` dispatch (decode path, unchanged from pre-Phase-3).
///
/// Picks the mv_id kernel and launch geometry for `params.ggml_type`
/// (including the env-gated NR2 variants for Q6_K and Q8_0), fills the
/// GPU params struct and records a single dispatch with a
/// `(ceil(N / align), n_tokens * top_k, 1)` threadgroup grid.
///
/// Dimensions and buffer sizes are not re-validated here; both public entry
/// points validate them before calling in.
///
/// # Errors
///
/// * `MlxError::InvalidArgument` when `params.ggml_type` has no mv_id
///   kernel (F32 / F16 / I16).  This is checked before the pipeline lookup
///   so the caller gets the descriptive error rather than a lookup failure
///   for the placeholder kernel name `"unsupported"`.
/// * Any error returned by the kernel registry's pipeline lookup.
#[allow(clippy::too_many_arguments)]
fn dispatch_id_mv(
    encoder: &mut CommandEncoder,
    registry: &mut KernelRegistry,
    device: &MlxDevice,
    input: &MlxBuffer,
    weight: &MlxBuffer,
    ids: &MlxBuffer,
    output: &MlxBuffer,
    params: &GgmlQuantizedMatmulIdParams,
) -> Result<()> {
    let total_rows = (params.n_tokens as usize) * (params.top_k as usize);

    // ADR-028 iter-321 — nr0=2 variant for q6_K _id mat-vec.  Mirrors
    // iter-309's non-_id work.
    //
    // ADR-028 iter-326 default-flipped to ON (operator REFRAME #2).
    // Opt out with `HF2Q_Q6K_ID_MV_NR2=0` / `=false` / `=off`.
    let use_q6k_id_nr2 = matches!(params.ggml_type, GgmlType::Q6_K)
        && cached_env_default_true(&CACHED_Q6K_ID_MV_NR2, "HF2Q_Q6K_ID_MV_NR2");
    // ADR-029 iter-6 — nr0=2 nsg=4 variant for q8_0 _id mat-vec.
    // Matches peer's N_R0_Q8_0=2 + N_SG_Q8_0=4 in ggml-metal-impl.h:27,40.
    // gemma4 APEX-Q5_K_M MoE down_exps is Q8_0 → 30 dispatches/decode-tok
    // on this path.  Opt-in via `HF2Q_Q8_0_ID_MV_NR2=1`; default-off
    // until coherence + bench validation.  ADR-029 iter-175 Step 1an:
    // cached via AtomicI8.
    let use_q8_0_id_nr2 = matches!(params.ggml_type, GgmlType::Q8_0)
        && cached_env_eq_one(&CACHED_Q8_0_ID_MV_NR2, "HF2Q_Q8_0_ID_MV_NR2");

    // Resolve the launch geometry first: this is also where unsupported
    // types are rejected, and it must happen before the pipeline lookup
    // (which would otherwise fail on the "unsupported" placeholder name).
    let (nth0, nth1, align) = match params.ggml_type {
        // Q4_0/Q8_0: historical (8, 8) layout = 64 threads = 2 simdgroups
        // of 32.  Tested 2026-04-26 against llama.cpp's (32, 2) layout —
        // (32, 2) gave a 1.8% short-bench improvement on dwq46 but a
        // 2.0% REGRESSION on the 5-cold-run 256-token decode bench
        // (median 108.0 vs 110.2 t/s).  Likely cache/scheduling
        // interaction with the multi-token KV-cache memory access
        // pattern that the (8, 8) layout happens to align with on M5
        // Max.  6th confirmed M5 Max static-evidence kernel hypothesis
        // falsified — Metal compiler/scheduler optimizes both layouts
        // similarly, with workload-specific edges that don't match
        // llama.cpp's tuning.
        // ADR-022: Q5_1 and IQ4_NL are 32-element legacy formats; share
        // the (8, 8) layout with Q4_0 / Q8_0. Confirmed against llama.cpp's
        // dispatch_id_mv launch geometry for `kernel_mul_mv_id_q5_1_f32`
        // and `kernel_mul_mv_id_iq4_nl_f32` (both NWG=2, NSIMDGROUP=2,
        // ngroups along K = nb/4 → 8 thread blocks of 8 rows each).
        GgmlType::Q4_0
        | GgmlType::Q8_0
        | GgmlType::Q5_1
        | GgmlType::IQ4_NL => (8u64, 8u64, 8usize),
        // Q4_K, Q5_K, and Q6_K all use the 2-row-per-threadgroup (2, 32)
        // geometry.  ADR-013 P7 — Q4_K added; mirrors Q5_K (NSG=2,
        // 1 row per simdgroup; same kmask scale-decode).
        GgmlType::Q4_K | GgmlType::Q5_K | GgmlType::Q6_K => (2u64, 32u64, 2usize),
        GgmlType::F32
        | GgmlType::F16
        | GgmlType::I16 => {
            return Err(MlxError::InvalidArgument(format!(
                "quantized_matmul_id_ggml does not support {:?}",
                params.ggml_type
            )));
        }
    };
    // ADR-028 iter-321 — nr0=2 doubles rows-per-TG to 4. Same 2 SGs × 32
    // threads, but each SG handles 2 rows so align=4.
    let align = if use_q6k_id_nr2 { 4usize } else { align };
    // ADR-029 iter-6 — Q8_0 _id NR2 NSG=4: threads_per_tg=(32, 4), align=2
    // (each TG covers NR0=2 rows; all 4 SGs collaborate on K-dim).
    // Override geometry AFTER the Q4_0/Q8_0/Q5_1/IQ4_NL match arm above.
    let (nth0, nth1, align) = if use_q8_0_id_nr2 {
        (32u64, 4u64, 2usize)
    } else {
        (nth0, nth1, align)
    };

    let kernel_name = if use_q6k_id_nr2 {
        "kernel_mul_mv_id_q6_K_f32_nr2"
    } else if use_q8_0_id_nr2 {
        "kernel_mul_mv_id_q8_0_f32_nr2"
    } else {
        params.ggml_type.id_kernel_name()
    };
    let pipeline = registry.get_pipeline(kernel_name, device.metal_device())?;

    let gpu_params = GgmlMatvecIdGpuParams {
        ne00: params.k as i64,
        ne01: params.n as i64,
        ne02: 1,
        ne10: params.k as i64,
        ne12: 1,
        ne0: params.n as i64,
        ne1: total_rows as i64,
        r2: 1,
        r3: 1,
        top_k: params.top_k,
        n_tokens: params.n_tokens,
        expert_stride: params.expert_stride as i64,
    };

    let n = params.n as usize;
    let m = total_rows;

    // Dispatch routing dim in Y, NOT Z (despite llama.cpp's mul_mv_id using
    // ne123 in z at ggml-metal-ops.cpp:2452).  Tested 2026-04-26: switching
    // to z-routing on M5 Max regressed dwq46 256-token decode from 112 t/s
    // to 90.9 t/s (-19%).  Apple GPU's threadgroup scheduler distributes
    // this dispatch shape better via y than z — 7th confirmed static-
    // evidence kernel hypothesis falsified per
    // `project_metal_compiler_auto_optimizes_static_levers.md`.
    let threadgroups = metal::MTLSize::new(
        div_ceil(n, align) as u64,
        m as u64,
        1,
    );
    let threads_per_tg = metal::MTLSize::new(nth0, nth1, 1);

    if use_q8_0_id_nr2 {
        // ADR-029 iter-6: cross-SG reduction needs threadgroup memory:
        // NR0 * NW * sizeof(float) = 2 * 32 * 4 = 256 bytes.
        let smem_bytes: u64 = 2 * 32 * std::mem::size_of::<f32>() as u64;
        encoder.encode_threadgroups_with_args_and_shared(
            pipeline,
            &[
                (0, KernelArg::Buffer(weight)),
                (1, KernelArg::Buffer(input)),
                (2, KernelArg::Buffer(output)),
                (3, KernelArg::Buffer(ids)),
                (4, KernelArg::Bytes(as_bytes(&gpu_params))),
            ],
            &[(0, smem_bytes)],
            threadgroups,
            threads_per_tg,
        );
    } else {
        encoder.encode_threadgroups_with_args(
            pipeline,
            &[
                (0, KernelArg::Buffer(weight)),
                (1, KernelArg::Buffer(input)),
                (2, KernelArg::Buffer(output)),
                (3, KernelArg::Buffer(ids)),
                (4, KernelArg::Bytes(as_bytes(&gpu_params))),
            ],
            threadgroups,
            threads_per_tg,
        );
    }

    Ok(())
}

/// ADR-029 iter-175 Step 1e — pre-bake the per-weight Q6_K `_id` NR2
/// `m=1` decode dispatch into a `DispatchRecord`.
///
/// The MoE gate_up dispatch in gemma4 APEX-Q5_K_M is Q6_K, hits ~30
/// calls/decode-tok (1 dispatch per layer, n_tokens=1, total_rows=top_k
/// folded into `threadgroups.y` per `dispatch_id_mv`'s geometry).  Same
/// rationale as `quantized_matmul_ggml::build_q6k_nr2_m1_record` — bake
/// the load-time-immutable parts of the call once at first-dispatch.
///
/// Pre-bakes:
///   - Pipeline reference (skips `KernelRegistry` HashMap lookup per call)
///   - MTLSize threadgroups + threads_per_tg (skips MTLSize::new + match)
///   - `GgmlMatvecIdGpuParams` bytes (skips struct construction + bytemuck)
///   - Binding slot order: weight=0, input=1, output=2, ids=3, params=4
///
/// Returns `None` when `HF2Q_Q6K_ID_MV_NR2` is set off — the
/// non-NR2 kernel uses a different geometry (2 SGs × 1 row/SG = align=2)
/// and a record baked for the NR2 layout would be wrong.  Callers MUST
/// fall through to the unbaked `dispatch_id_mv` path in that case.
///
/// Bake-time validation: pipeline lookup must succeed.  Threadgroup
/// geometry is hard-coded to the Q6_K_ID NR2 contract (NSG=2, nr0=2,
/// align=4 rows/TG, threads=(2, 32, 1)) — mirrors the kernel
/// `kernel_mul_mv_id_q6_K_f32_nr2`'s `constexpr int NSG = 2; constexpr
/// int nr0 = 2;` at `quantized_matmul_id_ggml.metal:1025-1026`.
///
/// `top_k` and `expert_stride` are weight-specific and folded into the
/// baked `GgmlMatvecIdGpuParams.ne1 = top_k` (since `n_tokens=1` at
/// decode) + `expert_stride`.  Callers must pass the same `top_k` and
/// `expert_stride` they would have used in `GgmlQuantizedMatmulIdParams`.
pub fn build_q6k_id_nr2_m1_record(
    registry: &mut KernelRegistry,
    device: &metal::DeviceRef,
    n: u32,
    k: u32,
    top_k: u32,
    expert_stride: u64,
) -> Result<Option<DispatchRecord>> {
    // Only bakeable when the NR2 variant is the selected one — same
    // gate as `dispatch_id_mv`'s `use_q6k_id_nr2` branch.
    if !cached_env_default_true(&CACHED_Q6K_ID_MV_NR2, "HF2Q_Q6K_ID_MV_NR2") {
        return Ok(None);
    }

    // Pipeline lookup — `kernel_mul_mv_id_q6_K_f32_nr2` takes no
    // function constants (unlike the non-_id Q6_K NR2 which carries
    // 700/701/702 for ne12/r2/r3 promotion).
    let pipeline = registry
        .get_pipeline("kernel_mul_mv_id_q6_K_f32_nr2", device)?
        .clone();

    // GgmlMatvecIdGpuParams for n_tokens=1.  total_rows = 1 * top_k.
    let gpu_params = GgmlMatvecIdGpuParams {
        ne00: k as i64,
        ne01: n as i64,
        ne02: 1,
        ne10: k as i64,
        ne12: 1,
        ne0: n as i64,
        ne1: top_k as i64,
        r2: 1,
        r3: 1,
        top_k,
        n_tokens: 1,
        expert_stride: expert_stride as i64,
    };
    let params_bytes = as_bytes(&gpu_params).to_vec();

    // Q6_K_ID NR2: align=4 rows per TG, threads = (nth0=2, nth1=32, 1)
    // (matches `dispatch_id_mv`'s Q6_K NR2 branch override).
    const ALIGN: u32 = 4;
    let threadgroups = metal::MTLSize::new(
        div_ceil(n as usize, ALIGN as usize) as u64,
        top_k as u64,
        1,
    );
    let threads_per_tg = metal::MTLSize::new(2, 32, 1);

    Ok(Some(DispatchRecord {
        pipeline,
        threadgroups,
        threads_per_tg,
        threadgroup_mem: Vec::new(), // Q6_K_ID NR2 doesn't use shmem
        params_bytes,
        params_slot: 4,
        buffer_slots: vec![0, 1, 2, 3], // weight, input, output, ids
        op_kind: CapturedOpKind::Other,
        kernel_name: "kernel_mul_mv_id_q6_K_f32_nr2".to_string(),
    }))
}

/// Pre-bake the decode-time dispatch of the regular (non-NR2) Q8_0 `_id`
/// mat-vec kernel into a reusable `DispatchRecord` (ADR-029 iter-175
/// Step 1e2).
///
/// In gemma4 APEX-Q5_K_M the MoE down projection is Q8_0: one down dispatch
/// per layer × 30 layers, i.e. ~30 calls per decoded token.  Unlike the
/// gate_up path, the down call site hands `quantized_matmul_id_ggml`
/// `n_tokens = real_top_k` and `top_k = 1`, so the kernel sees
/// `total_rows = real_top_k`; that value lands in `threadgroups.y` and in
/// `params.ne1`.  `real_top_k` is fixed by the model config during decode,
/// which is what makes the bake valid.
///
/// The record captures:
///   - the `kernel_mul_mv_id_q8_0_f32` pipeline (regular variant, no
///     function constants)
///   - threadgroups `(div_ceil(n, 8), real_top_k, 1)`
///   - threads per threadgroup `(8, 8, 1)` (N_DST=4 × N_SIMDGROUP=2
///     contract; 2 SGs × 32 threads = 64 threads/TG; 8 rows/TG)
///   - the serialized `GgmlMatvecIdGpuParams`
///     (`n_tokens = real_top_k`, `top_k = 1`, `ne1 = real_top_k`)
///   - binding slots: weight=0, input=1, output=2, ids=3, params=4
///
/// The geometry is hard-coded to the regular Q8_0_ID contract (align=8),
/// mirroring the `kernel_mul_mv_id_q8_0_f32` constants at
/// `quantized_matmul_id_ggml.metal:460-462` and the `dispatch_id_mv`
/// `(nth0=8, nth1=8, align=8)` branch.
///
/// # Arguments
///
/// * `registry`      -- Kernel registry used for the pipeline lookup.
/// * `device`        -- Metal device the pipeline is resolved against.
/// * `n`             -- Output columns per expert (weight rows).
/// * `k`             -- Input dimension.
/// * `real_top_k`    -- The model's top-k; must equal the value folded into
///                      `params.n_tokens` at the down call site.
/// * `expert_stride` -- Byte stride between expert slabs; must equal the
///                      value that would be used in
///                      `GgmlQuantizedMatmulIdParams`.
///
/// # Returns
///
/// `Ok(None)` when `HF2Q_Q8_0_ID_MV_NR2` is on: the opt-in NR2 kernel uses a
/// different geometry (threads=(32, 4, 1), align=2, shmem=256 bytes), so a
/// record baked for the regular layout would be wrong.  Callers MUST then
/// fall through to the unbaked `dispatch_id_mv` path.
///
/// # Errors
///
/// Propagates the registry error when the pipeline lookup fails; that lookup
/// is the only bake-time validation.
pub fn build_q8_0_id_decode_record(
    registry: &mut KernelRegistry,
    device: &metal::DeviceRef,
    n: u32,
    k: u32,
    real_top_k: u32,
    expert_stride: u64,
) -> Result<Option<DispatchRecord>> {
    // Regular Q8_0_ID contract: 8 output rows per threadgroup, 8x8 threads.
    const KERNEL: &str = "kernel_mul_mv_id_q8_0_f32";
    const ROWS_PER_TG: usize = 8;

    // The NR2 variant is the negation of what this record describes (mirrors
    // `dispatch_id_mv`'s `use_q8_0_id_nr2` branch), so refuse to bake.
    let nr2_selected = cached_env_eq_one(&CACHED_Q8_0_ID_MV_NR2, "HF2Q_Q8_0_ID_MV_NR2");
    if nr2_selected {
        return Ok(None);
    }

    // The regular kernel takes no function constants.
    let pipeline = registry.get_pipeline(KERNEL, device)?.clone();

    // Down-dispatch shape: n_tokens = real_top_k, top_k = 1, hence
    // total_rows = n_tokens * top_k = real_top_k.
    let (depth, cols, rows) = (k as i64, n as i64, real_top_k as i64);
    let baked_params = GgmlMatvecIdGpuParams {
        ne00: depth,
        ne01: cols,
        ne02: 1,
        ne10: depth,
        ne12: 1,
        ne0: cols,
        ne1: rows,
        r2: 1,
        r3: 1,
        top_k: 1,
        n_tokens: real_top_k,
        expert_stride: expert_stride as i64,
    };

    let grid = metal::MTLSize::new(
        div_ceil(n as usize, ROWS_PER_TG) as u64,
        real_top_k as u64,
        1,
    );

    Ok(Some(DispatchRecord {
        pipeline,
        threadgroups: grid,
        threads_per_tg: metal::MTLSize::new(8, 8, 1),
        // The regular Q8_0_ID kernel uses no threadgroup memory.
        threadgroup_mem: Vec::new(),
        params_bytes: as_bytes(&baked_params).to_vec(),
        params_slot: 4,
        // weight, input, output, ids
        buffer_slots: vec![0, 1, 2, 3],
        op_kind: CapturedOpKind::Other,
        kernel_name: KERNEL.to_string(),
    }))
}

/// Fused SwiGLU + expert-routed Q4_0 mat-vec.
///
/// Computes `output[r][n] = sum_k(dequant(W_q4_0[ids[r]][n][k]) * (silu(gate[r][k]) * up[r][k]))`
/// in a single dispatch — replaces the `silu_mul + quantized_matmul_id_ggml`
/// sequence that hf2q's MoE FFN decode path used (Phase D + Phase E in
/// `gpu_ffn.rs:build_moe_ffn_layer_gpu_q_into`).
///
/// Saves one dispatch + one memory_barrier per MoE layer × 40 layers per
/// decode token, targeting the dispatch-count component of the dwq46
/// 0.93× decode parity gap (per the `MLX_PROFILE_CB=1` per-cb breakdown
/// in ADR-012 §Optimize / Task #15).
///
/// Currently supports `GgmlType::Q4_0` only (the dominant expert-down
/// quant type in dwq46).  Q8_0 / Q6_K support is straightforward to add
/// by templating the inner dot-product over the dequant kernel; not yet
/// implemented because dwq46's expert_down is 95% Q4_0.
///
/// # Arguments
///
/// * `encoder`  -- Command encoder.
/// * `registry` -- Kernel registry.
/// * `device`   -- Metal device.
/// * `gate`     -- f32 gate input `[n_tokens*top_k, K]`.
/// * `up`       -- f32 up input `[n_tokens*top_k, K]`.
/// * `weight`   -- Q4_0 expert weight stack `[n_experts, N, packed_K]`.
/// * `ids`      -- u32 expert index buffer `[n_tokens*top_k]`.
/// * `output`   -- f32 output `[n_tokens*top_k, N]`.
/// * `params`   -- Same params struct as `quantized_matmul_id_ggml` (must have ggml_type=Q4_0).
///
/// # Errors
///
/// Returns `MlxError::InvalidArgument` if `params.ggml_type` is not `Q4_0`,
/// if any of `n_tokens`, `top_k`, `K`, `N` is zero, if `K` is not a multiple
/// of the Q4_0 block size, or if `gate`, `up`, `ids` or `output` is smaller
/// than the shape implied by `params`.  Pipeline lookup failures are
/// propagated from the registry.
///
/// NOTE(review): the `weight` buffer size and `params.expert_stride` are not
/// validated here — confirm the call site guarantees them.
#[allow(clippy::too_many_arguments)]
pub fn quantized_matmul_id_swiglu_q4_0(
    encoder: &mut CommandEncoder,
    registry: &mut KernelRegistry,
    device: &MlxDevice,
    gate: &MlxBuffer,
    up: &MlxBuffer,
    weight: &MlxBuffer,
    ids: &MlxBuffer,
    output: &MlxBuffer,
    params: &GgmlQuantizedMatmulIdParams,
) -> Result<()> {
    if params.ggml_type != GgmlType::Q4_0 {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_swiglu_q4_0: expected Q4_0, got {:?}",
            params.ggml_type
        )));
    }
    let qk = GgmlType::Q4_0.block_values();

    // --- Validate dimensions (mirror quantized_matmul_id_ggml) ---
    if params.n_tokens == 0 || params.k == 0 || params.n == 0 {
        return Err(MlxError::InvalidArgument(
            "quantized_matmul_id_swiglu_q4_0: n_tokens, K, and N must all be > 0".into(),
        ));
    }
    if params.top_k == 0 {
        return Err(MlxError::InvalidArgument(
            "quantized_matmul_id_swiglu_q4_0: top_k must be > 0".into(),
        ));
    }
    if params.k % qk != 0 {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_swiglu_q4_0: K ({}) must be divisible by block QK ({})",
            params.k, qk
        )));
    }

    // --- Validate buffer sizes against the [total_rows, *] shapes ---
    let total_rows = (params.n_tokens as usize) * (params.top_k as usize);
    let expected_in_bytes = total_rows * (params.k as usize) * DType::F32.size_of();
    if gate.byte_len() < expected_in_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_swiglu_q4_0: gate buffer too small: expected {} bytes, got {}",
            expected_in_bytes, gate.byte_len()
        )));
    }
    if up.byte_len() < expected_in_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_swiglu_q4_0: up buffer too small: expected {} bytes, got {}",
            expected_in_bytes, up.byte_len()
        )));
    }
    let expected_out_bytes = total_rows * (params.n as usize) * DType::F32.size_of();
    if output.byte_len() < expected_out_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_swiglu_q4_0: output buffer too small: expected {} bytes, got {}",
            expected_out_bytes, output.byte_len()
        )));
    }
    // One u32 expert id is read per output row; an undersized ids buffer
    // would otherwise only surface as an out-of-bounds read on the GPU.
    let expected_ids_bytes = total_rows * DType::U32.size_of();
    if ids.byte_len() < expected_ids_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "quantized_matmul_id_swiglu_q4_0: ids buffer too small: expected {} bytes, got {}",
            expected_ids_bytes, ids.byte_len()
        )));
    }

    let pipeline = registry.get_pipeline(
        "kernel_mul_mv_id_q4_0_f32_swiglu",
        device.metal_device(),
    )?;

    let gpu_params = GgmlMatvecIdGpuParams {
        ne00: params.k as i64,
        ne01: params.n as i64,
        ne02: 1,
        ne10: params.k as i64,
        ne12: 1,
        ne0: params.n as i64,
        ne1: total_rows as i64,
        r2: 1,
        r3: 1,
        top_k: params.top_k,
        n_tokens: params.n_tokens,
        expert_stride: params.expert_stride as i64,
    };

    // Same (8, 8) geometry as the unfused Q4_0 mv_id kernel: x covers the
    // output columns in groups of `align`, y covers one (token, slot) row
    // per threadgroup.
    let (nth0, nth1, align) = (8u64, 8u64, 8usize);
    let n = params.n as usize;
    let m = total_rows;
    let threadgroups = metal::MTLSize::new(
        div_ceil(n, align) as u64,
        m as u64,
        1,
    );
    let threads_per_tg = metal::MTLSize::new(nth0, nth1, 1);

    // Binding order for the fused kernel: weight, gate, up, output, ids,
    // then the params struct as inline bytes.
    encoder.encode_threadgroups_with_args(
        pipeline,
        &[
            (0, KernelArg::Buffer(weight)),
            (1, KernelArg::Buffer(gate)),
            (2, KernelArg::Buffer(up)),
            (3, KernelArg::Buffer(output)),
            (4, KernelArg::Buffer(ids)),
            (5, KernelArg::Bytes(as_bytes(&gpu_params))),
        ],
        threadgroups,
        threads_per_tg,
    );

    Ok(())
}

/// Caller-owned scratch for the `_id` mm path's map0 stage.
///
/// Holds the two small buffers map0 writes and mm_id reads
/// (`htpe`: `[n_experts]` per-expert routed-token count, `hids`:
/// `[n_experts, n_tokens]` per-expert routed-token list).  Passing one
/// instance through every mm_id call in a prefill amortises what would
/// otherwise be two Metal allocations per MoE layer — ~60 allocations
/// per Gemma 4 prefill.
///
/// Size the scratch for the largest `(n_experts, n_tokens)` pair the
/// session will dispatch; callers use `IdMmScratch::alloc(dev, n_experts,
/// max_n_tokens)` once at prefill start.  Smaller subsequent dispatches
/// reuse the same buffers (kernel only touches the first
/// `n_experts * n_tokens` u32s).
pub struct IdMmScratch {
    /// Per-expert routed-token counts, `[n_experts]` u32.
    pub htpe: MlxBuffer,
    /// Per-expert routed-token lists, `[n_experts, max_n_tokens]` u32.
    pub hids: MlxBuffer,
    // Capacities recorded at allocation time; `check_capacity` compares
    // each dispatch's `n_experts` / `n_tokens` against them.
    n_experts_cap: u32,
    n_tokens_cap: u32,
}

impl IdMmScratch {
    /// Allocate both map0 scratch buffers: `htpe` with `n_experts` u32s and
    /// `hids` with `n_experts * max_n_tokens` u32s.
    ///
    /// The requested sizes are remembered as the capacity that later
    /// dispatches are checked against.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `MlxDevice::alloc_buffer`.
    pub fn alloc(
        device: &MlxDevice,
        n_experts: u32,
        max_n_tokens: u32,
    ) -> Result<Self> {
        let experts = n_experts as usize;
        let tokens = max_n_tokens as usize;

        let htpe_len = experts * DType::U32.size_of();
        let htpe = device.alloc_buffer(htpe_len, DType::U32, vec![experts])?;

        let hids_len = experts * tokens * DType::U32.size_of();
        let hids = device.alloc_buffer(hids_len, DType::U32, vec![experts, tokens])?;

        Ok(Self {
            htpe,
            hids,
            n_experts_cap: n_experts,
            n_tokens_cap: max_n_tokens,
        })
    }

    /// Reject a dispatch whose `n_experts` or `n_tokens` exceeds what this
    /// scratch was allocated for.  `n_experts` is checked first.
    fn check_capacity(&self, n_experts: u32, n_tokens: u32) -> Result<()> {
        let limits = [
            ("n_experts", n_experts, self.n_experts_cap),
            ("n_tokens", n_tokens, self.n_tokens_cap),
        ];
        for (label, requested, cap) in limits {
            if requested > cap {
                return Err(MlxError::InvalidArgument(format!(
                    "IdMmScratch: {} ({}) > cap ({})",
                    label, requested, cap,
                )));
            }
        }
        Ok(())
    }
}

/// Matrix-matrix `_id` dispatch that reuses caller-owned scratch buffers
/// (ADR-011 Phase 3 Wave P3b).
///
/// Checks that `scratch` is large enough for `params.n_experts` ×
/// `params.n_tokens`, then forwards to `dispatch_id_mm_for_test` with the
/// scratch's `htpe` / `hids` buffers.
///
/// # Errors
///
/// Returns `MlxError::InvalidArgument` when the scratch capacity is
/// exceeded; otherwise propagates whatever `dispatch_id_mm_for_test`
/// returns.
#[allow(clippy::too_many_arguments)]
fn dispatch_id_mm_pooled(
    encoder: &mut CommandEncoder,
    registry: &mut KernelRegistry,
    device: &MlxDevice,
    input: &MlxBuffer,
    weight: &MlxBuffer,
    ids: &MlxBuffer,
    output: &MlxBuffer,
    scratch: &mut IdMmScratch,
    params: &GgmlQuantizedMatmulIdParams,
) -> Result<()> {
    scratch.check_capacity(params.n_experts, params.n_tokens)?;

    // Field-for-field copy into the mm-specific parameter type; keeping the
    // types separate stops the public mv params from growing mm-only fields.
    let mm_view = GgmlIdMmDispatchParams {
        ggml_type: params.ggml_type,
        expert_stride: params.expert_stride,
        n_experts: params.n_experts,
        k: params.k,
        n: params.n,
        top_k: params.top_k,
        n_tokens: params.n_tokens,
    };

    // The callee only needs shared access to the scratch buffers.
    dispatch_id_mm_for_test(
        encoder,
        registry,
        device,
        input,
        weight,
        ids,
        &scratch.htpe,
        &scratch.hids,
        output,
        &mm_view,
    )
}

/// Matrix-matrix `_id` dispatch that allocates fresh scratch on each call.
///
/// Kept for the auto-allocating `quantized_matmul_id_ggml` entry point
/// (tests, non-prefill callers); batched prefill should prefer the pooled
/// entry point `quantized_matmul_id_ggml_pooled`, which reuses one
/// `IdMmScratch`.
///
/// NOTE(review): the scratch buffers go out of scope as soon as encoding
/// finishes; this presumably relies on the encoder / command buffer keeping
/// bound buffers alive until the GPU work completes — confirm.
///
/// # Errors
///
/// Propagates scratch allocation failures and any error from
/// `dispatch_id_mm_pooled`.
#[allow(clippy::too_many_arguments)]
fn dispatch_id_mm(
    encoder: &mut CommandEncoder,
    registry: &mut KernelRegistry,
    device: &MlxDevice,
    input: &MlxBuffer,
    weight: &MlxBuffer,
    ids: &MlxBuffer,
    output: &MlxBuffer,
    params: &GgmlQuantizedMatmulIdParams,
) -> Result<()> {
    // Scratch sized exactly for this dispatch's (n_experts, n_tokens).
    IdMmScratch::alloc(device, params.n_experts, params.n_tokens).and_then(|mut scratch| {
        dispatch_id_mm_pooled(
            encoder,
            registry,
            device,
            input,
            weight,
            ids,
            output,
            &mut scratch,
            params,
        )
    })
}

/// Ceiling division: the smallest `q` such that `q * b >= a`.
///
/// Computed as quotient plus a remainder correction rather than
/// `(a + b - 1) / b`, so it cannot overflow when `a` is close to
/// `usize::MAX`.
///
/// # Panics
///
/// Panics if `b == 0` (division by zero).
fn div_ceil(a: usize, b: usize) -> usize {
    a / b + usize::from(a % b != 0)
}

// ============================================================================
// ADR-011 Phase 3 Wave P3a: `_id` matrix-matrix (mm) path.
//
// Ports llama.cpp's `kernel_mul_mm_id_map0_ne20_<N>` + `kernel_mul_mm_id_<q>_f32`
// two-stage dispatch.  Used for MoE projections at prefill — instead of
// re-reading each expert's weight blocks once per routed (token, slot) pair,
// the mm kernel stages a 64x32 expert weight tile into threadgroup shared
// memory and reuses it across a 32-row block of the expert's routed tokens.
//
// The preprocessing step (`map0`) is what lets mm work for MoE: it
// regroups the flat `[n_tokens, top_k]` ids table into per-expert routed
// token lists so each mm tile is homogeneous in its choice of expert
// weight slab.  Without map0, consecutive M-rows in a tile could route to
// different experts, defeating weight reuse.
//
// Same staging strategy as Wave P3a Commit 1 (non-id): the kernel landed
// first with tests calling `dispatch_id_mm_for_test` directly, before the
// public `quantized_matmul_id_ggml` dispatcher was rerouted.  The
// `dispatch_id_mm` / `dispatch_id_mm_pooled` wrappers earlier in this file
// forward to that same function.
// ============================================================================

/// Host-side params for the `_id` mm path's `map0` preprocessor.
///
/// Matches `GgmlMatmulIdMm_Map0Params` in
/// `/opt/mlx-native/src/shaders/quantized_matmul_id_mm.metal`.
///
/// Layout: the `i32` fields are declared in pairs ahead of each `u64`, so
/// under `#[repr(C)]` every `u64` already sits on an 8-byte boundary and
/// the struct is 40 bytes with no padding fields (the `bytemuck::Pod`
/// derive rejects implicit padding).  Keep the field order in sync with
/// the shader-side struct.
#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct GgmlIdMmMap0GpuParams {
    ne10: i32,       // kept for struct symmetry (noted as kernel-unused); the dispatcher writes N here
    ne11: i32,       // n_expert_used (bcast, == ne20)
    nb11: u64,       // unused; the dispatcher writes 0
    nb12: u64,       // unused; the dispatcher writes 0
    ne21: i32,       // n_tokens
    ne20: i32,       // n_expert_used (top_k)
    nb21: u64,       // bytes per token in the ids table (= ne20 * sizeof(i32))
}

/// Host-side params for the `_id` mm kernel.
///
/// Matches `GgmlMatmulIdMm_MmParams` in
/// `/opt/mlx-native/src/shaders/quantized_matmul_id_mm.metal`.
///
/// Layout: `#[repr(C)]`, 96 bytes.  `_pad0` and `_pad1` are explicit
/// padding so every `u64` is 8-byte aligned and the total size is a
/// multiple of 8 without implicit padding.  The per-field notes below
/// describe the values `dispatch_id_mm_for_test` writes.
#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct GgmlIdMmMmGpuParams {
    ne00: i32,   // K
    ne02: i32,   // n_experts
    nb01: u64,   // bytes per weight row (within one expert's slab)
    nb02: u64,   // byte stride between expert weight slabs (= expert_stride, >= nb01 * N)
    nb03: u64,   // written as 0
    ne11: i32,   // n_expert_used (bcast)
    _pad0: u32,  // explicit padding: aligns nb10 to 8 bytes
    nb10: u64,   // = sizeof(float)
    nb11: u64,   // per-slot input stride; 0 because the input has no slot dimension
    nb12: u64,   // per-token input stride (= K * 4)
    nb13: u64,   // written as 0
    ne20: i32,   // n_expert_used (top_k)
    ne21: i32,   // n_tokens
    ne0: i32,    // N (per-expert output rows)
    ne1: i32,    // batch stride (== ne20 for our packed layout)
    r2: i16,     // written as 1
    r3: i16,     // written as 1
    _pad1: u32,  // explicit trailing padding: rounds the struct up to 96 bytes
}

/// Parameters for the `_id` mm dispatch (scratch-buffer sized view).
///
/// Carries the same shape information as the mv-path params; it also
/// provides the `htpe` / `hids` scratch sizes via `htpe_bytes` and
/// `hids_bytes`.  `dispatch_id_mm_for_test` rejects zero dimensions, a `k`
/// that is not a multiple of the quant block size, and any `top_k` other
/// than 1 or 8.
#[derive(Debug, Clone, Copy)]
pub struct GgmlIdMmDispatchParams {
    /// Number of input tokens.
    pub n_tokens: u32,
    /// Number of experts each token is routed to (top-k).
    pub top_k: u32,
    /// Number of output columns per expert (weight rows).
    pub n: u32,
    /// Input dimension (weight cols before quantization).
    pub k: u32,
    /// Total experts in the stacked weight buffer.
    pub n_experts: u32,
    /// Byte stride between expert weight slices in the stacked buffer.
    /// Must be at least the packed size of one expert's weights.
    pub expert_stride: u64,
    /// GGML quantization type.
    pub ggml_type: GgmlType,
}

impl GgmlIdMmDispatchParams {
    /// Minimum size in bytes of the `htpe` scratch buffer: one u32 routed
    /// count per expert.
    pub fn htpe_bytes(&self) -> usize {
        let counters = self.n_experts as usize;
        counters * DType::U32.size_of()
    }

    /// Minimum size in bytes of the `hids` scratch buffer: a row-major
    /// `[n_experts, n_tokens]` table of 32-bit routed-token entries.
    pub fn hids_bytes(&self) -> usize {
        let entries = (self.n_experts as usize) * (self.n_tokens as usize);
        entries * DType::U32.size_of()
    }
}

/// Test-only helper: force the `_id` mm two-stage dispatch path.
///
/// Runs `kernel_mul_mm_id_map0_ne20_<top_k>` followed by
/// `kernel_mul_mm_id_<qtype>_f32`.
///
/// Input:
///   * `input`   — f32 input rows `[n_tokens, K]`.
///   * `weight`  — stacked expert weights `[n_experts, N, packed_K]`.
///   * `ids`     — flat expert-id table `[n_tokens, top_k]` viewed as
///                 i32 (u32 is byte-equivalent in this range).
///   * `output`  — f32 output `[n_tokens, top_k, N]` row-major.
///
/// Scratch (caller-allocated, zero-init not required):
///   * `htpe`    — `[n_experts]` u32 (per-expert count).
///   * `hids`    — `[n_experts, n_tokens]` i32 (per-expert routed list).
///
/// Not intended for production callers — the public `quantized_matmul_id_ggml`
/// entry point stays on the mv path until the follow-up commit wires the
/// m > 8 threshold.
#[doc(hidden)]
#[allow(clippy::too_many_arguments)]
pub fn dispatch_id_mm_for_test(
    encoder: &mut CommandEncoder,
    registry: &mut KernelRegistry,
    device: &MlxDevice,
    input: &MlxBuffer,
    weight: &MlxBuffer,
    ids: &MlxBuffer,
    htpe: &MlxBuffer,
    hids: &MlxBuffer,
    output: &MlxBuffer,
    params: &GgmlIdMmDispatchParams,
) -> Result<()> {
    let qk = params.ggml_type.block_values();

    // ---- Validate common shapes ----
    // ADR-013 P16 — Q4_K added. ADR-022 P1.6 — Q5_1 / IQ4_NL added.
    // ADR-022 Phase 2 — Q5_K added.
    match params.ggml_type {
        GgmlType::Q4_0
        | GgmlType::Q8_0
        | GgmlType::Q4_K
        | GgmlType::Q5_K
        | GgmlType::Q6_K
        | GgmlType::Q5_1
        | GgmlType::IQ4_NL => {}
        other => {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_id_mm_for_test does not support {:?}", other
            )));
        }
    }
    if params.n_tokens == 0 || params.k == 0 || params.n == 0
        || params.top_k == 0 || params.n_experts == 0
    {
        return Err(MlxError::InvalidArgument(
            "n_tokens, K, N, top_k, n_experts must all be > 0".into(),
        ));
    }
    if params.k % qk != 0 {
        return Err(MlxError::InvalidArgument(format!(
            "K ({}) must be divisible by block QK ({})", params.k, qk
        )));
    }

    // Match the top_k template instantiations available in the shader.
    // P3b-tensor.2 — added ne20_1 alongside ne20_8 (Gemma 4 MoE down).
    if params.top_k != 1 && params.top_k != 8 {
        return Err(MlxError::InvalidArgument(format!(
            "dispatch_id_mm_for_test: top_k {} has no map0 instantiation (need 1 or 8)",
            params.top_k
        )));
    }

    let blocks_per_row = params.k / qk;
    let block_bytes = params.ggml_type.block_bytes();
    let per_expert_bytes =
        (params.n as usize) * (blocks_per_row as usize) * (block_bytes as usize);

    if (params.expert_stride as usize) < per_expert_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "expert_stride ({}) < per_expert_bytes ({})",
            params.expert_stride, per_expert_bytes
        )));
    }

    if weight.byte_len() < per_expert_bytes * params.n_experts as usize {
        return Err(MlxError::InvalidArgument(
            "dispatch_id_mm_for_test: weight buffer too small".into(),
        ));
    }
    if input.byte_len()
        < (params.n_tokens as usize) * (params.k as usize) * DType::F32.size_of()
    {
        return Err(MlxError::InvalidArgument(
            "dispatch_id_mm_for_test: input buffer too small".into(),
        ));
    }
    let total_rows = (params.n_tokens as usize) * (params.top_k as usize);
    if ids.byte_len() < total_rows * DType::U32.size_of() {
        return Err(MlxError::InvalidArgument(
            "dispatch_id_mm_for_test: ids buffer too small".into(),
        ));
    }
    if output.byte_len() < total_rows * (params.n as usize) * DType::F32.size_of() {
        return Err(MlxError::InvalidArgument(
            "dispatch_id_mm_for_test: output buffer too small".into(),
        ));
    }
    if htpe.byte_len() < params.htpe_bytes() {
        return Err(MlxError::InvalidArgument(
            "dispatch_id_mm_for_test: htpe buffer too small".into(),
        ));
    }
    if hids.byte_len() < params.hids_bytes() {
        return Err(MlxError::InvalidArgument(
            "dispatch_id_mm_for_test: hids buffer too small".into(),
        ));
    }

    // ---- Stage 1: map0 — build per-expert routed-token lists ----
    //
    // Dispatch: 1 threadgroup of `n_experts` threads.  Shared memory:
    // `n_experts * top_k * sizeof(uint16)` staging area.
    //
    // ADR-011 Phase 3 Wave P3b-tensor.2 — pick the map0 instantiation
    // whose `ne20` template arg matches our top_k.  Gemma 4 needs both:
    // `ne20_8` for the gate_up call (top_k=8) and `ne20_1` for the
    // down call (top_k=1, where each output row routes to a single
    // expert).  Without ne20_1 the top_k=1 caller falls back to mv_id
    // and re-reads each expert's weights once per (seq_len*top_k) row —
    // ~50% of prefill time wasted on weight re-reads.
    let map0_kernel_name = match params.top_k {
        1 => "kernel_mul_mm_id_map0_ne20_1",
        8 => "kernel_mul_mm_id_map0_ne20_8",
        other => return Err(MlxError::InvalidArgument(format!(
            "dispatch_id_mm_for_test: no map0 instantiation for top_k={}",
            other
        ))),
    };
    let map0_pipeline = registry.get_pipeline(map0_kernel_name, device.metal_device())?;

    let map0_params = GgmlIdMmMap0GpuParams {
        ne10: params.n.try_into().map_err(|_| {
            MlxError::InvalidArgument("N out of i32 range".into())
        })?,
        ne11: params.top_k as i32,
        nb11: 0,
        nb12: 0,
        ne21: params.n_tokens as i32,
        ne20: params.top_k as i32,
        nb21: (params.top_k as u64) * (DType::U32.size_of() as u64),
    };

    let map0_shmem =
        (params.n_experts as u64) * (params.top_k as u64) * std::mem::size_of::<u16>() as u64;
    let map0_threadgroups = metal::MTLSize::new(1, 1, 1);
    let map0_threads = metal::MTLSize::new(params.n_experts as u64, 1, 1);

    encoder.encode_threadgroups_with_args_and_shared(
        map0_pipeline,
        &[
            (0, KernelArg::Bytes(as_bytes(&map0_params))),
            (1, KernelArg::Buffer(ids)),
            (2, KernelArg::Buffer(htpe)),
            (3, KernelArg::Buffer(hids)),
        ],
        &[(0, map0_shmem)],
        map0_threadgroups,
        map0_threads,
    );

    // Memory barrier: the mm kernel reads htpe + hids, map0 wrote them.
    // Without this, Metal's concurrent-dispatch compute encoder lets the
    // two dispatches overlap — mm would read zeros (all-expert early-exit).
    // llama.cpp does the same via `ggml_metal_op_concurrency_reset`
    // (ggml-metal-ops.cpp:2353).
    encoder.memory_barrier();

    // ---- Stage 2: mm_id — matmul with the per-expert lists ----
    //
    // ADR-011 Phase 3 Wave P3b-tensor — prefer the tensor_ops::matmul2d
    // mm_id variant on M3+.  The probe caches the decision after the
    // first dispatch; subsequent calls are branch-free.
    let use_tensor = probe_tensor_mm_id(registry, device);
    let mm_kernel_name = if use_tensor {
        params.ggml_type.id_mm_tensor_kernel_name()
    } else {
        params.ggml_type.id_mm_kernel_name()
    };
    let mm_pipeline = registry.get_pipeline(mm_kernel_name, device.metal_device())?;

    let nb01 = (blocks_per_row as u64) * (block_bytes as u64);
    let row_bytes = (params.k as u64) * (DType::F32.size_of() as u64);

    // Input layout: `[n_tokens, K]` f32 flat.  There is ONE input row per
    // token (shared across all top_k slots), so:
    //   * nb11 (slot stride) = 0 — the kernel advances by `i11 * nb11`
    //     inside the K loop; zero means every slot reads the same token row.
    //   * nb12 (token stride) = K * 4.
    //
    // This differs from llama.cpp's upstream MUL_MAT_ID where `src1` has
    // shape `[K, n_expert_used, n_tokens]` (pre-replicated per slot),
    // making `nb11 = K * 4` and `nb12 = top_k * K * 4` there.  Our mv_id
    // port uses the flat `[n_tokens, K]` layout and so does mm_id.
    let mm_params = GgmlIdMmMmGpuParams {
        ne00: params.k as i32,
        ne02: params.n_experts as i32,
        nb01,
        nb02: params.expert_stride,
        nb03: 0,
        ne11: params.top_k as i32,
        _pad0: 0,
        nb10: DType::F32.size_of() as u64,
        nb11: 0,             // no slot dim in our input
        nb12: row_bytes,     // per-token stride
        nb13: 0,
        ne20: params.top_k as i32,
        ne21: params.n_tokens as i32,
        ne0: params.n as i32,
        ne1: params.top_k as i32,
        r2: 1,
        r3: 1,
        _pad1: 0,
    };

    const NR0: u64 = 64;
    const NR1: u64 = 32;
    const THREADS_PER_TG: u64 = 128;

    let mm_threadgroups = metal::MTLSize::new(
        (params.n_tokens as u64 + NR1 - 1) / NR1,
        (params.n as u64 + NR0 - 1) / NR0,
        params.n_experts as u64,
    );
    let mm_threads = metal::MTLSize::new(THREADS_PER_TG, 1, 1);

    const MM_SHMEM_BYTES: u64 = 8192;

    encoder.encode_threadgroups_with_args_and_shared(
        mm_pipeline,
        &[
            (0, KernelArg::Bytes(as_bytes(&mm_params))),
            (1, KernelArg::Buffer(weight)),
            (2, KernelArg::Buffer(input)),
            (3, KernelArg::Buffer(htpe)),
            (4, KernelArg::Buffer(hids)),
            (5, KernelArg::Buffer(output)),
        ],
        &[(0, MM_SHMEM_BYTES)],
        mm_threadgroups,
        mm_threads,
    );

    Ok(())
}