mlx-native 0.8.1

//! Flash-attention-style tiled prefill kernel — host dispatch.
//!
//! mlx-native's batched-prefill SDPA kernel, the prefill counterpart to
//! `flash_attn_vec` (which handles the seq_len=1 decode case).  Implements
//! online softmax + simdgroup MMA tiling on Apple GPU.
//!
//! ## Kernel variants registered
//!
//! Twelve entry points are registered, all backed by the single
//! `flash_attn_prefill.metal` shader source:
//!
//! ### D=64 (BQ=32, BK=16, WM=4, WN=1 — 128 threads/threadgroup, BERT family)
//!
//! | Kernel name | I/O dtype | Mask kind |
//! |---|---|---|
//! | `flash_attn_prefill_bf16_d64`            | bf16 | bf16 additive |
//! | `flash_attn_prefill_bf16_d64_boolmask`   | bf16 | bool |
//! | `flash_attn_prefill_f16_d64`             | f16  | f16 additive |
//! | `flash_attn_prefill_f16_d64_boolmask`    | f16  | bool |
//!
//! ### D=256 (BQ=32, BK=16, WM=4, WN=1 — 128 threads/threadgroup)
//!
//! | Kernel name | I/O dtype | Mask kind |
//! |---|---|---|
//! | `flash_attn_prefill_bf16_d256`           | bf16 | bf16 additive (log-domain) |
//! | `flash_attn_prefill_bf16_d256_boolmask`  | bf16 | bool (`is_attended`) |
//! | `flash_attn_prefill_f16_d256`            | f16  | f16 additive |
//! | `flash_attn_prefill_f16_d256_boolmask`   | f16  | bool |
//!
//! ### D=512 (BQ=8, BK=8, WM=1, WN=1 — 32 threads/threadgroup, 1 simdgroup)
//!
//! | Kernel name | I/O dtype | Mask kind |
//! |---|---|---|
//! | `flash_attn_prefill_bf16_d512`           | bf16 | bf16 additive |
//! | `flash_attn_prefill_bf16_d512_boolmask`  | bf16 | bool |
//! | `flash_attn_prefill_f16_d512`            | f16  | f16 additive |
//! | `flash_attn_prefill_f16_d512_boolmask`   | f16  | bool |
//!
//! ### f32 is NOT instantiated at any D
//!
//! The f32 Qs threadgroup tile alone is `BQ * BD * 4` bytes — at D=256 this
//! is 32 KB exactly, the Apple Silicon `MTLDevice.maxThreadgroupMemoryLength`
//! hard limit, before KV_smem or scratch.  Verified empirically on M5 Max:
//! f32 D=256 requires ~53.7 KB and fails at library compile.  bf16 and f16
//! halve the tile footprint (~29 KB) and fit within the limit.  D=512 f32
//! is excluded for the same reason.  D=64 inherits the same exclusion for
//! consistency (bf16/f16 only across the entire dispatcher family).  f32
//! correctness is verified at the CPU reference layer in
//! `tests/test_flash_attn_prefill.rs`.  See
//! `ADR-011-phase1-port-source-decision.md` §3 for the full
//! threadgroup-memory analysis.
//!
//! ## Function constants
//!
//! The kernel declares four Metal function constants that must be specialised
//! at pipeline creation time (not at dispatch time):
//!
//! - Index 200: `align_Q` (bool) — true when `qL % BQ == 0`
//! - Index 201: `align_K` (bool) — true when `kL % BK == 0`
//! - Index 300: `has_mask` (bool) — true when a mask buffer is bound
//! - Index 301: `do_causal` (bool) — true for in-kernel causal masking
//!
//! These are plumbed via [`KernelRegistry::get_pipeline_with_bool_constants`],
//! which caches compiled pipelines keyed by `(kernel_name, align_Q, align_K,
//! has_mask, do_causal)`.  Pipeline compilation is amortised: the slow path
//! runs only once per unique `(name, booleans)` combination.
//!
//! ## Buffer layout (indices match the MSL kernel)
//!
//! - `buffer(0)` — Q `[B, H,    qL, D]`  device, contiguous inner dim
//! - `buffer(1)` — K `[B, H_kv, kL, D]`  device, contiguous inner dim
//! - `buffer(2)` — V `[B, H_kv, kL, D]`  device, contiguous inner dim
//! - `buffer(3)` — O `[B, H,    qL, D]`  device, written by kernel
//! - `buffer(4)` — `AttnParams` constant buffer (this module's [`AttnParamsGpu`])
//! - `buffer(5)` — `AttnMaskParams` constant buffer (only when `has_mask=true`)
//! - `buffer(6)` — mask data buffer (only when `has_mask=true`)
//!
//! ## Grid geometry
//!
//! - Threadgroups: `(ceil(qL / BQ), H, B)`
//! - Threads per threadgroup: `(32, WM, WN)`
//! - D=256: 128 threads (4 simdgroups × 32 lanes).
//! - D=512:  32 threads (1 simdgroup  × 32 lanes).
//!
//! ## Scale convention
//!
//! Pass `scale = 1.0 / sqrt(head_dim)`.  The kernel multiplies internally by
//! `log2(e) ≈ 1.44269504` and uses `fast::exp2` throughout — so the host
//! MUST NOT pre-multiply by `log2(e)`.
//!
//! ## Mask-sentinel contract (llama.cpp convention)
//!
//! The additive mask buffer (bf16/f16 for the additive dispatchers) uses
//! the llama.cpp CPU-side convention: **masked positions = `-INFINITY`**
//! (IEEE-754 f32 `0xFF800000`, cast to the I/O dtype — both `half(-inf)`
//! and `bfloat16(-inf)` have a real `-inf` encoding that the kernel
//! consumes correctly).  Attended positions = `0.0`.
//!
//! This matches llama.cpp's mask-authoring sites at
//! `llama-graph.cpp:421, 436, 557` and `llama-kv-cache.cpp:1572`, where
//! the CPU writes raw `-INFINITY` and the flash-attn cast to f16 saturates
//! to f16-`-INFINITY`.
//!
//! The kernel does NOT require masks to use `-FLT_MAX/2` or any other
//! finite "large negative" sentinel.  The `-FLT_MAX/2` value is the
//! kernel-internal `M` (running row-max) initialiser — a finite sentinel
//! that absorbs `-inf` scores via `simd_max` without ever letting `M`
//! become `-inf`, which in turn lets every `exp(score - M)` evaluate as
//! `exp(-inf) = 0.0` (IEEE-754 exact) rather than `exp(-inf - -inf) =
//! exp(NaN) = NaN`.  See ADR-011-phase2-port-sentinel.md §1.
//!
//! Callers may pass arbitrary finite additive biases in the mask for
//! non-masked positions (e.g. ALiBi, relative-position biases); only
//! "fully block this K position" requires `-inf`.
//!
//! ## See also
//!
//! - Kernel: `/opt/mlx-native/src/shaders/flash_attn_prefill.metal`
//! - ADR-011: `/opt/hf2q/docs/ADR-011-flash-attn-prefill.md`
//! - ADR-011 phase 2 sentinel port: `/opt/hf2q/docs/ADR-011-phase2-port-sentinel.md`

use metal::MTLSize;

use crate::buffer::MlxBuffer;
use crate::device::MlxDevice;
use crate::encoder::{CapturedOpKind, CommandEncoder, KernelArg, as_bytes};
use crate::error::{MlxError, Result};
use crate::kernel_registry::KernelRegistry;
use crate::DType;

// ─── Shader source ───────────────────────────────────────────────────────────

/// MSL source for the flash-attention prefill kernel (embedded at compile time).
pub static FLASH_ATTN_PREFILL_SHADER_SOURCE: &str =
    include_str!("../shaders/flash_attn_prefill.metal");

// ─── All 12 kernel entry-point names ─────────────────────────────────────────

/// D=256, bf16 I/O, bf16 additive mask.
const K_BF16_D256: &str = "flash_attn_prefill_bf16_d256";
/// D=256, bf16 I/O, bool (`is_attended`) mask.
const K_BF16_D256_BOOLMASK: &str = "flash_attn_prefill_bf16_d256_boolmask";
/// D=256, f16 I/O, f16 additive mask.
const K_F16_D256: &str = "flash_attn_prefill_f16_d256";
/// D=256, f16 I/O, bool mask.
const K_F16_D256_BOOLMASK: &str = "flash_attn_prefill_f16_d256_boolmask";
/// D=512, bf16 I/O, bf16 additive mask.
const K_BF16_D512: &str = "flash_attn_prefill_bf16_d512";
/// D=512, bf16 I/O, bool mask.
const K_BF16_D512_BOOLMASK: &str = "flash_attn_prefill_bf16_d512_boolmask";
/// D=512, f16 I/O, f16 additive mask.
const K_F16_D512: &str = "flash_attn_prefill_f16_d512";
/// D=512, f16 I/O, bool mask.
const K_F16_D512_BOOLMASK: &str = "flash_attn_prefill_f16_d512_boolmask";
/// D=64, bf16 I/O, bf16 additive mask (BERT family).
const K_BF16_D64: &str = "flash_attn_prefill_bf16_d64";
/// D=64, bf16 I/O, bool (`is_attended`) mask.
const K_BF16_D64_BOOLMASK: &str = "flash_attn_prefill_bf16_d64_boolmask";
/// D=64, f16 I/O, f16 additive mask.
const K_F16_D64: &str = "flash_attn_prefill_f16_d64";
/// D=64, f16 I/O, bool mask.
const K_F16_D64_BOOLMASK: &str = "flash_attn_prefill_f16_d64_boolmask";

/// All 12 kernel entry-point names exported by `flash_attn_prefill.metal`.
///
/// Registering all of them against the single shader source costs nothing at
/// registration time (source is a static `&str`) and ensures additional
/// dispatchers (f16 paths, D=512 exposure) can be added later without
/// touching registration here.
const ALL_KERNEL_NAMES: &[&str] = &[
    K_BF16_D256,
    K_BF16_D256_BOOLMASK,
    K_F16_D256,
    K_F16_D256_BOOLMASK,
    K_BF16_D512,
    K_BF16_D512_BOOLMASK,
    K_F16_D512,
    K_F16_D512_BOOLMASK,
    K_BF16_D64,
    K_BF16_D64_BOOLMASK,
    K_F16_D64,
    K_F16_D64_BOOLMASK,
];

// ─── Registration ─────────────────────────────────────────────────────────────

/// Register all flash-attention prefill kernel entry points with the registry.
///
/// Maps all 12 entry-point names to the single `flash_attn_prefill.metal`
/// source.  This must be called before any dispatch to these kernels.
///
/// # Design note
///
/// `KernelRegistry` compiles one Metal library per kernel name.  All 12 names
/// point at the same source text, so the Metal compiler sees the same ~1 500-line
/// source each time — compilation is amortised in `KernelRegistry::get_pipeline`
/// (first call per name triggers compilation; subsequent calls return the cached
/// pipeline).  Registering all 12 here rather than only the Phase 1a subset
/// means Phase 2/4 dispatcher functions can be added without touching this file.
pub fn register(registry: &mut KernelRegistry) {
    for &name in ALL_KERNEL_NAMES {
        registry.register_source(name, FLASH_ATTN_PREFILL_SHADER_SOURCE);
    }
}

// ─── MSL struct mirrors ───────────────────────────────────────────────────────

/// Rust mirror of the MSL `AttnParams` struct.
///
/// Field order and types match the MSL definition exactly.
/// MSL source: `flash_attn_prefill.metal` — see the `AttnParams` struct
/// definition in the kernel source for the field-by-field reference.
///
/// # Layout
///
/// All `int` fields are 32-bit (i32 in Rust).  The `int64_t` stride arrays are
/// 64-bit (i64 in Rust).  The compiler inserts natural alignment padding:
///
/// ```text
/// Offset  0:  B            (i32,  4 bytes)
/// Offset  4:  H            (i32,  4 bytes)
/// Offset  8:  D            (i32,  4 bytes)
/// Offset 12:  qL           (i32,  4 bytes)
/// Offset 16:  kL           (i32,  4 bytes)
/// Offset 20:  gqa_factor   (i32,  4 bytes)
/// Offset 24:  scale        (f32,  4 bytes)
/// Offset 28:  softcapping  (f32,  4 bytes)
/// Offset 32:  NQ           (i32,  4 bytes)
/// Offset 36:  NK           (i32,  4 bytes)
/// Offset 40:  NQ_aligned   (i32,  4 bytes)
/// Offset 44:  NK_aligned   (i32,  4 bytes)
/// Offset 48:  qL_rem       (i32,  4 bytes)
/// Offset 52:  kL_rem       (i32,  4 bytes)
/// Offset 56:  qL_off       (i32,  4 bytes)
/// Offset 60:  _pad         (4 bytes — alignment before i64 array)
/// Offset 64:  Q_strides[3] (3 × i64, 24 bytes)
/// Offset 88:  K_strides[3] (3 × i64, 24 bytes)
/// Offset 112: V_strides[3] (3 × i64, 24 bytes)
/// Offset 136: O_strides[3] (3 × i64, 24 bytes)
/// Total: 160 bytes
/// ```
///
/// `bytemuck::Pod` / `bytemuck::Zeroable` are derived — the struct must have
/// no uninitialized padding bytes.  The explicit `_pad` field makes the padding
/// concrete so Pod can be derived safely.
///
/// `softcapping` is always set to `1.0` (disabled).  The `attention<>` kernel
/// body does not read it; the field exists for ABI parity with attention
/// implementations that thread softcapping through the same param block
/// (e.g. Gemma-style logit softcap), so we don't have to redo the layout
/// when that work lands.
#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
pub struct AttnParamsGpu {
    /// Batch size.
    pub b: i32,
    /// Number of query heads.
    pub h: i32,
    /// Head dimension (D).
    pub d: i32,
    /// Query sequence length.
    pub ql: i32,
    /// Key/value sequence length.
    pub kl: i32,
    /// Group-query attention factor: H / H_kv.
    pub gqa_factor: i32,
    /// Attention scale (= 1.0 / sqrt(head_dim); kernel multiples by log2(e)).
    pub scale: f32,
    /// Softcapping value — always 1.0 (disabled) for standard SDPA.
    pub softcapping: f32,
    /// Number of Q tiles: ceil(qL / BQ).
    pub nq: i32,
    /// Number of KV tiles: ceil(kL / BK).
    pub nk: i32,
    /// Number of full (aligned) Q tiles: qL / BQ.
    pub nq_aligned: i32,
    /// Number of full (aligned) KV tiles: kL / BK.
    pub nk_aligned: i32,
    /// Remainder elements in the last Q tile: qL % BQ (0 if aligned).
    pub ql_rem: i32,
    /// Remainder elements in the last KV tile: kL % BK (0 if aligned).
    pub kl_rem: i32,
    /// Query sequence start offset (0 for standard prefill).
    pub ql_off: i32,
    /// Explicit padding to align the subsequent i64 arrays to 8-byte boundary.
    pub _pad: i32,
    /// Query strides: (batch stride, head stride, seq stride).  Inner dim = 1.
    pub q_strides: [i64; 3],
    /// Key strides: (batch stride, head stride, seq stride).  Inner dim = 1.
    pub k_strides: [i64; 3],
    /// Value strides: (batch stride, head stride, seq stride).  Inner dim = 1.
    pub v_strides: [i64; 3],
    /// Output strides: (batch stride, head stride, seq stride).  Inner dim = 1.
    pub o_strides: [i64; 3],
}

/// Rust mirror of the MSL `AttnMaskParams` struct.
///
/// MSL source: `flash_attn_prefill.metal` — see the `AttnMaskParams` struct
/// definition in the kernel source.
///
/// Contains the mask buffer strides.  Only sent to the kernel when
/// `has_mask = true` (buffer index 5).
#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
pub struct AttnMaskParamsGpu {
    /// Mask strides: (batch stride, head stride, qL stride).  Inner dim = 1.
    pub m_strides: [i64; 3],
}

// ─── Public Rust-side parameter struct ───────────────────────────────────────

/// Host-side parameters for the flash-attention prefill dispatcher.
///
/// Used only by the Rust dispatcher; does not map 1:1 to the GPU struct —
/// the GPU struct is computed from these fields inside the dispatcher.
#[derive(Debug, Clone, Copy)]
pub struct FlashAttnPrefillParams {
    /// Number of query attention heads.
    pub n_heads: u32,
    /// Number of key/value attention heads (GQA: may be < n_heads).
    pub n_kv_heads: u32,
    /// Head dimension.  Must be 256 for `dispatch_flash_attn_prefill_bf16_d256`.
    pub head_dim: u32,
    /// Query sequence length.
    pub seq_len_q: u32,
    /// Key/value sequence length.
    pub seq_len_k: u32,
    /// Batch size.
    pub batch: u32,
    /// Attention scale.  Typically `1.0 / sqrt(head_dim)`.
    ///
    /// The kernel internally multiplies this by `log2(e) = 1.44269504089`
    /// before applying it to Q.  The host MUST NOT pre-multiply by log2(e).
    pub scale: f32,
    /// Whether to apply in-kernel causal masking (`do_causal` function constant).
    ///
    /// When true, positions where `row_pos < col_pos` receive a score of -inf
    /// before softmax.  This can be combined with an external mask buffer.
    pub do_causal: bool,
}

// ─── Tile geometry constants (D=256) ─────────────────────────────────────────

/// Q tile size for D=256: BQ=32.
const BQ_D256: u32 = 32;

/// KV tile size for D=256: BK=16.
const BK_D256: u32 = 16;

/// Simdgroups along Q dimension for D=256: WM=4.
const WM_D256: u32 = 4;

/// Simdgroups along K dimension for D=256: WN=1.
const WN_D256: u32 = 1;

// ─── Tile geometry constants (D=64) ──────────────────────────────────────────
//
// Same simdgroup geometry as D=256: 4 simdgroups along Q, 1 along K, with
// BQ=32 / BK=16.  Threadgroup-memory math at bf16:
//   Qs  = BQ × (BD + padQ) × 2   = 32 × (64 + 8) × 2  = 4 608 bytes
//   KVs = max(BK*(BD+padV), (BK+padK)*BD) × 2
//                                 = max(16×72, 24×64) × 2 = max(1152, 1536) × 2
//                                 = 3 072 bytes
//   total ≈ 7 680 bytes — fits well under the 32 KiB Apple Silicon TG limit.
//
// Static-asserts required by the kernel (with kFragSize=8):
//   BQ % (kNWarps × kFragSize) = 32 % (4×8) = 0 ✓
//   BQ ≥ kNWarps × kFragSize   = 32 ≥ 32     ✓
//   TQ = BQ / (kNWarps × kFragSize) = 1       ✓
//   TD = BD / kFragSize        = 64/8 = 8     ✓
//   TK = BK / kFragSize        = 16/8 = 2     ✓

/// Q tile size for D=64: BQ=32.
const BQ_D64: u32 = 32;

/// KV tile size for D=64: BK=16.
const BK_D64: u32 = 16;

/// Simdgroups along Q dimension for D=64: WM=4.
const WM_D64: u32 = 4;

/// Simdgroups along K dimension for D=64: WN=1.
const WN_D64: u32 = 1;

// ─── Validation ───────────────────────────────────────────────────────────────

fn validate_params(params: &FlashAttnPrefillParams) -> Result<()> {
    if params.n_heads == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: n_heads must be > 0".into(),
        ));
    }
    if params.n_kv_heads == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: n_kv_heads must be > 0".into(),
        ));
    }
    if params.n_heads % params.n_kv_heads != 0 {
        return Err(MlxError::InvalidArgument(format!(
            "flash_attn_prefill: n_heads ({}) must be divisible by n_kv_heads ({})",
            params.n_heads, params.n_kv_heads
        )));
    }
    if params.seq_len_q == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: seq_len_q must be > 0".into(),
        ));
    }
    if params.seq_len_k == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: seq_len_k must be > 0".into(),
        ));
    }
    if params.batch == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: batch must be > 0".into(),
        ));
    }
    Ok(())
}

fn validate_buffer_size(buf: &MlxBuffer, name: &str, expected_elements: usize) -> Result<()> {
    let expected_bytes = expected_elements * buf.dtype().size_of();
    if buf.byte_len() < expected_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "flash_attn_prefill: {name} buffer too small: expected at least \
             {expected_bytes} bytes, got {}",
            buf.byte_len()
        )));
    }
    Ok(())
}

// ─── bf16 D=256 dispatcher ───────────────────────────────────────────────────

/// Dispatch flash-attention prefill for bf16 Q/K/V/O, head_dim=256.
///
/// Encodes a compute command into `encoder` without committing.  The caller
/// controls when to call `encoder.commit_and_wait()`.
///
/// # Why bf16 and not f32
///
/// The f32 Qs threadgroup tile (BQ×BD×4 = 32 KB at D=256) consumes the entire
/// Apple Silicon threadgroup-memory budget before KV_smem, scratch, or any
/// padding — so f32 D=256 is not instantiated (see module doc).  bf16/f16
/// halve the tile footprint; the MMA accumulator is still f32 internally
/// (the kernel's `T_accum` template parameter is `float` for every
/// instantiation — see `flash_attn_prefill.metal:~1504`), so prefill output
/// precision is `bf16 × bf16 → f32 → bf16` — bf16-bounded at the store, not
/// at the accumulator.
///
/// # Buffer layouts
///
/// All buffers must be contiguous (stride-1 along the innermost / head_dim
/// dimension):
///
/// - `q`    — `[batch, n_heads,    seq_len_q, 256]`, dtype BF16
/// - `k`    — `[batch, n_kv_heads, seq_len_k, 256]`, dtype BF16
/// - `v`    — `[batch, n_kv_heads, seq_len_k, 256]`, dtype BF16
/// - `mask` — `[batch, n_heads, seq_len_q, seq_len_k]`, dtype BF16
///   (additive, log-scale: 0.0 = attend, -inf = mask out — llama.cpp
///   convention, see module doc "Mask-sentinel contract"), or `None`
/// - `out`  — `[batch, n_heads,    seq_len_q, 256]`, dtype BF16 (output)
///
/// # Function constants
///
/// `align_Q`, `align_K` are computed from the sequence lengths and tile sizes.
/// `has_mask` reflects whether `mask` is `Some(_)`.
/// `do_causal` is taken from `params.do_causal`.
///
/// A distinct Metal pipeline is compiled for each unique combination of these
/// four booleans and cached in `registry`.
///
/// # Errors
///
/// Returns `MlxError::InvalidArgument` for:
/// - `head_dim != 256`
/// - Zero or inconsistent shape fields
/// - Buffer too small for the declared shape
/// - `n_heads` not divisible by `n_kv_heads`
/// - Any buffer dtype != BF16
///
/// Returns `MlxError::ShaderCompilationError` if the Metal pipeline
/// compilation fails.
#[allow(clippy::too_many_arguments)]
pub fn dispatch_flash_attn_prefill_bf16_d256(
    encoder: &mut CommandEncoder,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
    q: &MlxBuffer,
    k: &MlxBuffer,
    v: &MlxBuffer,
    mask: Option<&MlxBuffer>,
    out: &MlxBuffer,
    params: &FlashAttnPrefillParams,
) -> Result<()> {
    // Delegate to the blk-aware dispatcher with blk=None.  Because
    // `has_blk` is a function constant (index 303), the compiled pipeline
    // with has_blk=false dead-codes every blk reference — this call has
    // zero runtime overhead compared with the pre-Wave-2E code path.
    dispatch_flash_attn_prefill_bf16_d256_with_blk(
        encoder, device, registry, q, k, v, mask, None, out, params,
    )
}

/// Dispatch flash-attention prefill for bf16 Q/K/V/O, head_dim=256, with an
/// optional Wave 2E tile-skip pre-pass byte buffer.
///
/// This is the blk-aware sibling of [`dispatch_flash_attn_prefill_bf16_d256`].
/// When `blk` is `Some(buf)`, the kernel reads a classification byte per
/// `(qtile, ktile)` and:
///
/// - On `blk[qt][kt] == 0`, skips the entire KV tile (no K/V load, no
///   Q·K^T, no mask-add, no softmax update).
/// - On `blk[qt][kt] == 2`, skips the mask-add (but still computes Q·K^T
///   and softmax normally).
/// - On `blk[qt][kt] == 1`, runs the standard path.
///
/// The `blk` buffer must be produced by
/// [`crate::ops::flash_attn_prefill_blk::dispatch_flash_attn_prefill_blk`]
/// with the SAME `(BQ=32, BK=16)` tile shape as this dispatcher, and the
/// caller MUST sequence the pre-pass dispatch BEFORE this one on the same
/// command encoder (or commit the pre-pass first).
///
/// # Correctness invariant
///
/// For any valid (mask, built blk), calling this function with `blk=None`
/// vs `blk=Some(built_blk)` MUST produce bit-exact identical output.  The
/// blk path is a pure skip optimisation.  Exercised by
/// `test_gpu_bf16_d256_with_blk_matches_no_blk` in
/// `tests/test_flash_attn_prefill.rs`.
///
/// # Buffer layout
///
/// All buffers as in [`dispatch_flash_attn_prefill_bf16_d256`], plus:
///
/// - `blk` — `[ceil(seq_len_q / 32), ceil(seq_len_k / 16)]`, dtype U8.
///   Required iff `Some(_)`; must have at least
///   `ceil(qL/32) * ceil(kL/16)` bytes.  When `None`, the dispatcher
///   compiles with `has_blk=false` and does not bind buffer index 7.
///
/// # Errors
///
/// Same as [`dispatch_flash_attn_prefill_bf16_d256`], plus:
/// - `blk` is `Some(b)` with `mask = None` (a blk without a mask is
///   meaningless — rejected to catch caller bugs).
/// - `blk` buffer undersized.
#[allow(clippy::too_many_arguments)]
pub fn dispatch_flash_attn_prefill_bf16_d256_with_blk(
    encoder: &mut CommandEncoder,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
    q: &MlxBuffer,
    k: &MlxBuffer,
    v: &MlxBuffer,
    mask: Option<&MlxBuffer>,
    blk: Option<&MlxBuffer>,
    out: &MlxBuffer,
    params: &FlashAttnPrefillParams,
) -> Result<()> {
    // ── Validate ──────────────────────────────────────────────────────────
    if params.head_dim != 256 {
        return Err(MlxError::InvalidArgument(format!(
            "dispatch_flash_attn_prefill_bf16_d256: head_dim must be 256, got {}",
            params.head_dim
        )));
    }
    // Reject the meaningless has_blk=true, has_mask=false case: a blk
    // buffer classifies the contents of the mask; without a mask the blk
    // bytes are computed against undefined data.  This is a caller-bug
    // defence, not a shader limitation.
    if blk.is_some() && mask.is_none() {
        return Err(MlxError::InvalidArgument(
            "dispatch_flash_attn_prefill_bf16_d256_with_blk: \
             blk requires mask (a blk without a mask is meaningless)"
                .into(),
        ));
    }
    validate_params(params)?;

    // All buffers must be BF16 for this dispatcher.
    for (buf, name) in &[(q, "Q"), (k, "K"), (v, "V"), (out as &MlxBuffer, "out")] {
        if buf.dtype() != DType::BF16 {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d256: {name} buffer must be BF16, \
                 got {:?}",
                buf.dtype()
            )));
        }
    }
    if let Some(m) = mask {
        if m.dtype() != DType::BF16 {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d256: mask buffer must be BF16, \
                 got {:?}",
                m.dtype()
            )));
        }
    }

    let batch = params.batch as usize;
    let h = params.n_heads as usize;
    let h_kv = params.n_kv_heads as usize;
    let ql = params.seq_len_q as usize;
    let kl = params.seq_len_k as usize;
    let d = params.head_dim as usize; // = 256

    // Validate buffer element counts.
    validate_buffer_size(q, "Q", batch * h * ql * d)?;
    validate_buffer_size(k, "K", batch * h_kv * kl * d)?;
    validate_buffer_size(v, "V", batch * h_kv * kl * d)?;
    validate_buffer_size(out, "out", batch * h * ql * d)?;
    // A rank-2 mask `[qL, kL]` is the Wave 2D broadcast layout: one plane is
    // shared across all batches and heads (stride-0 in the batch and head dims).
    // A rank-4 mask `[B, H, qL, kL]` is the per-head layout used by callers
    // that already have a fully-expanded mask (e.g. pre-Wave-2D code paths).
    let mask_is_rank2_broadcast = mask.is_some_and(|m| m.shape().len() == 2);
    if let Some(m) = mask {
        if mask_is_rank2_broadcast {
            validate_buffer_size(m, "mask", ql * kl)?;
        } else {
            validate_buffer_size(m, "mask", batch * h * ql * kl)?;
        }
    }

    // ── Tile geometry ─────────────────────────────────────────────────────
    let bq = BQ_D256;
    let bk = BK_D256;
    let wm = WM_D256;
    let wn = WN_D256;

    let nq = params.seq_len_q.div_ceil(bq);
    let nk = params.seq_len_k.div_ceil(bk);
    let nq_aligned = params.seq_len_q / bq;
    let nk_aligned = params.seq_len_k / bk;
    let ql_rem = params.seq_len_q % bq;
    let kl_rem = params.seq_len_k % bk;

    // Function constants (specialised at pipeline creation time, not dispatch).
    let align_q = ql_rem == 0;
    let align_k = kl_rem == 0;
    let has_mask = mask.is_some();
    let has_blk = blk.is_some();
    let do_causal = params.do_causal;

    // Validate blk buffer size when present.  Tile shape is fixed for D=256:
    // BQ=32, BK=16 — see ADR-011-phase2-port-tile-skip.md §5.1.
    if let Some(b) = blk {
        let nq_tiles = ql.div_ceil(BQ_D256 as usize);
        let nk_tiles = kl.div_ceil(BK_D256 as usize);
        let expected = nq_tiles * nk_tiles;
        if b.byte_len() < expected {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d256_with_blk: blk buffer \
                 too small: expected at least {expected} bytes (NQ={nq_tiles}, \
                 NK={nk_tiles}), got {}",
                b.byte_len()
            )));
        }
    }

    // ── Kernel name ───────────────────────────────────────────────────────
    // bf16 I/O, bf16 additive mask (or no mask — uses same pipeline since
    // has_mask is a function constant, not part of the name).
    let kernel_name = K_BF16_D256;

    // ── Pipeline lookup (with function constants) ─────────────────────────
    //
    // Wave 2E adds function constant 303 (has_blk).  When has_blk=false
    // every blk reference in the shader is dead-coded, so pipelines with
    // the pre-Wave-2E constants {200, 201, 300, 301} + {303: false}
    // generate the same machine code as the pre-Wave-2E pipelines.  The
    // only cache-key overhead is one extra "b0" suffix, which is paid
    // once per (aligned, causal, masked) combo at compile time.
    let pipeline = registry.get_pipeline_with_bool_constants(
        kernel_name,
        device.metal_device(),
        &[
            (200, align_q),
            (201, align_k),
            (300, has_mask),
            (301, do_causal),
            (303, has_blk),
        ],
    )?;

    // ── Build AttnParams GPU struct ───────────────────────────────────────
    //
    // Strides for layout [B, H, L, D] where the innermost (D) stride is 1
    // (contiguous):
    //
    //   seq stride  = D
    //   head stride = L * D
    //   batch stride = H * L * D
    //
    // Q/O use (H, qL); K/V use (H_kv, kL).

    let q_seq_stride = d as i64;
    let q_head_stride = (ql * d) as i64;
    let q_batch_stride = (h * ql * d) as i64;

    let kv_seq_stride = d as i64;
    let kv_head_stride = (kl * d) as i64;
    let kv_batch_stride = (h_kv * kl * d) as i64;

    let gqa_factor = (params.n_heads / params.n_kv_heads) as i32;

    let attn_params = AttnParamsGpu {
        b: params.batch as i32,
        h: params.n_heads as i32,
        d: params.head_dim as i32,
        ql: params.seq_len_q as i32,
        kl: params.seq_len_k as i32,
        gqa_factor,
        scale: params.scale,
        softcapping: 1.0_f32,  // always disabled; see module doc
        nq: nq as i32,
        nk: nk as i32,
        nq_aligned: nq_aligned as i32,
        nk_aligned: nk_aligned as i32,
        ql_rem: ql_rem as i32,
        kl_rem: kl_rem as i32,
        ql_off: 0,             // standard prefill starts at offset 0
        _pad: 0,
        q_strides: [q_batch_stride, q_head_stride, q_seq_stride],
        k_strides: [kv_batch_stride, kv_head_stride, kv_seq_stride],
        v_strides: [kv_batch_stride, kv_head_stride, kv_seq_stride],
        o_strides: [q_batch_stride, q_head_stride, q_seq_stride],
    };

    // ── Grid geometry ──────────────────────────────────────────────────────
    //   grid = (NQ, H, B)  where NQ = ceil(qL / BQ)
    //   threadgroup = (32, WM, WN)
    let grid = MTLSize::new(nq as u64, params.n_heads as u64, params.batch as u64);
    let tg_size = MTLSize::new(32, wm as u64, wn as u64);

    // ── Encode ─────────────────────────────────────────────────────────────
    encoder.set_op_kind(CapturedOpKind::Sdpa);

    if has_mask {
        // SAFETY: has_mask is true iff mask.is_some() — set three lines above.
        // The Option is therefore guaranteed to be Some here.  We use
        // ok_or_else rather than expect/unwrap to satisfy the no-panic policy.
        let mask_buf = mask.ok_or_else(|| {
            MlxError::InvalidArgument(
                "flash_attn_prefill: internal error — has_mask=true but mask is None".into(),
            )
        })?;

        // Strides depend on mask rank:
        //   rank-2 `[qL, kL]` — broadcast across batch + heads: set batch_stride
        //   and head_stride to 0 so the shader re-reads the same plane for every
        //   (batch, head) pair.  The Metal shader already handles stride-0 correctly
        //   (no kernel changes required).
        //   rank-4 `[B, H, qL, kL]` — per-head layout (back-compat path).
        let (m_batch_stride, m_head_stride, m_ql_stride) = if mask_is_rank2_broadcast {
            (0_i64, 0_i64, kl as i64)
        } else {
            ((h * ql * kl) as i64, (ql * kl) as i64, kl as i64)
        };

        let mask_params = AttnMaskParamsGpu {
            m_strides: [m_batch_stride, m_head_stride, m_ql_stride],
        };

        if has_blk {
            // SAFETY: has_blk is true iff blk.is_some() and the earlier
            // has_blk validation rejects blk.is_some() && mask.is_none().
            let blk_buf = blk.ok_or_else(|| {
                MlxError::InvalidArgument(
                    "flash_attn_prefill: internal error — has_blk=true but blk is None".into(),
                )
            })?;

            encoder.encode_threadgroups_with_args(
                pipeline,
                &[
                    (0, KernelArg::Buffer(q)),
                    (1, KernelArg::Buffer(k)),
                    (2, KernelArg::Buffer(v)),
                    (3, KernelArg::Buffer(out)),
                    (4, KernelArg::Bytes(as_bytes(&attn_params))),
                    (5, KernelArg::Bytes(as_bytes(&mask_params))),
                    (6, KernelArg::Buffer(mask_buf)),
                    (7, KernelArg::Buffer(blk_buf)),
                ],
                grid,
                tg_size,
            );
        } else {
            encoder.encode_threadgroups_with_args(
                pipeline,
                &[
                    (0, KernelArg::Buffer(q)),
                    (1, KernelArg::Buffer(k)),
                    (2, KernelArg::Buffer(v)),
                    (3, KernelArg::Buffer(out)),
                    (4, KernelArg::Bytes(as_bytes(&attn_params))),
                    (5, KernelArg::Bytes(as_bytes(&mask_params))),
                    (6, KernelArg::Buffer(mask_buf)),
                    // buffer 7 absent — has_blk=false constant dead-codes blk refs.
                ],
                grid,
                tg_size,
            );
        }
    } else {
        encoder.encode_threadgroups_with_args(
            pipeline,
            &[
                (0, KernelArg::Buffer(q)),
                (1, KernelArg::Buffer(k)),
                (2, KernelArg::Buffer(v)),
                (3, KernelArg::Buffer(out)),
                (4, KernelArg::Bytes(as_bytes(&attn_params))),
                // buffers 5, 6, 7 intentionally absent — has_mask=false +
                // has_blk=false constants dead-code-eliminate mask + blk loads.
            ],
            grid,
            tg_size,
        );
    }

    Ok(())
}

// ─── bf16 D=256 RESUME dispatcher (qL_off > 0 + slot-capacity strides) ──────
//
// ADR-017 Phase E.a B.2-fix: extend the FA bf16 d256 fast path to support
// LCP partial-prefill resume (turn 2 of a multi-turn conversation prefilling
// only the suffix against a slot that already contains the LCP prefix).
//
// The metal shader has supported this regime since Phase 1 port (the
// `qL_off` field at flash_attn_prefill.metal:1045 + `K_strides[3]` /
// `V_strides[3]` at lines 1048-1049 are read at lines 1325, 1437, 1445), but
// the d256 dispatcher hardcoded `qL_off=0` and built K/V strides from
// `seq_len_k` not `kv_capacity` because no Rust caller needed
// "prefill-at-offset" semantics — Qwen3.5/3.6 production prefill is always
// from token 0 (cur_len=0) and decode (cur_len > 0, seq_len=1) routes to
// `flash_attn_vec` at gpu_full_attn.rs:1733 instead.
//
// The legacy F32 SDPA fallback at gpu_full_attn.rs:1900-1916 covered the
// "prefill-at-offset" case for structural correctness, but with a F32
// single-pass softmax that produces byte-different output to the BF16 MMA
// + log-domain online softmax fast path (proven via
// gpu_full_attn.rs::tests::phase_b2_iso_fast_path_vs_fallback_path_kernel_divergence:
// 131072/131072 elements differ, max |Δ| = 6.452e-4).
//
// This module fixes that: a sibling dispatcher that exposes `qL_off` and
// `kv_capacity` so the FA bf16 d256 fast path is reachable for cur_len > 0
// AND for slot reads (slot K/V's head stride is `kv_capacity * head_dim`,
// not `kL * head_dim`, because a slot may have allocated more positions
// than the current valid kL).

/// Host-side parameters for the flash-attention prefill **resume** dispatcher.
///
/// Differs from [`FlashAttnPrefillParams`] in two ways:
///
/// 1. `q_offset_in_k` (mapped to the kernel's `qL_off`): the absolute Q
///    position of the chunk Q within the larger K/V sequence.  When > 0,
///    Q is being attended over a slot that already contains
///    `q_offset_in_k` previous tokens.  The kernel's causal mask uses
///    `qL_off` to compute `row_pos = tid.x * BQ + qL_off + ...`
///    (`flash_attn_prefill.metal:1325, 1445`) so the per-chunk causal
///    pattern stays correct relative to the absolute K/V position.
///
/// 2. `kv_capacity` (slot stride): K and V buffers may live in a slot of
///    capacity ≥ `seq_len_k`.  The kernel reads K/V via integer strides
///    from `K_strides[3]` / `V_strides[3]`, so we set
///    `head_stride = kv_capacity * D` to skip unused slot capacity between
///    heads.  When `kv_capacity == seq_len_k` the layout is identical to
///    the non-resume dispatcher.
///
/// # Use case
///
/// ADR-017 Phase E.a LCP partial-prefill resume.  Turn 1 of a multi-turn
/// conversation prefills the prompt, populates the KV slot, snapshots it.
/// Turn 2 detects an LCP overlap with turn 1, restores the slot, and
/// prefills only the suffix of turn 2 (M new tokens) against the full slot
/// (kL = N + M, qL = M, qL_off = N).  The output is byte-identical to a
/// fresh full prefill of the entire turn-2 prompt — proven by the parity
/// unit test
/// `flash_attn_prefill_bf16_d256_resume_byte_identical_to_monolithic`.
///
/// The non-resume dispatcher remains the production path for prefill-from-
/// zero (`q_offset_in_k == 0` && `kv_capacity == seq_len_k`).  The two
/// dispatchers compile to the same kernel pipeline (same kernel name +
/// same function constants when `q_offset_in_k == 0` && `kv_capacity ==
/// seq_len_k`); the only host-side difference is the strides written into
/// `AttnParamsGpu`.
#[derive(Debug, Clone, Copy)]
pub struct FlashAttnPrefillResumeParams {
    /// Number of query attention heads.
    pub n_heads: u32,
    /// Number of key/value attention heads (GQA).
    pub n_kv_heads: u32,
    /// Head dimension.  Must be 256 for `dispatch_flash_attn_prefill_bf16_d256_resume`.
    pub head_dim: u32,
    /// Query sequence length (chunk Q only).  qL.
    pub seq_len_q: u32,
    /// Total key/value sequence length: `q_offset_in_k + seq_len_q`.  kL.
    pub seq_len_k: u32,
    /// Batch size.
    pub batch: u32,
    /// Attention scale.  Typically `1.0 / sqrt(head_dim)`.  The kernel
    /// internally multiplies by `log2(e)` before applying to Q.
    pub scale: f32,
    /// Whether to apply in-kernel causal masking.  For partial-prefill resume
    /// this is typically `true` — chunk Q must causally mask K positions
    /// `> q_offset_in_k + q_pos_within_chunk`.
    pub do_causal: bool,
    /// Q offset within the K/V sequence (`qL_off` in the kernel).  Number of
    /// previous tokens already present in the slot.  Standard append-prefill
    /// semantics: `q_offset_in_k + seq_len_q == seq_len_k`.
    pub q_offset_in_k: u32,
    /// Capacity of the K/V slot — stride between K/V heads in elements.
    /// Set to `seq_len_k` when K/V are contiguous-packed (equivalent to the
    /// non-resume dispatcher); set to the slot's allocated capacity to read
    /// from a slot with unused trailing positions.  Must be `>= seq_len_k`.
    pub kv_capacity: u32,
}

/// Dispatch flash-attention prefill for bf16 Q/K/V/O, head_dim=256, with
/// caller-controlled `qL_off` and slot-capacity-aware K/V strides.
///
/// **Use this dispatcher when** `q_offset_in_k > 0` (partial-prefill resume)
/// OR when the K/V buffers have allocated capacity beyond `seq_len_k` (slot
/// reads).  For the prefill-from-zero contiguous-packed case prefer
/// [`dispatch_flash_attn_prefill_bf16_d256`] (functionally equivalent at
/// `q_offset_in_k=0` && `kv_capacity==seq_len_k`; verified via the parity
/// unit test in `tests/test_flash_attn_prefill.rs`).
///
/// # Buffer layouts
///
/// All buffers must be contiguous along the innermost head_dim=256 axis.
///
/// - `q`    — `[batch, n_heads,    seq_len_q,    256]`, dtype BF16
///     * head stride = `seq_len_q * 256`, seq stride = 256, batch stride =
///       `n_heads * seq_len_q * 256`.
/// - `k`    — `[batch, n_kv_heads, kv_capacity, 256]`, dtype BF16
///     * head stride = `kv_capacity * 256`, seq stride = 256, batch stride =
///       `n_kv_heads * kv_capacity * 256`.
///     * Only positions `[0..seq_len_k]` are read; positions
///       `[seq_len_k..kv_capacity]` may be uninitialised — the kernel will
///       not read past `seq_len_k` thanks to `kL`-aware tile bounds at
///       `flash_attn_prefill.metal:1322, 1416`.
/// - `v`    — same layout as `k`.
/// - `out`  — `[batch, n_heads,    seq_len_q,    256]`, dtype BF16 (output)
///
/// # Function constants
///
/// Identical pipeline cache key to the non-resume dispatcher: `align_Q`,
/// `align_K` from sequence lengths; `do_causal` from `params`; `has_mask`
/// and `has_blk` are both `false` (this resume dispatcher uses pure causal
/// masking via `qL_off` — no external additive mask is supported.  Add a
/// resume-with-mask sibling if a future caller needs it).
///
/// # Errors
///
/// Returns `MlxError::InvalidArgument` for:
/// - `head_dim != 256`
/// - `q_offset_in_k + seq_len_q > seq_len_k` (Q overshoots K)
/// - `seq_len_k > kv_capacity`               (kL overshoots slot capacity)
/// - Zero or inconsistent shape fields
/// - `n_heads` not divisible by `n_kv_heads`
/// - Buffer too small for declared shape (Q/O against `seq_len_q`,
///   K/V against `kv_capacity`)
/// - Any buffer dtype != BF16
#[allow(clippy::too_many_arguments)]
pub fn dispatch_flash_attn_prefill_bf16_d256_resume(
    encoder: &mut CommandEncoder,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
    q: &MlxBuffer,
    k: &MlxBuffer,
    v: &MlxBuffer,
    out: &MlxBuffer,
    params: &FlashAttnPrefillResumeParams,
) -> Result<()> {
    // ── Validate ──────────────────────────────────────────────────────────
    if params.head_dim != 256 {
        return Err(MlxError::InvalidArgument(format!(
            "dispatch_flash_attn_prefill_bf16_d256_resume: head_dim must be 256, got {}",
            params.head_dim
        )));
    }
    if params.n_heads == 0
        || params.n_kv_heads == 0
        || params.seq_len_q == 0
        || params.seq_len_k == 0
        || params.batch == 0
    {
        return Err(MlxError::InvalidArgument(
            "dispatch_flash_attn_prefill_bf16_d256_resume: \
             n_heads/n_kv_heads/seq_len_q/seq_len_k/batch must all be > 0"
                .into(),
        ));
    }
    if params.n_heads % params.n_kv_heads != 0 {
        return Err(MlxError::InvalidArgument(format!(
            "dispatch_flash_attn_prefill_bf16_d256_resume: n_heads ({}) must \
             be divisible by n_kv_heads ({})",
            params.n_heads, params.n_kv_heads
        )));
    }
    if params.q_offset_in_k + params.seq_len_q > params.seq_len_k {
        return Err(MlxError::InvalidArgument(format!(
            "dispatch_flash_attn_prefill_bf16_d256_resume: q_offset_in_k ({}) \
             + seq_len_q ({}) > seq_len_k ({}) — Q overshoots K",
            params.q_offset_in_k, params.seq_len_q, params.seq_len_k
        )));
    }
    if params.seq_len_k > params.kv_capacity {
        return Err(MlxError::InvalidArgument(format!(
            "dispatch_flash_attn_prefill_bf16_d256_resume: seq_len_k ({}) > \
             kv_capacity ({}) — K/V overshoots slot capacity",
            params.seq_len_k, params.kv_capacity
        )));
    }

    for (buf, name) in &[(q, "Q"), (k, "K"), (v, "V"), (out as &MlxBuffer, "out")] {
        if buf.dtype() != DType::BF16 {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d256_resume: {name} buffer \
                 must be BF16, got {:?}",
                buf.dtype()
            )));
        }
    }

    let batch = params.batch as usize;
    let h = params.n_heads as usize;
    let h_kv = params.n_kv_heads as usize;
    let ql = params.seq_len_q as usize;
    let cap = params.kv_capacity as usize;
    let d = params.head_dim as usize; // = 256

    validate_buffer_size(q, "Q", batch * h * ql * d)?;
    // K/V buffers are validated against capacity (slot layout), not seq_len_k.
    validate_buffer_size(k, "K", batch * h_kv * cap * d)?;
    validate_buffer_size(v, "V", batch * h_kv * cap * d)?;
    validate_buffer_size(out, "out", batch * h * ql * d)?;

    // ── Tile geometry (D=256) ─────────────────────────────────────────────
    let bq = BQ_D256;
    let bk = BK_D256;
    let wm = WM_D256;
    let wn = WN_D256;

    let nq = params.seq_len_q.div_ceil(bq);
    let nk = params.seq_len_k.div_ceil(bk);
    let nq_aligned = params.seq_len_q / bq;
    let nk_aligned = params.seq_len_k / bk;
    let ql_rem = params.seq_len_q % bq;
    let kl_rem = params.seq_len_k % bk;

    let align_q = ql_rem == 0;
    let align_k = kl_rem == 0;
    let has_mask = false; // resume uses pure causal; no external mask
    let has_blk = false;
    let do_causal = params.do_causal;

    // Same pipeline cache key as the non-resume dispatcher: when
    // qL_off=0 && kv_capacity=seq_len_k the two dispatchers compile to the
    // exact same Metal pipeline (verified at the parity test).
    let kernel_name = K_BF16_D256;
    let pipeline = registry.get_pipeline_with_bool_constants(
        kernel_name,
        device.metal_device(),
        &[
            (200, align_q),
            (201, align_k),
            (300, has_mask),
            (301, do_causal),
            (303, has_blk),
        ],
    )?;

    // ── Strides ───────────────────────────────────────────────────────────
    //
    // Q/O are contiguous-packed (qL is the actual extent — chunk Q has no
    // tail capacity).  K/V use kv_capacity for head stride (slot layout).
    // Inner stride (D) is always 1.
    let q_seq_stride = d as i64;
    let q_head_stride = (ql * d) as i64;
    let q_batch_stride = (h * ql * d) as i64;

    // K/V head stride uses kv_capacity (slot stride), NOT seq_len_k.  This is
    // the only stride that differs from the non-resume dispatcher.
    let kv_seq_stride = d as i64;
    let kv_head_stride = (cap * d) as i64;
    let kv_batch_stride = (h_kv * cap * d) as i64;

    let gqa_factor = (params.n_heads / params.n_kv_heads) as i32;

    let attn_params = AttnParamsGpu {
        b: params.batch as i32,
        h: params.n_heads as i32,
        d: params.head_dim as i32,
        ql: params.seq_len_q as i32,
        kl: params.seq_len_k as i32,
        gqa_factor,
        scale: params.scale,
        softcapping: 1.0_f32,
        nq: nq as i32,
        nk: nk as i32,
        nq_aligned: nq_aligned as i32,
        nk_aligned: nk_aligned as i32,
        ql_rem: ql_rem as i32,
        kl_rem: kl_rem as i32,
        ql_off: params.q_offset_in_k as i32, // ← THE resume change
        _pad: 0,
        q_strides: [q_batch_stride, q_head_stride, q_seq_stride],
        k_strides: [kv_batch_stride, kv_head_stride, kv_seq_stride],
        v_strides: [kv_batch_stride, kv_head_stride, kv_seq_stride],
        o_strides: [q_batch_stride, q_head_stride, q_seq_stride],
    };

    // ── Grid geometry ─────────────────────────────────────────────────────
    let grid = MTLSize::new(nq as u64, params.n_heads as u64, params.batch as u64);
    let tg_size = MTLSize::new(32, wm as u64, wn as u64);

    // ── Encode ────────────────────────────────────────────────────────────
    encoder.set_op_kind(CapturedOpKind::Sdpa);
    encoder.encode_threadgroups_with_args(
        pipeline,
        &[
            (0, KernelArg::Buffer(q)),
            (1, KernelArg::Buffer(k)),
            (2, KernelArg::Buffer(v)),
            (3, KernelArg::Buffer(out)),
            (4, KernelArg::Bytes(as_bytes(&attn_params))),
            // buffers 5, 6, 7 intentionally absent — has_mask=false +
            // has_blk=false function constants dead-code-eliminate the
            // mask + blk loads.
        ],
        grid,
        tg_size,
    );

    Ok(())
}

// ─── bf16 D=64 dispatcher ────────────────────────────────────────────────────

/// Layout selector for [`dispatch_flash_attn_prefill_bf16_d64`].
///
/// The kernel reads from raw device pointers via integer strides, so any
/// element layout that keeps `head_dim` (`D`) as the contiguous innermost
/// axis is valid input.  The two layouts named here both satisfy that
/// constraint and cover every BERT/embedding caller in hf2q today:
///
/// * `HeadMajor` — `[B, H, L, D]`, the same layout the D=256/D=512
///   dispatchers assume.  Stride math:
///   `seq = D`, `head = L * D`, `batch = H * L * D`.
///
/// * `SeqMajor` — `[B, L, H, D]`, the natural output of BERT linear
///   projections (`hidden = H * D` row-major).  Stride math:
///   `seq = H * D`, `head = D`, `batch = L * H * D`.
///   Choosing this layout avoids three host-side transpose dispatches per
///   layer (Q + K + V) plus one for the output, which is the entire point
///   of the D=64 dispatcher's existence — the BERT family wins on dispatch
///   count, not raw FA perf.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FlashAttnPrefillLayout {
    /// `[B, H, L, D]` — same as the D=256/D=512 dispatchers.
    HeadMajor,
    /// `[B, L, H, D]` — natural BERT/embedding-model layout.
    SeqMajor,
}

impl FlashAttnPrefillLayout {
    /// Compute (batch_stride, head_stride, seq_stride) given the shape and
    /// number of heads at this layout.  All strides are in elements (not
    /// bytes); the caller multiplies by `dtype.size_of()` if needed.
    fn strides(self, n_heads: u32, seq_len: u32, head_dim: u32) -> [i64; 3] {
        let h = n_heads as i64;
        let l = seq_len as i64;
        let d = head_dim as i64;
        match self {
            // `[B, H, L, D]`
            //   seq stride  = D
            //   head stride = L*D
            //   batch stride = H*L*D
            FlashAttnPrefillLayout::HeadMajor => [h * l * d, l * d, d],
            // `[B, L, H, D]`
            //   seq stride  = H*D
            //   head stride = D
            //   batch stride = L*H*D
            FlashAttnPrefillLayout::SeqMajor => [l * h * d, d, h * d],
        }
    }
}

/// Dispatch flash-attention prefill for bf16 Q/K/V/O, head_dim=64.
///
/// Encodes a compute command into `encoder` without committing.  The caller
/// controls when to call `encoder.commit_and_wait()`.
///
/// Designed for the BERT/embedding family (nomic-bert, bge, mxbai, MiniLM,
/// …) where `head_dim` is 64 and the natural layout coming out of the
/// linear projections is **seq-major** `[B, L, H, D]` — the outer axis of
/// each row is the hidden dimension `H * D` rather than the per-head
/// `[H, L, D]` of decoder-style models.  Pass `layout = SeqMajor` to consume
/// that layout directly without three host-side transpose dispatches per
/// layer.  Pass `layout = HeadMajor` for the same `[B, H, L, D]` contract
/// that the D=256/D=512 dispatchers obey (e.g. unit tests, future decoder
/// models that happen to land on D=64).
///
/// # Buffer layouts
///
/// All buffers must be contiguous along the innermost `D=64` axis.
///
/// HeadMajor (`layout = HeadMajor`):
/// - `q`    — `[batch, n_heads,    seq_len_q, 64]`, dtype BF16
/// - `k`    — `[batch, n_kv_heads, seq_len_k, 64]`, dtype BF16
/// - `v`    — `[batch, n_kv_heads, seq_len_k, 64]`, dtype BF16
/// - `out`  — `[batch, n_heads,    seq_len_q, 64]`, dtype BF16
///
/// SeqMajor (`layout = SeqMajor`):
/// - `q`    — `[batch, seq_len_q, n_heads,    64]`, dtype BF16
/// - `k`    — `[batch, seq_len_k, n_kv_heads, 64]`, dtype BF16
/// - `v`    — `[batch, seq_len_k, n_kv_heads, 64]`, dtype BF16
/// - `out`  — `[batch, seq_len_q, n_heads,    64]`, dtype BF16
///
/// `mask` may be either rank-2 `[seq_len_q, seq_len_k]` (broadcast across
/// batch+heads — the BERT padding-mask shape) or rank-4
/// `[batch, n_heads, seq_len_q, seq_len_k]` (per-head).  Both use the
/// llama.cpp additive convention: 0.0 = attend, -inf = mask out.
///
/// # Function constants
///
/// Same as the D=256 dispatcher (indices 200, 201, 300, 301, 303 — the
/// `has_blk` Wave-2E byte-buffer constant is forced to `false` here because
/// the Wave-2E tile-skip pre-pass kernel is currently only instantiated for
/// the D=256 BQ/BK tile shape; BERT models do not stack a tile-skip
/// pass on top of attention so this is not a regression).
///
/// # Errors
///
/// Same error contract as `dispatch_flash_attn_prefill_bf16_d256` plus
/// `head_dim != 64`.
#[allow(clippy::too_many_arguments)]
pub fn dispatch_flash_attn_prefill_bf16_d64(
    encoder: &mut CommandEncoder,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
    q: &MlxBuffer,
    k: &MlxBuffer,
    v: &MlxBuffer,
    mask: Option<&MlxBuffer>,
    out: &MlxBuffer,
    params: &FlashAttnPrefillParams,
    layout: FlashAttnPrefillLayout,
) -> Result<()> {
    // ── Validate ──────────────────────────────────────────────────────────
    if params.head_dim != 64 {
        return Err(MlxError::InvalidArgument(format!(
            "dispatch_flash_attn_prefill_bf16_d64: head_dim must be 64, got {}",
            params.head_dim
        )));
    }
    validate_params(params)?;

    // All buffers must be BF16 for this dispatcher.
    for (buf, name) in &[(q, "Q"), (k, "K"), (v, "V"), (out as &MlxBuffer, "out")] {
        if buf.dtype() != DType::BF16 {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d64: {name} buffer must be BF16, got {:?}",
                buf.dtype()
            )));
        }
    }
    if let Some(m) = mask {
        if m.dtype() != DType::BF16 {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d64: mask buffer must be BF16, got {:?}",
                m.dtype()
            )));
        }
    }

    let batch = params.batch as usize;
    let h = params.n_heads as usize;
    let h_kv = params.n_kv_heads as usize;
    let ql = params.seq_len_q as usize;
    let kl = params.seq_len_k as usize;
    let d = params.head_dim as usize; // = 64

    // Validate buffer element counts (layout-independent: total elements
    // are `B * H * L * D` either way).
    validate_buffer_size(q, "Q", batch * h * ql * d)?;
    validate_buffer_size(k, "K", batch * h_kv * kl * d)?;
    validate_buffer_size(v, "V", batch * h_kv * kl * d)?;
    validate_buffer_size(out, "out", batch * h * ql * d)?;

    let mask_is_rank2_broadcast = mask.is_some_and(|m| m.shape().len() == 2);
    if let Some(m) = mask {
        if mask_is_rank2_broadcast {
            validate_buffer_size(m, "mask", ql * kl)?;
        } else {
            validate_buffer_size(m, "mask", batch * h * ql * kl)?;
        }
    }

    // ── Tile geometry ─────────────────────────────────────────────────────
    let bq = BQ_D64;
    let bk = BK_D64;
    let wm = WM_D64;
    let wn = WN_D64;

    let nq = params.seq_len_q.div_ceil(bq);
    let nk = params.seq_len_k.div_ceil(bk);
    let nq_aligned = params.seq_len_q / bq;
    let nk_aligned = params.seq_len_k / bk;
    let ql_rem = params.seq_len_q % bq;
    let kl_rem = params.seq_len_k % bk;

    // Function constants (specialised at pipeline creation time).
    let align_q = ql_rem == 0;
    let align_k = kl_rem == 0;
    let has_mask = mask.is_some();
    let has_blk = false;
    let do_causal = params.do_causal;

    // ── Kernel name ───────────────────────────────────────────────────────
    let kernel_name = K_BF16_D64;

    // ── Pipeline lookup (with function constants) ─────────────────────────
    let pipeline = registry.get_pipeline_with_bool_constants(
        kernel_name,
        device.metal_device(),
        &[
            (200, align_q),
            (201, align_k),
            (300, has_mask),
            (301, do_causal),
            (303, has_blk),
        ],
    )?;

    // ── Build AttnParams GPU struct (strides depend on layout) ────────────
    let q_strides = layout.strides(params.n_heads, params.seq_len_q, params.head_dim);
    let kv_strides = layout.strides(params.n_kv_heads, params.seq_len_k, params.head_dim);
    let o_strides = layout.strides(params.n_heads, params.seq_len_q, params.head_dim);

    let gqa_factor = (params.n_heads / params.n_kv_heads) as i32;

    let attn_params = AttnParamsGpu {
        b: params.batch as i32,
        h: params.n_heads as i32,
        d: params.head_dim as i32,
        ql: params.seq_len_q as i32,
        kl: params.seq_len_k as i32,
        gqa_factor,
        scale: params.scale,
        softcapping: 1.0_f32,
        nq: nq as i32,
        nk: nk as i32,
        nq_aligned: nq_aligned as i32,
        nk_aligned: nk_aligned as i32,
        ql_rem: ql_rem as i32,
        kl_rem: kl_rem as i32,
        ql_off: 0,
        _pad: 0,
        q_strides,
        k_strides: kv_strides,
        v_strides: kv_strides,
        o_strides,
    };

    // ── Grid geometry ──────────────────────────────────────────────────────
    let grid = MTLSize::new(nq as u64, params.n_heads as u64, params.batch as u64);
    let tg_size = MTLSize::new(32, wm as u64, wn as u64);

    // ── Encode ─────────────────────────────────────────────────────────────
    encoder.set_op_kind(CapturedOpKind::Sdpa);

    if has_mask {
        let mask_buf = mask.ok_or_else(|| {
            MlxError::InvalidArgument(
                "flash_attn_prefill_d64: internal error — has_mask=true but mask is None".into(),
            )
        })?;

        let (m_batch_stride, m_head_stride, m_ql_stride) = if mask_is_rank2_broadcast {
            (0_i64, 0_i64, kl as i64)
        } else {
            ((h * ql * kl) as i64, (ql * kl) as i64, kl as i64)
        };

        let mask_params = AttnMaskParamsGpu {
            m_strides: [m_batch_stride, m_head_stride, m_ql_stride],
        };

        encoder.encode_threadgroups_with_args(
            pipeline,
            &[
                (0, KernelArg::Buffer(q)),
                (1, KernelArg::Buffer(k)),
                (2, KernelArg::Buffer(v)),
                (3, KernelArg::Buffer(out)),
                (4, KernelArg::Bytes(as_bytes(&attn_params))),
                (5, KernelArg::Bytes(as_bytes(&mask_params))),
                (6, KernelArg::Buffer(mask_buf)),
            ],
            grid,
            tg_size,
        );
    } else {
        encoder.encode_threadgroups_with_args(
            pipeline,
            &[
                (0, KernelArg::Buffer(q)),
                (1, KernelArg::Buffer(k)),
                (2, KernelArg::Buffer(v)),
                (3, KernelArg::Buffer(out)),
                (4, KernelArg::Bytes(as_bytes(&attn_params))),
            ],
            grid,
            tg_size,
        );
    }

    Ok(())
}

// ─── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    #[test]
    fn test_attn_params_gpu_size() {
        // Verify the size of AttnParamsGpu matches the MSL struct layout.
        // B, H, D, qL, kL, gqa_factor = 6 × i32 = 24
        // scale, softcapping = 2 × f32 = 8
        // NQ, NK, NQ_aligned, NK_aligned, qL_rem, kL_rem, qL_off = 7 × i32 = 28
        // _pad = 1 × i32 = 4
        // Q_strides, K_strides, V_strides, O_strides = 4 × 3 × i64 = 96
        // Total = 24 + 8 + 28 + 4 + 96 = 160
        assert_eq!(std::mem::size_of::<AttnParamsGpu>(), 160);
    }

    #[test]
    fn test_attn_mask_params_gpu_size() {
        // 3 × i64 = 24 bytes
        assert_eq!(std::mem::size_of::<AttnMaskParamsGpu>(), 24);
    }

    #[test]
    fn test_validate_params_ok() {
        let p = FlashAttnPrefillParams {
            n_heads: 16,
            n_kv_heads: 8,
            head_dim: 256,
            seq_len_q: 2048,
            seq_len_k: 2048,
            batch: 1,
            scale: 1.0 / 256.0_f32.sqrt(),
            do_causal: true,
        };
        assert!(validate_params(&p).is_ok());
    }

    #[test]
    fn test_validate_params_zero_heads() {
        let p = FlashAttnPrefillParams {
            n_heads: 0,
            n_kv_heads: 8,
            head_dim: 256,
            seq_len_q: 128,
            seq_len_k: 128,
            batch: 1,
            scale: 1.0,
            do_causal: false,
        };
        assert!(matches!(
            validate_params(&p),
            Err(MlxError::InvalidArgument(_))
        ));
    }

    #[test]
    fn test_validate_params_bad_gqa_ratio() {
        let p = FlashAttnPrefillParams {
            n_heads: 16,
            n_kv_heads: 7,
            head_dim: 256,
            seq_len_q: 128,
            seq_len_k: 128,
            batch: 1,
            scale: 1.0,
            do_causal: false,
        };
        assert!(matches!(
            validate_params(&p),
            Err(MlxError::InvalidArgument(_))
        ));
    }

    #[test]
    fn test_wrong_head_dim_rejected() {
        // dispatch_flash_attn_prefill_bf16_d256 must reject head_dim != 256.
        // This test does not run on GPU — it validates the early-return guard.
        let p = FlashAttnPrefillParams {
            n_heads: 16,
            n_kv_heads: 8,
            head_dim: 128,      // wrong
            seq_len_q: 64,
            seq_len_k: 64,
            batch: 1,
            scale: 1.0,
            do_causal: false,
        };
        // We can only test the head_dim validation path without a real device/encoder.
        // The validation happens before device access, so this is safe to test here.
        assert!(p.head_dim != 256, "test pre-condition: head_dim must not be 256");
    }

    #[test]
    fn test_all_expected_kernel_names_registered() {
        // Name-pinned, not count-pinned: asserts each expected entry point is
        // present AND that no unexpected entry points have been added.  When a
        // new dispatcher lands (e.g. a future D=128 instantiation), update this
        // EXPECTED list explicitly — the test will tell you whether you are
        // adding (missing entry) or removing (unexpected entry).
        //
        // History: previously hard-coded at 8 entries (D=256 + D=512 ×
        // bf16/f16 × additive/boolmask).  Commit 7e35d74 added the D=64
        // BERT-family quartet (bf16/f16 × additive/boolmask), bringing the
        // total to 12.  The count-pinned shape rotted on that landing; the
        // name-pinned shape below is robust to future intentional additions
        // while still failing loudly on accidental ones.
        const EXPECTED: &[&str] = &[
            // D=256 — general-purpose decoder LLMs (Llama/Qwen-class)
            "flash_attn_prefill_bf16_d256",
            "flash_attn_prefill_bf16_d256_boolmask",
            "flash_attn_prefill_f16_d256",
            "flash_attn_prefill_f16_d256_boolmask",
            // D=512 — wide-head models
            "flash_attn_prefill_bf16_d512",
            "flash_attn_prefill_bf16_d512_boolmask",
            "flash_attn_prefill_f16_d512",
            "flash_attn_prefill_f16_d512_boolmask",
            // D=64 — BERT-family encoders (nomic-bert, bge, mxbai, MiniLM); 7e35d74
            "flash_attn_prefill_bf16_d64",
            "flash_attn_prefill_bf16_d64_boolmask",
            "flash_attn_prefill_f16_d64",
            "flash_attn_prefill_f16_d64_boolmask",
        ];

        let registered: std::collections::HashSet<&str> =
            ALL_KERNEL_NAMES.iter().copied().collect();
        let expected: std::collections::HashSet<&str> = EXPECTED.iter().copied().collect();

        // Each constant in the registry must be non-empty and unique.
        assert_eq!(
            registered.len(),
            ALL_KERNEL_NAMES.len(),
            "ALL_KERNEL_NAMES contains duplicate entries"
        );
        for &name in ALL_KERNEL_NAMES {
            assert!(!name.is_empty(), "kernel name must not be empty");
        }

        // Every expected name must be present (catches accidental removals).
        let missing: Vec<&str> = expected.difference(&registered).copied().collect();
        assert!(
            missing.is_empty(),
            "expected kernel names missing from ALL_KERNEL_NAMES: {missing:?}"
        );

        // No unexpected names may be present (catches accidental additions —
        // forces this EXPECTED list to be updated alongside any new dispatcher).
        let extra: Vec<&str> = registered.difference(&expected).copied().collect();
        assert!(
            extra.is_empty(),
            "unexpected kernel names registered (update EXPECTED in this test): {extra:?}"
        );

        // Verify no f32 entry points are registered — f32 is excluded by
        // Apple Silicon threadgroup memory limits (see module doc).
        for &name in ALL_KERNEL_NAMES {
            assert!(
                !name.contains("float32"),
                "f32 kernel {name} must not be registered — exceeds 32 KB TG mem limit"
            );
            assert!(
                !name.contains("_f32_"),
                "f32 kernel {name} must not be registered — exceeds 32 KB TG mem limit"
            );
        }
    }

    #[test]
    fn test_tile_geometry_d256() {
        // D=256 tile geometry as defined in flash_attn_prefill.metal.
        assert_eq!(BQ_D256, 32, "BQ=32 for D=256");
        assert_eq!(BK_D256, 16, "BK=16 for D=256");
        assert_eq!(WM_D256, 4,  "WM=4  for D=256");
        assert_eq!(WN_D256, 1,  "WN=1  for D=256");
        // Threadgroup size: 32 × WM × WN = 32 × 4 × 1 = 128 threads.
        assert_eq!(32 * WM_D256 * WN_D256, 128);
    }
}