mlx-native 0.4.3

//! Flash-attention-style tiled prefill kernel — host dispatch.
//!
//! mlx-native's batched-prefill SDPA kernel, the prefill counterpart to
//! `flash_attn_vec` (which handles the seq_len=1 decode case).  Implements
//! online softmax + simdgroup MMA tiling on Apple GPU.
//!
//! ## Kernel variants registered
//!
//! Eight entry points are registered, all backed by the single
//! `flash_attn_prefill.metal` shader source:
//!
//! ### D=256 (BQ=32, BK=16, WM=4, WN=1 — 128 threads/threadgroup)
//!
//! | Kernel name | I/O dtype | Mask kind |
//! |---|---|---|
//! | `flash_attn_prefill_bf16_d256`           | bf16 | bf16 additive (log-domain) |
//! | `flash_attn_prefill_bf16_d256_boolmask`  | bf16 | bool (`is_attended`) |
//! | `flash_attn_prefill_f16_d256`            | f16  | f16 additive |
//! | `flash_attn_prefill_f16_d256_boolmask`   | f16  | bool |
//!
//! ### D=512 (BQ=8, BK=8, WM=1, WN=1 — 32 threads/threadgroup, 1 simdgroup)
//!
//! | Kernel name | I/O dtype | Mask kind |
//! |---|---|---|
//! | `flash_attn_prefill_bf16_d512`           | bf16 | bf16 additive |
//! | `flash_attn_prefill_bf16_d512_boolmask`  | bf16 | bool |
//! | `flash_attn_prefill_f16_d512`            | f16  | f16 additive |
//! | `flash_attn_prefill_f16_d512_boolmask`   | f16  | bool |
//!
//! ### f32 is NOT instantiated at either D
//!
//! The f32 Qs threadgroup tile alone is `BQ * BD * 4` bytes — at D=256 this
//! is 32 KB exactly, the Apple Silicon `MTLDevice.maxThreadgroupMemoryLength`
//! hard limit, before KV_smem or scratch.  Verified empirically on M5 Max:
//! f32 D=256 requires ~53.7 KB and fails at library compile.  bf16 and f16
//! halve the tile footprint (~29 KB) and fit within the limit.  f32
//! correctness is verified at the CPU reference layer in
//! `tests/test_flash_attn_prefill.rs`.  D=512 f32 is excluded for the same
//! reason.  See `ADR-011-phase1-port-source-decision.md` §3 for the full
//! threadgroup-memory analysis.
//!
//! ## Function constants
//!
//! The kernel declares four Metal function constants that must be specialised
//! at pipeline creation time (not at dispatch time):
//!
//! - Index 200: `align_Q` (bool) — true when `qL % BQ == 0`
//! - Index 201: `align_K` (bool) — true when `kL % BK == 0`
//! - Index 300: `has_mask` (bool) — true when a mask buffer is bound
//! - Index 301: `do_causal` (bool) — true for in-kernel causal masking
//!
//! These are plumbed via [`KernelRegistry::get_pipeline_with_bool_constants`],
//! which caches compiled pipelines keyed by `(kernel_name, align_Q, align_K,
//! has_mask, do_causal)`.  Pipeline compilation is amortised: the slow path
//! runs only once per unique `(name, booleans)` combination.
//!
//! ## Buffer layout (indices match the MSL kernel)
//!
//! - `buffer(0)` — Q `[B, H,    qL, D]`  device, contiguous inner dim
//! - `buffer(1)` — K `[B, H_kv, kL, D]`  device, contiguous inner dim
//! - `buffer(2)` — V `[B, H_kv, kL, D]`  device, contiguous inner dim
//! - `buffer(3)` — O `[B, H,    qL, D]`  device, written by kernel
//! - `buffer(4)` — `AttnParams` constant buffer (this module's [`AttnParamsGpu`])
//! - `buffer(5)` — `AttnMaskParams` constant buffer (only when `has_mask=true`)
//! - `buffer(6)` — mask data buffer (only when `has_mask=true`)
//!
//! ## Grid geometry
//!
//! - Threadgroups: `(ceil(qL / BQ), H, B)`
//! - Threads per threadgroup: `(32, WM, WN)`
//! - D=256: 128 threads (4 simdgroups × 32 lanes).
//! - D=512:  32 threads (1 simdgroup  × 32 lanes).
//!
//! ## Scale convention
//!
//! Pass `scale = 1.0 / sqrt(head_dim)`.  The kernel multiplies internally by
//! `log2(e) ≈ 1.44269504` and uses `fast::exp2` throughout — so the host
//! MUST NOT pre-multiply by `log2(e)`.
//!
//! ## Mask-sentinel contract (llama.cpp convention)
//!
//! The additive mask buffer (bf16/f16 for the additive dispatchers) uses
//! the llama.cpp CPU-side convention: **masked positions = `-INFINITY`**
//! (IEEE-754 f32 `0xFF800000`, cast to the I/O dtype — both `half(-inf)`
//! and `bfloat16(-inf)` have a real `-inf` encoding that the kernel
//! consumes correctly).  Attended positions = `0.0`.
//!
//! This matches llama.cpp's mask-authoring sites at
//! `llama-graph.cpp:421, 436, 557` and `llama-kv-cache.cpp:1572`, where
//! the CPU writes raw `-INFINITY` and the flash-attn cast to f16 saturates
//! to f16-`-INFINITY`.
//!
//! The kernel does NOT require masks to use `-FLT_MAX/2` or any other
//! finite "large negative" sentinel.  The `-FLT_MAX/2` value is the
//! kernel-internal `M` (running row-max) initialiser — a finite sentinel
//! that absorbs `-inf` scores via `simd_max` without ever letting `M`
//! become `-inf`, which in turn lets every `exp(score - M)` evaluate as
//! `exp(-inf) = 0.0` (IEEE-754 exact) rather than `exp(-inf - -inf) =
//! exp(NaN) = NaN`.  See ADR-011-phase2-port-sentinel.md §1.
//!
//! Callers may pass arbitrary finite additive biases in the mask for
//! non-masked positions (e.g. ALiBi, relative-position biases); only
//! "fully block this K position" requires `-inf`.
//!
//! ## See also
//!
//! - Kernel: `/opt/mlx-native/src/shaders/flash_attn_prefill.metal`
//! - ADR-011: `/opt/hf2q/docs/ADR-011-flash-attn-prefill.md`
//! - ADR-011 phase 2 sentinel port: `/opt/hf2q/docs/ADR-011-phase2-port-sentinel.md`

use metal::MTLSize;

use crate::buffer::MlxBuffer;
use crate::device::MlxDevice;
use crate::encoder::{CapturedOpKind, CommandEncoder, KernelArg, as_bytes};
use crate::error::{MlxError, Result};
use crate::kernel_registry::KernelRegistry;
use crate::DType;

// ─── Shader source ───────────────────────────────────────────────────────────

/// MSL source for the flash-attention prefill kernel (embedded at compile time).
pub static FLASH_ATTN_PREFILL_SHADER_SOURCE: &str =
    include_str!("../shaders/flash_attn_prefill.metal");

// ─── All 8 kernel entry-point names ──────────────────────────────────────────

/// D=256, bf16 I/O, bf16 additive mask.
const K_BF16_D256: &str = "flash_attn_prefill_bf16_d256";
/// D=256, bf16 I/O, bool (`is_attended`) mask.
const K_BF16_D256_BOOLMASK: &str = "flash_attn_prefill_bf16_d256_boolmask";
/// D=256, f16 I/O, f16 additive mask.
const K_F16_D256: &str = "flash_attn_prefill_f16_d256";
/// D=256, f16 I/O, bool mask.
const K_F16_D256_BOOLMASK: &str = "flash_attn_prefill_f16_d256_boolmask";
/// D=512, bf16 I/O, bf16 additive mask.
const K_BF16_D512: &str = "flash_attn_prefill_bf16_d512";
/// D=512, bf16 I/O, bool mask.
const K_BF16_D512_BOOLMASK: &str = "flash_attn_prefill_bf16_d512_boolmask";
/// D=512, f16 I/O, f16 additive mask.
const K_F16_D512: &str = "flash_attn_prefill_f16_d512";
/// D=512, f16 I/O, bool mask.
const K_F16_D512_BOOLMASK: &str = "flash_attn_prefill_f16_d512_boolmask";

/// All 8 kernel entry-point names exported by `flash_attn_prefill.metal`.
///
/// Registering all 8 against the single shader source costs nothing at
/// registration time (source is a static `&str`) and ensures additional
/// dispatchers (f16 paths, D=512 exposure) can be added later without
/// touching registration here.
const ALL_KERNEL_NAMES: &[&str] = &[
    K_BF16_D256,
    K_BF16_D256_BOOLMASK,
    K_F16_D256,
    K_F16_D256_BOOLMASK,
    K_BF16_D512,
    K_BF16_D512_BOOLMASK,
    K_F16_D512,
    K_F16_D512_BOOLMASK,
];

// ─── Registration ─────────────────────────────────────────────────────────────

/// Register all flash-attention prefill kernel entry points with the registry.
///
/// Maps all 8 entry-point names to the single `flash_attn_prefill.metal`
/// source.  This must be called before any dispatch to these kernels.
///
/// # Design note
///
/// `KernelRegistry` compiles one Metal library per kernel name.  All 8 names
/// point at the same source text, so the Metal compiler sees the same ~1 500-line
/// source each time — compilation is amortised in `KernelRegistry::get_pipeline`
/// (first call per name triggers compilation; subsequent calls return the cached
/// pipeline).  Registering all 8 here rather than only the Phase 1a subset
/// means Phase 2/4 dispatcher functions can be added without touching this file.
pub fn register(registry: &mut KernelRegistry) {
    for &name in ALL_KERNEL_NAMES {
        registry.register_source(name, FLASH_ATTN_PREFILL_SHADER_SOURCE);
    }
}

// ─── MSL struct mirrors ───────────────────────────────────────────────────────

/// Rust mirror of the MSL `AttnParams` struct.
///
/// Field order and types match the MSL definition exactly.
/// MSL source: `flash_attn_prefill.metal` — see the `AttnParams` struct
/// definition in the kernel source for the field-by-field reference.
///
/// # Layout
///
/// All `int` fields are 32-bit (i32 in Rust).  The `int64_t` stride arrays are
/// 64-bit (i64 in Rust).  The compiler inserts natural alignment padding:
///
/// ```text
/// Offset  0:  B            (i32,  4 bytes)
/// Offset  4:  H            (i32,  4 bytes)
/// Offset  8:  D            (i32,  4 bytes)
/// Offset 12:  qL           (i32,  4 bytes)
/// Offset 16:  kL           (i32,  4 bytes)
/// Offset 20:  gqa_factor   (i32,  4 bytes)
/// Offset 24:  scale        (f32,  4 bytes)
/// Offset 28:  softcapping  (f32,  4 bytes)
/// Offset 32:  NQ           (i32,  4 bytes)
/// Offset 36:  NK           (i32,  4 bytes)
/// Offset 40:  NQ_aligned   (i32,  4 bytes)
/// Offset 44:  NK_aligned   (i32,  4 bytes)
/// Offset 48:  qL_rem       (i32,  4 bytes)
/// Offset 52:  kL_rem       (i32,  4 bytes)
/// Offset 56:  qL_off       (i32,  4 bytes)
/// Offset 60:  _pad         (4 bytes — alignment before i64 array)
/// Offset 64:  Q_strides[3] (3 × i64, 24 bytes)
/// Offset 88:  K_strides[3] (3 × i64, 24 bytes)
/// Offset 112: V_strides[3] (3 × i64, 24 bytes)
/// Offset 136: O_strides[3] (3 × i64, 24 bytes)
/// Total: 160 bytes
/// ```
///
/// `bytemuck::Pod` / `bytemuck::Zeroable` are derived — the struct must have
/// no uninitialized padding bytes.  The explicit `_pad` field makes the padding
/// concrete so Pod can be derived safely.
///
/// `softcapping` is always set to `1.0` (disabled).  The `attention<>` kernel
/// body does not read it; the field exists for ABI parity with attention
/// implementations that thread softcapping through the same param block
/// (e.g. Gemma-style logit softcap), so we don't have to redo the layout
/// when that work lands.
#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
pub struct AttnParamsGpu {
    /// Batch size.
    pub b: i32,
    /// Number of query heads.
    pub h: i32,
    /// Head dimension (D).
    pub d: i32,
    /// Query sequence length.
    pub ql: i32,
    /// Key/value sequence length.
    pub kl: i32,
    /// Group-query attention factor: H / H_kv.
    pub gqa_factor: i32,
    /// Attention scale (= 1.0 / sqrt(head_dim); kernel multiples by log2(e)).
    pub scale: f32,
    /// Softcapping value — always 1.0 (disabled) for standard SDPA.
    pub softcapping: f32,
    /// Number of Q tiles: ceil(qL / BQ).
    pub nq: i32,
    /// Number of KV tiles: ceil(kL / BK).
    pub nk: i32,
    /// Number of full (aligned) Q tiles: qL / BQ.
    pub nq_aligned: i32,
    /// Number of full (aligned) KV tiles: kL / BK.
    pub nk_aligned: i32,
    /// Remainder elements in the last Q tile: qL % BQ (0 if aligned).
    pub ql_rem: i32,
    /// Remainder elements in the last KV tile: kL % BK (0 if aligned).
    pub kl_rem: i32,
    /// Query sequence start offset (0 for standard prefill).
    pub ql_off: i32,
    /// Explicit padding to align the subsequent i64 arrays to 8-byte boundary.
    pub _pad: i32,
    /// Query strides: (batch stride, head stride, seq stride).  Inner dim = 1.
    pub q_strides: [i64; 3],
    /// Key strides: (batch stride, head stride, seq stride).  Inner dim = 1.
    pub k_strides: [i64; 3],
    /// Value strides: (batch stride, head stride, seq stride).  Inner dim = 1.
    pub v_strides: [i64; 3],
    /// Output strides: (batch stride, head stride, seq stride).  Inner dim = 1.
    pub o_strides: [i64; 3],
}

/// Rust mirror of the MSL `AttnMaskParams` struct.
///
/// MSL source: `flash_attn_prefill.metal` — see the `AttnMaskParams` struct
/// definition in the kernel source.
///
/// Contains the mask buffer strides.  Only sent to the kernel when
/// `has_mask = true` (buffer index 5).
#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
pub struct AttnMaskParamsGpu {
    /// Mask strides: (batch stride, head stride, qL stride).  Inner dim = 1.
    pub m_strides: [i64; 3],
}

// ─── Public Rust-side parameter struct ───────────────────────────────────────

/// Host-side parameters for the flash-attention prefill dispatcher.
///
/// Used only by the Rust dispatcher; does not map 1:1 to the GPU struct —
/// the GPU struct is computed from these fields inside the dispatcher.
#[derive(Debug, Clone, Copy)]
pub struct FlashAttnPrefillParams {
    /// Number of query attention heads.
    pub n_heads: u32,
    /// Number of key/value attention heads (GQA: may be < n_heads).
    pub n_kv_heads: u32,
    /// Head dimension.  Must be 256 for `dispatch_flash_attn_prefill_bf16_d256`.
    pub head_dim: u32,
    /// Query sequence length.
    pub seq_len_q: u32,
    /// Key/value sequence length.
    pub seq_len_k: u32,
    /// Batch size.
    pub batch: u32,
    /// Attention scale.  Typically `1.0 / sqrt(head_dim)`.
    ///
    /// The kernel internally multiplies this by `log2(e) = 1.44269504089`
    /// before applying it to Q.  The host MUST NOT pre-multiply by log2(e).
    pub scale: f32,
    /// Whether to apply in-kernel causal masking (`do_causal` function constant).
    ///
    /// When true, positions where `row_pos < col_pos` receive a score of -inf
    /// before softmax.  This can be combined with an external mask buffer.
    pub do_causal: bool,
}

// ─── Tile geometry constants (D=256) ─────────────────────────────────────────

/// Q tile size for D=256: BQ=32.
const BQ_D256: u32 = 32;

/// KV tile size for D=256: BK=16.
const BK_D256: u32 = 16;

/// Simdgroups along Q dimension for D=256: WM=4.
const WM_D256: u32 = 4;

/// Simdgroups along K dimension for D=256: WN=1.
const WN_D256: u32 = 1;

// ─── Validation ───────────────────────────────────────────────────────────────

fn validate_params(params: &FlashAttnPrefillParams) -> Result<()> {
    if params.n_heads == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: n_heads must be > 0".into(),
        ));
    }
    if params.n_kv_heads == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: n_kv_heads must be > 0".into(),
        ));
    }
    if params.n_heads % params.n_kv_heads != 0 {
        return Err(MlxError::InvalidArgument(format!(
            "flash_attn_prefill: n_heads ({}) must be divisible by n_kv_heads ({})",
            params.n_heads, params.n_kv_heads
        )));
    }
    if params.seq_len_q == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: seq_len_q must be > 0".into(),
        ));
    }
    if params.seq_len_k == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: seq_len_k must be > 0".into(),
        ));
    }
    if params.batch == 0 {
        return Err(MlxError::InvalidArgument(
            "flash_attn_prefill: batch must be > 0".into(),
        ));
    }
    Ok(())
}

fn validate_buffer_size(buf: &MlxBuffer, name: &str, expected_elements: usize) -> Result<()> {
    let expected_bytes = expected_elements * buf.dtype().size_of();
    if buf.byte_len() < expected_bytes {
        return Err(MlxError::InvalidArgument(format!(
            "flash_attn_prefill: {name} buffer too small: expected at least \
             {expected_bytes} bytes, got {}",
            buf.byte_len()
        )));
    }
    Ok(())
}

// ─── bf16 D=256 dispatcher ───────────────────────────────────────────────────

/// Dispatch flash-attention prefill for bf16 Q/K/V/O, head_dim=256.
///
/// Encodes a compute command into `encoder` without committing.  The caller
/// controls when to call `encoder.commit_and_wait()`.
///
/// # Why bf16 and not f32
///
/// The f32 Qs threadgroup tile (BQ×BD×4 = 32 KB at D=256) consumes the entire
/// Apple Silicon threadgroup-memory budget before KV_smem, scratch, or any
/// padding — so f32 D=256 is not instantiated (see module doc).  bf16/f16
/// halve the tile footprint; the MMA accumulator is still f32 internally
/// (the kernel's `T_accum` template parameter is `float` for every
/// instantiation — see `flash_attn_prefill.metal:~1504`), so prefill output
/// precision is `bf16 × bf16 → f32 → bf16` — bf16-bounded at the store, not
/// at the accumulator.
///
/// # Buffer layouts
///
/// All buffers must be contiguous (stride-1 along the innermost / head_dim
/// dimension):
///
/// - `q`    — `[batch, n_heads,    seq_len_q, 256]`, dtype BF16
/// - `k`    — `[batch, n_kv_heads, seq_len_k, 256]`, dtype BF16
/// - `v`    — `[batch, n_kv_heads, seq_len_k, 256]`, dtype BF16
/// - `mask` — `[batch, n_heads, seq_len_q, seq_len_k]`, dtype BF16
///   (additive, log-scale: 0.0 = attend, -inf = mask out — llama.cpp
///   convention, see module doc "Mask-sentinel contract"), or `None`
/// - `out`  — `[batch, n_heads,    seq_len_q, 256]`, dtype BF16 (output)
///
/// # Function constants
///
/// `align_Q`, `align_K` are computed from the sequence lengths and tile sizes.
/// `has_mask` reflects whether `mask` is `Some(_)`.
/// `do_causal` is taken from `params.do_causal`.
///
/// A distinct Metal pipeline is compiled for each unique combination of these
/// four booleans and cached in `registry`.
///
/// # Errors
///
/// Returns `MlxError::InvalidArgument` for:
/// - `head_dim != 256`
/// - Zero or inconsistent shape fields
/// - Buffer too small for the declared shape
/// - `n_heads` not divisible by `n_kv_heads`
/// - Any buffer dtype != BF16
///
/// Returns `MlxError::ShaderCompilationError` if the Metal pipeline
/// compilation fails.
#[allow(clippy::too_many_arguments)]
pub fn dispatch_flash_attn_prefill_bf16_d256(
    encoder: &mut CommandEncoder,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
    q: &MlxBuffer,
    k: &MlxBuffer,
    v: &MlxBuffer,
    mask: Option<&MlxBuffer>,
    out: &mut MlxBuffer,
    params: &FlashAttnPrefillParams,
) -> Result<()> {
    // Delegate to the blk-aware dispatcher with blk=None.  Because
    // `has_blk` is a function constant (index 303), the compiled pipeline
    // with has_blk=false dead-codes every blk reference — this call has
    // zero runtime overhead compared with the pre-Wave-2E code path.
    dispatch_flash_attn_prefill_bf16_d256_with_blk(
        encoder, device, registry, q, k, v, mask, None, out, params,
    )
}

/// Dispatch flash-attention prefill for bf16 Q/K/V/O, head_dim=256, with an
/// optional Wave 2E tile-skip pre-pass byte buffer.
///
/// This is the blk-aware sibling of [`dispatch_flash_attn_prefill_bf16_d256`].
/// When `blk` is `Some(buf)`, the kernel reads a classification byte per
/// `(qtile, ktile)` and:
///
/// - On `blk[qt][kt] == 0`, skips the entire KV tile (no K/V load, no
///   Q·K^T, no mask-add, no softmax update).
/// - On `blk[qt][kt] == 2`, skips the mask-add (but still computes Q·K^T
///   and softmax normally).
/// - On `blk[qt][kt] == 1`, runs the standard path.
///
/// The `blk` buffer must be produced by
/// [`crate::ops::flash_attn_prefill_blk::dispatch_flash_attn_prefill_blk`]
/// with the SAME `(BQ=32, BK=16)` tile shape as this dispatcher, and the
/// caller MUST sequence the pre-pass dispatch BEFORE this one on the same
/// command encoder (or commit the pre-pass first).
///
/// # Correctness invariant
///
/// For any valid (mask, built blk), calling this function with `blk=None`
/// vs `blk=Some(built_blk)` MUST produce bit-exact identical output.  The
/// blk path is a pure skip optimisation.  Exercised by
/// `test_gpu_bf16_d256_with_blk_matches_no_blk` in
/// `tests/test_flash_attn_prefill.rs`.
///
/// # Buffer layout
///
/// All buffers as in [`dispatch_flash_attn_prefill_bf16_d256`], plus:
///
/// - `blk` — `[ceil(seq_len_q / 32), ceil(seq_len_k / 16)]`, dtype U8.
///   Required iff `Some(_)`; must have at least
///   `ceil(qL/32) * ceil(kL/16)` bytes.  When `None`, the dispatcher
///   compiles with `has_blk=false` and does not bind buffer index 7.
///
/// # Errors
///
/// Same as [`dispatch_flash_attn_prefill_bf16_d256`], plus:
/// - `blk` is `Some(b)` with `mask = None` (a blk without a mask is
///   meaningless — rejected to catch caller bugs).
/// - `blk` buffer undersized.
#[allow(clippy::too_many_arguments)]
pub fn dispatch_flash_attn_prefill_bf16_d256_with_blk(
    encoder: &mut CommandEncoder,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
    q: &MlxBuffer,
    k: &MlxBuffer,
    v: &MlxBuffer,
    mask: Option<&MlxBuffer>,
    blk: Option<&MlxBuffer>,
    out: &mut MlxBuffer,
    params: &FlashAttnPrefillParams,
) -> Result<()> {
    // ── Validate ──────────────────────────────────────────────────────────
    if params.head_dim != 256 {
        return Err(MlxError::InvalidArgument(format!(
            "dispatch_flash_attn_prefill_bf16_d256: head_dim must be 256, got {}",
            params.head_dim
        )));
    }
    // Reject the meaningless has_blk=true, has_mask=false case: a blk
    // buffer classifies the contents of the mask; without a mask the blk
    // bytes are computed against undefined data.  This is a caller-bug
    // defence, not a shader limitation.
    if blk.is_some() && mask.is_none() {
        return Err(MlxError::InvalidArgument(
            "dispatch_flash_attn_prefill_bf16_d256_with_blk: \
             blk requires mask (a blk without a mask is meaningless)"
                .into(),
        ));
    }
    validate_params(params)?;

    // All buffers must be BF16 for this dispatcher.
    for (buf, name) in &[(q, "Q"), (k, "K"), (v, "V"), (out as &MlxBuffer, "out")] {
        if buf.dtype() != DType::BF16 {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d256: {name} buffer must be BF16, \
                 got {:?}",
                buf.dtype()
            )));
        }
    }
    if let Some(m) = mask {
        if m.dtype() != DType::BF16 {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d256: mask buffer must be BF16, \
                 got {:?}",
                m.dtype()
            )));
        }
    }

    let batch = params.batch as usize;
    let h = params.n_heads as usize;
    let h_kv = params.n_kv_heads as usize;
    let ql = params.seq_len_q as usize;
    let kl = params.seq_len_k as usize;
    let d = params.head_dim as usize; // = 256

    // Validate buffer element counts.
    validate_buffer_size(q, "Q", batch * h * ql * d)?;
    validate_buffer_size(k, "K", batch * h_kv * kl * d)?;
    validate_buffer_size(v, "V", batch * h_kv * kl * d)?;
    validate_buffer_size(out, "out", batch * h * ql * d)?;
    // A rank-2 mask `[qL, kL]` is the Wave 2D broadcast layout: one plane is
    // shared across all batches and heads (stride-0 in the batch and head dims).
    // A rank-4 mask `[B, H, qL, kL]` is the per-head layout used by callers
    // that already have a fully-expanded mask (e.g. pre-Wave-2D code paths).
    let mask_is_rank2_broadcast = mask.is_some_and(|m| m.shape().len() == 2);
    if let Some(m) = mask {
        if mask_is_rank2_broadcast {
            validate_buffer_size(m, "mask", ql * kl)?;
        } else {
            validate_buffer_size(m, "mask", batch * h * ql * kl)?;
        }
    }

    // ── Tile geometry ─────────────────────────────────────────────────────
    let bq = BQ_D256;
    let bk = BK_D256;
    let wm = WM_D256;
    let wn = WN_D256;

    let nq = params.seq_len_q.div_ceil(bq);
    let nk = params.seq_len_k.div_ceil(bk);
    let nq_aligned = params.seq_len_q / bq;
    let nk_aligned = params.seq_len_k / bk;
    let ql_rem = params.seq_len_q % bq;
    let kl_rem = params.seq_len_k % bk;

    // Function constants (specialised at pipeline creation time, not dispatch).
    let align_q = ql_rem == 0;
    let align_k = kl_rem == 0;
    let has_mask = mask.is_some();
    let has_blk = blk.is_some();
    let do_causal = params.do_causal;

    // Validate blk buffer size when present.  Tile shape is fixed for D=256:
    // BQ=32, BK=16 — see ADR-011-phase2-port-tile-skip.md §5.1.
    if let Some(b) = blk {
        let nq_tiles = ql.div_ceil(BQ_D256 as usize);
        let nk_tiles = kl.div_ceil(BK_D256 as usize);
        let expected = nq_tiles * nk_tiles;
        if b.byte_len() < expected {
            return Err(MlxError::InvalidArgument(format!(
                "dispatch_flash_attn_prefill_bf16_d256_with_blk: blk buffer \
                 too small: expected at least {expected} bytes (NQ={nq_tiles}, \
                 NK={nk_tiles}), got {}",
                b.byte_len()
            )));
        }
    }

    // ── Kernel name ───────────────────────────────────────────────────────
    // bf16 I/O, bf16 additive mask (or no mask — uses same pipeline since
    // has_mask is a function constant, not part of the name).
    let kernel_name = K_BF16_D256;

    // ── Pipeline lookup (with function constants) ─────────────────────────
    //
    // Wave 2E adds function constant 303 (has_blk).  When has_blk=false
    // every blk reference in the shader is dead-coded, so pipelines with
    // the pre-Wave-2E constants {200, 201, 300, 301} + {303: false}
    // generate the same machine code as the pre-Wave-2E pipelines.  The
    // only cache-key overhead is one extra "b0" suffix, which is paid
    // once per (aligned, causal, masked) combo at compile time.
    let pipeline = registry.get_pipeline_with_bool_constants(
        kernel_name,
        device.metal_device(),
        &[
            (200, align_q),
            (201, align_k),
            (300, has_mask),
            (301, do_causal),
            (303, has_blk),
        ],
    )?;

    // ── Build AttnParams GPU struct ───────────────────────────────────────
    //
    // Strides for layout [B, H, L, D] where the innermost (D) stride is 1
    // (contiguous):
    //
    //   seq stride  = D
    //   head stride = L * D
    //   batch stride = H * L * D
    //
    // Q/O use (H, qL); K/V use (H_kv, kL).

    let q_seq_stride = d as i64;
    let q_head_stride = (ql * d) as i64;
    let q_batch_stride = (h * ql * d) as i64;

    let kv_seq_stride = d as i64;
    let kv_head_stride = (kl * d) as i64;
    let kv_batch_stride = (h_kv * kl * d) as i64;

    let gqa_factor = (params.n_heads / params.n_kv_heads) as i32;

    let attn_params = AttnParamsGpu {
        b: params.batch as i32,
        h: params.n_heads as i32,
        d: params.head_dim as i32,
        ql: params.seq_len_q as i32,
        kl: params.seq_len_k as i32,
        gqa_factor,
        scale: params.scale,
        softcapping: 1.0_f32,  // always disabled; see module doc
        nq: nq as i32,
        nk: nk as i32,
        nq_aligned: nq_aligned as i32,
        nk_aligned: nk_aligned as i32,
        ql_rem: ql_rem as i32,
        kl_rem: kl_rem as i32,
        ql_off: 0,             // standard prefill starts at offset 0
        _pad: 0,
        q_strides: [q_batch_stride, q_head_stride, q_seq_stride],
        k_strides: [kv_batch_stride, kv_head_stride, kv_seq_stride],
        v_strides: [kv_batch_stride, kv_head_stride, kv_seq_stride],
        o_strides: [q_batch_stride, q_head_stride, q_seq_stride],
    };

    // ── Grid geometry ──────────────────────────────────────────────────────
    //   grid = (NQ, H, B)  where NQ = ceil(qL / BQ)
    //   threadgroup = (32, WM, WN)
    let grid = MTLSize::new(nq as u64, params.n_heads as u64, params.batch as u64);
    let tg_size = MTLSize::new(32, wm as u64, wn as u64);

    // ── Encode ─────────────────────────────────────────────────────────────
    encoder.set_op_kind(CapturedOpKind::Sdpa);

    if has_mask {
        // SAFETY: has_mask is true iff mask.is_some() — set three lines above.
        // The Option is therefore guaranteed to be Some here.  We use
        // ok_or_else rather than expect/unwrap to satisfy the no-panic policy.
        let mask_buf = mask.ok_or_else(|| {
            MlxError::InvalidArgument(
                "flash_attn_prefill: internal error — has_mask=true but mask is None".into(),
            )
        })?;

        // Strides depend on mask rank:
        //   rank-2 `[qL, kL]` — broadcast across batch + heads: set batch_stride
        //   and head_stride to 0 so the shader re-reads the same plane for every
        //   (batch, head) pair.  The Metal shader already handles stride-0 correctly
        //   (no kernel changes required).
        //   rank-4 `[B, H, qL, kL]` — per-head layout (back-compat path).
        let (m_batch_stride, m_head_stride, m_ql_stride) = if mask_is_rank2_broadcast {
            (0_i64, 0_i64, kl as i64)
        } else {
            ((h * ql * kl) as i64, (ql * kl) as i64, kl as i64)
        };

        let mask_params = AttnMaskParamsGpu {
            m_strides: [m_batch_stride, m_head_stride, m_ql_stride],
        };

        if has_blk {
            // SAFETY: has_blk is true iff blk.is_some() and the earlier
            // has_blk validation rejects blk.is_some() && mask.is_none().
            let blk_buf = blk.ok_or_else(|| {
                MlxError::InvalidArgument(
                    "flash_attn_prefill: internal error — has_blk=true but blk is None".into(),
                )
            })?;

            encoder.encode_threadgroups_with_args(
                pipeline,
                &[
                    (0, KernelArg::Buffer(q)),
                    (1, KernelArg::Buffer(k)),
                    (2, KernelArg::Buffer(v)),
                    (3, KernelArg::Buffer(out)),
                    (4, KernelArg::Bytes(as_bytes(&attn_params))),
                    (5, KernelArg::Bytes(as_bytes(&mask_params))),
                    (6, KernelArg::Buffer(mask_buf)),
                    (7, KernelArg::Buffer(blk_buf)),
                ],
                grid,
                tg_size,
            );
        } else {
            encoder.encode_threadgroups_with_args(
                pipeline,
                &[
                    (0, KernelArg::Buffer(q)),
                    (1, KernelArg::Buffer(k)),
                    (2, KernelArg::Buffer(v)),
                    (3, KernelArg::Buffer(out)),
                    (4, KernelArg::Bytes(as_bytes(&attn_params))),
                    (5, KernelArg::Bytes(as_bytes(&mask_params))),
                    (6, KernelArg::Buffer(mask_buf)),
                    // buffer 7 absent — has_blk=false constant dead-codes blk refs.
                ],
                grid,
                tg_size,
            );
        }
    } else {
        encoder.encode_threadgroups_with_args(
            pipeline,
            &[
                (0, KernelArg::Buffer(q)),
                (1, KernelArg::Buffer(k)),
                (2, KernelArg::Buffer(v)),
                (3, KernelArg::Buffer(out)),
                (4, KernelArg::Bytes(as_bytes(&attn_params))),
                // buffers 5, 6, 7 intentionally absent — has_mask=false +
                // has_blk=false constants dead-code-eliminate mask + blk loads.
            ],
            grid,
            tg_size,
        );
    }

    Ok(())
}

// ─── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    #[test]
    fn test_attn_params_gpu_size() {
        // Verify the size of AttnParamsGpu matches the MSL struct layout.
        // B, H, D, qL, kL, gqa_factor = 6 × i32 = 24
        // scale, softcapping = 2 × f32 = 8
        // NQ, NK, NQ_aligned, NK_aligned, qL_rem, kL_rem, qL_off = 7 × i32 = 28
        // _pad = 1 × i32 = 4
        // Q_strides, K_strides, V_strides, O_strides = 4 × 3 × i64 = 96
        // Total = 24 + 8 + 28 + 4 + 96 = 160
        assert_eq!(std::mem::size_of::<AttnParamsGpu>(), 160);
    }

    #[test]
    fn test_attn_mask_params_gpu_size() {
        // 3 × i64 = 24 bytes
        assert_eq!(std::mem::size_of::<AttnMaskParamsGpu>(), 24);
    }

    #[test]
    fn test_validate_params_ok() {
        let p = FlashAttnPrefillParams {
            n_heads: 16,
            n_kv_heads: 8,
            head_dim: 256,
            seq_len_q: 2048,
            seq_len_k: 2048,
            batch: 1,
            scale: 1.0 / 256.0_f32.sqrt(),
            do_causal: true,
        };
        assert!(validate_params(&p).is_ok());
    }

    #[test]
    fn test_validate_params_zero_heads() {
        let p = FlashAttnPrefillParams {
            n_heads: 0,
            n_kv_heads: 8,
            head_dim: 256,
            seq_len_q: 128,
            seq_len_k: 128,
            batch: 1,
            scale: 1.0,
            do_causal: false,
        };
        assert!(matches!(
            validate_params(&p),
            Err(MlxError::InvalidArgument(_))
        ));
    }

    #[test]
    fn test_validate_params_bad_gqa_ratio() {
        let p = FlashAttnPrefillParams {
            n_heads: 16,
            n_kv_heads: 7,
            head_dim: 256,
            seq_len_q: 128,
            seq_len_k: 128,
            batch: 1,
            scale: 1.0,
            do_causal: false,
        };
        assert!(matches!(
            validate_params(&p),
            Err(MlxError::InvalidArgument(_))
        ));
    }

    #[test]
    fn test_wrong_head_dim_rejected() {
        // dispatch_flash_attn_prefill_bf16_d256 must reject head_dim != 256.
        // This test does not run on GPU — it validates the early-return guard.
        let p = FlashAttnPrefillParams {
            n_heads: 16,
            n_kv_heads: 8,
            head_dim: 128,      // wrong
            seq_len_q: 64,
            seq_len_k: 64,
            batch: 1,
            scale: 1.0,
            do_causal: false,
        };
        // We can only test the head_dim validation path without a real device/encoder.
        // The validation happens before device access, so this is safe to test here.
        assert!(p.head_dim != 256, "test pre-condition: head_dim must not be 256");
    }

    #[test]
    fn test_all_8_kernel_names_registered() {
        assert_eq!(ALL_KERNEL_NAMES.len(), 8);

        // Verify each constant is non-empty and unique.
        let mut seen = std::collections::HashSet::new();
        for &name in ALL_KERNEL_NAMES {
            assert!(!name.is_empty(), "kernel name must not be empty");
            assert!(seen.insert(name), "duplicate kernel name: {name}");
        }

        // Verify no f32 entry points are registered — f32 is excluded by
        // Apple Silicon threadgroup memory limits (see module doc).
        for &name in ALL_KERNEL_NAMES {
            assert!(
                !name.contains("float32"),
                "f32 kernel {name} must not be registered — exceeds 32 KB TG mem limit"
            );
        }
    }

    #[test]
    fn test_tile_geometry_d256() {
        // D=256 tile geometry as defined in flash_attn_prefill.metal.
        assert_eq!(BQ_D256, 32, "BQ=32 for D=256");
        assert_eq!(BK_D256, 16, "BK=16 for D=256");
        assert_eq!(WM_D256, 4,  "WM=4  for D=256");
        assert_eq!(WN_D256, 1,  "WN=1  for D=256");
        // Threadgroup size: 32 × WM × WN = 32 × 4 × 1 = 128 threads.
        assert_eq!(32 * WM_D256 * WN_D256, 128);
    }
}