mlx-native 0.9.0

Pure-Rust Metal GPU compute library for MLX-compatible inference on Apple Silicon
//! TQ kernel replay binary for ADR-007 C-1-unlock harness fix.
//!
//! Runs the flash_attn_vec_tq / flash_attn_vec kernels against captured inputs
//! and compares to a CPU reference SDPA computed from the same TQ-packed data.
//!
//! Fixes applied in C-1-unlock:
//!   D1 - encoder.memory_barrier() inserted at 3 sites (mirroring forward_mlx.rs:1429-1431,
//!        1441-1446, 1477-1480).
//!   D2 - Variation C replaced with true dense control: flash_attn_vec on dequantized F32 K/V.
//!   D3 - Canary in-range: --canary in-range mutates k_norms[head=0, pos=10] *= 2.0.
//!   D4 - Raw sdpa_out .bin written per variation alongside metrics JSON.
//!   D5 - kv_seq_len=23 accepted from manifest; CPU reference loops 0..kvl.
//!
//! Usage:
//!   cargo run --release --example tq_kernel_replay -- \
//!     --manifest /tmp/cfa-20260422-C1-unlock/manifest.json \
//!     --variation A \
//!     [--canary in-range] \
//!     --out /tmp/cfa-20260422-C1-unlock/out/claude/A
//!
//! Variations:
//!   A  Full production path: forward-FWHT(Q) + TQ kernel + inverse-FWHT(output)
//!   B  FWHT-disabled: skip both FWHT dispatches; pass Q as-is to TQ kernel
//!   C  Dense control: flash_attn_vec (F32 K/V, natural basis) — no FWHT on either side
//!
//! Canary (--canary in-range): k_norms[head * kv_capacity + pos] *= 2.0 with head=0,
//!   pos=10, applied before H2D. In-range mutation (pos=10 < kv_seq_len=23); expected
//!   nrmse_delta vs A baseline > 0.01.
//!
//! Exit codes:
//!   0  Success
//!   1  Argument / IO error
//!   2  GPU dispatch error or NaN/Inf in output

#![allow(clippy::expect_used, clippy::unwrap_used, clippy::panic, clippy::erasing_op)]
#![cfg(target_vendor = "apple")]

use mlx_native::ops::flash_attn_vec::{self, FlashAttnVecParams};
use mlx_native::ops::flash_attn_vec_tq::{self, FlashAttnVecTqParams};
use mlx_native::ops::fwht_standalone;
use mlx_native::turboquant::{fwht_inplace, CODEBOOK_4BIT};
use mlx_native::{DType, KernelRegistry, MlxDevice};
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::PathBuf;
use std::time::SystemTime;

// ---------------------------------------------------------------------------
// iter-5 pre-registered NRMSE band (catalog #11: never widen after measurement)
//
// These constants are COMMITTED here, BEFORE any measurement is run.
// If any sweep point returns an nrmse outside [LOWER, UPPER], the binary aborts
// with exit code 2 and reports BAND_PRE_FALSIFIED — NO band edits permitted.
// Violating this rule is catalog #11 (post-measurement widening, iter-4 HIGH-1 defect).
// ---------------------------------------------------------------------------

/// Lower bound of pre-registered iter-5 NRMSE band.
/// Catalog #11: pre-registered, no post-measurement widening.
const NRMSE_BAND_LOWER: f32 = 0.05;

/// Upper bound of pre-registered iter-5 NRMSE band.
/// Catalog #11: pre-registered, no post-measurement widening.
const NRMSE_BAND_UPPER: f32 = 0.35;
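
// A minimal sketch of the gate described above (hypothetical helper; the real
// enforcement lives in the production-faithful sweep driver, not shown in this
// excerpt): report BAND_PRE_FALSIFIED and exit 2 when a sweep point leaves the band.
#[allow(dead_code)]
fn enforce_nrmse_band(sweep_point: &str, nrmse: f32) {
    if !(NRMSE_BAND_LOWER..=NRMSE_BAND_UPPER).contains(&nrmse) {
        eprintln!(
            "BAND_PRE_FALSIFIED: {} nrmse={} outside pre-registered [{}, {}]",
            sweep_point, nrmse, NRMSE_BAND_LOWER, NRMSE_BAND_UPPER
        );
        std::process::exit(2);
    }
}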

// ---------------------------------------------------------------------------
// CLI parsing (no clap dep — simple std::env)
// ---------------------------------------------------------------------------

/// Oracle mode: what reference to compare the TQ GPU output against.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OracleMode {
    /// Dequant oracle only (C-1-unlock behavior, default for backward compat).
    Dequant,
    /// Independent-floor oracle only: dense flash_attn_vec on pre-quant F32 K/V.
    IndependentFloor,
    /// Both oracles — C-2 happy path; emits two nrmse columns.
    Both,
}

/// Replay mode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReplayMode {
    /// Single-step: load a manifest and replay it (backward-compat, default).
    Singlestep,
    /// Multi-step: synthesize K/V from seed and replay at 4 canonical positions.
    Multistep,
    /// Production-faithful v2: iter-5 controlled sweep with pre-registered band,
    /// subprocess regression gates, and single-seed deterministic draws.
    ProductionFaithful,
}

struct Args {
    manifest: Option<PathBuf>,
    variation: Variation,
    canary: CanaryMode,
    out: PathBuf,
    oracle: OracleMode,
    mode: ReplayMode,
    seed: u64,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CanaryMode {
    None,
    InRange,
    OutOfRange, // legacy: k_norms at positions >= kv_seq_len set to 1e9
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Variation {
    A,
    B,
    C,
}

impl std::fmt::Display for Variation {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Variation::A => write!(f, "A"),
            Variation::B => write!(f, "B"),
            Variation::C => write!(f, "C (dense control)"),
        }
    }
}

fn parse_args() -> Result<Args, String> {
    let argv: Vec<String> = std::env::args().collect();
    let mut manifest: Option<PathBuf> = None;
    let mut variation: Option<Variation> = None;
    let mut canary = CanaryMode::None;
    let mut out: Option<PathBuf> = None;
    let mut oracle = OracleMode::Dequant;
    let mut mode = ReplayMode::Singlestep;
    let mut seed: u64 = 0x00C2_5EED;

    let mut i = 1;
    while i < argv.len() {
        match argv[i].as_str() {
            "--manifest" => {
                i += 1;
                manifest = Some(PathBuf::from(argv.get(i).ok_or("--manifest needs a value")?));
            }
            "--variation" => {
                i += 1;
                variation = Some(match argv.get(i).map(|s| s.as_str()) {
                    Some("A") => Variation::A,
                    Some("B") => Variation::B,
                    Some("C") => Variation::C,
                    other => return Err(format!("unknown variation {:?}; expected A, B, or C", other)),
                });
            }
            "--canary" => {
                // Accept "--canary in-range", "--canary out-of-range", or bare "--canary" (= in-range)
                if let Some(next) = argv.get(i + 1) {
                    match next.as_str() {
                        "in-range" => {
                            canary = CanaryMode::InRange;
                            i += 1;
                        }
                        "out-of-range" => {
                            canary = CanaryMode::OutOfRange;
                            i += 1;
                        }
                        s if !s.starts_with('-') => {
                            // Legacy: numeric value like "1e9" → treat as out-of-range
                            canary = CanaryMode::OutOfRange;
                            i += 1;
                        }
                        _ => {
                            // Next arg is a flag — bare --canary defaults to in-range
                            canary = CanaryMode::InRange;
                        }
                    }
                } else {
                    canary = CanaryMode::InRange;
                }
            }
            "--out" => {
                i += 1;
                out = Some(PathBuf::from(argv.get(i).ok_or("--out needs a value")?));
            }
            "--oracle" => {
                i += 1;
                oracle = match argv.get(i).map(|s| s.as_str()) {
                    Some("dequant") => OracleMode::Dequant,
                    Some("independent-floor") => OracleMode::IndependentFloor,
                    Some("both") => OracleMode::Both,
                    other => return Err(format!("unknown --oracle {:?}; expected dequant, independent-floor, or both", other)),
                };
            }
            "--singlestep" => {
                mode = ReplayMode::Singlestep;
            }
            "--multistep" => {
                mode = ReplayMode::Multistep;
            }
            "--production-faithful" => {
                mode = ReplayMode::ProductionFaithful;
            }
            "--seed" => {
                i += 1;
                let s = argv.get(i).ok_or("--seed needs a value")?;
                seed = if let Some(hex) = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) {
                    u64::from_str_radix(hex, 16)
                        .map_err(|e| format!("--seed hex parse error: {}", e))?
                } else {
                    s.parse::<u64>()
                        .map_err(|e| format!("--seed decimal parse error: {}", e))?
                };
            }
            other => return Err(format!("unknown argument: {}", other)),
        }
        i += 1;
    }

    // Validate: singlestep requires --manifest; multistep and production-faithful do not.
    if mode == ReplayMode::Singlestep && manifest.is_none() {
        return Err("--singlestep (or default) mode requires --manifest".into());
    }

    Ok(Args {
        manifest,
        variation: variation.unwrap_or(Variation::A),
        canary,
        out: out.ok_or("--out is required")?,
        oracle,
        mode,
        seed,
    })
}

// ---------------------------------------------------------------------------
// Manifest schema — supports the instrumenter's C-1-unlock format.
//
// The instrumenter manifest uses `dump_paths` (not `inputs`) and has no
// `compact_sources` section. Compact K/V for CPU reference is derived
// in-memory by slicing rows 0..kvl from the padded buffers.
// ---------------------------------------------------------------------------

#[derive(Debug, Deserialize)]
struct ManifestParams {
    num_heads: u32,
    num_kv_heads: u32,
    head_dim: u32,
    kv_seq_len: u32,
    kv_capacity: u32,
    scale: f32,
    mask_type: u32,
    sliding_window: u32,
    softcap: f32,
    ring_start: u32,
}

/// Paths section — accepts both the new `dump_paths` key (instrumenter format)
/// and the old `inputs` key (C-1 format) via `#[serde(alias)]`.
#[derive(Debug, Deserialize)]
struct ManifestPaths {
    #[serde(alias = "k_packed_post_quant", alias = "k_packed_padded")]
    k_packed_padded: String,
    #[serde(alias = "v_packed_post_quant", alias = "v_packed_padded")]
    v_packed_padded: String,
    #[serde(alias = "k_norms_post_quant", alias = "k_norms_padded")]
    k_norms_padded: String,
    #[serde(alias = "v_norms_post_quant", alias = "v_norms_padded")]
    v_norms_padded: String,
    q_natural: String,
    // Optional legacy canary files (old format only)
    #[serde(default)]
    k_norms_canary: String,
    #[serde(default)]
    v_norms_canary: String,
    /// Optional pre-quant F32 K dump (from HF2Q_DUMP_PRE_QUANT=1).
    /// When both k_pre_quant and v_pre_quant are present, the independent-floor oracle is available.
    /// Layout: [nkv, hd] F32 little-endian (current token only; NOT the full ring buffer).
    #[serde(default)]
    k_pre_quant: Option<String>,
    #[serde(default)]
    v_pre_quant: Option<String>,
}

/// Top-level manifest. Accepts both:
///   - New format: `dump_paths` key (instrumenter C-1-unlock)
///   - Old format: `inputs` key (C-1 harness)
#[derive(Debug, Deserialize)]
struct Manifest {
    params: ManifestParams,
    /// New instrumenter format uses `dump_paths`; old harness format uses `inputs`.
    #[serde(alias = "inputs")]
    dump_paths: ManifestPaths,
    /// Old format only — if absent, compact sources are derived in-memory.
    #[serde(default)]
    compact_sources: Option<LegacyCompactSources>,
}

#[allow(dead_code)]
#[derive(Debug, Deserialize, Default)]
struct LegacyCompactSources {
    k_packed_compact: String,
    v_packed_compact: String,
    k_norms_compact: String,
    v_norms_compact: String,
}

// ---------------------------------------------------------------------------
// Output schema
// ---------------------------------------------------------------------------

#[derive(Debug, Serialize)]
struct PerHeadDiff {
    head: usize,
    max_abs_diff: f32,
}

#[derive(Debug, Serialize)]
struct ReplayMetrics {
    variation: String,
    canary: String,
    ran_at: String,
    /// Primary dequant oracle nrmse (nrmse(gpu_out, cpu_sdpa_from_dequant)).
    /// Alias for backward compatibility: was `nrmse` in C-1-unlock output.
    #[serde(rename = "dequant_oracle_nrmse")]
    nrmse: f64,
    max_abs_diff: f32,
    per_head_max_abs_diff: Vec<PerHeadDiff>,
    any_nan_inf_in_gpu_output: bool,
    exit_status: String,
    bin_path: String,
    /// Independent-floor oracle nrmse: nrmse(gpu_out, flash_attn_vec on pre-quant F32 K/V).
    /// None when pre-quant paths are absent or --oracle dequant.
    independent_floor_nrmse: Option<f64>,
}

// ---------------------------------------------------------------------------
// CPU helpers (mirror test_flash_attn_vec_tq.rs)
// ---------------------------------------------------------------------------

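/// Decision boundaries of the 4-bit quantizer: midpoints between adjacent
/// CODEBOOK_4BIT centroids (assumed sorted ascending).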
fn boundaries_4bit() -> [f32; 15] {
    let mut b = [0.0f32; 15];
    for i in 0..15 {
        b[i] = (CODEBOOK_4BIT[i] + CODEBOOK_4BIT[i + 1]) / 2.0;
    }
    b
}

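/// Nearest 4-bit codebook centroid for `value`: counting the midpoint
/// boundaries strictly below `value` yields the index of the closest
/// CODEBOOK_4BIT entry.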
fn nearest_centroid_4bit(value: f32) -> u8 {
    let boundaries = boundaries_4bit();
    let mut idx: u8 = 0;
    for &b in &boundaries {
        if value > b {
            idx += 1;
        }
    }
    idx
}

/// Quantize a head vector into nibble-packed format (mirrors test file).
fn nibble_quantize(x: &[f32], head_dim: usize) -> (Vec<u8>, f32) {
    let mut rotated = x.to_vec();
    fwht_inplace(&mut rotated).unwrap();

    let norm: f32 = rotated.iter().map(|v| v * v).sum::<f32>().sqrt();
    if norm < 1e-30 {
        return (vec![0u8; head_dim / 2], 0.0);
    }

    let inv_norm = 1.0 / norm;
    let scale = (head_dim as f32).sqrt();

    let mut packed = vec![0u8; head_dim / 2];
    for c in 0..head_dim {
        let scaled = rotated[c] * inv_norm * scale;
        let idx = nearest_centroid_4bit(scaled);
        let byte_idx = c / 2;
        if c % 2 == 0 {
            packed[byte_idx] = idx & 0xF;
        } else {
            packed[byte_idx] |= (idx & 0xF) << 4;
        }
    }

    (packed, norm)
}

/// Dequantize from nibble-packed format (mirrors test file).
fn nibble_dequantize(packed: &[u8], norm: f32, head_dim: usize) -> Vec<f32> {
    let inv_scale = 1.0 / (head_dim as f32).sqrt();
    let mut rotated = Vec::with_capacity(head_dim);

    for c in 0..head_dim {
        let byte_idx = c / 2;
        let idx = if c % 2 == 0 {
            (packed[byte_idx] & 0xF) as usize
        } else {
            ((packed[byte_idx] >> 4) & 0xF) as usize
        };
        rotated.push(CODEBOOK_4BIT[idx] * inv_scale * norm);
    }

    fwht_inplace(&mut rotated).unwrap();
    rotated
}
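
// Illustrative quantize→dequantize roundtrip (a sketch, not used by the replay
// path; the helper name is ours). head_dim must be a power of two for the FWHT.
#[allow(dead_code)]
fn nibble_roundtrip_sketch(head_dim: usize) {
    let x: Vec<f32> = (0..head_dim).map(|i| ((i as f32) * 0.37).sin()).collect();
    let (packed, norm) = nibble_quantize(&x, head_dim);
    let x_hat = nibble_dequantize(&packed, norm, head_dim);
    let max_err = x
        .iter()
        .zip(&x_hat)
        .map(|(a, b)| (a - b).abs())
        .fold(0.0f32, f32::max);
    // 4-bit quantization is lossy, but the error should sit far below ||x||.
    assert!(max_err < norm, "roundtrip error {} vs norm {}", max_err, norm);
}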

/// CPU SDPA reference (mirrors test_flash_attn_vec_tq.rs cpu_sdpa).
///
/// Q: flat [num_heads * head_dim] F32 (natural basis)
/// k_dequant: [num_kv_heads * kvl_logical] entries of [head_dim] each,
///            indexed in CHRONOLOGICAL order (pos 0 = oldest, pos kvl_logical-1 = newest).
/// v_dequant: same layout as k_dequant.
/// kvl_logical: number of valid chronological positions (= min(abs_pos+1, kv_capacity)).
/// kv_capacity: physical ring buffer capacity (used only for ring_start modulo).
/// mask_type: 0=none/dense (attend all), 1=causal (all <= current step), 2=sliding_window.
/// sliding_window: only last sliding_window chronological positions attend (mask_type=2 only).
/// ring_start: chronological position 0 maps to physical row ring_start. For the dequant
///             oracle path, k_dequant is already compact (chronological order), so ring_start
///             does NOT remap into k_dequant — it is passed here for interface symmetry and
///             used only by the independent-floor path where physical layout matters.
///             In the dequant oracle, iterate p in 0..kvl_logical directly.
/// softcap: logit soft-capping. When > 0: score = softcap * tanh(score * scale / softcap).
///          When 0: score *= scale (standard).
///
/// Returns: flat [num_heads * head_dim] F32
fn cpu_sdpa(
    q: &[f32],
    k_dequant: &[Vec<f32>],
    v_dequant: &[Vec<f32>],
    num_heads: usize,
    num_kv_heads: usize,
    head_dim: usize,
    kvl_logical: usize,
    kv_capacity: usize,
    scale: f32,
    mask_type: u32,
    sliding_window: u32,
    _ring_start: u32,  // unused in dequant oracle path (k_dequant is already chronological)
    softcap: f32,
) -> Vec<f32> {
    let mut output = vec![0.0f32; num_heads * head_dim];
    let heads_per_kv = num_heads / num_kv_heads;

    for h in 0..num_heads {
        let kv_h = h / heads_per_kv;
        let q_offset = h * head_dim;

        let mut scores: Vec<f32> = Vec::with_capacity(kvl_logical);
        // Masking: which chronological positions attend.
        // For sliding (mask_type=2): only the last sliding_window positions attend.
        // For causal (mask_type=1) and none (mask_type=0): all positions attend.
        let first_valid: usize = if mask_type == 2 {
            let sw = sliding_window as usize;
            if kvl_logical > sw { kvl_logical - sw } else { 0 }
        } else {
            0
        };

        for p in 0..kvl_logical {
            if p < first_valid {
                // Masked out — push NEG_INFINITY so softmax weight → 0.
                scores.push(f32::NEG_INFINITY);
                continue;
            }
            let mut dot = 0.0f32;
            for c in 0..head_dim {
                dot += q[q_offset + c] * k_dequant[kv_h * kvl_logical + p][c];
            }
            let score = if softcap > 0.0 {
                softcap * (dot * scale / softcap).tanh()
            } else {
                dot * scale
            };
            scores.push(score);
        }

        // Numerically stable (max-subtracted) softmax; masked -inf entries get weight 0.
        let max_score = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
        let mut exp_scores: Vec<f32> = scores
            .iter()
            .map(|&s| if s == f32::NEG_INFINITY { 0.0f32 } else { (s - max_score).exp() })
            .collect();
        let sum: f32 = exp_scores.iter().sum();
        if sum > 0.0 {
            for e in &mut exp_scores {
                *e /= sum;
            }
        }

        let o_offset = h * head_dim;
        for p in 0..kvl_logical {
            let w = exp_scores[p];
            if w == 0.0 {
                continue;
            }
            for c in 0..head_dim {
                output[o_offset + c] += w * v_dequant[kv_h * kvl_logical + p][c];
            }
        }
    }

    // kv_capacity is retained as a parameter for interface symmetry with the
    // independent-floor oracle path; suppress the unused-variable warning.
    let _ = kv_capacity;

    output
}

// ---------------------------------------------------------------------------
// Load binary files as typed slices
// ---------------------------------------------------------------------------

fn load_f32(path: &str) -> Vec<f32> {
    let bytes = fs::read(path).unwrap_or_else(|e| panic!("failed to read {}: {}", path, e));
    assert!(bytes.len() % 4 == 0, "file {} is not 4-byte aligned", path);
    bytes
        .chunks_exact(4)
        .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
        .collect()
}

fn load_u8(path: &str) -> Vec<u8> {
    fs::read(path).unwrap_or_else(|e| panic!("failed to read {}: {}", path, e))
}

// ---------------------------------------------------------------------------
// Compute diff metrics
// ---------------------------------------------------------------------------

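/// Diff metrics between the CPU reference and GPU output (both flat
/// [num_heads * head_dim] F32):
///   nrmse = sqrt( sum_i (cpu_ref[i] - gpu_out[i])^2 / sum_i cpu_ref[i]^2 )
/// plus the global max |diff| and a per-head max-|diff| breakdown.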
fn compute_metrics(
    cpu_ref: &[f32],
    gpu_out: &[f32],
    num_heads: usize,
    head_dim: usize,
) -> (f64, f32, Vec<PerHeadDiff>) {
    let mut sum_sq_diff = 0.0f64;
    let mut sum_sq_ref = 0.0f64;
    let mut global_max = 0.0f32;
    let mut per_head = Vec::with_capacity(num_heads);

    for h in 0..num_heads {
        let mut head_max = 0.0f32;
        for c in 0..head_dim {
            let i = h * head_dim + c;
            let diff = (cpu_ref[i] - gpu_out[i]).abs();
            if diff > head_max {
                head_max = diff;
            }
            if diff > global_max {
                global_max = diff;
            }
            sum_sq_diff += (diff as f64) * (diff as f64);
            sum_sq_ref += (cpu_ref[i] as f64) * (cpu_ref[i] as f64);
        }
        per_head.push(PerHeadDiff {
            head: h,
            max_abs_diff: head_max,
        });
    }

    let nrmse = if sum_sq_ref > 0.0 {
        (sum_sq_diff / sum_sq_ref).sqrt()
    } else {
        0.0
    };

    (nrmse, global_max, per_head)
}

// ---------------------------------------------------------------------------
// Derive compact K/V from padded buffers by slicing rows 0..kvl
//
// Padded layout:  [nkv, kv_capacity, hd/2] u8 — stride h*kv_capacity*(hd/2) + pos*(hd/2)
// Compact layout: [nkv, kvl, hd/2] u8         — stride h*kvl*(hd/2) + pos*(hd/2)
// ---------------------------------------------------------------------------

fn compact_from_padded_u8(
    padded: &[u8],
    nkv: usize,
    kv_capacity: usize,
    kvl: usize,
    hd: usize,
) -> Vec<u8> {
    let half_hd = hd / 2;
    let mut compact = vec![0u8; nkv * kvl * half_hd];
    for kv_h in 0..nkv {
        for pos in 0..kvl {
            let src_off = kv_h * kv_capacity * half_hd + pos * half_hd;
            let dst_off = kv_h * kvl * half_hd + pos * half_hd;
            compact[dst_off..dst_off + half_hd]
                .copy_from_slice(&padded[src_off..src_off + half_hd]);
        }
    }
    compact
}

fn compact_from_padded_f32(
    padded: &[f32],
    nkv: usize,
    kv_capacity: usize,
    kvl: usize,
) -> Vec<f32> {
    let mut compact = vec![0.0f32; nkv * kvl];
    for kv_h in 0..nkv {
        for pos in 0..kvl {
            let src_off = kv_h * kv_capacity + pos;
            let dst_off = kv_h * kvl + pos;
            compact[dst_off] = padded[src_off];
        }
    }
    compact
}
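
// Worked example of the slicing above (toy sizes, for orientation only):
// nkv=2, kv_capacity=4, kvl=2, hd=4 → half_hd=2. The packed row at (kv_h=1, pos=1)
// sits at padded offset 1*4*2 + 1*2 = 10 and lands at compact offset
// 1*2*2 + 1*2 = 6; its norm moves from f32 index 1*4 + 1 = 5 to 1*2 + 1 = 3.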

// ---------------------------------------------------------------------------
// Core replay logic
// ---------------------------------------------------------------------------

fn run_variation(
    manifest: &Manifest,
    variation: Variation,
    canary: CanaryMode,
    oracle_mode: OracleMode,
    out_path: &PathBuf,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
) -> ReplayMetrics {
    let p = &manifest.params;
    let paths = &manifest.dump_paths;

    let nh = p.num_heads as usize;
    let nkv = p.num_kv_heads as usize;
    let hd = p.head_dim as usize;
    let kvl = p.kv_seq_len as usize; // 23 in C-1-unlock
    let kv_capacity = p.kv_capacity as usize;

    // --- Load padded inputs ---
    let q_natural: Vec<f32> = load_f32(&paths.q_natural);
    assert_eq!(q_natural.len(), nh * hd, "q_natural size mismatch");

    let k_packed_padded: Vec<u8> = load_u8(&paths.k_packed_padded);
    let v_packed_padded: Vec<u8> = load_u8(&paths.v_packed_padded);
    assert_eq!(k_packed_padded.len(), nkv * kv_capacity * (hd / 2),
        "k_packed_padded size mismatch: expected {} got {}",
        nkv * kv_capacity * (hd / 2), k_packed_padded.len());
    assert_eq!(v_packed_padded.len(), nkv * kv_capacity * (hd / 2));

    let k_norms_padded_base: Vec<f32> = load_f32(&paths.k_norms_padded);
    let v_norms_padded_base: Vec<f32> = load_f32(&paths.v_norms_padded);
    assert_eq!(k_norms_padded_base.len(), nkv * kv_capacity);
    assert_eq!(v_norms_padded_base.len(), nkv * kv_capacity);

    // --- Derive compact K/V (rows 0..kvl) for CPU reference ---
    // New instrumenter format has no compact_sources — derive in-memory by slicing.
    // Legacy format may have compact_sources on disk.
    let (k_packed_compact, v_packed_compact, k_norms_compact, v_norms_compact) = match manifest
        .compact_sources
        .as_ref()
        .filter(|cs| !cs.k_packed_compact.is_empty())
    {
        // Legacy: compact buffers recorded on disk.
        Some(cs) => (
            load_u8(&cs.k_packed_compact),
            load_u8(&cs.v_packed_compact),
            load_f32(&cs.k_norms_compact),
            load_f32(&cs.v_norms_compact),
        ),
        // Instrumenter format (no compact_sources) or empty legacy struct:
        // derive in-memory by slicing rows 0..kvl from the padded buffers.
        None => (
            compact_from_padded_u8(&k_packed_padded, nkv, kv_capacity, kvl, hd),
            compact_from_padded_u8(&v_packed_padded, nkv, kv_capacity, kvl, hd),
            compact_from_padded_f32(&k_norms_padded_base, nkv, kv_capacity, kvl),
            compact_from_padded_f32(&v_norms_padded_base, nkv, kv_capacity, kvl),
        ),
    };

    assert_eq!(k_packed_compact.len(), nkv * kvl * (hd / 2));
    assert_eq!(v_packed_compact.len(), nkv * kvl * (hd / 2));
    assert_eq!(k_norms_compact.len(), nkv * kvl);
    assert_eq!(v_norms_compact.len(), nkv * kvl);

    // --- P2 canary symmetry fix: pre-mutate compact norms BEFORE building k_dequant ---
    // When canary=InRange, both the GPU path AND the dequant CPU reference must see the
    // mutation at (head=0, pos=10). Applying it here to k_norms_compact means k_dequant
    // is rebuilt from the mutated norms, producing a symmetric canary: both oracle and
    // kernel see the 2x norm at head=0/pos=10, so nrmse returns to the baseline ~5.1e-5.
    //
    // Debug flag: set HF2Q_REPLAY_CANARY_ASYMMETRIC=1 to recover the ASYMMETRIC
    // (C-1-unlock) behavior and reproduce ~0.111 nrmse. It skips the compact-norm
    // mutation, so the CPU oracle sees the unmutated norm while the GPU sees 2x.
    // Expected: symmetric run → nrmse ≤ 1e-4; asymmetric → ~0.111.
    let canary_asymmetric_mode =
        std::env::var("HF2Q_REPLAY_CANARY_ASYMMETRIC").is_ok_and(|v| v == "1");
    let mut k_norms_compact = k_norms_compact; // make mutable
    if canary == CanaryMode::InRange && !canary_asymmetric_mode {
        // Symmetric fix: also mutate compact norm so CPU reference is consistent.
        let compact_canary_idx = 0 * kvl + 10;
        if compact_canary_idx < k_norms_compact.len() {
            let old_val = k_norms_compact[compact_canary_idx];
            k_norms_compact[compact_canary_idx] *= 2.0;
            eprintln!(
                "[canary symmetric] k_norms_compact[head=0, pos=10] *= 2.0: {} → {}",
                old_val, k_norms_compact[compact_canary_idx]
            );
        }
    } else if canary == CanaryMode::InRange {
        eprintln!("[canary ASYMMETRIC] HF2Q_REPLAY_CANARY_ASYMMETRIC=1: skipping compact norm mutation (reproduces C-1-unlock 0.111 nrmse)");
    }

    // --- Compute CPU reference: dequantize TQ-packed (kvl rows) → natural-basis K/V ---
    // CPU reference is the same for all variations (A, B, C): natural-basis SDPA from TQ dequant.
    // NOTE: uses k_norms_compact AFTER the canary mutation above (symmetric fix).
    let mut k_dequant: Vec<Vec<f32>> = Vec::with_capacity(nkv * kvl);
    let mut v_dequant: Vec<Vec<f32>> = Vec::with_capacity(nkv * kvl);

    for kv_h in 0..nkv {
        for pos in 0..kvl {
            let packed_offset = (kv_h * kvl + pos) * (hd / 2);
            let norm_offset = kv_h * kvl + pos;

            let k_vec = nibble_dequantize(
                &k_packed_compact[packed_offset..packed_offset + hd / 2],
                k_norms_compact[norm_offset],
                hd,
            );
            k_dequant.push(k_vec);

            let v_vec = nibble_dequantize(
                &v_packed_compact[packed_offset..packed_offset + hd / 2],
                v_norms_compact[norm_offset],
                hd,
            );
            v_dequant.push(v_vec);
        }
    }

    // CPU SDPA in natural basis (same reference for all variations A/B/C)
    let cpu_ref = cpu_sdpa(
        &q_natural,
        &k_dequant,
        &v_dequant,
        nh,
        nkv,
        hd,
        kvl,
        kv_capacity,
        p.scale,
        p.mask_type,
        p.sliding_window,
        p.ring_start,
        p.softcap,
    );

    // --- Prepare norms with optional canary mutation (GPU path) ---
    // Start from the padded baseline norms
    let mut k_norms_gpu: Vec<f32> = k_norms_padded_base.clone();
    let mut v_norms_gpu: Vec<f32> = v_norms_padded_base.clone();

    match canary {
        CanaryMode::None => {
            // No mutation — use baseline norms as-is
        }
        CanaryMode::InRange => {
            // D3: in-range canary — mutate k_norms at (head=0, pos=10) in the GPU buffer.
            // pos=10 is within kv_seq_len=23, so the kernel provably reads this position.
            // Mutation: scale norm by 2x → dequantized K[h=0, pos=10, :] magnitudes ~2x.
            // Mirror canary_spec: k_norms_padded[0 * kv_capacity + 10] *= 2.0
            let canary_idx = 0 * kv_capacity + 10;
            k_norms_gpu[canary_idx] *= 2.0;
            eprintln!(
                "[canary in-range GPU] k_norms[head=0, pos=10] *= 2.0 → new value = {}",
                k_norms_gpu[canary_idx]
            );
        }
        CanaryMode::OutOfRange => {
            // Legacy out-of-range canary: positions >= kvl set to 1e9.
            // If manifest has old canary files, load them; otherwise construct in-memory.
            if !paths.k_norms_canary.is_empty() && !paths.v_norms_canary.is_empty() {
                k_norms_gpu = load_f32(&paths.k_norms_canary);
                v_norms_gpu = load_f32(&paths.v_norms_canary);
                assert_eq!(k_norms_gpu.len(), nkv * kv_capacity);
                assert_eq!(v_norms_gpu.len(), nkv * kv_capacity);
            } else {
                for kv_h in 0..nkv {
                    for pos in kvl..kv_capacity {
                        k_norms_gpu[kv_h * kv_capacity + pos] = 1e9;
                        v_norms_gpu[kv_h * kv_capacity + pos] = 1e9;
                    }
                }
            }
        }
    }

    // --- GPU buffer allocation ---
    // Q: [nh, 1, hd] F32 — production shape
    let mut q_buf = device
        .alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd])
        .expect("alloc Q");
    q_buf.as_mut_slice::<f32>().expect("write Q")[..nh * hd]
        .copy_from_slice(&q_natural);

    // K/V packed: [nkv, kv_capacity, hd/2] u8 (used by A/B)
    let k_packed_bytes = nkv * kv_capacity * (hd / 2);
    let v_packed_bytes = nkv * kv_capacity * (hd / 2);

    let mut k_packed_buf = device
        .alloc_buffer(k_packed_bytes, DType::U8, vec![nkv, kv_capacity, hd / 2])
        .expect("alloc K packed");
    k_packed_buf.as_mut_slice::<u8>().expect("write K packed")
        .copy_from_slice(&k_packed_padded);

    let mut v_packed_buf = device
        .alloc_buffer(v_packed_bytes, DType::U8, vec![nkv, kv_capacity, hd / 2])
        .expect("alloc V packed");
    v_packed_buf.as_mut_slice::<u8>().expect("write V packed")
        .copy_from_slice(&v_packed_padded);

    // Norms: [nkv, kv_capacity] f32 (includes canary mutation if active)
    let norms_bytes = nkv * kv_capacity * 4;

    let mut k_norms_buf = device
        .alloc_buffer(norms_bytes, DType::F32, vec![nkv, kv_capacity])
        .expect("alloc K norms");
    k_norms_buf.as_mut_slice::<f32>().expect("write K norms")
        .copy_from_slice(&k_norms_gpu);

    let mut v_norms_buf = device
        .alloc_buffer(norms_bytes, DType::F32, vec![nkv, kv_capacity])
        .expect("alloc V norms");
    v_norms_buf.as_mut_slice::<f32>().expect("write V norms")
        .copy_from_slice(&v_norms_gpu);

    // Output buffer: [nh, 1, hd] F32
    let output_buf = device
        .alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd])
        .expect("alloc output");

    // Tmp buffer for TQ SDPA kernel
    let tmp_bytes_tq = flash_attn_vec_tq::tmp_buffer_bytes(p.num_heads, p.head_dim);
    let tmp_buf = device
        .alloc_buffer(tmp_bytes_tq, DType::F32, vec![tmp_bytes_tq / 4])
        .expect("alloc tmp");

    // --- TQ SDPA params from manifest ---
    let tq_params = FlashAttnVecTqParams {
        num_heads: p.num_heads,
        num_kv_heads: p.num_kv_heads,
        head_dim: p.head_dim,
        kv_seq_len: p.kv_seq_len,
        kv_capacity: p.kv_capacity,
        scale: p.scale,
        mask_type: p.mask_type,
        sliding_window: p.sliding_window,
        softcap: p.softcap,
        ring_start: p.ring_start,
        scale_factor_d512: 1.0,
    };

    // --- Dispatch ---
    let mut encoder = device.command_encoder().expect("command_encoder");

    match variation {
        Variation::A => {
            // Mirror forward_mlx.rs:1429-1431 — RAW on q_buf before in-place forward FWHT
            encoder.memory_barrier(); // BARRIER 1 (D1): before forward FWHT on Q

            // Forward FWHT on Q (in-place) — mirrors forward_mlx.rs:1433-1437
            fwht_standalone::dispatch_fwht_f32(
                &mut encoder,
                registry,
                device.metal_device(),
                &q_buf,
                p.num_heads,
                p.head_dim,
            )
            .expect("FWHT forward-Q dispatch");

            // Mirror forward_mlx.rs:1441-1446 — publish Q (post-FWHT) + packed K/V + norms
            encoder.memory_barrier(); // BARRIER 2 (D1): before TQ SDPA

            // TQ SDPA kernel — mirrors forward_mlx.rs:1464-1474
            flash_attn_vec_tq::flash_attn_vec_tq(
                &mut encoder,
                registry,
                device,
                &q_buf,
                &k_packed_buf,
                &k_norms_buf,
                &v_packed_buf,
                &v_norms_buf,
                &output_buf,
                &tmp_buf,
                &tq_params,
            )
            .expect("flash_attn_vec_tq dispatch");

            // Mirror forward_mlx.rs:1477-1480 — RAW on sdpa_out before in-place inverse FWHT
            encoder.memory_barrier(); // BARRIER 3 (D1): before inverse FWHT on output

            // Inverse FWHT on output (in-place) — mirrors forward_mlx.rs:1481-1485
            fwht_standalone::dispatch_fwht_f32(
                &mut encoder,
                registry,
                device.metal_device(),
                &output_buf,
                p.num_heads,
                p.head_dim,
            )
            .expect("FWHT inverse-output dispatch");
        }

        Variation::B => {
            // FWHT-disabled: pass Q in natural basis; no FWHT on either side.
            // Only barrier_2 equivalent: publish packed K/V + norms before kernel reads.
            //
            // Mirror forward_mlx.rs:1441-1446 — publish packed K/V + norms before TQ SDPA
            encoder.memory_barrier(); // BARRIER 1 of B (D1): before TQ SDPA

            flash_attn_vec_tq::flash_attn_vec_tq(
                &mut encoder,
                registry,
                device,
                &q_buf,
                &k_packed_buf,
                &k_norms_buf,
                &v_packed_buf,
                &v_norms_buf,
                &output_buf,
                &tmp_buf,
                &tq_params,
            )
            .expect("flash_attn_vec_tq dispatch (no FWHT)");
            // No inverse FWHT — output remains in rotated domain.
        }

        Variation::C => {
            // D2: Dense control — flash_attn_vec on dequantized F32 K/V.
            // Natural basis on both sides (no FWHT on Q or output).
            // Allocate F32 dense K/V: [nkv, kv_capacity, hd]; fill 0..kvl from k_dequant/v_dequant.
            let dense_kv_bytes = nkv * kv_capacity * hd * 4;
            let mut k_dense_buf = device
                .alloc_buffer(dense_kv_bytes, DType::F32, vec![nkv, kv_capacity, hd])
                .expect("alloc K dense");
            let mut v_dense_buf = device
                .alloc_buffer(dense_kv_bytes, DType::F32, vec![nkv, kv_capacity, hd])
                .expect("alloc V dense");

            {
                let k_slice = k_dense_buf.as_mut_slice::<f32>().expect("write K dense");
                let v_slice = v_dense_buf.as_mut_slice::<f32>().expect("write V dense");
                // Fill: stride is h*kv_capacity*hd + pos*hd
                for kv_h in 0..nkv {
                    for pos in 0..kvl {
                        let deq_idx = kv_h * kvl + pos;
                        let dst_off = kv_h * kv_capacity * hd + pos * hd;
                        k_slice[dst_off..dst_off + hd]
                            .copy_from_slice(&k_dequant[deq_idx]);
                        v_slice[dst_off..dst_off + hd]
                            .copy_from_slice(&v_dequant[deq_idx]);
                    }
                    // Positions kvl..kv_capacity remain 0.0f32
                }
            }

            // Tmp buffer for dense flash_attn_vec kernel
            let tmp_bytes_dense = flash_attn_vec::tmp_buffer_bytes(p.num_heads, p.head_dim);
            let tmp_dense_buf = device
                .alloc_buffer(tmp_bytes_dense, DType::F32, vec![tmp_bytes_dense / 4])
                .expect("alloc tmp dense");

            // Dense flash_attn_vec params — no ring_start (implicit 0 when kv_seq_len < kv_capacity)
            let dense_params = FlashAttnVecParams {
                num_heads: p.num_heads,
                num_kv_heads: p.num_kv_heads,
                head_dim: p.head_dim,
                kv_seq_len: p.kv_seq_len,
                kv_capacity: p.kv_capacity,
                scale: p.scale,
                mask_type: p.mask_type,        // 2 (sliding window)
                sliding_window: p.sliding_window, // 1024
                softcap: p.softcap,
            };

            // Mirror: ONE barrier before flash_attn_vec dispatch (publish q + k_dense + v_dense)
            encoder.memory_barrier(); // BARRIER 1 of C (D1): before dense flash_attn_vec

            // Dispatch dense SDPA (not flash_attn_vec_tq)
            flash_attn_vec::flash_attn_vec(
                &mut encoder,
                registry,
                device,
                &q_buf,
                &k_dense_buf,
                &v_dense_buf,
                &output_buf,
                &tmp_dense_buf,
                &dense_params,
            )
            .expect("flash_attn_vec dispatch (dense control)");
            // No forward/inverse FWHT on q_buf or output_buf — natural basis throughout.
        }
    }

    encoder.commit_and_wait().expect("commit_and_wait");

    // --- Read GPU output ---
    let gpu_output: Vec<f32> = output_buf
        .as_slice::<f32>()
        .expect("read output")
        .to_vec();
    assert_eq!(gpu_output.len(), nh * hd);

    // --- Check for NaN/Inf ---
    let has_nan_inf = gpu_output.iter().any(|v| !v.is_finite());

    // --- Compute dequant oracle metrics (primary nrmse) ---
    let (nrmse, max_abs_diff, per_head) =
        compute_metrics(&cpu_ref, &gpu_output, nh, hd);

    // --- Independent-floor oracle (P1b) ---
    // When oracle_mode includes IndependentFloor AND manifest has k_pre_quant + v_pre_quant,
    // load the pre-quant F32 K/V, build a [nkv, kv_capacity, hd] dense buffer in physical-row
    // layout (ring-rotated for ring_start != 0), run flash_attn_vec, compare to gpu_output.
    let independent_floor_nrmse: Option<f64> = if matches!(oracle_mode, OracleMode::IndependentFloor | OracleMode::Both) {
        if let (Some(k_pre_path), Some(v_pre_path)) = (&paths.k_pre_quant, &paths.v_pre_quant) {
            eprintln!("[ORACLE] independent-floor: using pre-quant F32 from k={} v={}", k_pre_path, v_pre_path);

            // Load pre-quant F32 K and V. Shape: [nkv, hd] F32 (current token only; one
            // row per KV head).
            //
            // The HF2Q_DUMP_PRE_QUANT=1 dump gives attn_k_normed at [nkv, hd], i.e. the
            // single-token K for the current decode step. A complete independent-floor
            // oracle covering all kvl tokens would need every ring buffer row dumped
            // BEFORE quantization. Since only the current token's pre-quant is available,
            // we use the dequantized K/V (k_dequant/v_dequant) for positions 0..kvl-1 and
            // the pre-quant row only for position kvl-1 (the most recent token).
            //
            // For the multistep mode (no manifest pre-quant), the synthetic pre-quant K/V
            // is used instead.
            let k_pre_raw = load_f32(k_pre_path);
            let v_pre_raw = load_f32(v_pre_path);
            // k_pre_raw shape: [nkv, hd], i.e. nkv*hd elements.
            assert_eq!(k_pre_raw.len(), nkv * hd,
                "k_pre_quant size mismatch: expected {}*{}={} got {}",
                nkv, hd, nkv*hd, k_pre_raw.len());
            assert_eq!(v_pre_raw.len(), nkv * hd,
                "v_pre_quant size mismatch: expected {}*{}={} got {}",
                nkv, hd, nkv*hd, v_pre_raw.len());

            // Canary vs the pre-quant row: pos=10 refers to a ring position, not to the
            // single-token dump. The dump holds only the CURRENT token (pos = kvl-1), so
            // the canary can reach the pre-quant buffer only if kvl-1 == 10 (not the case
            // for kv_seq_len=23). To keep the independent-floor symmetric with the canary,
            // it is applied via the dequant-derived K buffer used below
            // (k_dequant[0 * kvl + 10], i.e. head=0, pos=10). The pre-quant single-token
            // buffer is used only for position kvl-1.

            // Build dense K/V buffer: [nkv, kv_capacity, hd] F32, physical-row layout.
            // Physical row (ring_start + i) % kv_capacity = chronological pos i.
            // For positions 0..kvl-1: use k_dequant (dequantized from TQ packed).
            // For position kvl-1 (newest): use k_pre_raw (raw F32 pre-quant, single row per kv_head).
            let ring_start = p.ring_start as usize;
            let dense_kv_elems = nkv * kv_capacity * hd;
            let mut k_dense_pre: Vec<f32> = vec![0.0f32; dense_kv_elems];
            let mut v_dense_pre: Vec<f32> = vec![0.0f32; dense_kv_elems];

            // Canary note (symmetric with the GPU path): k_dequant was already built from
            // the mutated k_norms_compact (symmetric fix above), so every position filled
            // from k_dequant carries the canary. The pre-quant single row covers only the
            // current token (pos = kvl-1), never pos=10 here, so no extra mutation is
            // needed.

            for kv_h in 0..nkv {
                for logical_i in 0..kvl {
                    // Physical row for chronological position i.
                    let phys_row = (ring_start + logical_i) % kv_capacity;
                    let k_dst_off = kv_h * kv_capacity * hd + phys_row * hd;
                    let v_dst_off = kv_h * kv_capacity * hd + phys_row * hd;

                    if logical_i == kvl - 1 {
                        // Newest token: use pre-quant F32 directly.
                        let k_src_off = kv_h * hd;
                        k_dense_pre[k_dst_off..k_dst_off + hd]
                            .copy_from_slice(&k_pre_raw[k_src_off..k_src_off + hd]);
                        // Apply canary to the pre-quant row if it corresponds to pos=10.
                        // (kvl-1 == 10 only when kvl=11; for kvl=23 this won't fire.)
                        if canary == CanaryMode::InRange && kv_h == 0 && logical_i == 10 && !canary_asymmetric_mode {
                            // Scale the entire K vector at head=0, pos=10 by 2x (pre-quant analogue).
                            for c in 0..hd {
                                k_dense_pre[k_dst_off + c] *= 2.0;
                            }
                        }
                        v_dense_pre[v_dst_off..v_dst_off + hd]
                            .copy_from_slice(&v_pre_raw[kv_h * hd..kv_h * hd + hd]);
                    } else {
                        // Older positions: use dequant (already contains canary mutation at pos=10).
                        let deq_idx = kv_h * kvl + logical_i;
                        k_dense_pre[k_dst_off..k_dst_off + hd].copy_from_slice(&k_dequant[deq_idx]);
                        v_dense_pre[v_dst_off..v_dst_off + hd].copy_from_slice(&v_dequant[deq_idx]);
                    }
                }
            }

            // Dispatch independent-floor: flash_attn_vec on pre-rotated K/V dense buffer.
            let dense_kv_bytes = dense_kv_elems * 4;
            let mut k_floor_buf = device
                .alloc_buffer(dense_kv_bytes, DType::F32, vec![nkv, kv_capacity, hd])
                .expect("alloc K floor");
            let mut v_floor_buf = device
                .alloc_buffer(dense_kv_bytes, DType::F32, vec![nkv, kv_capacity, hd])
                .expect("alloc V floor");
            k_floor_buf.as_mut_slice::<f32>().expect("write K floor")
                .copy_from_slice(&k_dense_pre);
            v_floor_buf.as_mut_slice::<f32>().expect("write V floor")
                .copy_from_slice(&v_dense_pre);

            let floor_output_buf = device
                .alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd])
                .expect("alloc floor output");
            let tmp_bytes_floor = flash_attn_vec::tmp_buffer_bytes(p.num_heads, p.head_dim);
            let tmp_floor_buf = device
                .alloc_buffer(tmp_bytes_floor, DType::F32, vec![tmp_bytes_floor / 4])
                .expect("alloc floor tmp");

            // Q buffer for floor: natural basis (no FWHT).
            let mut q_floor_buf = device
                .alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd])
                .expect("alloc Q floor");
            q_floor_buf.as_mut_slice::<f32>().expect("write Q floor")
                .copy_from_slice(&q_natural);

            let floor_params = FlashAttnVecParams {
                num_heads: p.num_heads,
                num_kv_heads: p.num_kv_heads,
                head_dim: p.head_dim,
                kv_seq_len: p.kv_seq_len,
                kv_capacity: p.kv_capacity,
                scale: p.scale,
                mask_type: p.mask_type,
                sliding_window: p.sliding_window,
                softcap: p.softcap,
            };

            let mut floor_encoder = device.command_encoder().expect("floor encoder");
            floor_encoder.memory_barrier();
            flash_attn_vec::flash_attn_vec(
                &mut floor_encoder,
                registry,
                device,
                &q_floor_buf,
                &k_floor_buf,
                &v_floor_buf,
                &floor_output_buf,
                &tmp_floor_buf,
                &floor_params,
            ).expect("independent-floor flash_attn_vec dispatch");
            floor_encoder.commit_and_wait().expect("floor commit_and_wait");

            let floor_output: Vec<f32> = floor_output_buf
                .as_slice::<f32>()
                .expect("read floor output")
                .to_vec();
            let (floor_nrmse, _floor_max, _floor_per_head) =
                compute_metrics(&floor_output, &gpu_output, nh, hd);
            eprintln!("[ORACLE] independent-floor nrmse = {:.6e}", floor_nrmse);
            Some(floor_nrmse)
        } else {
            eprintln!("[ORACLE] independent-floor requested but k_pre_quant/v_pre_quant absent in manifest — skipping");
            None
        }
    } else {
        None
    };

    // --- D4: Write raw sdpa_out .bin alongside the metrics JSON ---
    // Format: raw F32 little-endian, shape [nh, hd] = nh*hd*4 bytes = 16384 bytes for nh=16, hd=256
    let gpu_out_bytes: Vec<u8> = gpu_output
        .iter()
        .flat_map(|v| v.to_le_bytes())
        .collect();

    // Derive bin path from out_path: strip any .json extension, append _sdpa_out.bin
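    // e.g. --out /tmp/varA.json -> /tmp/varA_sdpa_out.bin (illustrative paths);
    //      --out /tmp/varA      -> /tmp/varA_sdpa_out.bin as well.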
    let out_stem = if out_path.extension().map(|e| e == "json").unwrap_or(false) {
        out_path.with_extension("")
    } else {
        out_path.clone()
    };
    let bin_path = {
        let mut p = out_stem.into_os_string();
        p.push("_sdpa_out.bin");
        PathBuf::from(p)
    };

    if let Some(parent) = bin_path.parent() {
        fs::create_dir_all(parent).ok();
    }
    fs::write(&bin_path, &gpu_out_bytes).unwrap_or_else(|e| {
        eprintln!("ERROR: failed to write sdpa_out bin to {:?}: {}", bin_path, e);
        std::process::exit(1);
    });
    eprintln!("sdpa_out bin written: {:?} ({} bytes)", bin_path, gpu_out_bytes.len());

    let ran_at = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)
        .map(|d| d.as_secs().to_string())
        .unwrap_or_else(|_| "unknown".into());

    let canary_str = match canary {
        CanaryMode::None => "none".to_string(),
        CanaryMode::InRange => "in-range".to_string(),
        CanaryMode::OutOfRange => "out-of-range".to_string(),
    };

    let metrics = ReplayMetrics {
        variation: variation.to_string(),
        canary: canary_str,
        ran_at,
        nrmse,
        max_abs_diff,
        per_head_max_abs_diff: per_head,
        any_nan_inf_in_gpu_output: has_nan_inf,
        exit_status: if has_nan_inf { "NaN/Inf" } else { "ok" }.into(),
        bin_path: bin_path.to_string_lossy().into_owned(),
        independent_floor_nrmse,
    };

    if has_nan_inf {
        eprintln!(
            "ERROR: GPU output for variation {} contains NaN or Inf",
            variation
        );
        std::process::exit(2);
    }

    metrics
}

// ---------------------------------------------------------------------------
// main
// ---------------------------------------------------------------------------

fn main() {
    let args = match parse_args() {
        Ok(a) => a,
        Err(e) => {
            eprintln!("argument error: {}", e);
            eprintln!(concat!(
                "usage: tq_kernel_replay\n",
                "  [--singlestep] --manifest <path> --variation <A|B|C>\n",
                "  [--multistep] --seed <hex_or_dec>\n",
                "  [--production-faithful]\n",
                "  [--oracle dequant|independent-floor|both]\n",
                "  [--canary in-range|out-of-range]\n",
                "  --out <path>"
            ));
            std::process::exit(1);
        }
    };

    // Initialise Metal device and kernel registry
    let device = MlxDevice::new().expect("MlxDevice::new");
    let mut registry = KernelRegistry::new();
    flash_attn_vec_tq::register(&mut registry);
    mlx_native::ops::flash_attn_vec::register(&mut registry);
    // fwht_standalone kernels are pre-registered inside KernelRegistry::new()

    match args.mode {
        ReplayMode::Singlestep => {
            let manifest_path = args.manifest.as_ref().expect("manifest required for singlestep");

            // Load manifest
            let manifest_bytes = fs::read(manifest_path).unwrap_or_else(|e| {
                eprintln!("failed to read manifest {:?}: {}", manifest_path, e);
                std::process::exit(1);
            });
            let manifest: Manifest = serde_json::from_slice(&manifest_bytes).unwrap_or_else(|e| {
                eprintln!("failed to parse manifest: {}", e);
                std::process::exit(1);
            });

            eprintln!(
                "tq_kernel_replay: singlestep variation={} canary={:?} oracle={:?} manifest={:?}",
                args.variation, args.canary, args.oracle, manifest_path
            );

            let metrics = run_variation(
                &manifest,
                args.variation,
                args.canary,
                args.oracle,
                &args.out,
                &device,
                &mut registry,
            );

            // Print summary to stdout
            let json = serde_json::to_string_pretty(&metrics).expect("serialize metrics");
            println!("{}", json);

            // Write metrics JSON to --out path
            let out_json = if args.out.extension().map(|e| e == "json").unwrap_or(false) {
                args.out.clone()
            } else {
                args.out.with_extension("json")
            };

            if let Some(parent) = out_json.parent() {
                fs::create_dir_all(parent).ok();
            }
            fs::write(&out_json, &json).unwrap_or_else(|e| {
                eprintln!("failed to write metrics to {:?}: {}", out_json, e);
                std::process::exit(1);
            });

            eprintln!(
                "RESULT: variation={} canary={:?} dequant_oracle_nrmse={:.6e} max_abs_diff={:.6} nan_inf={} independent_floor_nrmse={:?}",
                metrics.variation, args.canary, metrics.nrmse, metrics.max_abs_diff,
                metrics.any_nan_inf_in_gpu_output, metrics.independent_floor_nrmse
            );
            eprintln!("metrics written to {:?}", out_json);
        }

        ReplayMode::Multistep => {
            run_multistep(&args, &device, &mut registry);
        }

        ReplayMode::ProductionFaithful => {
            let out_dir = args.out.clone();
            run_multistep_production_faithful(&out_dir, &device, &mut registry);
        }
    }
}

// ---------------------------------------------------------------------------
// Multistep driver (P3b)
// ---------------------------------------------------------------------------


/// Multistep output row in JSON.
/// Emitted as a 4-row Markdown table + JSON for the 4 canonical positions {50, 500, 1050, 2048}.
/// ring_start = (abs_pos+1) % kv_capacity when abs_pos+1 >= kv_capacity, else 0.
/// kvl_logical = min(abs_pos+1, kv_capacity).
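/// Example: with kv_capacity=1024, abs_pos=50 gives kvl_logical=51, ring_start=0;
/// abs_pos=1050 gives kvl_logical=1024, ring_start=(1051 % 1024)=27.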
#[derive(Debug, Serialize)]
struct MultistepRow {
    abs_pos: u64,
    kvl_logical: usize,
    ring_start: u32,
    dequant_oracle_nrmse: f64,
    independent_floor_nrmse: f64,
    max_abs_diff: f32,
    verdict: String,
}

/// Derive a sub-seed from (base, pos, index) without XOR.
/// Used ONLY by the legacy --multistep mode (catalog #13 known defect, preserved for compat).
/// The --production-faithful mode does NOT use this function.
fn seeded_gaussian_seed(base: u64, pos: u64, idx: u64) -> u64 {
    // One splitmix64-style finalizer over base + (3*pos + idx) * 0x9E3779B97F4A7C15.
    // (The shift/multiply order below deviates from canonical splitmix64; the exact
    // sequence is preserved verbatim for legacy output compat.)
    let mut z = base.wrapping_add(pos.wrapping_mul(3).wrapping_add(idx).wrapping_mul(0x9E3779B97F4A7C15));
    z = (z.wrapping_shr(30)).wrapping_mul(0xBF58476D1CE4E5B9) ^ z;
    z = (z.wrapping_shr(27)).wrapping_mul(0x94D049BB133111EB) ^ z;
    z ^ z.wrapping_shr(31)
}

/// Seeded Box-Muller Gaussian PRNG matching the deterministic seed spec.
/// Self-contained: drives Box-Muller from a splitmix64 stream, so no `rand`
/// crate (or re-export) is required.
fn seeded_gaussian(initial: u64, n: usize) -> Vec<f32> {
    // Deterministic Box-Muller driven by a splitmix64 stream seeded by `initial`;
    // self-contained, no external deps.
    // NOTE: This legacy function is used only by --multistep mode; NOT by --production-faithful.
    let mut state: u64 = initial.wrapping_add(0x9e3779b97f4a7c15);
    let mut out = Vec::with_capacity(n);

    let next_u32 = |s: &mut u64| -> u32 {
        // Splitmix64 step
        *s = s.wrapping_add(0x9e3779b97f4a7c15);
        let mut z = *s;
        z = (z ^ (z >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
        z = (z ^ (z >> 27)).wrapping_mul(0x94d049bb133111eb);
        z = z ^ (z >> 31);
        (z >> 32) as u32
    };

    let to_unit = |u: u32| -> f32 {
        // Map [0, 2^32) to (0, 1) — avoid exact 0 for log.
        let v = (u as f64) / (u32::MAX as f64 + 1.0);
        if v < 1e-38 { 1e-38f32 } else { v as f32 }
    };

    let mut i = 0;
    while i < n {
        let u1 = to_unit(next_u32(&mut state));
        let u2 = to_unit(next_u32(&mut state));
        let mag = (-2.0 * u1.ln()).sqrt();
        let theta = 2.0 * std::f32::consts::PI * u2;
        out.push(mag * theta.cos());
        i += 1;
        if i < n {
            out.push(mag * theta.sin());
            i += 1;
        }
    }

    out
}

fn run_multistep(
    args: &Args,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
) {
    use mlx_native::ops::hadamard_quantize_kv;

    // Fixed Gemma-4 sliding layer params.
    let num_heads: u32 = 8;   // use a small but realistic value for synthetic runs
    let num_kv_heads: u32 = 4;
    let head_dim: u32 = 256;
    let kv_capacity: u32 = 1024;
    let scale: f32 = 1.0;
    let mask_type: u32 = 2;       // sliding
    let sliding_window: u32 = 1024;
    let softcap: f32 = 0.0;

    let nh = num_heads as usize;
    let nkv = num_kv_heads as usize;
    let hd = head_dim as usize;
    let kvc = kv_capacity as usize;

    // 4 canonical positions.
    let positions: &[u64] = &[50, 500, 1050, 2048];

    let seed_base = args.seed; // 0xC25EED
    eprintln!("tq_kernel_replay multistep: seed={:#x} positions={:?}", seed_base, positions);

    let mut rows: Vec<MultistepRow> = Vec::new();

    for &abs_pos in positions {
        let kvl_logical = ((abs_pos + 1) as usize).min(kvc);
        let ring_start: u32 = if abs_pos + 1 >= kvc as u64 {
            ((abs_pos + 1) % kvc as u64) as u32
        } else {
            0
        };

        eprintln!("--- multistep pos={} kvl_logical={} ring_start={} ---", abs_pos, kvl_logical, ring_start);

        // Generate deterministic Gaussian K/V history: [nkv, kvl_logical, hd] F32.
        // Legacy iter-3 seeding: per-position splitmix64 derivatives (not used by production-faithful).
        // NOTE: This sub-seeding pattern is a known defect (catalog #13); it is preserved here
        // only for backward compat with the --multistep mode. The --production-faithful mode uses
        // a single Xoshiro256StarStar instance with no per-position reseeding.
        let k_seed = seeded_gaussian_seed(seed_base, abs_pos, 0);
        let v_seed = seeded_gaussian_seed(seed_base, abs_pos, 1);
        let q_seed = seeded_gaussian_seed(seed_base, abs_pos, 2);

        let k_pre_flat: Vec<f32> = seeded_gaussian(k_seed, nkv * kvl_logical * hd);
        let v_pre_flat: Vec<f32> = seeded_gaussian(v_seed, nkv * kvl_logical * hd);
        let q_natural: Vec<f32> = seeded_gaussian(q_seed, nh * hd);

        // Encode K/V via hadamard_quantize_kv GPU dispatch for EACH chronological position.
        // Layout: k_packed [nkv, kv_capacity, hd/2] u8; k_norms [nkv, kv_capacity] f32.
        let k_packed_bytes = nkv * kvc * (hd / 2);
        let norms_bytes = nkv * kvc * 4;
        let k_dense_bytes = nkv * kvc * hd * 4;

        let mut k_packed_buf = device
            .alloc_buffer(k_packed_bytes, DType::U8, vec![nkv, kvc, hd / 2])
            .expect("alloc K packed multistep");
        let mut k_norms_buf = device
            .alloc_buffer(norms_bytes, DType::F32, vec![nkv, kvc])
            .expect("alloc K norms multistep");
        let mut v_packed_buf = device
            .alloc_buffer(k_packed_bytes, DType::U8, vec![nkv, kvc, hd / 2])
            .expect("alloc V packed multistep");
        let mut v_norms_buf = device
            .alloc_buffer(norms_bytes, DType::F32, vec![nkv, kvc])
            .expect("alloc V norms multistep");

        // Zero-initialize norms (positions not written will have 0 norm = silence).
        k_norms_buf.as_mut_slice::<f32>().expect("zero K norms").iter_mut().for_each(|v| *v = 0.0);
        v_norms_buf.as_mut_slice::<f32>().expect("zero V norms").iter_mut().for_each(|v| *v = 0.0);

        // For each chronological position i, write the K/V vector at physical row (ring_start + i) % kvc.
        // Use dispatch_hadamard_quantize_kv with cache_pos = physical row.
        // Batch all positions into one encoder.
        {
            let mut enc = device.command_encoder().expect("enc multistep encode");
            for logical_i in 0..kvl_logical {
                let phys_row = ((ring_start as usize) + logical_i) % kvc;

                // Single-token K/V: [nkv, hd] F32. Build a temp buf.
                let k_token_bytes = nkv * hd * 4;
                let mut k_token_buf = device
                    .alloc_buffer(k_token_bytes, DType::F32, vec![nkv, hd])
                    .expect("alloc K token");
                let mut v_token_buf = device
                    .alloc_buffer(k_token_bytes, DType::F32, vec![nkv, hd])
                    .expect("alloc V token");

                {
                    // k_pre_flat layout is [nkv, kvl_logical, hd]: element (kv_h, logical_i, c)
                    // sits at kv_h * kvl_logical * hd + logical_i * hd + c.
                    // Gather this token into [nkv, hd] interleaved buffers.
                    let kslice = k_token_buf.as_mut_slice::<f32>().expect("write K token");
                    let vslice = v_token_buf.as_mut_slice::<f32>().expect("write V token");
                    for kv_h in 0..nkv {
                        let src_off = kv_h * kvl_logical * hd + logical_i * hd;
                        let dst_off = kv_h * hd;
                        kslice[dst_off..dst_off + hd].copy_from_slice(
                            &k_pre_flat[src_off..src_off + hd]);
                        vslice[dst_off..dst_off + hd].copy_from_slice(
                            &v_pre_flat[src_off..src_off + hd]);
                    }
                }

                enc.memory_barrier();
                hadamard_quantize_kv::dispatch_hadamard_quantize_kv(
                    &mut enc, registry, device.metal_device(),
                    &k_token_buf,
                    &k_packed_buf,
                    &k_norms_buf,
                    nkv as u32, head_dim, kvc as u32, phys_row as u32,
                    true, // kv_is_sliding (use ring-mode write)
                    None, None,
                ).expect("hadamard_quantize K multistep");
                enc.memory_barrier();
                hadamard_quantize_kv::dispatch_hadamard_quantize_kv(
                    &mut enc, registry, device.metal_device(),
                    &v_token_buf,
                    &v_packed_buf,
                    &v_norms_buf,
                    nkv as u32, head_dim, kvc as u32, phys_row as u32,
                    true, None, None,
                ).expect("hadamard_quantize V multistep");
            }
            enc.commit_and_wait().expect("multistep encode commit");
        }

        // Read back packed K/V and norms for CPU dequant oracle.
        let k_packed_all: Vec<u8> = k_packed_buf.as_slice::<u8>().expect("read K packed").to_vec();
        let v_packed_all: Vec<u8> = v_packed_buf.as_slice::<u8>().expect("read V packed").to_vec();
        let k_norms_all: Vec<f32> = k_norms_buf.as_slice::<f32>().expect("read K norms").to_vec();
        let v_norms_all: Vec<f32> = v_norms_buf.as_slice::<f32>().expect("read V norms").to_vec();

        // Build compact K/V (chronological order 0..kvl) from physical ring layout.
        // Physical row for logical i = (ring_start + i) % kvc.
        let mut k_packed_compact: Vec<u8> = vec![0u8; nkv * kvl_logical * (hd / 2)];
        let mut v_packed_compact: Vec<u8> = vec![0u8; nkv * kvl_logical * (hd / 2)];
        let mut k_norms_compact_ms: Vec<f32> = vec![0.0f32; nkv * kvl_logical];
        let mut v_norms_compact_ms: Vec<f32> = vec![0.0f32; nkv * kvl_logical];

        for kv_h in 0..nkv {
            for logical_i in 0..kvl_logical {
                let phys_row = ((ring_start as usize) + logical_i) % kvc;
                let src_pack_off = kv_h * kvc * (hd / 2) + phys_row * (hd / 2);
                let dst_pack_off = kv_h * kvl_logical * (hd / 2) + logical_i * (hd / 2);
                k_packed_compact[dst_pack_off..dst_pack_off + hd / 2]
                    .copy_from_slice(&k_packed_all[src_pack_off..src_pack_off + hd / 2]);
                v_packed_compact[dst_pack_off..dst_pack_off + hd / 2]
                    .copy_from_slice(&v_packed_all[src_pack_off..src_pack_off + hd / 2]);
                k_norms_compact_ms[kv_h * kvl_logical + logical_i] = k_norms_all[kv_h * kvc + phys_row];
                v_norms_compact_ms[kv_h * kvl_logical + logical_i] = v_norms_all[kv_h * kvc + phys_row];
            }
        }

        // Dequant oracle K/V (chronological order).
        let mut k_dequant: Vec<Vec<f32>> = Vec::with_capacity(nkv * kvl_logical);
        let mut v_dequant: Vec<Vec<f32>> = Vec::with_capacity(nkv * kvl_logical);
        for kv_h in 0..nkv {
            for pos in 0..kvl_logical {
                let pack_off = (kv_h * kvl_logical + pos) * (hd / 2);
                let norm_off = kv_h * kvl_logical + pos;
                k_dequant.push(nibble_dequantize(&k_packed_compact[pack_off..pack_off + hd / 2],
                    k_norms_compact_ms[norm_off], hd));
                v_dequant.push(nibble_dequantize(&v_packed_compact[pack_off..pack_off + hd / 2],
                    v_norms_compact_ms[norm_off], hd));
            }
        }

        // Dequant oracle cpu_sdpa.
        let cpu_ref = cpu_sdpa(
            &q_natural, &k_dequant, &v_dequant,
            nh, nkv, hd, kvl_logical, kvc, scale,
            mask_type, sliding_window, ring_start, softcap,
        );

        // Build GPU Q buffer.
        let mut q_buf = device.alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd]).expect("alloc Q ms");
        q_buf.as_mut_slice::<f32>().expect("write Q ms").copy_from_slice(&q_natural);

        // TQ SDPA GPU dispatch.
        let output_buf = device.alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd]).expect("alloc out ms");
        let tmp_bytes_tq = flash_attn_vec_tq::tmp_buffer_bytes(num_heads, head_dim);
        let tmp_buf = device.alloc_buffer(tmp_bytes_tq, DType::F32, vec![tmp_bytes_tq / 4]).expect("alloc tmp ms");

        let tq_params = mlx_native::ops::flash_attn_vec_tq::FlashAttnVecTqParams {
            num_heads,
            num_kv_heads,
            head_dim,
            kv_seq_len: kvl_logical as u32,
            kv_capacity,
            scale,
            mask_type,
            sliding_window,
            softcap,
            ring_start,
            scale_factor_d512: 1.0,
        };

        {
            let mut enc = device.command_encoder().expect("enc tq ms");
            enc.memory_barrier();
            // Forward FWHT on Q (Variation A).
            mlx_native::ops::fwht_standalone::dispatch_fwht_f32(
                &mut enc, registry, device.metal_device(), &q_buf, num_heads, head_dim,
            ).expect("FWHT Q ms");
            enc.memory_barrier();
            flash_attn_vec_tq::flash_attn_vec_tq(
                &mut enc, registry, device,
                &q_buf, &k_packed_buf, &k_norms_buf, &v_packed_buf, &v_norms_buf,
                &output_buf, &tmp_buf, &tq_params,
            ).expect("TQ SDPA ms");
            enc.memory_barrier();
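            // FWHT on the attention output to leave the Hadamard domain (the
            // "inverse FWHT" step of the Variation A path; the same kernel is
            // dispatched for both directions).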
            mlx_native::ops::fwht_standalone::dispatch_fwht_f32(
                &mut enc, registry, device.metal_device(), &output_buf, num_heads, head_dim,
            ).expect("FWHT out ms");
            enc.commit_and_wait().expect("tq ms commit");
        }

        let gpu_output: Vec<f32> = output_buf.as_slice::<f32>().expect("read out ms").to_vec();
        let (dequant_nrmse, max_abs_diff, _) = compute_metrics(&cpu_ref, &gpu_output, nh, hd);

        // Independent-floor oracle: pre-quant F32 K/V in physical-row layout → flash_attn_vec.
        let dense_kv_elems = nkv * kvc * hd;
        let mut k_dense_pre: Vec<f32> = vec![0.0f32; dense_kv_elems];
        let mut v_dense_pre: Vec<f32> = vec![0.0f32; dense_kv_elems];

        for kv_h in 0..nkv {
            for logical_i in 0..kvl_logical {
                let phys_row = ((ring_start as usize) + logical_i) % kvc;
                let src_off = kv_h * kvl_logical * hd + logical_i * hd;
                let dst_off = kv_h * kvc * hd + phys_row * hd;
                k_dense_pre[dst_off..dst_off + hd].copy_from_slice(&k_pre_flat[src_off..src_off + hd]);
                v_dense_pre[dst_off..dst_off + hd].copy_from_slice(&v_pre_flat[src_off..src_off + hd]);
            }
        }

        let dense_kv_bytes = dense_kv_elems * 4;
        let mut k_floor_buf = device.alloc_buffer(dense_kv_bytes, DType::F32, vec![nkv, kvc, hd]).expect("alloc K floor ms");
        let mut v_floor_buf = device.alloc_buffer(dense_kv_bytes, DType::F32, vec![nkv, kvc, hd]).expect("alloc V floor ms");
        k_floor_buf.as_mut_slice::<f32>().expect("write K floor ms").copy_from_slice(&k_dense_pre);
        v_floor_buf.as_mut_slice::<f32>().expect("write V floor ms").copy_from_slice(&v_dense_pre);

        let floor_output_buf = device.alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd]).expect("alloc floor out ms");
        let tmp_bytes_dense = flash_attn_vec::tmp_buffer_bytes(num_heads, head_dim);
        let tmp_floor_buf = device.alloc_buffer(tmp_bytes_dense, DType::F32, vec![tmp_bytes_dense / 4]).expect("alloc tmp floor ms");

        // Q in natural basis for independent-floor (no FWHT).
        let mut q_floor_buf = device.alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd]).expect("alloc Q floor ms");
        q_floor_buf.as_mut_slice::<f32>().expect("write Q floor ms").copy_from_slice(&q_natural);

        let floor_params = FlashAttnVecParams {
            num_heads,
            num_kv_heads,
            head_dim,
            kv_seq_len: kvl_logical as u32,
            kv_capacity,
            scale,
            mask_type,
            sliding_window,
            softcap,
        };

        {
            let mut enc = device.command_encoder().expect("enc floor ms");
            enc.memory_barrier();
            flash_attn_vec::flash_attn_vec(
                &mut enc, registry, device,
                &q_floor_buf, &k_floor_buf, &v_floor_buf,
                &floor_output_buf, &tmp_floor_buf, &floor_params,
            ).expect("floor flash_attn_vec ms");
            enc.commit_and_wait().expect("floor ms commit");
        }

        let floor_output: Vec<f32> = floor_output_buf.as_slice::<f32>().expect("read floor ms").to_vec();
        let (floor_nrmse, _, _) = compute_metrics(&floor_output, &gpu_output, nh, hd);

        // Decision-tree verdict for this position.
        let verdict = if dequant_nrmse < 0.01 && floor_nrmse < 0.01 {
            "kernel_end_to_end_correct".to_string()
        } else if dequant_nrmse < 0.01 && floor_nrmse >= 0.01 {
            "dequant_spec_bug_confirmed".to_string()
        } else if dequant_nrmse >= 0.01 && floor_nrmse < 0.01 {
            "fwht_pipeline_bug".to_string()
        } else {
            // Both diverge — check if ring-wrap-specific.
            if abs_pos > 1000 {
                "ring_start_or_dispatch_bug".to_string()
            } else {
                "h1_kernel_bug".to_string()
            }
        };

        eprintln!(
            "pos={} kvl={} ring_start={} dequant_nrmse={:.4e} floor_nrmse={:.4e} verdict={}",
            abs_pos, kvl_logical, ring_start, dequant_nrmse, floor_nrmse, verdict
        );

        rows.push(MultistepRow {
            abs_pos,
            kvl_logical,
            ring_start,
            dequant_oracle_nrmse: dequant_nrmse,
            independent_floor_nrmse: floor_nrmse,
            max_abs_diff,
            verdict,
        });
    }

    // Emit Markdown table.
    let md_table = {
        let mut s = String::new();
        s.push_str("| pos | kvl_logical | ring_start | dequant_oracle_nrmse | independent_floor_nrmse | verdict |\n");
        s.push_str("|-----|-------------|------------|---------------------|------------------------|--------|\n");
        for r in &rows {
            s.push_str(&format!(
                "| {} | {} | {} | {:.4e} | {:.4e} | {} |\n",
                r.abs_pos, r.kvl_logical, r.ring_start,
                r.dequant_oracle_nrmse, r.independent_floor_nrmse, r.verdict
            ));
        }
        s
    };
    println!("{}", md_table);

    // Emit JSON.
    let json_out = serde_json::to_string_pretty(&rows).expect("serialize multistep rows");

    // Write .md and .json files.
    let out_base = &args.out;
    if let Some(parent) = out_base.parent() {
        fs::create_dir_all(parent).ok();
    }

    let md_path = {
        let mut p = out_base.as_os_str().to_owned();
        p.push(".md");
        PathBuf::from(p)
    };
    let json_path = {
        let mut p = out_base.as_os_str().to_owned();
        p.push(".json");
        PathBuf::from(p)
    };

    fs::write(&md_path, md_table.as_bytes()).unwrap_or_else(|e| {
        eprintln!("failed to write multistep md {:?}: {}", md_path, e);
    });
    fs::write(&json_path, json_out.as_bytes()).unwrap_or_else(|e| {
        eprintln!("failed to write multistep json {:?}: {}", json_path, e);
    });

    eprintln!("multistep results written to {:?} and {:?}", md_path, json_path);

    // Overall decision-tree verdict — mirror of per-row reducer (lines 1651-1664)
    // applied to the aggregate matrix. Four dequant-vs-floor branches then
    // ring-wrap-vs-all fallback when both oracles diverge.
    let all_dequant_clean = rows.iter().all(|r| r.dequant_oracle_nrmse < 0.01);
    let all_floor_clean = rows.iter().all(|r| r.independent_floor_nrmse < 0.01);
    let overall = if all_dequant_clean && all_floor_clean {
        "kernel_end_to_end_correct"
    } else if all_dequant_clean && !all_floor_clean {
        "dequant_spec_bug_confirmed"
    } else if !all_dequant_clean && all_floor_clean {
        "fwht_pipeline_bug"
    } else {
        // Both oracles show divergence. Distinguish ring-wrap-only from whole-matrix.
        let pre_wrap_clean = rows.iter()
            .filter(|r| r.abs_pos <= 500)
            .all(|r| r.dequant_oracle_nrmse < 0.01 && r.independent_floor_nrmse < 0.01);
        let wrap_divergent = rows.iter()
            .filter(|r| r.abs_pos > 1000)
            .any(|r| r.dequant_oracle_nrmse >= 0.01 || r.independent_floor_nrmse >= 0.01);
        if pre_wrap_clean && wrap_divergent {
            "ring_start_or_dispatch_bug"
        } else {
            "h1_kernel_bug"
        }
    };
    eprintln!("OVERALL decision-tree branch: {}", overall);

}

// ---------------------------------------------------------------------------
// iter-5 production-faithful controlled sweep
// ---------------------------------------------------------------------------

/// Xoshiro256** PRNG — same implementation as tests/round_trip_identity.rs.
/// ONE instance is created at the top of run_multistep_production_faithful and
/// advances through ALL data generation for ALL sweep points in declaration order.
/// Catalog #13: NO per-position reseed-via-xor, NO xor-with-abs_pos, NO xor-with-kvl, NO XOR derivation.
#[derive(Clone)]
struct Xoshiro256StarStar {
    s: [u64; 4],
}

impl Xoshiro256StarStar {
    fn seed_from_u64(seed: u64) -> Self {
        // SplitMix64 initialiser — same as round_trip_identity.rs
        let mut z = seed;
        let mut s = [0u64; 4];
        for si in s.iter_mut() {
            z = z.wrapping_add(0x9E3779B97F4A7C15);
            let mut x = z;
            x = (x ^ (x >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
            x = (x ^ (x >> 27)).wrapping_mul(0x94D049BB133111EB);
            *si = x ^ (x >> 31);
        }
        Self { s }
    }

    fn next_u64(&mut self) -> u64 {
        let result = self.s[1].wrapping_mul(5).rotate_left(7).wrapping_mul(9);
        let t = self.s[1] << 17;
        self.s[2] ^= self.s[0];
        self.s[3] ^= self.s[1];
        self.s[1] ^= self.s[2];
        self.s[0] ^= self.s[3];
        self.s[2] ^= t;
        self.s[3] = self.s[3].rotate_left(45);
        result
    }

    /// Draw one Box-Muller pair of N(0,1) samples.
    fn next_gaussian_pair(&mut self) -> (f32, f32) {
        // Draw two uniform (0,1) values.
        let u1 = {
            let v = (self.next_u64() >> 11) as f64 / (1u64 << 53) as f64;
            // Avoid exact 0 for ln.
            if v < 1e-38 { 1e-38f64 } else { v }
        };
        let u2 = (self.next_u64() >> 11) as f64 / (1u64 << 53) as f64;
        let mag = (-2.0 * u1.ln()).sqrt() as f32;
        let theta = (2.0 * std::f64::consts::PI * u2) as f32;
        (mag * theta.cos(), mag * theta.sin())
    }

    /// Draw n N(0,1) samples, consuming ceil(n/2) pairs from the PRNG.
    fn draw_gaussian(&mut self, n: usize) -> Vec<f32> {
        let mut out = Vec::with_capacity(n);
        let mut i = 0;
        while i < n {
            let (a, b) = self.next_gaussian_pair();
            out.push(a);
            i += 1;
            if i < n {
                out.push(b);
                i += 1;
            }
        }
        out
    }
}
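
// A minimal sanity sketch (not part of the original harness; module name is new):
// two generators seeded with the same u64 must emit identical streams, and Clone
// must snapshot the state, which the ring-wrap A/B legs rely on to replay
// identical draws.
#[cfg(test)]
mod xoshiro_sketch {
    use super::Xoshiro256StarStar;

    #[test]
    fn same_seed_same_stream_and_clone_snapshots() {
        let mut a = Xoshiro256StarStar::seed_from_u64(0xC25EED);
        let mut b = Xoshiro256StarStar::seed_from_u64(0xC25EED);
        // Identical seeds: identical first outputs.
        assert_eq!(a.next_u64(), b.next_u64());
        // Clone captures the advanced state exactly.
        let mut snap = a.clone();
        assert_eq!(a.draw_gaussian(5), snap.draw_gaussian(5));
    }
}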

/// Apply per-head RMSNorm with eps=1e-6 and UNIT weights.
///
/// Production path (forward_mlx.rs:1144-1165) uses learned q_norm_weight/k_norm_weight
/// (forward_mlx.rs:1148, 1159). V uses dispatch_rms_norm_unit_perhead (no learned weight,
/// forward_mlx.rs:1178-1205 direct read confirms unit-weight-only path for V).
///
/// Iter-5 uses UNIT weights for the Q and K norms as well, since the GGUF weights
/// are not available on the test machine. Regime-faithful in shape/scale/eps/formula;
/// not literal end-to-end weight parity.
/// This is disclosed in audit.json under regime.rmsnorm_weights = "unit_fallback".
///
/// eps=1e-6 matches config.rs:100 (rms_norm_eps=1e-6).
/// Formula: x / sqrt(mean(x^2) + eps)  — catalog #4: +eps is mandatory.
fn rms_norm_per_head(x: &mut [f32], num_rows: usize, head_dim: usize) {
    assert_eq!(x.len(), num_rows * head_dim);
    let eps = 1e-6f32;
    for row in 0..num_rows {
        let off = row * head_dim;
        let mean_sq: f32 = x[off..off + head_dim].iter().map(|&v| v * v).sum::<f32>() / head_dim as f32;
        let inv_rms = 1.0 / (mean_sq + eps).sqrt();
        for c in 0..head_dim {
            x[off + c] *= inv_rms;
        }
    }
}
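
// A minimal sanity sketch (not part of the original harness; module name is new):
// for a constant row of value c, mean(x^2) = c^2, so each element maps to
// c / sqrt(c^2 + eps), i.e. ~1.0 for c=2.0 and head_dim=4.
#[cfg(test)]
mod rms_norm_per_head_sketch {
    use super::rms_norm_per_head;

    #[test]
    fn constant_row_normalizes_to_unit_magnitude() {
        let mut x = vec![2.0f32; 4];
        rms_norm_per_head(&mut x, 1, 4);
        for v in &x {
            // 2 / sqrt(4 + 1e-6) differs from 1.0 only by ~1.2e-7.
            assert!((v - 1.0).abs() < 1e-5, "got {}", v);
        }
    }
}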

/// Apply NeoX-style RoPE rotation in-place.
///
/// NeoX convention: first half of head_dim paired with second half.
/// theta=10000 matches config.rs:101 (rope_theta_sliding=10000).
/// Applied to Q (at abs_pos) and to each K row (at its chronological position p).
///
/// Evidence: forward_mlx.rs:1144-1165 dispatches fused_head_norm_rope on Q and K
/// using theta_sliding=10000 and NeoX rotation style.
fn apply_rope_neox(x: &mut [f32], num_rows: usize, head_dim: usize, abs_pos: usize, theta: f32) {
    assert_eq!(x.len(), num_rows * head_dim);
    let half = head_dim / 2;
    for row in 0..num_rows {
        let off = row * head_dim;
        for i in 0..half {
            let freq = 1.0 / theta.powf(i as f32 * 2.0 / head_dim as f32);
            let angle = abs_pos as f32 * freq;
            let (sin_a, cos_a) = angle.sin_cos();
            let x0 = x[off + i];
            let x1 = x[off + i + half];
            x[off + i]      = x0 * cos_a - x1 * sin_a;
            x[off + i + half] = x0 * sin_a + x1 * cos_a;
        }
    }
}
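
// A minimal sanity sketch (not part of the original harness; module name is new):
// at abs_pos=0 every rotation angle is 0 (sin=0, cos=1), so the NeoX rotation
// must be an exact identity regardless of theta.
#[cfg(test)]
mod apply_rope_neox_sketch {
    use super::apply_rope_neox;

    #[test]
    fn position_zero_is_identity() {
        let original: Vec<f32> = (0..8).map(|i| i as f32 - 3.5).collect();
        let mut x = original.clone();
        apply_rope_neox(&mut x, 1, 8, 0, 10000.0);
        assert_eq!(x, original);
    }
}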

/// CPU reference SDPA used in the production-faithful sweep.
///
/// Q is post-RMSNorm post-RoPE F32 (natural basis), same as what hadamard_quantize_kv receives.
/// K/V come from the pre-quant F32 path (independent-floor oracle, #7 compliance).
/// scale=1.0 per forward_mlx.rs:1664.
/// mask_type=2 (sliding window) per forward_mlx.rs:1665.
fn cpu_sdpa_pf(
    q: &[f32],           // [nh, hd]
    k: &[Vec<f32>],      // [nkv * kvl, hd] chronological
    v: &[Vec<f32>],      // [nkv * kvl, hd] chronological
    nh: usize,
    nkv: usize,
    hd: usize,
    kvl: usize,
    scale: f32,
    mask_type: u32,
    sliding_window: u32,
    softcap: f32,
) -> Vec<f32> {
    let mut output = vec![0.0f32; nh * hd];
    let heads_per_kv = nh / nkv;

    for h in 0..nh {
        let kv_h = h / heads_per_kv;
        let q_off = h * hd;
        let first_valid: usize = if mask_type == 2 {
            let sw = sliding_window as usize;
            if kvl > sw { kvl - sw } else { 0 }
        } else {
            0
        };

        let mut scores: Vec<f32> = Vec::with_capacity(kvl);
        for p in 0..kvl {
            if p < first_valid {
                scores.push(f32::NEG_INFINITY);
                continue;
            }
            let k_vec = &k[kv_h * kvl + p];
            let mut dot = 0.0f32;
            for c in 0..hd {
                dot += q[q_off + c] * k_vec[c];
            }
            let score = if softcap > 0.0 {
                softcap * (dot * scale / softcap).tanh()
            } else {
                dot * scale
            };
            scores.push(score);
        }

        let max_score = scores.iter().copied().fold(f32::NEG_INFINITY, f32::max);
        let mut exp_scores: Vec<f32> = scores.iter().map(|&s| {
            if s == f32::NEG_INFINITY { 0.0f32 } else { (s - max_score).exp() }
        }).collect();
        let sum: f32 = exp_scores.iter().sum();
        if sum > 0.0 {
            for e in &mut exp_scores { *e /= sum; }
        }

        let o_off = h * hd;
        for p in 0..kvl {
            let w = exp_scores[p];
            if w == 0.0 { continue; }
            let v_vec = &v[kv_h * kvl + p];
            for c in 0..hd {
                output[o_off + c] += w * v_vec[c];
            }
        }
    }
    output
}
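
// A minimal sanity sketch (not part of the original harness; module name is new):
// with a single KV position the softmax weight is exactly 1.0, so the attention
// output must equal that V row bit-for-bit.
#[cfg(test)]
mod cpu_sdpa_pf_sketch {
    use super::cpu_sdpa_pf;

    #[test]
    fn single_position_returns_v_row() {
        let hd = 4;
        let q = vec![0.5f32; hd];
        let k = vec![vec![1.0f32; hd]];
        let v = vec![vec![3.0f32, -1.0, 0.25, 7.0]];
        // nh=1, nkv=1, kvl=1, scale=1.0, mask_type=2 (sliding), sw=1024, softcap=0.
        let out = cpu_sdpa_pf(&q, &k, &v, 1, 1, hd, 1, 1.0, 2, 1024, 0.0);
        assert_eq!(out, v[0]);
    }
}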

/// NRMSE: sqrt(sum_sq(a - b) / sum_sq(b)).
fn nrmse_f32(a: &[f32], b: &[f32]) -> f32 {
    assert_eq!(a.len(), b.len());
    let mut ss_diff = 0.0f64;
    let mut ss_ref  = 0.0f64;
    for (&ai, &bi) in a.iter().zip(b.iter()) {
        let diff = (ai - bi) as f64;
        ss_diff += diff * diff;
        ss_ref  += (bi as f64) * (bi as f64);
    }
    if ss_ref == 0.0 { return 0.0; }
    (ss_diff / ss_ref).sqrt() as f32
}
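
// A minimal sanity sketch (not part of the original harness; module name is new):
// a uniform 10% overshoot of b=[3,4] gives diff=[0.3,0.4], so
// NRMSE = sqrt(0.25 / 25) = 0.1.
#[cfg(test)]
mod nrmse_sketch {
    use super::nrmse_f32;

    #[test]
    fn ten_percent_uniform_error() {
        let b = [3.0f32, 4.0];
        let a = [3.3f32, 4.4];
        let n = nrmse_f32(&a, &b);
        assert!((n - 0.1).abs() < 1e-6, "got {}", n);
    }
}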

/// Run the prerequisite regression gates via std::process::Command.
/// Returns the structured gate results. Panics if any gate exits with a non-zero code.
/// Catalog #12: gate statuses MUST be binary-emitted, not narrative-injected.
/// Catalog #14: manifest path resolved at compile time from env!("CARGO_MANIFEST_DIR") so
///   gates run against the WORKTREE (the checkout that was compiled), not a hardcoded main.
///   The resolved path is emitted into audit.json.regression_gates.manifest_path.
fn run_regression_gates() -> serde_json::Value {
    use std::process::Command;
    use std::time::Instant;

    // H1 (catalog #14): compile-time resolution — CARGO_MANIFEST_DIR is set by cargo to
    // the package root at build time. Because this binary is built FROM the worktree,
    // this path is guaranteed to point to the worktree's Cargo.toml, NOT to /opt/mlx-native.
    const MANIFEST_DIR: &str = env!("CARGO_MANIFEST_DIR");
    let manifest_path: String = format!("{}/Cargo.toml", MANIFEST_DIR);
    let mp = manifest_path.as_str(); // borrow for use in Vec<&str> below
    let gates: &[(&str, Vec<&str>)] = &[
        (
            "gate_round_trip_identity",
            vec![
                "test", "--release",
                "--manifest-path", mp,
                "--test", "round_trip_identity",
                "--", "--nocapture",
            ],
        ),
        (
            "gate_bitwidth_ab",
            vec![
                "test", "--release",
                "--manifest-path", mp,
                "--test", "bitwidth_ab",
                "--", "--nocapture",
            ],
        ),
        (
            "gate_multistep_self_check",
            vec![
                "test", "--release",
                "--manifest-path", mp,
                "--test", "test_flash_attn_vec_tq",
                "--", "--nocapture",
            ],
        ),
    ];

    let mut gate_results = serde_json::Map::new();

    // Emit resolved manifest_path for R-11 / AC-3 verification.
    gate_results.insert("manifest_path".to_string(), serde_json::json!(&manifest_path));

    for (gate_id, cargo_args) in gates {
        eprintln!("[gate] running: cargo {}", cargo_args.join(" "));
        let start = Instant::now();
        let result = Command::new("cargo")
            .args(cargo_args)
            .output()
            .unwrap_or_else(|e| panic!("prerequisite gate {} failed to spawn: {}", gate_id, e));
        let duration_ms = start.elapsed().as_millis() as u64;

        let exit_code = result.status.code().unwrap_or(-1);

        // Capture last 40 lines of stdout and stderr.
        let stdout_str = String::from_utf8_lossy(&result.stdout);
        let stderr_str = String::from_utf8_lossy(&result.stderr);
        let last_40_stdout: Vec<&str> = stdout_str.lines().collect::<Vec<_>>()
            .into_iter().rev().take(40).rev().collect();
        let last_40_stderr: Vec<&str> = stderr_str.lines().collect::<Vec<_>>()
            .into_iter().rev().take(40).rev().collect();

        let status = if exit_code == 0 { "PASS" } else { "FAIL" };

        eprintln!("[gate] {} exit_code={} status={} duration={}ms", gate_id, exit_code, status, duration_ms);

        gate_results.insert(gate_id.to_string(), serde_json::json!({
            "exit_code": exit_code,
            "status": status,
            "duration_ms": duration_ms,
            "last_40_stdout_lines": last_40_stdout,
            "last_40_stderr_lines": last_40_stderr,
        }));

        if exit_code != 0 {
            panic!("prerequisite gate {} failed with exit_code={}; iter-5 REJECTED before measurement",
                gate_id, exit_code);
        }
    }

    serde_json::Value::Object(gate_results)
}

/// Struct for one sweep row result.
#[derive(Debug, Serialize)]
struct SweepRow {
    abs_pos: usize,
    kvl_logical: usize,
    sliding_window: u32,
    nrmse: f32,
    band_ok: bool,
    rng_u64s_consumed_before: u64,
}

/// Encode K/V for one sweep point using hadamard_quantize_kv and return the
/// TQ-packed ring buffer + norms. K and V are provided as pre-quant F32
/// in physical-ring layout [nkv, kv_capacity, hd].
///
/// Note: despite the function name, only the packed ring buffers and norms are
/// returned; the caller's reference is the pre-quant F32 dense oracle built in
/// run_sweep_point, so no dequantized compact K/V is produced here.
fn encode_and_get_oracle(
    k_pre_ring: &[f32],   // [nkv, kvc, hd] F32 physical layout
    v_pre_ring: &[f32],
    nkv: usize,
    kvc: usize,
    hd: usize,
    kvl: usize,
    ring_start: usize,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
) -> (
    Vec<u8>,   // k_packed_buf [nkv, kvc, hd/2]
    Vec<f32>,  // k_norms [nkv, kvc]
    Vec<u8>,   // v_packed_buf [nkv, kvc, hd/2]
    Vec<f32>,  // v_norms [nkv, kvc]
) {
    use mlx_native::ops::hadamard_quantize_kv;

    let k_packed_bytes = nkv * kvc * (hd / 2);
    let norms_bytes    = nkv * kvc * 4;

    let mut k_packed_buf = device.alloc_buffer(k_packed_bytes, DType::U8, vec![nkv, kvc, hd / 2])
        .expect("alloc K packed pf");
    let mut k_norms_buf  = device.alloc_buffer(norms_bytes, DType::F32, vec![nkv, kvc])
        .expect("alloc K norms pf");
    let mut v_packed_buf = device.alloc_buffer(k_packed_bytes, DType::U8, vec![nkv, kvc, hd / 2])
        .expect("alloc V packed pf");
    let mut v_norms_buf  = device.alloc_buffer(norms_bytes, DType::F32, vec![nkv, kvc])
        .expect("alloc V norms pf");

    // Zero-init norms.
    k_norms_buf.as_mut_slice::<f32>().expect("zero K norms pf").iter_mut().for_each(|v| *v = 0.0);
    v_norms_buf.as_mut_slice::<f32>().expect("zero V norms pf").iter_mut().for_each(|v| *v = 0.0);

    // Encode each chronological position into the ring buffer.
    let mut enc = device.command_encoder().expect("enc pf encode");
    for logical_i in 0..kvl {
        let phys_row = (ring_start + logical_i) % kvc;

        // Single-token K/V: [nkv, hd] F32.
        let tok_bytes = nkv * hd * 4;
        let mut k_tok = device.alloc_buffer(tok_bytes, DType::F32, vec![nkv, hd])
            .expect("alloc K tok pf");
        let mut v_tok = device.alloc_buffer(tok_bytes, DType::F32, vec![nkv, hd])
            .expect("alloc V tok pf");

        {
            let ks = k_tok.as_mut_slice::<f32>().expect("write K tok");
            let vs = v_tok.as_mut_slice::<f32>().expect("write V tok");
            for kv_h in 0..nkv {
                let src = kv_h * kvc * hd + phys_row * hd;
                let dst = kv_h * hd;
                ks[dst..dst + hd].copy_from_slice(&k_pre_ring[src..src + hd]);
                vs[dst..dst + hd].copy_from_slice(&v_pre_ring[src..src + hd]);
            }
        }

        enc.memory_barrier();
        hadamard_quantize_kv::dispatch_hadamard_quantize_kv(
            &mut enc, registry, device.metal_device(),
            &k_tok, &k_packed_buf, &k_norms_buf,
            nkv as u32, hd as u32, kvc as u32, phys_row as u32, true, None, None,
        ).expect("hadamard_quantize K pf");
        enc.memory_barrier();
        hadamard_quantize_kv::dispatch_hadamard_quantize_kv(
            &mut enc, registry, device.metal_device(),
            &v_tok, &v_packed_buf, &v_norms_buf,
            nkv as u32, hd as u32, kvc as u32, phys_row as u32, true, None, None,
        ).expect("hadamard_quantize V pf");
    }
    enc.commit_and_wait().expect("pf encode commit");

    let k_packed_out = k_packed_buf.as_slice::<u8>().expect("read K packed pf").to_vec();
    let k_norms_out  = k_norms_buf.as_slice::<f32>().expect("read K norms pf").to_vec();
    let v_packed_out = v_packed_buf.as_slice::<u8>().expect("read V packed pf").to_vec();
    let v_norms_out  = v_norms_buf.as_slice::<f32>().expect("read V norms pf").to_vec();

    (k_packed_out, k_norms_out, v_packed_out, v_norms_out)
}

/// Run one sweep point: synthesize K/V/Q from rng, apply prod-regime transforms,
/// encode TQ, dispatch GPU kernel, compare to pre-quant F32 dense oracle.
/// Returns the nrmse of (tq_gpu_out, dense_floor_out).
///
/// Dense floor oracle: flash_attn_vec on POST-RMSNorm POST-RoPE F32 Q/K/V (same tensors
/// fed to hadamard_quantize_kv). This is the #7-compliant upstream-independent reference.
///
/// H3 (catalog #16): `override_ring_start` — when Some(x), use x as ring_start for BOTH
///   the kernel dispatch (FlashAttnVecTqParams.ring_start) AND the physical-ring layout
///   construction for K/V encoding (phys_row = (ring_start + logical_i) % kvc).
///   When None, the production formula is used: (abs_pos+1) % kvc when abs_pos+1 >= kvc.
///   The ring_wrap legs call this function TWICE with identical drawn data (via RNG clone/
///   restore by the caller) and different override_ring_start values, so ab_delta measures
///   kernel sensitivity to ring_start, not RNG noise.
fn run_sweep_point(
    rng: &mut Xoshiro256StarStar,
    rng_counter: &mut u64,
    abs_pos: usize,
    kvl: usize,
    kvc: usize,
    sliding_window: u32,
    override_ring_start: Option<u32>,  // H3: R-13 — flows to kernel AND oracle layout
    device: &MlxDevice,
    registry: &mut KernelRegistry,
) -> f32 {
    // Production-faithful Gemma 4 sliding layer constants.
    // forward_mlx.rs:1617 scale=1.0, forward_mlx.rs:1664 TQ scale=1.0.
    // forward_mlx.rs:1665 mask_type=2 (sliding). forward_mlx.rs:1666 sliding_window.
    // config.rs:100 rms_norm_eps=1e-6. config.rs:101 rope_theta_sliding=10000.
    let nh:  usize = 16;
    let nkv: usize = 8;
    let hd:  usize = 256;
    let scale:     f32 = 1.0;
    let mask_type: u32 = 2;
    let softcap:   f32 = 0.0;
    let rope_theta:  f32 = 10000.0;

    // H3 / R-13: use override if provided; otherwise compute production formula.
    let ring_start = override_ring_start
        .map(|x| x as usize)
        .unwrap_or_else(|| if abs_pos + 1 >= kvc { (abs_pos + 1) % kvc } else { 0 });

    // Draw K, V, Q from the persistent RNG (catalog #13: single seed, single instance).
    // Order per spec: draw (nkv × kvl × hd) K, then (nkv × kvl × hd) V, then (nh × hd) Q.
    let k_count = nkv * kvl * hd;
    let v_count = nkv * kvl * hd;
    let q_count = nh * hd;
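    // Counter bookkeeping: draw_gaussian(n) consumes ceil(n/2) Box-Muller pairs at
    // two u64s each, hence the ((n+1)/2)*2 increments below.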

    let k_raw = rng.draw_gaussian(k_count); *rng_counter += (k_count as u64 + 1) / 2 * 2;
    let v_raw = rng.draw_gaussian(v_count); *rng_counter += (v_count as u64 + 1) / 2 * 2;
    let q_raw = rng.draw_gaussian(q_count); *rng_counter += (q_count as u64 + 1) / 2 * 2;

    // Build pre-quant K/V in physical ring layout [nkv, kvc, hd].
    // For each chronological position logical_i, phys_row = (ring_start + logical_i) % kvc.
    // K and V at each position are post-RMSNorm. V is NOT RoPE'd (forward_mlx.rs:1167-1205).
    // K is RoPE'd at its chronological position.
    let mut k_pre_ring = vec![0.0f32; nkv * kvc * hd];
    let mut v_pre_ring = vec![0.0f32; nkv * kvc * hd];
    let mut k_chron: Vec<Vec<f32>> = Vec::with_capacity(nkv * kvl);
    let mut v_chron: Vec<Vec<f32>> = Vec::with_capacity(nkv * kvl);

    for logical_i in 0..kvl {
        let phys_row = (ring_start + logical_i) % kvc;
        // The chronological position of this token is (abs_pos - kvl + 1 + logical_i).
        // abs_pos is the current position (newest), so oldest = abs_pos - kvl + 1.
        // Use isize to handle synthetic cases where kvl > abs_pos+1 (sweep_B at small abs_pos).
        // When token_abs_pos is negative, clamp to 0 (same RoPE angle as position 0).
        let token_abs_pos_signed: isize =
            abs_pos as isize + 1 - kvl as isize + logical_i as isize;
        let token_abs_pos: usize = if token_abs_pos_signed < 0 { 0 } else { token_abs_pos_signed as usize };

        // Build [nkv, hd] K and V for this position from the raw draws.
        let mut k_tok = vec![0.0f32; nkv * hd];
        let mut v_tok = vec![0.0f32; nkv * hd];
        for kv_h in 0..nkv {
            let src = kv_h * kvl * hd + logical_i * hd;
            let dst = kv_h * hd;
            k_tok[dst..dst + hd].copy_from_slice(&k_raw[src..src + hd]);
            v_tok[dst..dst + hd].copy_from_slice(&v_raw[src..src + hd]);
        }

        // Apply per-head RMSNorm to K (catalog #4: +eps, eps=1e-6).
        rms_norm_per_head(&mut k_tok, nkv, hd);
        // Apply per-head RMSNorm to V (forward_mlx.rs:1178-1205: unit-weight RMSNorm on V).
        rms_norm_per_head(&mut v_tok, nkv, hd);

        // Apply RoPE to K at token_abs_pos (NeoX convention, theta=10000).
        apply_rope_neox(&mut k_tok, nkv, hd, token_abs_pos, rope_theta);
        // V is NOT RoPE'd (forward_mlx.rs:1167 section only norms V, no RoPE dispatch).

        // Write into physical ring layout.
        for kv_h in 0..nkv {
            let src = kv_h * hd;
            let dst = kv_h * kvc * hd + phys_row * hd;
            k_pre_ring[dst..dst + hd].copy_from_slice(&k_tok[src..src + hd]);
            v_pre_ring[dst..dst + hd].copy_from_slice(&v_tok[src..src + hd]);
        }

        // Collect chronological K/V for CPU oracle.
        for kv_h in 0..nkv {
            let src = kv_h * hd;
            k_chron.push(k_tok[src..src + hd].to_vec());
        }
        for kv_h in 0..nkv {
            let src = kv_h * hd;
            v_chron.push(v_tok[src..src + hd].to_vec());
        }
    }

    // Apply per-head RMSNorm to Q (catalog #4: +eps, eps=1e-6, per forward_mlx.rs:1144-1154).
    let mut q_normed = q_raw.clone();
    rms_norm_per_head(&mut q_normed, nh, hd);
    // Apply RoPE to Q at abs_pos (NeoX convention, theta=10000, per forward_mlx.rs:1144-1154).
    apply_rope_neox(&mut q_normed, nh, hd, abs_pos, rope_theta);

    // CPU dense floor oracle: post-RMSNorm post-RoPE F32 Q/K/V in chronological order.
    // This is the UPSTREAM-INDEPENDENT reference (catalog #7 compliance).
    // k_chron layout: [kvl, nkv, hd] — need to reorder to [nkv * kvl, hd].
    let mut k_oracle: Vec<Vec<f32>> = vec![vec![0.0f32; hd]; nkv * kvl];
    let mut v_oracle: Vec<Vec<f32>> = vec![vec![0.0f32; hd]; nkv * kvl];
    for logical_i in 0..kvl {
        for kv_h in 0..nkv {
            let src_k = &k_chron[logical_i * nkv + kv_h];
            let src_v = &v_chron[logical_i * nkv + kv_h];
            k_oracle[kv_h * kvl + logical_i].copy_from_slice(src_k);
            v_oracle[kv_h * kvl + logical_i].copy_from_slice(src_v);
        }
    }

    let dense_out = cpu_sdpa_pf(
        &q_normed, &k_oracle, &v_oracle,
        nh, nkv, hd, kvl, scale, mask_type, sliding_window, softcap,
    );

    // Encode K/V into TQ ring buffer via hadamard_quantize_kv GPU kernel.
    let (k_packed, k_norms, v_packed, v_norms) = encode_and_get_oracle(
        &k_pre_ring, &v_pre_ring, nkv, kvc, hd, kvl, ring_start, device, registry,
    );

    // Allocate GPU buffers.
    let kvc_u32  = kvc as u32;
    let nh_u32   = nh  as u32;
    let nkv_u32  = nkv as u32;
    let hd_u32   = hd  as u32;
    let kvl_u32  = kvl as u32;

    let k_pack_bytes = nkv * kvc * (hd / 2);
    let norm_bytes   = nkv * kvc * 4;

    let mut k_packed_buf = device.alloc_buffer(k_pack_bytes, DType::U8, vec![nkv, kvc, hd / 2])
        .expect("alloc K packed sweep");
    let mut k_norms_buf  = device.alloc_buffer(norm_bytes, DType::F32, vec![nkv, kvc])
        .expect("alloc K norms sweep");
    let mut v_packed_buf = device.alloc_buffer(k_pack_bytes, DType::U8, vec![nkv, kvc, hd / 2])
        .expect("alloc V packed sweep");
    let mut v_norms_buf  = device.alloc_buffer(norm_bytes, DType::F32, vec![nkv, kvc])
        .expect("alloc V norms sweep");

    k_packed_buf.as_mut_slice::<u8>().expect("write K packed").copy_from_slice(&k_packed);
    k_norms_buf.as_mut_slice::<f32>().expect("write K norms").copy_from_slice(&k_norms);
    v_packed_buf.as_mut_slice::<u8>().expect("write V packed").copy_from_slice(&v_packed);
    v_norms_buf.as_mut_slice::<f32>().expect("write V norms").copy_from_slice(&v_norms);

    // Q buffer (FWHT-domain: forward FWHT applied before TQ SDPA dispatch).
    let mut q_buf = device.alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd])
        .expect("alloc Q sweep");
    let mut q_fwht = q_normed.clone();
    // Apply FWHT per head (to match Variation A dispatch path in production).
    for h in 0..nh {
        let off = h * hd;
        fwht_inplace(&mut q_fwht[off..off + hd]).expect("FWHT Q sweep");
    }
    q_buf.as_mut_slice::<f32>().expect("write Q sweep").copy_from_slice(&q_fwht);

    // TQ SDPA GPU dispatch.
    let out_buf = device.alloc_buffer(nh * hd * 4, DType::F32, vec![nh, 1, hd])
        .expect("alloc out sweep");
    let tmp_bytes = flash_attn_vec_tq::tmp_buffer_bytes(nh_u32, hd_u32);
    let tmp_buf   = device.alloc_buffer(tmp_bytes, DType::F32, vec![tmp_bytes / 4])
        .expect("alloc tmp sweep");

    let tq_params = FlashAttnVecTqParams {
        num_heads: nh_u32,
        num_kv_heads: nkv_u32,
        head_dim: hd_u32,
        kv_seq_len: kvl_u32,
        kv_capacity: kvc_u32,
        scale,
        mask_type,
        sliding_window,
        softcap,
        ring_start: ring_start as u32,
        scale_factor_d512: 1.0,
    };

    {
        let mut enc = device.command_encoder().expect("enc sweep tq");
        enc.memory_barrier();
        flash_attn_vec_tq::flash_attn_vec_tq(
            &mut enc, registry, device,
            &q_buf, &k_packed_buf, &k_norms_buf, &v_packed_buf, &v_norms_buf,
            &out_buf, &tmp_buf, &tq_params,
        ).expect("TQ SDPA sweep");
        enc.memory_barrier();
        // Inverse FWHT on output (Variation A path).
        mlx_native::ops::fwht_standalone::dispatch_fwht_f32(
            &mut enc, registry, device.metal_device(), &out_buf, nh_u32, hd_u32,
        ).expect("FWHT inv sweep");
        enc.commit_and_wait().expect("sweep commit");
    }

    let tq_out: Vec<f32> = out_buf.as_slice::<f32>().expect("read tq out").to_vec();

    // NRMSE: tq_out vs dense_out (independent pre-quant F32 oracle, #7 compliant).
    nrmse_f32(&tq_out, &dense_out)
}

/// The iter-6 production-faithful controlled sweep (additive on iter-5 carcass 75116ad).
///
/// PRODUCTION CONTRACT CITATIONS:
///   forward_mlx.rs:1617 — dense scale=1.0
///   forward_mlx.rs:1664 — TQ scale=1.0 (ADR-005:1181: Gemma 4 intentional scale=1.0 on per-head RMS-normed Q/K)
///   config.rs:100        — rms_norm_eps=1e-6
///   config.rs:101        — rope_theta_sliding=10000
///   forward_mlx.rs:1665  — mask_type=2 for sliding
///   forward_mlx.rs:1666  — sliding_window from config=1024
///
/// MISTAKES CATALOG CITATIONS (must appear verbatim per AC-9 / R-10):
///   #3:  Verdict gates too loose — tighten to physics-justified narrow bands.
///   #8:  Ring-chronology tests need kvl_logical < sliding_window to manifest.
///   #9:  Narrative overclaim vs code-generated evidence — emit statuses from binary.
///   #11: Pre-registered asserts bands — never widen after measurement.
///   #12: Regression-gate statuses MUST be binary-emitted, not narrative-injected.
///   #13: Non-controlled sweeps confound the claim — fix seed, vary one param only.
///   #14: Subprocess gates must run against the worktree, not a hardcoded other checkout.
///   #15: Copied-intersection-as-determinism-tautology — both sweeps must independently measure.
///   #16: Ring-wrap A/B without independent ring_start control is measuring RNG noise.
///   #17: Parallel artifact sources-of-truth violate single-source evidence discipline.
///
/// META-CLASS: report-vs-measurement drift — every field in audit.json must correspond to
///   a real function evaluation at measurement time; no pre-computed, copied, or constructed values.
fn run_multistep_production_faithful(
    out_dir: &PathBuf,
    device: &MlxDevice,
    registry: &mut KernelRegistry,
) {
    eprintln!("[pf] iter-5 production-faithful controlled sweep starting");
    eprintln!("[pf] band: [{}, {}] — pre-registered, no post-measurement widening (#11)", NRMSE_BAND_LOWER, NRMSE_BAND_UPPER);

    // STEP 1: Subprocess regression gates BEFORE any RNG or measurement (catalog #12).
    eprintln!("[pf] running prerequisite regression gates...");
    let regression_gates = run_regression_gates();
    eprintln!("[pf] all regression gates passed");

    // STEP 2: Create output directory.
    fs::create_dir_all(out_dir).unwrap_or_else(|e| {
        panic!("failed to create output dir {:?}: {}", out_dir, e);
    });

    // STEP 3: Single RNG instance — ONE u64 literal, ONE Xoshiro256StarStar::seed_from_u64 call.
    // Catalog #13: no XOR derivation, no per-point reseeding.
    let mut rng = Xoshiro256StarStar::seed_from_u64(0x00C2_5EED_u64);
    let mut rng_counter: u64 = 0;

    // Production shape (config.rs:95-98,103).
    let kvc: usize = 1024;

    // STEP 4: Sweep A — fix abs_pos=500, vary kvl ∈ {128, 256, 500, 512, 768, 1024}.
    // Purpose: isolate the LENGTH effect. Phase is held constant.
    // H2 (catalog #15): kvl=500 is NOW included so (abs_pos=500, kvl=500) is measured
    // INDEPENDENTLY in sweep_A with its own RNG state, not copied from sweep_B.
    let sweep_a_kvls:     &[usize] = &[128, 256, 500, 512, 768, 1024]; // 6 elements, kvl=500 added
    let sweep_a_abs_pos:   usize   = 500;
    let sweep_a_sw:        u32     = 1024;

    let mut sweep_a: Vec<SweepRow> = Vec::new();
    // Track intersection point value from sweep_A's OWN measurement (catalog #15).
    let mut sweep_a_nrmse_at_500: Option<f32> = None;

    for &kvl in sweep_a_kvls {
        let before_count = rng_counter;
        let nrmse = run_sweep_point(
            &mut rng, &mut rng_counter,
            sweep_a_abs_pos, kvl, kvc, sweep_a_sw,
            None, // no ring_start override for sweep legs
            device, registry,
        );
        let band_ok = nrmse >= NRMSE_BAND_LOWER && nrmse <= NRMSE_BAND_UPPER;

        eprintln!("[sweep_A] abs_pos={} kvl={} sw={} nrmse={:.7} band_ok={}", sweep_a_abs_pos, kvl, sweep_a_sw, nrmse, band_ok);

        if kvl == 500 {
            // H2: Record the independently-measured sweep_A value at the intersection point.
            sweep_a_nrmse_at_500 = Some(nrmse);
        }

        if !band_ok {
            // Catalog #11: log BAND_PRE_FALSIFIED message but CONTINUE collecting to emit full audit.
            // Do NOT widen band. Do NOT edit NRMSE_BAND_UPPER. Exit code 2 at the end.
            eprintln!(
                "BAND_PRE_FALSIFIED: sweep_A/kvl={} nrmse={:.7} outside pre-registered band [{}, {}]; iter-6 verdict REJECT; no remeasurement; no band edit",
                kvl, nrmse, NRMSE_BAND_LOWER, NRMSE_BAND_UPPER
            );
        }

        sweep_a.push(SweepRow {
            abs_pos: sweep_a_abs_pos,
            kvl_logical: kvl,
            sliding_window: sweep_a_sw,
            nrmse,
            band_ok,
            rng_u64s_consumed_before: before_count,
        });
    }

    // STEP 5: Sweep B — fix kvl=500, vary abs_pos ∈ {50, 100, 200, 500, 1000}.
    // Purpose: isolate the PHASE effect. Length is held constant.
    let sweep_b_abs_poses: &[usize] = &[50, 100, 200, 500, 1000];
    let sweep_b_kvl:        usize   = 500;
    let sweep_b_sw:         u32     = 1024;

    let mut sweep_b: Vec<SweepRow> = Vec::new();
    let mut sweep_b_nrmse_at_500: Option<f32> = None;

    for &abs_pos in sweep_b_abs_poses {
        // Sweep B uses literal kvl=500 regardless of abs_pos. This is a synthetic test
        // isolating RoPE phase: all 500 K/V entries are attended even when abs_pos < 500.
        // For abs_pos < kvl, token_abs_pos for early K entries is clamped to 0 in run_sweep_point.
        // AC-4: all sweep_B rows must have kvl_logical=500.
        let effective_kvl = sweep_b_kvl; // literal 500, no clamping
        let before_count = rng_counter;
        let nrmse = run_sweep_point(
            &mut rng, &mut rng_counter,
            abs_pos, effective_kvl, kvc, sweep_b_sw,
            None, // no ring_start override for sweep legs
            device, registry,
        );
        let band_ok = nrmse >= NRMSE_BAND_LOWER && nrmse <= NRMSE_BAND_UPPER;

        eprintln!("[sweep_B] abs_pos={} kvl={} sw={} nrmse={:.7} band_ok={}", abs_pos, effective_kvl, sweep_b_sw, nrmse, band_ok);

        if abs_pos == 500 {
            // H2: Record sweep_B's independently-measured value at the intersection point.
            sweep_b_nrmse_at_500 = Some(nrmse);
        }

        if !band_ok {
            // Catalog #11: log but continue collecting. Exit code 2 emitted at end.
            eprintln!(
                "BAND_PRE_FALSIFIED: sweep_B/abs_pos={} nrmse={:.7} outside pre-registered band [{}, {}]; iter-6 verdict REJECT; no remeasurement; no band edit",
                abs_pos, nrmse, NRMSE_BAND_LOWER, NRMSE_BAND_UPPER
            );
        }

        sweep_b.push(SweepRow {
            abs_pos,
            kvl_logical: effective_kvl,
            sliding_window: sweep_b_sw,
            nrmse,
            band_ok,
            rng_u64s_consumed_before: before_count,
        });
    }

    // STEP 6: Intersection determinism check (AC-5, H2, catalog #15).
    // The intersection point is (abs_pos=500, kvl=500).
    // H2 fix: sweep_A NOW includes kvl=500, so BOTH sweeps independently measure this point.
    // sweep_A measured it at its own RNG state; sweep_B measured it at a later RNG state.
    // The two values are EXPECTED to differ (different RNG advance = different random data).
    // We binary-compute equality at 7 decimal places to confirm this is NOT a tautological copy.
    // A mismatch is the HONEST outcome; a match would itself be suspicious (coincidental f32 equality).
    let a_val: f32 = sweep_a_nrmse_at_500
        .expect("sweep_A kvl=500 row must exist (H2: added to sweep_a_kvls)");
    let b_val: f32 = sweep_b_nrmse_at_500
        .expect("sweep_B abs_pos=500 row must exist");

    // Binary-compute match: round both to 7 decimal places and compare as integers.
    // (catalog #15: must be computed, NEVER hardcoded true)
    let match_to_7_decimal_places: bool =
        ((a_val as f64 * 1e7).round() as i64) == ((b_val as f64 * 1e7).round() as i64);
    let absdiff: f64 = ((a_val as f64) - (b_val as f64)).abs();
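    // Worked example of the 7-dp comparison (values illustrative): 0.0312971 and
    // 0.0312970 scale to 312971 and 312970 and therefore do NOT match; two
    // values within ~5e-8 of each other round to the same integer and would match.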

    let intersection_band_ok = a_val >= NRMSE_BAND_LOWER && a_val <= NRMSE_BAND_UPPER;

    eprintln!(
        "[intersection] sweep_A/kvl=500 nrmse_A={:.7} sweep_B/abs_pos=500 nrmse_B={:.7} absdiff={:.2e} match_7dp={} (H2: two independent RNG states; mismatch is expected)",
        a_val, b_val, absdiff, match_to_7_decimal_places
    );

    // Band check for intersection (sweep_A measurement).
    if !intersection_band_ok {
        eprintln!(
            "BAND_PRE_FALSIFIED: intersection abs_pos=500 kvl=500 sweep_A_nrmse={:.7} outside band [{}, {}]",
            a_val, NRMSE_BAND_LOWER, NRMSE_BAND_UPPER
        );
    }

    // STEP 7: Ring-wrap legs.
    // M1 (catalog #8): kvl_logical MUST be < sliding_window for mask to differentiate slot
    //   chronology. iter-5 had kvl=1024 >= sliding_window=512 — degenerate (both ring_start
    //   formulas expose the full slot set). Fixed: kvl=256 < sliding_window=512.
    //   This is synthetic (production at abs_pos=1024 would have kvl=1024) but required by
    //   catalog #8 for chronology differences to physically manifest in the mask.
    // H3 (catalog #16): draw K/V/Q ONCE per abs_pos using an RNG clone/restore mechanism,
    //   then dispatch kernel TWICE with override_ring_start=Some(ring_start_a) and
    //   override_ring_start=Some(ring_start_b) on BYTE-IDENTICAL data.
    //   ab_delta = |nrmse_a - nrmse_b| measures kernel sensitivity to ring_start, not RNG noise.
    let ring_wrap_points = [(1024usize, 512u32), (1050usize, 512u32)];
    let ring_wrap_kvl: usize = 256; // strictly < sliding_window=512 (catalog #8 / M1)
    let mut ring_wrap: Vec<serde_json::Value> = Vec::new();

    for (abs_pos, sw) in ring_wrap_points {
        let kvl = ring_wrap_kvl; // 256 < 512 (M1 fix: catalog #8)
        let ring_start_a: u32 = if abs_pos + 1 >= kvc { ((abs_pos + 1) % kvc) as u32 } else { 0 };
        let ring_start_b: u32 = if abs_pos + 1 > kvc  { (abs_pos % kvc) as u32 } else { 0 };
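        // Worked example at kvc=1024: abs_pos=1024 gives A = 1025 % 1024 = 1 and
        // B = 1024 % 1024 = 0; abs_pos=1050 gives A = 1051 % 1024 = 27 and
        // B = 1050 % 1024 = 26. Once abs_pos + 1 exceeds kvc, the two formulas
        // always disagree by exactly one slot, which is the chronology
        // difference the A/B legs are designed to expose.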
        let before_count = rng_counter;

        // H3: Save RNG state before drawing data for this abs_pos.
        // Xoshiro256StarStar implements Clone, so we can snapshot and restore
        // (this snapshot/restore mechanism is new in iter-6).
        let rng_snapshot = rng.clone();
        let counter_snapshot = rng_counter;
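        // The snapshot/restore idiom in miniature (works for any Clone RNG from
        // rand_core; `next_u64` shown for illustration):
        //
        //     let saved = rng.clone();
        //     let a = rng.next_u64(); // advances the stream
        //     rng = saved;            // rewind to the snapshot
        //     let b = rng.next_u64(); // b == a: the same stream replays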

        // First invocation: ring_start_a (production formula). Advances rng.
        let nrmse_a = run_sweep_point(
            &mut rng, &mut rng_counter,
            abs_pos, kvl, kvc, sw,
            Some(ring_start_a), // H3: override_ring_start flows to kernel dispatch + CPU oracle
            device, registry,
        );

        // H3: Restore RNG to pre-draw state so nrmse_b uses BYTE-IDENTICAL K/V/Q data.
        // The rng state is reset to exactly what it was before the A draw, then B draws
        // the same sequence — only ring_start differs in the kernel dispatch and oracle layout.
        rng = rng_snapshot;
        rng_counter = counter_snapshot;

        // Second invocation: ring_start_b (alternative formula). Same data as A.
        let nrmse_b = run_sweep_point(
            &mut rng, &mut rng_counter,
            abs_pos, kvl, kvc, sw,
            Some(ring_start_b), // H3: different ring_start, same K/V/Q data
            device, registry,
        );

        let ab_delta = (nrmse_a - nrmse_b).abs();

        eprintln!(
            "[ring_wrap] abs_pos={} kvl={} sw={} ring_start_A={} ring_start_B={} nrmse_a={:.7} nrmse_b={:.7} ab_delta={:.2e} (H3: byte-identical data, different ring_start)",
            abs_pos, kvl, sw, ring_start_a, ring_start_b, nrmse_a, nrmse_b, ab_delta
        );

        let band_ok_a = nrmse_a >= NRMSE_BAND_LOWER && nrmse_a <= NRMSE_BAND_UPPER;
        let band_ok_b = nrmse_b >= NRMSE_BAND_LOWER && nrmse_b <= NRMSE_BAND_UPPER;

        if !band_ok_a {
            eprintln!(
                "BAND_PRE_FALSIFIED: ring_wrap abs_pos={} nrmse_a={:.7} outside band [{}, {}]",
                abs_pos, nrmse_a, NRMSE_BAND_LOWER, NRMSE_BAND_UPPER
            );
        }
        if !band_ok_b {
            eprintln!(
                "BAND_PRE_FALSIFIED: ring_wrap abs_pos={} nrmse_b={:.7} outside band [{}, {}]",
                abs_pos, nrmse_b, NRMSE_BAND_LOWER, NRMSE_BAND_UPPER
            );
        }

        ring_wrap.push(serde_json::json!({
            "abs_pos": abs_pos,
            "kvl_logical": kvl,          // 256 < sliding_window=512 (M1 / catalog #8)
            "sliding_window": sw,         // 512
            "ring_start_a": ring_start_a,
            "ring_start_b": ring_start_b,
            "ring_start_A_passed_to_kernel": ring_start_a, // R-13: emitted for AC-11 verification
            "ring_start_B_passed_to_kernel": ring_start_b, // R-13: emitted for AC-11 verification
            "ring_start_A_nrmse": nrmse_a,
            "ring_start_B_nrmse": nrmse_b,
            "ab_delta": ab_delta,
            "band_ok": band_ok_a && band_ok_b,
            "rng_u64s_consumed_before": before_count,
            "h3_data_reuse": "byte-identical K/V/Q via RNG clone/restore; only ring_start differs",
        }));
    }

    // STEP 8: Verdict classification — deterministic from measured matrix (catalog #9).
    // Exactly one of four declared strings.
    let all_band_ok = sweep_a.iter().all(|r| r.band_ok)
        && sweep_b.iter().all(|r| r.band_ok)
        && ring_wrap.iter().all(|v| v["band_ok"].as_bool().unwrap_or(false));

    let verdict: &str = if !all_band_ok {
        // One or more sweep points are out of band: verdict is BAND_PRE_FALSIFIED.
        // Binary will exit with code 2 after writing audit.json.
        "BAND_PRE_FALSIFIED"
    } else {
        // Spearman rho for sweep_A (length effect): monotone-rising → rho > 0.7
        // H2: kvl=500 is NOW a real sweep_A row; include all 6 rows in the Spearman analysis.
        let sweep_a_nrmse: Vec<f32> = sweep_a.iter()
            .map(|r| r.nrmse).collect();
        let n_a = sweep_a_nrmse.len() as f32;
        let spearman_rho_a = if n_a >= 2.0 {
            // Rank correlation: rank each element, compute rho.
            let mut indexed: Vec<(usize, f32)> = sweep_a_nrmse.iter().copied().enumerate().collect();
            indexed.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
            let mut ranks = vec![0.0f32; indexed.len()];
            for (rank, (idx, _)) in indexed.iter().enumerate() {
                ranks[*idx] = rank as f32 + 1.0;
            }
            // Spearman = 1 - 6*sum_d2 / (n*(n^2-1))
            let natural_ranks: Vec<f32> = (1..=indexed.len()).map(|i| i as f32).collect();
            let sum_d2: f32 = ranks.iter().zip(natural_ranks.iter())
                .map(|(r, nr)| (r - nr).powi(2)).sum();
            1.0 - 6.0 * sum_d2 / (n_a * (n_a * n_a - 1.0))
        } else { 0.0 };
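        // Worked example: nrmse ranks [1, 2, 3, 5, 4, 6] against natural ranks
        // [1, 2, 3, 4, 5, 6] give sum_d2 = 2, so
        // rho = 1 - 6*2 / (6*(36 - 1)) = 1 - 12/210 ≈ 0.943 (comfortably > 0.7).
        // Ties are not rank-averaged here; with measured f32 NRMSE values exact
        // ties are effectively impossible, so the plain ranking suffices.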

        // Sweep B range for phase effect.
        let sweep_b_nrmse_core: Vec<f32> = sweep_b.iter().map(|r| r.nrmse).collect();
        let sweep_b_max = sweep_b_nrmse_core.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        let sweep_b_min = sweep_b_nrmse_core.iter().cloned().fold(f32::INFINITY, f32::min);
        let sweep_b_range = sweep_b_max - sweep_b_min;

        // H2: all 6 sweep_A rows included (kvl=500 is now a real row, not a phantom).
        // sweep_a_nrmse, collected above for the Spearman analysis, already holds every row.
        let sweep_a_max = sweep_a_nrmse.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
        let sweep_a_min = sweep_a_nrmse.iter().cloned().fold(f32::INFINITY, f32::min);
        let sweep_a_range = sweep_a_max - sweep_a_min;

        eprintln!("[verdict] spearman_rho_A={:.4} sweep_B_range={:.4} sweep_A_range={:.4}", spearman_rho_a, sweep_b_range, sweep_a_range);

        if spearman_rho_a > 0.7 && sweep_b_range < 0.05 {
            "LENGTH_EFFECT_CONFIRMED"
        } else if sweep_b_range > 0.10 && sweep_a_range < 0.05 {
            "PHASE_EFFECT_CONFIRMED"
        } else {
            "FLOOR_IS_PHYSICS_CONSISTENT"
        }
    };
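    // Decision table, restated from the branches above (all thresholds pre-registered):
    //   any point out of band (checked first)            -> BAND_PRE_FALSIFIED (exit 2)
    //   rho_A > 0.7   AND sweep_B_range < 0.05           -> LENGTH_EFFECT_CONFIRMED
    //   sweep_B_range > 0.10 AND sweep_A_range < 0.05    -> PHASE_EFFECT_CONFIRMED
    //   everything else (incl. the 0.05..0.10 gray zone) -> FLOOR_IS_PHYSICS_CONSISTENT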

    eprintln!("[pf] verdict = {}", verdict);

    // STEP 9: Regime documentation.
    // V-norm policy: forward_mlx.rs:1167-1205 direct read confirms V gets
    // dispatch_rms_norm_unit_perhead (unit weights, no learned weight tensor for V).
    // Q/K: forward_mlx.rs:1144-1165 uses learned q_norm_weight/k_norm_weight.
    // Iter-6 uses unit weights for Q/K (GGUF not present on test machine).
    let regime = serde_json::json!({
        "rmsnorm_weights": "unit_fallback",
        "rmsnorm_weights_reason": "Gemma-4-27B GGUF not present on test machine; GGUF extraction path non-trivial. Q/K use unit RMSNorm weights (learned q_norm_weight/k_norm_weight available in production at forward_mlx.rs:1148,1159). Delta vs learned: unit weights normalize Q/K to unit sphere before RoPE; learned weights add a per-element multiplicative scale. For N(0,1) synthetic inputs the difference is O(weight_scale - 1), typically <5% for near-unit weights in trained models. Regime-faithful in shape/scale/eps/formula.",
        "v_norm_policy": "unit_weights_per_production",
        "v_norm_evidence": "forward_mlx.rs:1178-1205 direct read: dispatch_rms_norm_unit_perhead on V (no learned v_norm_weight tensor; Gemma 4 has no v_norm_weight per spec grep confirming only q_norm_weight and k_norm_weight at forward_mlx.rs:289-290,714-719).",
        "scale": 1.0,
        "scale_evidence": "forward_mlx.rs:1617 (dense), forward_mlx.rs:1664 (TQ), ADR-005:1181",
        "rms_norm_eps": 1e-6,
        "rms_norm_eps_evidence": "config.rs:100",
        "rope_theta": 10000.0,
        "rope_theta_evidence": "config.rs:101 (rope_theta_sliding=10000)",
        "rope_convention": "NeoX half-split, applied to Q at abs_pos and K at chronological position",
        "shapes": {"num_heads": 16, "num_kv_heads": 8, "head_dim": 256, "kv_capacity": 1024},
        "dense_floor_reference": "POST-RMSNorm POST-RoPE F32 Q/K/V (same tensors fed to hadamard_quantize_kv) — catalog #7 upstream-independent reference",
        "mask_type": 2,
        "mask_type_evidence": "forward_mlx.rs:1665 (mask_type=2 for sliding layers)",
        "softcap": 0.0,
        // M1 / catalog #8: ring_wrap uses kvl=256 < sliding_window=512 so chronology manifests.
        "ring_wrap_kvl_reason": "catalog #8: ring-chronology tests need kvl_logical < sliding_window to manifest. At kvl_logical >= sliding_window both ring_start formulas expose the full slot set; chronology differences physically cannot show. ring_wrap uses kvl=256 < sliding_window=512 (synthetic; production at abs_pos=1024 would have kvl=1024, but the A/B test is measuring kernel dispatch sensitivity to ring_start, which requires mask differentiation of slot chronology).",
    });

    // STEP 10: Write audit.json — SOLE reporting artifact (catalog #17 / M2).
    // No "pending", "TBD", or "pending_manual_run" strings anywhere (catalog #12, AC-3).
    // M2: sweep_A and sweep_B are embedded as arrays in audit.json; NO sidecar CSVs written.
    let audit = serde_json::json!({
        "session": "cfa-20260422-C4t3i6-evidence-package-integrity",
        "iter": 6,
        "ran_at": SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .map(|d| d.as_secs())
            .unwrap_or(0),
        "verdict": verdict,
        "band": {
            "lower": NRMSE_BAND_LOWER,
            "upper": NRMSE_BAND_UPPER,
            "registration": "pre-registered as const f32 at module scope before any measurement; catalog #11",
        },
        "regression_gates": regression_gates,
        "regime": regime,
        "sweep_A": sweep_a,
        "sweep_B": sweep_b,
        "ring_wrap": ring_wrap,
        "rng": {
            "seed_literal": "0x00C2_5EED_u64",
            "algorithm": "Xoshiro256StarStar",
            "single_instance": true,
            "total_u64s_consumed": rng_counter,
        },
        "intersection_check": {
            "abs_pos": 500,
            "kvl_logical": 500,
            // H2 (catalog #15): sweep_A_intersection_nrmse and sweep_B_intersection_nrmse are
            // INDEPENDENTLY MEASURED from distinct RNG states (sweep_A draws first; sweep_B draws
            // after sweep_A has advanced the RNG). A mismatch is the EXPECTED honest outcome.
            "sweep_A_intersection_nrmse": a_val,   // from sweep_A's kvl=500 row (independently measured)
            "sweep_B_intersection_nrmse": b_val,   // from sweep_B's abs_pos=500 row (different RNG state)
            // Binary-computed equality — NEVER hardcoded (catalog #15 / AC-8).
            "match_to_7_decimal_places": match_to_7_decimal_places,
            "absdiff": absdiff,     // numeric distance for AC-7 verification
            "band_ok": intersection_band_ok,
            "note": "H2 / catalog #15: (abs_pos=500, kvl=500) is measured TWICE with distinct RNG states. sweep_A includes kvl=500 as of iter-6 (6 rows total). sweep_B has abs_pos=500 as its 4th row. The two values come from different RNG advances so a numerical mismatch is expected and is the HONEST outcome. match_to_7_decimal_places is COMPUTED, not hardcoded.",
        },
        "mistakes_catalog_citations": [
            "#3: Verdict gates too loose — tighten to physics-justified narrow bands",
            "#8: Ring-chronology tests need kvl_logical < sliding_window to manifest",
            "#9: Narrative overclaim vs code-generated evidence — emit statuses from binary",
            "#11: Pre-registered asserts bands — never widen after measurement (iter-4 HIGH-1 defect)",
            "#12: Regression-gate statuses MUST be binary-emitted, not narrative-injected (iter-4 HIGH-2 defect)",
            "#13: Non-controlled sweeps confound the claim — fix seed, vary one param only (iter-4 MED defect)",
            "#14: Subprocess gates must run against the worktree, not a hardcoded other checkout (iter-5 HIGH-1 defect)",
            "#15: Copied-intersection-as-determinism-tautology — both sweeps must independently measure (iter-5 HIGH-2 defect)",
            "#16: Ring-wrap A/B without independent ring_start control is measuring RNG noise (iter-5 HIGH-3 defect)",
            "#17: Parallel artifact sources-of-truth violate single-source evidence discipline (iter-5 MED-2 defect)",
            "meta-class: report-vs-measurement drift — every field in audit.json must correspond to a real function evaluation at measurement time; no pre-computed, copied, or constructed values",
        ],
        // M2 (catalog #17): CSV-equivalent documentation for downstream jq post-processing.
        // The binary writes ONLY audit.json. No sidecar CSVs. R-15 / AC-13 / AC-14.
        "csv_equivalent": {
            "sweep_a_columns": ["abs_pos", "kvl_logical", "sliding_window", "nrmse", "band_ok", "rng_u64s_consumed_before"],
            "sweep_b_columns": ["abs_pos", "kvl_logical", "sliding_window", "nrmse", "band_ok", "rng_u64s_consumed_before"],
            "ring_wrap_columns": ["abs_pos", "kvl_logical", "sliding_window", "ring_start_a", "ring_start_b", "ring_start_A_nrmse", "ring_start_B_nrmse", "ab_delta", "band_ok", "rng_u64s_consumed_before"],
            "note": "sweep_A and sweep_B arrays in this audit.json are the canonical source. jq one-liner: jq -r '.sweep_A[] | [.abs_pos,.kvl_logical,.sliding_window,.nrmse,.band_ok,.rng_u64s_consumed_before] | @csv' audit.json",
        },
    });

    let audit_json = serde_json::to_string_pretty(&audit).expect("serialize audit");

    // M2 (catalog #17): write ONLY audit.json — the SOLE reporting artifact.
    // Sidecar sweep_a.csv / sweep_b.csv REMOVED in iter-6. Use csv_equivalent.note for jq.
    let audit_path = out_dir.join("audit.json");
    fs::write(&audit_path, audit_json.as_bytes())
        .unwrap_or_else(|e| panic!("failed to write audit.json: {}", e));
    eprintln!("[pf] audit.json written to {:?} (sole artifact; no sidecar CSVs — catalog #17 / M2)", audit_path);

    // Print summary to stdout.
    println!("=== iter-5 production-faithful verdict: {} ===", verdict);
    println!("sweep_A (abs_pos=500, vary kvl):");
    for r in &sweep_a {
        println!("  kvl={} nrmse={:.7} band_ok={}", r.kvl_logical, r.nrmse, r.band_ok);
    }
    println!("sweep_B (kvl=~500, vary abs_pos):");
    for r in &sweep_b {
        println!("  abs_pos={} kvl={} nrmse={:.7} band_ok={}", r.abs_pos, r.kvl_logical, r.nrmse, r.band_ok);
    }
    println!("ring_wrap:");
    for rw in &ring_wrap {
        println!("  abs_pos={} sw={} nrmse_a={:.7} nrmse_b={:.7} ab_delta={:.2e}",
            rw["abs_pos"], rw["sliding_window"],
            rw["ring_start_A_nrmse"].as_f64().unwrap_or(0.0),
            rw["ring_start_B_nrmse"].as_f64().unwrap_or(0.0),
            rw["ab_delta"].as_f64().unwrap_or(0.0));
    }

    // Exit with appropriate code.
    let exit_code = if verdict == "BAND_PRE_FALSIFIED" { 2i32 } else { 0i32 };
    if exit_code != 0 {
        eprintln!("[pf] exiting with code {} (BAND_PRE_FALSIFIED)", exit_code);
        std::process::exit(exit_code);
    }
    eprintln!("[pf] complete — exit 0");
}