dsfb-gpu-debug-demo 0.1.1

//! `dsfb-gpu-debug bench-gpu-scale` — R.7 money-table headline benchmark
//! + R.8 bottleneck profiler.
//!
//! Two distinct entry modes share this subcommand:
//!
//! * **Default mode (R.7)** drives the panel-locked scale sweep that
//!   produces the headline `reports/money_table.txt`. Each row pairs
//!   one GPU dispatch path (Layer A device evidence fabric / Layer B
//!   throughput verdict summary / Layer C full audit court) against
//!   the same fixture's CPU Layer B baseline so the speedup column
//!   is reproducible.
//! * **`--detail-stage` mode (R.8)** skips the money-table sweep and
//!   instead runs the per-stage bottleneck profiler at three K=1
//!   scale points (canonical 16×128, 64×512 mid-scale, 256×4096
//!   full-scale). Each scale point gets its own
//!   `reports/r8_bottleneck_<grid>_K1.txt` with a table of `(stage,
//!   median µs, % of wall)` and the top 3 stages by absolute time.
//!   Honest scope note: K>1 batched per-stage timings need a
//!   separate `_timed` batched FFI (deferred); the K=1 percent
//!   breakdown is the proxy R.8 uses for the K=64 row in R.7 because
//!   the same kernels run with `blockIdx.z = K` at batched scale.
//!
//! R.7 rows (panel-locked):
//!
//! * Canonical 16×128, K=32: Layer A, Layer B, Layer C CPU, Layer C GPU
//! * Scale-large 256×4096, K ∈ {1, 16, 64, 128}: CPU Layer B, GPU
//!   Layer A, GPU Layer B; Layer C is recorded as a deliberate
//!   "not run" row at this scale. K=128 is opt-in via `--big-k` and
//!   only produces numbers if the `BatchedGpuWorkspace` allocation
//!   succeeds; a GPU row that fails is marked "not run: alloc refused
//!   or kernel error" and the rest of the sweep continues.
//!
//! Session-level fields recorded once at the top of the R.7 report:
//!
//! * `graph_status` — outcome of an opt-in
//!   `build_gpu_throughput_graph_or_demote` call at canonical scale.
//!   Either `captured` or `demoted` with a short reason. The graph
//!   itself does not drive the bench rows (the rows go through the
//!   pre-existing layer dispatch paths); the status is recorded so the
//!   case file's launch-plan provenance can be audited later.
//! * `graph_plan_hash` — the captured topology's canonical hash, when
//!   capture succeeds. Reported as 64 hex chars; absent on demoted.
//!
//! Output:
//!
//! 1. Console: R.7 prints a `=== R.7 Money Table ===` block per row
//!    plus a final summary table; R.8 prints a
//!    `=== R.8 Bottleneck Profile — <label> ===` block per scale point.
//! 2. Files: R.7 writes `reports/money_table.txt`; R.8 writes
//!    `reports/r8_bottleneck_<grid>_K<K>.txt` per scale point.
//!
//! Honest reporting: every number printed is measured. Rows that fail
//! to run print `n/a` in the speedup column and a short reason in the
//! same row. The R doctrine forbids fabricated numbers; this file
//! enforces that by only writing rows the bench actually completed.

#![allow(clippy::expect_used)]

use std::process::ExitCode;

use dsfb_gpu_debug_core::bank::bank_hash;
use dsfb_gpu_debug_core::contract::Contract;
use dsfb_gpu_debug_core::event::TraceEvent;
use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
use dsfb_gpu_debug_core::motif::registry_hash;

#[cfg(feature = "cuda")]
use dsfb_gpu_debug_cuda::{build_gpu_throughput_graph_or_demote, GpuWorkspace, GraphCaptureStatus};

#[cfg(feature = "cuda")]
use super::bench::{run_layer_a, run_layer_b, run_layer_c_gpu};
use super::bench::{run_layer_b_cpu_always, run_layer_c_cpu};
use super::{parse_flags, usage_error};

/// Iteration / warmup counts for the headline rows. Honest defaults
/// picked so the bench takes a few minutes total on the target GPU.
/// Bigger fixtures get smaller iter counts because the per-iteration
/// wall time grows with scale.
///
/// Both counts are forwarded (after optional `--quick` scaling) as the
/// trailing `(warmup, iters)` arguments of the `run_layer_*` helpers.
#[derive(Debug, Clone, Copy)]
struct IterPlan {
    /// Number of warm-up iterations requested from the bench helper
    /// (presumably run before timing starts — confirm in `super::bench`).
    warmup: usize,
    /// Number of iterations requested from the bench helper for the
    /// reported samples.
    iters: usize,
}

impl IterPlan {
    /// Plan for every row of the canonical 16×128 block.
    const CANONICAL: Self = Self {
        warmup: 20,
        iters: 100,
    };
    /// Plan for the 256×4096 K=1 GPU rows; also used for the CPU
    /// Layer B baseline measured at that scale.
    const LARGE_K1: Self = Self {
        warmup: 5,
        iters: 50,
    };
    /// Plan for the 256×4096 K=16 GPU rows.
    const LARGE_K16: Self = Self {
        warmup: 3,
        iters: 20,
    };
    /// Plan for the 256×4096 K=64 GPU rows.
    const LARGE_K64: Self = Self {
        warmup: 2,
        iters: 10,
    };
    /// Plan for the opt-in (`--big-k`) 256×4096 K=128 GPU rows.
    const LARGE_K128: Self = Self {
        warmup: 2,
        iters: 5,
    };
}

/// Run R.7 with the user-supplied CLI flags. Supported flags:
///
/// * `--quick` — divides every iter count by 5 (rounded up) for a
///   smoke run; the row labels then carry `[quick]`.
/// * `--out PATH` — alternate path for the money-table report
///   (default `reports/money_table.txt`).
/// * `--skip-large` — only run the canonical row block. Useful when
///   the host doesn't have enough VRAM for 256×4096 fixtures.
#[allow(clippy::too_many_lines)]
pub fn parse_and_run(args: &[String]) -> ExitCode {
    let flags = match parse_flags(args) {
        Ok(f) => f,
        Err(message) => return usage_error(&message),
    };

    let quick = flags.get("quick").is_some_and(|v| v != "false");
    let skip_large = flags.get("skip-large").is_some_and(|v| v != "false");
    let big_k = flags.get("big-k").is_some_and(|v| v != "false");
    // R.8 — when set, the bench skips the normal money-table rows
    // and runs a single deep per-stage breakdown of GPU Layer B at
    // each of three scale points (canonical, 256x4096 K=16, K=64),
    // writing `reports/r8_bottleneck_<grid>_K<K>.txt` for each.
    // The money-table sweep is not run in `--detail-stage` mode.
    let detail_stage = flags.get("detail-stage").is_some_and(|v| v != "false");
    // R.8.5 — when set alongside `--detail-stage`, the profiler
    // runs the tree-digest dispatch path instead of the legacy
    // serial-digest path. The detail-stage breakdown then captures
    // the post-R.8.5 digest-stage cost; the report filename
    // distinguishes the two via `_tree` suffix so a pre/post
    // comparison is easy.
    let tree_digest = flags.get("tree-digest").is_some_and(|v| v != "false");
    // R.11 — when set alongside `--detail-stage --tree-digest`,
    // the profiler also runs the compact-verdict finalizer path
    // (precomputed FixtureHashes, no per-iter re-hashing of
    // events/window features). Output: a 3-way comparison
    // report (serial-digest, tree-digest, tree+compact) per
    // scale point, written to
    // `reports/r11_compact_compare_<grid>_K1.txt`.
    let compact = flags.get("compact").is_some_and(|v| v != "false");
    let out_path: std::path::PathBuf = flags.get("out").map_or_else(
        || std::path::PathBuf::from("reports/money_table.txt"),
        std::path::PathBuf::from,
    );

    let mut rows: Vec<MoneyRow> = Vec::new();
    let mut header_lines: Vec<String> = Vec::new();

    header_lines.push(String::from(
        "# R.7 Money Table — DSFB-GPU-Debug headline benchmark",
    ));
    header_lines.push(format!(
        "# generated: {}",
        chrono_like_timestamp_or_unknown()
    ));
    header_lines.push(format!(
        "# quick: {quick}    skip-large: {skip_large}    big-k: {big_k}"
    ));
    header_lines.push(String::from("#"));
    header_lines.push(String::from(
        "# Layer A: device evidence fabric (skip-bank, on-device digests).",
    ));
    header_lines.push(String::from(
        "# Layer B: throughput verdict summary (host bank stage admits compact candidates).",
    ));
    header_lines.push(String::from(
        "# Layer C: full audit court (every intermediate cell materialised host-side).",
    ));
    header_lines.push(String::from(
        "# Speedup is measured against CPU Layer B at the SAME (n_entities, n_windows) scale.",
    ));
    header_lines.push(String::from("#"));

    // ---- session-level graph capture probe (R.6c) -----------------
    let (graph_status_line, graph_hash_line) = probe_graph_capture();
    header_lines.push(graph_status_line);
    if let Some(line) = graph_hash_line {
        header_lines.push(line);
    }
    header_lines.push(String::new());

    // ---- R.8 detail-stage: skip the money-table sweep entirely ---
    // and run only the per-stage bottleneck profile at three scale
    // points. The money-table rows are not the goal in this mode.
    // Gated on `feature = "cuda"` because the underlying `_timed`
    // dispatch lives in dsfb-gpu-debug-cuda and is meaningless
    // without GPU support; non-cuda builds report and skip.
    if detail_stage {
        #[cfg(feature = "cuda")]
        {
            let stage_iters = if quick { 5 } else { 20 };
            let stage_warmup = if quick { 1 } else { 3 };
            if compact {
                // R.11 path: requires the tree-digest backend
                // because the compact dispatch consumes the
                // tree-digest GPU pipeline's outputs.
                run_r11_compact_compare(stage_warmup, stage_iters);
            } else {
                run_r8_detail_stage(stage_warmup, stage_iters, tree_digest);
            }
        }
        #[cfg(not(feature = "cuda"))]
        {
            let _ = (quick, tree_digest, compact);
            println!("--detail-stage requires --features cuda; nothing to profile");
        }
        return ExitCode::SUCCESS;
    }

    // ---- canonical 16x128 K=32 ------------------------------------
    {
        let plan = scale_iters(IterPlan::CANONICAL, quick);
        let n_entities = 16u32;
        let n_windows = 128u32;
        let k = 32u32;
        let mut contract = Contract::canonical();
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = synthesize(DEFAULT_SEED);

        let cpu_b = run_layer_b_cpu_always(&events, &contract, plan.warmup, plan.iters);
        let cpu_b_med = median(&cpu_b);
        rows.push(MoneyRow {
            label: format!(
                "canonical 16x128 K={k:>3}  CPU Layer B          {}",
                quick_tag(quick)
            ),
            n_entities,
            n_windows,
            n_catalogs: 1,
            samples_us: cpu_b.clone(),
            baseline_us: cpu_b_med,
        });

        run_gpu_row(
            &mut rows,
            &format!("canonical 16x128 K={k:>3}  GPU Layer A          "),
            n_entities,
            n_windows,
            k,
            GpuRow::LayerA,
            &events,
            &contract,
            plan,
            cpu_b_med,
            quick,
        );

        run_gpu_row(
            &mut rows,
            &format!("canonical 16x128 K={k:>3}  GPU Layer B          "),
            n_entities,
            n_windows,
            k,
            GpuRow::LayerB,
            &events,
            &contract,
            plan,
            cpu_b_med,
            quick,
        );

        // Layer C @ K=1 only — audit transcripts at K>1 aren't
        // architecturally meaningful here (the audit court emits one
        // canonical case file). Reported as K=1 with explicit note.
        let cpu_c = run_layer_c_cpu(&events, &contract, plan.warmup, plan.iters);
        rows.push(MoneyRow {
            label: format!(
                "canonical 16x128 K=  1  CPU Layer C (audit)  {}",
                quick_tag(quick)
            ),
            n_entities,
            n_windows,
            n_catalogs: 1,
            samples_us: cpu_c,
            baseline_us: cpu_b_med,
        });

        #[cfg(feature = "cuda")]
        {
            let gpu_c = run_layer_c_gpu(&events, &contract, plan.warmup, plan.iters);
            rows.push(MoneyRow {
                label: format!(
                    "canonical 16x128 K=  1  GPU Layer C (audit)  {}",
                    quick_tag(quick)
                ),
                n_entities,
                n_windows,
                n_catalogs: 1,
                samples_us: gpu_c,
                baseline_us: cpu_b_med,
            });
        }
    }

    // ---- scale-large 256x4096 K ∈ {1, 16, 64, 128} ----------------
    if !skip_large {
        let n_entities = 256u32;
        let n_windows = 4096u32;
        let mut contract = Contract::scaled(n_entities, n_windows);
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4);

        // CPU Layer B at this scale is the only CPU comparator we
        // measure; per-catalog cost stays constant regardless of K
        // (each catalog runs independently on CPU). Measure once.
        let plan_cpu = scale_iters(IterPlan::LARGE_K1, quick);
        let cpu_b = run_layer_b_cpu_always(&events, &contract, plan_cpu.warmup, plan_cpu.iters);
        let cpu_b_med = median(&cpu_b);
        rows.push(MoneyRow {
            label: format!(
                "scaled  256x4096 K=  1  CPU Layer B          {}",
                quick_tag(quick)
            ),
            n_entities,
            n_windows,
            n_catalogs: 1,
            samples_us: cpu_b.clone(),
            baseline_us: cpu_b_med,
        });

        // K=128 is opt-in via `--big-k`: it needs ~15 GB VRAM and
        // ~30s of fixture-synthesis CPU work per row, which is
        // wasteful by default. Hosts that can afford it pass
        // `--big-k` to include it.
        let large_sweep: &[(u32, IterPlan)] = if big_k {
            &[
                (1u32, IterPlan::LARGE_K1),
                (16, IterPlan::LARGE_K16),
                (64, IterPlan::LARGE_K64),
                (128, IterPlan::LARGE_K128),
            ]
        } else {
            &[
                (1u32, IterPlan::LARGE_K1),
                (16, IterPlan::LARGE_K16),
                (64, IterPlan::LARGE_K64),
            ]
        };
        for &(k, plan_const) in large_sweep {
            let plan = scale_iters(plan_const, quick);

            run_gpu_row(
                &mut rows,
                &format!("scaled  256x4096 K={k:>3}  GPU Layer A          "),
                n_entities,
                n_windows,
                k,
                GpuRow::LayerA,
                &events,
                &contract,
                plan,
                cpu_b_med,
                quick,
            );

            run_gpu_row(
                &mut rows,
                &format!("scaled  256x4096 K={k:>3}  GPU Layer B          "),
                n_entities,
                n_windows,
                k,
                GpuRow::LayerB,
                &events,
                &contract,
                plan,
                cpu_b_med,
                quick,
            );
        }

        // Layer C at scale-large is intentionally not run by default:
        // materialising every intermediate cell at 256×4096 is the
        // "court transcript cost" the paper documents but does not
        // sell as the headline. Record a not-run row so the reader
        // sees the deliberate omission rather than a missing slot.
        rows.push(MoneyRow::not_run(
            "scaled  256x4096 K=  1  Layer C (audit)      [not run: transcript materialisation cost]",
            n_entities,
            n_windows,
            1,
        ));
    }

    // ---- emit -----------------------------------------------------
    let report = render_report(&header_lines, &rows);
    print!("{report}");

    if let Some(parent) = out_path.parent() {
        let _ = std::fs::create_dir_all(parent);
    }
    match std::fs::write(&out_path, &report) {
        Ok(()) => {
            println!("wrote money table -> {}", out_path.display());
            ExitCode::SUCCESS
        }
        Err(e) => {
            eprintln!("warning: could not write {}: {e}", out_path.display());
            // Non-fatal: the console output is the primary deliverable.
            ExitCode::SUCCESS
        }
    }
}

/// Selects which GPU dispatch path `run_gpu_row` measures.
#[derive(Clone, Copy)]
enum GpuRow {
    /// Measured through the `run_layer_a` bench helper.
    LayerA,
    /// Measured through the `run_layer_b` bench helper.
    LayerB,
}

/// Measure one GPU row and append the outcome to `rows`.
///
/// * `label_prefix` — row label without the `[quick]` tag; the tag is
///   appended here according to `quick`.
/// * `n_entities`, `n_windows`, `k` — scale of the row; `k` becomes the
///   row's catalog count.
/// * `which` — GPU dispatch path to run.
/// * `events`, `contract` — fixture and pinned contract to dispatch.
/// * `plan` — warmup / iteration counts forwarded to the bench helper.
/// * `baseline_us` — CPU Layer B median the speedup column is computed
///   against.
///
/// In CUDA builds a panic inside the bench helper is caught and turned
/// into a "not run" row so the sweep continues. In non-CUDA builds the
/// row is always recorded as skipped.
#[allow(clippy::too_many_arguments)]
fn run_gpu_row(
    rows: &mut Vec<MoneyRow>,
    label_prefix: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    which: GpuRow,
    events: &[TraceEvent],
    contract: &Contract,
    plan: IterPlan,
    baseline_us: u128,
    quick: bool,
) {
    #[cfg(feature = "cuda")]
    {
        // For K=1 the run_layer_* helpers expect `batch=0` (their
        // single-catalog branch); for K>1 they take `batch=K`.
        let batch = if k == 1 { 0 } else { k };
        let label = format!("{label_prefix}{}", quick_tag(quick));

        // The legacy bench helpers `.expect()` on workspace
        // construction, which would abort the whole sweep if a
        // large-K row OOMs (e.g. K=128 at 256x4096 needs ~15 GB).
        // We wrap each row in `catch_unwind` so an alloc failure
        // marks just this row as not-run and the sweep continues
        // with the remaining rows. This preserves the panel-locked
        // "honest reporting" rule: a missing row is recorded with
        // a reason rather than fabricated.
        //
        // `catch_unwind` has no `'static` bound and `AssertUnwindSafe`
        // covers the captured references, so the fixture and contract
        // are borrowed rather than deep-copied for every row.
        let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| match which {
            GpuRow::LayerA => run_layer_a(events, contract, batch, plan.warmup, plan.iters),
            GpuRow::LayerB => run_layer_b(events, contract, batch, plan.warmup, plan.iters),
        }));
        match outcome {
            Ok(samples) => rows.push(MoneyRow {
                label,
                n_entities,
                n_windows,
                n_catalogs: k,
                samples_us: samples,
                baseline_us,
            }),
            // The panic payload is dropped here; only the fixed reason
            // text is recorded in the row.
            Err(_) => {
                let row_label = format!("{label} [not run: alloc refused or kernel error]");
                rows.push(MoneyRow::not_run(&row_label, n_entities, n_windows, k));
            }
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        // Consume the parameters the CUDA path would use so non-CUDA
        // builds do not warn about unused arguments.
        let _ = (
            label_prefix,
            n_entities,
            n_windows,
            k,
            which,
            events,
            contract,
            plan,
            baseline_us,
            quick,
        );
        rows.push(MoneyRow::not_run(
            "(GPU rows skipped: not built with --features cuda)",
            n_entities,
            n_windows,
            k,
        ));
    }
}

/// One line of the money table: a label, the scale it was measured at,
/// the raw timing samples and the baseline used for the speedup column.
struct MoneyRow {
    /// Text shown in the first column of the report.
    label: String,
    /// Entity count of the fixture this row ran on.
    n_entities: u32,
    /// Window count of the fixture this row ran on.
    n_windows: u32,
    /// Number of catalogs (K) covered by one sample.
    n_catalogs: u32,
    /// Timing samples in microseconds; empty means the row was not run
    /// and is rendered with `n/a` in every numeric column.
    samples_us: Vec<u128>,
    /// CPU Layer B median in microseconds; `0` renders the speedup
    /// column as `n/a`.
    baseline_us: u128,
}

impl MoneyRow {
    /// Build a placeholder for a row that produced no measurements:
    /// no samples and a zero baseline, with the reason carried in
    /// `label`.
    fn not_run(label: &str, n_entities: u32, n_windows: u32, n_catalogs: u32) -> Self {
        let (samples_us, baseline_us) = (Vec::new(), 0);
        Self {
            label: label.to_owned(),
            n_entities,
            n_windows,
            n_catalogs,
            samples_us,
            baseline_us,
        }
    }
}

/// Render the money-table report: the header lines verbatim, a column
/// header plus separator, then one line per row.
///
/// Rows without samples print `n/a` in every numeric column. For
/// measured rows the columns are derived from the median sample:
/// per-catalog time, catalogs / cells / detector evaluations per
/// second, and the per-catalog speedup against the row's CPU Layer B
/// baseline (`n/a` when the median or the baseline is zero).
///
/// All arithmetic stays in `u128`; the speedup is printed with one
/// decimal digit without going through floating point.
fn render_report(header_lines: &[String], rows: &[MoneyRow]) -> String {
    use core::fmt::Write;

    const MICROS_PER_SEC: u128 = 1_000_000;

    let mut out = String::new();
    for line in header_lines {
        out.push_str(line);
        out.push('\n');
    }

    // Column header and separator use the same widths as the data
    // rows below so the `|` separators line up by construction.
    let _ = writeln!(
        out,
        "  {:<55}| {:<10} | {:<14} | {:<12} | {:<13} | {:<14} | {}",
        "label",
        "median_us",
        "per_catalog_us",
        "catalogs/sec",
        "cells/sec",
        "det_evals/sec",
        "speedup_vs_cpu_b"
    );
    let _ = writeln!(
        out,
        "  {:<55}| {} | {} | {} | {} | {} | {}",
        "-".repeat(54),
        "-".repeat(10),
        "-".repeat(14),
        "-".repeat(12),
        "-".repeat(13),
        "-".repeat(14),
        "-".repeat(16)
    );

    let n_detectors = u128::from(dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32);
    let na = "n/a";
    for row in rows {
        if row.samples_us.is_empty() {
            let _ = writeln!(
                out,
                "  {:<55}| {na:>10} | {na:>14} | {na:>12} | {na:>13} | {na:>14} | {na:>16}",
                row.label
            );
            continue;
        }

        let med = median(&row.samples_us);
        let catalogs = u128::from(row.n_catalogs);
        let cells = catalogs * u128::from(row.n_entities) * u128::from(row.n_windows);
        let det_evals = cells * n_detectors;
        let per_catalog = if catalogs > 0 { med / catalogs } else { med };

        // Events per second for a count processed in `med` microseconds.
        let per_sec = |count: u128| {
            if med > 0 {
                count * MICROS_PER_SEC / med
            } else {
                0
            }
        };
        let catalogs_per_sec = per_sec(catalogs);
        let cells_per_sec = per_sec(cells);
        let det_evals_per_sec = per_sec(det_evals);

        let speedup = if med > 0 && row.baseline_us > 0 {
            // Per-catalog speedup: how many times faster one catalog is
            // processed vs. the CPU Layer B baseline at the same
            // fixture. Computed as baseline * K / med in a single
            // division; dividing by the already-truncated
            // `per_catalog` would inflate the ratio whenever `med` is
            // not a multiple of K. Scaled by 10 to keep one decimal.
            let ratio_times10 = row.baseline_us * 10 * catalogs.max(1) / med;
            format!("{}.{}x", ratio_times10 / 10, ratio_times10 % 10)
        } else {
            String::from(na)
        };

        let _ = writeln!(
            out,
            "  {:<55}| {med:>10} | {per_catalog:>14} | {catalogs_per_sec:>12} | {cells_per_sec:>13} | {det_evals_per_sec:>14} | {speedup:>16}",
            row.label
        );
    }
    out
}

/// Median of `samples`: the element at index `len / 2` after sorting
/// (the upper middle for even lengths). Returns `0` for an empty slice.
fn median(samples: &[u128]) -> u128 {
    let mut sorted = samples.to_vec();
    sorted.sort_unstable();
    // `get` yields `None` only for the empty case, which maps to 0.
    sorted.get(sorted.len() / 2).copied().unwrap_or(0)
}

/// Apply `--quick` scaling to an iteration plan: both counts are
/// divided by 5 (rounded up) and kept at a minimum of 1. Without
/// `quick` the plan is returned unchanged.
fn scale_iters(p: IterPlan, quick: bool) -> IterPlan {
    let shrink = |count: usize| count.div_ceil(5).max(1);
    if quick {
        IterPlan {
            warmup: shrink(p.warmup),
            iters: shrink(p.iters),
        }
    } else {
        p
    }
}

/// Label suffix marking rows produced by a `--quick` run; empty for a
/// full run.
fn quick_tag(quick: bool) -> &'static str {
    match quick {
        true => "[quick]",
        false => "",
    }
}

/// R.6c — session-level graph capture probe at canonical scale.
///
/// Returns the `# graph_status: …` header line plus, when capture
/// succeeds, a `# graph_plan_hash: …` line with the plan hash in
/// lowercase hex. Workspace allocation failures, probe errors and
/// demotions are reported in the status line with no hash line.
/// Builds without the `cuda` feature return a fixed "skipped" status.
fn probe_graph_capture() -> (String, Option<String>) {
    #[cfg(feature = "cuda")]
    {
        use core::fmt::Write;

        let mut contract = Contract::canonical();
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = synthesize(DEFAULT_SEED);

        let mut workspace = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(e) => {
                return (
                    format!("# graph_status: error allocating pinned-async workspace ({e:?})"),
                    None,
                );
            }
        };
        let status = match build_gpu_throughput_graph_or_demote(&events, &contract, &mut workspace)
        {
            Ok((_case, status)) => status,
            Err(e) => return (format!("# graph_status: error during probe ({e:?})"), None),
        };

        match status {
            GraphCaptureStatus::Captured { plan_hash } => {
                let mut digest_hex = String::with_capacity(64);
                for byte in &plan_hash {
                    let _ = write!(digest_hex, "{byte:02x}");
                }
                (
                    String::from("# graph_status: captured"),
                    Some(format!("# graph_plan_hash: {digest_hex}")),
                )
            }
            GraphCaptureStatus::Demoted { reason } => {
                (format!("# graph_status: demoted ({reason})"), None)
            }
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        let status = String::from("# graph_status: skipped (built without --features cuda)");
        (status, None)
    }
}

/// Best-effort generation stamp for the human-readable report header:
/// whole seconds since the Unix epoch, formatted as
/// `"<secs> epoch seconds"`, or `"unknown"` when the system clock
/// reads earlier than the epoch. The R doctrine forbids wall-clock
/// values inside the hash chain, but a header stamp only identifies
/// when the report was generated. No date-time crate is pulled in.
fn chrono_like_timestamp_or_unknown() -> String {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or_else(
            |_| String::from("unknown"),
            |since_epoch| format!("{} epoch seconds", since_epoch.as_secs()),
        )
}

/// R.8 — per-stage bottleneck profiler over three K=1 scale points.
///
/// With `tree_digest` set, the work is delegated to
/// `run_r8_5_tree_digest_compare` and nothing else runs. Otherwise,
/// for each scale point the `_timed` dispatch is warmed up `warmup`
/// times, measured `iters` times, and the median wall time plus the
/// per-field median device / host stage timings are handed to
/// `print_and_write_r8`.
///
/// Scale points (panel-aligned):
/// 1. **canonical 16×128 K=1** — the v0 fixture; tiny per-cell
///    cost, kernel launch overhead dominates.
/// 2. **mid-scale 64×512 K=1** — 16× more cells than canonical;
///    surfaces which kernels scale linearly vs which stay flat.
/// 3. **full-scale 256×4096 K=1** — the scale R.7 measures K=64 at;
///    this is the per-catalog decomposition that R.9 / R.10 / R.11
///    plan against, and the file that matters most.
///
/// Honest scope note: R.7's headline `K=64` row uses a different
/// batched-throughput FFI that does NOT yet record per-stage
/// timings; wiring `_timed` into the batched dispatch is deferred.
/// The K=1 percent breakdown at 256×4096 is the proxy R.8 uses for
/// that row because the same kernels run with `blockIdx.z = K` at
/// batched scale; per-catalog kernel time is essentially unchanged,
/// only the host orchestration cost amortises differently.
///
/// Failure handling: a refused workspace allocation skips that scale
/// point; a dispatch error during measurement prints the error and
/// abandons the remaining scale points. Warmup errors are ignored.
#[cfg(feature = "cuda")]
fn run_r8_detail_stage(warmup: usize, iters: usize, tree_digest: bool) {
    use std::time::Instant;

    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace_timed, GpuWorkspace, R8HostStageTimings,
        R8StageTimings,
    };

    // (label, n_entities, n_windows, K). K is fixed at 1 because the
    // `_timed` dispatch is the single-catalog async path.
    const SCALE_POINTS: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    if tree_digest {
        // R.8.5 measurement path: the tree-digest dispatch is timed
        // end-to-end with host Instant only (no per-stage cudaEvent
        // breakdown at v0); the number that matters is the wall delta
        // vs the serial-digest baseline.
        run_r8_5_tree_digest_compare(warmup, iters);
        return;
    }

    for (label, n_entities, n_windows, k) in SCALE_POINTS {
        println!();
        println!("=== R.8 Bottleneck Profile — {label} ===");
        println!("  warmup: {warmup}   iters: {iters}");

        // The canonical point uses the canonical contract / fixture;
        // every other point uses the scaled variants.
        let is_canonical = (n_entities, n_windows) == (16, 128);
        let mut contract = if is_canonical {
            Contract::canonical()
        } else {
            Contract::scaled(n_entities, n_windows)
        };
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = if is_canonical {
            synthesize(DEFAULT_SEED)
        } else {
            synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4)
        };

        let mut workspace = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(_) => {
                println!("  workspace alloc refused; skipping {label}");
                continue;
            }
        };

        // Warmup: results and errors are deliberately discarded.
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_timed(
                &events,
                &contract,
                &mut workspace,
            );
        }

        // Measured iterations: host wall time wraps the dispatch only.
        let mut wall_us: Vec<u128> = Vec::with_capacity(iters);
        let mut device_timings: Vec<R8StageTimings> = Vec::with_capacity(iters);
        let mut host_timings: Vec<R8HostStageTimings> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome = build_gpu_throughput_pinned_async_on_workspace_timed(
                &events,
                &contract,
                &mut workspace,
            );
            let elapsed = started.elapsed();

            let (case, dev, host) = match outcome {
                Ok(measured) => measured,
                Err(e) => {
                    println!("  dispatch error during R.8 measurement: {e:?}");
                    return;
                }
            };
            std::hint::black_box(case);
            device_timings.push(dev);
            host_timings.push(host);
            wall_us.push(elapsed.as_micros());
        }

        // Medians for stable reporting.
        print_and_write_r8(
            label,
            n_entities,
            n_windows,
            k,
            median_stage(&device_timings),
            median_host(&host_timings),
            median_u128(&wall_us),
        );
    }
}

/// Median of the R.8 host wall-time samples (microseconds).
///
/// Delegates to `median` so both the money table and the profiler
/// share one definition: the element at index `len / 2` after
/// sorting, or `0` for an empty slice.
#[cfg(feature = "cuda")]
fn median_u128(samples: &[u128]) -> u128 {
    median(samples)
}

/// Per-field median of the device stage timings.
///
/// Each field is sorted and sampled independently at index `len / 2`,
/// so the returned struct need not correspond to any single iteration
/// (for example `total_device_us` is its own median, not the sum of
/// the other medians). Returns the default value for an empty slice.
///
/// Values are ordered with `f32::total_cmp`, which is a total order
/// even when a NaN is present; a comparator that treats NaN as equal
/// to everything is not, and `sort_by` may panic on such input.
#[cfg(feature = "cuda")]
fn median_stage(
    samples: &[dsfb_gpu_debug_cuda::R8StageTimings],
) -> dsfb_gpu_debug_cuda::R8StageTimings {
    use dsfb_gpu_debug_cuda::R8StageTimings;

    if samples.is_empty() {
        return R8StageTimings::default();
    }
    let mid = samples.len() / 2;
    // Median of one field across all samples.
    let pick = |field: fn(&R8StageTimings) -> f32| -> f32 {
        let mut column: Vec<f32> = samples.iter().map(field).collect();
        column.sort_unstable_by(f32::total_cmp);
        column[mid]
    };
    R8StageTimings {
        h2d_us: pick(|s| s.h2d_us),
        residual_us: pick(|s| s.residual_us),
        sign_us: pick(|s| s.sign_us),
        detector_us: pick(|s| s.detector_us),
        consensus_us: pick(|s| s.consensus_us),
        candidate_us: pick(|s| s.candidate_us),
        digests_us: pick(|s| s.digests_us),
        d2h_us: pick(|s| s.d2h_us),
        total_device_us: pick(|s| s.total_device_us),
    }
}

/// Per-field median of the host stage timings.
///
/// `features_us` and `bank_and_finalize_us` are each sorted and
/// sampled independently at index `len / 2`. Returns the default
/// value for an empty slice.
///
/// Values are ordered with `f32::total_cmp`, which is a total order
/// even when a NaN is present; a comparator that treats NaN as equal
/// to everything is not, and `sort_by` may panic on such input.
#[cfg(feature = "cuda")]
fn median_host(
    samples: &[dsfb_gpu_debug_cuda::R8HostStageTimings],
) -> dsfb_gpu_debug_cuda::R8HostStageTimings {
    use dsfb_gpu_debug_cuda::R8HostStageTimings;

    if samples.is_empty() {
        return R8HostStageTimings::default();
    }
    let mid = samples.len() / 2;
    // Median of one field across all samples.
    let pick = |field: fn(&R8HostStageTimings) -> f32| -> f32 {
        let mut column: Vec<f32> = samples.iter().map(field).collect();
        column.sort_unstable_by(f32::total_cmp);
        column[mid]
    };
    R8HostStageTimings {
        features_us: pick(|s| s.features_us),
        bank_and_finalize_us: pick(|s| s.bank_and_finalize_us),
    }
}

/// R.8 — render the per-stage table for one scale point, print it
/// to stdout, and write it to
/// `reports/r8_bottleneck_<grid>_K<K>.txt`. The "wall anchor"
/// in the % column is the host-measured median wall time per
/// iteration — that's the only number that can sum to 100 %. The
/// timed segments' sum is reported separately so a future engineer
/// can see how much wall time the cudaEvent + Instant slots
/// account for (target ≥95 % per R.8 honesty bar).
///
/// Details worth knowing when reading the output:
///
/// * If `med_wall_us` is zero, the sum of the timed segments is used
///   as the anchor instead; if the anchor is not positive, every
///   percentage is printed as `0.0`.
/// * `dev.total_device_us` gets its own line and is *not* part of the
///   timed-segment sum.
/// * The top-3 ranking is a stable descending sort, so stages with
///   equal time keep their pipeline order. It compares with
///   `f32::total_cmp`, which stays a valid total order even if a
///   timing is NaN (a non-total comparator makes `sort_by`'s result
///   unspecified and may panic).
/// * A failure to write the report file is downgraded to a warning
///   on stderr; nothing is returned or propagated.
///
/// The function is `#[allow(clippy::too_many_arguments)]` because
/// splitting these arguments into a struct adds boilerplate without
/// readability gain.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments, clippy::cast_precision_loss)]
fn print_and_write_r8(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    dev: dsfb_gpu_debug_cuda::R8StageTimings,
    host: dsfb_gpu_debug_cuda::R8HostStageTimings,
    med_wall_us: u128,
) {
    use core::fmt::Write;

    // Rows are listed in pipeline order; this is also the tie-break
    // order of the top-3 ranking below.
    let rows: [(&str, f32); 10] = [
        ("feature generation (host)", host.features_us),
        ("H2D", dev.h2d_us),
        ("residual", dev.residual_us),
        ("sign (drift/slew EWMA)", dev.sign_us),
        ("detector", dev.detector_us),
        ("consensus", dev.consensus_us),
        ("candidate collapse", dev.candidate_us),
        ("digests (4 kernels)", dev.digests_us),
        ("D2H", dev.d2h_us),
        ("bank + case finalize (host)", host.bank_and_finalize_us),
    ];

    let total_measured: f32 = rows.iter().map(|&(_, us)| us).sum();
    // `med_wall_us` is the per-iter wall measured on the host
    // including all of the above. We report the wall as the
    // anchor so the percentages add up to <= 100% (the residual
    // is host orchestration outside the timed slots). The
    // `as u64 -> f32` narrowing dodges the u128 precision warning;
    // per-iter wall is well under 1 second at every scale.
    #[allow(clippy::cast_possible_truncation)]
    let anchor = if med_wall_us == 0 {
        total_measured
    } else {
        (med_wall_us as u64) as f32
    };

    // Single definition of "percent of the wall anchor", shared by
    // the per-stage rows, both totals, and the top-3 list.
    let pct_of_anchor = |us: f32| -> f32 {
        if anchor > 0.0 {
            (us / anchor) * 100.0
        } else {
            0.0
        }
    };

    // Writing into a `String` cannot fail, hence the discarded results.
    let mut out = String::new();
    let _ = writeln!(out, "=== R.8 Bottleneck Profile — {label} ===");
    let _ = writeln!(
        out,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    let _ = writeln!(out, "median wall (host Instant): {med_wall_us} us");
    let _ = writeln!(out, "sum of timed segments       : {total_measured:.1} us");
    out.push('\n');
    out.push_str("  Stage                       us         % of wall\n");
    out.push_str("  -------------------------- ---------- -----------\n");
    for &(name, us) in &rows {
        let pct = pct_of_anchor(us);
        let _ = writeln!(out, "  {name:<26} {us:>10.1} {pct:>9.1}%");
    }
    out.push_str("  -------------------------- ---------- -----------\n");
    let total_pct = pct_of_anchor(total_measured);
    let _ = writeln!(
        out,
        "  total (timed segments)     {total_measured:>10.1} {total_pct:>9.1}%"
    );
    let total_device_us = dev.total_device_us;
    let total_device_pct = pct_of_anchor(total_device_us);
    let _ = writeln!(
        out,
        "  total_device_us (event)    {total_device_us:>10.1} {total_device_pct:>9.1}%"
    );

    // Top 3 stages by absolute time (stable sort, descending).
    let mut sorted: Vec<(&str, f32)> = rows.to_vec();
    sorted.sort_by(|a, b| b.1.total_cmp(&a.1));
    out.push_str("\nTop 3 stages by absolute time:\n");
    for (i, &(name, us)) in sorted.iter().take(3).enumerate() {
        let pct = pct_of_anchor(us);
        let rank = i + 1;
        let _ = writeln!(out, "  {rank}. {name} — {us:.1} us ({pct:.1}% of wall)");
    }

    print!("{out}");

    let filename = format!("r8_bottleneck_{n_entities}x{n_windows}_K{k}.txt");
    let path = std::path::Path::new("reports").join(filename);
    // Best effort: if the directory cannot be created, the write
    // below fails and that failure is what gets reported.
    let _ = std::fs::create_dir_all("reports");
    if let Err(e) = std::fs::write(&path, &out) {
        eprintln!("warning: could not write {}: {e}", path.display());
    } else {
        println!("wrote R.8 profile -> {}", path.display());
    }
}

/// R.8.5 — compare wall time between the serial-digest path and
/// the new tree-digest path at the same scale points the R.8
/// profiler uses. The goal of this measurement is a single
/// number: how much did the dominant 78 %-of-wall bottleneck
/// shrink under the tree-digest topology?
///
/// Output: `reports/r8_5_tree_compare_<grid>_K1.txt`. For each
/// scale point we report median wall (host Instant, over `iters`
/// timed iterations after `warmup` untimed ones) for both digest
/// modes plus the ratio. Honest reporting: whatever the speedup
/// turns out to be, the report says exactly that. The R.8.5 gate
/// is ≥5× drop on the digest stage; if the wall ratio is
/// materially smaller, the report still publishes and the
/// campaign proceeds with the honest number.
///
/// Failure handling:
///
/// * A refused workspace allocation skips only the current scale
///   point; the sweep continues with the next one.
/// * Warmup dispatch results are discarded, errors included.
/// * A timed dispatch error prints the error and returns from the
///   whole function, so later scale points are not run and no
///   report is written for the failing point.
/// * If the tree median is zero the ratio is reported as `0.0`.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_lines)]
fn run_r8_5_tree_digest_compare(warmup: usize, iters: usize) {
    use std::time::Instant;

    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace,
        build_gpu_throughput_pinned_async_on_workspace_tree, GpuWorkspace,
    };

    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    for &(label, n_entities, n_windows, k) in &points {
        println!();
        println!("=== R.8.5 tree-digest comparison — {label} ===");
        println!("  warmup: {warmup}   iters: {iters}");

        // The 16x128 grid uses the canonical contract and fixture;
        // every other grid uses the scaled constructors.
        let is_canonical = n_entities == 16 && n_windows == 128;

        let mut contract = if is_canonical {
            dsfb_gpu_debug_core::contract::Contract::canonical()
        } else {
            dsfb_gpu_debug_core::contract::Contract::scaled(n_entities, n_windows)
        };
        // Both variants get the same two hashes pinned.
        contract.pin_bank_hash(dsfb_gpu_debug_core::bank::bank_hash());
        contract.pin_detector_registry_hash(dsfb_gpu_debug_core::motif::registry_hash());
        let contract = contract;

        let events = if is_canonical {
            dsfb_gpu_debug_core::fixture::synthesize(dsfb_gpu_debug_core::fixture::DEFAULT_SEED)
        } else {
            dsfb_gpu_debug_core::fixture::synthesize_scaled(
                dsfb_gpu_debug_core::fixture::DEFAULT_SEED,
                n_entities,
                n_windows,
                4,
            )
        };

        // One workspace per digest mode, both allocated before any
        // measurement starts.
        let mut ws_serial = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(w) => w,
            Err(e) => {
                println!("  workspace alloc refused: {e:?}; skipping {label}");
                continue;
            }
        };
        let mut ws_tree = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(w) => w,
            Err(e) => {
                println!("  workspace alloc refused: {e:?}; skipping {label}");
                continue;
            }
        };

        // Serial-digest warmup + measurement.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
        }
        let mut serial_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let t0 = Instant::now();
            let result =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
            // Stop the clock before inspecting the result so only
            // the dispatch itself is timed.
            let dt = t0.elapsed().as_micros();
            match result {
                Ok(case) => {
                    std::hint::black_box(case);
                    serial_us.push(dt);
                }
                Err(e) => {
                    // Print the inner error (not the `Err(..)`
                    // wrapper), matching the R.11 runner.
                    println!("  serial-digest dispatch error: {e:?}");
                    return;
                }
            }
        }

        // Tree-digest warmup + measurement.
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
        }
        let mut tree_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let t0 = Instant::now();
            let result = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
            let dt = t0.elapsed().as_micros();
            match result {
                Ok(case) => {
                    std::hint::black_box(case);
                    tree_us.push(dt);
                }
                Err(e) => {
                    println!("  tree-digest dispatch error: {e:?}");
                    return;
                }
            }
        }

        let med_serial = median_u128(&serial_us);
        let med_tree = median_u128(&tree_us);
        // u128 → u64 → f64: per-iter wall is well under 2^53 µs
        // (about 285 years), so the narrowing is loss-free in any
        // honest scenario. This dodges clippy's u128 precision
        // lint at the same time.
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio = if med_tree > 0 {
            (med_serial as u64) as f64 / (med_tree as u64) as f64
        } else {
            0.0
        };
        print_and_write_r8_5(label, n_entities, n_windows, k, med_serial, med_tree, ratio);
    }
}

/// R.8.5 — format the serial-vs-tree wall-time comparison for one
/// scale point, echo it to stdout, and save it as
/// `reports/r8_5_tree_compare_<n_entities>x<n_windows>_K<k>.txt`.
///
/// The report carries only the values passed in (the two medians
/// and the precomputed `ratio`) followed by a fixed notes section.
/// A failed file write is reported as a warning on stderr and is
/// not propagated.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments)]
fn print_and_write_r8_5(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    med_serial_us: u128,
    med_tree_us: u128,
    ratio: f64,
) {
    // Static trailer appended verbatim after the measured numbers.
    const NOTES: [&str; 8] = [
        "Notes:",
        "  * Both paths run the same 5 pipeline kernels (residual, sign, detector,",
        "    consensus, candidate). They differ only in the digest stage: serial",
        "    uses 4 single-thread `*_digest_kernel_batched` kernels; tree uses one",
        "    block per chunk (~2048 chunks at 256x4096 with 16 KiB chunks) feeding",
        "    a final root SHA-256 over the ordered leaf digests + domain separator.",
        "  * Stage hash bytes differ between modes by construction; case-file",
        "    metadata records `digest_mode` so replay catches a mode mismatch.",
    ];

    // Measured section, one entry per output line (empty entry =
    // blank line).
    let measured = [
        format!("=== R.8.5 tree-digest comparison — {label} ==="),
        format!("scale: n_entities={n_entities} n_windows={n_windows} K={k}"),
        String::new(),
        format!("  serial-digest median wall: {med_serial_us:>10} us"),
        format!("  tree-digest   median wall: {med_tree_us:>10} us"),
        format!("  wall-time ratio (serial / tree): {ratio:.2}x"),
        String::new(),
    ];

    let mut report = String::new();
    for line in &measured {
        report.push_str(line);
        report.push('\n');
    }
    for line in &NOTES {
        report.push_str(line);
        report.push('\n');
    }

    print!("{report}");

    let reports_dir = std::path::Path::new("reports");
    let path = reports_dir.join(format!(
        "r8_5_tree_compare_{n_entities}x{n_windows}_K{k}.txt"
    ));
    // Directory creation is best effort; a real problem surfaces
    // through the write below.
    let _ = std::fs::create_dir_all(reports_dir);
    match std::fs::write(&path, &report) {
        Ok(()) => println!("wrote R.8.5 comparison -> {}", path.display()),
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}

/// R.11 — three-way wall-time comparison: serial-digest vs
/// tree-digest vs tree-digest + compact-verdict finalizer. Same
/// three K=1 scale points as R.8.5's compare runner. The compact
/// path precomputes `FixtureHashes` once outside the iter loop
/// so each iteration skips the ~250 MB of host SHA-256 work the
/// non-compact builder did per dispatch.
///
/// Output: `reports/r11_compact_compare_<grid>_K1.txt`.
///
/// Honest reporting: every number measured is reported as-is.
/// The R.11 gate is "host bank + case-finalize drops by ≥5×";
/// since we time the full dispatch wall (not just the host
/// segment), the headline is the WALL ratio of tree-only vs
/// tree+compact at the same scale. If the compact ratio is
/// smaller than ≥5×, the report still publishes and the
/// campaign proceeds on the honest number.
///
/// Failure handling:
///
/// * A refused workspace allocation prints the allocator's error
///   and skips only the current scale point.
/// * Warmup dispatch results are discarded, errors included.
/// * A timed dispatch error prints the error and returns from the
///   whole function, so later scale points are not run.
/// * If the compact median is zero both ratios are reported as `0.0`.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_lines)]
fn run_r11_compact_compare(warmup: usize, iters: usize) {
    use std::time::Instant;

    use dsfb_gpu_debug_core::casefile::FixtureHashes;
    use dsfb_gpu_debug_core::window::compute_features;
    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace,
        build_gpu_throughput_pinned_async_on_workspace_tree,
        build_gpu_throughput_pinned_async_on_workspace_tree_compact, GpuWorkspace,
    };

    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    for &(label, n_entities, n_windows, k) in &points {
        println!();
        println!("=== R.11 compact-verdict comparison — {label} ===");
        println!("  warmup: {warmup}   iters: {iters}");

        // The 16x128 grid uses the canonical contract and fixture;
        // every other grid uses the scaled constructors.
        let is_canonical = n_entities == 16 && n_windows == 128;

        let mut contract = if is_canonical {
            dsfb_gpu_debug_core::contract::Contract::canonical()
        } else {
            dsfb_gpu_debug_core::contract::Contract::scaled(n_entities, n_windows)
        };
        // Both variants get the same two hashes pinned.
        contract.pin_bank_hash(dsfb_gpu_debug_core::bank::bank_hash());
        contract.pin_detector_registry_hash(dsfb_gpu_debug_core::motif::registry_hash());
        let contract = contract;

        let events = if is_canonical {
            dsfb_gpu_debug_core::fixture::synthesize(dsfb_gpu_debug_core::fixture::DEFAULT_SEED)
        } else {
            dsfb_gpu_debug_core::fixture::synthesize_scaled(
                dsfb_gpu_debug_core::fixture::DEFAULT_SEED,
                n_entities,
                n_windows,
                4,
            )
        };

        // R.11 precomputed fixture hashes — done ONCE per scale
        // point outside the iter loop. This is the load-bearing
        // optimisation; subsequent compact dispatches consume
        // the precomputed values and skip re-hashing.
        let features = compute_features(
            &events,
            contract.n_windows,
            contract.n_entities,
            u64::from(contract.window_size_ms) * 1_000_000,
        );
        let fixture = FixtureHashes::compute(&events, &features);

        // One workspace per path. On refusal, surface the reason
        // (same wording as the R.8.5 runner) instead of dropping it.
        let alloc_workspace = || match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => Some(ws),
            Err(e) => {
                println!("  workspace alloc refused: {e:?}; skipping {label}");
                None
            }
        };
        let Some(mut ws_serial) = alloc_workspace() else {
            continue;
        };
        let Some(mut ws_tree) = alloc_workspace() else {
            continue;
        };
        let Some(mut ws_compact) = alloc_workspace() else {
            continue;
        };

        // Serial-digest run.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
        }
        let mut serial_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let t0 = Instant::now();
            let result =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
            // Stop the clock before inspecting the result so only
            // the dispatch itself is timed.
            let dt = t0.elapsed().as_micros();
            match result {
                Ok(case) => {
                    std::hint::black_box(case);
                    serial_us.push(dt);
                }
                Err(e) => {
                    println!("  serial-digest dispatch error: {e:?}");
                    return;
                }
            }
        }

        // Tree-digest run (non-compact finalizer).
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
        }
        let mut tree_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let t0 = Instant::now();
            let result = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
            let dt = t0.elapsed().as_micros();
            match result {
                Ok(case) => {
                    std::hint::black_box(case);
                    tree_us.push(dt);
                }
                Err(e) => {
                    println!("  tree-digest dispatch error: {e:?}");
                    return;
                }
            }
        }

        // Tree-digest + compact-verdict finalizer run.
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_tree_compact(
                &events,
                &contract,
                &mut ws_compact,
                &fixture,
            );
        }
        let mut compact_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let t0 = Instant::now();
            let result = build_gpu_throughput_pinned_async_on_workspace_tree_compact(
                &events,
                &contract,
                &mut ws_compact,
                &fixture,
            );
            let dt = t0.elapsed().as_micros();
            match result {
                Ok(case) => {
                    std::hint::black_box(case);
                    compact_us.push(dt);
                }
                Err(e) => {
                    println!("  compact-verdict dispatch error: {e:?}");
                    return;
                }
            }
        }

        let med_serial = median_u128(&serial_us);
        let med_tree = median_u128(&tree_us);
        let med_compact = median_u128(&compact_us);
        // u128 → u64 → f64 narrowing dodges the clippy precision
        // lint; per-iter wall is well below 2^53 µs so loss-free.
        // Both headline ratios share the compact median as the
        // denominator, so they share one definition.
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio_to_compact = |numerator_us: u128| -> f64 {
            if med_compact > 0 {
                (numerator_us as u64) as f64 / (med_compact as u64) as f64
            } else {
                0.0
            }
        };
        let ratio_serial_to_compact = ratio_to_compact(med_serial);
        let ratio_tree_to_compact = ratio_to_compact(med_tree);
        print_and_write_r11(
            label,
            n_entities,
            n_windows,
            k,
            med_serial,
            med_tree,
            med_compact,
            ratio_serial_to_compact,
            ratio_tree_to_compact,
        );
    }
}

/// R.11 — format the three-way wall-time comparison (serial, tree,
/// tree + compact) for one scale point, echo it to stdout, and save
/// it as `reports/r11_compact_compare_<n_entities>x<n_windows>_K<k>.txt`.
///
/// The measured section contains only the medians and ratios passed
/// in; the fixed notes section that follows frames the architectural
/// posture for a reader picking the report up cold. A failed file
/// write is reported as a warning on stderr and is not propagated.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments)]
fn print_and_write_r11(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    med_serial_us: u128,
    med_tree_us: u128,
    med_compact_us: u128,
    ratio_serial_to_compact: f64,
    ratio_tree_to_compact: f64,
) {
    // Static trailer appended verbatim after the measured numbers.
    const NOTES: [&str; 13] = [
        "Notes:",
        "  * Serial = legacy R.6b path (4 single-thread digest kernels + non-compact builder).",
        "  * Tree = R.8.5 path (block-parallel tree digest + non-compact builder).",
        "  * Compact = R.11 path (tree digest + FixtureHashes precomputed once).",
        "  * `FixtureHashes` is computed ONCE per scale point outside the iter loop,",
        "    matching how a long-running deployment caller would amortise the input",
        "    commitment hash across many dispatches against the same fixture.",
        "  * Case files from all three paths are byte-identical for the serial vs.",
        "    serial pairing, and the tree pair is internally byte-identical;",
        "    serial ≠ tree because tree commits to chunked stage bytes + a domain",
        "    separator. Compact ≡ tree byte-for-byte by construction.",
        "  * Semantic Non-Bypass Axiom holds in every path: `bank_collapse` is the",
        "    only mint of `BankAdmissionToken`. The compact builder reuses it.",
    ];

    // Measured section, one entry per output line (empty entry =
    // blank line).
    let measured = [
        format!("=== R.11 compact-verdict comparison — {label} ==="),
        format!("scale: n_entities={n_entities} n_windows={n_windows} K={k}"),
        String::new(),
        format!("  serial-digest                  : {med_serial_us:>10} us"),
        format!("  tree-digest (R.8.5)            : {med_tree_us:>10} us"),
        format!("  tree-digest + compact (R.11)   : {med_compact_us:>10} us"),
        String::new(),
        format!("  wall ratio serial / compact    : {ratio_serial_to_compact:.2}x"),
        format!("  wall ratio tree   / compact    : {ratio_tree_to_compact:.2}x"),
        String::new(),
    ];

    let mut report = String::new();
    for line in &measured {
        report.push_str(line);
        report.push('\n');
    }
    for line in &NOTES {
        report.push_str(line);
        report.push('\n');
    }

    print!("{report}");

    let reports_dir = std::path::Path::new("reports");
    let path = reports_dir.join(format!(
        "r11_compact_compare_{n_entities}x{n_windows}_K{k}.txt"
    ));
    // Directory creation is best effort; a real problem surfaces
    // through the write below.
    let _ = std::fs::create_dir_all(reports_dir);
    match std::fs::write(&path, &report) {
        Ok(()) => println!("wrote R.11 comparison -> {}", path.display()),
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}