dsfb_gpu_debug_demo/cli/
bench_gpu_scale.rs

1//! `dsfb-gpu-debug bench-gpu-scale` — R.7 money-table headline benchmark
2//! + R.8 bottleneck profiler.
3//!
4//! Two distinct entry modes share this subcommand:
5//!
6//! * **Default mode (R.7)** drives the panel-locked scale sweep that
7//!   produces the headline `reports/money_table.txt`. Each row pairs
8//!   one GPU dispatch path (Layer A device evidence fabric / Layer B
9//!   throughput verdict summary / Layer C full audit court) against
10//!   the same fixture's CPU Layer B baseline so the speedup column
11//!   is reproducible.
12//! * **`--detail-stage` mode (R.8)** skips the money-table sweep and
13//!   instead runs the per-stage bottleneck profiler at three K=1
14//!   scale points (canonical 16×128, 64×512 mid-scale, 256×4096
15//!   full-scale). Each scale point gets its own
16//!   `reports/r8_bottleneck_<grid>_K1.txt` with a table of `(stage,
17//!   median µs, % of wall)` and the top 3 stages by absolute time.
18//!   Honest scope note: K>1 batched per-stage timings need a
19//!   separate `_timed` batched FFI (deferred); the K=1 percent
20//!   breakdown is the proxy R.8 uses for the K=64 row in R.7 because
21//!   the same kernels run with `blockIdx.z = K` at batched scale.
22//!
23//! R.7 rows (panel-locked):
24//!
25//! * Canonical 16×128, K=32: Layer A, Layer B, Layer C CPU, Layer C GPU
//! * Scale-large 256×4096, K ∈ {1, 16, 64}, plus K=128 when `--big-k`
//!   is passed: CPU Layer B (measured once, single catalog), GPU
//!   Layer A and GPU Layer B. Layer C is deliberately not run at this
//!   scale and is recorded as a not-run row. A GPU row whose run
//!   fails (for example when the `BatchedGpuWorkspace` allocation is
//!   refused) is marked "not run: alloc refused or kernel error" and
//!   the rest of the sweep continues.
31//!
32//! Session-level fields recorded once at the top of the R.7 report:
33//!
34//! * `graph_status` — outcome of an opt-in
35//!   `build_gpu_throughput_graph_or_demote` call at canonical scale.
36//!   Either `captured` or `demoted` with a short reason. The graph
37//!   itself does not drive the bench rows (the rows go through the
38//!   pre-existing layer dispatch paths); the status is recorded so the
39//!   case file's launch-plan provenance can be audited later.
40//! * `graph_plan_hash` — the captured topology's canonical hash, when
41//!   capture succeeds. Reported as 64 hex chars; absent on demoted.
42//!
43//! Output:
44//!
45//! 1. Console: R.7 prints a `=== R.7 Money Table ===` block per row
46//!    plus a final summary table; R.8 prints a `=== R.8 Bottleneck
47//!    Profile === ` block per scale point.
48//! 2. Files: R.7 writes `reports/money_table.txt`; R.8 writes
49//!    `reports/r8_bottleneck_<grid>_K<K>.txt` per scale point.
50//!
51//! Honest reporting: every number printed is measured. Rows that fail
52//! to run print `n/a` in the speedup column and a short reason in the
53//! same row. The R doctrine forbids fabricated numbers; this file
54//! enforces that by only writing rows the bench actually completed.
55
56#![allow(clippy::expect_used)]
57
58use std::process::ExitCode;
59
60use dsfb_gpu_debug_core::bank::bank_hash;
61use dsfb_gpu_debug_core::contract::Contract;
62use dsfb_gpu_debug_core::event::TraceEvent;
63use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
64use dsfb_gpu_debug_core::motif::registry_hash;
65
66#[cfg(feature = "cuda")]
67use dsfb_gpu_debug_cuda::{build_gpu_throughput_graph_or_demote, GpuWorkspace, GraphCaptureStatus};
68
69#[cfg(feature = "cuda")]
70use super::bench::{run_layer_a, run_layer_b, run_layer_c_gpu};
71use super::bench::{run_layer_b_cpu_always, run_layer_c_cpu};
72use super::{parse_flags, usage_error};
73
/// Warmup and measured-iteration counts for one benchmark row.
///
/// The presets are sized so the full sweep takes a few minutes on the
/// target GPU: the larger the fixture and batch, the fewer iterations,
/// because each iteration's wall time grows with scale.
#[derive(Debug, Clone, Copy)]
struct IterPlan {
    /// Warmup count forwarded to the bench helpers.
    warmup: usize,
    /// Iteration count forwarded to the bench helpers.
    iters: usize,
}

impl IterPlan {
    /// Canonical 16x128 fixture.
    const CANONICAL: Self = Self::new(20, 100);
    /// Scale-large fixture, K=1 (also used for the CPU baseline).
    const LARGE_K1: Self = Self::new(5, 50);
    /// Scale-large fixture, K=16.
    const LARGE_K16: Self = Self::new(3, 20);
    /// Scale-large fixture, K=64.
    const LARGE_K64: Self = Self::new(2, 10);
    /// Scale-large fixture, K=128 (opt-in row).
    const LARGE_K128: Self = Self::new(2, 5);

    /// Build a plan from explicit counts.
    const fn new(warmup: usize, iters: usize) -> Self {
        Self { warmup, iters }
    }
}
106
107/// Run R.7 with the user-supplied CLI flags. Supported flags:
108///
109/// * `--quick` — divides every iter count by 5 (rounded up) for a
110///   smoke run; the row labels then carry `[quick]`.
111/// * `--out PATH` — alternate path for the money-table report
112///   (default `reports/money_table.txt`).
113/// * `--skip-large` — only run the canonical row block. Useful when
114///   the host doesn't have enough VRAM for 256×4096 fixtures.
115#[allow(clippy::too_many_lines)]
116pub fn parse_and_run(args: &[String]) -> ExitCode {
117    let flags = match parse_flags(args) {
118        Ok(f) => f,
119        Err(message) => return usage_error(&message),
120    };
121
122    let quick = flags.get("quick").is_some_and(|v| v != "false");
123    let skip_large = flags.get("skip-large").is_some_and(|v| v != "false");
124    let big_k = flags.get("big-k").is_some_and(|v| v != "false");
125    // R.8 — when set, the bench skips the normal money-table rows
126    // and runs a single deep per-stage breakdown of GPU Layer B at
127    // each of three scale points (canonical, 256x4096 K=16, K=64),
128    // writing `reports/r8_bottleneck_<grid>_K<K>.txt` for each.
129    // The money-table sweep is not run in `--detail-stage` mode.
130    let detail_stage = flags.get("detail-stage").is_some_and(|v| v != "false");
131    // R.8.5 — when set alongside `--detail-stage`, the profiler
132    // runs the tree-digest dispatch path instead of the legacy
133    // serial-digest path. The detail-stage breakdown then captures
134    // the post-R.8.5 digest-stage cost; the report filename
135    // distinguishes the two via `_tree` suffix so a pre/post
136    // comparison is easy.
137    let tree_digest = flags.get("tree-digest").is_some_and(|v| v != "false");
138    // R.11 — when set alongside `--detail-stage --tree-digest`,
139    // the profiler also runs the compact-verdict finalizer path
140    // (precomputed FixtureHashes, no per-iter re-hashing of
141    // events/window features). Output: a 3-way comparison
142    // report (serial-digest, tree-digest, tree+compact) per
143    // scale point, written to
144    // `reports/r11_compact_compare_<grid>_K1.txt`.
145    let compact = flags.get("compact").is_some_and(|v| v != "false");
146    let out_path: std::path::PathBuf = flags.get("out").map_or_else(
147        || std::path::PathBuf::from("reports/money_table.txt"),
148        std::path::PathBuf::from,
149    );
150
151    let mut rows: Vec<MoneyRow> = Vec::new();
152    let mut header_lines: Vec<String> = Vec::new();
153
154    header_lines.push(String::from(
155        "# R.7 Money Table — DSFB-GPU-Debug headline benchmark",
156    ));
157    header_lines.push(format!(
158        "# generated: {}",
159        chrono_like_timestamp_or_unknown()
160    ));
161    header_lines.push(format!(
162        "# quick: {quick}    skip-large: {skip_large}    big-k: {big_k}"
163    ));
164    header_lines.push(String::from("#"));
165    header_lines.push(String::from(
166        "# Layer A: device evidence fabric (skip-bank, on-device digests).",
167    ));
168    header_lines.push(String::from(
169        "# Layer B: throughput verdict summary (host bank stage admits compact candidates).",
170    ));
171    header_lines.push(String::from(
172        "# Layer C: full audit court (every intermediate cell materialised host-side).",
173    ));
174    header_lines.push(String::from(
175        "# Speedup is measured against CPU Layer B at the SAME (n_entities, n_windows) scale.",
176    ));
177    header_lines.push(String::from("#"));
178
179    // ---- session-level graph capture probe (R.6c) -----------------
180    let (graph_status_line, graph_hash_line) = probe_graph_capture();
181    header_lines.push(graph_status_line);
182    if let Some(line) = graph_hash_line {
183        header_lines.push(line);
184    }
185    header_lines.push(String::new());
186
187    // ---- R.8 detail-stage: skip the money-table sweep entirely ---
188    // and run only the per-stage bottleneck profile at three scale
189    // points. The money-table rows are not the goal in this mode.
190    // Gated on `feature = "cuda"` because the underlying `_timed`
191    // dispatch lives in dsfb-gpu-debug-cuda and is meaningless
192    // without GPU support; non-cuda builds report and skip.
193    if detail_stage {
194        #[cfg(feature = "cuda")]
195        {
196            let stage_iters = if quick { 5 } else { 20 };
197            let stage_warmup = if quick { 1 } else { 3 };
198            if compact {
199                // R.11 path: requires the tree-digest backend
200                // because the compact dispatch consumes the
201                // tree-digest GPU pipeline's outputs.
202                run_r11_compact_compare(stage_warmup, stage_iters);
203            } else {
204                run_r8_detail_stage(stage_warmup, stage_iters, tree_digest);
205            }
206        }
207        #[cfg(not(feature = "cuda"))]
208        {
209            let _ = (quick, tree_digest, compact);
210            println!("--detail-stage requires --features cuda; nothing to profile");
211        }
212        return ExitCode::SUCCESS;
213    }
214
215    // ---- canonical 16x128 K=32 ------------------------------------
216    {
217        let plan = scale_iters(IterPlan::CANONICAL, quick);
218        let n_entities = 16u32;
219        let n_windows = 128u32;
220        let k = 32u32;
221        let mut contract = Contract::canonical();
222        contract.pin_bank_hash(bank_hash());
223        contract.pin_detector_registry_hash(registry_hash());
224        let events = synthesize(DEFAULT_SEED);
225
226        let cpu_b = run_layer_b_cpu_always(&events, &contract, plan.warmup, plan.iters);
227        let cpu_b_med = median(&cpu_b);
228        rows.push(MoneyRow {
229            label: format!(
230                "canonical 16x128 K={k:>3}  CPU Layer B          {}",
231                quick_tag(quick)
232            ),
233            n_entities,
234            n_windows,
235            n_catalogs: 1,
236            samples_us: cpu_b.clone(),
237            baseline_us: cpu_b_med,
238        });
239
240        run_gpu_row(
241            &mut rows,
242            &format!("canonical 16x128 K={k:>3}  GPU Layer A          "),
243            n_entities,
244            n_windows,
245            k,
246            GpuRow::LayerA,
247            &events,
248            &contract,
249            plan,
250            cpu_b_med,
251            quick,
252        );
253
254        run_gpu_row(
255            &mut rows,
256            &format!("canonical 16x128 K={k:>3}  GPU Layer B          "),
257            n_entities,
258            n_windows,
259            k,
260            GpuRow::LayerB,
261            &events,
262            &contract,
263            plan,
264            cpu_b_med,
265            quick,
266        );
267
268        // Layer C @ K=1 only — audit transcripts at K>1 aren't
269        // architecturally meaningful here (the audit court emits one
270        // canonical case file). Reported as K=1 with explicit note.
271        let cpu_c = run_layer_c_cpu(&events, &contract, plan.warmup, plan.iters);
272        rows.push(MoneyRow {
273            label: format!(
274                "canonical 16x128 K=  1  CPU Layer C (audit)  {}",
275                quick_tag(quick)
276            ),
277            n_entities,
278            n_windows,
279            n_catalogs: 1,
280            samples_us: cpu_c,
281            baseline_us: cpu_b_med,
282        });
283
284        #[cfg(feature = "cuda")]
285        {
286            let gpu_c = run_layer_c_gpu(&events, &contract, plan.warmup, plan.iters);
287            rows.push(MoneyRow {
288                label: format!(
289                    "canonical 16x128 K=  1  GPU Layer C (audit)  {}",
290                    quick_tag(quick)
291                ),
292                n_entities,
293                n_windows,
294                n_catalogs: 1,
295                samples_us: gpu_c,
296                baseline_us: cpu_b_med,
297            });
298        }
299    }
300
301    // ---- scale-large 256x4096 K ∈ {1, 16, 64, 128} ----------------
302    if !skip_large {
303        let n_entities = 256u32;
304        let n_windows = 4096u32;
305        let mut contract = Contract::scaled(n_entities, n_windows);
306        contract.pin_bank_hash(bank_hash());
307        contract.pin_detector_registry_hash(registry_hash());
308        let events = synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4);
309
310        // CPU Layer B at this scale is the only CPU comparator we
311        // measure; per-catalog cost stays constant regardless of K
312        // (each catalog runs independently on CPU). Measure once.
313        let plan_cpu = scale_iters(IterPlan::LARGE_K1, quick);
314        let cpu_b = run_layer_b_cpu_always(&events, &contract, plan_cpu.warmup, plan_cpu.iters);
315        let cpu_b_med = median(&cpu_b);
316        rows.push(MoneyRow {
317            label: format!(
318                "scaled  256x4096 K=  1  CPU Layer B          {}",
319                quick_tag(quick)
320            ),
321            n_entities,
322            n_windows,
323            n_catalogs: 1,
324            samples_us: cpu_b.clone(),
325            baseline_us: cpu_b_med,
326        });
327
328        // K=128 is opt-in via `--big-k`: it needs ~15 GB VRAM and
329        // ~30s of fixture-synthesis CPU work per row, which is
330        // wasteful by default. Hosts that can afford it pass
331        // `--big-k` to include it.
332        let large_sweep: &[(u32, IterPlan)] = if big_k {
333            &[
334                (1u32, IterPlan::LARGE_K1),
335                (16, IterPlan::LARGE_K16),
336                (64, IterPlan::LARGE_K64),
337                (128, IterPlan::LARGE_K128),
338            ]
339        } else {
340            &[
341                (1u32, IterPlan::LARGE_K1),
342                (16, IterPlan::LARGE_K16),
343                (64, IterPlan::LARGE_K64),
344            ]
345        };
346        for &(k, plan_const) in large_sweep {
347            let plan = scale_iters(plan_const, quick);
348
349            run_gpu_row(
350                &mut rows,
351                &format!("scaled  256x4096 K={k:>3}  GPU Layer A          "),
352                n_entities,
353                n_windows,
354                k,
355                GpuRow::LayerA,
356                &events,
357                &contract,
358                plan,
359                cpu_b_med,
360                quick,
361            );
362
363            run_gpu_row(
364                &mut rows,
365                &format!("scaled  256x4096 K={k:>3}  GPU Layer B          "),
366                n_entities,
367                n_windows,
368                k,
369                GpuRow::LayerB,
370                &events,
371                &contract,
372                plan,
373                cpu_b_med,
374                quick,
375            );
376        }
377
378        // Layer C at scale-large is intentionally not run by default:
379        // materialising every intermediate cell at 256×4096 is the
380        // "court transcript cost" the paper documents but does not
381        // sell as the headline. Record a not-run row so the reader
382        // sees the deliberate omission rather than a missing slot.
383        rows.push(MoneyRow::not_run(
384            "scaled  256x4096 K=  1  Layer C (audit)      [not run: transcript materialisation cost]",
385            n_entities,
386            n_windows,
387            1,
388        ));
389    }
390
391    // ---- emit -----------------------------------------------------
392    let report = render_report(&header_lines, &rows);
393    print!("{report}");
394
395    if let Some(parent) = out_path.parent() {
396        let _ = std::fs::create_dir_all(parent);
397    }
398    match std::fs::write(&out_path, &report) {
399        Ok(()) => {
400            println!("wrote money table -> {}", out_path.display());
401            ExitCode::SUCCESS
402        }
403        Err(e) => {
404            eprintln!("warning: could not write {}: {e}", out_path.display());
405            // Non-fatal: the console output is the primary deliverable.
406            ExitCode::SUCCESS
407        }
408    }
409}
410
/// Which GPU dispatch path a money-table row measures.
#[derive(Clone, Copy)]
enum GpuRow {
    /// Layer A — device evidence fabric.
    LayerA,
    /// Layer B — throughput verdict summary.
    LayerB,
}
416
417#[allow(clippy::too_many_arguments)]
418fn run_gpu_row(
419    rows: &mut Vec<MoneyRow>,
420    label_prefix: &str,
421    n_entities: u32,
422    n_windows: u32,
423    k: u32,
424    which: GpuRow,
425    events: &[TraceEvent],
426    contract: &Contract,
427    plan: IterPlan,
428    baseline_us: u128,
429    quick: bool,
430) {
431    #[cfg(feature = "cuda")]
432    {
433        // For K=1 the run_layer_* helpers expect `batch=0` (their
434        // single-catalog branch); for K>=1 they take `batch=K`.
435        let batch = if k == 1 { 0 } else { k };
436        let label = format!("{label_prefix}{}", quick_tag(quick));
437
438        // The legacy bench helpers `.expect()` on workspace
439        // construction, which would abort the whole sweep if a
440        // large-K row OOMs (e.g. K=128 at 256x4096 needs ~15 GB).
441        // We wrap each row in `catch_unwind` so an alloc failure
442        // marks just this row as not-run and the sweep continues
443        // with the remaining rows. This preserves the panel-locked
444        // "honest reporting" rule: a missing row is recorded with
445        // a reason rather than fabricated.
446        let events_owned: Vec<TraceEvent> = events.to_vec();
447        let contract_owned = contract.clone();
448        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| match which {
449            GpuRow::LayerA => run_layer_a(
450                &events_owned,
451                &contract_owned,
452                batch,
453                plan.warmup,
454                plan.iters,
455            ),
456            GpuRow::LayerB => run_layer_b(
457                &events_owned,
458                &contract_owned,
459                batch,
460                plan.warmup,
461                plan.iters,
462            ),
463        }));
464        if let Ok(samples) = result {
465            rows.push(MoneyRow {
466                label,
467                n_entities,
468                n_windows,
469                n_catalogs: k,
470                samples_us: samples,
471                baseline_us,
472            });
473        } else {
474            let row_label = format!("{label} [not run: alloc refused or kernel error]");
475            rows.push(MoneyRow::not_run(&row_label, n_entities, n_windows, k));
476        }
477    }
478    #[cfg(not(feature = "cuda"))]
479    {
480        let _ = (
481            label_prefix,
482            n_entities,
483            n_windows,
484            k,
485            which,
486            events,
487            contract,
488            plan,
489            baseline_us,
490            quick,
491        );
492        rows.push(MoneyRow::not_run(
493            "(GPU rows skipped: not built with --features cuda)",
494            n_entities,
495            n_windows,
496            k,
497        ));
498    }
499}
500
/// One line of the money table.
struct MoneyRow {
    /// Text shown in the label column.
    label: String,
    /// Entity count of the fixture grid.
    n_entities: u32,
    /// Window count of the fixture grid.
    n_windows: u32,
    /// Catalog count (K) used for the per-catalog and throughput columns.
    n_catalogs: u32,
    /// Measured samples in microseconds; empty means the row did not run.
    samples_us: Vec<u128>,
    /// Baseline median in microseconds for the speedup column; 0 means none.
    baseline_us: u128,
}

impl MoneyRow {
    /// Build a placeholder row for a measurement that was not taken.
    ///
    /// The row carries no samples and no baseline; `label` should state
    /// the reason.
    fn not_run(label: &str, n_entities: u32, n_windows: u32, n_catalogs: u32) -> Self {
        let samples_us: Vec<u128> = Vec::new();
        Self {
            label: String::from(label),
            n_entities,
            n_windows,
            n_catalogs,
            samples_us,
            baseline_us: 0,
        }
    }
}
522
523fn render_report(header_lines: &[String], rows: &[MoneyRow]) -> String {
524    use core::fmt::Write;
525    let mut out = String::new();
526    for line in header_lines {
527        out.push_str(line);
528        out.push('\n');
529    }
530    out.push_str(
531        "  label                                                    \
532         | median_us  | per_catalog_us | catalogs/sec | cells/sec     \
533         | det_evals/sec  | speedup_vs_cpu_b\n",
534    );
535    out.push_str(
536        "  ------------------------------------------------------- \
537         | ---------- | -------------- | ------------ | ------------- \
538         | -------------- | ----------------\n",
539    );
540    let n_detectors = u128::from(dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32);
541    let one_sec = 1_000_000u128;
542    for row in rows {
543        if row.samples_us.is_empty() {
544            let _ = writeln!(
545                out,
546                "  {:<55}|        n/a |            n/a |          n/a |           n/a |            n/a |              n/a",
547                row.label
548            );
549            continue;
550        }
551        let med = median(&row.samples_us);
552        let catalogs = u128::from(row.n_catalogs);
553        let cells = catalogs * u128::from(row.n_entities) * u128::from(row.n_windows);
554        let det_evals = cells * n_detectors;
555        let per_catalog = if catalogs > 0 { med / catalogs } else { med };
556        let catalogs_per_sec = if med > 0 { catalogs * one_sec / med } else { 0 };
557        let cells_per_sec = if med > 0 { cells * one_sec / med } else { 0 };
558        let det_evals_per_sec = if med > 0 {
559            det_evals * one_sec / med
560        } else {
561            0
562        };
563        let speedup = if med > 0 && row.baseline_us > 0 {
564            // Per-catalog speedup: how many times faster the GPU
565            // processes one catalog vs. the CPU Layer B baseline
566            // at the same fixture. Reported as integer-times.10ths
567            // (e.g. "  47.3x") so we never need to cast u128 → f64.
568            let denom = per_catalog.max(1);
569            // Multiply numerator by 10 to capture one decimal place,
570            // then split for the format.
571            let ratio_times10 = (row.baseline_us * 10) / denom;
572            let whole = ratio_times10 / 10;
573            let tenth = ratio_times10 % 10;
574            format!("{whole:>10}.{tenth}x")
575        } else {
576            String::from("           n/a")
577        };
578        let _ = writeln!(
579            out,
580            "  {:<55}| {med:>10} | {per_catalog:>14} | {catalogs_per_sec:>12} | {cells_per_sec:>13} | {det_evals_per_sec:>14} | {speedup:>16}",
581            row.label
582        );
583    }
584    out
585}
586
/// Median of `samples`, taking the upper of the two middle values for
/// an even count. Returns 0 for an empty slice.
fn median(samples: &[u128]) -> u128 {
    let mut ordered: Vec<u128> = samples.to_vec();
    ordered.sort_unstable();
    // `get` yields `None` only for an empty input.
    ordered.get(ordered.len() / 2).copied().unwrap_or(0)
}
595
596fn scale_iters(p: IterPlan, quick: bool) -> IterPlan {
597    if !quick {
598        return p;
599    }
600    IterPlan {
601        warmup: p.warmup.div_ceil(5).max(1),
602        iters: p.iters.div_ceil(5).max(1),
603    }
604}
605
/// Label suffix marking rows measured in `--quick` mode; empty otherwise.
fn quick_tag(quick: bool) -> &'static str {
    match quick {
        true => "[quick]",
        false => "",
    }
}
613
/// R.6c — opt-in graph capture probe for the report header.
///
/// Returns the `# graph_status: …` line and, when capture succeeds, a
/// `# graph_plan_hash: …` line with the plan hash in lowercase hex.
/// Allocation or probe errors are reported in the status line and do
/// not fail the caller. Builds without the `cuda` feature return a
/// "skipped" status and no hash line.
fn probe_graph_capture() -> (String, Option<String>) {
    #[cfg(feature = "cuda")]
    {
        use std::fmt::Write;

        let mut contract = Contract::canonical();
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = synthesize(DEFAULT_SEED);

        let mut ws = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(e) => {
                return (
                    format!("# graph_status: error allocating pinned-async workspace ({e:?})"),
                    None,
                );
            }
        };

        match build_gpu_throughput_graph_or_demote(&events, &contract, &mut ws) {
            Ok((_case, GraphCaptureStatus::Captured { plan_hash })) => {
                let mut hex = String::with_capacity(64);
                for byte in &plan_hash {
                    let _ = write!(hex, "{byte:02x}");
                }
                let status = String::from("# graph_status: captured");
                (status, Some(format!("# graph_plan_hash: {hex}")))
            }
            Ok((_case, GraphCaptureStatus::Demoted { reason })) => {
                (format!("# graph_status: demoted ({reason})"), None)
            }
            Err(e) => (format!("# graph_status: error during probe ({e:?})"), None),
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        let status = String::from("# graph_status: skipped (built without --features cuda)");
        (status, None)
    }
}
656
/// Best-effort generation stamp for the human-readable report header.
///
/// Returns `"<N> epoch seconds"`, where `N` is whole seconds since the
/// Unix epoch, or `"unknown"` if the system clock reads earlier than
/// the epoch. The R doctrine forbids wall-clock values inside the hash
/// chain, but a stamp on the report header only identifies when the
/// report was generated. Plain epoch seconds avoid pulling in a
/// date-time crate.
fn chrono_like_timestamp_or_unknown() -> String {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or_else(
            |_| String::from("unknown"),
            |since_epoch| format!("{} epoch seconds", since_epoch.as_secs()),
        )
}
668
/// R.8 — run the per-stage bottleneck profiler at three K=1 scale
/// points and hand each point's median timings to
/// `print_and_write_r8`, which reports them per the module docs as
/// `reports/r8_bottleneck_<grid>_K1.txt`.
///
/// Scale points (panel-aligned):
/// 1. **canonical 16×128 K=1** — the v0 fixture.
/// 2. **mid-scale 64×512 K=1** — 16× more cells than canonical.
/// 3. **full-scale 256×4096 K=1** — the scale R.7 measures its K=64
///    row at; this file is the deliverable the R.9–R.11 campaign
///    plans against.
///
/// For each point the `_timed` dispatch is called `warmup` times with
/// results discarded, then `iters` times with host wall time and the
/// returned device/host stage timings recorded. A refused workspace
/// allocation skips that point; a dispatch error during measurement
/// prints the error and abandons the remaining points.
///
/// Honest scope note: the batched (K>1) throughput FFI does not yet
/// record per-stage timings, so K is locked to 1 here and the K=1
/// breakdown at 256×4096 serves as the proxy for R.7's K=64 row.
///
/// When `tree_digest` is set the function instead delegates to
/// `run_r8_5_tree_digest_compare` and returns.
#[cfg(feature = "cuda")]
fn run_r8_detail_stage(warmup: usize, iters: usize, tree_digest: bool) {
    use std::time::Instant;

    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace_timed, GpuWorkspace, R8HostStageTimings,
        R8StageTimings,
    };

    if tree_digest {
        // R.8.5 measurement path: the tree-digest dispatch is timed
        // end-to-end only, for comparison against the serial-digest
        // baseline measured below.
        run_r8_5_tree_digest_compare(warmup, iters);
        return;
    }

    // (label, n_entities, n_windows, K). K stays at 1 because the
    // `_timed` dispatch is the single-catalog async path.
    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    for &(label, n_entities, n_windows, k) in &points {
        println!();
        println!("=== R.8 Bottleneck Profile — {label} ===");
        println!("  warmup: {warmup}   iters: {iters}");

        // The canonical grid uses the v0 contract and fixture; every
        // other grid uses the scaled variants.
        let is_canonical = n_entities == 16 && n_windows == 128;
        let mut contract = if is_canonical {
            Contract::canonical()
        } else {
            Contract::scaled(n_entities, n_windows)
        };
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = if is_canonical {
            synthesize(DEFAULT_SEED)
        } else {
            synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4)
        };

        let mut ws = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(_) => {
                println!("  workspace alloc refused; skipping {label}");
                continue;
            }
        };

        // Warmup: results (including errors) are deliberately ignored.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace_timed(&events, &contract, &mut ws);
        }

        // Measured iterations.
        let mut wall_us: Vec<u128> = Vec::with_capacity(iters);
        let mut devs: Vec<R8StageTimings> = Vec::with_capacity(iters);
        let mut hosts: Vec<R8HostStageTimings> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome =
                build_gpu_throughput_pinned_async_on_workspace_timed(&events, &contract, &mut ws);
            let elapsed_us = started.elapsed().as_micros();
            let (case, dev, host) = match outcome {
                Ok(parts) => parts,
                Err(e) => {
                    println!("  dispatch error during R.8 measurement: {e:?}");
                    return;
                }
            };
            // Keep the optimizer from discarding the produced case.
            std::hint::black_box(case);
            devs.push(dev);
            hosts.push(host);
            wall_us.push(elapsed_us);
        }

        // Medians give a stable figure per stage.
        let med_wall = median_u128(&wall_us);
        let med_dev = median_stage(&devs);
        let med_host = median_host(&hosts);

        print_and_write_r8(label, n_entities, n_windows, k, med_dev, med_host, med_wall);
    }
}
795
/// Median of `samples` (upper middle for an even count; 0 when empty).
///
/// Thin alias over the file's `median` helper so the R.8 profiler and
/// the money table use one median definition instead of two copies.
#[cfg(feature = "cuda")]
fn median_u128(samples: &[u128]) -> u128 {
    median(samples)
}
805
/// Per-field median across the per-iteration device stage timings.
///
/// Every field is reduced independently: its values are sorted ascending and
/// the element at index `len / 2` is taken (upper middle for an even count).
/// The returned struct is therefore a composite and need not match any single
/// iteration. An empty slice yields `R8StageTimings::default()`.
///
/// Values are ordered with `f32::total_cmp`, which is a total order even when
/// a sample is NaN. The comparator passed to a slice sort must be a total
/// order; `partial_cmp` with an `Equal` fallback is not one once a NaN is
/// present.
#[cfg(feature = "cuda")]
fn median_stage(
    samples: &[dsfb_gpu_debug_cuda::R8StageTimings],
) -> dsfb_gpu_debug_cuda::R8StageTimings {
    if samples.is_empty() {
        return dsfb_gpu_debug_cuda::R8StageTimings::default();
    }
    let mid = samples.len() / 2;
    // Extract one field from every sample, sort it, and take the midpoint.
    let pick = |field: fn(&dsfb_gpu_debug_cuda::R8StageTimings) -> f32| -> f32 {
        let mut column: Vec<f32> = samples.iter().map(field).collect();
        column.sort_unstable_by(f32::total_cmp);
        column[mid]
    };
    dsfb_gpu_debug_cuda::R8StageTimings {
        h2d_us: pick(|s| s.h2d_us),
        residual_us: pick(|s| s.residual_us),
        sign_us: pick(|s| s.sign_us),
        detector_us: pick(|s| s.detector_us),
        consensus_us: pick(|s| s.consensus_us),
        candidate_us: pick(|s| s.candidate_us),
        digests_us: pick(|s| s.digests_us),
        d2h_us: pick(|s| s.d2h_us),
        total_device_us: pick(|s| s.total_device_us),
    }
}
831
/// Per-field median across the per-iteration host stage timings.
///
/// `features_us` and `bank_and_finalize_us` are reduced independently: each
/// column is sorted ascending and the element at index `len / 2` is taken
/// (upper middle for an even count). An empty slice yields
/// `R8HostStageTimings::default()`.
///
/// Values are ordered with `f32::total_cmp`, which is a total order even when
/// a sample is NaN. The comparator passed to a slice sort must be a total
/// order; `partial_cmp` with an `Equal` fallback is not one once a NaN is
/// present.
#[cfg(feature = "cuda")]
fn median_host(
    samples: &[dsfb_gpu_debug_cuda::R8HostStageTimings],
) -> dsfb_gpu_debug_cuda::R8HostStageTimings {
    if samples.is_empty() {
        return dsfb_gpu_debug_cuda::R8HostStageTimings::default();
    }
    let mid = samples.len() / 2;
    // Extract one field from every sample, sort it, and take the midpoint.
    let pick = |field: fn(&dsfb_gpu_debug_cuda::R8HostStageTimings) -> f32| -> f32 {
        let mut column: Vec<f32> = samples.iter().map(field).collect();
        column.sort_unstable_by(f32::total_cmp);
        column[mid]
    };
    dsfb_gpu_debug_cuda::R8HostStageTimings {
        features_us: pick(|s| s.features_us),
        bank_and_finalize_us: pick(|s| s.bank_and_finalize_us),
    }
}
849
/// R.8 — render the per-stage table for one scale point and write
/// it to `reports/r8_bottleneck_<grid>_K<K>.txt`. The "wall anchor"
/// in the % column is the host-measured median wall time per
/// iteration — that's the only number that can sum to 100 %. The
/// timed segments' sum is reported separately so a future engineer
/// can see how much wall time the cudaEvent + Instant slots
/// account for (target ≥95 % per R.8 honesty bar).
///
/// * `label`, `n_entities`, `n_windows`, `k` — identify the scale
///   point; the last three also form the report file name.
/// * `dev`, `host` — the device and host stage timings to tabulate
///   (the caller passes per-field medians).
/// * `med_wall_us` — median host wall time per iteration in µs. When
///   it is `0` the sum of the timed segments is used as the anchor.
///
/// The report is printed to stdout and then written under
/// `reports/`. A failed write is reported on stderr and otherwise
/// ignored so the remaining scale points still run.
///
/// The function is `#[allow(clippy::too_many_arguments)]` because
/// splitting these arguments into a struct adds boilerplate without
/// readability gain.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments, clippy::cast_precision_loss)]
fn print_and_write_r8(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    dev: dsfb_gpu_debug_cuda::R8StageTimings,
    host: dsfb_gpu_debug_cuda::R8HostStageTimings,
    med_wall_us: u128,
) {
    use core::fmt::Write;

    // Rows are listed in pipeline order; the table preserves it.
    let rows: [(&str, f32); 10] = [
        ("feature generation (host)", host.features_us),
        ("H2D", dev.h2d_us),
        ("residual", dev.residual_us),
        ("sign (drift/slew EWMA)", dev.sign_us),
        ("detector", dev.detector_us),
        ("consensus", dev.consensus_us),
        ("candidate collapse", dev.candidate_us),
        ("digests (4 kernels)", dev.digests_us),
        ("D2H", dev.d2h_us),
        ("bank + case finalize (host)", host.bank_and_finalize_us),
    ];

    let total_measured: f32 = rows.iter().map(|(_, us)| us).sum();
    // `med_wall_us` is the per-iter wall measured on the host
    // including all of the above. We report the wall as the
    // anchor so the percentages add up to <= 100% (the residual
    // is host orchestration outside the timed slots). The
    // `as u64 -> f32` narrowing dodges the u128 precision warning;
    // per-iter wall is well under 1 second at every scale.
    #[allow(clippy::cast_possible_truncation)]
    let anchor = if med_wall_us == 0 {
        total_measured
    } else {
        (med_wall_us as u64) as f32
    };

    // Share of the anchor in percent; 0 when there is nothing to divide by.
    let pct_of_wall = |us: f32| -> f32 {
        if anchor > 0.0 {
            (us / anchor) * 100.0
        } else {
            0.0
        }
    };

    let mut out = String::new();
    let _ = writeln!(out, "=== R.8 Bottleneck Profile — {label} ===");
    let _ = writeln!(
        out,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    let _ = writeln!(out, "median wall (host Instant): {med_wall_us} us");
    let _ = writeln!(out, "sum of timed segments       : {total_measured:.1} us");
    out.push('\n');
    out.push_str("  Stage                       us         % of wall\n");
    out.push_str("  -------------------------- ---------- -----------\n");
    for &(name, us) in &rows {
        let pct = pct_of_wall(us);
        let _ = writeln!(out, "  {name:<26} {us:>10.1} {pct:>9.1}%");
    }
    out.push_str("  -------------------------- ---------- -----------\n");
    let total_pct = pct_of_wall(total_measured);
    let _ = writeln!(
        out,
        "  total (timed segments)     {total_measured:>10.1} {total_pct:>9.1}%"
    );
    let total_device_us = dev.total_device_us;
    let total_device_pct = pct_of_wall(total_device_us);
    let _ = writeln!(
        out,
        "  total_device_us (event)    {total_device_us:>10.1} {total_device_pct:>9.1}%"
    );

    // Top 3 stages by absolute time. `total_cmp` keeps the comparator a
    // total order even if a timing is NaN; the stable sort keeps pipeline
    // order between equal timings.
    let mut sorted: Vec<(&str, f32)> = rows.to_vec();
    sorted.sort_by(|a, b| b.1.total_cmp(&a.1));
    out.push_str("\nTop 3 stages by absolute time:\n");
    for (i, &(name, us)) in sorted.iter().take(3).enumerate() {
        let pct = pct_of_wall(us);
        let rank = i + 1;
        let _ = writeln!(out, "  {rank}. {name} — {us:.1} us ({pct:.1}% of wall)");
    }

    print!("{out}");

    let filename = format!("r8_bottleneck_{n_entities}x{n_windows}_K{k}.txt");
    let path = std::path::Path::new("reports").join(filename);
    // Best effort: if the directory cannot be created, the write below
    // fails and reports the problem.
    let _ = std::fs::create_dir_all("reports");
    if let Err(e) = std::fs::write(&path, &out) {
        eprintln!("warning: could not write {}: {e}", path.display());
    } else {
        println!("wrote R.8 profile -> {}", path.display());
    }
}
966
/// R.8.5 — wall-time comparison of the serial-digest dispatch path
/// against the tree-digest path, at the same three K=1 scale points
/// the R.8 profiler uses.
///
/// For every scale point the function builds the contract and the
/// fixture events, allocates one workspace per digest mode, runs
/// `warmup` untimed dispatches followed by `iters` timed dispatches
/// for each mode, and hands the two median walls plus their ratio
/// (serial / tree, `0.0` when the tree median is zero) to
/// `print_and_write_r8_5`, which writes
/// `reports/r8_5_tree_compare_<grid>_K1.txt`.
///
/// Honest reporting: whatever the speedup turns out to be, that is
/// what gets published. The R.8.5 gate is a ≥5× drop on the digest
/// stage; a materially smaller wall ratio still produces a report.
///
/// A refused workspace allocation skips the scale point; a dispatch
/// error during measurement prints the error and ends the whole
/// comparison.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_lines)]
fn run_r8_5_tree_digest_compare(warmup: usize, iters: usize) {
    use std::time::Instant;

    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace,
        build_gpu_throughput_pinned_async_on_workspace_tree, GpuWorkspace,
    };

    // (label, n_entities, n_windows, K)
    let scale_points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    for &(label, n_entities, n_windows, k) in &scale_points {
        println!();
        println!("=== R.8.5 tree-digest comparison — {label} ===");
        println!("  warmup: {warmup}   iters: {iters}");

        // The 16x128 grid uses the canonical contract and fixture; every
        // other grid uses the scaled constructors.
        let is_canonical = n_entities == 16 && n_windows == 128;

        let mut contract = if is_canonical {
            dsfb_gpu_debug_core::contract::Contract::canonical()
        } else {
            dsfb_gpu_debug_core::contract::Contract::scaled(n_entities, n_windows)
        };
        contract.pin_bank_hash(dsfb_gpu_debug_core::bank::bank_hash());
        contract.pin_detector_registry_hash(dsfb_gpu_debug_core::motif::registry_hash());

        let events = if is_canonical {
            dsfb_gpu_debug_core::fixture::synthesize(dsfb_gpu_debug_core::fixture::DEFAULT_SEED)
        } else {
            dsfb_gpu_debug_core::fixture::synthesize_scaled(
                dsfb_gpu_debug_core::fixture::DEFAULT_SEED,
                n_entities,
                n_windows,
                4,
            )
        };

        // One workspace per digest mode.
        let mut ws_serial = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(workspace) => workspace,
            Err(e) => {
                println!("  workspace alloc refused: {e:?}; skipping {label}");
                continue;
            }
        };
        let mut ws_tree = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(workspace) => workspace,
            Err(e) => {
                println!("  workspace alloc refused: {e:?}; skipping {label}");
                continue;
            }
        };

        // Serial digest: untimed warmup, then timed iterations.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
        }
        let mut serial_samples: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
            let elapsed_us = started.elapsed().as_micros();
            match outcome {
                Ok(case) => {
                    std::hint::black_box(case);
                    serial_samples.push(elapsed_us);
                }
                failed @ Err(_) => {
                    println!("  serial-digest dispatch error: {failed:?}");
                    return;
                }
            }
        }

        // Tree digest: untimed warmup, then timed iterations.
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
        }
        let mut tree_samples: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
            let elapsed_us = started.elapsed().as_micros();
            match outcome {
                Ok(case) => {
                    std::hint::black_box(case);
                    tree_samples.push(elapsed_us);
                }
                failed @ Err(_) => {
                    println!("  tree-digest dispatch error: {failed:?}");
                    return;
                }
            }
        }

        let med_serial = median_u128(&serial_samples);
        let med_tree = median_u128(&tree_samples);
        // Going u128 → u64 → f64 sidesteps clippy's u128 precision lint;
        // a per-iteration wall in µs is far below 2^53, so nothing is lost.
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio = match med_tree {
            0 => 0.0,
            tree => (med_serial as u64) as f64 / (tree as u64) as f64,
        };
        print_and_write_r8_5(label, n_entities, n_windows, k, med_serial, med_tree, ratio);
    }
}
1100
/// R.8.5 — format the serial-vs-tree wall comparison for one scale
/// point, print it to stdout, and save it as
/// `reports/r8_5_tree_compare_<grid>_K<K>.txt`.
///
/// Only the measured medians and the precomputed `ratio` are
/// reported; the trailing notes are fixed explanatory text. A
/// failed file write is reported on stderr and otherwise ignored.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments)]
fn print_and_write_r8_5(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    med_serial_us: u128,
    med_tree_us: u128,
    ratio: f64,
) {
    use core::fmt::Write;

    // Fixed explanatory text appended after the numbers, one entry per line.
    const NOTES: [&str; 7] = [
        "  * Both paths run the same 5 pipeline kernels (residual, sign, detector,",
        "    consensus, candidate). They differ only in the digest stage: serial",
        "    uses 4 single-thread `*_digest_kernel_batched` kernels; tree uses one",
        "    block per chunk (~2048 chunks at 256x4096 with 16 KiB chunks) feeding",
        "    a final root SHA-256 over the ordered leaf digests + domain separator.",
        "  * Stage hash bytes differ between modes by construction; case-file",
        "    metadata records `digest_mode` so replay catches a mode mismatch.",
    ];

    let mut report = String::new();
    let _ = writeln!(report, "=== R.8.5 tree-digest comparison — {label} ===");
    let _ = writeln!(
        report,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    report.push('\n');
    let _ = writeln!(report, "  serial-digest median wall: {med_serial_us:>10} us");
    let _ = writeln!(report, "  tree-digest   median wall: {med_tree_us:>10} us");
    let _ = writeln!(report, "  wall-time ratio (serial / tree): {ratio:.2}x");
    report.push('\n');
    report.push_str("Notes:\n");
    for line in &NOTES {
        report.push_str(line);
        report.push('\n');
    }

    print!("{report}");

    let path = std::path::Path::new("reports").join(format!(
        "r8_5_tree_compare_{n_entities}x{n_windows}_K{k}.txt"
    ));
    // Best effort: a missing directory surfaces as a write error below.
    let _ = std::fs::create_dir_all("reports");
    match std::fs::write(&path, &report) {
        Ok(()) => println!("wrote R.8.5 comparison -> {}", path.display()),
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}
1167
/// R.11 — three-way wall-time comparison: serial-digest vs
/// tree-digest vs tree-digest + compact-verdict finalizer, at the
/// same three K=1 scale points as the R.8.5 comparison.
///
/// For every scale point the function builds the contract and the
/// fixture events, computes `FixtureHashes` once (outside the timed
/// loops, so compact dispatches consume the precomputed value),
/// allocates one workspace per path, and for each path runs
/// `warmup` untimed dispatches followed by `iters` timed ones. The
/// three median walls and the serial/compact and tree/compact
/// ratios (`0.0` when the compact median is zero) go to
/// `print_and_write_r11`, which writes
/// `reports/r11_compact_compare_<grid>_K1.txt`.
///
/// Honest reporting: every number measured is reported as-is. The
/// R.11 gate is "host bank + case-finalize drops by ≥5×"; because
/// the full dispatch wall is timed (not just the host segment), the
/// headline is the wall ratio of tree-only vs tree+compact. A
/// smaller ratio still produces a report.
///
/// A refused workspace allocation skips the scale point; a dispatch
/// error during measurement prints the error and ends the whole
/// comparison.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_lines)]
fn run_r11_compact_compare(warmup: usize, iters: usize) {
    use std::time::Instant;

    use dsfb_gpu_debug_core::casefile::FixtureHashes;
    use dsfb_gpu_debug_core::window::compute_features;
    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace,
        build_gpu_throughput_pinned_async_on_workspace_tree,
        build_gpu_throughput_pinned_async_on_workspace_tree_compact, GpuWorkspace,
    };

    // (label, n_entities, n_windows, K)
    let scale_points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    for &(label, n_entities, n_windows, k) in &scale_points {
        println!();
        println!("=== R.11 compact-verdict comparison — {label} ===");
        println!("  warmup: {warmup}   iters: {iters}");

        // The 16x128 grid uses the canonical contract and fixture; every
        // other grid uses the scaled constructors.
        let is_canonical = n_entities == 16 && n_windows == 128;

        let mut contract = if is_canonical {
            dsfb_gpu_debug_core::contract::Contract::canonical()
        } else {
            dsfb_gpu_debug_core::contract::Contract::scaled(n_entities, n_windows)
        };
        contract.pin_bank_hash(dsfb_gpu_debug_core::bank::bank_hash());
        contract.pin_detector_registry_hash(dsfb_gpu_debug_core::motif::registry_hash());

        let events = if is_canonical {
            dsfb_gpu_debug_core::fixture::synthesize(dsfb_gpu_debug_core::fixture::DEFAULT_SEED)
        } else {
            dsfb_gpu_debug_core::fixture::synthesize_scaled(
                dsfb_gpu_debug_core::fixture::DEFAULT_SEED,
                n_entities,
                n_windows,
                4,
            )
        };

        // Fixture hashes are computed once per scale point, before any
        // timing starts; the compact path reuses them on every dispatch
        // instead of re-hashing.
        let window_ns = u64::from(contract.window_size_ms) * 1_000_000;
        let features = compute_features(
            &events,
            contract.n_windows,
            contract.n_entities,
            window_ns,
        );
        let fixture = FixtureHashes::compute(&events, &features);

        // One workspace per measured path.
        let mut ws_serial = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(workspace) => workspace,
            Err(_) => {
                println!("  workspace alloc refused; skipping {label}");
                continue;
            }
        };
        let mut ws_tree = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(workspace) => workspace,
            Err(_) => {
                println!("  workspace alloc refused; skipping {label}");
                continue;
            }
        };
        let mut ws_compact = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(workspace) => workspace,
            Err(_) => {
                println!("  workspace alloc refused; skipping {label}");
                continue;
            }
        };

        // Serial digest: untimed warmup, then timed iterations.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
        }
        let mut serial_samples: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
            let elapsed_us = started.elapsed().as_micros();
            let case = match outcome {
                Ok(case) => case,
                Err(e) => {
                    println!("  serial-digest dispatch error: {e:?}");
                    return;
                }
            };
            std::hint::black_box(case);
            serial_samples.push(elapsed_us);
        }

        // Tree digest with the non-compact finalizer.
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
        }
        let mut tree_samples: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
            let elapsed_us = started.elapsed().as_micros();
            let case = match outcome {
                Ok(case) => case,
                Err(e) => {
                    println!("  tree-digest dispatch error: {e:?}");
                    return;
                }
            };
            std::hint::black_box(case);
            tree_samples.push(elapsed_us);
        }

        // Tree digest with the compact-verdict finalizer.
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_tree_compact(
                &events,
                &contract,
                &mut ws_compact,
                &fixture,
            );
        }
        let mut compact_samples: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome = build_gpu_throughput_pinned_async_on_workspace_tree_compact(
                &events,
                &contract,
                &mut ws_compact,
                &fixture,
            );
            let elapsed_us = started.elapsed().as_micros();
            let case = match outcome {
                Ok(case) => case,
                Err(e) => {
                    println!("  compact-verdict dispatch error: {e:?}");
                    return;
                }
            };
            std::hint::black_box(case);
            compact_samples.push(elapsed_us);
        }

        let med_serial = median_u128(&serial_samples);
        let med_tree = median_u128(&tree_samples);
        let med_compact = median_u128(&compact_samples);
        // Going u128 → u64 → f64 sidesteps clippy's precision lint; a
        // per-iteration wall in µs is far below 2^53, so nothing is lost.
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio_serial_to_compact = match med_compact {
            0 => 0.0,
            compact => (med_serial as u64) as f64 / (compact as u64) as f64,
        };
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio_tree_to_compact = match med_compact {
            0 => 0.0,
            compact => (med_tree as u64) as f64 / (compact as u64) as f64,
        };
        print_and_write_r11(
            label,
            n_entities,
            n_windows,
            k,
            med_serial,
            med_tree,
            med_compact,
            ratio_serial_to_compact,
            ratio_tree_to_compact,
        );
    }
}
1368
/// R.11 — format the three-way wall-time comparison for one scale
/// point, print it to stdout, and save it as
/// `reports/r11_compact_compare_<grid>_K<K>.txt`.
///
/// Only the measured medians and the two precomputed ratios are
/// reported; the notes section is fixed text that frames the
/// architectural posture for a reader picking the report up cold.
/// A failed file write is reported on stderr and otherwise ignored.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments)]
fn print_and_write_r11(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    med_serial_us: u128,
    med_tree_us: u128,
    med_compact_us: u128,
    ratio_serial_to_compact: f64,
    ratio_tree_to_compact: f64,
) {
    use core::fmt::Write;

    // Fixed explanatory text appended after the numbers, one entry per line.
    const NOTES: [&str; 12] = [
        "  * Serial = legacy R.6b path (4 single-thread digest kernels + non-compact builder).",
        "  * Tree = R.8.5 path (block-parallel tree digest + non-compact builder).",
        "  * Compact = R.11 path (tree digest + FixtureHashes precomputed once).",
        "  * `FixtureHashes` is computed ONCE per scale point outside the iter loop,",
        "    matching how a long-running deployment caller would amortise the input",
        "    commitment hash across many dispatches against the same fixture.",
        "  * Case files from all three paths are byte-identical for the serial vs.",
        "    serial pairing, and the tree pair is internally byte-identical;",
        "    serial ≠ tree because tree commits to chunked stage bytes + a domain",
        "    separator. Compact ≡ tree byte-for-byte by construction.",
        "  * Semantic Non-Bypass Axiom holds in every path: `bank_collapse` is the",
        "    only mint of `BankAdmissionToken`. The compact builder reuses it.",
    ];

    // Caption (including the aligned colon) paired with its measured value.
    let walls: [(&str, u128); 3] = [
        ("  serial-digest                  : ", med_serial_us),
        ("  tree-digest (R.8.5)            : ", med_tree_us),
        ("  tree-digest + compact (R.11)   : ", med_compact_us),
    ];
    let ratios: [(&str, f64); 2] = [
        ("  wall ratio serial / compact    : ", ratio_serial_to_compact),
        ("  wall ratio tree   / compact    : ", ratio_tree_to_compact),
    ];

    let mut report = String::new();
    let _ = writeln!(report, "=== R.11 compact-verdict comparison — {label} ===");
    let _ = writeln!(
        report,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    report.push('\n');
    for &(caption, micros) in &walls {
        let _ = writeln!(report, "{caption}{micros:>10} us");
    }
    report.push('\n');
    for &(caption, ratio) in &ratios {
        let _ = writeln!(report, "{caption}{ratio:.2}x");
    }
    report.push('\n');
    report.push_str("Notes:\n");
    for line in &NOTES {
        report.push_str(line);
        report.push('\n');
    }

    print!("{report}");

    let path = std::path::Path::new("reports").join(format!(
        "r11_compact_compare_{n_entities}x{n_windows}_K{k}.txt"
    ));
    // Best effort: a missing directory surfaces as a write error below.
    let _ = std::fs::create_dir_all("reports");
    match std::fs::write(&path, &report) {
        Ok(()) => println!("wrote R.11 comparison -> {}", path.display()),
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}
dsfb_gpu_debug_demo/cli/bench_gpu_scale.rs

dsfb_gpu_debug_demo/cli/
bench_gpu_scale.rs