Skip to main content

dsfb_gpu_debug_demo/cli/
bench.rs

1//! `dsfb-gpu-debug bench` — measure pipeline wall-clock on CPU and GPU.
2//!
3//! Times only the pipeline functions (no fixture I/O, no canonical JSON
4//! emission, no hash-chain construction outside the bare pipeline). The
5//! goal is to surface the cost of the deterministic-inference chain
6//! itself, separated from the surrounding orchestration.
7//!
8//! Flags:
9//!
10//! * `--iters N` (default 100): measured iterations.
11//! * `--warmup N` (default 10): warmup iterations excluded from stats.
12//!   Excluding the warmup from stats is what keeps first-call CUDA
13//!   context creation out of the published numbers — the user's
14//!   step 6 in the optimization roadmap.
15//! * `--backend cpu|gpu|both` (default `both`).
16//! * `--detail` (no value): also report per-stage CUDA-event timings
17//!   for the GPU path (alloc / H2D / kernel 1..5 / D2H / free / total).
18//!   Implies `--backend gpu` if not otherwise set.
19//!
20//! Numbers are reported as min / median / mean / max in microseconds.
21//! No formal benchmarking framework — this is a transparency tool, not
22//! a deployment performance claim. v0 launch geometry is 1 thread per
23//! entity (16 threads), which is dramatically under-utilized hardware;
24//! the bench reports what the architecture actually does today and
25//! makes that posture observable.
26//!
27//! `.expect()` is used inside the GPU bench so a CUDA error aborts
28//! loudly rather than silently producing meaningless timings.
29
30#![allow(clippy::expect_used)]
31
32use std::process::ExitCode;
33use std::time::Instant;
34
35use dsfb_gpu_debug_core::bank::bank_hash;
36use dsfb_gpu_debug_core::casefile::{build_cpu, build_cpu_throughput};
37use dsfb_gpu_debug_core::contract::Contract;
38use dsfb_gpu_debug_core::event::TraceEvent;
39use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
40use dsfb_gpu_debug_core::motif::registry_hash;
41
42#[cfg(feature = "cuda")]
43use dsfb_gpu_debug_cuda::{
44    build_gpu_batched_throughput, build_gpu_batched_throughput_device_digests,
45    build_gpu_layer_a_batched, build_gpu_layer_a_on_workspace, build_gpu_on_workspace,
46    build_gpu_throughput_device_digests_on_workspace, build_gpu_throughput_on_workspace,
47    build_gpu_timed_on_workspace, BatchedGpuWorkspace, GpuWorkspace,
48};
49
50use super::{parse_flags, usage_error};
51
52#[allow(clippy::too_many_lines)]
53pub fn parse_and_run(args: &[String]) -> ExitCode {
54    let flags = match parse_flags(args) {
55        Ok(f) => f,
56        Err(message) => return usage_error(&message),
57    };
58
59    let iters: usize = flags
60        .get("iters")
61        .map_or(100, |s| s.parse::<usize>().unwrap_or(100));
62    let warmup: usize = flags
63        .get("warmup")
64        .map_or(10, |s| s.parse::<usize>().unwrap_or(10));
65    // `--detail` accepts no value; presence in the flags map (under the
66    // hand-rolled parser's `--key value` convention) is enough. Treat any
67    // value other than `false` as opt-in.
68    let detail = flags.get("detail").is_some_and(|v| v != "false");
69    // If the user asked for `--detail` without setting `--backend`, run
70    // the GPU path; per-stage timings aren't meaningful for the CPU path.
71    let backend = if detail && !flags.contains_key("backend") {
72        "gpu"
73    } else {
74        flags.get("backend").map_or("both", String::as_str)
75    };
76    // `--mode audit|throughput|both`. Default is `audit` so the legacy
77    // numbers continue to be the headline. `both` runs each mode in
78    // sequence for direct comparison.
79    let mode = flags.get("mode").map_or("audit", String::as_str);
80    // `--layer A|B|C|all`. R.1: three-layer benchmark taxonomy.
81    //
82    //   * Layer A — device evidence fabric: GPU kernels + on-device
83    //     digests; no host bank stage. (Skip-bank specialization
84    //     lands in R.3; for R.1 we report the device-digests path
85    //     as Layer A and note the caveat in the output file.)
86    //   * Layer B — throughput verdict summary: Layer A + host bank
87    //     stage finalises compact candidates into admitted episodes.
88    //   * Layer C — full audit court: every intermediate cell
89    //     materialised host-side; canonical-JSON case file emitted.
90    //
91    // When `--layer` is unset, the bench falls through to the legacy
92    // `--mode`-based dispatch for backwards compatibility. When set,
93    // the bench runs the selected layer's path and writes a layer
94    // report to `reports/layer_<L>_<grid>x<windows>_K<N>.txt`.
95    let layer = flags.get("layer").map(String::as_str);
96    // `--scale n_entities x n_windows`. When set, replaces the canonical
97    // 16x128 grid with a larger one and uses `synthesize_scaled` to fill
98    // it. Recommended values for the paper's scaling table: 16x128,
99    // 64x512, 128x1024, 256x1024.
100    //
101    // `--scale-large` (R.2): shorthand for `--scale 256x4096`, the
102    // panel-locked headline profile for the courthouse-factory money
103    // table. Mutually exclusive with `--scale`; explicit `--scale` wins.
104    let scale = flags.get("scale").and_then(|s| parse_scale(s)).or_else(|| {
105        flags
106            .get("scale-large")
107            .filter(|v| v.as_str() != "false")
108            .map(|_| (256u32, 4096u32))
109    });
110    // `--materialize-catalog J` (R.3a): when set under `--layer A`,
111    // after the Layer A bench reports the device-fabric measurements,
112    // synthesize catalog J via the courthouse-factory generator and
113    // run a single Layer C (full audit court) on it. The other K-1
114    // catalogs stay in compact-summary form. This is the panel's
115    // "docket digest with on-demand transcript expansion" — most
116    // catalogs ride the fast path; the one a reviewer asks about
117    // gets the full court transcript.
118    let materialize_catalog: Option<u32> = flags
119        .get("materialize-catalog")
120        .and_then(|s| s.parse::<u32>().ok());
121    // `--batch K`. When set (and the cuda feature is on), runs K
122    // independent catalogs through `build_gpu_batched_throughput` per
123    // iteration. Reports cases/sec and per-catalog amortized time so
124    // the GPU-side parallelism win is visible. The cfg gate silences
125    // the unused-variable lint on no-cuda builds; the rest of the
126    // logic below dispatches the batched runner conditionally.
127    #[cfg(feature = "cuda")]
128    let batch: u32 = flags
129        .get("batch")
130        .and_then(|s| s.parse::<u32>().ok())
131        .unwrap_or(0);
132    #[cfg(not(feature = "cuda"))]
133    let _ = flags.get("batch");
134
135    // Build the canonical inputs once.
136    let (events, contract_dims, scaled_label) = match scale {
137        None => (synthesize(DEFAULT_SEED), (16u32, 128u32), String::new()),
138        Some((n_entities, n_windows)) => {
139            let events = synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4);
140            (
141                events,
142                (n_entities, n_windows),
143                format!(" [scaled {n_entities}x{n_windows}]"),
144            )
145        }
146    };
147    let mut contract = if scale.is_some() {
148        Contract::scaled(contract_dims.0, contract_dims.1)
149    } else {
150        Contract::canonical()
151    };
152    contract.pin_bank_hash(bank_hash());
153    contract.pin_detector_registry_hash(registry_hash());
154
155    println!("dsfb-gpu-debug bench:{scaled_label}");
156    println!("  events    : {}", events.len());
157    println!("  n_entities: {}", contract.n_entities);
158    println!("  n_windows : {}", contract.n_windows);
159    println!("  warmup    : {warmup}");
160    println!("  iters     : {iters}");
161    println!();
162
163    let run_audit = mode == "audit" || mode == "both";
164    let run_throughput = mode == "throughput" || mode == "both";
165
166    // R.1 — three-layer benchmark taxonomy. When `--layer` is set, the
167    // layer-aware dispatcher runs and the legacy `--mode` path is
168    // skipped. The legacy path remains the default for backwards
169    // compatibility so existing bench invocations keep their numbers.
170    if let Some(layer_spec) = layer {
171        let layers: &[char] = match layer_spec {
172            "A" | "a" => &['A'],
173            "B" | "b" => &['B'],
174            "C" | "c" => &['C'],
175            "all" | "ABC" | "abc" => &['A', 'B', 'C'],
176            other => {
177                eprintln!("unknown --layer {other:?}; expected A | B | C | all");
178                return ExitCode::from(1);
179            }
180        };
181        let reports_dir = std::path::Path::new("reports");
182        for &l in layers {
183            run_layer_bench(
184                l,
185                &events,
186                &contract,
187                warmup,
188                iters,
189                #[cfg(feature = "cuda")]
190                batch,
191                #[cfg(not(feature = "cuda"))]
192                0,
193                Some(reports_dir),
194            );
195        }
196        // R.3a — opt-in single-catalog transcript expansion. After the
197        // Layer A/B/C bench reports the fabric numbers, if the caller
198        // asked for `--materialize-catalog J`, run a Layer C audit on
199        // catalog J specifically and report it. This honours the
200        // "docket digest with on-demand transcript expansion" framing.
201        if let Some(j) = materialize_catalog {
202            run_materialize_catalog(j, &events, &contract, warmup.max(1), iters.max(1));
203        }
204        return ExitCode::SUCCESS;
205    }
206
207    if backend == "cpu" || backend == "both" {
208        if run_audit {
209            run_cpu_bench_audit(&events, &contract, warmup, iters);
210        }
211        if run_throughput {
212            run_cpu_bench_throughput(&events, &contract, warmup, iters);
213        }
214    }
215
216    // Tier 3B device-digest variants. `--device-digests` enables the
217    // on-device per-stage SHA-256 path. When also batched (`--batch K`),
218    // runs the parallel digest dispatcher with K SHA-256 streams. The
219    // case files are byte-identical to the host-digest Throughput path
220    // (pinned by `throughput_device_digests_equivalence`); only the
221    // wall-time bookkeeping differs.
222    #[cfg(feature = "cuda")]
223    let device_digests = flags.get("device-digests").is_some_and(|v| v != "false");
224    #[cfg(not(feature = "cuda"))]
225    let _ = flags.get("device-digests");
226
227    #[cfg(feature = "cuda")]
228    if backend == "gpu" || backend == "both" {
229        if detail {
230            run_gpu_bench_with_detail(&events, &contract, warmup, iters);
231        } else {
232            if run_audit {
233                run_gpu_bench_audit(&events, &contract, warmup, iters);
234            }
235            if run_throughput {
236                run_gpu_bench_throughput(&events, &contract, warmup, iters);
237            }
238            if device_digests && run_throughput {
239                run_gpu_bench_throughput_device_digests(&events, &contract, warmup, iters);
240            }
241            if batch > 0 {
242                run_gpu_bench_batched(&events, &contract, batch, warmup, iters);
243            }
244            if batch > 0 && device_digests {
245                run_gpu_bench_batched_device_digests(&events, &contract, batch, warmup, iters);
246            }
247        }
248    }
249    #[cfg(not(feature = "cuda"))]
250    if backend == "gpu" || backend == "both" {
251        let _ = detail;
252        println!("GPU pipeline: built without --features cuda; skipping");
253    }
254
255    ExitCode::SUCCESS
256}
257
/// Decode a grid specification of the form `<n_entities>x<n_windows>`
/// (for example `"256x1024"`).
///
/// The text is split at the first lowercase `x`; both halves must parse as
/// `u32` and both must be non-zero. Any other input yields `None`, leaving
/// the fallback decision to the caller.
fn parse_scale(s: &str) -> Option<(u32, u32)> {
    let (entities_text, windows_text) = s.split_once('x')?;
    match (entities_text.parse::<u32>(), windows_text.parse::<u32>()) {
        // A zero-sized axis is treated as malformed rather than as an
        // empty grid.
        (Ok(entities), Ok(windows)) if entities != 0 && windows != 0 => Some((entities, windows)),
        _ => None,
    }
}
270
271fn run_cpu_bench_audit(events: &[TraceEvent], contract: &Contract, warmup: usize, iters: usize) {
272    for _ in 0..warmup {
273        let _ = std::hint::black_box(build_cpu(events, contract));
274    }
275    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
276    for _ in 0..iters {
277        let t0 = Instant::now();
278        let case = build_cpu(events, contract);
279        let dt = t0.elapsed().as_micros();
280        std::hint::black_box(case);
281        samples_us.push(dt);
282    }
283    report("CPU pipeline (Audit, build_cpu)", &samples_us);
284}
285
286fn run_cpu_bench_throughput(
287    events: &[TraceEvent],
288    contract: &Contract,
289    warmup: usize,
290    iters: usize,
291) {
292    for _ in 0..warmup {
293        let _ = std::hint::black_box(build_cpu_throughput(events, contract));
294    }
295    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
296    for _ in 0..iters {
297        let t0 = Instant::now();
298        let case = build_cpu_throughput(events, contract);
299        let dt = t0.elapsed().as_micros();
300        std::hint::black_box(case);
301        samples_us.push(dt);
302    }
303    report(
304        "CPU pipeline (Throughput, build_cpu_throughput)",
305        &samples_us,
306    );
307}
308
/// Time `build_gpu_on_workspace`: `warmup` untimed calls followed by
/// `iters` timed calls on one shared `GpuWorkspace`, then print the summary
/// via `report`.
///
/// # Panics
///
/// Panics (via `expect`) if workspace construction or any pipeline call
/// returns an error, so a failed run never yields timing output.
#[cfg(feature = "cuda")]
fn run_gpu_bench_audit(events: &[TraceEvent], contract: &Contract, warmup: usize, iters: usize) {
    // The workspace is created once, ahead of both loops, so its
    // construction cost is never part of a sample.
    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    (0..warmup).for_each(|_| {
        let case = build_gpu_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
        std::hint::black_box(case);
    });

    let samples_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let case =
                build_gpu_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(case);
            elapsed_us
        })
        .collect();

    report(
        "GPU pipeline (Audit, workspace-resident, sm_75/80/89)",
        &samples_us,
    );
}
333
/// Time `build_gpu_batched_throughput` over `batch` catalogs per call:
/// `warmup` untimed calls followed by `iters` timed calls on one shared
/// `BatchedGpuWorkspace`. Prints the standard summary via `report`, then a
/// per-catalog amortized time and cases/sec derived from the median sample.
///
/// The amortized line is skipped when there are no samples (`iters == 0`),
/// and the per-catalog figure falls back to the raw median if `batch` is 0,
/// so the summary arithmetic itself can never panic.
///
/// # Panics
///
/// Panics (via `expect`) if workspace construction or any pipeline call
/// returns an error.
#[cfg(feature = "cuda")]
fn run_gpu_bench_batched(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) {
    // R.2 courthouse-factory generator. Catalog 0 reuses the bench's
    // canonical `events` slice (so single-vs-batched comparisons share
    // catalog 0 exactly); the remaining K-1 catalogs come from the
    // factory at the same `(n_entities, n_windows)` grid, seeded with
    // `DEFAULT_SEED` offset (wrapping add) by the 64-bit golden-ratio
    // constant so they differ from catalog 0.
    let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    fixtures.push(events.to_vec());
    if batch > 1 {
        let extra = dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
            batch - 1,
            contract.n_entities,
            contract.n_windows,
            4,
        );
        fixtures.extend(extra);
    }
    let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();

    // One workspace shared by warmup and measured iterations.
    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");

    for _ in 0..warmup {
        let cases =
            build_gpu_batched_throughput(&mut workspace, &event_slices, contract).expect("CUDA");
        std::hint::black_box(cases);
    }

    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
    for _ in 0..iters {
        let t0 = Instant::now();
        let cases =
            build_gpu_batched_throughput(&mut workspace, &event_slices, contract).expect("CUDA");
        let dt = t0.elapsed().as_micros();
        std::hint::black_box(cases);
        samples_us.push(dt);
    }

    let label = format!("GPU pipeline (Batched K={batch}, workspace-resident, sm_75/80/89)");
    report(&label, &samples_us);

    // Per-catalog amortized time + cases/sec, both derived from the median
    // (upper middle element for even sample counts).
    let mut sorted = samples_us.clone();
    sorted.sort_unstable();
    if let Some(&median_us) = sorted.get(sorted.len() / 2) {
        let per_catalog_us = median_us
            .checked_div(u128::from(batch))
            .unwrap_or(median_us);
        let cases_per_sec = if median_us > 0 {
            1_000_000u128 * u128::from(batch) / median_us
        } else {
            0
        };
        println!(
            "  per-catalog amortized: {per_catalog_us} us    throughput: {cases_per_sec} cases/sec"
        );
    }
    println!();
}
405
/// Time `build_gpu_throughput_on_workspace`: `warmup` untimed calls
/// followed by `iters` timed calls on one shared `GpuWorkspace`, then print
/// the summary via `report`.
///
/// # Panics
///
/// Panics (via `expect`) if workspace construction or any pipeline call
/// returns an error.
#[cfg(feature = "cuda")]
fn run_gpu_bench_throughput(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    // Created once so workspace construction is outside every sample.
    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    (0..warmup).for_each(|_| {
        let case = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
            .expect("CUDA pipeline");
        std::hint::black_box(case);
    });

    let samples_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let case = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
                .expect("CUDA pipeline");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(case);
            elapsed_us
        })
        .collect();

    report(
        "GPU pipeline (Throughput, workspace-resident, sm_75/80/89)",
        &samples_us,
    );
}
433
/// Time `build_gpu_timed_on_workspace` and report, for every measured
/// iteration, the host wall-clock time plus the ten per-stage readings
/// carried in the returned timings value (alloc, H2D, kernels 1..5, D2H,
/// free, total). Each series is summarised on one line by `report_inline`.
///
/// # Panics
///
/// Panics (via `expect`) if workspace construction or any pipeline call
/// returns an error.
#[cfg(feature = "cuda")]
fn run_gpu_bench_with_detail(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    // Output labels for the per-stage series, in the same order as the
    // readings gathered inside the measured loop below.
    const STAGE_LABELS: [&str; 10] = [
        "device alloc       ",
        "H2D (window feats) ",
        "k1 residual_field  ",
        "k2 drift_slew_sign ",
        "k3 detector_motif  ",
        "k4 consensus_grid  ",
        "k5 candidate_coll. ",
        "D2H (all stages)   ",
        "device free        ",
        "device total       ",
    ];

    // One workspace shared by warmup and measured iterations.
    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    // Warmup goes through the timed entry point as well, so whatever that
    // path sets up on first use happens before the measured samples.
    for _ in 0..warmup {
        let (case, _) =
            build_gpu_timed_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
        std::hint::black_box(case);
    }

    let mut wall_us: Vec<u128> = Vec::with_capacity(iters);
    let mut stage_us: [Vec<u128>; 10] = std::array::from_fn(|_| Vec::with_capacity(iters));

    for _ in 0..iters {
        let started = Instant::now();
        let (case, t) =
            build_gpu_timed_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
        let elapsed_us = started.elapsed().as_micros();
        std::hint::black_box(case);
        wall_us.push(elapsed_us);

        // Same order as `STAGE_LABELS`.
        let readings = [
            t.alloc_us,
            t.h2d_us,
            t.k1_residual_us,
            t.k2_sign_us,
            t.k3_detector_us,
            t.k4_consensus_us,
            t.k5_candidate_us,
            t.d2h_us,
            t.free_us,
            t.total_us,
        ];
        for (series, reading) in stage_us.iter_mut().zip(readings) {
            push_f32_as_u128(series, reading);
        }
    }

    println!("GPU pipeline (build_gpu_timed --detail)");
    report_inline("host wall time     ", &wall_us);
    for (&stage_label, series) in STAGE_LABELS.iter().zip(stage_us.iter()) {
        report_inline(stage_label, series);
    }
    println!();
}
501
502/// Convert a `f32` microsecond reading to `u128` for stat aggregation.
503/// CUDA events return millisecond floats; the kernel wrapper has
504/// already converted to µs. We round to the nearest integer µs to
505/// keep the sample type uniform with the wall-clock samples.
506#[cfg(feature = "cuda")]
fn push_f32_as_u128(samples: &mut Vec<u128>, val: f32) {
    // Round to the nearest whole microsecond first, then floor the result
    // at zero: negative readings and NaN both end up stored as 0.
    let nearest = val.round();
    let non_negative = if nearest > 0.0 { nearest } else { 0.0 };
    samples.push(non_negative as u128);
}
510
511#[cfg(feature = "cuda")]
/// Print a one-line min / median / mean / max summary of `samples_us`,
/// prefixed by `label`. Prints nothing when `samples_us` is empty.
///
/// The median is the element at index `len / 2` of the sorted samples (the
/// upper middle for even counts); the mean uses integer division.
fn report_inline(label: &str, samples_us: &[u128]) {
    let mut ordered: Vec<u128> = samples_us.to_vec();
    ordered.sort_unstable();
    // An empty series has neither a first nor a last element: stay silent.
    let (Some(&min), Some(&max)) = (ordered.first(), ordered.last()) else {
        return;
    };
    let median = ordered[ordered.len() / 2];
    let total: u128 = ordered.iter().sum();
    let mean = total / ordered.len() as u128;
    println!(
        "  {label}  min={min:>6} us  median={median:>6} us  mean={mean:>6} us  max={max:>6} us"
    );
}
527
/// Print a multi-line min / median / mean / max / sample-count summary of
/// `samples_us` under the heading `label`.
///
/// The median is the element at index `len / 2` of the sorted samples (the
/// upper middle for even counts); the mean uses integer division. An empty
/// sample set (for example `--iters 0`) is reported as all zeros with
/// `samples: 0` instead of panicking on the median lookup.
fn report(label: &str, samples_us: &[u128]) {
    let mut sorted = samples_us.to_vec();
    sorted.sort_unstable();
    let n = sorted.len() as u128;
    let min = sorted.first().copied().unwrap_or(0);
    let max = sorted.last().copied().unwrap_or(0);
    // `get` rather than indexing: index 0 of an empty vector is out of
    // bounds, and min / max / mean already degrade to 0 in that case.
    let median = sorted.get(sorted.len() / 2).copied().unwrap_or(0);
    let sum: u128 = sorted.iter().sum();
    let mean = sum.checked_div(n).unwrap_or(0);
    println!("{label}");
    println!("  min    : {min:>8} us");
    println!("  median : {median:>8} us");
    println!("  mean   : {mean:>8} us");
    println!("  max    : {max:>8} us");
    println!("  samples: {n}");
    println!();
}
545
/// Print the layer-specific metrics block for one benchmark run and, when
/// `out_dir` is given, also write it to a text file in that directory.
///
/// `samples_us` holds one wall-time sample per iteration, in microseconds.
/// Besides min / median / mean / max, the block reports throughput figures
/// derived from the median sample:
///
/// * catalogs/sec — `n_catalogs` per median;
/// * cells/sec — `n_catalogs * n_entities * n_windows` per median;
/// * detector-evals/sec — cells times `n_detectors` per median;
/// * per-catalog amortized time — median divided by `n_catalogs` (the raw
///   median when `n_catalogs` is 0).
///
/// All rates are 0 when the median is 0, which includes the empty-sample
/// case. The median is the element at index `len / 2` of the sorted
/// samples; all divisions are integer divisions.
///
/// The file is named
/// `layer_<layer><file_tag>_<n_entities>x<n_windows>_K<n_catalogs>.txt`.
/// Directory creation is best-effort; a failed write is reported as a
/// warning on stderr and does not abort.
#[allow(clippy::too_many_arguments)]
fn report_layer(
    label: &str,
    samples_us: &[u128],
    layer: char,
    n_entities: u32,
    n_windows: u32,
    n_catalogs: u32,
    n_detectors: u32,
    out_dir: Option<&std::path::Path>,
    file_tag: &str,
) {
    let mut ordered: Vec<u128> = samples_us.to_vec();
    ordered.sort_unstable();
    let n_samples = ordered.len() as u128;
    let min = ordered.first().copied().unwrap_or(0);
    let max = ordered.last().copied().unwrap_or(0);
    let median = match ordered.get(ordered.len() / 2) {
        Some(&middle) => middle,
        None => 0,
    };
    let total: u128 = ordered.iter().sum();
    let mean = if n_samples == 0 { 0 } else { total / n_samples };

    // Work counts for a single iteration.
    let catalogs = u128::from(n_catalogs);
    let cells = catalogs * u128::from(n_entities) * u128::from(n_windows);
    let det_evals = cells * u128::from(n_detectors);

    // Convert "work items per median iteration" into "work items per
    // second"; a zero median yields a zero rate instead of dividing by it.
    let per_second = |work_items: u128| -> u128 {
        if median > 0 {
            work_items * 1_000_000u128 / median
        } else {
            0
        }
    };
    let catalogs_per_sec = per_second(catalogs);
    let cells_per_sec = per_second(cells);
    let det_evals_per_sec = per_second(det_evals);
    let per_catalog_us = if catalogs > 0 {
        median / catalogs
    } else {
        median
    };

    println!("{label} [Layer {layer}]");
    println!("  min                  : {min:>10} us");
    println!("  median               : {median:>10} us");
    println!("  mean                 : {mean:>10} us");
    println!("  max                  : {max:>10} us");
    println!("  samples              : {n_samples}");
    println!("  n_catalogs (K)       : {n_catalogs}");
    println!("  per-catalog amortized: {per_catalog_us:>10} us");
    println!("  catalogs/sec         : {catalogs_per_sec}");
    println!("  cells/sec            : {cells_per_sec}");
    println!("  detector-evals/sec   : {det_evals_per_sec}");
    println!();

    // Nothing further to do without a report directory.
    let Some(out_dir) = out_dir else {
        return;
    };

    // Best-effort: if the directory cannot be created, the write below
    // fails and is reported there.
    let _ = std::fs::create_dir_all(out_dir);
    let path =
        out_dir.join(format!("layer_{layer}{file_tag}_{n_entities}x{n_windows}_K{n_catalogs}.txt"));

    // File rows: every key is left-aligned in a 17-column field followed
    // by ": " and the value.
    let rows: [(&str, u128); 13] = [
        ("n_entities", u128::from(n_entities)),
        ("n_windows", u128::from(n_windows)),
        ("n_catalogs (K)", catalogs),
        ("n_detectors", u128::from(n_detectors)),
        ("samples", n_samples),
        ("min_us", min),
        ("median_us", median),
        ("mean_us", mean),
        ("max_us", max),
        ("per_catalog_us", per_catalog_us),
        ("catalogs_per_sec", catalogs_per_sec),
        ("cells_per_sec", cells_per_sec),
        ("det_evals_per_sec", det_evals_per_sec),
    ];
    let mut body = format!("{label} [Layer {layer}]\n");
    for (key, value) in rows {
        body.push_str(&format!("{key:<17}: {value}\n"));
    }

    match std::fs::write(&path, body) {
        Ok(()) => {
            println!("  wrote layer report -> {}", path.display());
            println!();
        }
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}
648
/// Time `build_gpu_throughput_device_digests_on_workspace`: `warmup`
/// untimed calls followed by `iters` timed calls on one shared
/// `GpuWorkspace`, then print the summary via `report`.
///
/// # Panics
///
/// Panics (via `expect`) if workspace construction or any pipeline call
/// returns an error.
#[cfg(feature = "cuda")]
fn run_gpu_bench_throughput_device_digests(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    // Created once so workspace construction is outside every sample.
    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    (0..warmup).for_each(|_| {
        // Note the argument order: this entry point takes the workspace last.
        let case =
            build_gpu_throughput_device_digests_on_workspace(events, contract, &mut workspace)
                .expect("CUDA pipeline (device digests)");
        std::hint::black_box(case);
    });

    let samples_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let case =
                build_gpu_throughput_device_digests_on_workspace(events, contract, &mut workspace)
                    .expect("CUDA pipeline (device digests)");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(case);
            elapsed_us
        })
        .collect();

    report(
        "GPU pipeline (Throughput, Tier 3B on-device SHA-256, sm_75/80/89)",
        &samples_us,
    );
}
678
/// Time `build_gpu_batched_throughput_device_digests` over `batch` catalogs
/// per call: `warmup` untimed calls followed by `iters` timed calls on one
/// shared `BatchedGpuWorkspace`. Prints the standard summary via `report`,
/// then a per-catalog amortized time and cases/sec derived from the median
/// sample.
///
/// The amortized line is skipped when there are no samples (`iters == 0`),
/// and the per-catalog figure falls back to the raw median if `batch` is 0,
/// so the summary arithmetic itself can never panic.
///
/// # Panics
///
/// Panics (via `expect`) if workspace construction or any pipeline call
/// returns an error.
#[cfg(feature = "cuda")]
fn run_gpu_bench_batched_device_digests(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) {
    // R.2 courthouse-factory generator. Catalog 0 reuses the bench's
    // canonical `events` slice (so single-vs-batched comparisons share
    // catalog 0 exactly); the remaining K-1 catalogs come from the
    // factory at the same `(n_entities, n_windows)` grid, seeded with
    // `DEFAULT_SEED` offset (wrapping add) by the 64-bit golden-ratio
    // constant so they differ from catalog 0.
    let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    fixtures.push(events.to_vec());
    if batch > 1 {
        let extra = dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
            batch - 1,
            contract.n_entities,
            contract.n_windows,
            4,
        );
        fixtures.extend(extra);
    }
    let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();

    // One workspace shared by warmup and measured iterations.
    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");

    for _ in 0..warmup {
        let cases =
            build_gpu_batched_throughput_device_digests(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched device digests)");
        std::hint::black_box(cases);
    }
    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
    for _ in 0..iters {
        let t0 = Instant::now();
        let cases =
            build_gpu_batched_throughput_device_digests(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched device digests)");
        let dt = t0.elapsed().as_micros();
        std::hint::black_box(cases);
        samples_us.push(dt);
    }
    let label = format!("GPU pipeline (Batched K={batch}, Tier 3B on-device SHA-256, sm_75/80/89)");
    report(&label, &samples_us);

    // Per-catalog amortized time + cases/sec, both derived from the median
    // (upper middle element for even sample counts).
    let mut sorted = samples_us.clone();
    sorted.sort_unstable();
    if let Some(&median_us) = sorted.get(sorted.len() / 2) {
        let per_catalog_us = median_us
            .checked_div(u128::from(batch))
            .unwrap_or(median_us);
        let cases_per_sec = if median_us > 0 {
            1_000_000u128 * u128::from(batch) / median_us
        } else {
            0
        };
        println!(
            "  per-catalog amortized: {per_catalog_us} us    throughput: {cases_per_sec} cases/sec"
        );
    }
    println!();
}
743
744/// R.1 layer-aware bench runner. Dispatches one of the three layer
745/// flavours, collects samples, and emits a layer report (console +
746/// `reports/layer_<L>_<grid>x<windows>_K<N>.txt`).
747///
748/// Layer mapping at R.1 (skip-bank specialization for Layer A arrives in R.3):
749///   * Layer A — GPU device-digests path (closest current proxy for
750///     "device evidence fabric only"; full skip-bank lands in R.3).
751///   * Layer B — GPU throughput path with host bank stage.
752///   * Layer C — full audit court (CPU and GPU audit, side by side).
753///
754/// `batch` is the K from `--batch K`; 0 means single-catalog. When the
755/// cuda feature is off, GPU layers print a notice and return; CPU
756/// Layer C still runs.
757#[allow(clippy::too_many_lines)]
758fn run_layer_bench(
759    layer: char,
760    events: &[TraceEvent],
761    contract: &Contract,
762    warmup: usize,
763    iters: usize,
764    batch: u32,
765    out_dir: Option<&std::path::Path>,
766) {
767    let n_entities = contract.n_entities;
768    let n_windows = contract.n_windows;
769    let n_detectors = dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32;
770    // Reported K (1 for single-catalog, batch otherwise). Used inside the
771    // cuda branches; the `let _` silences the unused-variable lint on
772    // builds without the cuda feature where the variable is informational.
773    #[cfg(feature = "cuda")]
774    let n_catalogs = if batch == 0 { 1 } else { batch };
775    #[cfg(not(feature = "cuda"))]
776    let n_catalogs: u32 = if batch == 0 { 1 } else { batch };
777    #[cfg(not(feature = "cuda"))]
778    let _ = n_catalogs;
779
780    match layer {
781        'A' => {
782            #[cfg(feature = "cuda")]
783            {
784                let samples = run_layer_a(events, contract, batch, warmup, iters);
785                let label = if batch == 0 {
786                    String::from(
787                        "Layer A — device evidence fabric (Tier 3B device-digests, single-catalog)",
788                    )
789                } else {
790                    format!(
791                        "Layer A — device evidence fabric (Tier 3B device-digests, K={batch} batched)"
792                    )
793                };
794                report_layer(
795                    &label,
796                    &samples,
797                    'A',
798                    n_entities,
799                    n_windows,
800                    n_catalogs,
801                    n_detectors,
802                    out_dir,
803                    "",
804                );
805            }
806            #[cfg(not(feature = "cuda"))]
807            {
808                let _ = (events, contract, batch, warmup, iters);
809                println!("Layer A — GPU pipeline: built without --features cuda; skipping");
810                println!();
811            }
812        }
813        'B' => {
814            #[cfg(feature = "cuda")]
815            {
816                let samples = run_layer_b(events, contract, batch, warmup, iters);
817                let label = if batch == 0 {
818                    String::from(
819                        "Layer B — throughput verdict summary (host bank stage, single-catalog)",
820                    )
821                } else {
822                    format!(
823                        "Layer B — throughput verdict summary (host bank stage, K={batch} batched)"
824                    )
825                };
826                report_layer(
827                    &label,
828                    &samples,
829                    'B',
830                    n_entities,
831                    n_windows,
832                    n_catalogs,
833                    n_detectors,
834                    out_dir,
835                    "",
836                );
837            }
838            #[cfg(not(feature = "cuda"))]
839            {
840                let samples = run_layer_b_cpu(events, contract, warmup, iters);
841                report_layer(
842                    "Layer B — throughput verdict summary (CPU-only, no CUDA feature)",
843                    &samples,
844                    'B',
845                    n_entities,
846                    n_windows,
847                    1,
848                    n_detectors,
849                    out_dir,
850                    "_cpu",
851                );
852            }
853        }
854        'C' => {
855            // Layer C: full audit court. Run both CPU and GPU sides so
856            // the audit cost is visible per backend. The CPU and GPU
857            // reports write to distinct files via the `_cpu` / `_gpu`
858            // file_tag suffix so neither overwrites the other.
859            let cpu_samples = run_layer_c_cpu(events, contract, warmup, iters);
860            report_layer(
861                "Layer C — full audit court (CPU)",
862                &cpu_samples,
863                'C',
864                n_entities,
865                n_windows,
866                1,
867                n_detectors,
868                out_dir,
869                "_cpu",
870            );
871            #[cfg(feature = "cuda")]
872            {
873                let gpu_samples = run_layer_c_gpu(events, contract, warmup, iters);
874                report_layer(
875                    "Layer C — full audit court (GPU)",
876                    &gpu_samples,
877                    'C',
878                    n_entities,
879                    n_windows,
880                    1,
881                    n_detectors,
882                    out_dir,
883                    "_gpu",
884                );
885            }
886        }
887        other => {
888            eprintln!("run_layer_bench: unknown layer '{other}'");
889        }
890    }
891}
892
/// Layer A timing loop (GPU only).
///
/// Runs `warmup` untimed passes followed by `iters` timed passes and
/// returns one wall-clock sample per timed pass, in microseconds.
/// `batch == 0` selects the single-catalog workspace; any other value
/// selects the batched workspace with `batch` catalogs.
///
/// Panics (via `.expect`) if workspace allocation or any pipeline call
/// reports an error.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_a(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    // R.3a — Layer A goes through the skip-bank dispatch path
    // (`build_gpu_layer_a_*`), which per the R.3a notes yields
    // `CompactCaseSummary` values and leaves the host bank stage out, so
    // the samples reflect the evidence fabric alone. Single vs batched
    // dispatch follows the Tier 3B pattern from Section Q.
    if batch == 0 {
        let mut ws = GpuWorkspace::new(contract).expect("workspace allocation");
        for _ in 0..warmup {
            let warm = build_gpu_layer_a_on_workspace(events, contract, &mut ws)
                .expect("CUDA Layer A (skip-bank) pipeline");
            std::hint::black_box(warm);
        }
        return (0..iters)
            .map(|_| {
                let started = Instant::now();
                let summary = build_gpu_layer_a_on_workspace(events, contract, &mut ws)
                    .expect("CUDA Layer A (skip-bank) pipeline");
                // Stop the clock before the result is handed to black_box
                // and dropped.
                let elapsed_us = started.elapsed().as_micros();
                std::hint::black_box(summary);
                elapsed_us
            })
            .collect();
    }

    // R.2 courthouse-factory generator. Catalog 0 is a copy of the
    // caller's `events` (so single and batched runs share catalog 0
    // exactly); the other K-1 catalogs are produced by the factory on
    // the contract's `(n_entities, n_windows)` grid.
    let mut catalogs: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    catalogs.push(events.to_vec());
    if batch > 1 {
        catalogs.extend(
            dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
                dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
                batch - 1,
                contract.n_entities,
                contract.n_windows,
                4,
            ),
        );
    }
    let catalog_views: Vec<&[TraceEvent]> = catalogs.iter().map(|c| c.as_slice()).collect();

    let mut ws = BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");
    for _ in 0..warmup {
        let warm = build_gpu_layer_a_batched(&mut ws, &catalog_views, contract)
            .expect("CUDA Layer A (batched skip-bank) pipeline");
        std::hint::black_box(warm);
    }
    (0..iters)
        .map(|_| {
            let started = Instant::now();
            let summaries = build_gpu_layer_a_batched(&mut ws, &catalog_views, contract)
                .expect("CUDA Layer A (batched skip-bank) pipeline");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(summaries);
            elapsed_us
        })
        .collect()
}
959
/// Layer B timing loop (GPU throughput path).
///
/// Runs `warmup` untimed passes followed by `iters` timed passes and
/// returns one wall-clock sample per timed pass, in microseconds.
/// `batch == 0` uses the single-catalog workspace; otherwise a batched
/// workspace with `batch` catalogs is used.
///
/// Panics (via `.expect`) if workspace allocation or any pipeline call
/// reports an error.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_b(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    if batch == 0 {
        let mut ws = GpuWorkspace::new(contract).expect("workspace allocation");
        for _ in 0..warmup {
            let warm = build_gpu_throughput_on_workspace(&mut ws, events, contract)
                .expect("CUDA pipeline (throughput)");
            std::hint::black_box(warm);
        }
        return (0..iters)
            .map(|_| {
                let started = Instant::now();
                let case = build_gpu_throughput_on_workspace(&mut ws, events, contract)
                    .expect("CUDA pipeline (throughput)");
                // Stop the clock before the result is handed to black_box
                // and dropped.
                let elapsed_us = started.elapsed().as_micros();
                std::hint::black_box(case);
                elapsed_us
            })
            .collect();
    }

    // Catalog 0 is a copy of the caller's `events`; catalog i (i >= 1)
    // is synthesized from `DEFAULT_SEED + i * 0x9E37` (wrapping).
    // NOTE(review): `synthesize` receives only a seed here, whereas the
    // batched Layer A path passes the contract's grid to its generator —
    // confirm these extra catalogs match `contract` on scaled grids.
    let mut catalogs: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    catalogs.push(events.to_vec());
    for idx in 1..u64::from(batch) {
        let seed =
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(idx.wrapping_mul(0x9E37));
        catalogs.push(dsfb_gpu_debug_core::fixture::synthesize(seed));
    }
    let catalog_views: Vec<&[TraceEvent]> = catalogs.iter().map(|c| c.as_slice()).collect();

    let mut ws = BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");
    for _ in 0..warmup {
        let warm = build_gpu_batched_throughput(&mut ws, &catalog_views, contract)
            .expect("CUDA pipeline (batched throughput)");
        std::hint::black_box(warm);
    }
    (0..iters)
        .map(|_| {
            let started = Instant::now();
            let cases = build_gpu_batched_throughput(&mut ws, &catalog_views, contract)
                .expect("CUDA pipeline (batched throughput)");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(cases);
            elapsed_us
        })
        .collect()
}
1016
1017#[cfg(not(feature = "cuda"))]
1018fn run_layer_b_cpu(
1019    events: &[TraceEvent],
1020    contract: &Contract,
1021    warmup: usize,
1022    iters: usize,
1023) -> Vec<u128> {
1024    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
1025    for _ in 0..warmup {
1026        let _ = std::hint::black_box(build_cpu_throughput(events, contract));
1027    }
1028    for _ in 0..iters {
1029        let t0 = Instant::now();
1030        let case = build_cpu_throughput(events, contract);
1031        let dt = t0.elapsed().as_micros();
1032        std::hint::black_box(case);
1033        samples_us.push(dt);
1034    }
1035    samples_us
1036}
1037
1038/// CPU Layer B runner: single-catalog throughput case-file build on the
1039/// host. The headline comparator for the money table — every GPU row at
1040/// any scale measures speedup against this exact number. Always
1041/// available regardless of `--features cuda`.
1042pub(crate) fn run_layer_b_cpu_always(
1043    events: &[TraceEvent],
1044    contract: &Contract,
1045    warmup: usize,
1046    iters: usize,
1047) -> Vec<u128> {
1048    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
1049    for _ in 0..warmup {
1050        let _ = std::hint::black_box(build_cpu_throughput(events, contract));
1051    }
1052    for _ in 0..iters {
1053        let t0 = Instant::now();
1054        let case = build_cpu_throughput(events, contract);
1055        let dt = t0.elapsed().as_micros();
1056        std::hint::black_box(case);
1057        samples_us.push(dt);
1058    }
1059    samples_us
1060}
1061
1062pub(crate) fn run_layer_c_cpu(
1063    events: &[TraceEvent],
1064    contract: &Contract,
1065    warmup: usize,
1066    iters: usize,
1067) -> Vec<u128> {
1068    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
1069    for _ in 0..warmup {
1070        let _ = std::hint::black_box(build_cpu(events, contract));
1071    }
1072    for _ in 0..iters {
1073        let t0 = Instant::now();
1074        let case = build_cpu(events, contract);
1075        let dt = t0.elapsed().as_micros();
1076        std::hint::black_box(case);
1077        samples_us.push(dt);
1078    }
1079    samples_us
1080}
1081
/// GPU Layer C runner: times the audit pipeline
/// (`build_gpu_on_workspace`) on a single reusable `GpuWorkspace`.
///
/// The workspace is allocated once, outside the timed region. After
/// `warmup` untimed passes, `iters` timed passes are run and their
/// durations returned in microseconds, in run order.
///
/// Panics (via `.expect`) if workspace allocation or any pipeline call
/// reports an error.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_c_gpu(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    let mut ws = GpuWorkspace::new(contract).expect("workspace allocation");

    for _ in 0..warmup {
        let warm =
            build_gpu_on_workspace(&mut ws, events, contract).expect("CUDA pipeline (audit)");
        std::hint::black_box(warm);
    }

    (0..iters)
        .map(|_| {
            let started = Instant::now();
            let case =
                build_gpu_on_workspace(&mut ws, events, contract).expect("CUDA pipeline (audit)");
            // Read the clock before the result is consumed and dropped.
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(case);
            elapsed_us
        })
        .collect()
}
1106
1107/// R.3a — opt-in transcript expansion for a specific catalog index.
1108///
1109/// When the caller asks `--materialize-catalog J` alongside a `--layer`
1110/// run, this routine synthesises catalog `J` deterministically via the
1111/// courthouse-factory generator and runs a single Layer C audit court
1112/// over it. The Layer C report writes to
1113/// `reports/layer_C_materialize_J_<grid>x<windows>_K1.txt` so it does
1114/// not collide with the Layer C aggregate file from the main run.
1115///
1116/// `J = 0` re-runs catalog 0 (which is the bench's primary `events`
1117/// vector); other indices use the same Knuth-golden-ratio seed
1118/// derivation as the batched Layer A/B paths.
1119fn run_materialize_catalog(
1120    j: u32,
1121    primary_events: &[TraceEvent],
1122    contract: &Contract,
1123    warmup: usize,
1124    iters: usize,
1125) {
1126    let events: Vec<TraceEvent> = if j == 0 {
1127        primary_events.to_vec()
1128    } else {
1129        let derived_seed = dsfb_gpu_debug_core::fixture::DEFAULT_SEED
1130            .wrapping_add(0x9E37_79B9_7F4A_7C15)
1131            ^ u64::from(j - 1).wrapping_mul(0x9E37_79B9_7F4A_7C15);
1132        dsfb_gpu_debug_core::fixture::synthesize_scaled(
1133            derived_seed,
1134            contract.n_entities,
1135            contract.n_windows,
1136            4,
1137        )
1138    };
1139
1140    println!();
1141    println!("Materialising catalog J={j} as Layer C transcript on demand (R.3a opt-in)");
1142    println!("  derived events     : {}", events.len());
1143
1144    let n_entities = contract.n_entities;
1145    let n_windows = contract.n_windows;
1146    let n_detectors = dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32;
1147    let reports_dir = std::path::Path::new("reports");
1148
1149    let cpu_samples = run_layer_c_cpu(&events, contract, warmup, iters);
1150    let cpu_label = format!("Layer C — materialised catalog J={j} (CPU)");
1151    report_layer(
1152        &cpu_label,
1153        &cpu_samples,
1154        'C',
1155        n_entities,
1156        n_windows,
1157        1,
1158        n_detectors,
1159        Some(reports_dir),
1160        &format!("_materialize_{j}_cpu"),
1161    );
1162
1163    #[cfg(feature = "cuda")]
1164    {
1165        let gpu_samples = run_layer_c_gpu(&events, contract, warmup, iters);
1166        let gpu_label = format!("Layer C — materialised catalog J={j} (GPU)");
1167        report_layer(
1168            &gpu_label,
1169            &gpu_samples,
1170            'C',
1171            n_entities,
1172            n_windows,
1173            1,
1174            n_detectors,
1175            Some(reports_dir),
1176            &format!("_materialize_{j}_gpu"),
1177        );
1178    }
1179}