1#![allow(clippy::expect_used)]
31
32use std::process::ExitCode;
33use std::time::Instant;
34
35use dsfb_gpu_debug_core::bank::bank_hash;
36use dsfb_gpu_debug_core::casefile::{build_cpu, build_cpu_throughput};
37use dsfb_gpu_debug_core::contract::Contract;
38use dsfb_gpu_debug_core::event::TraceEvent;
39use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
40use dsfb_gpu_debug_core::motif::registry_hash;
41
42#[cfg(feature = "cuda")]
43use dsfb_gpu_debug_cuda::{
44 build_gpu_batched_throughput, build_gpu_batched_throughput_device_digests,
45 build_gpu_layer_a_batched, build_gpu_layer_a_on_workspace, build_gpu_on_workspace,
46 build_gpu_throughput_device_digests_on_workspace, build_gpu_throughput_on_workspace,
47 build_gpu_timed_on_workspace, BatchedGpuWorkspace, GpuWorkspace,
48};
49
50use super::{parse_flags, usage_error};
51
52#[allow(clippy::too_many_lines)]
53pub fn parse_and_run(args: &[String]) -> ExitCode {
54 let flags = match parse_flags(args) {
55 Ok(f) => f,
56 Err(message) => return usage_error(&message),
57 };
58
59 let iters: usize = flags
60 .get("iters")
61 .map_or(100, |s| s.parse::<usize>().unwrap_or(100));
62 let warmup: usize = flags
63 .get("warmup")
64 .map_or(10, |s| s.parse::<usize>().unwrap_or(10));
65 let detail = flags.get("detail").is_some_and(|v| v != "false");
69 let backend = if detail && !flags.contains_key("backend") {
72 "gpu"
73 } else {
74 flags.get("backend").map_or("both", String::as_str)
75 };
76 let mode = flags.get("mode").map_or("audit", String::as_str);
80 let layer = flags.get("layer").map(String::as_str);
96 let scale = flags.get("scale").and_then(|s| parse_scale(s)).or_else(|| {
105 flags
106 .get("scale-large")
107 .filter(|v| v.as_str() != "false")
108 .map(|_| (256u32, 4096u32))
109 });
110 let materialize_catalog: Option<u32> = flags
119 .get("materialize-catalog")
120 .and_then(|s| s.parse::<u32>().ok());
121 #[cfg(feature = "cuda")]
128 let batch: u32 = flags
129 .get("batch")
130 .and_then(|s| s.parse::<u32>().ok())
131 .unwrap_or(0);
132 #[cfg(not(feature = "cuda"))]
133 let _ = flags.get("batch");
134
135 let (events, contract_dims, scaled_label) = match scale {
137 None => (synthesize(DEFAULT_SEED), (16u32, 128u32), String::new()),
138 Some((n_entities, n_windows)) => {
139 let events = synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4);
140 (
141 events,
142 (n_entities, n_windows),
143 format!(" [scaled {n_entities}x{n_windows}]"),
144 )
145 }
146 };
147 let mut contract = if scale.is_some() {
148 Contract::scaled(contract_dims.0, contract_dims.1)
149 } else {
150 Contract::canonical()
151 };
152 contract.pin_bank_hash(bank_hash());
153 contract.pin_detector_registry_hash(registry_hash());
154
155 println!("dsfb-gpu-debug bench:{scaled_label}");
156 println!(" events : {}", events.len());
157 println!(" n_entities: {}", contract.n_entities);
158 println!(" n_windows : {}", contract.n_windows);
159 println!(" warmup : {warmup}");
160 println!(" iters : {iters}");
161 println!();
162
163 let run_audit = mode == "audit" || mode == "both";
164 let run_throughput = mode == "throughput" || mode == "both";
165
166 if let Some(layer_spec) = layer {
171 let layers: &[char] = match layer_spec {
172 "A" | "a" => &['A'],
173 "B" | "b" => &['B'],
174 "C" | "c" => &['C'],
175 "all" | "ABC" | "abc" => &['A', 'B', 'C'],
176 other => {
177 eprintln!("unknown --layer {other:?}; expected A | B | C | all");
178 return ExitCode::from(1);
179 }
180 };
181 let reports_dir = std::path::Path::new("reports");
182 for &l in layers {
183 run_layer_bench(
184 l,
185 &events,
186 &contract,
187 warmup,
188 iters,
189 #[cfg(feature = "cuda")]
190 batch,
191 #[cfg(not(feature = "cuda"))]
192 0,
193 Some(reports_dir),
194 );
195 }
196 if let Some(j) = materialize_catalog {
202 run_materialize_catalog(j, &events, &contract, warmup.max(1), iters.max(1));
203 }
204 return ExitCode::SUCCESS;
205 }
206
207 if backend == "cpu" || backend == "both" {
208 if run_audit {
209 run_cpu_bench_audit(&events, &contract, warmup, iters);
210 }
211 if run_throughput {
212 run_cpu_bench_throughput(&events, &contract, warmup, iters);
213 }
214 }
215
216 #[cfg(feature = "cuda")]
223 let device_digests = flags.get("device-digests").is_some_and(|v| v != "false");
224 #[cfg(not(feature = "cuda"))]
225 let _ = flags.get("device-digests");
226
227 #[cfg(feature = "cuda")]
228 if backend == "gpu" || backend == "both" {
229 if detail {
230 run_gpu_bench_with_detail(&events, &contract, warmup, iters);
231 } else {
232 if run_audit {
233 run_gpu_bench_audit(&events, &contract, warmup, iters);
234 }
235 if run_throughput {
236 run_gpu_bench_throughput(&events, &contract, warmup, iters);
237 }
238 if device_digests && run_throughput {
239 run_gpu_bench_throughput_device_digests(&events, &contract, warmup, iters);
240 }
241 if batch > 0 {
242 run_gpu_bench_batched(&events, &contract, batch, warmup, iters);
243 }
244 if batch > 0 && device_digests {
245 run_gpu_bench_batched_device_digests(&events, &contract, batch, warmup, iters);
246 }
247 }
248 }
249 #[cfg(not(feature = "cuda"))]
250 if backend == "gpu" || backend == "both" {
251 let _ = detail;
252 println!("GPU pipeline: built without --features cuda; skipping");
253 }
254
255 ExitCode::SUCCESS
256}
257
/// Parses a `<n_entities>x<n_windows>` scale specification.
///
/// The text is split at the first lowercase `x`; both halves must parse as
/// `u32` and be non-zero. Anything else yields `None`.
fn parse_scale(s: &str) -> Option<(u32, u32)> {
    let (lhs, rhs) = s.split_once('x')?;
    match (lhs.parse::<u32>(), rhs.parse::<u32>()) {
        (Ok(entities), Ok(windows)) if entities != 0 && windows != 0 => {
            Some((entities, windows))
        }
        _ => None,
    }
}
270
271fn run_cpu_bench_audit(events: &[TraceEvent], contract: &Contract, warmup: usize, iters: usize) {
272 for _ in 0..warmup {
273 let _ = std::hint::black_box(build_cpu(events, contract));
274 }
275 let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
276 for _ in 0..iters {
277 let t0 = Instant::now();
278 let case = build_cpu(events, contract);
279 let dt = t0.elapsed().as_micros();
280 std::hint::black_box(case);
281 samples_us.push(dt);
282 }
283 report("CPU pipeline (Audit, build_cpu)", &samples_us);
284}
285
286fn run_cpu_bench_throughput(
287 events: &[TraceEvent],
288 contract: &Contract,
289 warmup: usize,
290 iters: usize,
291) {
292 for _ in 0..warmup {
293 let _ = std::hint::black_box(build_cpu_throughput(events, contract));
294 }
295 let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
296 for _ in 0..iters {
297 let t0 = Instant::now();
298 let case = build_cpu_throughput(events, contract);
299 let dt = t0.elapsed().as_micros();
300 std::hint::black_box(case);
301 samples_us.push(dt);
302 }
303 report(
304 "CPU pipeline (Throughput, build_cpu_throughput)",
305 &samples_us,
306 );
307}
308
/// Benchmarks the GPU audit pipeline (`build_gpu_on_workspace`) on a single
/// workspace that is allocated once and reused for every iteration.
///
/// Runs `warmup` untimed iterations followed by `iters` timed iterations and
/// prints the summary via `report`.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_audit(events: &[TraceEvent], contract: &Contract, warmup: usize, iters: usize) {
    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    // Untimed warm-up passes.
    (0..warmup).for_each(|_| {
        let case = build_gpu_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
        std::hint::black_box(case);
    });

    let samples_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let case =
                build_gpu_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(case);
            elapsed_us
        })
        .collect();

    let label = "GPU pipeline (Audit, workspace-resident, sm_75/80/89)";
    report(label, &samples_us);
}
333
/// Benchmarks the batched GPU throughput pipeline
/// (`build_gpu_batched_throughput`) over `batch` fixtures.
///
/// The first fixture is a copy of `events`; when `batch > 1` the remaining
/// `batch - 1` fixtures come from `synthesize_courthouse_factory`, seeded
/// from `DEFAULT_SEED` plus a fixed constant and sized to the contract's
/// `n_entities` x `n_windows`. One `BatchedGpuWorkspace` is allocated and
/// reused for all `warmup` + `iters` runs.
///
/// Prints the usual `report` summary followed by the per-catalog amortized
/// time and the cases/sec throughput derived from the median sample. When no
/// samples were collected (`iters == 0`) the amortized line is skipped.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_batched(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) {
    let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    fixtures.push(events.to_vec());
    if batch > 1 {
        let extra = dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
            batch - 1,
            contract.n_entities,
            contract.n_windows,
            4,
        );
        fixtures.extend(extra);
    }
    let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();

    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");

    // Untimed warm-up passes.
    for _ in 0..warmup {
        let cases =
            build_gpu_batched_throughput(&mut workspace, &event_slices, contract).expect("CUDA");
        std::hint::black_box(cases);
    }

    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
    for _ in 0..iters {
        let t0 = Instant::now();
        let cases =
            build_gpu_batched_throughput(&mut workspace, &event_slices, contract).expect("CUDA");
        let dt = t0.elapsed().as_micros();
        std::hint::black_box(cases);
        samples_us.push(dt);
    }

    let label = format!("GPU pipeline (Batched K={batch}, workspace-resident, sm_75/80/89)");
    report(&label, &samples_us);

    // The samples are no longer needed unsorted, so sort them in place
    // instead of cloning.
    let mut sorted = samples_us;
    sorted.sort_unstable();
    // `get` instead of indexing: there is no median when `iters == 0`.
    if let Some(&median_us) = sorted.get(sorted.len() / 2) {
        let catalogs = u128::from(batch);
        // A zero batch falls back to the raw median rather than dividing by zero.
        let per_catalog_us = median_us.checked_div(catalogs).unwrap_or(median_us);
        let cases_per_sec = if median_us > 0 {
            1_000_000u128 * catalogs / median_us
        } else {
            0
        };
        println!(
            " per-catalog amortized: {per_catalog_us} us throughput: {cases_per_sec} cases/sec"
        );
    }
    println!();
}
405
/// Benchmarks the GPU throughput pipeline
/// (`build_gpu_throughput_on_workspace`) on a single workspace that is
/// allocated once and reused for every iteration.
///
/// Runs `warmup` untimed iterations followed by `iters` timed iterations and
/// prints the summary via `report`.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_throughput(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    // Untimed warm-up passes.
    (0..warmup).for_each(|_| {
        let case = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
            .expect("CUDA pipeline");
        std::hint::black_box(case);
    });

    let samples_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let case = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
                .expect("CUDA pipeline");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(case);
            elapsed_us
        })
        .collect();

    let label = "GPU pipeline (Throughput, workspace-resident, sm_75/80/89)";
    report(label, &samples_us);
}
433
/// Benchmarks `build_gpu_timed_on_workspace` and prints host wall time plus
/// the per-stage timings the pipeline reports for each run.
///
/// Runs `warmup` untimed iterations, then `iters` timed ones. For every timed
/// run the host-side wall time is recorded alongside the ten stage values
/// returned by the pipeline (alloc, H2D, k1..k5, D2H, free, total), each
/// converted with `push_f32_as_u128`. One summary line per series is printed
/// through `report_inline`.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_with_detail(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    // Labels for the stage columns, in the same order the values are pushed.
    const STAGE_LABELS: [&str; 10] = [
        "device alloc ",
        "H2D (window feats) ",
        "k1 residual_field ",
        "k2 drift_slew_sign ",
        "k3 detector_motif ",
        "k4 consensus_grid ",
        "k5 candidate_coll. ",
        "D2H (all stages) ",
        "device free ",
        "device total ",
    ];

    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    // Untimed warm-up passes; the timing record is discarded.
    (0..warmup).for_each(|_| {
        let (case, _) =
            build_gpu_timed_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
        std::hint::black_box(case);
    });

    let mut wall_us: Vec<u128> = Vec::with_capacity(iters);
    let mut stage_us: [Vec<u128>; 10] = std::array::from_fn(|_| Vec::with_capacity(iters));

    for _ in 0..iters {
        let started = Instant::now();
        let (case, t) =
            build_gpu_timed_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
        let elapsed_us = started.elapsed().as_micros();
        std::hint::black_box(case);
        wall_us.push(elapsed_us);

        let stage_values: [f32; 10] = [
            t.alloc_us,
            t.h2d_us,
            t.k1_residual_us,
            t.k2_sign_us,
            t.k3_detector_us,
            t.k4_consensus_us,
            t.k5_candidate_us,
            t.d2h_us,
            t.free_us,
            t.total_us,
        ];
        for (column, value) in stage_us.iter_mut().zip(stage_values) {
            push_f32_as_u128(column, value);
        }
    }

    println!("GPU pipeline (build_gpu_timed --detail)");
    report_inline("host wall time ", &wall_us);
    for (label, column) in STAGE_LABELS.iter().zip(&stage_us) {
        report_inline(label, column);
    }
    println!();
}
501
/// Appends `val` to `samples` as a whole number: the value is rounded to the
/// nearest integer and clamped to be non-negative before the cast to `u128`.
#[cfg(feature = "cuda")]
fn push_f32_as_u128(samples: &mut Vec<u128>, val: f32) {
    let non_negative = f32::max(val.round(), 0.0);
    samples.push(non_negative as u128);
}
510
/// Prints a one-line min / median / mean / max summary for `samples_us`.
///
/// Prints nothing when `samples_us` is empty. The median is the element at
/// index `len / 2` of the sorted samples; the mean uses integer division.
#[cfg(feature = "cuda")]
fn report_inline(label: &str, samples_us: &[u128]) {
    let mut ordered = samples_us.to_vec();
    ordered.sort_unstable();

    // No first/last element means there is nothing to summarize.
    let (Some(&min), Some(&max)) = (ordered.first(), ordered.last()) else {
        return;
    };
    let median = ordered[ordered.len() / 2];
    let mean = ordered.iter().sum::<u128>() / ordered.len() as u128;

    println!(
        " {label} min={min:>6} us median={median:>6} us mean={mean:>6} us max={max:>6} us"
    );
}
527
/// Prints a multi-line summary (min, median, mean, max, sample count) for a
/// series of microsecond samples under the heading `label`.
///
/// The median is the element at index `len / 2` of the sorted samples and the
/// mean uses integer division. An empty `samples_us` (for example when the
/// benchmark ran with zero iterations) reports every statistic as 0 instead
/// of panicking.
fn report(label: &str, samples_us: &[u128]) {
    let mut sorted = samples_us.to_vec();
    sorted.sort_unstable();
    let n = sorted.len() as u128;
    let min = sorted.first().copied().unwrap_or(0);
    let max = sorted.last().copied().unwrap_or(0);
    // `get` instead of indexing: index 0 of an empty vector would panic.
    let median = sorted.get(sorted.len() / 2).copied().unwrap_or(0);
    let sum: u128 = sorted.iter().sum();
    let mean = sum.checked_div(n).unwrap_or(0);
    println!("{label}");
    println!(" min : {min:>8} us");
    println!(" median : {median:>8} us");
    println!(" mean : {mean:>8} us");
    println!(" max : {max:>8} us");
    println!(" samples: {n}");
    println!();
}
545
/// Prints a layer benchmark summary and optionally writes it to a text file.
///
/// Statistics (min, median, mean, max) are computed from `samples_us`; the
/// median is the element at index `len / 2` of the sorted samples and all
/// statistics are 0 for an empty slice. Rates are derived from the median:
/// catalogs/sec, cells/sec (`n_catalogs * n_entities * n_windows`) and
/// detector-evals/sec (cells times `n_detectors`); each is 0 when the median
/// is 0.
///
/// When `out_dir` is `Some`, the directory is created on a best-effort basis
/// and the report is written to
/// `layer_{layer}{file_tag}_{n_entities}x{n_windows}_K{n_catalogs}.txt`
/// inside it. A failed write only emits a warning on stderr.
#[allow(clippy::too_many_arguments)]
fn report_layer(
    label: &str,
    samples_us: &[u128],
    layer: char,
    n_entities: u32,
    n_windows: u32,
    n_catalogs: u32,
    n_detectors: u32,
    out_dir: Option<&std::path::Path>,
    file_tag: &str,
) {
    let mut ordered = samples_us.to_vec();
    ordered.sort_unstable();
    let n_samples = ordered.len() as u128;
    let min = ordered.first().copied().unwrap_or(0);
    let max = ordered.last().copied().unwrap_or(0);
    let median = ordered.get(ordered.len() / 2).copied().unwrap_or(0);
    let mean = ordered
        .iter()
        .sum::<u128>()
        .checked_div(n_samples)
        .unwrap_or(0);

    let catalogs = u128::from(n_catalogs);
    let cells = catalogs * u128::from(n_entities) * u128::from(n_windows);
    let det_evals = cells * u128::from(n_detectors);
    // Events per second at the median latency; 0 when the median is 0.
    let per_sec = |count: u128| -> u128 {
        if median == 0 {
            0
        } else {
            count * 1_000_000u128 / median
        }
    };
    let catalogs_per_sec = per_sec(catalogs);
    let cells_per_sec = per_sec(cells);
    let det_evals_per_sec = per_sec(det_evals);
    // With zero catalogs the raw median is reported unchanged.
    let per_catalog_us = median.checked_div(catalogs).unwrap_or(median);

    println!("{label} [Layer {layer}]");
    println!(" min : {min:>10} us");
    println!(" median : {median:>10} us");
    println!(" mean : {mean:>10} us");
    println!(" max : {max:>10} us");
    println!(" samples : {n_samples}");
    println!(" n_catalogs (K) : {n_catalogs}");
    println!(" per-catalog amortized: {per_catalog_us:>10} us");
    println!(" catalogs/sec : {catalogs_per_sec}");
    println!(" cells/sec : {cells_per_sec}");
    println!(" detector-evals/sec : {det_evals_per_sec}");
    println!();

    let Some(dir) = out_dir else {
        return;
    };
    // Best effort: if the directory cannot be created the write below fails
    // and is reported as a warning.
    let _ = std::fs::create_dir_all(dir);
    let path =
        dir.join(format!("layer_{layer}{file_tag}_{n_entities}x{n_windows}_K{n_catalogs}.txt"));
    let body = format!(
        "{label} [Layer {layer}]\n\
         n_entities : {n_entities}\n\
         n_windows : {n_windows}\n\
         n_catalogs (K) : {n_catalogs}\n\
         n_detectors : {n_detectors}\n\
         samples : {n_samples}\n\
         min_us : {min}\n\
         median_us : {median}\n\
         mean_us : {mean}\n\
         max_us : {max}\n\
         per_catalog_us : {per_catalog_us}\n\
         catalogs_per_sec : {catalogs_per_sec}\n\
         cells_per_sec : {cells_per_sec}\n\
         det_evals_per_sec: {det_evals_per_sec}\n"
    );
    match std::fs::write(&path, body) {
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
        Ok(()) => {
            println!(" wrote layer report -> {}", path.display());
            println!();
        }
    }
}
648
/// Benchmarks `build_gpu_throughput_device_digests_on_workspace` on a single
/// workspace that is allocated once and reused for every iteration.
///
/// Runs `warmup` untimed iterations followed by `iters` timed iterations and
/// prints the summary via `report`.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_throughput_device_digests(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    // Untimed warm-up passes.
    (0..warmup).for_each(|_| {
        let case =
            build_gpu_throughput_device_digests_on_workspace(events, contract, &mut workspace)
                .expect("CUDA pipeline (device digests)");
        std::hint::black_box(case);
    });

    let samples_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let case =
                build_gpu_throughput_device_digests_on_workspace(events, contract, &mut workspace)
                    .expect("CUDA pipeline (device digests)");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(case);
            elapsed_us
        })
        .collect();

    let label = "GPU pipeline (Throughput, Tier 3B on-device SHA-256, sm_75/80/89)";
    report(label, &samples_us);
}
678
/// Benchmarks the batched device-digest GPU pipeline
/// (`build_gpu_batched_throughput_device_digests`) over `batch` fixtures.
///
/// The first fixture is a copy of `events`; when `batch > 1` the remaining
/// `batch - 1` fixtures come from `synthesize_courthouse_factory`, seeded
/// from `DEFAULT_SEED` plus a fixed constant and sized to the contract's
/// `n_entities` x `n_windows`. One `BatchedGpuWorkspace` is allocated and
/// reused for all `warmup` + `iters` runs.
///
/// Prints the usual `report` summary followed by the per-catalog amortized
/// time and the cases/sec throughput derived from the median sample. When no
/// samples were collected (`iters == 0`) the amortized line is skipped.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_batched_device_digests(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) {
    let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    fixtures.push(events.to_vec());
    if batch > 1 {
        let extra = dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
            batch - 1,
            contract.n_entities,
            contract.n_windows,
            4,
        );
        fixtures.extend(extra);
    }
    let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();

    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");

    // Untimed warm-up passes.
    for _ in 0..warmup {
        let cases =
            build_gpu_batched_throughput_device_digests(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched device digests)");
        std::hint::black_box(cases);
    }
    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
    for _ in 0..iters {
        let t0 = Instant::now();
        let cases =
            build_gpu_batched_throughput_device_digests(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched device digests)");
        let dt = t0.elapsed().as_micros();
        std::hint::black_box(cases);
        samples_us.push(dt);
    }
    let label = format!("GPU pipeline (Batched K={batch}, Tier 3B on-device SHA-256, sm_75/80/89)");
    report(&label, &samples_us);

    // The samples are no longer needed unsorted, so sort them in place
    // instead of cloning.
    let mut sorted = samples_us;
    sorted.sort_unstable();
    // `get` instead of indexing: there is no median when `iters == 0`.
    if let Some(&median_us) = sorted.get(sorted.len() / 2) {
        let catalogs = u128::from(batch);
        // A zero batch falls back to the raw median rather than dividing by zero.
        let per_catalog_us = median_us.checked_div(catalogs).unwrap_or(median_us);
        let cases_per_sec = if median_us > 0 {
            1_000_000u128 * catalogs / median_us
        } else {
            0
        };
        println!(
            " per-catalog amortized: {per_catalog_us} us throughput: {cases_per_sec} cases/sec"
        );
    }
    println!();
}
743
744#[allow(clippy::too_many_lines)]
758fn run_layer_bench(
759 layer: char,
760 events: &[TraceEvent],
761 contract: &Contract,
762 warmup: usize,
763 iters: usize,
764 batch: u32,
765 out_dir: Option<&std::path::Path>,
766) {
767 let n_entities = contract.n_entities;
768 let n_windows = contract.n_windows;
769 let n_detectors = dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32;
770 #[cfg(feature = "cuda")]
774 let n_catalogs = if batch == 0 { 1 } else { batch };
775 #[cfg(not(feature = "cuda"))]
776 let n_catalogs: u32 = if batch == 0 { 1 } else { batch };
777 #[cfg(not(feature = "cuda"))]
778 let _ = n_catalogs;
779
780 match layer {
781 'A' => {
782 #[cfg(feature = "cuda")]
783 {
784 let samples = run_layer_a(events, contract, batch, warmup, iters);
785 let label = if batch == 0 {
786 String::from(
787 "Layer A — device evidence fabric (Tier 3B device-digests, single-catalog)",
788 )
789 } else {
790 format!(
791 "Layer A — device evidence fabric (Tier 3B device-digests, K={batch} batched)"
792 )
793 };
794 report_layer(
795 &label,
796 &samples,
797 'A',
798 n_entities,
799 n_windows,
800 n_catalogs,
801 n_detectors,
802 out_dir,
803 "",
804 );
805 }
806 #[cfg(not(feature = "cuda"))]
807 {
808 let _ = (events, contract, batch, warmup, iters);
809 println!("Layer A — GPU pipeline: built without --features cuda; skipping");
810 println!();
811 }
812 }
813 'B' => {
814 #[cfg(feature = "cuda")]
815 {
816 let samples = run_layer_b(events, contract, batch, warmup, iters);
817 let label = if batch == 0 {
818 String::from(
819 "Layer B — throughput verdict summary (host bank stage, single-catalog)",
820 )
821 } else {
822 format!(
823 "Layer B — throughput verdict summary (host bank stage, K={batch} batched)"
824 )
825 };
826 report_layer(
827 &label,
828 &samples,
829 'B',
830 n_entities,
831 n_windows,
832 n_catalogs,
833 n_detectors,
834 out_dir,
835 "",
836 );
837 }
838 #[cfg(not(feature = "cuda"))]
839 {
840 let samples = run_layer_b_cpu(events, contract, warmup, iters);
841 report_layer(
842 "Layer B — throughput verdict summary (CPU-only, no CUDA feature)",
843 &samples,
844 'B',
845 n_entities,
846 n_windows,
847 1,
848 n_detectors,
849 out_dir,
850 "_cpu",
851 );
852 }
853 }
854 'C' => {
855 let cpu_samples = run_layer_c_cpu(events, contract, warmup, iters);
860 report_layer(
861 "Layer C — full audit court (CPU)",
862 &cpu_samples,
863 'C',
864 n_entities,
865 n_windows,
866 1,
867 n_detectors,
868 out_dir,
869 "_cpu",
870 );
871 #[cfg(feature = "cuda")]
872 {
873 let gpu_samples = run_layer_c_gpu(events, contract, warmup, iters);
874 report_layer(
875 "Layer C — full audit court (GPU)",
876 &gpu_samples,
877 'C',
878 n_entities,
879 n_windows,
880 1,
881 n_detectors,
882 out_dir,
883 "_gpu",
884 );
885 }
886 }
887 other => {
888 eprintln!("run_layer_bench: unknown layer '{other}'");
889 }
890 }
891}
892
/// Collects Layer A timing samples (whole microseconds) on the GPU.
///
/// With `batch == 0` the single-catalog `build_gpu_layer_a_on_workspace` is
/// measured on a `GpuWorkspace`. Otherwise `build_gpu_layer_a_batched` is
/// measured on a `BatchedGpuWorkspace` over `batch` fixtures: a copy of
/// `events` plus, when `batch > 1`, `batch - 1` fixtures from
/// `synthesize_courthouse_factory` sized to the contract.
///
/// Returns one sample per timed iteration; the `warmup` runs are not recorded.
///
/// # Panics
/// Panics if a workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_a(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    if batch == 0 {
        let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
        (0..warmup).for_each(|_| {
            let summary = build_gpu_layer_a_on_workspace(events, contract, &mut workspace)
                .expect("CUDA Layer A (skip-bank) pipeline");
            std::hint::black_box(summary);
        });
        return (0..iters)
            .map(|_| {
                let started = Instant::now();
                let summary = build_gpu_layer_a_on_workspace(events, contract, &mut workspace)
                    .expect("CUDA Layer A (skip-bank) pipeline");
                let elapsed_us = started.elapsed().as_micros();
                std::hint::black_box(summary);
                elapsed_us
            })
            .collect();
    }

    // Batched path: the caller's events first, then synthesized fixtures.
    let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    fixtures.push(events.to_vec());
    if batch > 1 {
        let seed =
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15);
        fixtures.extend(dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
            seed,
            batch - 1,
            contract.n_entities,
            contract.n_windows,
            4,
        ));
    }
    let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();
    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");

    (0..warmup).for_each(|_| {
        let summaries = build_gpu_layer_a_batched(&mut workspace, &event_slices, contract)
            .expect("CUDA Layer A (batched skip-bank) pipeline");
        std::hint::black_box(summaries);
    });
    (0..iters)
        .map(|_| {
            let started = Instant::now();
            let summaries = build_gpu_layer_a_batched(&mut workspace, &event_slices, contract)
                .expect("CUDA Layer A (batched skip-bank) pipeline");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(summaries);
            elapsed_us
        })
        .collect()
}
959
/// Collects Layer B timing samples (whole microseconds) on the GPU.
///
/// With `batch == 0` the single-catalog `build_gpu_throughput_on_workspace`
/// is measured on a `GpuWorkspace`. Otherwise `build_gpu_batched_throughput`
/// is measured on a `BatchedGpuWorkspace` over `batch` fixtures: a copy of
/// `events` followed by `synthesize` outputs seeded with
/// `DEFAULT_SEED + i * 0x9E37` for `i` in `1..batch`.
///
/// Returns one sample per timed iteration; the `warmup` runs are not recorded.
///
/// # Panics
/// Panics if a workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_b(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    if batch == 0 {
        let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
        (0..warmup).for_each(|_| {
            let case = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
                .expect("CUDA pipeline (throughput)");
            std::hint::black_box(case);
        });
        return (0..iters)
            .map(|_| {
                let started = Instant::now();
                let case = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
                    .expect("CUDA pipeline (throughput)");
                let elapsed_us = started.elapsed().as_micros();
                std::hint::black_box(case);
                elapsed_us
            })
            .collect();
    }

    // NOTE(review): the extra fixtures come from `synthesize`, which takes no
    // entity/window dimensions, whereas the Layer A batched path sizes its
    // fixtures to the contract — confirm this is intended for scaled runs.
    let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    fixtures.push(events.to_vec());
    fixtures.extend((1..u64::from(batch)).map(|i| {
        dsfb_gpu_debug_core::fixture::synthesize(
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(i.wrapping_mul(0x9E37)),
        )
    }));
    let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();
    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");

    (0..warmup).for_each(|_| {
        let cases = build_gpu_batched_throughput(&mut workspace, &event_slices, contract)
            .expect("CUDA pipeline (batched throughput)");
        std::hint::black_box(cases);
    });
    (0..iters)
        .map(|_| {
            let started = Instant::now();
            let cases = build_gpu_batched_throughput(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched throughput)");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(cases);
            elapsed_us
        })
        .collect()
}
1016
1017#[cfg(not(feature = "cuda"))]
1018fn run_layer_b_cpu(
1019 events: &[TraceEvent],
1020 contract: &Contract,
1021 warmup: usize,
1022 iters: usize,
1023) -> Vec<u128> {
1024 let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
1025 for _ in 0..warmup {
1026 let _ = std::hint::black_box(build_cpu_throughput(events, contract));
1027 }
1028 for _ in 0..iters {
1029 let t0 = Instant::now();
1030 let case = build_cpu_throughput(events, contract);
1031 let dt = t0.elapsed().as_micros();
1032 std::hint::black_box(case);
1033 samples_us.push(dt);
1034 }
1035 samples_us
1036}
1037
1038pub(crate) fn run_layer_b_cpu_always(
1043 events: &[TraceEvent],
1044 contract: &Contract,
1045 warmup: usize,
1046 iters: usize,
1047) -> Vec<u128> {
1048 let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
1049 for _ in 0..warmup {
1050 let _ = std::hint::black_box(build_cpu_throughput(events, contract));
1051 }
1052 for _ in 0..iters {
1053 let t0 = Instant::now();
1054 let case = build_cpu_throughput(events, contract);
1055 let dt = t0.elapsed().as_micros();
1056 std::hint::black_box(case);
1057 samples_us.push(dt);
1058 }
1059 samples_us
1060}
1061
1062pub(crate) fn run_layer_c_cpu(
1063 events: &[TraceEvent],
1064 contract: &Contract,
1065 warmup: usize,
1066 iters: usize,
1067) -> Vec<u128> {
1068 let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
1069 for _ in 0..warmup {
1070 let _ = std::hint::black_box(build_cpu(events, contract));
1071 }
1072 for _ in 0..iters {
1073 let t0 = Instant::now();
1074 let case = build_cpu(events, contract);
1075 let dt = t0.elapsed().as_micros();
1076 std::hint::black_box(case);
1077 samples_us.push(dt);
1078 }
1079 samples_us
1080}
1081
/// Collects Layer C timing samples (whole microseconds) on the GPU via
/// `build_gpu_on_workspace`, reusing one `GpuWorkspace` for all runs.
///
/// Runs `warmup` untimed iterations, then returns one sample per timed
/// iteration.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline run fails.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_c_gpu(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");

    // Untimed warm-up passes.
    (0..warmup).for_each(|_| {
        let case = build_gpu_on_workspace(&mut workspace, events, contract)
            .expect("CUDA pipeline (audit)");
        std::hint::black_box(case);
    });

    (0..iters)
        .map(|_| {
            let started = Instant::now();
            let case = build_gpu_on_workspace(&mut workspace, events, contract)
                .expect("CUDA pipeline (audit)");
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(case);
            elapsed_us
        })
        .collect()
}
1106
1107fn run_materialize_catalog(
1120 j: u32,
1121 primary_events: &[TraceEvent],
1122 contract: &Contract,
1123 warmup: usize,
1124 iters: usize,
1125) {
1126 let events: Vec<TraceEvent> = if j == 0 {
1127 primary_events.to_vec()
1128 } else {
1129 let derived_seed = dsfb_gpu_debug_core::fixture::DEFAULT_SEED
1130 .wrapping_add(0x9E37_79B9_7F4A_7C15)
1131 ^ u64::from(j - 1).wrapping_mul(0x9E37_79B9_7F4A_7C15);
1132 dsfb_gpu_debug_core::fixture::synthesize_scaled(
1133 derived_seed,
1134 contract.n_entities,
1135 contract.n_windows,
1136 4,
1137 )
1138 };
1139
1140 println!();
1141 println!("Materialising catalog J={j} as Layer C transcript on demand (R.3a opt-in)");
1142 println!(" derived events : {}", events.len());
1143
1144 let n_entities = contract.n_entities;
1145 let n_windows = contract.n_windows;
1146 let n_detectors = dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32;
1147 let reports_dir = std::path::Path::new("reports");
1148
1149 let cpu_samples = run_layer_c_cpu(&events, contract, warmup, iters);
1150 let cpu_label = format!("Layer C — materialised catalog J={j} (CPU)");
1151 report_layer(
1152 &cpu_label,
1153 &cpu_samples,
1154 'C',
1155 n_entities,
1156 n_windows,
1157 1,
1158 n_detectors,
1159 Some(reports_dir),
1160 &format!("_materialize_{j}_cpu"),
1161 );
1162
1163 #[cfg(feature = "cuda")]
1164 {
1165 let gpu_samples = run_layer_c_gpu(&events, contract, warmup, iters);
1166 let gpu_label = format!("Layer C — materialised catalog J={j} (GPU)");
1167 report_layer(
1168 &gpu_label,
1169 &gpu_samples,
1170 'C',
1171 n_entities,
1172 n_windows,
1173 1,
1174 n_detectors,
1175 Some(reports_dir),
1176 &format!("_materialize_{j}_gpu"),
1177 );
1178 }
1179}