1#![allow(clippy::expect_used)]
57
58use std::process::ExitCode;
59
60use dsfb_gpu_debug_core::bank::bank_hash;
61use dsfb_gpu_debug_core::contract::Contract;
62use dsfb_gpu_debug_core::event::TraceEvent;
63use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
64use dsfb_gpu_debug_core::motif::registry_hash;
65
66#[cfg(feature = "cuda")]
67use dsfb_gpu_debug_cuda::{build_gpu_throughput_graph_or_demote, GpuWorkspace, GraphCaptureStatus};
68
69#[cfg(feature = "cuda")]
70use super::bench::{run_layer_a, run_layer_b, run_layer_c_gpu};
71use super::bench::{run_layer_b_cpu_always, run_layer_c_cpu};
72use super::{parse_flags, usage_error};
73
/// Warm-up and measured iteration counts handed to the layer runners for one
/// benchmark row.
#[derive(Debug, Clone, Copy)]
struct IterPlan {
    /// Passed as the warm-up count to the layer runner.
    warmup: usize,
    /// Passed as the measured-iteration count to the layer runner.
    iters: usize,
}

impl IterPlan {
    /// Plan for the canonical 16x128 rows.
    const CANONICAL: Self = Self::of(20, 100);
    /// Plan for the scaled rows at K=1 (also used for the scaled CPU baseline).
    const LARGE_K1: Self = Self::of(5, 50);
    /// Plan for the scaled rows at K=16.
    const LARGE_K16: Self = Self::of(3, 20);
    /// Plan for the scaled rows at K=64.
    const LARGE_K64: Self = Self::of(2, 10);
    /// Plan for the scaled rows at K=128.
    const LARGE_K128: Self = Self::of(2, 5);

    /// Builds a plan from its two counts; usable in constant context.
    const fn of(warmup: usize, iters: usize) -> Self {
        Self { warmup, iters }
    }
}
106
107#[allow(clippy::too_many_lines)]
116pub fn parse_and_run(args: &[String]) -> ExitCode {
117 let flags = match parse_flags(args) {
118 Ok(f) => f,
119 Err(message) => return usage_error(&message),
120 };
121
122 let quick = flags.get("quick").is_some_and(|v| v != "false");
123 let skip_large = flags.get("skip-large").is_some_and(|v| v != "false");
124 let big_k = flags.get("big-k").is_some_and(|v| v != "false");
125 let detail_stage = flags.get("detail-stage").is_some_and(|v| v != "false");
131 let tree_digest = flags.get("tree-digest").is_some_and(|v| v != "false");
138 let compact = flags.get("compact").is_some_and(|v| v != "false");
146 let out_path: std::path::PathBuf = flags.get("out").map_or_else(
147 || std::path::PathBuf::from("reports/money_table.txt"),
148 std::path::PathBuf::from,
149 );
150
151 let mut rows: Vec<MoneyRow> = Vec::new();
152 let mut header_lines: Vec<String> = Vec::new();
153
154 header_lines.push(String::from(
155 "# R.7 Money Table — DSFB-GPU-Debug headline benchmark",
156 ));
157 header_lines.push(format!(
158 "# generated: {}",
159 chrono_like_timestamp_or_unknown()
160 ));
161 header_lines.push(format!(
162 "# quick: {quick} skip-large: {skip_large} big-k: {big_k}"
163 ));
164 header_lines.push(String::from("#"));
165 header_lines.push(String::from(
166 "# Layer A: device evidence fabric (skip-bank, on-device digests).",
167 ));
168 header_lines.push(String::from(
169 "# Layer B: throughput verdict summary (host bank stage admits compact candidates).",
170 ));
171 header_lines.push(String::from(
172 "# Layer C: full audit court (every intermediate cell materialised host-side).",
173 ));
174 header_lines.push(String::from(
175 "# Speedup is measured against CPU Layer B at the SAME (n_entities, n_windows) scale.",
176 ));
177 header_lines.push(String::from("#"));
178
179 let (graph_status_line, graph_hash_line) = probe_graph_capture();
181 header_lines.push(graph_status_line);
182 if let Some(line) = graph_hash_line {
183 header_lines.push(line);
184 }
185 header_lines.push(String::new());
186
187 if detail_stage {
194 #[cfg(feature = "cuda")]
195 {
196 let stage_iters = if quick { 5 } else { 20 };
197 let stage_warmup = if quick { 1 } else { 3 };
198 if compact {
199 run_r11_compact_compare(stage_warmup, stage_iters);
203 } else {
204 run_r8_detail_stage(stage_warmup, stage_iters, tree_digest);
205 }
206 }
207 #[cfg(not(feature = "cuda"))]
208 {
209 let _ = (quick, tree_digest, compact);
210 println!("--detail-stage requires --features cuda; nothing to profile");
211 }
212 return ExitCode::SUCCESS;
213 }
214
215 {
217 let plan = scale_iters(IterPlan::CANONICAL, quick);
218 let n_entities = 16u32;
219 let n_windows = 128u32;
220 let k = 32u32;
221 let mut contract = Contract::canonical();
222 contract.pin_bank_hash(bank_hash());
223 contract.pin_detector_registry_hash(registry_hash());
224 let events = synthesize(DEFAULT_SEED);
225
226 let cpu_b = run_layer_b_cpu_always(&events, &contract, plan.warmup, plan.iters);
227 let cpu_b_med = median(&cpu_b);
228 rows.push(MoneyRow {
229 label: format!(
230 "canonical 16x128 K={k:>3} CPU Layer B {}",
231 quick_tag(quick)
232 ),
233 n_entities,
234 n_windows,
235 n_catalogs: 1,
236 samples_us: cpu_b.clone(),
237 baseline_us: cpu_b_med,
238 });
239
240 run_gpu_row(
241 &mut rows,
242 &format!("canonical 16x128 K={k:>3} GPU Layer A "),
243 n_entities,
244 n_windows,
245 k,
246 GpuRow::LayerA,
247 &events,
248 &contract,
249 plan,
250 cpu_b_med,
251 quick,
252 );
253
254 run_gpu_row(
255 &mut rows,
256 &format!("canonical 16x128 K={k:>3} GPU Layer B "),
257 n_entities,
258 n_windows,
259 k,
260 GpuRow::LayerB,
261 &events,
262 &contract,
263 plan,
264 cpu_b_med,
265 quick,
266 );
267
268 let cpu_c = run_layer_c_cpu(&events, &contract, plan.warmup, plan.iters);
272 rows.push(MoneyRow {
273 label: format!(
274 "canonical 16x128 K= 1 CPU Layer C (audit) {}",
275 quick_tag(quick)
276 ),
277 n_entities,
278 n_windows,
279 n_catalogs: 1,
280 samples_us: cpu_c,
281 baseline_us: cpu_b_med,
282 });
283
284 #[cfg(feature = "cuda")]
285 {
286 let gpu_c = run_layer_c_gpu(&events, &contract, plan.warmup, plan.iters);
287 rows.push(MoneyRow {
288 label: format!(
289 "canonical 16x128 K= 1 GPU Layer C (audit) {}",
290 quick_tag(quick)
291 ),
292 n_entities,
293 n_windows,
294 n_catalogs: 1,
295 samples_us: gpu_c,
296 baseline_us: cpu_b_med,
297 });
298 }
299 }
300
301 if !skip_large {
303 let n_entities = 256u32;
304 let n_windows = 4096u32;
305 let mut contract = Contract::scaled(n_entities, n_windows);
306 contract.pin_bank_hash(bank_hash());
307 contract.pin_detector_registry_hash(registry_hash());
308 let events = synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4);
309
310 let plan_cpu = scale_iters(IterPlan::LARGE_K1, quick);
314 let cpu_b = run_layer_b_cpu_always(&events, &contract, plan_cpu.warmup, plan_cpu.iters);
315 let cpu_b_med = median(&cpu_b);
316 rows.push(MoneyRow {
317 label: format!(
318 "scaled 256x4096 K= 1 CPU Layer B {}",
319 quick_tag(quick)
320 ),
321 n_entities,
322 n_windows,
323 n_catalogs: 1,
324 samples_us: cpu_b.clone(),
325 baseline_us: cpu_b_med,
326 });
327
328 let large_sweep: &[(u32, IterPlan)] = if big_k {
333 &[
334 (1u32, IterPlan::LARGE_K1),
335 (16, IterPlan::LARGE_K16),
336 (64, IterPlan::LARGE_K64),
337 (128, IterPlan::LARGE_K128),
338 ]
339 } else {
340 &[
341 (1u32, IterPlan::LARGE_K1),
342 (16, IterPlan::LARGE_K16),
343 (64, IterPlan::LARGE_K64),
344 ]
345 };
346 for &(k, plan_const) in large_sweep {
347 let plan = scale_iters(plan_const, quick);
348
349 run_gpu_row(
350 &mut rows,
351 &format!("scaled 256x4096 K={k:>3} GPU Layer A "),
352 n_entities,
353 n_windows,
354 k,
355 GpuRow::LayerA,
356 &events,
357 &contract,
358 plan,
359 cpu_b_med,
360 quick,
361 );
362
363 run_gpu_row(
364 &mut rows,
365 &format!("scaled 256x4096 K={k:>3} GPU Layer B "),
366 n_entities,
367 n_windows,
368 k,
369 GpuRow::LayerB,
370 &events,
371 &contract,
372 plan,
373 cpu_b_med,
374 quick,
375 );
376 }
377
378 rows.push(MoneyRow::not_run(
384 "scaled 256x4096 K= 1 Layer C (audit) [not run: transcript materialisation cost]",
385 n_entities,
386 n_windows,
387 1,
388 ));
389 }
390
391 let report = render_report(&header_lines, &rows);
393 print!("{report}");
394
395 if let Some(parent) = out_path.parent() {
396 let _ = std::fs::create_dir_all(parent);
397 }
398 match std::fs::write(&out_path, &report) {
399 Ok(()) => {
400 println!("wrote money table -> {}", out_path.display());
401 ExitCode::SUCCESS
402 }
403 Err(e) => {
404 eprintln!("warning: could not write {}: {e}", out_path.display());
405 ExitCode::SUCCESS
407 }
408 }
409}
410
/// Selects which GPU layer runner `run_gpu_row` dispatches to.
#[derive(Copy, Clone)]
enum GpuRow {
    /// Dispatch to `run_layer_a`.
    LayerA,
    /// Dispatch to `run_layer_b`.
    LayerB,
}
416
417#[allow(clippy::too_many_arguments)]
418fn run_gpu_row(
419 rows: &mut Vec<MoneyRow>,
420 label_prefix: &str,
421 n_entities: u32,
422 n_windows: u32,
423 k: u32,
424 which: GpuRow,
425 events: &[TraceEvent],
426 contract: &Contract,
427 plan: IterPlan,
428 baseline_us: u128,
429 quick: bool,
430) {
431 #[cfg(feature = "cuda")]
432 {
433 let batch = if k == 1 { 0 } else { k };
436 let label = format!("{label_prefix}{}", quick_tag(quick));
437
438 let events_owned: Vec<TraceEvent> = events.to_vec();
447 let contract_owned = contract.clone();
448 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| match which {
449 GpuRow::LayerA => run_layer_a(
450 &events_owned,
451 &contract_owned,
452 batch,
453 plan.warmup,
454 plan.iters,
455 ),
456 GpuRow::LayerB => run_layer_b(
457 &events_owned,
458 &contract_owned,
459 batch,
460 plan.warmup,
461 plan.iters,
462 ),
463 }));
464 if let Ok(samples) = result {
465 rows.push(MoneyRow {
466 label,
467 n_entities,
468 n_windows,
469 n_catalogs: k,
470 samples_us: samples,
471 baseline_us,
472 });
473 } else {
474 let row_label = format!("{label} [not run: alloc refused or kernel error]");
475 rows.push(MoneyRow::not_run(&row_label, n_entities, n_windows, k));
476 }
477 }
478 #[cfg(not(feature = "cuda"))]
479 {
480 let _ = (
481 label_prefix,
482 n_entities,
483 n_windows,
484 k,
485 which,
486 events,
487 contract,
488 plan,
489 baseline_us,
490 quick,
491 );
492 rows.push(MoneyRow::not_run(
493 "(GPU rows skipped: not built with --features cuda)",
494 n_entities,
495 n_windows,
496 k,
497 ));
498 }
499}
500
/// One line of the money table.
struct MoneyRow {
    /// Text shown in the first column.
    label: String,
    /// Entity count of the scale this row was measured at.
    n_entities: u32,
    /// Window count of the scale this row was measured at.
    n_windows: u32,
    /// Catalog count (K) used for the per-catalog and throughput columns.
    n_catalogs: u32,
    /// Measured samples; an empty vector makes the row render as `n/a`.
    samples_us: Vec<u128>,
    /// Baseline value for the speedup column; 0 renders the speedup as `n/a`.
    baseline_us: u128,
}

impl MoneyRow {
    /// Builds a placeholder row for a measurement that did not happen: no
    /// samples and a zero baseline.
    fn not_run(label: &str, n_entities: u32, n_windows: u32, n_catalogs: u32) -> Self {
        let (label, samples_us, baseline_us) = (String::from(label), Vec::new(), 0);
        Self {
            label,
            n_entities,
            n_windows,
            n_catalogs,
            samples_us,
            baseline_us,
        }
    }
}
522
523fn render_report(header_lines: &[String], rows: &[MoneyRow]) -> String {
524 use core::fmt::Write;
525 let mut out = String::new();
526 for line in header_lines {
527 out.push_str(line);
528 out.push('\n');
529 }
530 out.push_str(
531 " label \
532 | median_us | per_catalog_us | catalogs/sec | cells/sec \
533 | det_evals/sec | speedup_vs_cpu_b\n",
534 );
535 out.push_str(
536 " ------------------------------------------------------- \
537 | ---------- | -------------- | ------------ | ------------- \
538 | -------------- | ----------------\n",
539 );
540 let n_detectors = u128::from(dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32);
541 let one_sec = 1_000_000u128;
542 for row in rows {
543 if row.samples_us.is_empty() {
544 let _ = writeln!(
545 out,
546 " {:<55}| n/a | n/a | n/a | n/a | n/a | n/a",
547 row.label
548 );
549 continue;
550 }
551 let med = median(&row.samples_us);
552 let catalogs = u128::from(row.n_catalogs);
553 let cells = catalogs * u128::from(row.n_entities) * u128::from(row.n_windows);
554 let det_evals = cells * n_detectors;
555 let per_catalog = if catalogs > 0 { med / catalogs } else { med };
556 let catalogs_per_sec = if med > 0 { catalogs * one_sec / med } else { 0 };
557 let cells_per_sec = if med > 0 { cells * one_sec / med } else { 0 };
558 let det_evals_per_sec = if med > 0 {
559 det_evals * one_sec / med
560 } else {
561 0
562 };
563 let speedup = if med > 0 && row.baseline_us > 0 {
564 let denom = per_catalog.max(1);
569 let ratio_times10 = (row.baseline_us * 10) / denom;
572 let whole = ratio_times10 / 10;
573 let tenth = ratio_times10 % 10;
574 format!("{whole:>10}.{tenth}x")
575 } else {
576 String::from(" n/a")
577 };
578 let _ = writeln!(
579 out,
580 " {:<55}| {med:>10} | {per_catalog:>14} | {catalogs_per_sec:>12} | {cells_per_sec:>13} | {det_evals_per_sec:>14} | {speedup:>16}",
581 row.label
582 );
583 }
584 out
585}
586
/// Returns the element at index `len / 2` of the sorted samples (the upper
/// middle for even lengths), or 0 for an empty slice.
fn median(samples: &[u128]) -> u128 {
    let mut ordered: Vec<u128> = samples.to_vec();
    ordered.sort_unstable();
    // `get` yields `None` only for an empty input.
    ordered.get(ordered.len() / 2).copied().unwrap_or(0)
}
595
596fn scale_iters(p: IterPlan, quick: bool) -> IterPlan {
597 if !quick {
598 return p;
599 }
600 IterPlan {
601 warmup: p.warmup.div_ceil(5).max(1),
602 iters: p.iters.div_ceil(5).max(1),
603 }
604}
605
/// Label suffix marking rows measured in quick mode; empty otherwise.
fn quick_tag(quick: bool) -> &'static str {
    match quick {
        true => "[quick]",
        false => "",
    }
}
613
/// Attempts one graph-captured throughput build on the canonical fixture and
/// describes the outcome as report header lines.
///
/// The first element is always a `# graph_status: …` line. The second is
/// `Some("# graph_plan_hash: …")`, with the plan hash in lowercase hex, only
/// when capture succeeded. Without the `cuda` feature the status reads
/// "skipped" and no hash line is produced.
fn probe_graph_capture() -> (String, Option<String>) {
    #[cfg(feature = "cuda")]
    {
        let mut contract = Contract::canonical();
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = synthesize(DEFAULT_SEED);

        let mut ws = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(e) => {
                return (
                    format!("# graph_status: error allocating pinned-async workspace ({e:?})"),
                    None,
                );
            }
        };

        match build_gpu_throughput_graph_or_demote(&events, &contract, &mut ws) {
            Ok((_case, GraphCaptureStatus::Captured { plan_hash })) => {
                use core::fmt::Write;
                // Two hex digits per element of the plan hash.
                let mut hex = String::with_capacity(64);
                for b in &plan_hash {
                    let _ = write!(hex, "{b:02x}");
                }
                let status = String::from("# graph_status: captured");
                (status, Some(format!("# graph_plan_hash: {hex}")))
            }
            Ok((_case, GraphCaptureStatus::Demoted { reason })) => {
                (format!("# graph_status: demoted ({reason})"), None)
            }
            Err(e) => (format!("# graph_status: error during probe ({e:?})"), None),
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        let status = String::from("# graph_status: skipped (built without --features cuda)");
        (status, None)
    }
}
656
/// Formats the current time as `"<seconds> epoch seconds"`, or `"unknown"`
/// when the system clock reports a time before the Unix epoch.
fn chrono_like_timestamp_or_unknown() -> String {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or_else(
            |_| String::from("unknown"),
            |since_epoch| format!("{} epoch seconds", since_epoch.as_secs()),
        )
}
668
/// Runs the R.8 per-stage bottleneck profile at three fixed scale points.
///
/// When `tree_digest` is set the work is delegated entirely to
/// `run_r8_5_tree_digest_compare`. Otherwise each point performs `warmup`
/// untimed dispatches and `iters` timed ones on a single workspace, takes the
/// median of the wall time and of every reported stage timing, and hands them
/// to `print_and_write_r8`. A scale point whose workspace cannot be allocated
/// is skipped; a dispatch error during measurement ends the whole profile.
#[cfg(feature = "cuda")]
fn run_r8_detail_stage(warmup: usize, iters: usize, tree_digest: bool) {
    use std::time::Instant;

    use dsfb_gpu_debug_core::bank::bank_hash;
    use dsfb_gpu_debug_core::contract::Contract;
    use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
    use dsfb_gpu_debug_core::motif::registry_hash;
    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace_timed, GpuWorkspace, R8HostStageTimings,
        R8StageTimings,
    };

    if tree_digest {
        run_r8_5_tree_digest_compare(warmup, iters);
        return;
    }

    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    for &(label, n_entities, n_windows, k) in &points {
        println!();
        println!("=== R.8 Bottleneck Profile — {label} ===");
        println!(" warmup: {warmup} iters: {iters}");

        // The 16x128 point uses the canonical contract and fixture; the others are scaled.
        let is_canonical = n_entities == 16 && n_windows == 128;
        let contract = {
            let mut c = if is_canonical {
                Contract::canonical()
            } else {
                Contract::scaled(n_entities, n_windows)
            };
            c.pin_bank_hash(bank_hash());
            c.pin_detector_registry_hash(registry_hash());
            c
        };
        let events = if is_canonical {
            synthesize(DEFAULT_SEED)
        } else {
            synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4)
        };

        let Ok(mut ws) = GpuWorkspace::new_with_pinned_async(&contract) else {
            println!(" workspace alloc refused; skipping {label}");
            continue;
        };

        // Warm-up results, including errors, are discarded.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace_timed(&events, &contract, &mut ws);
        }

        let mut wall_us: Vec<u128> = Vec::with_capacity(iters);
        let mut devs: Vec<R8StageTimings> = Vec::with_capacity(iters);
        let mut hosts: Vec<R8HostStageTimings> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome =
                build_gpu_throughput_pinned_async_on_workspace_timed(&events, &contract, &mut ws);
            let elapsed_ns = started.elapsed().as_nanos();

            let (case, dev, host) = match outcome {
                Ok(parts) => parts,
                Err(e) => {
                    println!(" dispatch error during R.8 measurement: {e:?}");
                    return;
                }
            };
            std::hint::black_box(case);
            devs.push(dev);
            hosts.push(host);
            // Nanoseconds to whole microseconds.
            wall_us.push(elapsed_ns / 1_000);
        }

        let med_wall = median_u128(&wall_us);
        let med_dev = median_stage(&devs);
        let med_host = median_host(&hosts);

        print_and_write_r8(label, n_entities, n_windows, k, med_dev, med_host, med_wall);
    }
}
795
796#[cfg(feature = "cuda")]
/// Returns the value that would sit at index `len / 2` if the samples were
/// sorted (the upper middle for even lengths), or 0 for an empty slice.
fn median_u128(samples: &[u128]) -> u128 {
    match samples.len() {
        0 => 0,
        len => {
            let mut scratch = samples.to_vec();
            // Partial selection places exactly the element a full sort would put at `len / 2`.
            let (_, middle, _) = scratch.select_nth_unstable(len / 2);
            *middle
        }
    }
}
805
/// Computes a field-wise median over the device stage timings.
///
/// Each field is taken independently: its values are sorted and the one at
/// index `len / 2` is picked, so the result need not equal any single input
/// sample. An empty slice yields the `Default` value.
///
/// Values are ordered with `f32::total_cmp`, which is a total order even if a
/// timing is NaN. The earlier `partial_cmp(..).unwrap_or(Equal)` comparator
/// was not, and the slice sort may panic or give an unspecified order when
/// given an inconsistent comparator.
#[cfg(feature = "cuda")]
fn median_stage(
    samples: &[dsfb_gpu_debug_cuda::R8StageTimings],
) -> dsfb_gpu_debug_cuda::R8StageTimings {
    if samples.is_empty() {
        return dsfb_gpu_debug_cuda::R8StageTimings::default();
    }
    let mid = samples.len() / 2;
    // Median of one field, selected by an accessor.
    let pick = |field: fn(&dsfb_gpu_debug_cuda::R8StageTimings) -> f32| -> f32 {
        let mut values: Vec<f32> = samples.iter().map(field).collect();
        values.sort_unstable_by(f32::total_cmp);
        values[mid]
    };
    dsfb_gpu_debug_cuda::R8StageTimings {
        h2d_us: pick(|s| s.h2d_us),
        residual_us: pick(|s| s.residual_us),
        sign_us: pick(|s| s.sign_us),
        detector_us: pick(|s| s.detector_us),
        consensus_us: pick(|s| s.consensus_us),
        candidate_us: pick(|s| s.candidate_us),
        digests_us: pick(|s| s.digests_us),
        d2h_us: pick(|s| s.d2h_us),
        total_device_us: pick(|s| s.total_device_us),
    }
}
831
/// Computes a field-wise median over the host stage timings.
///
/// Each field is taken independently: its values are sorted and the one at
/// index `len / 2` is picked. An empty slice yields the `Default` value.
///
/// Values are ordered with `f32::total_cmp`, which is a total order even if a
/// timing is NaN. The earlier `partial_cmp(..).unwrap_or(Equal)` comparator
/// was not, and the slice sort may panic or give an unspecified order when
/// given an inconsistent comparator.
#[cfg(feature = "cuda")]
fn median_host(
    samples: &[dsfb_gpu_debug_cuda::R8HostStageTimings],
) -> dsfb_gpu_debug_cuda::R8HostStageTimings {
    if samples.is_empty() {
        return dsfb_gpu_debug_cuda::R8HostStageTimings::default();
    }
    let mid = samples.len() / 2;
    // Median of one field, selected by an accessor.
    let pick = |field: fn(&dsfb_gpu_debug_cuda::R8HostStageTimings) -> f32| -> f32 {
        let mut values: Vec<f32> = samples.iter().map(field).collect();
        values.sort_unstable_by(f32::total_cmp);
        values[mid]
    };
    dsfb_gpu_debug_cuda::R8HostStageTimings {
        features_us: pick(|s| s.features_us),
        bank_and_finalize_us: pick(|s| s.bank_and_finalize_us),
    }
}
849
/// Formats the R.8 per-stage profile for one scale point, prints it, and
/// writes it to `reports/r8_bottleneck_{n_entities}x{n_windows}_K{k}.txt`.
///
/// Percentages are relative to `med_wall_us`; when that is 0 the sum of the
/// timed segments is used as the anchor instead, and if the anchor is not
/// positive every percentage prints as 0. The report lists each stage, the
/// segment total, the event-measured `total_device_us`, and the three largest
/// stages. Failure to write the file only produces a warning on stderr.
///
/// The ranking uses `f32::total_cmp` so it stays a total order even if a
/// timing is NaN; the sort is stable, so equal stages keep pipeline order.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments, clippy::cast_precision_loss)]
fn print_and_write_r8(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    dev: dsfb_gpu_debug_cuda::R8StageTimings,
    host: dsfb_gpu_debug_cuda::R8HostStageTimings,
    med_wall_us: u128,
) {
    use core::fmt::Write;

    // Host feature generation first, device stages in between, host finalize last.
    let rows: [(&str, f32); 10] = [
        ("feature generation (host)", host.features_us),
        ("H2D", dev.h2d_us),
        ("residual", dev.residual_us),
        ("sign (drift/slew EWMA)", dev.sign_us),
        ("detector", dev.detector_us),
        ("consensus", dev.consensus_us),
        ("candidate collapse", dev.candidate_us),
        ("digests (4 kernels)", dev.digests_us),
        ("D2H", dev.d2h_us),
        ("bank + case finalize (host)", host.bank_and_finalize_us),
    ];

    let total_measured: f32 = rows.iter().map(|(_, us)| us).sum();
    #[allow(clippy::cast_possible_truncation)]
    let anchor = if med_wall_us == 0 {
        total_measured
    } else {
        (med_wall_us as u64) as f32
    };
    // Share of the anchor in percent; 0 when there is no positive anchor.
    let pct_of_wall = |us: f32| -> f32 {
        if anchor > 0.0 {
            (us / anchor) * 100.0
        } else {
            0.0
        }
    };

    let mut out = String::new();
    let _ = writeln!(out, "=== R.8 Bottleneck Profile — {label} ===");
    let _ = writeln!(
        out,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    let _ = writeln!(out, "median wall (host Instant): {med_wall_us} us");
    let _ = writeln!(out, "sum of timed segments : {total_measured:.1} us");
    out.push('\n');
    out.push_str(" Stage us % of wall\n");
    out.push_str(" -------------------------- ---------- -----------\n");
    for (name, us) in &rows {
        let pct = pct_of_wall(*us);
        let _ = writeln!(out, " {name:<26} {us:>10.1} {pct:>9.1}%");
    }
    out.push_str(" -------------------------- ---------- -----------\n");
    let total_pct = pct_of_wall(total_measured);
    let _ = writeln!(
        out,
        " total (timed segments) {total_measured:>10.1} {total_pct:>9.1}%"
    );
    let total_device_us = dev.total_device_us;
    let total_device_pct = pct_of_wall(total_device_us);
    let _ = writeln!(
        out,
        " total_device_us (event) {total_device_us:>10.1} {total_device_pct:>9.1}%"
    );

    // Largest first.
    let mut sorted: Vec<(&str, f32)> = rows.to_vec();
    sorted.sort_by(|a, b| b.1.total_cmp(&a.1));
    out.push_str("\nTop 3 stages by absolute time:\n");
    for (i, (name, us)) in sorted.iter().take(3).enumerate() {
        let pct = pct_of_wall(*us);
        let rank = i + 1;
        let _ = writeln!(out, " {rank}. {name} — {us:.1} us ({pct:.1}% of wall)");
    }

    print!("{out}");

    // Writing the file is best effort; the profile is already on stdout.
    let filename = format!("r8_bottleneck_{n_entities}x{n_windows}_K{k}.txt");
    let path = std::path::Path::new("reports").join(filename);
    let _ = std::fs::create_dir_all("reports");
    if let Err(e) = std::fs::write(&path, &out) {
        eprintln!("warning: could not write {}: {e}", path.display());
    } else {
        println!("wrote R.8 profile -> {}", path.display());
    }
}
966
/// Compares the wall time of the serial-digest and tree-digest throughput
/// paths at three fixed scale points.
///
/// Each path gets its own workspace and is measured separately: `warmup`
/// untimed dispatches, then `iters` timed ones. The medians and their ratio
/// (serial / tree, 0 when the tree median is 0) go to `print_and_write_r8_5`.
/// A scale point whose workspace allocation fails is skipped; a dispatch
/// error during measurement ends the whole comparison.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_lines)]
fn run_r8_5_tree_digest_compare(warmup: usize, iters: usize) {
    use std::time::Instant;

    use dsfb_gpu_debug_core::bank::bank_hash;
    use dsfb_gpu_debug_core::contract::Contract;
    use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
    use dsfb_gpu_debug_core::motif::registry_hash;
    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace,
        build_gpu_throughput_pinned_async_on_workspace_tree, GpuWorkspace,
    };

    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    for &(label, n_entities, n_windows, k) in &points {
        println!();
        println!("=== R.8.5 tree-digest comparison — {label} ===");
        println!(" warmup: {warmup} iters: {iters}");

        // The 16x128 point uses the canonical contract and fixture; the others are scaled.
        let is_canonical = n_entities == 16 && n_windows == 128;
        let contract = {
            let mut c = if is_canonical {
                Contract::canonical()
            } else {
                Contract::scaled(n_entities, n_windows)
            };
            c.pin_bank_hash(bank_hash());
            c.pin_detector_registry_hash(registry_hash());
            c
        };
        let events = if is_canonical {
            synthesize(DEFAULT_SEED)
        } else {
            synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4)
        };

        let mut ws_serial = match GpuWorkspace::new_with_pinned_async(&contract) {
            Err(e) => {
                println!(" workspace alloc refused: {e:?}; skipping {label}");
                continue;
            }
            Ok(ws) => ws,
        };
        let mut ws_tree = match GpuWorkspace::new_with_pinned_async(&contract) {
            Err(e) => {
                println!(" workspace alloc refused: {e:?}; skipping {label}");
                continue;
            }
            Ok(ws) => ws,
        };

        // Serial-digest path: warm-up results are discarded, then timed runs.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
        }
        let mut serial_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
            let elapsed_us = started.elapsed().as_micros();
            match outcome {
                Ok(case) => {
                    std::hint::black_box(case);
                    serial_us.push(elapsed_us);
                }
                result @ Err(_) => {
                    println!(" serial-digest dispatch error: {result:?}");
                    return;
                }
            }
        }

        // Tree-digest path, measured the same way on its own workspace.
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
        }
        let mut tree_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
            let elapsed_us = started.elapsed().as_micros();
            match outcome {
                Ok(case) => {
                    std::hint::black_box(case);
                    tree_us.push(elapsed_us);
                }
                result @ Err(_) => {
                    println!(" tree-digest dispatch error: {result:?}");
                    return;
                }
            }
        }

        let med_serial = median_u128(&serial_us);
        let med_tree = median_u128(&tree_us);
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio = match med_tree {
            0 => 0.0,
            tree => (med_serial as u64) as f64 / (tree as u64) as f64,
        };
        print_and_write_r8_5(label, n_entities, n_windows, k, med_serial, med_tree, ratio);
    }
}
1100
/// Formats the R.8.5 serial-vs-tree comparison for one scale point, prints
/// it, and writes it to
/// `reports/r8_5_tree_compare_{n_entities}x{n_windows}_K{k}.txt`.
///
/// Failure to write the file only produces a warning on stderr.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments)]
fn print_and_write_r8_5(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    med_serial_us: u128,
    med_tree_us: u128,
    ratio: f64,
) {
    use core::fmt::Write;

    // Fixed explanatory text appended verbatim after the measurements.
    const NOTES: [&str; 7] = [
        " * Both paths run the same 5 pipeline kernels (residual, sign, detector,",
        " consensus, candidate). They differ only in the digest stage: serial",
        " uses 4 single-thread `*_digest_kernel_batched` kernels; tree uses one",
        " block per chunk (~2048 chunks at 256x4096 with 16 KiB chunks) feeding",
        " a final root SHA-256 over the ordered leaf digests + domain separator.",
        " * Stage hash bytes differ between modes by construction; case-file",
        " metadata records `digest_mode` so replay catches a mode mismatch.",
    ];

    let mut out = String::new();
    let _ = writeln!(out, "=== R.8.5 tree-digest comparison — {label} ===");
    let _ = writeln!(
        out,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    out.push('\n');
    let _ = writeln!(out, " serial-digest median wall: {med_serial_us:>10} us");
    let _ = writeln!(out, " tree-digest median wall: {med_tree_us:>10} us");
    let _ = writeln!(out, " wall-time ratio (serial / tree): {ratio:.2}x");
    out.push('\n');
    out.push_str("Notes:\n");
    for note in &NOTES {
        out.push_str(note);
        out.push('\n');
    }

    print!("{out}");

    // Writing the file is best effort; the comparison is already on stdout.
    let filename = format!("r8_5_tree_compare_{n_entities}x{n_windows}_K{k}.txt");
    let path = std::path::Path::new("reports").join(filename);
    let _ = std::fs::create_dir_all("reports");
    match std::fs::write(&path, &out) {
        Ok(()) => println!("wrote R.8.5 comparison -> {}", path.display()),
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}
1167
/// Compares the wall time of three throughput paths (serial digest, tree
/// digest, and tree digest with the compact builder) at three fixed scale
/// points.
///
/// For each point the fixture hashes are computed once, three workspaces are
/// allocated up front, and then each path is measured in turn: `warmup`
/// untimed dispatches followed by `iters` timed ones. The three medians and
/// the serial/compact and tree/compact ratios (0 when the compact median is
/// 0) go to `print_and_write_r11`. A scale point whose workspace allocation
/// fails is skipped; a dispatch error during measurement ends the whole
/// comparison.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_lines)]
fn run_r11_compact_compare(warmup: usize, iters: usize) {
    use dsfb_gpu_debug_core::bank::bank_hash;
    use dsfb_gpu_debug_core::casefile::FixtureHashes;
    use dsfb_gpu_debug_core::contract::Contract;
    use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
    use dsfb_gpu_debug_core::motif::registry_hash;
    use dsfb_gpu_debug_core::window::compute_features;
    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace,
        build_gpu_throughput_pinned_async_on_workspace_tree,
        build_gpu_throughput_pinned_async_on_workspace_tree_compact, GpuWorkspace,
    };

    /// Runs `dispatch` `warmup` times with results discarded, then `iters`
    /// times under a wall-clock timer. Returns the per-dispatch microseconds,
    /// or `None` after printing the error if a timed dispatch fails.
    fn time_path<T, E: core::fmt::Debug>(
        what: &str,
        warmup: usize,
        iters: usize,
        mut dispatch: impl FnMut() -> Result<T, E>,
    ) -> Option<Vec<u128>> {
        for _ in 0..warmup {
            let _ = dispatch();
        }
        let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = std::time::Instant::now();
            let outcome = dispatch();
            let elapsed_us = started.elapsed().as_micros();
            match outcome {
                Ok(case) => {
                    std::hint::black_box(case);
                    samples_us.push(elapsed_us);
                }
                Err(e) => {
                    println!(" {what} dispatch error: {e:?}");
                    return None;
                }
            }
        }
        Some(samples_us)
    }

    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];

    for &(label, n_entities, n_windows, k) in &points {
        println!();
        println!("=== R.11 compact-verdict comparison — {label} ===");
        println!(" warmup: {warmup} iters: {iters}");

        // The 16x128 point uses the canonical contract and fixture; the others are scaled.
        let is_canonical = n_entities == 16 && n_windows == 128;
        let contract = {
            let mut c = if is_canonical {
                Contract::canonical()
            } else {
                Contract::scaled(n_entities, n_windows)
            };
            c.pin_bank_hash(bank_hash());
            c.pin_detector_registry_hash(registry_hash());
            c
        };
        let events = if is_canonical {
            synthesize(DEFAULT_SEED)
        } else {
            synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4)
        };

        // Computed once per scale point, outside every timed loop.
        let features = compute_features(
            &events,
            contract.n_windows,
            contract.n_entities,
            u64::from(contract.window_size_ms) * 1_000_000,
        );
        let fixture = FixtureHashes::compute(&events, &features);

        let Ok(mut ws_serial) = GpuWorkspace::new_with_pinned_async(&contract) else {
            println!(" workspace alloc refused; skipping {label}");
            continue;
        };
        let Ok(mut ws_tree) = GpuWorkspace::new_with_pinned_async(&contract) else {
            println!(" workspace alloc refused; skipping {label}");
            continue;
        };
        let Ok(mut ws_compact) = GpuWorkspace::new_with_pinned_async(&contract) else {
            println!(" workspace alloc refused; skipping {label}");
            continue;
        };

        let Some(serial_us) = time_path("serial-digest", warmup, iters, || {
            build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial)
        }) else {
            return;
        };
        let Some(tree_us) = time_path("tree-digest", warmup, iters, || {
            build_gpu_throughput_pinned_async_on_workspace_tree(&events, &contract, &mut ws_tree)
        }) else {
            return;
        };
        let Some(compact_us) = time_path("compact-verdict", warmup, iters, || {
            build_gpu_throughput_pinned_async_on_workspace_tree_compact(
                &events,
                &contract,
                &mut ws_compact,
                &fixture,
            )
        }) else {
            return;
        };

        let med_serial = median_u128(&serial_us);
        let med_tree = median_u128(&tree_us);
        let med_compact = median_u128(&compact_us);
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio_serial_to_compact = match med_compact {
            0 => 0.0,
            compact => (med_serial as u64) as f64 / (compact as u64) as f64,
        };
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio_tree_to_compact = match med_compact {
            0 => 0.0,
            compact => (med_tree as u64) as f64 / (compact as u64) as f64,
        };
        print_and_write_r11(
            label,
            n_entities,
            n_windows,
            k,
            med_serial,
            med_tree,
            med_compact,
            ratio_serial_to_compact,
            ratio_tree_to_compact,
        );
    }
}
1368
/// Formats the R.11 three-way comparison for one scale point, prints it, and
/// writes it to
/// `reports/r11_compact_compare_{n_entities}x{n_windows}_K{k}.txt`.
///
/// Failure to write the file only produces a warning on stderr.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments)]
fn print_and_write_r11(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    med_serial_us: u128,
    med_tree_us: u128,
    med_compact_us: u128,
    ratio_serial_to_compact: f64,
    ratio_tree_to_compact: f64,
) {
    use core::fmt::Write;

    // Fixed explanatory text appended verbatim after the measurements.
    const NOTES: [&str; 12] = [
        " * Serial = legacy R.6b path (4 single-thread digest kernels + non-compact builder).",
        " * Tree = R.8.5 path (block-parallel tree digest + non-compact builder).",
        " * Compact = R.11 path (tree digest + FixtureHashes precomputed once).",
        " * `FixtureHashes` is computed ONCE per scale point outside the iter loop,",
        " matching how a long-running deployment caller would amortise the input",
        " commitment hash across many dispatches against the same fixture.",
        " * Case files from all three paths are byte-identical for the serial vs.",
        " serial pairing, and the tree pair is internally byte-identical;",
        " serial ≠ tree because tree commits to chunked stage bytes + a domain",
        " separator. Compact ≡ tree byte-for-byte by construction.",
        " * Semantic Non-Bypass Axiom holds in every path: `bank_collapse` is the",
        " only mint of `BankAdmissionToken`. The compact builder reuses it.",
    ];

    let mut out = String::new();
    let _ = writeln!(out, "=== R.11 compact-verdict comparison — {label} ===");
    let _ = writeln!(
        out,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    out.push('\n');
    let _ = writeln!(
        out,
        " serial-digest : {med_serial_us:>10} us"
    );
    let _ = writeln!(
        out,
        " tree-digest (R.8.5) : {med_tree_us:>10} us"
    );
    let _ = writeln!(
        out,
        " tree-digest + compact (R.11) : {med_compact_us:>10} us"
    );
    out.push('\n');
    let _ = writeln!(
        out,
        " wall ratio serial / compact : {ratio_serial_to_compact:.2}x"
    );
    let _ = writeln!(
        out,
        " wall ratio tree / compact : {ratio_tree_to_compact:.2}x"
    );
    out.push('\n');
    out.push_str("Notes:\n");
    for note in &NOTES {
        out.push_str(note);
        out.push('\n');
    }

    print!("{out}");

    // Writing the file is best effort; the comparison is already on stdout.
    let filename = format!("r11_compact_compare_{n_entities}x{n_windows}_K{k}.txt");
    let path = std::path::Path::new("reports").join(filename);
    let _ = std::fs::create_dir_all("reports");
    match std::fs::write(&path, &out) {
        Ok(()) => println!("wrote R.11 comparison -> {}", path.display()),
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}