#![allow(clippy::expect_used)]
use std::process::ExitCode;
use dsfb_gpu_debug_core::bank::bank_hash;
use dsfb_gpu_debug_core::contract::Contract;
use dsfb_gpu_debug_core::event::TraceEvent;
use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
use dsfb_gpu_debug_core::motif::registry_hash;
#[cfg(feature = "cuda")]
use dsfb_gpu_debug_cuda::{build_gpu_throughput_graph_or_demote, GpuWorkspace, GraphCaptureStatus};
#[cfg(feature = "cuda")]
use super::bench::{run_layer_a, run_layer_b, run_layer_c_gpu};
use super::bench::{run_layer_b_cpu_always, run_layer_c_cpu};
use super::{parse_flags, usage_error};
/// Iteration budget for one benchmark row: the `warmup` and `iters` counts that
/// are forwarded to the bench runners.
#[derive(Debug, Clone, Copy)]
struct IterPlan {
    /// Count passed as the runners' `warmup` argument.
    warmup: usize,
    /// Count passed as the runners' `iters` argument.
    iters: usize,
}

impl IterPlan {
    /// Compact constructor so every preset below fits on one line.
    const fn of(warmup: usize, iters: usize) -> Self {
        Self { warmup, iters }
    }

    /// Preset for the canonical 16x128 fixture.
    const CANONICAL: Self = Self::of(20, 100);
    /// Preset for the scaled fixture at K = 1.
    const LARGE_K1: Self = Self::of(5, 50);
    /// Preset for the scaled fixture at K = 16.
    const LARGE_K16: Self = Self::of(3, 20);
    /// Preset for the scaled fixture at K = 64.
    const LARGE_K64: Self = Self::of(2, 10);
    /// Preset for the scaled fixture at K = 128.
    const LARGE_K128: Self = Self::of(2, 5);
}
/// Runs the money-table benchmark sub-command and returns the process exit code.
///
/// `args` is parsed with `parse_flags`; a parse failure is reported through
/// `usage_error`. The boolean flags `quick`, `skip-large`, `big-k`,
/// `detail-stage`, `tree-digest` and `compact` count as set when present with any
/// value other than `"false"`. `out` overrides the report path (default
/// `reports/money_table.txt`).
///
/// When `detail-stage` is set, only the per-stage profiling path runs (CUDA
/// builds) or a notice is printed (non-CUDA builds), and the function returns
/// `ExitCode::SUCCESS` without producing the table.
///
/// Otherwise the canonical 16x128 rows are always measured, the scaled 256x4096
/// rows are measured unless `skip-large` is set, and the rendered report is
/// printed to stdout and written to the output path. Failing to create the
/// output directory or to write the file only emits a warning on stderr; the
/// exit code is still `ExitCode::SUCCESS`.
#[allow(clippy::too_many_lines)]
pub fn parse_and_run(args: &[String]) -> ExitCode {
    let flags = match parse_flags(args) {
        Ok(f) => f,
        Err(message) => return usage_error(&message),
    };
    // A boolean flag is "on" when present with any value other than "false".
    let flag_on = |name: &str| flags.get(name).is_some_and(|v| v != "false");
    let quick = flag_on("quick");
    let skip_large = flag_on("skip-large");
    let big_k = flag_on("big-k");
    let detail_stage = flag_on("detail-stage");
    let tree_digest = flag_on("tree-digest");
    let compact = flag_on("compact");
    let out_path: std::path::PathBuf = flags.get("out").map_or_else(
        || std::path::PathBuf::from("reports/money_table.txt"),
        std::path::PathBuf::from,
    );
    let mut rows: Vec<MoneyRow> = Vec::new();
    let mut header_lines: Vec<String> = Vec::new();
    header_lines.push(String::from(
        "# R.7 Money Table — DSFB-GPU-Debug headline benchmark",
    ));
    header_lines.push(format!(
        "# generated: {}",
        chrono_like_timestamp_or_unknown()
    ));
    header_lines.push(format!(
        "# quick: {quick} skip-large: {skip_large} big-k: {big_k}"
    ));
    header_lines.push(String::from("#"));
    header_lines.push(String::from(
        "# Layer A: device evidence fabric (skip-bank, on-device digests).",
    ));
    header_lines.push(String::from(
        "# Layer B: throughput verdict summary (host bank stage admits compact candidates).",
    ));
    header_lines.push(String::from(
        "# Layer C: full audit court (every intermediate cell materialised host-side).",
    ));
    header_lines.push(String::from(
        "# Speedup is measured against CPU Layer B at the SAME (n_entities, n_windows) scale.",
    ));
    header_lines.push(String::from("#"));
    // The graph-capture probe contributes a status line and, when captured, a hash line.
    let (graph_status_line, graph_hash_line) = probe_graph_capture();
    header_lines.push(graph_status_line);
    if let Some(line) = graph_hash_line {
        header_lines.push(line);
    }
    header_lines.push(String::new());
    // Detail-stage mode replaces the table entirely with a per-stage profile.
    if detail_stage {
        #[cfg(feature = "cuda")]
        {
            let stage_iters = if quick { 5 } else { 20 };
            let stage_warmup = if quick { 1 } else { 3 };
            if compact {
                run_r11_compact_compare(stage_warmup, stage_iters);
            } else {
                run_r8_detail_stage(stage_warmup, stage_iters, tree_digest);
            }
        }
        #[cfg(not(feature = "cuda"))]
        {
            // Mark the flags as used so non-CUDA builds stay warning-free.
            let _ = (quick, tree_digest, compact);
            println!("--detail-stage requires --features cuda; nothing to profile");
        }
        return ExitCode::SUCCESS;
    }
    // Canonical 16x128 fixture: CPU Layer B baseline, GPU Layers A/B, then Layer C.
    {
        let plan = scale_iters(IterPlan::CANONICAL, quick);
        let n_entities = 16u32;
        let n_windows = 128u32;
        let k = 32u32;
        let mut contract = Contract::canonical();
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = synthesize(DEFAULT_SEED);
        let cpu_b = run_layer_b_cpu_always(&events, &contract, plan.warmup, plan.iters);
        // The CPU Layer B median is the speedup baseline for every canonical row.
        let cpu_b_med = median(&cpu_b);
        rows.push(MoneyRow {
            label: format!(
                "canonical 16x128 K={k:>3} CPU Layer B {}",
                quick_tag(quick)
            ),
            n_entities,
            n_windows,
            n_catalogs: 1,
            samples_us: cpu_b.clone(),
            baseline_us: cpu_b_med,
        });
        run_gpu_row(
            &mut rows,
            &format!("canonical 16x128 K={k:>3} GPU Layer A "),
            n_entities,
            n_windows,
            k,
            GpuRow::LayerA,
            &events,
            &contract,
            plan,
            cpu_b_med,
            quick,
        );
        run_gpu_row(
            &mut rows,
            &format!("canonical 16x128 K={k:>3} GPU Layer B "),
            n_entities,
            n_windows,
            k,
            GpuRow::LayerB,
            &events,
            &contract,
            plan,
            cpu_b_med,
            quick,
        );
        let cpu_c = run_layer_c_cpu(&events, &contract, plan.warmup, plan.iters);
        rows.push(MoneyRow {
            label: format!(
                "canonical 16x128 K= 1 CPU Layer C (audit) {}",
                quick_tag(quick)
            ),
            n_entities,
            n_windows,
            n_catalogs: 1,
            samples_us: cpu_c,
            baseline_us: cpu_b_med,
        });
        #[cfg(feature = "cuda")]
        {
            let gpu_c = run_layer_c_gpu(&events, &contract, plan.warmup, plan.iters);
            rows.push(MoneyRow {
                label: format!(
                    "canonical 16x128 K= 1 GPU Layer C (audit) {}",
                    quick_tag(quick)
                ),
                n_entities,
                n_windows,
                n_catalogs: 1,
                samples_us: gpu_c,
                baseline_us: cpu_b_med,
            });
        }
    }
    // Scaled 256x4096 fixture: CPU Layer B baseline plus a K sweep on the GPU.
    if !skip_large {
        let n_entities = 256u32;
        let n_windows = 4096u32;
        let mut contract = Contract::scaled(n_entities, n_windows);
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4);
        let plan_cpu = scale_iters(IterPlan::LARGE_K1, quick);
        let cpu_b = run_layer_b_cpu_always(&events, &contract, plan_cpu.warmup, plan_cpu.iters);
        let cpu_b_med = median(&cpu_b);
        rows.push(MoneyRow {
            label: format!(
                "scaled 256x4096 K= 1 CPU Layer B {}",
                quick_tag(quick)
            ),
            n_entities,
            n_windows,
            n_catalogs: 1,
            samples_us: cpu_b.clone(),
            baseline_us: cpu_b_med,
        });
        // `big-k` extends the sweep with the K = 128 point.
        let large_sweep: &[(u32, IterPlan)] = if big_k {
            &[
                (1u32, IterPlan::LARGE_K1),
                (16, IterPlan::LARGE_K16),
                (64, IterPlan::LARGE_K64),
                (128, IterPlan::LARGE_K128),
            ]
        } else {
            &[
                (1u32, IterPlan::LARGE_K1),
                (16, IterPlan::LARGE_K16),
                (64, IterPlan::LARGE_K64),
            ]
        };
        for &(k, plan_const) in large_sweep {
            let plan = scale_iters(plan_const, quick);
            run_gpu_row(
                &mut rows,
                &format!("scaled 256x4096 K={k:>3} GPU Layer A "),
                n_entities,
                n_windows,
                k,
                GpuRow::LayerA,
                &events,
                &contract,
                plan,
                cpu_b_med,
                quick,
            );
            run_gpu_row(
                &mut rows,
                &format!("scaled 256x4096 K={k:>3} GPU Layer B "),
                n_entities,
                n_windows,
                k,
                GpuRow::LayerB,
                &events,
                &contract,
                plan,
                cpu_b_med,
                quick,
            );
        }
        // Layer C is never measured at this scale; the row only records that fact.
        rows.push(MoneyRow::not_run(
            "scaled 256x4096 K= 1 Layer C (audit) [not run: transcript materialisation cost]",
            n_entities,
            n_windows,
            1,
        ));
    }
    let report = render_report(&header_lines, &rows);
    print!("{report}");
    if let Some(parent) = out_path.parent() {
        // Best effort, but no longer silent: the write below would otherwise only
        // report a generic failure and hide why the directory could not be made.
        if let Err(e) = std::fs::create_dir_all(parent) {
            eprintln!("warning: could not create {}: {e}", parent.display());
        }
    }
    match std::fs::write(&out_path, &report) {
        Ok(()) => {
            println!("wrote money table -> {}", out_path.display());
            ExitCode::SUCCESS
        }
        Err(e) => {
            // The report was already printed to stdout, so this is only a warning.
            eprintln!("warning: could not write {}: {e}", out_path.display());
            ExitCode::SUCCESS
        }
    }
}
/// Selects which GPU benchmark runner `run_gpu_row` dispatches to.
#[derive(Clone, Copy)]
enum GpuRow {
/// Dispatch to `run_layer_a`.
LayerA,
/// Dispatch to `run_layer_b`.
LayerB,
}
/// Appends one GPU benchmark row to `rows`.
///
/// In CUDA builds the runner selected by `which` is executed under
/// `catch_unwind`; if it panics, a "not run" row is recorded instead of
/// aborting the whole table. `k == 1` is forwarded to the runners as batch `0`,
/// any other `k` is forwarded unchanged. `baseline_us` is stored on the row so
/// the renderer can compute a speedup.
///
/// In non-CUDA builds nothing is measured and a fixed "skipped" row is pushed.
#[allow(clippy::too_many_arguments)]
fn run_gpu_row(
    rows: &mut Vec<MoneyRow>,
    label_prefix: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    which: GpuRow,
    events: &[TraceEvent],
    contract: &Contract,
    plan: IterPlan,
    baseline_us: u128,
    quick: bool,
) {
    #[cfg(feature = "cuda")]
    {
        let batch = if k == 1 { 0 } else { k };
        let label = format!("{label_prefix}{}", quick_tag(quick));
        // `AssertUnwindSafe` already lets the closure borrow `events` and
        // `contract` directly, so no per-row deep copy of the fixture is needed.
        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| match which {
            GpuRow::LayerA => run_layer_a(events, contract, batch, plan.warmup, plan.iters),
            GpuRow::LayerB => run_layer_b(events, contract, batch, plan.warmup, plan.iters),
        }));
        match result {
            Ok(samples) => rows.push(MoneyRow {
                label,
                n_entities,
                n_windows,
                n_catalogs: k,
                samples_us: samples,
                baseline_us,
            }),
            Err(_) => {
                // The panic payload is dropped; the row label carries the outcome.
                let row_label = format!("{label} [not run: alloc refused or kernel error]");
                rows.push(MoneyRow::not_run(&row_label, n_entities, n_windows, k));
            }
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        // Mark every parameter as used so non-CUDA builds stay warning-free.
        let _ = (
            label_prefix,
            n_entities,
            n_windows,
            k,
            which,
            events,
            contract,
            plan,
            baseline_us,
            quick,
        );
        rows.push(MoneyRow::not_run(
            "(GPU rows skipped: not built with --features cuda)",
            n_entities,
            n_windows,
            k,
        ));
    }
}
/// One line of the money table: a label, the scale it was measured at, the raw
/// samples and the baseline used for the speedup column.
struct MoneyRow {
    /// Text shown in the first column.
    label: String,
    /// Entity count of the fixture.
    n_entities: u32,
    /// Window count of the fixture.
    n_windows: u32,
    /// Catalog count (K) the row is normalised by.
    n_catalogs: u32,
    /// Raw timing samples; an empty vector marks a row that was not run.
    samples_us: Vec<u128>,
    /// Baseline median used for the speedup column; `0` means "no baseline".
    baseline_us: u128,
}

impl MoneyRow {
    /// Creates a placeholder row with no samples and no baseline.
    fn not_run(label: &str, n_entities: u32, n_windows: u32, n_catalogs: u32) -> Self {
        let label = label.to_owned();
        let samples_us = Vec::new();
        Self {
            label,
            n_entities,
            n_windows,
            n_catalogs,
            samples_us,
            baseline_us: 0,
        }
    }
}
/// Renders the header lines followed by the money table as one string.
///
/// Each header line is emitted verbatim, then the column header and rule, then
/// one line per row. Rows without samples render as `n/a` in every column.
/// For measured rows the median sample drives every derived column; all
/// arithmetic is integer, so the rates are truncated towards zero.
///
/// The speedup column is `baseline_us / (median / n_catalogs)` with one decimal
/// digit. It is evaluated as `baseline_us * n_catalogs / median` so the integer
/// truncation of the displayed `per_catalog_us` value does not inflate the ratio.
fn render_report(header_lines: &[String], rows: &[MoneyRow]) -> String {
    use core::fmt::Write;
    let mut out = String::new();
    for line in header_lines {
        out.push_str(line);
        out.push('\n');
    }
    out.push_str(
        " label \
         | median_us | per_catalog_us | catalogs/sec | cells/sec \
         | det_evals/sec | speedup_vs_cpu_b\n",
    );
    out.push_str(
        " ------------------------------------------------------- \
         | ---------- | -------------- | ------------ | ------------- \
         | -------------- | ----------------\n",
    );
    let n_detectors = u128::from(dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32);
    // Microseconds per second, used to turn per-median counts into per-second rates.
    let one_sec = 1_000_000u128;
    for row in rows {
        if row.samples_us.is_empty() {
            let _ = writeln!(
                out,
                " {:<55}| n/a | n/a | n/a | n/a | n/a | n/a",
                row.label
            );
            continue;
        }
        let med = median(&row.samples_us);
        let catalogs = u128::from(row.n_catalogs);
        let cells = catalogs * u128::from(row.n_entities) * u128::from(row.n_windows);
        let det_evals = cells * n_detectors;
        let per_catalog = if catalogs > 0 { med / catalogs } else { med };
        let catalogs_per_sec = if med > 0 { catalogs * one_sec / med } else { 0 };
        let cells_per_sec = if med > 0 { cells * one_sec / med } else { 0 };
        let det_evals_per_sec = if med > 0 {
            det_evals * one_sec / med
        } else {
            0
        };
        let speedup = if med > 0 && row.baseline_us > 0 {
            // Multiply before dividing: dividing by the already-truncated
            // `per_catalog` overstates the speedup (e.g. med = 63, K = 32 would
            // use a per-catalog time of 1 us instead of ~1.97 us).
            let ratio_times10 = if catalogs > 0 {
                (row.baseline_us * 10 * catalogs) / med
            } else {
                (row.baseline_us * 10) / med
            };
            let whole = ratio_times10 / 10;
            let tenth = ratio_times10 % 10;
            format!("{whole:>10}.{tenth}x")
        } else {
            String::from(" n/a")
        };
        let _ = writeln!(
            out,
            " {:<55}| {med:>10} | {per_catalog:>14} | {catalogs_per_sec:>12} | {cells_per_sec:>13} | {det_evals_per_sec:>14} | {speedup:>16}",
            row.label
        );
    }
    out
}
/// Returns the element at index `len / 2` of the sorted samples (the upper
/// middle for even lengths), or `0` for an empty slice.
fn median(samples: &[u128]) -> u128 {
    let mut ordered: Vec<u128> = samples.into();
    ordered.sort_unstable();
    // `get` yields `None` only for an empty input, which maps to 0.
    ordered.get(ordered.len() / 2).copied().unwrap_or(0)
}
/// Returns `p` unchanged in normal mode; in quick mode both counts are divided
/// by five (rounding up) and clamped to at least one.
fn scale_iters(p: IterPlan, quick: bool) -> IterPlan {
    let shrink = |count: usize| count.div_ceil(5).max(1);
    if quick {
        IterPlan {
            warmup: shrink(p.warmup),
            iters: shrink(p.iters),
        }
    } else {
        p
    }
}
/// Label suffix marking rows measured in quick mode; empty otherwise.
fn quick_tag(quick: bool) -> &'static str {
    match quick {
        true => "[quick]",
        false => "",
    }
}
/// Produces the `# graph_status:` header line and, when a graph was captured,
/// a second `# graph_plan_hash:` line with the plan hash in lowercase hex.
///
/// CUDA builds allocate a pinned-async workspace on the canonical contract and
/// attempt one graph build; every failure mode is reported in the status line.
/// Non-CUDA builds return a fixed "skipped" status and no hash line.
fn probe_graph_capture() -> (String, Option<String>) {
    #[cfg(feature = "cuda")]
    {
        use core::fmt::Write;
        let mut contract = Contract::canonical();
        contract.pin_bank_hash(bank_hash());
        contract.pin_detector_registry_hash(registry_hash());
        let events = synthesize(DEFAULT_SEED);
        let mut workspace = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(e) => {
                let status =
                    format!("# graph_status: error allocating pinned-async workspace ({e:?})");
                return (status, None);
            }
        };
        let capture =
            match build_gpu_throughput_graph_or_demote(&events, &contract, &mut workspace) {
                Ok((_case, status)) => status,
                Err(e) => {
                    return (format!("# graph_status: error during probe ({e:?})"), None);
                }
            };
        match capture {
            GraphCaptureStatus::Captured { plan_hash } => {
                let mut digest_hex = String::with_capacity(64);
                for byte in &plan_hash {
                    let _ = write!(digest_hex, "{byte:02x}");
                }
                let status = String::from("# graph_status: captured");
                (status, Some(format!("# graph_plan_hash: {digest_hex}")))
            }
            GraphCaptureStatus::Demoted { reason } => {
                (format!("# graph_status: demoted ({reason})"), None)
            }
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        let status = "# graph_status: skipped (built without --features cuda)".to_owned();
        (status, None)
    }
}
/// Formats the current wall-clock time as `"<secs> epoch seconds"`, or
/// `"unknown"` when the system clock reads earlier than the Unix epoch.
fn chrono_like_timestamp_or_unknown() -> String {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or_else(
            |_| String::from("unknown"),
            |since_epoch| format!("{} epoch seconds", since_epoch.as_secs()),
        )
}
/// Runs the R.8 per-stage bottleneck profile at three fixed scale points.
///
/// When `tree_digest` is set the work is handed to
/// `run_r8_5_tree_digest_compare` instead. Otherwise each scale point gets its
/// own contract, fixture and workspace; `warmup` untimed dispatches are followed
/// by `iters` timed ones, and the per-stage medians are printed and written by
/// `print_and_write_r8`. A failed workspace allocation skips that scale point; a
/// dispatch error during measurement aborts the whole profile.
#[cfg(feature = "cuda")]
fn run_r8_detail_stage(warmup: usize, iters: usize, tree_digest: bool) {
    use std::time::Instant;
    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace_timed, GpuWorkspace, R8HostStageTimings,
        R8StageTimings,
    };
    if tree_digest {
        return run_r8_5_tree_digest_compare(warmup, iters);
    }
    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];
    for (label, n_entities, n_windows, k) in points {
        println!();
        println!("=== R.8 Bottleneck Profile — {label} ===");
        println!(" warmup: {warmup} iters: {iters}");
        // The 16x128 point uses the canonical contract and fixture; the others are scaled.
        let is_canonical = n_entities == 16 && n_windows == 128;
        let mut pinned = if is_canonical {
            Contract::canonical()
        } else {
            Contract::scaled(n_entities, n_windows)
        };
        pinned.pin_bank_hash(bank_hash());
        pinned.pin_detector_registry_hash(registry_hash());
        let contract = pinned;
        let events = if is_canonical {
            synthesize(DEFAULT_SEED)
        } else {
            synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4)
        };
        let mut ws = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(_) => {
                println!(" workspace alloc refused; skipping {label}");
                continue;
            }
        };
        // Warm-up dispatches: results (including errors) are deliberately ignored.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace_timed(&events, &contract, &mut ws);
        }
        let mut wall_us: Vec<u128> = Vec::with_capacity(iters);
        let mut devs: Vec<R8StageTimings> = Vec::with_capacity(iters);
        let mut hosts: Vec<R8HostStageTimings> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome =
                build_gpu_throughput_pinned_async_on_workspace_timed(&events, &contract, &mut ws);
            let elapsed_ns = started.elapsed().as_nanos();
            let (case, dev, host) = match outcome {
                Ok(parts) => parts,
                Err(e) => {
                    println!(" dispatch error during R.8 measurement: {e:?}");
                    return;
                }
            };
            std::hint::black_box(case);
            devs.push(dev);
            hosts.push(host);
            wall_us.push(elapsed_ns / 1_000);
        }
        print_and_write_r8(
            label,
            n_entities,
            n_windows,
            k,
            median_stage(&devs),
            median_host(&hosts),
            median_u128(&wall_us),
        );
    }
}
/// Median of `samples` (element at index `len / 2` after sorting), `0` when empty.
///
/// Thin alias for `median`, kept so the CUDA-only profiling paths retain their
/// existing call sites without carrying a second copy of the same logic.
#[cfg(feature = "cuda")]
fn median_u128(samples: &[u128]) -> u128 {
    median(samples)
}
/// Field-wise median of the device stage timings.
///
/// Each field is sorted independently and the element at index `len / 2` is
/// taken, so the result need not correspond to any single sample. Returns the
/// default value for an empty slice.
///
/// Sorting uses `f32::total_cmp`: the previous `partial_cmp(..).unwrap_or(Equal)`
/// comparator is not a total order when a NaN is present, which `sort_by` is
/// allowed to answer with a panic or an unspecified order.
#[cfg(feature = "cuda")]
fn median_stage(
    samples: &[dsfb_gpu_debug_cuda::R8StageTimings],
) -> dsfb_gpu_debug_cuda::R8StageTimings {
    if samples.is_empty() {
        return dsfb_gpu_debug_cuda::R8StageTimings::default();
    }
    let mid = samples.len() / 2;
    // Extracts one field from every sample and returns that field's median.
    let pick = |f: fn(&dsfb_gpu_debug_cuda::R8StageTimings) -> f32| -> f32 {
        let mut v: Vec<f32> = samples.iter().map(f).collect();
        v.sort_unstable_by(f32::total_cmp);
        v[mid]
    };
    dsfb_gpu_debug_cuda::R8StageTimings {
        h2d_us: pick(|s| s.h2d_us),
        residual_us: pick(|s| s.residual_us),
        sign_us: pick(|s| s.sign_us),
        detector_us: pick(|s| s.detector_us),
        consensus_us: pick(|s| s.consensus_us),
        candidate_us: pick(|s| s.candidate_us),
        digests_us: pick(|s| s.digests_us),
        d2h_us: pick(|s| s.d2h_us),
        total_device_us: pick(|s| s.total_device_us),
    }
}
/// Field-wise median of the host stage timings.
///
/// `features_us` and `bank_and_finalize_us` are sorted independently and the
/// element at index `len / 2` of each is returned. Returns the default value
/// for an empty slice.
///
/// Sorting uses `f32::total_cmp` so a NaN sample cannot hand `sort_by` a
/// comparator that is not a total order.
#[cfg(feature = "cuda")]
fn median_host(
    samples: &[dsfb_gpu_debug_cuda::R8HostStageTimings],
) -> dsfb_gpu_debug_cuda::R8HostStageTimings {
    if samples.is_empty() {
        return dsfb_gpu_debug_cuda::R8HostStageTimings::default();
    }
    let mid = samples.len() / 2;
    let mut features: Vec<f32> = samples.iter().map(|s| s.features_us).collect();
    features.sort_unstable_by(f32::total_cmp);
    let mut bank: Vec<f32> = samples.iter().map(|s| s.bank_and_finalize_us).collect();
    bank.sort_unstable_by(f32::total_cmp);
    dsfb_gpu_debug_cuda::R8HostStageTimings {
        features_us: features[mid],
        bank_and_finalize_us: bank[mid],
    }
}
/// Formats one R.8 bottleneck profile, prints it, and writes it to
/// `reports/r8_bottleneck_{n_entities}x{n_windows}_K{k}.txt`.
///
/// Percentages are relative to the median wall time; when that is zero the sum
/// of the timed segments is used as the anchor instead. The report lists every
/// stage, the segment total, the event-measured device total, and the three
/// largest stages. A failed file write only emits a warning on stderr.
///
/// The top-3 ranking sorts with `f32::total_cmp` so a NaN timing cannot hand
/// `sort_by` a comparator that is not a total order.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments, clippy::cast_precision_loss)]
fn print_and_write_r8(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    dev: dsfb_gpu_debug_cuda::R8StageTimings,
    host: dsfb_gpu_debug_cuda::R8HostStageTimings,
    med_wall_us: u128,
) {
    use core::fmt::Write;
    let rows: [(&str, f32); 10] = [
        ("feature generation (host)", host.features_us),
        ("H2D", dev.h2d_us),
        ("residual", dev.residual_us),
        ("sign (drift/slew EWMA)", dev.sign_us),
        ("detector", dev.detector_us),
        ("consensus", dev.consensus_us),
        ("candidate collapse", dev.candidate_us),
        ("digests (4 kernels)", dev.digests_us),
        ("D2H", dev.d2h_us),
        ("bank + case finalize (host)", host.bank_and_finalize_us),
    ];
    let total_measured: f32 = rows.iter().map(|(_, us)| us).sum();
    #[allow(clippy::cast_possible_truncation)]
    let anchor = if med_wall_us == 0 {
        total_measured
    } else {
        (med_wall_us as u64) as f32
    };
    // Share of the anchor in percent; 0 when the anchor itself is not positive.
    let pct_of = |us: f32| -> f32 {
        if anchor > 0.0 {
            (us / anchor) * 100.0
        } else {
            0.0
        }
    };
    let mut out = String::new();
    let _ = writeln!(out, "=== R.8 Bottleneck Profile — {label} ===");
    let _ = writeln!(
        out,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    let _ = writeln!(out, "median wall (host Instant): {med_wall_us} us");
    let _ = writeln!(out, "sum of timed segments : {total_measured:.1} us");
    out.push('\n');
    out.push_str(" Stage us % of wall\n");
    out.push_str(" -------------------------- ---------- -----------\n");
    for (name, us) in &rows {
        let pct = pct_of(*us);
        let _ = writeln!(out, " {name:<26} {us:>10.1} {pct:>9.1}%");
    }
    out.push_str(" -------------------------- ---------- -----------\n");
    let total_pct = pct_of(total_measured);
    let _ = writeln!(
        out,
        " total (timed segments) {total_measured:>10.1} {total_pct:>9.1}%"
    );
    let total_device_us = dev.total_device_us;
    let total_device_pct = pct_of(total_device_us);
    let _ = writeln!(
        out,
        " total_device_us (event) {total_device_us:>10.1} {total_device_pct:>9.1}%"
    );
    // Rank stages by absolute time, largest first.
    let mut sorted: Vec<(&str, f32)> = rows.to_vec();
    sorted.sort_by(|a, b| b.1.total_cmp(&a.1));
    out.push_str("\nTop 3 stages by absolute time:\n");
    for (i, (name, us)) in sorted.iter().take(3).enumerate() {
        let pct = pct_of(*us);
        let rank = i + 1;
        let _ = writeln!(out, " {rank}. {name} — {us:.1} us ({pct:.1}% of wall)");
    }
    print!("{out}");
    let filename = format!("r8_bottleneck_{n_entities}x{n_windows}_K{k}.txt");
    let path = std::path::Path::new("reports").join(filename);
    // Best effort: a failure here surfaces through the write warning below.
    let _ = std::fs::create_dir_all("reports");
    if let Err(e) = std::fs::write(&path, &out) {
        eprintln!("warning: could not write {}: {e}", path.display());
    } else {
        println!("wrote R.8 profile -> {}", path.display());
    }
}
/// Compares the serial-digest and tree-digest dispatch paths at three fixed
/// scale points (R.8.5).
///
/// For every scale point two workspaces are allocated, each path gets `warmup`
/// untimed dispatches followed by `iters` timed ones, and the median wall times
/// plus their ratio are handed to `print_and_write_r8_5`. A failed workspace
/// allocation skips that scale point; a dispatch error during measurement
/// aborts the whole comparison.
///
/// Dispatch errors are printed as the error value itself (`{e:?}`), matching
/// the R.8 and R.11 paths, instead of the whole `Result` wrapper.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_lines)]
fn run_r8_5_tree_digest_compare(warmup: usize, iters: usize) {
    use std::time::Instant;
    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace,
        build_gpu_throughput_pinned_async_on_workspace_tree, GpuWorkspace,
    };
    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];
    for &(label, n_entities, n_windows, k) in &points {
        println!();
        println!("=== R.8.5 tree-digest comparison — {label} ===");
        println!(" warmup: {warmup} iters: {iters}");
        // The 16x128 point uses the canonical contract and fixture; the others are scaled.
        let is_canonical = n_entities == 16 && n_windows == 128;
        let contract = {
            let mut c = if is_canonical {
                Contract::canonical()
            } else {
                Contract::scaled(n_entities, n_windows)
            };
            c.pin_bank_hash(bank_hash());
            c.pin_detector_registry_hash(registry_hash());
            c
        };
        let events = if is_canonical {
            synthesize(DEFAULT_SEED)
        } else {
            synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4)
        };
        let mut ws_serial = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(w) => w,
            Err(e) => {
                println!(" workspace alloc refused: {e:?}; skipping {label}");
                continue;
            }
        };
        let mut ws_tree = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(w) => w,
            Err(e) => {
                println!(" workspace alloc refused: {e:?}; skipping {label}");
                continue;
            }
        };
        // Serial-digest path: warm-up results are ignored, then timed dispatches.
        for _ in 0..warmup {
            let _ =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
        }
        let mut serial_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let t0 = Instant::now();
            let result =
                build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial);
            let dt = t0.elapsed().as_micros();
            match result {
                Ok(case) => {
                    std::hint::black_box(case);
                    serial_us.push(dt);
                }
                Err(e) => {
                    println!(" serial-digest dispatch error: {e:?}");
                    return;
                }
            }
        }
        // Tree-digest path, measured the same way on its own workspace.
        for _ in 0..warmup {
            let _ = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
        }
        let mut tree_us: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let t0 = Instant::now();
            let result = build_gpu_throughput_pinned_async_on_workspace_tree(
                &events,
                &contract,
                &mut ws_tree,
            );
            let dt = t0.elapsed().as_micros();
            match result {
                Ok(case) => {
                    std::hint::black_box(case);
                    tree_us.push(dt);
                }
                Err(e) => {
                    println!(" tree-digest dispatch error: {e:?}");
                    return;
                }
            }
        }
        let med_serial = median_u128(&serial_us);
        let med_tree = median_u128(&tree_us);
        // A zero tree median yields a ratio of 0.0 instead of dividing by zero.
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let ratio = if med_tree > 0 {
            (med_serial as u64) as f64 / (med_tree as u64) as f64
        } else {
            0.0
        };
        print_and_write_r8_5(label, n_entities, n_windows, k, med_serial, med_tree, ratio);
    }
}
/// Formats one R.8.5 serial-vs-tree comparison, prints it, and writes it to
/// `reports/r8_5_tree_compare_{n_entities}x{n_windows}_K{k}.txt`.
///
/// A failed file write only emits a warning on stderr.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments)]
fn print_and_write_r8_5(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    med_serial_us: u128,
    med_tree_us: u128,
    ratio: f64,
) {
    use core::fmt::Write;
    // Fixed explanatory footer, one entry per output line.
    const NOTES: [&str; 7] = [
        " * Both paths run the same 5 pipeline kernels (residual, sign, detector,",
        " consensus, candidate). They differ only in the digest stage: serial",
        " uses 4 single-thread `*_digest_kernel_batched` kernels; tree uses one",
        " block per chunk (~2048 chunks at 256x4096 with 16 KiB chunks) feeding",
        " a final root SHA-256 over the ordered leaf digests + domain separator.",
        " * Stage hash bytes differ between modes by construction; case-file",
        " metadata records `digest_mode` so replay catches a mode mismatch.",
    ];
    let mut report = String::new();
    let _ = writeln!(report, "=== R.8.5 tree-digest comparison — {label} ===");
    let _ = writeln!(
        report,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    report.push('\n');
    let _ = writeln!(report, " serial-digest median wall: {med_serial_us:>10} us");
    let _ = writeln!(report, " tree-digest median wall: {med_tree_us:>10} us");
    let _ = writeln!(report, " wall-time ratio (serial / tree): {ratio:.2}x");
    report.push('\n');
    report.push_str("Notes:\n");
    for note in NOTES {
        report.push_str(note);
        report.push('\n');
    }
    print!("{report}");
    let path = std::path::Path::new("reports").join(format!(
        "r8_5_tree_compare_{n_entities}x{n_windows}_K{k}.txt"
    ));
    // Best effort: a failure here surfaces through the write warning below.
    let _ = std::fs::create_dir_all("reports");
    match std::fs::write(&path, &report) {
        Ok(()) => println!("wrote R.8.5 comparison -> {}", path.display()),
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}
/// Compares the serial, tree and tree+compact dispatch paths at three fixed
/// scale points (R.11).
///
/// For every scale point the fixture hashes are computed once, three workspaces
/// are allocated, and each path gets `warmup` untimed dispatches followed by
/// `iters` timed ones. The three medians and the two ratios against the compact
/// path go to `print_and_write_r11`. A failed workspace allocation skips that
/// scale point; a dispatch error during measurement aborts the comparison.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_lines)]
fn run_r11_compact_compare(warmup: usize, iters: usize) {
    use dsfb_gpu_debug_core::casefile::FixtureHashes;
    use dsfb_gpu_debug_core::window::compute_features;
    use dsfb_gpu_debug_cuda::{
        build_gpu_throughput_pinned_async_on_workspace,
        build_gpu_throughput_pinned_async_on_workspace_tree,
        build_gpu_throughput_pinned_async_on_workspace_tree_compact, GpuWorkspace,
    };

    /// Runs `dispatch` `warmup` times with results ignored, then `iters` times
    /// under a wall-clock timer. Returns the per-iteration microseconds, or the
    /// first error hit during the timed phase.
    fn measure_us<T, E>(
        warmup: usize,
        iters: usize,
        mut dispatch: impl FnMut() -> Result<T, E>,
    ) -> Result<Vec<u128>, E> {
        use std::time::Instant;
        for _ in 0..warmup {
            let _ = dispatch();
        }
        let mut samples: Vec<u128> = Vec::with_capacity(iters);
        for _ in 0..iters {
            let started = Instant::now();
            let outcome = dispatch();
            let elapsed = started.elapsed().as_micros();
            std::hint::black_box(outcome?);
            samples.push(elapsed);
        }
        Ok(samples)
    }

    let points: [(&str, u32, u32, u32); 3] = [
        ("canonical 16x128 K=1", 16, 128, 1),
        ("mid-scale 64x512 K=1", 64, 512, 1),
        ("full-scale 256x4096 K=1", 256, 4096, 1),
    ];
    for (label, n_entities, n_windows, k) in points {
        println!();
        println!("=== R.11 compact-verdict comparison — {label} ===");
        println!(" warmup: {warmup} iters: {iters}");
        // The 16x128 point uses the canonical contract and fixture; the others are scaled.
        let is_canonical = n_entities == 16 && n_windows == 128;
        let mut pinned = if is_canonical {
            Contract::canonical()
        } else {
            Contract::scaled(n_entities, n_windows)
        };
        pinned.pin_bank_hash(bank_hash());
        pinned.pin_detector_registry_hash(registry_hash());
        let contract = pinned;
        let events = if is_canonical {
            synthesize(DEFAULT_SEED)
        } else {
            synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4)
        };
        // Computed once per scale point, outside every timed loop.
        let features = compute_features(
            &events,
            contract.n_windows,
            contract.n_entities,
            u64::from(contract.window_size_ms) * 1_000_000,
        );
        let fixture = FixtureHashes::compute(&events, &features);
        let mut ws_serial = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(_) => {
                println!(" workspace alloc refused; skipping {label}");
                continue;
            }
        };
        let mut ws_tree = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(_) => {
                println!(" workspace alloc refused; skipping {label}");
                continue;
            }
        };
        let mut ws_compact = match GpuWorkspace::new_with_pinned_async(&contract) {
            Ok(ws) => ws,
            Err(_) => {
                println!(" workspace alloc refused; skipping {label}");
                continue;
            }
        };
        let serial_us = match measure_us(warmup, iters, || {
            build_gpu_throughput_pinned_async_on_workspace(&events, &contract, &mut ws_serial)
        }) {
            Ok(samples) => samples,
            Err(e) => {
                println!(" serial-digest dispatch error: {e:?}");
                return;
            }
        };
        let tree_us = match measure_us(warmup, iters, || {
            build_gpu_throughput_pinned_async_on_workspace_tree(&events, &contract, &mut ws_tree)
        }) {
            Ok(samples) => samples,
            Err(e) => {
                println!(" tree-digest dispatch error: {e:?}");
                return;
            }
        };
        let compact_us = match measure_us(warmup, iters, || {
            build_gpu_throughput_pinned_async_on_workspace_tree_compact(
                &events,
                &contract,
                &mut ws_compact,
                &fixture,
            )
        }) {
            Ok(samples) => samples,
            Err(e) => {
                println!(" compact-verdict dispatch error: {e:?}");
                return;
            }
        };
        let med_serial = median_u128(&serial_us);
        let med_tree = median_u128(&tree_us);
        let med_compact = median_u128(&compact_us);
        // Ratio of a path's median to the compact median; 0.0 when the latter is zero.
        #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
        let versus_compact = |numerator: u128| -> f64 {
            if med_compact > 0 {
                (numerator as u64) as f64 / (med_compact as u64) as f64
            } else {
                0.0
            }
        };
        print_and_write_r11(
            label,
            n_entities,
            n_windows,
            k,
            med_serial,
            med_tree,
            med_compact,
            versus_compact(med_serial),
            versus_compact(med_tree),
        );
    }
}
/// Formats one R.11 serial / tree / compact comparison, prints it, and writes
/// it to `reports/r11_compact_compare_{n_entities}x{n_windows}_K{k}.txt`.
///
/// A failed file write only emits a warning on stderr.
#[cfg(feature = "cuda")]
#[allow(clippy::too_many_arguments)]
fn print_and_write_r11(
    label: &str,
    n_entities: u32,
    n_windows: u32,
    k: u32,
    med_serial_us: u128,
    med_tree_us: u128,
    med_compact_us: u128,
    ratio_serial_to_compact: f64,
    ratio_tree_to_compact: f64,
) {
    use core::fmt::Write;
    // Fixed explanatory footer, one entry per output line.
    const NOTES: [&str; 12] = [
        " * Serial = legacy R.6b path (4 single-thread digest kernels + non-compact builder).",
        " * Tree = R.8.5 path (block-parallel tree digest + non-compact builder).",
        " * Compact = R.11 path (tree digest + FixtureHashes precomputed once).",
        " * `FixtureHashes` is computed ONCE per scale point outside the iter loop,",
        " matching how a long-running deployment caller would amortise the input",
        " commitment hash across many dispatches against the same fixture.",
        " * Case files from all three paths are byte-identical for the serial vs.",
        " serial pairing, and the tree pair is internally byte-identical;",
        " serial ≠ tree because tree commits to chunked stage bytes + a domain",
        " separator. Compact ≡ tree byte-for-byte by construction.",
        " * Semantic Non-Bypass Axiom holds in every path: `bank_collapse` is the",
        " only mint of `BankAdmissionToken`. The compact builder reuses it.",
    ];
    let mut report = String::new();
    let _ = writeln!(report, "=== R.11 compact-verdict comparison — {label} ===");
    let _ = writeln!(
        report,
        "scale: n_entities={n_entities} n_windows={n_windows} K={k}"
    );
    report.push('\n');
    let _ = writeln!(report, " serial-digest : {med_serial_us:>10} us");
    let _ = writeln!(report, " tree-digest (R.8.5) : {med_tree_us:>10} us");
    let _ = writeln!(
        report,
        " tree-digest + compact (R.11) : {med_compact_us:>10} us"
    );
    report.push('\n');
    let _ = writeln!(
        report,
        " wall ratio serial / compact : {ratio_serial_to_compact:.2}x"
    );
    let _ = writeln!(
        report,
        " wall ratio tree / compact : {ratio_tree_to_compact:.2}x"
    );
    report.push('\n');
    report.push_str("Notes:\n");
    for note in NOTES {
        report.push_str(note);
        report.push('\n');
    }
    print!("{report}");
    let path = std::path::Path::new("reports").join(format!(
        "r11_compact_compare_{n_entities}x{n_windows}_K{k}.txt"
    ));
    // Best effort: a failure here surfaces through the write warning below.
    let _ = std::fs::create_dir_all("reports");
    match std::fs::write(&path, &report) {
        Ok(()) => println!("wrote R.11 comparison -> {}", path.display()),
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}