#![allow(clippy::expect_used)]
use std::process::ExitCode;
use std::time::Instant;
use dsfb_gpu_debug_core::bank::bank_hash;
use dsfb_gpu_debug_core::casefile::{build_cpu, build_cpu_throughput};
use dsfb_gpu_debug_core::contract::Contract;
use dsfb_gpu_debug_core::event::TraceEvent;
use dsfb_gpu_debug_core::fixture::{synthesize, synthesize_scaled, DEFAULT_SEED};
use dsfb_gpu_debug_core::motif::registry_hash;
#[cfg(feature = "cuda")]
use dsfb_gpu_debug_cuda::{
build_gpu_batched_throughput, build_gpu_batched_throughput_device_digests,
build_gpu_layer_a_batched, build_gpu_layer_a_on_workspace, build_gpu_on_workspace,
build_gpu_throughput_device_digests_on_workspace, build_gpu_throughput_on_workspace,
build_gpu_timed_on_workspace, BatchedGpuWorkspace, GpuWorkspace,
};
use super::{parse_flags, usage_error};
/// Parses the benchmark flags in `args` and runs the selected benchmarks.
///
/// Flags read here (all optional):
/// * `iters` (default 100) and `warmup` (default 10): timed / untimed loop counts.
/// * `detail`: enabled by any value other than `"false"`; when set without an
///   explicit `backend`, the backend defaults to `"gpu"`.
/// * `backend`: `cpu`, `gpu` or `both` (default `both`).
/// * `mode`: `audit`, `throughput` or `both` (default `audit`).
/// * `layer`: `A`, `B`, `C` or `all`; when present only the per-layer benchmarks
///   run (followed by `materialize-catalog`, if given) and the function returns.
/// * `scale` as `<entities>x<windows>`, or `scale-large` for 256x4096.
/// * `batch` and `device-digests`: only honoured when built with the `cuda` feature.
///
/// Values that cannot be parsed keep falling back to their defaults, but a
/// warning is now written to stderr so that a typo such as `--iters 1O0` does
/// not silently benchmark with different settings than the user asked for.
/// Unknown `mode` / `backend` values are likewise reported.
///
/// # Returns
/// The result of `usage_error` when `parse_flags` rejects `args`,
/// `ExitCode::from(1)` for an unknown `layer`, and `ExitCode::SUCCESS` otherwise.
#[allow(clippy::too_many_lines)]
pub fn parse_and_run(args: &[String]) -> ExitCode {
    /// Parses an optional flag value, warning on stderr (and yielding `None`)
    /// when a value was supplied but is not a valid `T`.
    fn numeric_flag<T: std::str::FromStr>(name: &str, raw: Option<&str>) -> Option<T> {
        let text = raw?;
        let parsed = text.parse::<T>().ok();
        if parsed.is_none() {
            eprintln!("warning: ignoring unparseable --{name} value {text:?}; using the default");
        }
        parsed
    }

    let flags = match parse_flags(args) {
        Ok(f) => f,
        Err(message) => return usage_error(&message),
    };
    let iters: usize =
        numeric_flag("iters", flags.get("iters").map(String::as_str)).unwrap_or(100);
    let warmup: usize =
        numeric_flag("warmup", flags.get("warmup").map(String::as_str)).unwrap_or(10);
    let detail = flags.get("detail").is_some_and(|v| v != "false");
    // `--detail` only exists for the GPU pipeline, so it implies the GPU backend
    // unless the caller chose one explicitly.
    let backend = if detail && !flags.contains_key("backend") {
        "gpu"
    } else {
        flags.get("backend").map_or("both", String::as_str)
    };
    let mode = flags.get("mode").map_or("audit", String::as_str);
    let layer = flags.get("layer").map(String::as_str);
    if !matches!(backend, "cpu" | "gpu" | "both") {
        eprintln!("warning: unknown --backend {backend:?}; expected cpu | gpu | both");
    }
    if !matches!(mode, "audit" | "throughput" | "both") {
        eprintln!("warning: unknown --mode {mode:?}; expected audit | throughput | both");
    }
    // An explicit, valid `--scale` wins; otherwise `--scale-large` selects 256x4096.
    let scale = flags
        .get("scale")
        .and_then(|s| {
            let parsed = parse_scale(s);
            if parsed.is_none() {
                eprintln!(
                    "warning: ignoring invalid --scale value {s:?}; expected <entities>x<windows> with both > 0"
                );
            }
            parsed
        })
        .or_else(|| {
            flags
                .get("scale-large")
                .filter(|v| v.as_str() != "false")
                .map(|_| (256u32, 4096u32))
        });
    let materialize_catalog: Option<u32> = numeric_flag(
        "materialize-catalog",
        flags.get("materialize-catalog").map(String::as_str),
    );
    #[cfg(feature = "cuda")]
    let batch: u32 = numeric_flag("batch", flags.get("batch").map(String::as_str)).unwrap_or(0);
    #[cfg(not(feature = "cuda"))]
    let _ = flags.get("batch");

    let (events, contract_dims, scaled_label) = match scale {
        None => (synthesize(DEFAULT_SEED), (16u32, 128u32), String::new()),
        Some((n_entities, n_windows)) => {
            let events = synthesize_scaled(DEFAULT_SEED, n_entities, n_windows, 4);
            (
                events,
                (n_entities, n_windows),
                format!(" [scaled {n_entities}x{n_windows}]"),
            )
        }
    };
    let mut contract = if scale.is_some() {
        Contract::scaled(contract_dims.0, contract_dims.1)
    } else {
        Contract::canonical()
    };
    contract.pin_bank_hash(bank_hash());
    contract.pin_detector_registry_hash(registry_hash());

    println!("dsfb-gpu-debug bench:{scaled_label}");
    println!(" events : {}", events.len());
    println!(" n_entities: {}", contract.n_entities);
    println!(" n_windows : {}", contract.n_windows);
    println!(" warmup : {warmup}");
    println!(" iters : {iters}");
    println!();

    let run_audit = mode == "audit" || mode == "both";
    let run_throughput = mode == "throughput" || mode == "both";

    // `--layer` is an alternative entry point: it runs the per-layer benchmarks
    // and returns without touching the backend/mode matrix below.
    if let Some(layer_spec) = layer {
        let layers: &[char] = match layer_spec {
            "A" | "a" => &['A'],
            "B" | "b" => &['B'],
            "C" | "c" => &['C'],
            "all" | "ABC" | "abc" => &['A', 'B', 'C'],
            other => {
                eprintln!("unknown --layer {other:?}; expected A | B | C | all");
                return ExitCode::from(1);
            }
        };
        let reports_dir = std::path::Path::new("reports");
        for &l in layers {
            run_layer_bench(
                l,
                &events,
                &contract,
                warmup,
                iters,
                #[cfg(feature = "cuda")]
                batch,
                #[cfg(not(feature = "cuda"))]
                0,
                Some(reports_dir),
            );
        }
        if let Some(j) = materialize_catalog {
            // At least one warm-up and one timed pass are forced here.
            run_materialize_catalog(j, &events, &contract, warmup.max(1), iters.max(1));
        }
        return ExitCode::SUCCESS;
    }

    if backend == "cpu" || backend == "both" {
        if run_audit {
            run_cpu_bench_audit(&events, &contract, warmup, iters);
        }
        if run_throughput {
            run_cpu_bench_throughput(&events, &contract, warmup, iters);
        }
    }

    #[cfg(feature = "cuda")]
    let device_digests = flags.get("device-digests").is_some_and(|v| v != "false");
    #[cfg(not(feature = "cuda"))]
    let _ = flags.get("device-digests");

    #[cfg(feature = "cuda")]
    if backend == "gpu" || backend == "both" {
        if detail {
            run_gpu_bench_with_detail(&events, &contract, warmup, iters);
        } else {
            if run_audit {
                run_gpu_bench_audit(&events, &contract, warmup, iters);
            }
            if run_throughput {
                run_gpu_bench_throughput(&events, &contract, warmup, iters);
            }
            if device_digests && run_throughput {
                run_gpu_bench_throughput_device_digests(&events, &contract, warmup, iters);
            }
            if batch > 0 {
                run_gpu_bench_batched(&events, &contract, batch, warmup, iters);
            }
            if batch > 0 && device_digests {
                run_gpu_bench_batched_device_digests(&events, &contract, batch, warmup, iters);
            }
        }
    }
    #[cfg(not(feature = "cuda"))]
    if backend == "gpu" || backend == "both" {
        let _ = detail;
        println!("GPU pipeline: built without --features cuda; skipping");
    }
    ExitCode::SUCCESS
}
/// Parses a `<n_entities>x<n_windows>` dimension spec such as `"256x4096"`.
///
/// The text is split at the first `'x'`; both halves must parse as `u32` and
/// both must be non-zero. Any other input yields `None`.
fn parse_scale(s: &str) -> Option<(u32, u32)> {
    let mut halves = s.splitn(2, 'x');
    let entities = halves.next()?.parse::<u32>().ok()?;
    let windows = halves.next()?.parse::<u32>().ok()?;
    (entities != 0 && windows != 0).then_some((entities, windows))
}
/// Times `build_cpu` over `iters` passes (after `warmup` untimed passes) and
/// prints the summary through `report`.
///
/// Each sample is the wall-clock duration of one call, in microseconds.
fn run_cpu_bench_audit(events: &[TraceEvent], contract: &Contract, warmup: usize, iters: usize) {
    // Untimed passes; the result only goes through `black_box` so the call is kept.
    (0..warmup).for_each(|_| {
        std::hint::black_box(build_cpu(events, contract));
    });
    let timings_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let built = build_cpu(events, contract);
            // Read the clock before handing the result to `black_box`.
            let elapsed_us = started.elapsed().as_micros();
            std::hint::black_box(built);
            elapsed_us
        })
        .collect();
    report("CPU pipeline (Audit, build_cpu)", &timings_us);
}
/// Times `build_cpu_throughput` over `iters` passes (after `warmup` untimed
/// passes) and prints the summary through `report`.
///
/// Each sample is the wall-clock duration of one call, in microseconds.
fn run_cpu_bench_throughput(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    let mut remaining_warmup = warmup;
    while remaining_warmup > 0 {
        std::hint::black_box(build_cpu_throughput(events, contract));
        remaining_warmup -= 1;
    }
    let mut timings_us: Vec<u128> = Vec::with_capacity(iters);
    timings_us.extend((0..iters).map(|_| {
        let started = Instant::now();
        let built = build_cpu_throughput(events, contract);
        let elapsed_us = started.elapsed().as_micros();
        std::hint::black_box(built);
        elapsed_us
    }));
    report(
        "CPU pipeline (Throughput, build_cpu_throughput)",
        &timings_us,
    );
}
/// Times `build_gpu_on_workspace` on a single reusable `GpuWorkspace` and
/// prints the summary through `report`.
///
/// The workspace is allocated once, outside the timed region. Each sample is
/// the wall-clock duration of one call, in microseconds.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_audit(events: &[TraceEvent], contract: &Contract, warmup: usize, iters: usize) {
    use std::hint::black_box;

    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
    for _ in 0..warmup {
        black_box(build_gpu_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline"));
    }
    let timings_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let built =
                build_gpu_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
            let elapsed_us = started.elapsed().as_micros();
            black_box(built);
            elapsed_us
        })
        .collect();
    report(
        "GPU pipeline (Audit, workspace-resident, sm_75/80/89)",
        &timings_us,
    );
}
/// Benchmarks `build_gpu_batched_throughput` over `batch` catalogs on one
/// reusable `BatchedGpuWorkspace`.
///
/// Catalog 0 is a copy of `events`; when `batch > 1` the remaining `batch - 1`
/// catalogs come from `synthesize_courthouse_factory`, sized with the
/// contract's `n_entities` / `n_windows`. After the usual `report` summary, a
/// line with the per-catalog amortized latency and the cases/sec rate (both
/// derived from the median sample) is printed.
///
/// When there are no samples (`iters == 0`) the median is taken as 0 instead of
/// indexing into an empty vector, and a zero `batch` no longer divides by zero.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_batched(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) {
    let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    fixtures.push(events.to_vec());
    if batch > 1 {
        let extra = dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
            batch - 1,
            contract.n_entities,
            contract.n_windows,
            4,
        );
        fixtures.extend(extra);
    }
    let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();
    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");
    for _ in 0..warmup {
        let cases =
            build_gpu_batched_throughput(&mut workspace, &event_slices, contract).expect("CUDA");
        std::hint::black_box(cases);
    }
    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
    for _ in 0..iters {
        let t0 = Instant::now();
        let cases =
            build_gpu_batched_throughput(&mut workspace, &event_slices, contract).expect("CUDA");
        let dt = t0.elapsed().as_micros();
        std::hint::black_box(cases);
        samples_us.push(dt);
    }
    let label = format!("GPU pipeline (Batched K={batch}, workspace-resident, sm_75/80/89)");
    report(&label, &samples_us);

    let mut sorted = samples_us;
    sorted.sort_unstable();
    // Upper median, matching `report`; 0 when no samples were collected.
    let median_us = sorted.get(sorted.len() / 2).copied().unwrap_or(0);
    let catalogs = u128::from(batch);
    let per_catalog_us = median_us.checked_div(catalogs).unwrap_or(median_us);
    let cases_per_sec = if median_us > 0 {
        1_000_000u128 * catalogs / median_us
    } else {
        0
    };
    println!(
        " per-catalog amortized: {per_catalog_us} us throughput: {cases_per_sec} cases/sec"
    );
    println!();
}
/// Times `build_gpu_throughput_on_workspace` on a single reusable
/// `GpuWorkspace` and prints the summary through `report`.
///
/// The workspace is allocated once, outside the timed region. Each sample is
/// the wall-clock duration of one call, in microseconds.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_throughput(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    use std::hint::black_box;

    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
    for _ in 0..warmup {
        black_box(
            build_gpu_throughput_on_workspace(&mut workspace, events, contract)
                .expect("CUDA pipeline"),
        );
    }
    let mut timings_us: Vec<u128> = Vec::with_capacity(iters);
    timings_us.extend((0..iters).map(|_| {
        let started = Instant::now();
        let built = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
            .expect("CUDA pipeline");
        let elapsed_us = started.elapsed().as_micros();
        black_box(built);
        elapsed_us
    }));
    report(
        "GPU pipeline (Throughput, workspace-resident, sm_75/80/89)",
        &timings_us,
    );
}
/// Runs `build_gpu_timed_on_workspace` repeatedly and prints one inline summary
/// row for the host wall time plus one per stage timing it reports.
///
/// Host wall time is measured here with `Instant`; every other row is taken
/// from the timing record returned by the pipeline and converted with
/// `push_f32_as_u128`.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_with_detail(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    // Row labels, in print order; index-aligned with `readings` below.
    const STAGE_LABELS: [&str; 10] = [
        "device alloc ",
        "H2D (window feats) ",
        "k1 residual_field ",
        "k2 drift_slew_sign ",
        "k3 detector_motif ",
        "k4 consensus_grid ",
        "k5 candidate_coll. ",
        "D2H (all stages) ",
        "device free ",
        "device total ",
    ];

    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
    for _ in 0..warmup {
        let (case, _) =
            build_gpu_timed_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
        std::hint::black_box(case);
    }

    let mut wall_us: Vec<u128> = Vec::with_capacity(iters);
    let mut stage_us: [Vec<u128>; 10] = std::array::from_fn(|_| Vec::with_capacity(iters));
    for _ in 0..iters {
        let started = Instant::now();
        let (case, timings) =
            build_gpu_timed_on_workspace(&mut workspace, events, contract).expect("CUDA pipeline");
        let elapsed_us = started.elapsed().as_micros();
        std::hint::black_box(case);
        wall_us.push(elapsed_us);

        let readings = [
            timings.alloc_us,
            timings.h2d_us,
            timings.k1_residual_us,
            timings.k2_sign_us,
            timings.k3_detector_us,
            timings.k4_consensus_us,
            timings.k5_candidate_us,
            timings.d2h_us,
            timings.free_us,
            timings.total_us,
        ];
        for (bucket, reading) in stage_us.iter_mut().zip(readings) {
            push_f32_as_u128(bucket, reading);
        }
    }

    println!("GPU pipeline (build_gpu_timed --detail)");
    report_inline("host wall time ", &wall_us);
    for (label, bucket) in STAGE_LABELS.iter().zip(&stage_us) {
        report_inline(label, bucket);
    }
    println!();
}
/// Appends `val` to `samples` after rounding it to the nearest integer and
/// flooring the result at zero.
#[cfg(feature = "cuda")]
fn push_f32_as_u128(samples: &mut Vec<u128>, val: f32) {
    let non_negative = f32::max(val.round(), 0.0);
    samples.push(non_negative as u128);
}
/// Prints a one-line min / median / mean / max summary of `samples_us`,
/// prefixed with `label`. Prints nothing when `samples_us` is empty.
///
/// The median is the element at index `len / 2` of the sorted samples.
#[cfg(feature = "cuda")]
fn report_inline(label: &str, samples_us: &[u128]) {
    let mut ordered = samples_us.to_vec();
    ordered.sort_unstable();
    // An empty slice has neither a first nor a last element: nothing to print.
    let (Some(&min), Some(&max)) = (ordered.first(), ordered.last()) else {
        return;
    };
    let median = ordered[ordered.len() / 2];
    let count = ordered.len() as u128;
    let mean = ordered.iter().sum::<u128>() / count;
    println!(
        " {label} min={min:>6} us median={median:>6} us mean={mean:>6} us max={max:>6} us"
    );
}
/// Prints a min / median / mean / max / sample-count summary of `samples_us`
/// under the heading `label`, followed by a blank line.
///
/// The median is the element at index `len / 2` of the sorted samples. An empty
/// slice (for example when the benchmark ran with zero iterations) prints zeros
/// for every statistic instead of panicking on the median lookup.
fn report(label: &str, samples_us: &[u128]) {
    let mut sorted = samples_us.to_vec();
    sorted.sort_unstable();
    let n = sorted.len() as u128;
    let min = sorted.first().copied().unwrap_or(0);
    let max = sorted.last().copied().unwrap_or(0);
    // `get` rather than indexing: index 0 of an empty vector would panic.
    let median = sorted.get(sorted.len() / 2).copied().unwrap_or(0);
    let sum: u128 = sorted.iter().sum();
    let mean = if n == 0 { 0 } else { sum / n };
    println!("{label}");
    println!(" min : {min:>8} us");
    println!(" median : {median:>8} us");
    println!(" mean : {mean:>8} us");
    println!(" max : {max:>8} us");
    println!(" samples: {n}");
    println!();
}
/// Prints a per-layer benchmark summary and, when `out_dir` is given, also
/// writes it to `layer_<layer><file_tag>_<n_entities>x<n_windows>_K<n_catalogs>.txt`
/// inside that directory.
///
/// Statistics are computed over `samples_us` (microseconds); the median is the
/// element at index `len / 2` of the sorted samples, or 0 when there are none.
/// Rates are derived from the median: work items per run times 1,000,000
/// divided by the median, or 0 when the median is 0. A run covers
/// `n_catalogs * n_entities * n_windows` cells, each counted `n_detectors`
/// times for the detector-evaluation rate.
///
/// Directory creation is best-effort; a failed file write is reported as a
/// warning on stderr and does not abort.
#[allow(clippy::too_many_arguments)]
fn report_layer(
    label: &str,
    samples_us: &[u128],
    layer: char,
    n_entities: u32,
    n_windows: u32,
    n_catalogs: u32,
    n_detectors: u32,
    out_dir: Option<&std::path::Path>,
    file_tag: &str,
) {
    let mut ordered = samples_us.to_vec();
    ordered.sort_unstable();
    let n_samples = ordered.len() as u128;
    let min = ordered.first().copied().unwrap_or(0);
    let max = ordered.last().copied().unwrap_or(0);
    let median = ordered.get(ordered.len() / 2).copied().unwrap_or(0);
    let mean = ordered
        .iter()
        .sum::<u128>()
        .checked_div(n_samples)
        .unwrap_or(0);

    let catalogs = u128::from(n_catalogs);
    let cells = catalogs * u128::from(n_entities) * u128::from(n_windows);
    let det_evals = cells * u128::from(n_detectors);
    // Items per second, given how many items one median-length run processes.
    let per_second = |work: u128| -> u128 {
        if median > 0 {
            work * 1_000_000u128 / median
        } else {
            0
        }
    };
    let catalogs_per_sec = per_second(catalogs);
    let cells_per_sec = per_second(cells);
    let det_evals_per_sec = per_second(det_evals);
    // With zero catalogs there is nothing to amortize over; keep the raw median.
    let per_catalog_us = median.checked_div(catalogs).unwrap_or(median);

    println!("{label} [Layer {layer}]");
    println!(" min : {min:>10} us");
    println!(" median : {median:>10} us");
    println!(" mean : {mean:>10} us");
    println!(" max : {max:>10} us");
    println!(" samples : {n_samples}");
    println!(" n_catalogs (K) : {n_catalogs}");
    println!(" per-catalog amortized: {per_catalog_us:>10} us");
    println!(" catalogs/sec : {catalogs_per_sec}");
    println!(" cells/sec : {cells_per_sec}");
    println!(" detector-evals/sec : {det_evals_per_sec}");
    println!();

    let Some(dir) = out_dir else {
        return;
    };
    // Best-effort: if this fails, the write below reports the problem.
    let _ = std::fs::create_dir_all(dir);
    let path =
        dir.join(format!("layer_{layer}{file_tag}_{n_entities}x{n_windows}_K{n_catalogs}.txt"));
    let body = format!(
        "{label} [Layer {layer}]\n\
         n_entities : {n_entities}\n\
         n_windows : {n_windows}\n\
         n_catalogs (K) : {n_catalogs}\n\
         n_detectors : {n_detectors}\n\
         samples : {n_samples}\n\
         min_us : {min}\n\
         median_us : {median}\n\
         mean_us : {mean}\n\
         max_us : {max}\n\
         per_catalog_us : {per_catalog_us}\n\
         catalogs_per_sec : {catalogs_per_sec}\n\
         cells_per_sec : {cells_per_sec}\n\
         det_evals_per_sec: {det_evals_per_sec}\n"
    );
    match std::fs::write(&path, body) {
        Ok(()) => {
            println!(" wrote layer report -> {}", path.display());
            println!();
        }
        Err(e) => eprintln!("warning: could not write {}: {e}", path.display()),
    }
}
/// Times `build_gpu_throughput_device_digests_on_workspace` on a single
/// reusable `GpuWorkspace` and prints the summary through `report`.
///
/// The workspace is allocated once, outside the timed region. Each sample is
/// the wall-clock duration of one call, in microseconds.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_throughput_device_digests(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    use std::hint::black_box;

    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
    for _ in 0..warmup {
        black_box(
            build_gpu_throughput_device_digests_on_workspace(events, contract, &mut workspace)
                .expect("CUDA pipeline (device digests)"),
        );
    }
    let timings_us: Vec<u128> = (0..iters)
        .map(|_| {
            let started = Instant::now();
            let built =
                build_gpu_throughput_device_digests_on_workspace(events, contract, &mut workspace)
                    .expect("CUDA pipeline (device digests)");
            let elapsed_us = started.elapsed().as_micros();
            black_box(built);
            elapsed_us
        })
        .collect();
    report(
        "GPU pipeline (Throughput, Tier 3B on-device SHA-256, sm_75/80/89)",
        &timings_us,
    );
}
/// Benchmarks `build_gpu_batched_throughput_device_digests` over `batch`
/// catalogs on one reusable `BatchedGpuWorkspace`.
///
/// Catalog 0 is a copy of `events`; when `batch > 1` the remaining `batch - 1`
/// catalogs come from `synthesize_courthouse_factory`, sized with the
/// contract's `n_entities` / `n_windows`. After the usual `report` summary, a
/// line with the per-catalog amortized latency and the cases/sec rate (both
/// derived from the median sample) is printed.
///
/// When there are no samples (`iters == 0`) the median is taken as 0 instead of
/// indexing into an empty vector, and a zero `batch` no longer divides by zero.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
fn run_gpu_bench_batched_device_digests(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) {
    let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    fixtures.push(events.to_vec());
    if batch > 1 {
        let extra = dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
            batch - 1,
            contract.n_entities,
            contract.n_windows,
            4,
        );
        fixtures.extend(extra);
    }
    let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();
    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");
    for _ in 0..warmup {
        let cases =
            build_gpu_batched_throughput_device_digests(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched device digests)");
        std::hint::black_box(cases);
    }
    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
    for _ in 0..iters {
        let t0 = Instant::now();
        let cases =
            build_gpu_batched_throughput_device_digests(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched device digests)");
        let dt = t0.elapsed().as_micros();
        std::hint::black_box(cases);
        samples_us.push(dt);
    }
    let label = format!("GPU pipeline (Batched K={batch}, Tier 3B on-device SHA-256, sm_75/80/89)");
    report(&label, &samples_us);

    let mut sorted = samples_us;
    sorted.sort_unstable();
    // Upper median, matching `report`; 0 when no samples were collected.
    let median_us = sorted.get(sorted.len() / 2).copied().unwrap_or(0);
    let catalogs = u128::from(batch);
    let per_catalog_us = median_us.checked_div(catalogs).unwrap_or(median_us);
    let cases_per_sec = if median_us > 0 {
        1_000_000u128 * catalogs / median_us
    } else {
        0
    };
    println!(
        " per-catalog amortized: {per_catalog_us} us throughput: {cases_per_sec} cases/sec"
    );
    println!();
}
/// Runs the benchmark for one layer (`'A'`, `'B'` or `'C'`) and hands the
/// samples to `report_layer`.
///
/// * Layer A needs the `cuda` feature; without it a "skipping" notice is printed.
/// * Layer B uses `run_layer_b` with `cuda`, and `run_layer_b_cpu` (reported as
///   a single catalog, file tag `_cpu`) without it.
/// * Layer C always reports the CPU run (`_cpu`) and, with `cuda`, the GPU run
///   (`_gpu`) as well; both are reported as a single catalog.
///
/// `batch == 0` selects the unbatched path and is reported as one catalog.
/// Any other `layer` value only prints an error on stderr.
#[allow(clippy::too_many_lines)]
fn run_layer_bench(
    layer: char,
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
    batch: u32,
    out_dir: Option<&std::path::Path>,
) {
    let (n_entities, n_windows) = (contract.n_entities, contract.n_windows);
    let n_detectors = dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32;
    // The unbatched path still processes exactly one catalog.
    let n_catalogs: u32 = batch.max(1);
    // Without CUDA no arm reads `n_catalogs`; keep the compiler quiet.
    #[cfg(not(feature = "cuda"))]
    let _ = n_catalogs;

    match layer {
        'A' => {
            #[cfg(feature = "cuda")]
            {
                let label = match batch {
                    0 => String::from(
                        "Layer A — device evidence fabric (Tier 3B device-digests, single-catalog)",
                    ),
                    _ => format!(
                        "Layer A — device evidence fabric (Tier 3B device-digests, K={batch} batched)"
                    ),
                };
                let samples = run_layer_a(events, contract, batch, warmup, iters);
                report_layer(
                    &label,
                    &samples,
                    'A',
                    n_entities,
                    n_windows,
                    n_catalogs,
                    n_detectors,
                    out_dir,
                    "",
                );
            }
            #[cfg(not(feature = "cuda"))]
            {
                let _ = (events, contract, batch, warmup, iters);
                println!("Layer A — GPU pipeline: built without --features cuda; skipping");
                println!();
            }
        }
        'B' => {
            #[cfg(feature = "cuda")]
            {
                let label = match batch {
                    0 => String::from(
                        "Layer B — throughput verdict summary (host bank stage, single-catalog)",
                    ),
                    _ => format!(
                        "Layer B — throughput verdict summary (host bank stage, K={batch} batched)"
                    ),
                };
                let samples = run_layer_b(events, contract, batch, warmup, iters);
                report_layer(
                    &label,
                    &samples,
                    'B',
                    n_entities,
                    n_windows,
                    n_catalogs,
                    n_detectors,
                    out_dir,
                    "",
                );
            }
            #[cfg(not(feature = "cuda"))]
            {
                let samples = run_layer_b_cpu(events, contract, warmup, iters);
                report_layer(
                    "Layer B — throughput verdict summary (CPU-only, no CUDA feature)",
                    &samples,
                    'B',
                    n_entities,
                    n_windows,
                    1,
                    n_detectors,
                    out_dir,
                    "_cpu",
                );
            }
        }
        'C' => {
            let cpu_samples = run_layer_c_cpu(events, contract, warmup, iters);
            report_layer(
                "Layer C — full audit court (CPU)",
                &cpu_samples,
                'C',
                n_entities,
                n_windows,
                1,
                n_detectors,
                out_dir,
                "_cpu",
            );
            #[cfg(feature = "cuda")]
            {
                let gpu_samples = run_layer_c_gpu(events, contract, warmup, iters);
                report_layer(
                    "Layer C — full audit court (GPU)",
                    &gpu_samples,
                    'C',
                    n_entities,
                    n_windows,
                    1,
                    n_detectors,
                    out_dir,
                    "_gpu",
                );
            }
        }
        other => {
            eprintln!("run_layer_bench: unknown layer '{other}'");
        }
    }
}
/// Collects Layer A timing samples (microseconds per call).
///
/// With `batch == 0` this times `build_gpu_layer_a_on_workspace` on a
/// `GpuWorkspace`. Otherwise it times `build_gpu_layer_a_batched` on a
/// `BatchedGpuWorkspace` holding `batch` catalogs: a copy of `events` plus
/// `batch - 1` catalogs from `synthesize_courthouse_factory`, sized with the
/// contract's `n_entities` / `n_windows`.
///
/// Returns one sample per timed iteration; `warmup` iterations run first and
/// are not recorded.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_a(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    use dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory;
    use std::hint::black_box;

    let mut timings_us: Vec<u128> = Vec::with_capacity(iters);

    if batch == 0 {
        let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
        for _ in 0..warmup {
            black_box(
                build_gpu_layer_a_on_workspace(events, contract, &mut workspace)
                    .expect("CUDA Layer A (skip-bank) pipeline"),
            );
        }
        timings_us.extend((0..iters).map(|_| {
            let started = Instant::now();
            let summary = build_gpu_layer_a_on_workspace(events, contract, &mut workspace)
                .expect("CUDA Layer A (skip-bank) pipeline");
            let elapsed_us = started.elapsed().as_micros();
            black_box(summary);
            elapsed_us
        }));
        return timings_us;
    }

    // Batched path: catalog 0 is the caller's fixture, the rest are synthesized.
    let mut catalogs: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
    catalogs.push(events.to_vec());
    if batch > 1 {
        catalogs.extend(synthesize_courthouse_factory(
            dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
            batch - 1,
            contract.n_entities,
            contract.n_windows,
            4,
        ));
    }
    let catalog_slices: Vec<&[TraceEvent]> = catalogs.iter().map(Vec::as_slice).collect();
    let mut workspace =
        BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");
    for _ in 0..warmup {
        black_box(
            build_gpu_layer_a_batched(&mut workspace, &catalog_slices, contract)
                .expect("CUDA Layer A (batched skip-bank) pipeline"),
        );
    }
    timings_us.extend((0..iters).map(|_| {
        let started = Instant::now();
        let summaries = build_gpu_layer_a_batched(&mut workspace, &catalog_slices, contract)
            .expect("CUDA Layer A (batched skip-bank) pipeline");
        let elapsed_us = started.elapsed().as_micros();
        black_box(summaries);
        elapsed_us
    }));
    timings_us
}
/// Collects Layer B timing samples (microseconds per call).
///
/// With `batch == 0` this times `build_gpu_throughput_on_workspace` on a
/// `GpuWorkspace`. Otherwise it times `build_gpu_batched_throughput` on a
/// `BatchedGpuWorkspace` holding `batch` catalogs.
///
/// The batched catalogs are built the same way as in `run_layer_a` and
/// `run_gpu_bench_batched`: a copy of `events` plus `batch - 1` catalogs from
/// `synthesize_courthouse_factory`, sized with the contract's `n_entities` /
/// `n_windows`. The extra catalogs used to come from `synthesize(seed)`, which
/// takes no dimensions, so under a scaled contract they did not match the
/// contract the workspace and the per-cell rates are based on.
///
/// Returns one sample per timed iteration; `warmup` iterations run first and
/// are not recorded.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_b(
    events: &[TraceEvent],
    contract: &Contract,
    batch: u32,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    let mut samples_us: Vec<u128> = Vec::with_capacity(iters);
    if batch == 0 {
        let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
        for _ in 0..warmup {
            let case = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
                .expect("CUDA pipeline (throughput)");
            std::hint::black_box(case);
        }
        for _ in 0..iters {
            let t0 = Instant::now();
            let case = build_gpu_throughput_on_workspace(&mut workspace, events, contract)
                .expect("CUDA pipeline (throughput)");
            let dt = t0.elapsed().as_micros();
            std::hint::black_box(case);
            samples_us.push(dt);
        }
    } else {
        // Catalog 0 is the caller's fixture; the rest are synthesized at the
        // contract's dimensions so every catalog matches the contract.
        let mut fixtures: Vec<Vec<TraceEvent>> = Vec::with_capacity(batch as usize);
        fixtures.push(events.to_vec());
        if batch > 1 {
            let extra = dsfb_gpu_debug_core::fixture::synthesize_courthouse_factory(
                dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(0x9E37_79B9_7F4A_7C15),
                batch - 1,
                contract.n_entities,
                contract.n_windows,
                4,
            );
            fixtures.extend(extra);
        }
        let event_slices: Vec<&[TraceEvent]> = fixtures.iter().map(Vec::as_slice).collect();
        let mut workspace =
            BatchedGpuWorkspace::new(batch, contract).expect("batched workspace allocation");
        for _ in 0..warmup {
            let cases = build_gpu_batched_throughput(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched throughput)");
            std::hint::black_box(cases);
        }
        for _ in 0..iters {
            let t0 = Instant::now();
            let cases = build_gpu_batched_throughput(&mut workspace, &event_slices, contract)
                .expect("CUDA pipeline (batched throughput)");
            let dt = t0.elapsed().as_micros();
            std::hint::black_box(cases);
            samples_us.push(dt);
        }
    }
    samples_us
}
/// Layer B fallback for builds without the `cuda` feature: collects timing
/// samples (microseconds per call) for `build_cpu_throughput`.
///
/// Returns one sample per timed iteration; `warmup` iterations run first and
/// are not recorded.
#[cfg(not(feature = "cuda"))]
fn run_layer_b_cpu(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    let mut timings_us: Vec<u128> = Vec::with_capacity(iters);
    (0..warmup).for_each(|_| {
        std::hint::black_box(build_cpu_throughput(events, contract));
    });
    timings_us.extend((0..iters).map(|_| {
        let started = Instant::now();
        let built = build_cpu_throughput(events, contract);
        let elapsed_us = started.elapsed().as_micros();
        std::hint::black_box(built);
        elapsed_us
    }));
    timings_us
}
/// Collects timing samples (microseconds per call) for `build_cpu_throughput`.
/// Unlike `run_layer_b_cpu`, this function is compiled regardless of the
/// `cuda` feature.
///
/// Returns one sample per timed iteration; `warmup` iterations run first and
/// are not recorded.
pub(crate) fn run_layer_b_cpu_always(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    let mut timings_us: Vec<u128> = Vec::with_capacity(iters);
    let mut remaining_warmup = warmup;
    while remaining_warmup > 0 {
        std::hint::black_box(build_cpu_throughput(events, contract));
        remaining_warmup -= 1;
    }
    while timings_us.len() < iters {
        let started = Instant::now();
        let built = build_cpu_throughput(events, contract);
        let elapsed_us = started.elapsed().as_micros();
        std::hint::black_box(built);
        timings_us.push(elapsed_us);
    }
    timings_us
}
/// Collects Layer C CPU timing samples (microseconds per call) for `build_cpu`.
///
/// Returns one sample per timed iteration; `warmup` iterations run first and
/// are not recorded.
pub(crate) fn run_layer_c_cpu(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    let mut timings_us: Vec<u128> = Vec::with_capacity(iters);
    (0..warmup).for_each(|_| {
        std::hint::black_box(build_cpu(events, contract));
    });
    timings_us.extend((0..iters).map(|_| {
        let started = Instant::now();
        let built = build_cpu(events, contract);
        // Read the clock before handing the result to `black_box`.
        let elapsed_us = started.elapsed().as_micros();
        std::hint::black_box(built);
        elapsed_us
    }));
    timings_us
}
/// Collects Layer C GPU timing samples (microseconds per call) for
/// `build_gpu_on_workspace`, reusing one `GpuWorkspace` across all calls.
///
/// Returns one sample per timed iteration; `warmup` iterations run first and
/// are not recorded.
///
/// # Panics
/// Panics if the workspace cannot be allocated or a pipeline call fails.
#[cfg(feature = "cuda")]
pub(crate) fn run_layer_c_gpu(
    events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) -> Vec<u128> {
    use std::hint::black_box;

    let mut workspace = GpuWorkspace::new(contract).expect("workspace allocation");
    let mut timings_us: Vec<u128> = Vec::with_capacity(iters);
    for _ in 0..warmup {
        black_box(
            build_gpu_on_workspace(&mut workspace, events, contract)
                .expect("CUDA pipeline (audit)"),
        );
    }
    timings_us.extend((0..iters).map(|_| {
        let started = Instant::now();
        let built = build_gpu_on_workspace(&mut workspace, events, contract)
            .expect("CUDA pipeline (audit)");
        let elapsed_us = started.elapsed().as_micros();
        black_box(built);
        elapsed_us
    }));
    timings_us
}
/// Benchmarks catalog `j` as a Layer C run and writes the reports under
/// `reports/`.
///
/// Catalog 0 is `primary_events` itself. Any other catalog is synthesized with
/// `synthesize_scaled` at the contract's dimensions, from a seed derived from
/// `DEFAULT_SEED` and `j - 1`. The CPU run is always reported (file tag
/// `_materialize_<j>_cpu`); with the `cuda` feature the GPU run is reported
/// too (`_materialize_<j>_gpu`). Both are reported as a single catalog.
fn run_materialize_catalog(
    j: u32,
    primary_events: &[TraceEvent],
    contract: &Contract,
    warmup: usize,
    iters: usize,
) {
    // Constant used both as the seed offset and as the per-catalog multiplier.
    const SEED_MIX: u64 = 0x9E37_79B9_7F4A_7C15;

    let events: Vec<TraceEvent> = match j.checked_sub(1) {
        None => primary_events.to_vec(),
        Some(previous) => {
            let base_seed = dsfb_gpu_debug_core::fixture::DEFAULT_SEED.wrapping_add(SEED_MIX);
            let derived_seed = base_seed ^ u64::from(previous).wrapping_mul(SEED_MIX);
            dsfb_gpu_debug_core::fixture::synthesize_scaled(
                derived_seed,
                contract.n_entities,
                contract.n_windows,
                4,
            )
        }
    };

    println!();
    println!("Materialising catalog J={j} as Layer C transcript on demand (R.3a opt-in)");
    println!(" derived events : {}", events.len());

    let (n_entities, n_windows) = (contract.n_entities, contract.n_windows);
    let n_detectors = dsfb_gpu_debug_core::motif::MotifClass::COUNT as u32;
    let reports_dir = std::path::Path::new("reports");

    let cpu_samples = run_layer_c_cpu(&events, contract, warmup, iters);
    report_layer(
        &format!("Layer C — materialised catalog J={j} (CPU)"),
        &cpu_samples,
        'C',
        n_entities,
        n_windows,
        1,
        n_detectors,
        Some(reports_dir),
        &format!("_materialize_{j}_cpu"),
    );

    #[cfg(feature = "cuda")]
    {
        let gpu_samples = run_layer_c_gpu(&events, contract, warmup, iters);
        report_layer(
            &format!("Layer C — materialised catalog J={j} (GPU)"),
            &gpu_samples,
            'C',
            n_entities,
            n_windows,
            1,
            n_detectors,
            Some(reports_dir),
            &format!("_materialize_{j}_gpu"),
        );
    }
}