//! ROCm/HIP p-adic benchmark harness (gated on `rocm-hip`).
//!
//! The `hip_padic_benchmarks` module wires the p-adic valuation
//! pilot into the `RocmBenchmarkReport` consumed by the
//! release-gate tests. Each benchmark row records the kernel-source and
//! compiler fingerprints together with nanosecond timings (the `*_ns` fields).
//!
use std::fs;
use std::path::Path;
use std::time::Instant;
use crate::backend::hip_padic_matmul::{
ROCM_HIP_PADIC_STRATIFIED_MATMUL_BACKEND, run_rocm_hip_padic_stratified_matmul_with_shape,
};
use crate::backend::rocm::{RocmHipCapabilityReport, detect_local_rocm_hip};
use crate::domain::{PadicDomain, PadicMatrix, PadicOutputCertificate};
use crate::{Error, Result};
/// Artifact identifier written into the `artifact` field of every generated report.
pub const PADIC_STRATIFIED_BENCHMARK_ARTIFACT: &str =
"tokitai-padic-stratified-matmul-benchmark-report";
/// Report format version written into the `version` field of every generated report.
pub const PADIC_STRATIFIED_BENCHMARK_VERSION: u32 = 1;
/// Warm-up run count used by `PadicStratifiedBenchmarkConfig::default()`.
pub const PADIC_STRATIFIED_BENCHMARK_DEFAULT_WARMUP_RUNS: usize = 1;
/// Measured run count used by `PadicStratifiedBenchmarkConfig::default()`.
/// This is also the smallest value `PadicStratifiedBenchmarkConfig::validate` accepts.
pub const PADIC_STRATIFIED_BENCHMARK_DEFAULT_MEASURED_RUNS: usize = 2;
/// Run counts that control how each benchmark row is measured.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PadicStratifiedBenchmarkConfig {
/// Number of executions performed (and discarded) before timing starts.
pub warmup_runs: usize,
/// Number of timed executions per backend; `validate` rejects values below 2.
pub measured_runs: usize,
}
impl Default for PadicStratifiedBenchmarkConfig {
    /// Builds the configuration from the module-level default run counts.
    fn default() -> Self {
        let warmup_runs = PADIC_STRATIFIED_BENCHMARK_DEFAULT_WARMUP_RUNS;
        let measured_runs = PADIC_STRATIFIED_BENCHMARK_DEFAULT_MEASURED_RUNS;
        Self {
            warmup_runs,
            measured_runs,
        }
    }
}
impl PadicStratifiedBenchmarkConfig {
pub fn validate(self) -> Result<Self> {
if self.measured_runs < 2 {
return Err(Error::backend(
"p-adic benchmark measured_runs must be at least 2 for repeated timing summaries",
));
}
Ok(self)
}
}
/// Complete benchmark report; serialisable via `to_json`, `to_csv` and `to_markdown`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PadicStratifiedBenchmarkReport {
/// Artifact identifier (the generator stores `PADIC_STRATIFIED_BENCHMARK_ARTIFACT`).
pub artifact: String,
/// Report format version (the generator stores `PADIC_STRATIFIED_BENCHMARK_VERSION`).
pub version: u32,
/// Warm-up run count taken from the benchmark configuration.
pub warmup_runs: usize,
/// Measured run count taken from the benchmark configuration.
pub measured_runs: usize,
/// Benchmark rows; the generator appends a dense CPU row, a certified sparse CPU
/// row and a HIP-or-fallback row for each fixture.
pub rows: Vec<PadicStratifiedBenchmarkRow>,
/// One summary per benchmark fixture.
pub fixtures: Vec<PadicStratifiedBenchmarkFixtureSummary>,
/// Statements describing what the report explicitly does not claim.
pub non_claims: Vec<String>,
/// Human-readable policy describing when a speed claim may be made.
pub speed_claim_policy: String,
}
/// Descriptive summary of one benchmark fixture and its two input matrices.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PadicStratifiedBenchmarkFixtureSummary {
/// Fixture name.
pub name: String,
/// Label of the input distribution the fixture represents.
pub distribution: String,
/// Problem shape formatted as `{rows}x{inner}x{cols}`.
pub shape: String,
/// Prime of the p-adic domain.
pub prime: u64,
/// Precision of the p-adic domain.
pub precision: u32,
/// `bucket_fingerprint` reported by the left-hand matrix metadata.
pub lhs_bucket_fingerprint: String,
/// `bucket_fingerprint` reported by the right-hand matrix metadata.
pub rhs_bucket_fingerprint: String,
/// Left-hand valuation histogram rendered as `valuation:count` pairs joined by `;`.
pub lhs_valuation_histogram: String,
/// Right-hand valuation histogram rendered as `valuation:count` pairs joined by `;`.
pub rhs_valuation_histogram: String,
}
/// One benchmark result: a single backend evaluated on a single fixture.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PadicStratifiedBenchmarkRow {
/// Name of the fixture this row was measured on.
pub fixture: String,
/// Distribution label copied from the fixture.
pub distribution: String,
/// Identifier of the backend that produced the row.
pub backend: String,
/// Row status; this module emits `passed` and `fallback_captured`.
pub status: String,
/// Problem shape formatted as `{rows}x{inner}x{cols}`.
pub shape: String,
/// Prime of the p-adic domain.
pub prime: u64,
/// Precision of the p-adic domain.
pub precision: u32,
/// Number of scalar products the backend skipped (0 for the dense CPU row).
pub skipped_products: usize,
/// Number of scalar products the backend evaluated.
pub evaluated_products: usize,
/// Smallest `precision_safety_margin` over the output certificates, when any is present.
pub precision_margin_min: Option<u32>,
/// Whether the row's output agrees with the dense CPU oracle.
pub dense_cpu_oracle_matches: bool,
/// Whether the row's output agrees with the certified sparse CPU oracle.
pub sparse_cpu_oracle_matches: bool,
/// Whether the output certificates passed the coverage check (always `false` on the dense row).
pub certificate_coverage: bool,
/// `residue-fingerprint-` followed by the output residues joined with `-`.
pub output_residue_fingerprint: String,
/// `cpu` for CPU rows, otherwise the device capability fingerprint.
pub device_fingerprint: String,
/// Kernel source fingerprint (`not_applicable` / `not_built` when no kernel was built).
pub kernel_source_fingerprint: String,
/// Compiler fingerprint (`rust_cpu` / `not_used` when no device compiler ran).
pub compiler_fingerprint: String,
/// Headline transfer time: the median of the measured samples, 0 when not measured.
pub transfer_time_ns: u128,
/// Headline kernel time: the median of the measured samples, 0 when not measured.
pub kernel_time_ns: u128,
/// Headline wall-clock time: the median of the measured samples, 0 when not measured.
pub wall_clock_ns: u128,
/// Warm-up run count taken from the benchmark configuration.
pub warmup_runs: usize,
/// Number of timed runs behind the summaries (0 on fallback rows).
pub measured_runs: usize,
/// Minimum / median / maximum of the transfer-time samples.
pub transfer_time_min_ns: u128,
pub transfer_time_median_ns: u128,
pub transfer_time_max_ns: u128,
/// Minimum / median / maximum of the kernel-time samples.
pub kernel_time_min_ns: u128,
pub kernel_time_median_ns: u128,
pub kernel_time_max_ns: u128,
/// Minimum / median / maximum of the wall-clock samples.
pub wall_clock_min_ns: u128,
pub wall_clock_median_ns: u128,
pub wall_clock_max_ns: u128,
/// Label describing what the timings of this row cover.
pub timing_scope: String,
/// Kind of external baseline attached to the row.
pub external_baseline_kind: String,
/// Source of the external baseline; `not_provided` blocks speed claims.
pub external_baseline_source: String,
/// Wall-clock time of the external baseline; 0 blocks speed claims.
pub external_baseline_wall_clock_ns: u128,
/// Profiler tool label (`not_captured` for every row built in this module).
pub profiler_tool: String,
/// Profiler trace status; speed claims require `captured`.
pub profiler_trace_status: String,
/// Kernel time reported by the profiler; 0 blocks speed claims.
pub profiler_kernel_time_ns: u128,
/// Label summarising the state of the speedup evidence.
pub speedup_evidence_status: String,
/// Why the row is a fallback, or `none`.
pub fallback_reason: String,
/// Whether the row asserts a speed claim (`false` for every row built in this module).
pub speed_claim_allowed: bool,
/// Human-readable explanation of what blocks a speed claim.
pub speed_claim_blocker: String,
}
/// Built-in benchmark input: problem dimensions, p-adic domain parameters and raw residues.
#[derive(Debug, Clone)]
struct PadicStratifiedBenchmarkFixture {
/// Fixture name used in report rows and summaries.
name: &'static str,
/// Label of the input distribution.
distribution: &'static str,
/// Row count of the left-hand matrix.
rows: usize,
/// Shared inner dimension (lhs columns / rhs rows).
inner: usize,
/// Column count of the right-hand matrix.
cols: usize,
/// Prime of the p-adic domain.
prime: u64,
/// Precision of the p-adic domain.
precision: u32,
/// Left-hand residues, used to build a `rows` x `inner` matrix.
lhs: Vec<u64>,
/// Right-hand residues, used to build an `inner` x `cols` matrix.
rhs: Vec<u64>,
}
/// Generates the benchmark report using `PadicStratifiedBenchmarkConfig::default()`.
///
/// # Errors
///
/// Propagates any error from
/// `generate_padic_stratified_benchmark_report_with_config`.
pub fn generate_padic_stratified_benchmark_report() -> Result<PadicStratifiedBenchmarkReport> {
    let default_config = PadicStratifiedBenchmarkConfig::default();
    generate_padic_stratified_benchmark_report_with_config(default_config)
}
/// Runs every built-in fixture through the dense CPU, certified sparse CPU and
/// HIP-or-fallback backends and assembles the resulting report.
///
/// For each fixture one summary and three rows (dense, sparse, HIP/fallback —
/// in that order) are appended. The finished report is passed through
/// `validate_padic_stratified_speed_claims` before it is returned.
///
/// # Errors
///
/// Returns an error when `config` fails validation, when a domain or matrix
/// cannot be built, when any backend run fails, or when the speed-claim
/// validation rejects the report.
pub fn generate_padic_stratified_benchmark_report_with_config(
    config: PadicStratifiedBenchmarkConfig,
) -> Result<PadicStratifiedBenchmarkReport> {
    let config = config.validate()?;
    let device = detect_local_rocm_hip();
    let fixtures = benchmark_fixtures();
    let mut summaries = Vec::with_capacity(fixtures.len());
    // Exactly three rows are produced per fixture.
    let mut rows = Vec::with_capacity(fixtures.len() * 3);
    for fixture in &fixtures {
        let domain = PadicDomain::new(fixture.prime, fixture.precision)?;
        let lhs = matrix_from_residues(&domain, fixture.rows, fixture.inner, &fixture.lhs)?;
        let rhs = matrix_from_residues(&domain, fixture.inner, fixture.cols, &fixture.rhs)?;
        summaries.push(fixture_summary(fixture, &lhs, &rhs)?);
        let dense = dense_cpu_row(fixture, &domain, &lhs, &rhs, config)?;
        let sparse = certified_sparse_cpu_row(fixture, &domain, &lhs, &rhs, &dense, config)?;
        // The HIP row only borrows the CPU rows, so build it first and then move
        // all three rows into the report instead of cloning the CPU rows.
        let hip = hip_or_fallback_row(fixture, &domain, &device, &dense, &sparse, config)?;
        rows.push(dense);
        rows.push(sparse);
        rows.push(hip);
    }
    let report = PadicStratifiedBenchmarkReport {
        artifact: PADIC_STRATIFIED_BENCHMARK_ARTIFACT.to_string(),
        version: PADIC_STRATIFIED_BENCHMARK_VERSION,
        warmup_runs: config.warmup_runs,
        measured_runs: config.measured_runs,
        rows,
        fixtures: summaries,
        non_claims: vec![
            "benchmark timings are repeated local measurements, not profiler-backed performance evidence"
                .to_string(),
            "work reduction is reported separately from hardware speedup".to_string(),
            "external baseline and profiler fields are present but block speed claims until reviewed evidence is supplied".to_string(),
            "ROCm/HIP rows are local hardware evidence, not portable AMD GPU support".to_string(),
            "unsupported prime, precision, or shape rows are fallback evidence, not failed correctness"
                .to_string(),
        ],
        speed_claim_policy:
            "speed claims remain blocked unless dense CPU, certified sparse CPU, HIP output, certificate coverage, repeated timing summaries, and external baseline criteria all pass for a fixture"
                .to_string(),
    };
    validate_padic_stratified_speed_claims(&report)?;
    Ok(report)
}
pub fn write_padic_stratified_benchmark_artifacts(
report: &PadicStratifiedBenchmarkReport,
dir: impl AsRef<Path>,
) -> Result<()> {
let dir = dir.as_ref();
fs::create_dir_all(dir)
.map_err(|err| Error::backend(format!("failed to create benchmark artifact dir: {err}")))?;
fs::write(
dir.join("padic-stratified-benchmarks.json"),
report.to_json(),
)
.map_err(|err| Error::backend(format!("failed to write benchmark JSON: {err}")))?;
fs::write(dir.join("padic-stratified-benchmarks.csv"), report.to_csv())
.map_err(|err| Error::backend(format!("failed to write benchmark CSV: {err}")))?;
fs::write(
dir.join("padic-stratified-benchmarks.md"),
report.to_markdown(),
)
.map_err(|err| Error::backend(format!("failed to write benchmark Markdown: {err}")))?;
Ok(())
}
/// Verifies that every row asserting a speed claim carries the evidence for it.
///
/// Rows with `speed_claim_allowed == false` are ignored. For the remaining
/// rows three checks run in order, and the first failure is returned:
///
/// 1. both oracle flags and certificate coverage are set, at least two runs
///    were measured, and none of the twelve timing fields is zero;
/// 2. an external baseline and a captured profiler trace are present;
/// 3. the external baseline wall clock is strictly greater than the row's.
///
/// # Errors
///
/// Returns a verification error naming the offending fixture and backend.
pub fn validate_padic_stratified_speed_claims(
    report: &PadicStratifiedBenchmarkReport,
) -> Result<()> {
    for row in report.rows.iter().filter(|row| row.speed_claim_allowed) {
        let timings = [
            row.transfer_time_ns,
            row.kernel_time_ns,
            row.wall_clock_ns,
            row.transfer_time_min_ns,
            row.transfer_time_median_ns,
            row.transfer_time_max_ns,
            row.kernel_time_min_ns,
            row.kernel_time_median_ns,
            row.kernel_time_max_ns,
            row.wall_clock_min_ns,
            row.wall_clock_median_ns,
            row.wall_clock_max_ns,
        ];
        let correctness_covered = row.dense_cpu_oracle_matches
            && row.sparse_cpu_oracle_matches
            && row.certificate_coverage;
        let timing_covered = row.measured_runs >= 2 && timings.iter().all(|&ns| ns != 0);
        if !(correctness_covered && timing_covered) {
            return Err(Error::verification(format!(
                "speed claim for fixture {} backend {} lacks correctness, certificate, repeated timing, or isolated timing coverage",
                row.fixture, row.backend
            )));
        }

        let external_evidence_present = row.external_baseline_wall_clock_ns != 0
            && row.external_baseline_source != "not_provided"
            && row.profiler_trace_status == "captured"
            && row.profiler_kernel_time_ns != 0;
        if !external_evidence_present {
            return Err(Error::verification(format!(
                "speed claim for fixture {} backend {} lacks external baseline or profiler evidence",
                row.fixture, row.backend
            )));
        }

        let faster_than_baseline = row.external_baseline_wall_clock_ns > row.wall_clock_ns;
        if !faster_than_baseline {
            return Err(Error::verification(format!(
                "speed claim for fixture {} backend {} lacks measured speedup against external baseline",
                row.fixture, row.backend
            )));
        }
    }
    Ok(())
}
impl PadicStratifiedBenchmarkReport {
pub fn to_markdown(&self) -> String {
let mut lines = vec![
"# Valuation-Stratified p-adic Matmul Benchmarks".to_string(),
String::new(),
format!("artifact: {}", self.artifact),
format!("version: {}", self.version),
format!("warmup_runs: {}", self.warmup_runs),
format!("measured_runs: {}", self.measured_runs),
format!("speed_claim_policy: {}", self.speed_claim_policy),
String::new(),
"## Rows".to_string(),
"| Fixture | Distribution | Backend | Status | Shape | p | k | Skipped | Evaluated | Margin | Dense oracle | Sparse oracle | Certificates | Warmup | Measured | Transfer ns min/median/max | Kernel ns min/median/max | Wall ns min/median/max | External baseline | Profiler | Speed evidence | Speed claim | Fallback |".to_string(),
"| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |".to_string(),
];
for row in &self.rows {
lines.push(format!(
"| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {}/{}/{} | {}/{}/{} | {}/{}/{} | {}:{}ns | {}:{}ns | {} | {} | {} |",
md(&row.fixture),
md(&row.distribution),
md(&row.backend),
md(&row.status),
md(&row.shape),
row.prime,
row.precision,
row.skipped_products,
row.evaluated_products,
row.precision_margin_min
.map(|value| value.to_string())
.unwrap_or_else(|| "none".to_string()),
row.dense_cpu_oracle_matches,
row.sparse_cpu_oracle_matches,
row.certificate_coverage,
row.warmup_runs,
row.measured_runs,
row.transfer_time_min_ns,
row.transfer_time_median_ns,
row.transfer_time_max_ns,
row.kernel_time_min_ns,
row.kernel_time_median_ns,
row.kernel_time_max_ns,
row.wall_clock_min_ns,
row.wall_clock_median_ns,
row.wall_clock_max_ns,
md(&row.external_baseline_kind),
row.external_baseline_wall_clock_ns,
md(&row.profiler_trace_status),
row.profiler_kernel_time_ns,
md(&row.speedup_evidence_status),
row.speed_claim_allowed,
md(&row.fallback_reason)
));
}
lines.push(String::new());
lines.push("## Fixtures".to_string());
for fixture in &self.fixtures {
lines.push(format!(
"- {}: distribution={}, shape={}, p={}, precision={}, lhs_bucket={}, rhs_bucket={}, lhs_hist={}, rhs_hist={}",
fixture.name,
fixture.distribution,
fixture.shape,
fixture.prime,
fixture.precision,
fixture.lhs_bucket_fingerprint,
fixture.rhs_bucket_fingerprint,
fixture.lhs_valuation_histogram,
fixture.rhs_valuation_histogram
));
}
lines.push(String::new());
lines.push("## Non-Claims".to_string());
for item in &self.non_claims {
lines.push(format!("- {item}"));
}
lines.join("\n")
}
pub fn to_csv(&self) -> String {
let mut lines = vec![
"fixture,distribution,backend,status,shape,prime,precision,skipped_products,evaluated_products,precision_margin_min,dense_cpu_oracle_matches,sparse_cpu_oracle_matches,certificate_coverage,output_residue_fingerprint,device_fingerprint,kernel_source_fingerprint,compiler_fingerprint,transfer_time_ns,kernel_time_ns,wall_clock_ns,warmup_runs,measured_runs,transfer_time_min_ns,transfer_time_median_ns,transfer_time_max_ns,kernel_time_min_ns,kernel_time_median_ns,kernel_time_max_ns,wall_clock_min_ns,wall_clock_median_ns,wall_clock_max_ns,timing_scope,external_baseline_kind,external_baseline_source,external_baseline_wall_clock_ns,profiler_tool,profiler_trace_status,profiler_kernel_time_ns,speedup_evidence_status,fallback_reason,speed_claim_allowed,speed_claim_blocker".to_string(),
];
for row in &self.rows {
lines.push(
[
csv(&row.fixture),
csv(&row.distribution),
csv(&row.backend),
csv(&row.status),
csv(&row.shape),
row.prime.to_string(),
row.precision.to_string(),
row.skipped_products.to_string(),
row.evaluated_products.to_string(),
row.precision_margin_min
.map(|value| value.to_string())
.unwrap_or_else(String::new),
row.dense_cpu_oracle_matches.to_string(),
row.sparse_cpu_oracle_matches.to_string(),
row.certificate_coverage.to_string(),
csv(&row.output_residue_fingerprint),
csv(&row.device_fingerprint),
csv(&row.kernel_source_fingerprint),
csv(&row.compiler_fingerprint),
row.transfer_time_ns.to_string(),
row.kernel_time_ns.to_string(),
row.wall_clock_ns.to_string(),
row.warmup_runs.to_string(),
row.measured_runs.to_string(),
row.transfer_time_min_ns.to_string(),
row.transfer_time_median_ns.to_string(),
row.transfer_time_max_ns.to_string(),
row.kernel_time_min_ns.to_string(),
row.kernel_time_median_ns.to_string(),
row.kernel_time_max_ns.to_string(),
row.wall_clock_min_ns.to_string(),
row.wall_clock_median_ns.to_string(),
row.wall_clock_max_ns.to_string(),
csv(&row.timing_scope),
csv(&row.external_baseline_kind),
csv(&row.external_baseline_source),
row.external_baseline_wall_clock_ns.to_string(),
csv(&row.profiler_tool),
csv(&row.profiler_trace_status),
row.profiler_kernel_time_ns.to_string(),
csv(&row.speedup_evidence_status),
csv(&row.fallback_reason),
row.speed_claim_allowed.to_string(),
csv(&row.speed_claim_blocker),
]
.join(","),
);
}
lines.join("\n")
}
pub fn to_json(&self) -> String {
format!(
"{{\"artifact\":{},\"version\":{},\"warmup_runs\":{},\"measured_runs\":{},\"speed_claim_policy\":{},\"rows\":[{}],\"fixtures\":[{}],\"non_claims\":{}}}",
json_string(&self.artifact),
self.version,
self.warmup_runs,
self.measured_runs,
json_string(&self.speed_claim_policy),
self.rows
.iter()
.map(PadicStratifiedBenchmarkRow::to_json)
.collect::<Vec<_>>()
.join(","),
self.fixtures
.iter()
.map(PadicStratifiedBenchmarkFixtureSummary::to_json)
.collect::<Vec<_>>()
.join(","),
json_array(&self.non_claims)
)
}
}
impl PadicStratifiedBenchmarkFixtureSummary {
    /// Serialises the summary as a single-line JSON object whose members
    /// appear in declaration order.
    fn to_json(&self) -> String {
        let members = [
            ("name", json_string(&self.name)),
            ("distribution", json_string(&self.distribution)),
            ("shape", json_string(&self.shape)),
            ("prime", self.prime.to_string()),
            ("precision", self.precision.to_string()),
            (
                "lhs_bucket_fingerprint",
                json_string(&self.lhs_bucket_fingerprint),
            ),
            (
                "rhs_bucket_fingerprint",
                json_string(&self.rhs_bucket_fingerprint),
            ),
            (
                "lhs_valuation_histogram",
                json_string(&self.lhs_valuation_histogram),
            ),
            (
                "rhs_valuation_histogram",
                json_string(&self.rhs_valuation_histogram),
            ),
        ];
        // Keys are fixed identifiers, so they need no escaping.
        let body = members
            .iter()
            .map(|(key, value)| format!("\"{key}\":{value}"))
            .collect::<Vec<_>>()
            .join(",");
        format!("{{{body}}}")
    }
}
impl PadicStratifiedBenchmarkRow {
    /// Serialises the row as a single-line JSON object whose members appear
    /// in declaration order. Strings are escaped via `json_string`, numbers
    /// and booleans are written bare, and a missing margin becomes `null`.
    fn to_json(&self) -> String {
        let members: Vec<(&str, String)> = vec![
            ("fixture", json_string(&self.fixture)),
            ("distribution", json_string(&self.distribution)),
            ("backend", json_string(&self.backend)),
            ("status", json_string(&self.status)),
            ("shape", json_string(&self.shape)),
            ("prime", self.prime.to_string()),
            ("precision", self.precision.to_string()),
            ("skipped_products", self.skipped_products.to_string()),
            ("evaluated_products", self.evaluated_products.to_string()),
            (
                "precision_margin_min",
                json_option_u32(self.precision_margin_min),
            ),
            (
                "dense_cpu_oracle_matches",
                self.dense_cpu_oracle_matches.to_string(),
            ),
            (
                "sparse_cpu_oracle_matches",
                self.sparse_cpu_oracle_matches.to_string(),
            ),
            ("certificate_coverage", self.certificate_coverage.to_string()),
            (
                "output_residue_fingerprint",
                json_string(&self.output_residue_fingerprint),
            ),
            ("device_fingerprint", json_string(&self.device_fingerprint)),
            (
                "kernel_source_fingerprint",
                json_string(&self.kernel_source_fingerprint),
            ),
            (
                "compiler_fingerprint",
                json_string(&self.compiler_fingerprint),
            ),
            ("transfer_time_ns", self.transfer_time_ns.to_string()),
            ("kernel_time_ns", self.kernel_time_ns.to_string()),
            ("wall_clock_ns", self.wall_clock_ns.to_string()),
            ("warmup_runs", self.warmup_runs.to_string()),
            ("measured_runs", self.measured_runs.to_string()),
            ("transfer_time_min_ns", self.transfer_time_min_ns.to_string()),
            (
                "transfer_time_median_ns",
                self.transfer_time_median_ns.to_string(),
            ),
            ("transfer_time_max_ns", self.transfer_time_max_ns.to_string()),
            ("kernel_time_min_ns", self.kernel_time_min_ns.to_string()),
            (
                "kernel_time_median_ns",
                self.kernel_time_median_ns.to_string(),
            ),
            ("kernel_time_max_ns", self.kernel_time_max_ns.to_string()),
            ("wall_clock_min_ns", self.wall_clock_min_ns.to_string()),
            ("wall_clock_median_ns", self.wall_clock_median_ns.to_string()),
            ("wall_clock_max_ns", self.wall_clock_max_ns.to_string()),
            ("timing_scope", json_string(&self.timing_scope)),
            (
                "external_baseline_kind",
                json_string(&self.external_baseline_kind),
            ),
            (
                "external_baseline_source",
                json_string(&self.external_baseline_source),
            ),
            (
                "external_baseline_wall_clock_ns",
                self.external_baseline_wall_clock_ns.to_string(),
            ),
            ("profiler_tool", json_string(&self.profiler_tool)),
            (
                "profiler_trace_status",
                json_string(&self.profiler_trace_status),
            ),
            (
                "profiler_kernel_time_ns",
                self.profiler_kernel_time_ns.to_string(),
            ),
            (
                "speedup_evidence_status",
                json_string(&self.speedup_evidence_status),
            ),
            ("fallback_reason", json_string(&self.fallback_reason)),
            ("speed_claim_allowed", self.speed_claim_allowed.to_string()),
            ("speed_claim_blocker", json_string(&self.speed_claim_blocker)),
        ];
        // Keys are fixed identifiers, so they need no escaping.
        let body = members
            .iter()
            .map(|(key, value)| format!("\"{key}\":{value}"))
            .collect::<Vec<_>>()
            .join(",");
        format!("{{{body}}}")
    }
}
fn dense_cpu_row(
fixture: &PadicStratifiedBenchmarkFixture,
domain: &PadicDomain,
lhs: &PadicMatrix,
rhs: &PadicMatrix,
config: PadicStratifiedBenchmarkConfig,
) -> Result<PadicStratifiedBenchmarkRow> {
for _ in 0..config.warmup_runs {
let _ = domain.dense_matrix_mul(lhs, rhs)?;
}
let mut wall_clock_samples = Vec::with_capacity(config.measured_runs);
let mut dense = None;
for _ in 0..config.measured_runs {
let start = Instant::now();
let output = domain.dense_matrix_mul(lhs, rhs)?;
wall_clock_samples.push(start.elapsed().as_nanos());
dense = Some(output);
}
let dense = dense.ok_or_else(|| Error::backend("missing dense CPU benchmark measurement"))?;
let wall_clock = TimingSummary::from_samples(&wall_clock_samples)?;
Ok(PadicStratifiedBenchmarkRow {
fixture: fixture.name.to_string(),
distribution: fixture.distribution.to_string(),
backend: "dense_cpu_padic_matmul".to_string(),
status: "passed".to_string(),
shape: fixture.shape(),
prime: fixture.prime,
precision: fixture.precision,
skipped_products: 0,
evaluated_products: fixture.rows * fixture.inner * fixture.cols,
precision_margin_min: None,
dense_cpu_oracle_matches: true,
sparse_cpu_oracle_matches: true,
certificate_coverage: false,
output_residue_fingerprint: residue_fingerprint(&dense.data),
device_fingerprint: "cpu".to_string(),
kernel_source_fingerprint: "not_applicable".to_string(),
compiler_fingerprint: "rust_cpu".to_string(),
transfer_time_ns: 0,
kernel_time_ns: 0,
wall_clock_ns: wall_clock.median,
warmup_runs: config.warmup_runs,
measured_runs: config.measured_runs,
transfer_time_min_ns: 0,
transfer_time_median_ns: 0,
transfer_time_max_ns: 0,
kernel_time_min_ns: 0,
kernel_time_median_ns: 0,
kernel_time_max_ns: 0,
wall_clock_min_ns: wall_clock.min,
wall_clock_median_ns: wall_clock.median,
wall_clock_max_ns: wall_clock.max,
timing_scope: "repeated_dense_cpu_oracle_wall_clock".to_string(),
external_baseline_kind: "not_applicable_cpu_oracle".to_string(),
external_baseline_source: "not_provided".to_string(),
external_baseline_wall_clock_ns: 0,
profiler_tool: "not_captured".to_string(),
profiler_trace_status: "missing".to_string(),
profiler_kernel_time_ns: 0,
speedup_evidence_status: "blocked_cpu_oracle_row".to_string(),
fallback_reason: "none".to_string(),
speed_claim_allowed: false,
speed_claim_blocker: "dense CPU row is an oracle baseline, not an acceleration claim"
.to_string(),
})
}
fn certified_sparse_cpu_row(
fixture: &PadicStratifiedBenchmarkFixture,
domain: &PadicDomain,
lhs: &PadicMatrix,
rhs: &PadicMatrix,
dense: &PadicStratifiedBenchmarkRow,
config: PadicStratifiedBenchmarkConfig,
) -> Result<PadicStratifiedBenchmarkRow> {
for _ in 0..config.warmup_runs {
let _ = domain.certified_valuation_sparse_matrix_mul(lhs, rhs)?;
}
let mut wall_clock_samples = Vec::with_capacity(config.measured_runs);
let mut sparse = None;
for _ in 0..config.measured_runs {
let start = Instant::now();
let output = domain.certified_valuation_sparse_matrix_mul(lhs, rhs)?;
wall_clock_samples.push(start.elapsed().as_nanos());
sparse = Some(output);
}
let sparse = sparse
.ok_or_else(|| Error::backend("missing certified sparse CPU benchmark measurement"))?;
let wall_clock = TimingSummary::from_samples(&wall_clock_samples)?;
let sparse_fingerprint = residue_fingerprint(&sparse.output.data);
let oracle_matches =
sparse.dense_oracle_matches && sparse_fingerprint == dense.output_residue_fingerprint;
let certificate_coverage = certificate_coverage(
&sparse.output_certificates,
fixture.rows,
fixture.cols,
sparse.evaluated_products,
sparse.skipped_products,
);
Ok(PadicStratifiedBenchmarkRow {
fixture: fixture.name.to_string(),
distribution: fixture.distribution.to_string(),
backend: "certified_sparse_cpu_padic_matmul".to_string(),
status: "passed".to_string(),
shape: fixture.shape(),
prime: fixture.prime,
precision: fixture.precision,
skipped_products: sparse.skipped_products,
evaluated_products: sparse.evaluated_products,
precision_margin_min: min_margin(&sparse.output_certificates),
dense_cpu_oracle_matches: oracle_matches,
sparse_cpu_oracle_matches: oracle_matches,
certificate_coverage,
output_residue_fingerprint: sparse_fingerprint,
device_fingerprint: "cpu".to_string(),
kernel_source_fingerprint: "not_applicable".to_string(),
compiler_fingerprint: "rust_cpu".to_string(),
transfer_time_ns: 0,
kernel_time_ns: 0,
wall_clock_ns: wall_clock.median,
warmup_runs: config.warmup_runs,
measured_runs: config.measured_runs,
transfer_time_min_ns: 0,
transfer_time_median_ns: 0,
transfer_time_max_ns: 0,
kernel_time_min_ns: 0,
kernel_time_median_ns: 0,
kernel_time_max_ns: 0,
wall_clock_min_ns: wall_clock.min,
wall_clock_median_ns: wall_clock.median,
wall_clock_max_ns: wall_clock.max,
timing_scope: "repeated_certified_sparse_cpu_oracle_wall_clock".to_string(),
external_baseline_kind: "not_applicable_cpu_oracle".to_string(),
external_baseline_source: "not_provided".to_string(),
external_baseline_wall_clock_ns: 0,
profiler_tool: "not_captured".to_string(),
profiler_trace_status: "missing".to_string(),
profiler_kernel_time_ns: 0,
speedup_evidence_status: "blocked_cpu_work_reduction_not_speedup".to_string(),
fallback_reason: "none".to_string(),
speed_claim_allowed: false,
speed_claim_blocker:
"CPU sparse row quantifies work reduction but does not isolate speedup".to_string(),
})
}
fn hip_or_fallback_row(
fixture: &PadicStratifiedBenchmarkFixture,
domain: &PadicDomain,
device: &RocmHipCapabilityReport,
dense: &PadicStratifiedBenchmarkRow,
sparse: &PadicStratifiedBenchmarkRow,
config: PadicStratifiedBenchmarkConfig,
) -> Result<PadicStratifiedBenchmarkRow> {
if !fixture.hip_supported() {
return Ok(hip_fallback_row(
fixture,
device,
dense,
sparse,
"unsupported prime, precision, or shape for the runtime-shape HIP pilot",
config,
));
}
if !device.available {
return Ok(hip_fallback_row(
fixture,
device,
dense,
sparse,
"ROCm/HIP unavailable on this host",
config,
));
}
for _ in 0..config.warmup_runs {
let _ = run_rocm_hip_padic_stratified_matmul_with_shape(
domain,
(fixture.rows, fixture.inner, fixture.cols),
&fixture.lhs,
&fixture.rhs,
)?;
}
let mut transfer_samples = Vec::with_capacity(config.measured_runs);
let mut kernel_samples = Vec::with_capacity(config.measured_runs);
let mut wall_clock_samples = Vec::with_capacity(config.measured_runs);
let mut report = None;
for _ in 0..config.measured_runs {
let start = Instant::now();
let measured_report = run_rocm_hip_padic_stratified_matmul_with_shape(
domain,
(fixture.rows, fixture.inner, fixture.cols),
&fixture.lhs,
&fixture.rhs,
)?;
wall_clock_samples.push(start.elapsed().as_nanos());
transfer_samples.push(measured_report.transfer_time_ns);
kernel_samples.push(measured_report.kernel_time_ns);
report = Some(measured_report);
}
let report = report.ok_or_else(|| Error::backend("missing HIP benchmark measurement"))?;
let transfer = TimingSummary::from_samples(&transfer_samples)?;
let kernel = TimingSummary::from_samples(&kernel_samples)?;
let wall_clock = TimingSummary::from_samples(&wall_clock_samples)?;
let output_residue_fingerprint = u64_residue_fingerprint(&report.hip_output_residues);
let dense_matches = report.cpu_dense_oracle_matches
&& output_residue_fingerprint == dense.output_residue_fingerprint;
let sparse_matches = report.cpu_sparse_oracle_matches
&& output_residue_fingerprint == sparse.output_residue_fingerprint;
let skipped_products = report
.hip_certificates
.iter()
.map(|cert| cert.skipped_product_count)
.sum();
let evaluated_products = report
.hip_certificates
.iter()
.map(|cert| cert.evaluated_product_count)
.sum();
let certificate_coverage = certificate_coverage(
&report.hip_certificates,
fixture.rows,
fixture.cols,
evaluated_products,
skipped_products,
) && report.certificate_oracle_matches;
Ok(PadicStratifiedBenchmarkRow {
fixture: fixture.name.to_string(),
distribution: fixture.distribution.to_string(),
backend: report.backend,
status: "passed".to_string(),
shape: fixture.shape(),
prime: fixture.prime,
precision: fixture.precision,
skipped_products,
evaluated_products,
precision_margin_min: min_margin(&report.hip_certificates),
dense_cpu_oracle_matches: dense_matches,
sparse_cpu_oracle_matches: sparse_matches,
certificate_coverage,
output_residue_fingerprint,
device_fingerprint: report.device_evidence.capability_fingerprint,
kernel_source_fingerprint: report.kernel_source_fingerprint,
compiler_fingerprint: report.compiler_fingerprint,
transfer_time_ns: transfer.median,
kernel_time_ns: kernel.median,
wall_clock_ns: wall_clock.median,
warmup_runs: config.warmup_runs,
measured_runs: config.measured_runs,
transfer_time_min_ns: transfer.min,
transfer_time_median_ns: transfer.median,
transfer_time_max_ns: transfer.max,
kernel_time_min_ns: kernel.min,
kernel_time_median_ns: kernel.median,
kernel_time_max_ns: kernel.max,
wall_clock_min_ns: wall_clock.min,
wall_clock_median_ns: wall_clock.median,
wall_clock_max_ns: wall_clock.max,
timing_scope: "warmup+repeated_compile_transfer_kernel_cpu_oracle_wall_clock_with_kernel_transfer_split"
.to_string(),
external_baseline_kind: "missing_external_baseline".to_string(),
external_baseline_source: "not_provided".to_string(),
external_baseline_wall_clock_ns: 0,
profiler_tool: "not_captured".to_string(),
profiler_trace_status: "missing".to_string(),
profiler_kernel_time_ns: 0,
speedup_evidence_status: "blocked_missing_external_baseline_and_profiler".to_string(),
fallback_reason: "none".to_string(),
speed_claim_allowed: false,
speed_claim_blocker: "local repeated timing still lacks external baseline and profiler evidence"
.to_string(),
})
}
/// Builds the HIP row for a fixture that was not executed on a device.
///
/// Product counts, margin, certificate coverage and the output fingerprint
/// are copied from the certified sparse CPU row. Each oracle flag is copied
/// from the matching CPU row. All timings and `measured_runs` are zero,
/// `reason` is recorded as the fallback reason, and the row never allows a
/// speed claim.
fn hip_fallback_row(
    fixture: &PadicStratifiedBenchmarkFixture,
    device: &RocmHipCapabilityReport,
    dense: &PadicStratifiedBenchmarkRow,
    sparse: &PadicStratifiedBenchmarkRow,
    reason: &str,
    config: PadicStratifiedBenchmarkConfig,
) -> PadicStratifiedBenchmarkRow {
    // Nothing ran on the device, so every timing field is zero.
    const NOT_MEASURED_NS: u128 = 0;
    PadicStratifiedBenchmarkRow {
        fixture: fixture.name.to_owned(),
        distribution: fixture.distribution.to_owned(),
        backend: ROCM_HIP_PADIC_STRATIFIED_MATMUL_BACKEND.to_string(),
        status: "fallback_captured".to_owned(),
        shape: fixture.shape(),
        prime: fixture.prime,
        precision: fixture.precision,
        skipped_products: sparse.skipped_products,
        evaluated_products: sparse.evaluated_products,
        precision_margin_min: sparse.precision_margin_min,
        dense_cpu_oracle_matches: dense.dense_cpu_oracle_matches,
        sparse_cpu_oracle_matches: sparse.sparse_cpu_oracle_matches,
        certificate_coverage: sparse.certificate_coverage,
        output_residue_fingerprint: sparse.output_residue_fingerprint.clone(),
        device_fingerprint: device.capability_fingerprint.clone(),
        kernel_source_fingerprint: "not_built".to_owned(),
        compiler_fingerprint: "not_used".to_owned(),
        transfer_time_ns: NOT_MEASURED_NS,
        kernel_time_ns: NOT_MEASURED_NS,
        wall_clock_ns: NOT_MEASURED_NS,
        warmup_runs: config.warmup_runs,
        measured_runs: 0,
        transfer_time_min_ns: NOT_MEASURED_NS,
        transfer_time_median_ns: NOT_MEASURED_NS,
        transfer_time_max_ns: NOT_MEASURED_NS,
        kernel_time_min_ns: NOT_MEASURED_NS,
        kernel_time_median_ns: NOT_MEASURED_NS,
        kernel_time_max_ns: NOT_MEASURED_NS,
        wall_clock_min_ns: NOT_MEASURED_NS,
        wall_clock_median_ns: NOT_MEASURED_NS,
        wall_clock_max_ns: NOT_MEASURED_NS,
        timing_scope: "fallback_row_no_device_execution".to_owned(),
        external_baseline_kind: "not_applicable_fallback".to_owned(),
        external_baseline_source: "not_provided".to_owned(),
        external_baseline_wall_clock_ns: 0,
        profiler_tool: "not_captured".to_owned(),
        profiler_trace_status: "missing".to_owned(),
        profiler_kernel_time_ns: 0,
        speedup_evidence_status: "blocked_fallback_row_no_device_execution".to_owned(),
        fallback_reason: reason.to_owned(),
        speed_claim_allowed: false,
        speed_claim_blocker: "fallback row cannot support hardware speedup".to_owned(),
    }
}
/// Builds the report summary for a fixture from its parameters and the
/// metadata of its two input matrices.
///
/// # Errors
///
/// Propagates a failure to obtain either matrix's metadata (left-hand first).
fn fixture_summary(
    fixture: &PadicStratifiedBenchmarkFixture,
    lhs: &PadicMatrix,
    rhs: &PadicMatrix,
) -> Result<PadicStratifiedBenchmarkFixtureSummary> {
    let lhs_metadata = lhs.metadata()?;
    let rhs_metadata = rhs.metadata()?;
    // Render the histograms before the fingerprints are moved out.
    let lhs_valuation_histogram = histogram_string(&lhs_metadata.valuation_histogram);
    let rhs_valuation_histogram = histogram_string(&rhs_metadata.valuation_histogram);
    let summary = PadicStratifiedBenchmarkFixtureSummary {
        name: String::from(fixture.name),
        distribution: String::from(fixture.distribution),
        shape: fixture.shape(),
        prime: fixture.prime,
        precision: fixture.precision,
        lhs_bucket_fingerprint: lhs_metadata.bucket_fingerprint,
        rhs_bucket_fingerprint: rhs_metadata.bucket_fingerprint,
        lhs_valuation_histogram,
        rhs_valuation_histogram,
    };
    Ok(summary)
}
/// Returns the seven built-in benchmark fixtures in their fixed report order.
fn benchmark_fixtures() -> Vec<PadicStratifiedBenchmarkFixture> {
    /// Assembles one fixture; `dims` is `(rows, inner, cols)`.
    fn fixture(
        name: &'static str,
        distribution: &'static str,
        dims: (usize, usize, usize),
        prime: u64,
        precision: u32,
        lhs: &[u64],
        rhs: &[u64],
    ) -> PadicStratifiedBenchmarkFixture {
        let (rows, inner, cols) = dims;
        PadicStratifiedBenchmarkFixture {
            name,
            distribution,
            rows,
            inner,
            cols,
            prime,
            precision,
            lhs: lhs.to_vec(),
            rhs: rhs.to_vec(),
        }
    }

    vec![
        fixture(
            "unit_heavy_q5_p3_2x3x2",
            "unit_heavy",
            (2, 3, 2),
            5,
            3,
            &[1, 2, 3, 4, 6, 7],
            &[8, 9, 11, 12, 13, 14],
        ),
        fixture(
            "sparse_high_valuation_q5_p3_2x3x2",
            "sparse_high_valuation",
            (2, 3, 2),
            5,
            3,
            &[25, 50, 5, 0, 125, 75],
            &[5, 25, 50, 75, 1, 125],
        ),
        fixture(
            "mixed_q5_p3_2x3x2",
            "mixed",
            (2, 3, 2),
            5,
            3,
            &[25, 1, 5, 2, 125, 4],
            &[5, 3, 25, 5, 1, 25],
        ),
        fixture(
            "adversarial_boundary_q5_p3_2x3x2",
            "adversarial_boundary",
            (2, 3, 2),
            5,
            3,
            &[1, 5, 25, 5, 25, 1],
            &[25, 5, 5, 25, 1, 1],
        ),
        fixture(
            "mixed_q7_p2_2x3x2_fallback",
            "mixed_prime_variant",
            (2, 3, 2),
            7,
            2,
            &[1, 7, 14, 2, 49, 3],
            &[7, 5, 49, 1, 4, 14],
        ),
        fixture(
            "sparse_q5_p3_3x3x3_runtime_shape",
            "larger_runtime_shape",
            (3, 3, 3),
            5,
            3,
            &[1, 5, 25, 2, 10, 50, 3, 15, 75],
            &[5, 1, 25, 10, 2, 50, 15, 3, 75],
        ),
        fixture(
            "sparse_q5_p4_3x3x3_fallback",
            "larger_precision_shape",
            (3, 3, 3),
            5,
            4,
            &[1, 5, 25, 125, 2, 10, 50, 250, 3],
            &[5, 1, 125, 25, 2, 250, 3, 75, 625],
        ),
    ]
}
impl PadicStratifiedBenchmarkFixture {
    /// Formats the problem dimensions as `{rows}x{inner}x{cols}`.
    fn shape(&self) -> String {
        let Self {
            rows, inner, cols, ..
        } = self;
        format!("{rows}x{inner}x{cols}")
    }

    /// Reports whether the fixture can be run by the HIP path of this module:
    /// prime 5, precision 3 and every dimension between 1 and 3 inclusive.
    fn hip_supported(&self) -> bool {
        let domain_supported = self.prime == 5 && self.precision == 3;
        let dims = [self.rows, self.inner, self.cols];
        domain_supported && dims.iter().all(|&dim| matches!(dim, 1..=3))
    }
}
/// Minimum, median and maximum of a set of nanosecond timing samples.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct TimingSummary {
    /// Smallest sample.
    min: u128,
    /// Middle sample; for an even sample count, the midpoint of the two
    /// middle samples rounded down.
    median: u128,
    /// Largest sample.
    max: u128,
}
impl TimingSummary {
    /// Summarises `samples`; the input may be in any order.
    ///
    /// For an even number of samples the median is the midpoint of the two
    /// middle values. Picking the upper middle element instead would make the
    /// median equal the maximum whenever exactly two runs are measured, which
    /// is the default configuration.
    ///
    /// # Errors
    ///
    /// Returns a backend error when `samples` is empty.
    fn from_samples(samples: &[u128]) -> Result<Self> {
        if samples.is_empty() {
            return Err(Error::backend(
                "p-adic benchmark timing summary requires at least one sample",
            ));
        }
        let mut sorted = samples.to_vec();
        sorted.sort_unstable();
        let upper_mid = sorted.len() / 2;
        let median = if sorted.len() % 2 == 0 {
            // `low <= high` after sorting, so this form cannot overflow the
            // way `(low + high) / 2` could.
            let (low, high) = (sorted[upper_mid - 1], sorted[upper_mid]);
            low + (high - low) / 2
        } else {
            sorted[upper_mid]
        };
        Ok(Self {
            min: sorted[0],
            median,
            max: sorted[sorted.len() - 1],
        })
    }
}
/// Builds a `rows` x `cols` matrix in `domain` from raw `u64` residues.
///
/// Each residue is widened to `u128` and turned into a domain element before
/// the element list is handed to `domain.matrix`.
///
/// # Errors
///
/// Propagates any error returned by `domain.matrix`.
fn matrix_from_residues(
    domain: &PadicDomain,
    rows: usize,
    cols: usize,
    residues: &[u64],
) -> Result<PadicMatrix> {
    let elements = residues
        .iter()
        .copied()
        .map(|residue| domain.element(u128::from(residue)))
        .collect();
    domain.matrix(rows, cols, elements)
}
/// Checks that `certificates` fully and consistently cover a `rows` x `cols`
/// output.
///
/// All of the following must hold:
/// - there is exactly one certificate per output entry;
/// - certificate `i` is for row `i / cols`, column `i % cols`;
/// - the certificates' evaluated and skipped counts add up to
///   `evaluated_products` and `skipped_products`;
/// - every certificate that skipped products records a minimum skipped
///   valuation that is at least its precision cutoff.
fn certificate_coverage(
    certificates: &[PadicOutputCertificate],
    rows: usize,
    cols: usize,
    evaluated_products: usize,
    skipped_products: usize,
) -> bool {
    // This check must come first. When `cols` is zero the expected count is
    // zero, so the position check below never divides by zero.
    if certificates.len() != rows * cols {
        return false;
    }

    let in_row_major_order = certificates
        .iter()
        .enumerate()
        .all(|(idx, cert)| cert.row == idx / cols && cert.col == idx % cols);
    if !in_row_major_order {
        return false;
    }

    let evaluated_total: usize = certificates
        .iter()
        .map(|cert| cert.evaluated_product_count)
        .sum();
    if evaluated_total != evaluated_products {
        return false;
    }

    let skipped_total: usize = certificates
        .iter()
        .map(|cert| cert.skipped_product_count)
        .sum();
    if skipped_total != skipped_products {
        return false;
    }

    certificates.iter().all(|cert| {
        cert.skipped_product_count == 0
            || cert
                .min_skipped_valuation
                .map_or(false, |valuation| valuation >= cert.precision_cutoff)
    })
}
/// Returns the smallest `precision_safety_margin` among the certificates that
/// carry one, or `None` when none of them does.
fn min_margin(certificates: &[PadicOutputCertificate]) -> Option<u32> {
    let mut smallest: Option<u32> = None;
    for cert in certificates {
        if let Some(margin) = cert.precision_safety_margin {
            smallest = Some(smallest.map_or(margin, |current| current.min(margin)));
        }
    }
    smallest
}
/// Fingerprints p-adic values as `residue-fingerprint-` followed by their
/// residues in order, joined by `stable_join`.
fn residue_fingerprint(values: &[crate::domain::Padic]) -> String {
    let mut residues = Vec::with_capacity(values.len());
    residues.extend(values.iter().map(|element| element.residue));
    let joined = stable_join(&residues);
    format!("residue-fingerprint-{joined}")
}
/// Fingerprints raw `u64` residues as `residue-fingerprint-` followed by the
/// values in order, widened to `u128` and joined by `stable_join`.
fn u64_residue_fingerprint(values: &[u64]) -> String {
    let widened: Vec<u128> = values.iter().copied().map(u128::from).collect();
    let joined = stable_join(&widened);
    format!("residue-fingerprint-{joined}")
}
/// Joins the decimal representations of `values` with `-`, preserving order.
/// An empty slice yields an empty string.
fn stable_join(values: &[u128]) -> String {
    let mut joined = String::new();
    for (position, value) in values.iter().enumerate() {
        if position > 0 {
            joined.push('-');
        }
        joined.push_str(&value.to_string());
    }
    joined
}
/// Renders a valuation histogram as `valuation:count` pairs separated by `;`,
/// in ascending valuation order (the map's iteration order).
fn histogram_string(histogram: &std::collections::BTreeMap<u32, usize>) -> String {
    let mut rendered = String::new();
    for (position, (valuation, count)) in histogram.iter().enumerate() {
        if position > 0 {
            rendered.push(';');
        }
        rendered.push_str(&format!("{valuation}:{count}"));
    }
    rendered
}
/// Formats `value` as a single CSV field.
///
/// The value is wrapped in double quotes, with embedded quotes doubled, when
/// it contains a comma, a double quote, a line feed or a carriage return.
/// A bare carriage return is a line break for CSV readers too, so it has to
/// be quoted just like `\n`. Any other value is returned unchanged.
fn csv(value: &str) -> String {
    let needs_quoting = value
        .chars()
        .any(|ch| matches!(ch, ',' | '"' | '\n' | '\r'));
    if needs_quoting {
        format!("\"{}\"", value.replace('"', "\"\""))
    } else {
        value.to_string()
    }
}
fn json_array(values: &[String]) -> String {
format!(
"[{}]",
values
.iter()
.map(|value| json_string(value))
.collect::<Vec<_>>()
.join(",")
)
}
/// Renders an optional number as JSON: the decimal value, or `null` when absent.
fn json_option_u32(value: Option<u32>) -> String {
    match value {
        Some(number) => number.to_string(),
        None => String::from("null"),
    }
}
/// Encodes `value` as a double-quoted JSON string literal.
///
/// Quotes, backslashes, `\n`, `\r` and `\t` use their two-character escapes.
/// Every other character for which `char::is_control` is true is written as
/// `\u` followed by four lowercase hex digits. All remaining characters are
/// copied through unchanged.
fn json_string(value: &str) -> String {
    let mut quoted = String::with_capacity(value.len() + 2);
    quoted.push('"');
    for ch in value.chars() {
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };
        if let Some(sequence) = short_escape {
            quoted.push_str(sequence);
        } else if ch.is_control() {
            quoted.push_str(&format!("\\u{:04x}", u32::from(ch)));
        } else {
            quoted.push(ch);
        }
    }
    quoted.push('"');
    quoted
}
/// Escapes `value` for use inside a Markdown table cell.
///
/// Pipes are backslash-escaped so they are not read as column separators.
/// Line breaks (`\r\n`, `\n` or `\r`) are replaced with `<br>`, because a raw
/// line break would end the table row and corrupt the rest of the table.
fn md(value: &str) -> String {
    value
        .replace('|', "\\|")
        // Replace the two-character sequence first so it yields one break.
        .replace("\r\n", "<br>")
        .replace('\n', "<br>")
        .replace('\r', "<br>")
}