use std::collections::BTreeSet;
/// Direction in which an observed benchmark value is compared with its threshold.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BenchmarkThresholdDirection {
/// The observed value must be greater than or equal to the threshold.
AtLeast,
/// The observed value must be less than or equal to the threshold.
AtMost,
}
/// One recorded release benchmark measurement together with its pass/fail target.
///
/// The constraints listed on the fields are the ones enforced by
/// `validate_release_benchmark_baselines`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ReleaseBenchmarkBaseline {
/// Identifier of the baseline; must be non-blank and unique within a validated set.
pub id: &'static str,
/// Recorded command; must be non-blank and, after leading whitespace, start with `./cargo_full `.
pub command: &'static str,
/// Hardware description; must be non-blank and contain `CUDA`, `NVIDIA`, or `RTX`.
pub hardware: &'static str,
/// Name of the measured metric; must be non-blank.
pub metric: &'static str,
/// Measured value; must be finite and greater than zero.
pub observed: f64,
/// Target the measurement is compared against; must be finite and greater than zero.
pub threshold: f64,
/// Whether `observed` must be at least or at most `threshold`.
pub direction: BenchmarkThresholdDirection,
}
/// Summary returned by `validate_release_benchmark_baselines` when every baseline passes.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ReleaseBenchmarkBaselineProof {
/// Number of baselines in the validated slice.
pub baseline_count: usize,
/// Number of baselines whose hardware string passed the CUDA/NVIDIA/RTX check.
pub cuda_baseline_count: usize,
}
/// Borrowed view of one benchmark evidence artifact checked by
/// `validate_committed_benchmark_artifacts`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ReleaseBenchmarkArtifact<'a> {
/// Path of the artifact; copied into error values to identify the offending artifact.
pub path: &'a str,
/// Text of the artifact that is searched for the required evidence markers.
pub contents: &'a str,
/// Substring that must occur somewhere in `contents`.
pub required_family: &'a str,
}
/// Summary returned by `validate_committed_benchmark_artifacts` when every artifact passes.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ReleaseBenchmarkArtifactProof {
/// Number of artifacts in the validated slice.
pub artifact_count: usize,
}
/// Reasons why release benchmark baselines or benchmark artifacts fail validation.
///
/// The `Display` implementation renders each variant as a description of the
/// violation followed by a `Fix:` hint.
#[derive(Clone, Debug, PartialEq)]
pub enum ReleaseBenchmarkBaselineError {
/// The input slice was empty. Returned for an empty baseline slice and also
/// for an empty artifact slice.
EmptyBaselines,
/// Two baselines in the same set share this `id`.
DuplicateId {
id: &'static str,
},
/// The named textual `field` of baseline `id` is empty or whitespace-only.
EmptyMetadata {
id: &'static str,
field: &'static str,
},
/// The baseline's `command` does not start with `./cargo_full ` (leading
/// whitespace is ignored).
CommandDoesNotUseCargoFull {
id: &'static str,
command: &'static str,
},
/// The baseline's `hardware` contains none of `CUDA`, `NVIDIA`, or `RTX`.
MissingCudaHardware {
id: &'static str,
hardware: &'static str,
},
/// `observed` or `threshold` is non-finite or not greater than zero.
InvalidMetric {
id: &'static str,
},
/// The observed value is on the wrong side of the threshold for `direction`.
ThresholdMiss {
id: &'static str,
observed: f64,
threshold: f64,
direction: BenchmarkThresholdDirection,
},
/// The artifact at `path` lacks the described `evidence`; also used when one
/// of the artifact's own fields is blank, in which case `evidence` is the
/// field name.
ArtifactMissingEvidence {
path: String,
evidence: &'static str,
},
}
// Human-readable rendering: every variant becomes one sentence describing the
// violation, followed by a `Fix:` hint telling the reader how to resolve it.
impl std::fmt::Display for ReleaseBenchmarkBaselineError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Nothing to interpolate here, so the text is written as-is.
            Self::EmptyBaselines => f.write_str(
                "release benchmark baselines are empty. Fix: commit exact benchmark command, CUDA hardware, metric, observed value, and threshold.",
            ),
            Self::DuplicateId { id } => f.write_fmt(format_args!(
                "release benchmark baseline `{id}` is duplicated. Fix: keep one owner per release metric."
            )),
            Self::EmptyMetadata { id, field } => f.write_fmt(format_args!(
                "release benchmark baseline `{id}` has empty {field}. Fix: every baseline needs id, command, hardware, metric, observed value, and threshold."
            )),
            Self::CommandDoesNotUseCargoFull { id, command } => f.write_fmt(format_args!(
                "release benchmark baseline `{id}` uses `{command}` instead of ./cargo_full. Fix: record the exact cargo_full benchmark command."
            )),
            Self::MissingCudaHardware { id, hardware } => f.write_fmt(format_args!(
                "release benchmark baseline `{id}` hardware `{hardware}` does not name CUDA/NVIDIA hardware. Fix: record the RTX 5090/CUDA device used for the run."
            )),
            Self::InvalidMetric { id } => f.write_fmt(format_args!(
                "release benchmark baseline `{id}` has invalid metric values. Fix: observed and threshold must be positive finite values."
            )),
            // The direction is rendered with its `Debug` form (the variant name).
            Self::ThresholdMiss {
                id,
                observed,
                threshold,
                direction,
            } => f.write_fmt(format_args!(
                "release benchmark baseline `{id}` missed threshold: observed={observed}, threshold={threshold}, direction={direction:?}. Fix: improve performance or update the release target with explicit approval."
            )),
            Self::ArtifactMissingEvidence { path, evidence } => f.write_fmt(format_args!(
                "release benchmark artifact `{path}` is missing {evidence}. Fix: commit benchmark evidence with CUDA hardware, pass status, 100x contract, samples, and source fingerprint."
            )),
        }
    }
}
// Marks the enum as a standard error type; the empty body keeps the trait's
// default methods, so no underlying source error is reported.
impl std::error::Error for ReleaseBenchmarkBaselineError {}
/// Validates a set of release benchmark baselines and summarizes the result.
///
/// Baselines are checked in slice order and validation stops at the first
/// violation. For each baseline the checks run in this order: non-blank
/// metadata, unique id, `./cargo_full ` command prefix, CUDA hardware marker,
/// positive finite metric values, and finally the threshold comparison.
///
/// # Errors
///
/// * `EmptyBaselines` when `baselines` is empty.
/// * `EmptyMetadata` when id, command, hardware, or metric is blank.
/// * `DuplicateId` when an id occurs more than once.
/// * `CommandDoesNotUseCargoFull` when the command, ignoring leading
///   whitespace, does not start with `./cargo_full `.
/// * `MissingCudaHardware` when the hardware string contains none of `CUDA`,
///   `NVIDIA`, or `RTX`.
/// * `InvalidMetric` when `observed` or `threshold` is non-finite or `<= 0`.
/// * `ThresholdMiss` when the observed value is on the wrong side of the
///   threshold for the baseline's direction.
pub fn validate_release_benchmark_baselines(
    baselines: &[ReleaseBenchmarkBaseline],
) -> Result<ReleaseBenchmarkBaselineProof, ReleaseBenchmarkBaselineError> {
    // The trailing space is deliberate: a bare `./cargo_full` with no
    // arguments is not accepted as a benchmark command.
    const REQUIRED_COMMAND_PREFIX: &str = "./cargo_full ";
    // Any one of these substrings is enough to count as CUDA hardware.
    const CUDA_HARDWARE_MARKERS: [&str; 3] = ["CUDA", "NVIDIA", "RTX"];

    if baselines.is_empty() {
        return Err(ReleaseBenchmarkBaselineError::EmptyBaselines);
    }

    let is_positive_finite = |value: f64| value.is_finite() && value > 0.0;
    let mut seen_ids = BTreeSet::new();
    let mut cuda_baseline_count = 0_usize;

    for record in baselines {
        validate_metadata(record)?;

        let id = record.id;
        if !seen_ids.insert(id) {
            return Err(ReleaseBenchmarkBaselineError::DuplicateId { id });
        }

        let uses_cargo_full = record
            .command
            .trim_start()
            .starts_with(REQUIRED_COMMAND_PREFIX);
        if !uses_cargo_full {
            return Err(ReleaseBenchmarkBaselineError::CommandDoesNotUseCargoFull {
                id,
                command: record.command,
            });
        }

        let names_cuda_hardware = CUDA_HARDWARE_MARKERS
            .iter()
            .any(|marker| record.hardware.contains(*marker));
        if !names_cuda_hardware {
            return Err(ReleaseBenchmarkBaselineError::MissingCudaHardware {
                id,
                hardware: record.hardware,
            });
        }
        // Counted as soon as the hardware check passes, before the metric checks.
        cuda_baseline_count += 1;

        if !(is_positive_finite(record.observed) && is_positive_finite(record.threshold)) {
            return Err(ReleaseBenchmarkBaselineError::InvalidMetric { id });
        }

        let within_target = match record.direction {
            BenchmarkThresholdDirection::AtLeast => record.observed >= record.threshold,
            BenchmarkThresholdDirection::AtMost => record.observed <= record.threshold,
        };
        if !within_target {
            return Err(ReleaseBenchmarkBaselineError::ThresholdMiss {
                id,
                observed: record.observed,
                threshold: record.threshold,
                direction: record.direction,
            });
        }
    }

    Ok(ReleaseBenchmarkBaselineProof {
        baseline_count: baselines.len(),
        cuda_baseline_count,
    })
}
/// Validates that every benchmark artifact carries the required release evidence.
///
/// Artifacts are checked in slice order and validation stops at the first
/// missing piece. All checks are plain substring searches on the artifact
/// contents, so they depend on the exact spelling and spacing of the markers.
/// For each artifact the order is: non-blank `path`, `contents` and
/// `required_family`; the four mandatory markers; the four "any of" marker
/// groups; and finally the presence of `required_family` in the contents.
///
/// # Errors
///
/// * `EmptyBaselines` when `artifacts` is empty.
///   NOTE(review): this reuses the baseline variant, so the rendered message
///   talks about baselines rather than artifacts; confirm whether that is
///   intended before relying on the message text.
/// * `ArtifactMissingEvidence` naming the first blank field or missing
///   evidence of the first failing artifact.
pub fn validate_committed_benchmark_artifacts(
    artifacts: &[ReleaseBenchmarkArtifact<'_>],
) -> Result<ReleaseBenchmarkArtifactProof, ReleaseBenchmarkBaselineError> {
    // (evidence label, exact substring that must be present)
    const MANDATORY_MARKERS: [(&str, &str); 4] = [
        ("selected CUDA backend", "\"selected_backend\": \"cuda\""),
        ("RTX 5090 hardware", "NVIDIA GeForce RTX 5090"),
        ("NVIDIA driver version", "\"nvidia_driver_version\""),
        ("source fingerprint", "\"source_fingerprint\""),
    ];
    // (evidence label, substrings of which at least one must be present)
    const ALTERNATIVE_MARKERS: [(&str, &[&str]); 4] = [
        (
            "passing case status",
            &["\"status\": \"pass\"", "\"failed_count\": 0"],
        ),
        (
            "100x speedup floor",
            &["\"min_speedup_x\": 100", "\"cpu_sota_100x_required\": true"],
        ),
        (
            "passed performance contract",
            &[
                "\"contract_passed\": true",
                "\"cpu_sota_100x_passing_cases\": 1",
            ],
        ),
        ("wall sample count", &["\"samples\"", "\"min_wall_samples\""]),
    ];

    if artifacts.is_empty() {
        return Err(ReleaseBenchmarkBaselineError::EmptyBaselines);
    }

    for entry in artifacts {
        for (field, value) in [
            ("path", entry.path),
            ("contents", entry.contents),
            ("required_family", entry.required_family),
        ] {
            validate_artifact_field(entry, field, value)?;
        }

        for (evidence, needle) in MANDATORY_MARKERS {
            require_artifact_contains(entry, evidence, needle)?;
        }

        for (evidence, needles) in ALTERNATIVE_MARKERS {
            require_artifact_contains_any(entry, evidence, needles)?;
        }

        let names_required_family = entry.contents.contains(entry.required_family);
        if !names_required_family {
            return Err(ReleaseBenchmarkBaselineError::ArtifactMissingEvidence {
                path: entry.path.to_owned(),
                evidence: "required workload family",
            });
        }
    }

    Ok(ReleaseBenchmarkArtifactProof {
        artifact_count: artifacts.len(),
    })
}
/// Rejects an artifact field whose value is empty or whitespace-only.
///
/// `field` is the name reported as the missing evidence; `value` is the text
/// being checked. On failure the artifact's path is copied into the error.
fn validate_artifact_field(
    artifact: &ReleaseBenchmarkArtifact<'_>,
    field: &'static str,
    value: &str,
) -> Result<(), ReleaseBenchmarkBaselineError> {
    match value.trim() {
        "" => Err(ReleaseBenchmarkBaselineError::ArtifactMissingEvidence {
            path: artifact.path.to_owned(),
            evidence: field,
        }),
        _ => Ok(()),
    }
}
/// Requires `needle` to occur as a substring of the artifact contents.
///
/// When it does not, the error carries the artifact path and the `evidence`
/// label describing what is missing.
fn require_artifact_contains(
    artifact: &ReleaseBenchmarkArtifact<'_>,
    evidence: &'static str,
    needle: &str,
) -> Result<(), ReleaseBenchmarkBaselineError> {
    let found = artifact.contents.contains(needle);
    if !found {
        return Err(ReleaseBenchmarkBaselineError::ArtifactMissingEvidence {
            path: artifact.path.to_owned(),
            evidence,
        });
    }
    Ok(())
}
/// Requires at least one of `needles` to occur as a substring of the artifact
/// contents.
///
/// An empty `needles` slice can never be satisfied. On failure the error
/// carries the artifact path and the `evidence` label.
fn require_artifact_contains_any(
    artifact: &ReleaseBenchmarkArtifact<'_>,
    evidence: &'static str,
    needles: &[&str],
) -> Result<(), ReleaseBenchmarkBaselineError> {
    for candidate in needles {
        if artifact.contents.contains(*candidate) {
            return Ok(());
        }
    }
    Err(ReleaseBenchmarkBaselineError::ArtifactMissingEvidence {
        path: artifact.path.to_owned(),
        evidence,
    })
}
/// Checks that the textual fields of a baseline are not blank.
///
/// The fields are examined in the order id, command, hardware, metric; the
/// first one that is empty or whitespace-only is reported through
/// `EmptyMetadata` together with the baseline's id.
fn validate_metadata(
    baseline: &ReleaseBenchmarkBaseline,
) -> Result<(), ReleaseBenchmarkBaselineError> {
    let labelled_fields = [
        ("id", baseline.id),
        ("command", baseline.command),
        ("hardware", baseline.hardware),
        ("metric", baseline.metric),
    ];
    let first_blank = labelled_fields
        .iter()
        .find(|entry| entry.1.trim().is_empty());
    match first_blank {
        Some(&(field, _)) => Err(ReleaseBenchmarkBaselineError::EmptyMetadata {
            id: baseline.id,
            field,
        }),
        None => Ok(()),
    }
}
// Unit tests for baseline and artifact validation. Besides the happy paths,
// each error variant produced by the validators has a rejecting case here.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn benchmark_baselines_accept_exact_cuda_cargo_full_records() {
        let proof = validate_release_benchmark_baselines(&[
            baseline(
                "cuda-megakernel-100x",
                125.0,
                100.0,
                BenchmarkThresholdDirection::AtLeast,
            ),
            baseline(
                "cuda-readback-us",
                42.0,
                50.0,
                BenchmarkThresholdDirection::AtMost,
            ),
        ])
        .expect("Fix: valid CUDA baselines should pass");
        assert_eq!(proof.baseline_count, 2);
        assert_eq!(proof.cuda_baseline_count, 2);
    }

    #[test]
    fn benchmark_baselines_reject_empty_input() {
        assert_eq!(
            validate_release_benchmark_baselines(&[])
                .expect_err("empty baseline list should fail"),
            ReleaseBenchmarkBaselineError::EmptyBaselines
        );
    }

    #[test]
    fn benchmark_baselines_reject_duplicate_ids() {
        let first = baseline("dup", 125.0, 100.0, BenchmarkThresholdDirection::AtLeast);
        let second = baseline("dup", 130.0, 100.0, BenchmarkThresholdDirection::AtLeast);
        assert_eq!(
            validate_release_benchmark_baselines(&[first, second])
                .expect_err("duplicate ids should fail"),
            ReleaseBenchmarkBaselineError::DuplicateId { id: "dup" }
        );
    }

    #[test]
    fn benchmark_baselines_reject_blank_metadata() {
        let mut blank = baseline(
            "blank-metric",
            125.0,
            100.0,
            BenchmarkThresholdDirection::AtLeast,
        );
        blank.metric = "   ";
        assert_eq!(
            validate_release_benchmark_baselines(&[blank])
                .expect_err("blank metric name should fail"),
            ReleaseBenchmarkBaselineError::EmptyMetadata {
                id: "blank-metric",
                field: "metric",
            }
        );
    }

    #[test]
    fn benchmark_baselines_reject_non_cargo_full_commands() {
        let mut bad = baseline(
            "bad-command",
            125.0,
            100.0,
            BenchmarkThresholdDirection::AtLeast,
        );
        bad.command = "cargo bench";
        assert_eq!(
            validate_release_benchmark_baselines(&[bad]).expect_err("raw cargo bench should fail"),
            ReleaseBenchmarkBaselineError::CommandDoesNotUseCargoFull {
                id: "bad-command",
                command: "cargo bench",
            }
        );
    }

    #[test]
    fn benchmark_baselines_reject_missing_cuda_hardware_and_threshold_miss() {
        let mut cpu = baseline("cpu", 125.0, 100.0, BenchmarkThresholdDirection::AtLeast);
        cpu.hardware = "generic x86";
        assert_eq!(
            validate_release_benchmark_baselines(&[cpu])
                .expect_err("missing CUDA hardware should fail"),
            ReleaseBenchmarkBaselineError::MissingCudaHardware {
                id: "cpu",
                hardware: "generic x86",
            }
        );
        assert_eq!(
            validate_release_benchmark_baselines(&[baseline(
                "too-slow",
                99.0,
                100.0,
                BenchmarkThresholdDirection::AtLeast,
            )])
            .expect_err("threshold miss should fail"),
            ReleaseBenchmarkBaselineError::ThresholdMiss {
                id: "too-slow",
                observed: 99.0,
                threshold: 100.0,
                direction: BenchmarkThresholdDirection::AtLeast,
            }
        );
    }

    #[test]
    fn benchmark_baselines_reject_at_most_threshold_miss() {
        assert_eq!(
            validate_release_benchmark_baselines(&[baseline(
                "slow-readback",
                51.0,
                50.0,
                BenchmarkThresholdDirection::AtMost,
            )])
            .expect_err("exceeding an at-most threshold should fail"),
            ReleaseBenchmarkBaselineError::ThresholdMiss {
                id: "slow-readback",
                observed: 51.0,
                threshold: 50.0,
                direction: BenchmarkThresholdDirection::AtMost,
            }
        );
    }

    #[test]
    fn benchmark_baselines_reject_invalid_metric_values() {
        // Non-finite, zero, and negative values are all invalid for either field.
        for (observed, threshold) in [
            (f64::NAN, 100.0),
            (125.0, f64::INFINITY),
            (0.0, 100.0),
            (125.0, -1.0),
        ] {
            assert_eq!(
                validate_release_benchmark_baselines(&[baseline(
                    "bad-metric",
                    observed,
                    threshold,
                    BenchmarkThresholdDirection::AtLeast,
                )])
                .expect_err("non-positive or non-finite metric values should fail"),
                ReleaseBenchmarkBaselineError::InvalidMetric { id: "bad-metric" }
            );
        }
    }

    #[test]
    fn benchmark_artifacts_accept_committed_cuda_release_evidence() {
        let proof = validate_committed_benchmark_artifacts(&[
            artifact(
                "release/evidence/benchmarks/cuda-release-suite.json",
                include_str!("../../../../release/evidence/benchmarks/cuda-release-suite.json"),
                "ifds-witness",
            ),
            artifact(
                "release/evidence/benchmarks/megakernel-condition-cuda.json",
                include_str!("../../../../release/evidence/benchmarks/megakernel-condition-cuda.json"),
                "megakernel",
            ),
            artifact(
                "release/evidence/benchmarks/workload-10-megakernel-queued-batches.json",
                include_str!(
                    "../../../../release/evidence/benchmarks/workload-10-megakernel-queued-batches.json"
                ),
                "megakernel",
            ),
            artifact(
                "release/evidence/benchmarks/dataflow-analysis-release.json",
                include_str!("../../../../release/evidence/benchmarks/dataflow-analysis-release.json"),
                "dataflow",
            ),
        ])
        .expect("Fix: committed CUDA benchmark artifacts should satisfy release evidence contracts");
        assert_eq!(proof.artifact_count, 4);
    }

    #[test]
    fn benchmark_artifacts_reject_missing_100x_contract() {
        let err = validate_committed_benchmark_artifacts(&[artifact(
            "bad.json",
            "{\"selected_backend\": \"cuda\", \"gpu\": \"NVIDIA GeForce RTX 5090\", \"nvidia_driver_version\": \"570.211.01\", \"source_fingerprint\": \"git:x\", \"status\": \"pass\", \"contract_passed\": true, \"samples\": 35, \"family\": \"megakernel\"}",
            "megakernel",
        )])
        .expect_err("missing 100x contract should fail");
        assert_eq!(
            err,
            ReleaseBenchmarkBaselineError::ArtifactMissingEvidence {
                path: "bad.json".to_owned(),
                evidence: "100x speedup floor",
            }
        );
    }

    #[test]
    fn benchmark_artifacts_reject_missing_required_family() {
        // Every evidence marker is present, but the contents never mention "dataflow".
        let err = validate_committed_benchmark_artifacts(&[artifact(
            "no-family.json",
            "{\"selected_backend\": \"cuda\", \"gpu\": \"NVIDIA GeForce RTX 5090\", \"nvidia_driver_version\": \"570.211.01\", \"source_fingerprint\": \"git:x\", \"status\": \"pass\", \"min_speedup_x\": 100, \"contract_passed\": true, \"samples\": 35, \"family\": \"megakernel\"}",
            "dataflow",
        )])
        .expect_err("missing workload family should fail");
        assert_eq!(
            err,
            ReleaseBenchmarkBaselineError::ArtifactMissingEvidence {
                path: "no-family.json".to_owned(),
                evidence: "required workload family",
            }
        );
    }

    /// Builds an artifact view from static strings.
    fn artifact(
        path: &'static str,
        contents: &'static str,
        required_family: &'static str,
    ) -> ReleaseBenchmarkArtifact<'static> {
        ReleaseBenchmarkArtifact {
            path,
            contents,
            required_family,
        }
    }

    /// Builds a baseline with a valid command, hardware string, and metric name,
    /// so each test only varies the values it cares about.
    fn baseline(
        id: &'static str,
        observed: f64,
        threshold: f64,
        direction: BenchmarkThresholdDirection,
    ) -> ReleaseBenchmarkBaseline {
        ReleaseBenchmarkBaseline {
            id,
            command: "./cargo_full bench -j1 -p vyre-driver-cuda",
            hardware: "NVIDIA RTX 5090 CUDA",
            metric: "speedup_x",
            observed,
            threshold,
            direction,
        }
    }
}