// Release-scale floors enforced per sample by the speedup gate and quoted in its error messages.

/// Smallest `graph_nodes` a sample may report; lower values are rejected as `InsufficientScale`.
const MIN_RELEASE_GRAPH_NODES: u64 = 10_000;
/// Smallest `graph_edges` a sample may report; lower values are rejected as `InsufficientScale`.
const MIN_RELEASE_GRAPH_EDGES: u64 = 80_000;
/// Smallest `repetitions` a sample may report; lower values are rejected as `InsufficientScale`.
const MIN_RELEASE_REPETITIONS: u64 = 64;
/// Minimum number of accepted samples a proof needs; fewer yields `InsufficientSampleCount`.
const MIN_RELEASE_SAMPLE_COUNT: usize = 2;
/// Minimum `device_memory_bytes` a sample may report (16 * 1024^3 bytes, i.e. 16 GiB).
const MIN_RELEASE_DEVICE_MEMORY_BYTES: u64 = 16 * 1024 * 1024 * 1024;
/// Major half of the minimum compute capability; compared together with the minor half as a pair.
const MIN_RELEASE_COMPUTE_CAPABILITY_MAJOR: u32 = 8;
/// Minor half of the minimum compute capability; compared together with the major half as a pair.
const MIN_RELEASE_COMPUTE_CAPABILITY_MINOR: u32 = 0;
/// One benchmark measurement submitted to the megakernel speedup gate.
///
/// The fields appear in the same order as the columns of
/// `MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CudaMegakernelSpeedupSample {
/// Backend that produced the sample; the gate requires it to equal `crate::CUDA_BACKEND_ID`.
pub backend_id: &'static str,
/// Device ordinal recorded with the sample; carried through the CSV but not checked by the gate.
pub device_ordinal: u64,
/// Reported device memory; must be at least `MIN_RELEASE_DEVICE_MEMORY_BYTES`.
pub device_memory_bytes: u64,
/// Major compute capability; compared with the minor value as a `(major, minor)` pair.
pub compute_capability_major: u32,
/// Minor compute capability; compared with the major value as a `(major, minor)` pair.
pub compute_capability_minor: u32,
/// Graph node count; must be non-zero and at least `MIN_RELEASE_GRAPH_NODES`.
pub graph_nodes: u64,
/// Graph edge count; must be non-zero and at least `MIN_RELEASE_GRAPH_EDGES`.
pub graph_edges: u64,
/// Repetition count; must exceed 1 and be at least `MIN_RELEASE_REPETITIONS`.
pub repetitions: u64,
/// Host-orchestrated duration (numerator of the speedup ratio); must be finite and positive.
/// Presumably nanoseconds, per the `_ns` suffix.
pub host_orchestrated_ns: f64,
/// Resident megakernel duration (denominator of the speedup ratio); must be finite and positive.
/// Presumably nanoseconds, per the `_ns` suffix.
pub resident_megakernel_ns: f64,
/// Setup duration; must be finite and non-negative. It does not enter the speedup ratio.
pub setup_ns: f64,
/// Graph uploads counted inside the timed region; any non-zero value rejects the sample.
pub timed_graph_uploads: u64,
/// Host allocations counted inside the timed region; any non-zero value rejects the sample.
pub timed_host_allocations: u64,
/// Host synchronizations counted inside the timed region; any non-zero value rejects the sample.
pub timed_host_syncs: u64,
}
/// Summary returned when every submitted sample passes the speedup gate.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CudaMegakernelSpeedupProof {
/// Smallest `host_orchestrated_ns / resident_megakernel_ns` ratio among the accepted samples.
pub min_speedup_x: f64,
/// Largest `host_orchestrated_ns / resident_megakernel_ns` ratio among the accepted samples.
pub max_speedup_x: f64,
/// Number of accepted samples.
pub sample_count: usize,
/// Sum of `repetitions` over the accepted samples (accumulated with overflow checking).
pub total_repetitions: u64,
}
/// Every way the megakernel speedup gate, its CSV verifier, or its CSV formatter can fail.
///
/// `index` fields are zero-based sample positions; `line` fields refer to lines of a CSV artifact.
#[derive(Clone, Debug, PartialEq)]
pub enum CudaMegakernelSpeedupGateError {
/// No samples were supplied, or the CSV artifact had no non-blank line at all.
EmptySamples,
/// The requested threshold was not a positive finite number.
InvalidRequiredSpeedup {
required_x: f64,
},
/// The sample's `backend_id` differs from `crate::CUDA_BACKEND_ID`.
NonCudaEvidence {
index: usize,
backend_id: String,
},
/// The sample's device memory or compute capability is below the release floor.
InsufficientCudaDevice {
index: usize,
device_memory_bytes: u64,
compute_capability_major: u32,
compute_capability_minor: u32,
},
/// The sample has zero nodes, zero edges, or at most one repetition.
NotRepeatedFixedGraph {
index: usize,
},
/// The sample's node, edge, or repetition count is below the release minimums.
InsufficientScale {
index: usize,
graph_nodes: u64,
graph_edges: u64,
repetitions: u64,
},
/// Fewer than `MIN_RELEASE_SAMPLE_COUNT` samples were accepted.
InsufficientSampleCount {
sample_count: usize,
},
/// The sample reports graph uploads, host allocations, or host syncs inside the timed region.
PollutedTimedRegion {
index: usize,
graph_uploads: u64,
host_allocations: u64,
host_syncs: u64,
},
/// A host or resident duration is not finite and positive, or the setup duration is not finite
/// and non-negative.
InvalidTiming {
index: usize,
},
/// The observed host/resident ratio is lower than the required threshold.
SpeedupBelowThreshold {
index: usize,
observed_x: f64,
required_x: f64,
},
/// Adding this sample's repetitions to the running total overflowed `u64`.
RepetitionCountOverflow {
index: usize,
},
/// The first non-blank CSV line does not match the expected header.
InvalidEvidenceHeader {
observed: String,
},
/// A CSV row does not contain exactly the expected number of fields.
InvalidEvidenceRow {
line: usize,
},
/// A CSV field could not be parsed (or named a non-CUDA backend). The CSV formatter also uses
/// this variant, with `line: 0` and `field: "csv_string_write"`, if writing a row fails.
InvalidEvidenceValue {
line: usize,
field: &'static str,
value: String,
},
/// Computing the CSV buffer capacity overflowed `usize`.
EvidenceCsvCapacityOverflow,
/// Reserving the CSV buffer failed; `message` carries the underlying error text.
EvidenceCsvReserveFailed {
requested: usize,
message: String,
},
}
/// Human-readable diagnostics: each message states what was observed and ends with a
/// `Fix:` sentence describing the remedy.
impl std::fmt::Display for CudaMegakernelSpeedupGateError {
/// Writes the message for this variant; the match is exhaustive over all variants.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::EmptySamples => write!(
f,
"CUDA megakernel speedup gate has no samples. Fix: record steady-state resident and host-orchestrated benchmark samples."
),
Self::InvalidRequiredSpeedup { required_x } => write!(
f,
"CUDA megakernel speedup gate received invalid required speedup {required_x}. Fix: pass a positive finite release threshold."
),
Self::NonCudaEvidence { index, backend_id } => write!(
f,
"CUDA megakernel speedup sample {index} was produced by backend `{backend_id}`. Fix: collect release speedup evidence through the CUDA backend, not a CPU/WGPU/comparison path."
),
// The remedy quotes the release-floor constants so the message tracks any future change.
Self::InsufficientCudaDevice {
index,
device_memory_bytes,
compute_capability_major,
compute_capability_minor,
} => write!(
f,
"CUDA megakernel speedup sample {index} was produced on an insufficient CUDA device: memory_bytes={device_memory_bytes}, compute_capability={compute_capability_major}.{compute_capability_minor}. Fix: collect release evidence on a CUDA GPU with at least {MIN_RELEASE_DEVICE_MEMORY_BYTES} bytes VRAM and compute capability {MIN_RELEASE_COMPUTE_CAPABILITY_MAJOR}.{MIN_RELEASE_COMPUTE_CAPABILITY_MINOR}."
),
Self::NotRepeatedFixedGraph { index } => write!(
f,
"CUDA megakernel speedup sample {index} is not a repeated fixed-graph workload. Fix: use repetitions > 1 with non-empty graph nodes and edges."
),
Self::InsufficientScale {
index,
graph_nodes,
graph_edges,
repetitions,
} => write!(
f,
"CUDA megakernel speedup sample {index} is below release scale: nodes={graph_nodes}, edges={graph_edges}, repetitions={repetitions}. Fix: use at least {MIN_RELEASE_GRAPH_NODES} nodes, {MIN_RELEASE_GRAPH_EDGES} edges, and {MIN_RELEASE_REPETITIONS} steady-state repetitions."
),
Self::InsufficientSampleCount { sample_count } => write!(
f,
"CUDA megakernel speedup proof has only {sample_count} accepted sample(s). Fix: provide at least {MIN_RELEASE_SAMPLE_COUNT} independent release-scale samples."
),
Self::PollutedTimedRegion {
index,
graph_uploads,
host_allocations,
host_syncs,
} => write!(
f,
"CUDA megakernel speedup sample {index} is polluted: graph_uploads={graph_uploads}, host_allocations={host_allocations}, host_syncs={host_syncs}. Fix: measure steady-state execution only."
),
Self::InvalidTiming { index } => write!(
f,
"CUDA megakernel speedup sample {index} has invalid timing. Fix: provide positive finite host and resident durations."
),
// Ratios are printed with two decimal places.
Self::SpeedupBelowThreshold {
index,
observed_x,
required_x,
} => write!(
f,
"CUDA megakernel speedup sample {index} reached {observed_x:.2}x but release requires {required_x:.2}x. Fix: improve resident execution or lower host orchestration overhead before claiming the release speedup."
),
Self::RepetitionCountOverflow { index } => write!(
f,
"CUDA megakernel speedup proof overflowed total repetitions at sample {index}. Fix: split the proof set into bounded release evidence groups."
),
Self::InvalidEvidenceHeader { observed } => write!(
f,
"CUDA megakernel speedup evidence has invalid CSV header `{observed}`. Fix: emit the exact release header `{MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER}`."
),
Self::InvalidEvidenceRow { line } => write!(
f,
"CUDA megakernel speedup evidence line {line} has the wrong field count. Fix: emit one complete CSV row per benchmark sample."
),
Self::InvalidEvidenceValue { line, field, value } => write!(
f,
"CUDA megakernel speedup evidence line {line} has invalid `{field}` value `{value}`. Fix: emit finite numeric benchmark fields without separators or units."
),
Self::EvidenceCsvCapacityOverflow => write!(
f,
"CUDA megakernel speedup evidence CSV capacity overflowed usize. Fix: split release evidence into bounded proof artifacts."
),
Self::EvidenceCsvReserveFailed { requested, message } => write!(
f,
"CUDA megakernel speedup evidence CSV could not reserve {requested} byte(s): {message}. Fix: split release evidence into bounded proof artifacts."
),
}
}
}
// Uses the default `std::error::Error` methods; the `Display` impl supplies the message text.
impl std::error::Error for CudaMegakernelSpeedupGateError {}
/// Exact header line of a release speedup evidence CSV: 14 comma-separated column names, in the
/// same order as the fields of `CudaMegakernelSpeedupSample`.
pub const MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER: &str = "backend_id,device_ordinal,device_memory_bytes,compute_capability_major,compute_capability_minor,graph_nodes,graph_edges,repetitions,host_orchestrated_ns,resident_megakernel_ns,setup_ns,timed_graph_uploads,timed_host_allocations,timed_host_syncs";
pub fn format_validated_cuda_megakernel_speedup_evidence_csv(
samples: &[CudaMegakernelSpeedupSample],
required_speedup_x: f64,
) -> Result<(CudaMegakernelSpeedupProof, String), CudaMegakernelSpeedupGateError> {
let proof = validate_cuda_megakernel_speedup_gate(samples, required_speedup_x)?;
let capacity = megakernel_speedup_evidence_csv_capacity(samples.len())?;
let mut csv = String::new();
vyre_foundation::allocation::try_reserve_string_to_capacity(&mut csv, capacity).map_err(
|error| CudaMegakernelSpeedupGateError::EvidenceCsvReserveFailed {
requested: capacity,
message: error.to_string(),
},
)?;
csv.push_str(MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER);
csv.push('\n');
for sample in samples {
use std::fmt::Write as _;
writeln!(
csv,
"{},{},{},{},{},{},{},{},{},{},{},{},{},{}",
sample.backend_id,
sample.device_ordinal,
sample.device_memory_bytes,
sample.compute_capability_major,
sample.compute_capability_minor,
sample.graph_nodes,
sample.graph_edges,
sample.repetitions,
sample.host_orchestrated_ns,
sample.resident_megakernel_ns,
sample.setup_ns,
sample.timed_graph_uploads,
sample.timed_host_allocations,
sample.timed_host_syncs
)
.map_err(|_| CudaMegakernelSpeedupGateError::InvalidEvidenceValue {
line: 0,
field: "csv_string_write",
value: "fmt::Error".to_string(),
})?;
}
Ok((proof, csv))
}
/// Estimates the byte capacity to reserve for an evidence CSV holding `sample_count` rows.
///
/// The estimate is the header length, plus one byte for its newline, plus a fixed budget of
/// 128 bytes per row. Every step uses checked arithmetic.
///
/// # Errors
///
/// Returns `EvidenceCsvCapacityOverflow` if any step overflows `usize`.
fn megakernel_speedup_evidence_csv_capacity(
    sample_count: usize,
) -> Result<usize, CudaMegakernelSpeedupGateError> {
    // Header text plus its trailing newline.
    let header_bytes = MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER.len().checked_add(1);
    // Fixed per-row budget.
    let row_bytes = sample_count.checked_mul(128);

    let total_bytes = match (header_bytes, row_bytes) {
        (Some(header), Some(rows)) => header.checked_add(rows),
        _ => None,
    };
    match total_bytes {
        Some(bytes) => Ok(bytes),
        None => Err(CudaMegakernelSpeedupGateError::EvidenceCsvCapacityOverflow),
    }
}
/// Runs every sample through the release speedup gate and summarizes the accepted set.
///
/// Samples are checked in order and the first failing sample aborts validation with its error.
/// The minimum sample count is only enforced after every individual sample has passed, so a
/// single bad sample is reported as that sample's error rather than as a count problem.
///
/// # Errors
///
/// * `EmptySamples` when `samples` is empty.
/// * `InvalidRequiredSpeedup` when `required_speedup_x` is not a positive finite number.
/// * Any per-sample error produced while accumulating a sample.
/// * `InsufficientSampleCount` when fewer than `MIN_RELEASE_SAMPLE_COUNT` samples were given.
pub fn validate_cuda_megakernel_speedup_gate(
    samples: &[CudaMegakernelSpeedupSample],
    required_speedup_x: f64,
) -> Result<CudaMegakernelSpeedupProof, CudaMegakernelSpeedupGateError> {
    if samples.is_empty() {
        return Err(CudaMegakernelSpeedupGateError::EmptySamples);
    }
    // A NaN, infinite, zero, or negative threshold would make the comparison meaningless.
    if !required_speedup_x.is_finite() || required_speedup_x <= 0.0 {
        return Err(CudaMegakernelSpeedupGateError::InvalidRequiredSpeedup {
            required_x: required_speedup_x,
        });
    }

    let mut min_speedup_x = f64::INFINITY;
    let mut max_speedup_x = 0.0_f64;
    let mut total_repetitions = 0_u64;
    for (index, sample) in samples.iter().copied().enumerate() {
        accumulate_cuda_megakernel_speedup_sample(
            sample,
            index,
            required_speedup_x,
            &mut min_speedup_x,
            &mut max_speedup_x,
            &mut total_repetitions,
        )?;
    }

    // Every sample was accepted, so the accepted count is the slice length. It is non-zero
    // because of the guard above, which makes a second emptiness check unnecessary.
    let sample_count = samples.len();
    if sample_count < MIN_RELEASE_SAMPLE_COUNT {
        return Err(CudaMegakernelSpeedupGateError::InsufficientSampleCount { sample_count });
    }

    Ok(CudaMegakernelSpeedupProof {
        min_speedup_x,
        max_speedup_x,
        sample_count,
        total_repetitions,
    })
}
/// Parses a release speedup evidence CSV artifact and runs every row through the speedup gate.
///
/// The first non-blank line must equal `MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER` after trimming.
/// Every later non-blank line must hold exactly 14 comma-separated fields in header order; each
/// field is trimmed before parsing. Blank lines are skipped, but the line numbers carried by
/// errors are the physical 1-based positions in `csv`, so diagnostics stay accurate when the
/// artifact contains blank lines.
///
/// # Errors
///
/// * `EmptySamples` when `csv` has no non-blank line.
/// * `InvalidRequiredSpeedup` when `required_speedup_x` is not a positive finite number. The
///   threshold is validated here exactly as in `validate_cuda_megakernel_speedup_gate`;
///   without this check a NaN threshold would let every row pass, because `x < NaN` is false.
/// * `InvalidEvidenceHeader`, `InvalidEvidenceRow`, or `InvalidEvidenceValue` for malformed input.
/// * Any per-sample error produced while accumulating a row (reported with the zero-based
///   sample index).
/// * `InsufficientSampleCount` when fewer than `MIN_RELEASE_SAMPLE_COUNT` rows were accepted,
///   which includes a header-only artifact.
pub fn validate_cuda_megakernel_speedup_evidence_csv(
    csv: &str,
    required_speedup_x: f64,
) -> Result<CudaMegakernelSpeedupProof, CudaMegakernelSpeedupGateError> {
    const FIELD_COUNT: usize = 14;

    // Number the lines before dropping blank ones so reported positions match the artifact.
    let mut lines = csv
        .lines()
        .enumerate()
        .map(|(line_index, line)| (line_index + 1, line))
        .filter(|(_, line)| !line.trim().is_empty());

    let (_, header) = lines
        .next()
        .ok_or(CudaMegakernelSpeedupGateError::EmptySamples)?;
    if !required_speedup_x.is_finite() || required_speedup_x <= 0.0 {
        return Err(CudaMegakernelSpeedupGateError::InvalidRequiredSpeedup {
            required_x: required_speedup_x,
        });
    }
    if header.trim() != MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER {
        return Err(CudaMegakernelSpeedupGateError::InvalidEvidenceHeader {
            observed: header.to_string(),
        });
    }

    let mut min_speedup_x = f64::INFINITY;
    let mut max_speedup_x = 0.0_f64;
    let mut total_repetitions = 0_u64;
    let mut sample_count = 0_usize;
    for (line_number, line) in lines {
        // Split into a fixed-size table; too many or too few fields both reject the row.
        let mut fields = [""; FIELD_COUNT];
        let mut field_count = 0_usize;
        for field in line.split(',') {
            match fields.get_mut(field_count) {
                Some(slot) => *slot = field.trim(),
                None => {
                    return Err(CudaMegakernelSpeedupGateError::InvalidEvidenceRow {
                        line: line_number,
                    });
                }
            }
            field_count += 1;
        }
        if field_count != FIELD_COUNT {
            return Err(CudaMegakernelSpeedupGateError::InvalidEvidenceRow { line: line_number });
        }

        let backend_id = parse_backend_id_field(fields[0], line_number)?;
        let sample = CudaMegakernelSpeedupSample {
            backend_id,
            device_ordinal: parse_u64_field(fields[1], line_number, "device_ordinal")?,
            device_memory_bytes: parse_u64_field(fields[2], line_number, "device_memory_bytes")?,
            compute_capability_major: parse_u32_field(
                fields[3],
                line_number,
                "compute_capability_major",
            )?,
            compute_capability_minor: parse_u32_field(
                fields[4],
                line_number,
                "compute_capability_minor",
            )?,
            graph_nodes: parse_u64_field(fields[5], line_number, "graph_nodes")?,
            graph_edges: parse_u64_field(fields[6], line_number, "graph_edges")?,
            repetitions: parse_u64_field(fields[7], line_number, "repetitions")?,
            host_orchestrated_ns: parse_f64_field(fields[8], line_number, "host_orchestrated_ns")?,
            resident_megakernel_ns: parse_f64_field(
                fields[9],
                line_number,
                "resident_megakernel_ns",
            )?,
            setup_ns: parse_f64_field(fields[10], line_number, "setup_ns")?,
            timed_graph_uploads: parse_u64_field(fields[11], line_number, "timed_graph_uploads")?,
            timed_host_allocations: parse_u64_field(
                fields[12],
                line_number,
                "timed_host_allocations",
            )?,
            timed_host_syncs: parse_u64_field(fields[13], line_number, "timed_host_syncs")?,
        };

        // Gate errors identify the row by its zero-based sample index, not its line number.
        accumulate_cuda_megakernel_speedup_sample(
            sample,
            sample_count,
            required_speedup_x,
            &mut min_speedup_x,
            &mut max_speedup_x,
            &mut total_repetitions,
        )?;
        sample_count += 1;
    }

    if sample_count < MIN_RELEASE_SAMPLE_COUNT {
        return Err(CudaMegakernelSpeedupGateError::InsufficientSampleCount { sample_count });
    }
    Ok(CudaMegakernelSpeedupProof {
        min_speedup_x,
        max_speedup_x,
        sample_count,
        total_repetitions,
    })
}
fn accumulate_cuda_megakernel_speedup_sample(
sample: CudaMegakernelSpeedupSample,
index: usize,
required_speedup_x: f64,
min_speedup_x: &mut f64,
max_speedup_x: &mut f64,
total_repetitions: &mut u64,
) -> Result<(), CudaMegakernelSpeedupGateError> {
if sample.backend_id != crate::CUDA_BACKEND_ID {
return Err(CudaMegakernelSpeedupGateError::NonCudaEvidence {
index,
backend_id: sample.backend_id.to_string(),
});
}
if sample.device_memory_bytes < MIN_RELEASE_DEVICE_MEMORY_BYTES
|| (
sample.compute_capability_major,
sample.compute_capability_minor,
) < (
MIN_RELEASE_COMPUTE_CAPABILITY_MAJOR,
MIN_RELEASE_COMPUTE_CAPABILITY_MINOR,
)
{
return Err(CudaMegakernelSpeedupGateError::InsufficientCudaDevice {
index,
device_memory_bytes: sample.device_memory_bytes,
compute_capability_major: sample.compute_capability_major,
compute_capability_minor: sample.compute_capability_minor,
});
}
if sample.graph_nodes == 0 || sample.graph_edges == 0 || sample.repetitions <= 1 {
return Err(CudaMegakernelSpeedupGateError::NotRepeatedFixedGraph { index });
}
if sample.graph_nodes < MIN_RELEASE_GRAPH_NODES
|| sample.graph_edges < MIN_RELEASE_GRAPH_EDGES
|| sample.repetitions < MIN_RELEASE_REPETITIONS
{
return Err(CudaMegakernelSpeedupGateError::InsufficientScale {
index,
graph_nodes: sample.graph_nodes,
graph_edges: sample.graph_edges,
repetitions: sample.repetitions,
});
}
if sample.timed_graph_uploads != 0
|| sample.timed_host_allocations != 0
|| sample.timed_host_syncs != 0
{
return Err(CudaMegakernelSpeedupGateError::PollutedTimedRegion {
index,
graph_uploads: sample.timed_graph_uploads,
host_allocations: sample.timed_host_allocations,
host_syncs: sample.timed_host_syncs,
});
}
if !sample.host_orchestrated_ns.is_finite()
|| !sample.resident_megakernel_ns.is_finite()
|| sample.host_orchestrated_ns <= 0.0
|| sample.resident_megakernel_ns <= 0.0
|| !sample.setup_ns.is_finite()
|| sample.setup_ns < 0.0
{
return Err(CudaMegakernelSpeedupGateError::InvalidTiming { index });
}
let speedup_x = sample.host_orchestrated_ns / sample.resident_megakernel_ns;
if speedup_x < required_speedup_x {
return Err(CudaMegakernelSpeedupGateError::SpeedupBelowThreshold {
index,
observed_x: speedup_x,
required_x: required_speedup_x,
});
}
*min_speedup_x = (*min_speedup_x).min(speedup_x);
*max_speedup_x = (*max_speedup_x).max(speedup_x);
*total_repetitions = total_repetitions
.checked_add(sample.repetitions)
.ok_or(CudaMegakernelSpeedupGateError::RepetitionCountOverflow { index })?;
Ok(())
}
/// Accepts a CSV `backend_id` field only when it names the CUDA backend.
///
/// Returns the crate's own `'static` backend identifier rather than the borrowed input, so the
/// parsed sample does not depend on the CSV buffer's lifetime.
///
/// # Errors
///
/// Returns `InvalidEvidenceValue` for field `"backend_id"` carrying the rejected text.
fn parse_backend_id_field(
    value: &str,
    line: usize,
) -> Result<&'static str, CudaMegakernelSpeedupGateError> {
    if value != crate::CUDA_BACKEND_ID {
        return Err(CudaMegakernelSpeedupGateError::InvalidEvidenceValue {
            line,
            field: "backend_id",
            value: String::from(value),
        });
    }
    Ok(crate::CUDA_BACKEND_ID)
}
/// Parses one CSV field as a `u64`.
///
/// # Errors
///
/// Returns `InvalidEvidenceValue` carrying `line`, `field`, and the raw text when the field is
/// not a valid `u64`.
fn parse_u64_field(
    value: &str,
    line: usize,
    field: &'static str,
) -> Result<u64, CudaMegakernelSpeedupGateError> {
    match value.parse::<u64>() {
        Ok(parsed) => Ok(parsed),
        Err(_) => Err(CudaMegakernelSpeedupGateError::InvalidEvidenceValue {
            line,
            field,
            value: String::from(value),
        }),
    }
}
/// Parses one CSV field as a `u32`.
///
/// # Errors
///
/// Returns `InvalidEvidenceValue` carrying `line`, `field`, and the raw text when the field is
/// not a valid `u32`.
fn parse_u32_field(
    value: &str,
    line: usize,
    field: &'static str,
) -> Result<u32, CudaMegakernelSpeedupGateError> {
    match value.parse::<u32>() {
        Ok(parsed) => Ok(parsed),
        Err(_) => Err(CudaMegakernelSpeedupGateError::InvalidEvidenceValue {
            line,
            field,
            value: String::from(value),
        }),
    }
}
/// Parses one CSV field as an `f64`.
///
/// Only syntax is checked here. Finiteness and sign of the timing fields are enforced later,
/// when the sample is run through the gate.
///
/// # Errors
///
/// Returns `InvalidEvidenceValue` carrying `line`, `field`, and the raw text when the field is
/// not a valid `f64`.
fn parse_f64_field(
    value: &str,
    line: usize,
    field: &'static str,
) -> Result<f64, CudaMegakernelSpeedupGateError> {
    match value.parse::<f64>() {
        Ok(parsed) => Ok(parsed),
        Err(_) => Err(CudaMegakernelSpeedupGateError::InvalidEvidenceValue {
            line,
            field,
            value: String::from(value),
        }),
    }
}
// Unit tests for the speedup gate, the CSV verifier, and the CSV formatter. All samples are
// built with the `sample` helper at the bottom of the module, which fills in a passing device
// and a release-scale graph so each test only overrides the field it exercises.
#[cfg(test)]
mod tests {
use super::*;
// Two clean samples at exactly 100x and 125x pass a 100x threshold.
#[test]
fn speedup_gate_accepts_unpolluted_steady_state_100x_samples() {
let proof = validate_cuda_megakernel_speedup_gate(
&[
sample(1_000_000.0, 10_000.0, 64),
sample(2_500_000.0, 20_000.0, 128),
],
100.0,
)
.expect("Fix: unpolluted 100x samples should pass release gate");
assert_eq!(proof.sample_count, 2);
assert_eq!(proof.total_repetitions, 192);
assert_eq!(proof.min_speedup_x, 100.0);
assert_eq!(proof.max_speedup_x, 125.0);
}
// The same two samples expressed as a CSV artifact yield the same proof.
#[test]
fn speedup_gate_validates_release_evidence_csv_artifact() {
let csv = format!(
"{MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER}\n\
cuda,0,34359738368,12,0,10000,80000,64,1000000,10000,250000,0,0,0\n\
cuda,0,34359738368,12,0,20000,160000,128,2500000,20000,350000,0,0,0\n"
);
let proof = validate_cuda_megakernel_speedup_evidence_csv(&csv, 100.0)
.expect("Fix: release evidence CSV should validate");
assert_eq!(proof.sample_count, 2);
assert_eq!(proof.total_repetitions, 192);
assert_eq!(proof.min_speedup_x, 100.0);
assert_eq!(proof.max_speedup_x, 125.0);
}
// Formatter output must round-trip through the verifier: header plus one line per sample.
#[test]
fn speedup_gate_formats_validated_release_evidence_csv_artifact() {
let samples = [
sample(1_000_000.0, 10_000.0, 64),
sample(2_500_000.0, 20_000.0, 128),
];
let (proof, csv) = format_validated_cuda_megakernel_speedup_evidence_csv(&samples, 100.0)
.expect("Fix: validated release samples should format as verifier CSV");
let reparsed = validate_cuda_megakernel_speedup_evidence_csv(&csv, 100.0)
.expect("Fix: formatted release CSV should roundtrip through the verifier");
assert_eq!(proof, reparsed);
assert!(csv.starts_with(MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER));
assert_eq!(csv.lines().count(), 3);
}
// Capacity is header + newline + 128 bytes per sample, and overflow is reported, not wrapped.
#[test]
fn speedup_gate_csv_capacity_planning_is_checked() {
let capacity = megakernel_speedup_evidence_csv_capacity(2)
.expect("Fix: two release samples should have bounded CSV capacity");
assert_eq!(
capacity,
MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER.len() + 1 + 256
);
assert_eq!(
megakernel_speedup_evidence_csv_capacity(usize::MAX / 128 + 1)
.expect_err("oversized proof artifacts must fail before allocation"),
CudaMegakernelSpeedupGateError::EvidenceCsvCapacityOverflow
);
}
// Malformed artifacts: wrong header, 13-field row, non-numeric field, and non-CUDA backend.
#[test]
fn speedup_gate_rejects_malformed_release_evidence_csv() {
assert_eq!(
validate_cuda_megakernel_speedup_evidence_csv("nodes,edges\n", 100.0)
.expect_err("wrong header must fail"),
CudaMegakernelSpeedupGateError::InvalidEvidenceHeader {
observed: "nodes,edges".to_string(),
}
);
let missing_field = format!(
"{MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER}\n\
cuda,0,34359738368,12,0,10000,80000,64,1000000,10000,250000,0,0\n"
);
assert_eq!(
validate_cuda_megakernel_speedup_evidence_csv(&missing_field, 100.0)
.expect_err("wrong row width must fail"),
CudaMegakernelSpeedupGateError::InvalidEvidenceRow { line: 2 }
);
let bad_number = format!(
"{MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER}\n\
cuda,0,34359738368,12,0,10000,80000,many,1000000,10000,250000,0,0,0\n"
);
assert_eq!(
validate_cuda_megakernel_speedup_evidence_csv(&bad_number, 100.0)
.expect_err("bad numeric field must fail"),
CudaMegakernelSpeedupGateError::InvalidEvidenceValue {
line: 2,
field: "repetitions",
value: "many".to_string(),
}
);
// In the CSV path a foreign backend is rejected while parsing, before the gate runs.
let wrong_backend = format!(
"{MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER}\n\
wgpu,0,34359738368,12,0,10000,80000,64,1000000,10000,250000,0,0,0\n"
);
assert_eq!(
validate_cuda_megakernel_speedup_evidence_csv(&wrong_backend, 100.0)
.expect_err("non-CUDA evidence must fail at the artifact boundary"),
CudaMegakernelSpeedupGateError::InvalidEvidenceValue {
line: 2,
field: "backend_id",
value: "wgpu".to_string(),
}
);
}
// In the slice path the backend and device floor are enforced by the gate itself.
#[test]
fn speedup_gate_rejects_non_cuda_or_too_small_device_samples() {
let mut wrong_backend = sample(1_000_000.0, 1_000.0, 64);
wrong_backend.backend_id = "wgpu";
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[wrong_backend], 100.0)
.expect_err("release samples must be CUDA-produced"),
CudaMegakernelSpeedupGateError::NonCudaEvidence {
index: 0,
backend_id: "wgpu".to_string(),
}
);
let mut too_small = sample(1_000_000.0, 1_000.0, 64);
too_small.device_memory_bytes = 8 * 1024 * 1024 * 1024;
too_small.compute_capability_major = 7;
too_small.compute_capability_minor = 5;
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[too_small], 100.0)
.expect_err("release samples must prove CUDA hardware floor"),
CudaMegakernelSpeedupGateError::InsufficientCudaDevice {
index: 0,
device_memory_bytes: 8 * 1024 * 1024 * 1024,
compute_capability_major: 7,
compute_capability_minor: 5,
}
);
}
// A single graph upload inside the timed region rejects the sample.
#[test]
fn speedup_gate_rejects_setup_pollution() {
let mut polluted = sample(1_000_000.0, 1_000.0, 64);
polluted.timed_graph_uploads = 1;
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[polluted], 100.0)
.expect_err("setup pollution must fail"),
CudaMegakernelSpeedupGateError::PollutedTimedRegion {
index: 0,
graph_uploads: 1,
host_allocations: 0,
host_syncs: 0,
}
);
}
// Per-sample failures win over the sample-count check, even with a single sample.
#[test]
fn speedup_gate_rejects_below_threshold_and_non_repeated_samples() {
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[sample(50_000.0, 1_000.0, 64)], 100.0)
.expect_err("below threshold must fail"),
CudaMegakernelSpeedupGateError::SpeedupBelowThreshold {
index: 0,
observed_x: 50.0,
required_x: 100.0,
}
);
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[sample(1_000_000.0, 1_000.0, 1)], 100.0)
.expect_err("single-run sample must fail"),
CudaMegakernelSpeedupGateError::NotRepeatedFixedGraph { index: 0 }
);
}
// Graphs or repetition counts below the release minimums are rejected as InsufficientScale.
#[test]
fn speedup_gate_rejects_toy_workloads_that_cannot_prove_release_scale() {
let mut tiny = sample(1_000_000.0, 1_000.0, 64);
tiny.graph_nodes = 128;
tiny.graph_edges = 512;
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[tiny], 100.0)
.expect_err("toy workload must not pass the release speedup gate"),
CudaMegakernelSpeedupGateError::InsufficientScale {
index: 0,
graph_nodes: 128,
graph_edges: 512,
repetitions: 64,
}
);
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[sample(1_000_000.0, 1_000.0, 8)], 100.0)
.expect_err("too few repetitions must not pass the release speedup gate"),
CudaMegakernelSpeedupGateError::InsufficientScale {
index: 0,
graph_nodes: MIN_RELEASE_GRAPH_NODES,
graph_edges: MIN_RELEASE_GRAPH_EDGES,
repetitions: 8,
}
);
}
// One otherwise-valid sample is not enough for a release proof.
#[test]
fn speedup_gate_rejects_single_sample_release_proofs() {
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[sample(1_000_000.0, 10_000.0, 64)], 100.0)
.expect_err("single-sample release proof must fail"),
CudaMegakernelSpeedupGateError::InsufficientSampleCount { sample_count: 1 }
);
}
// NaN threshold, NaN setup timing, and u64 overflow of the repetition total are all rejected.
#[test]
fn speedup_gate_rejects_invalid_thresholds_timings_and_repetition_overflow() {
let threshold_error =
validate_cuda_megakernel_speedup_gate(&[sample(1_000_000.0, 1_000.0, 64)], f64::NAN)
.expect_err("NaN release threshold must fail");
// NaN never compares equal, so the variant is matched structurally instead of with assert_eq.
assert!(matches!(
threshold_error,
CudaMegakernelSpeedupGateError::InvalidRequiredSpeedup { required_x }
if required_x.is_nan()
));
let mut invalid_setup = sample(1_000_000.0, 1_000.0, 64);
invalid_setup.setup_ns = f64::NAN;
assert_eq!(
validate_cuda_megakernel_speedup_gate(&[invalid_setup], 100.0)
.expect_err("NaN setup timing must fail"),
CudaMegakernelSpeedupGateError::InvalidTiming { index: 0 }
);
// The first sample alone fills the u64 total, so the overflow is detected on the second.
assert_eq!(
validate_cuda_megakernel_speedup_gate(
&[
sample(1_000_000.0, 1_000.0, u64::MAX),
sample(1_000_000.0, 1_000.0, 64),
],
100.0,
)
.expect_err("proof repetition count overflow must fail loudly"),
CudaMegakernelSpeedupGateError::RepetitionCountOverflow { index: 1 }
);
}
// Builds a sample that passes every check except those driven by the three arguments:
// CUDA backend, 32 GiB device, compute capability 12.0, minimum release-scale graph,
// finite setup time, and a clean timed region.
fn sample(
host_orchestrated_ns: f64,
resident_megakernel_ns: f64,
repetitions: u64,
) -> CudaMegakernelSpeedupSample {
CudaMegakernelSpeedupSample {
backend_id: crate::CUDA_BACKEND_ID,
device_ordinal: 0,
device_memory_bytes: 32 * 1024 * 1024 * 1024,
compute_capability_major: 12,
compute_capability_minor: 0,
graph_nodes: 10_000,
graph_edges: 80_000,
repetitions,
host_orchestrated_ns,
resident_megakernel_ns,
setup_ns: 250_000.0,
timed_graph_uploads: 0,
timed_host_allocations: 0,
timed_host_syncs: 0,
}
}
}