use crate::backend::accounting::{
checked_add_u64_count as checked_add, checked_mul_u64_count as checked_mul,
CudaArithmeticOverflow,
};
use crate::backend::staging_reserve::reserved_vec;
use crate::megakernel_speedup_gate::{
format_validated_cuda_megakernel_speedup_evidence_csv, CudaMegakernelSpeedupGateError,
CudaMegakernelSpeedupProof, CudaMegakernelSpeedupSample,
};
use vyre_driver::ResidentGraphReuseTelemetry;
/// Host readback policy requested for a resident graph session.
///
/// `plan_cuda_resident_graph_session` accepts only `FinalOnly`; any other value is
/// rejected with `CudaResidentGraphSessionError::PerRunReadbackRejected`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CudaResidentGraphReadback {
/// Outputs are read back to the host once, after the repeated execution.
FinalOnly,
/// Outputs are read back after every run; the planner refuses this mode.
PerRun,
}
/// Caller-supplied description of a repeated-execution session, consumed by
/// `plan_cuda_resident_graph_session`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaResidentGraphSessionProfile {
/// Hash of the graph layout; the planner rejects `0`.
pub graph_layout_hash: u64,
/// Byte count of the graph topology; the planner rejects `0`.
pub graph_bytes: u64,
/// Number of runs in the session; the planner rejects `0`.
pub run_count: u64,
/// Frontier bytes per run: counted once in the peak and multiplied by
/// `run_count` for the total refresh traffic.
pub per_run_frontier_bytes: u64,
/// Scratch bytes, counted once in the peak.
pub reusable_scratch_bytes: u64,
/// Output bytes per run: counted once in the peak and reported as the host
/// readback size.
pub per_run_output_bytes: u64,
/// Memory budget the peak must not exceed; the planner rejects `0`.
pub budget_bytes: u64,
/// Requested readback policy; only `FinalOnly` is plannable.
pub readback: CudaResidentGraphReadback,
}
/// Result of `plan_cuda_resident_graph_session`: byte totals and the savings
/// attributed to keeping the graph resident across runs.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaResidentGraphSessionPlan {
/// Copied from the profile's `graph_layout_hash`.
pub graph_layout_hash: u64,
/// Set to the profile's `graph_bytes`.
pub one_time_graph_upload_bytes: u64,
/// `run_count * per_run_frontier_bytes`.
pub total_frontier_refresh_bytes: u64,
/// Sum of graph, frontier, scratch and output bytes (each counted once).
pub peak_resident_bytes: u64,
/// `(run_count - 1) * graph_bytes`; zero for a single-run session.
pub avoided_graph_upload_bytes: u64,
/// Telemetry built with `cold_upload` for a single run and with
/// `from_counters` otherwise.
pub graph_reuse: ResidentGraphReuseTelemetry,
/// `3 * (run_count - 1)`; zero for a single-run session.
pub avoided_device_allocations: u64,
/// `run_count - 1`; the speedup sample derives its repetitions as this plus one.
pub avoided_host_fences: u64,
/// Set to the profile's `per_run_output_bytes`.
pub host_readback_bytes: u64,
/// Always `true` on plans produced by the planner; required by
/// `resident_graph_session_speedup_sample`.
pub graph_topology_resident: bool,
/// Always `true` on plans produced by the planner; required by
/// `resident_graph_session_speedup_sample`.
pub scratch_reused: bool,
/// Always `true` on plans produced by the planner; required by
/// `resident_graph_session_speedup_sample`.
pub final_only_host_readback: bool,
}
/// A session plan paired with device identification and timings, convertible
/// into a `CudaMegakernelSpeedupSample`.
///
/// Only `PartialEq` is derived (no `Eq`) because the timing fields are `f64`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CudaResidentGraphSessionEvidence {
/// Copied verbatim into the speedup sample.
pub backend_id: &'static str,
/// Copied verbatim into the speedup sample.
pub device_ordinal: u64,
/// Copied verbatim into the speedup sample.
pub device_memory_bytes: u64,
/// Copied verbatim into the speedup sample.
pub compute_capability_major: u32,
/// Copied verbatim into the speedup sample.
pub compute_capability_minor: u32,
/// Copied verbatim into the speedup sample.
pub graph_nodes: u64,
/// Copied verbatim into the speedup sample.
pub graph_edges: u64,
/// Plan the evidence was measured under; its three residency flags must all
/// be `true` for the conversion to succeed.
pub plan: CudaResidentGraphSessionPlan,
/// Timing of the host-orchestrated baseline (presumably nanoseconds, per the
/// `_ns` suffix — confirm with the producer).
pub host_orchestrated_ns: f64,
/// Timing of the resident megakernel path (presumably nanoseconds — confirm).
pub resident_megakernel_ns: f64,
/// Setup timing, passed through separately from the two timings above
/// (presumably nanoseconds — confirm).
pub setup_ns: f64,
}
/// Failures reported while planning a resident graph session or converting
/// its evidence into a speedup sample.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CudaResidentGraphSessionError {
/// The profile's `graph_layout_hash` was `0`.
ZeroGraphHash,
/// The profile's `graph_bytes` was `0`.
ZeroGraphBytes,
/// The profile's `run_count` was `0`.
ZeroRuns,
/// The profile's `budget_bytes` was `0`.
ZeroBudget,
/// The profile requested a readback policy other than `FinalOnly`.
PerRunReadbackRejected,
/// A checked `u64` addition or multiplication overflowed.
ByteCountOverflow {
/// Label of the quantity that was being computed.
field: &'static str,
},
/// The peak resident byte count exceeds the budget.
OverBudget {
/// Peak resident bytes the session needs.
required_bytes: u64,
/// Budget supplied in the profile.
budget_bytes: u64,
},
/// Evidence carried a plan whose residency flags were not all `true`.
NonResidentEvidence,
}
/// Failures reported by `format_validated_cuda_resident_graph_session_evidence_csv`.
#[derive(Clone, Debug, PartialEq)]
pub enum CudaResidentGraphSessionEvidenceError {
/// Converting one evidence item into a speedup sample failed.
Session(CudaResidentGraphSessionError),
/// The speedup gate rejected the samples while validating/formatting.
Speedup(CudaMegakernelSpeedupGateError),
/// The sample buffer could not be reserved.
SampleReserveFailed {
/// Number of sample slots requested (the evidence slice length).
capacity: usize,
/// Text of the underlying reservation error.
message: String,
},
}
impl std::fmt::Display for CudaResidentGraphSessionError {
    /// Writes the operator-facing message for this error; every message ends
    /// with a `Fix:` hint.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The two payload-carrying variants are formatted and returned
        // directly; every other variant maps to a fixed message that is
        // written out once after the match.
        let fixed_message = match self {
            Self::ByteCountOverflow { field } => {
                return write!(
                    f,
                    "CUDA resident graph session overflowed while computing {field}. Fix: shard repeated graph execution before planning."
                );
            }
            Self::OverBudget {
                required_bytes,
                budget_bytes,
            } => {
                return write!(
                    f,
                    "CUDA resident graph session requires {required_bytes} bytes but budget allows {budget_bytes}. Fix: reduce frontier/output size, reuse compact outputs, or shard the graph."
                );
            }
            Self::ZeroGraphHash => {
                "CUDA resident graph session received graph_layout_hash=0. Fix: normalize and hash graph topology before session planning."
            }
            Self::ZeroGraphBytes => {
                "CUDA resident graph session received graph_bytes=0. Fix: pass the concrete resident graph topology byte count."
            }
            Self::ZeroRuns => {
                "CUDA resident graph session received run_count=0. Fix: plan only non-empty repeated execution sessions."
            }
            Self::ZeroBudget => {
                "CUDA resident graph session received budget_bytes=0. Fix: pass an explicit CUDA memory budget."
            }
            Self::PerRunReadbackRejected => {
                "CUDA resident graph session rejected per-run readback. Fix: compact final outputs on device and read back once after repeated execution."
            }
            Self::NonResidentEvidence => {
                "CUDA resident graph session evidence is not final-only resident execution. Fix: build evidence from a plan with resident topology, reused scratch, and one final readback."
            }
        };
        f.write_str(fixed_message)
    }
}
// Empty impl: the error is exposed through `std::error::Error` using only the
// trait's default methods.
impl std::error::Error for CudaResidentGraphSessionError {}
impl CudaArithmeticOverflow for CudaResidentGraphSessionError {
fn arithmetic_overflow(field: &'static str) -> Self {
Self::ByteCountOverflow { field }
}
}
impl std::fmt::Display for CudaResidentGraphSessionEvidenceError {
    /// Writes the wrapped error's own message for `Session`/`Speedup`, and a
    /// dedicated message (with a `Fix:` hint) for a failed sample reservation.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::SampleReserveFailed { capacity, message } => f.write_fmt(format_args!(
                "CUDA resident graph session evidence could not reserve {capacity} release sample slot(s): {message}. Fix: split the release evidence batch before formatting."
            )),
            Self::Session(inner) => f.write_fmt(format_args!("{inner}")),
            Self::Speedup(inner) => f.write_fmt(format_args!("{inner}")),
        }
    }
}
// Empty impl: only the trait's default methods are used, so `source()` does not
// expose the wrapped `Session`/`Speedup` error.
impl std::error::Error for CudaResidentGraphSessionEvidenceError {}
impl From<CudaResidentGraphSessionError> for CudaResidentGraphSessionEvidenceError {
fn from(error: CudaResidentGraphSessionError) -> Self {
Self::Session(error)
}
}
impl From<CudaMegakernelSpeedupGateError> for CudaResidentGraphSessionEvidenceError {
fn from(error: CudaMegakernelSpeedupGateError) -> Self {
Self::Speedup(error)
}
}
pub fn plan_cuda_resident_graph_session(
profile: CudaResidentGraphSessionProfile,
) -> Result<CudaResidentGraphSessionPlan, CudaResidentGraphSessionError> {
if profile.graph_layout_hash == 0 {
return Err(CudaResidentGraphSessionError::ZeroGraphHash);
}
if profile.graph_bytes == 0 {
return Err(CudaResidentGraphSessionError::ZeroGraphBytes);
}
if profile.run_count == 0 {
return Err(CudaResidentGraphSessionError::ZeroRuns);
}
if profile.budget_bytes == 0 {
return Err(CudaResidentGraphSessionError::ZeroBudget);
}
if profile.readback != CudaResidentGraphReadback::FinalOnly {
return Err(CudaResidentGraphSessionError::PerRunReadbackRejected);
}
if profile.run_count == 1 {
let graph_plus_frontier = checked_add(
profile.graph_bytes,
profile.per_run_frontier_bytes,
"graph plus frontier bytes",
)?;
let with_scratch = checked_add(
graph_plus_frontier,
profile.reusable_scratch_bytes,
"graph frontier scratch bytes",
)?;
let peak_resident_bytes = checked_add(
with_scratch,
profile.per_run_output_bytes,
"peak resident bytes",
)?;
if peak_resident_bytes > profile.budget_bytes {
return Err(CudaResidentGraphSessionError::OverBudget {
required_bytes: peak_resident_bytes,
budget_bytes: profile.budget_bytes,
});
}
return Ok(CudaResidentGraphSessionPlan {
graph_layout_hash: profile.graph_layout_hash,
one_time_graph_upload_bytes: profile.graph_bytes,
total_frontier_refresh_bytes: profile.per_run_frontier_bytes,
peak_resident_bytes,
avoided_graph_upload_bytes: 0,
graph_reuse: ResidentGraphReuseTelemetry::cold_upload(profile.graph_bytes),
avoided_device_allocations: 0,
avoided_host_fences: 0,
host_readback_bytes: profile.per_run_output_bytes,
graph_topology_resident: true,
scratch_reused: true,
final_only_host_readback: true,
});
}
let graph_plus_frontier = checked_add(
profile.graph_bytes,
profile.per_run_frontier_bytes,
"graph plus frontier bytes",
)?;
let with_scratch = checked_add(
graph_plus_frontier,
profile.reusable_scratch_bytes,
"graph frontier scratch bytes",
)?;
let peak_resident_bytes = checked_add(
with_scratch,
profile.per_run_output_bytes,
"peak resident bytes",
)?;
if peak_resident_bytes > profile.budget_bytes {
return Err(CudaResidentGraphSessionError::OverBudget {
required_bytes: peak_resident_bytes,
budget_bytes: profile.budget_bytes,
});
}
let total_frontier_refresh_bytes = checked_mul(
profile.run_count,
profile.per_run_frontier_bytes,
"total frontier refresh bytes",
)?;
let repeated_runs = profile.run_count - 1;
let avoided_graph_upload_bytes = checked_mul(
repeated_runs,
profile.graph_bytes,
"avoided graph upload bytes",
)?;
let avoided_device_allocations = checked_mul(repeated_runs, 3, "avoided allocations")?;
Ok(CudaResidentGraphSessionPlan {
graph_layout_hash: profile.graph_layout_hash,
one_time_graph_upload_bytes: profile.graph_bytes,
total_frontier_refresh_bytes,
peak_resident_bytes,
avoided_graph_upload_bytes,
graph_reuse: ResidentGraphReuseTelemetry::from_counters(
1,
repeated_runs,
profile.graph_bytes,
avoided_graph_upload_bytes,
),
avoided_device_allocations,
avoided_host_fences: repeated_runs,
host_readback_bytes: profile.per_run_output_bytes,
graph_topology_resident: true,
scratch_reused: true,
final_only_host_readback: true,
})
}
pub fn resident_graph_session_speedup_sample(
evidence: CudaResidentGraphSessionEvidence,
) -> Result<CudaMegakernelSpeedupSample, CudaResidentGraphSessionError> {
if !evidence.plan.graph_topology_resident
|| !evidence.plan.scratch_reused
|| !evidence.plan.final_only_host_readback
{
return Err(CudaResidentGraphSessionError::NonResidentEvidence);
}
Ok(CudaMegakernelSpeedupSample {
backend_id: evidence.backend_id,
device_ordinal: evidence.device_ordinal,
device_memory_bytes: evidence.device_memory_bytes,
compute_capability_major: evidence.compute_capability_major,
compute_capability_minor: evidence.compute_capability_minor,
graph_nodes: evidence.graph_nodes,
graph_edges: evidence.graph_edges,
repetitions: checked_add(evidence.plan.avoided_host_fences, 1, "evidence repetitions")?,
host_orchestrated_ns: evidence.host_orchestrated_ns,
resident_megakernel_ns: evidence.resident_megakernel_ns,
setup_ns: evidence.setup_ns,
timed_graph_uploads: 0,
timed_host_allocations: 0,
timed_host_syncs: 0,
})
}
/// Converts every evidence item into a speedup sample, then validates the
/// samples against `required_speedup_x` and formats them as CSV.
///
/// # Errors
///
/// - `SampleReserveFailed` when the sample buffer cannot be reserved;
/// - `Session` when an evidence item is rejected (the first failure wins);
/// - `Speedup` when the speedup gate rejects the samples.
pub fn format_validated_cuda_resident_graph_session_evidence_csv(
    evidence: &[CudaResidentGraphSessionEvidence],
    required_speedup_x: f64,
) -> Result<(CudaMegakernelSpeedupProof, String), CudaResidentGraphSessionEvidenceError> {
    let capacity = evidence.len();
    // Fallible reservation: an allocation failure becomes a typed error.
    let mut samples = reserved_vec(capacity, "cuda resident graph session release samples")
        .map_err(
            |reserve_error| CudaResidentGraphSessionEvidenceError::SampleReserveFailed {
                capacity,
                message: reserve_error.to_string(),
            },
        )?;

    for entry in evidence.iter().copied() {
        let sample = resident_graph_session_speedup_sample(entry)?;
        samples.push(sample);
    }

    let validated =
        format_validated_cuda_megakernel_speedup_evidence_csv(&samples, required_speedup_x)?;
    Ok(validated)
}
#[cfg(test)]
mod tests {
use super::*;
/// Source-level guard: this file must use the shared typed CUDA arithmetic
/// helpers and must not define local copies of them.
#[test]
fn resident_graph_session_uses_shared_typed_cuda_arithmetic() {
    let source = include_str!("resident_graph_session.rs");
    // `source` includes this test's own text, so every needle is split with
    // `concat!`. A whole literal would match itself and the positive
    // assertions could never fail.
    assert!(source.contains(concat!("checked_add_u64_count", " as checked_add")));
    assert!(source.contains(concat!("checked_mul_u64_count", " as checked_mul")));
    assert!(source.contains(concat!(
        "impl CudaArithmeticOverflow",
        " for CudaResidentGraphSessionError"
    )));
    assert!(!source.contains(concat!("fn checked_", "mul(")));
    assert!(!source.contains(concat!("fn checked_", "add(")));
}
/// A 128-run session uploads the graph once and reports savings for the 127
/// repeated runs.
#[test]
fn resident_graph_session_amortizes_fixed_graph_repeated_execution() {
    let session = profile(0xabc, 1_048_576, 128, 4_096, 65_536, 2_048, 2_000_000);
    let plan = plan_cuda_resident_graph_session(session)
        .expect("Fix: resident graph session should fit");

    assert_eq!(plan.one_time_graph_upload_bytes, 1_048_576);
    // 128 runs * 4_096 frontier bytes.
    assert_eq!(plan.total_frontier_refresh_bytes, 524_288);
    // 127 repeated runs * 1_048_576 graph bytes.
    assert_eq!(plan.avoided_graph_upload_bytes, 133_169_152);
    let expected_reuse =
        ResidentGraphReuseTelemetry::from_counters(1, 127, 1_048_576, 133_169_152);
    assert_eq!(plan.graph_reuse, expected_reuse);
    // 127 repeated runs * 3 allocations.
    assert_eq!(plan.avoided_device_allocations, 381);
    assert_eq!(plan.avoided_host_fences, 127);
    assert_eq!(plan.host_readback_bytes, 2_048);
    assert!(plan.graph_topology_resident);
    assert!(plan.scratch_reused);
    assert!(plan.final_only_host_readback);
}
/// Evidence built from a planned session converts to a sample whose timed
/// upload/allocation/sync counters are all zero.
#[test]
fn resident_graph_session_builds_release_speedup_sample_without_timed_pollution() {
    const DEVICE_MEMORY_BYTES: u64 = 32 * 1024 * 1024 * 1024;

    let session = profile(0xabc, 1_048_576, 128, 4_096, 65_536, 2_048, 2_000_000);
    let plan = plan_cuda_resident_graph_session(session)
        .expect("Fix: resident graph session should fit");
    let evidence = CudaResidentGraphSessionEvidence {
        backend_id: crate::CUDA_BACKEND_ID,
        device_ordinal: 0,
        device_memory_bytes: DEVICE_MEMORY_BYTES,
        compute_capability_major: 12,
        compute_capability_minor: 0,
        graph_nodes: 10_000,
        graph_edges: 80_000,
        plan,
        host_orchestrated_ns: 1_000_000.0,
        resident_megakernel_ns: 10_000.0,
        setup_ns: 250_000.0,
    };

    let sample = resident_graph_session_speedup_sample(evidence)
        .expect("Fix: resident final-only plan should produce release evidence");

    assert_eq!(sample.backend_id, crate::CUDA_BACKEND_ID);
    assert_eq!(sample.device_memory_bytes, DEVICE_MEMORY_BYTES);
    assert_eq!(sample.compute_capability_major, 12);
    assert_eq!(sample.graph_nodes, 10_000);
    assert_eq!(sample.graph_edges, 80_000);
    // 127 avoided host fences + the initial run.
    assert_eq!(sample.repetitions, 128);
    assert_eq!(sample.timed_graph_uploads, 0);
    assert_eq!(sample.timed_host_allocations, 0);
    assert_eq!(sample.timed_host_syncs, 0);
}
/// Two evidence rows format to a header plus two CSV lines, and the CSV
/// re-validates to the same proof.
#[test]
fn resident_graph_session_formats_validated_release_speedup_csv() {
    let plan_a = plan_cuda_resident_graph_session(profile(
        0xabc, 1_048_576, 128, 4_096, 65_536, 2_048, 2_000_000,
    ))
    .expect("Fix: first resident graph session should fit");
    let plan_b = plan_cuda_resident_graph_session(profile(
        0xdef, 2_097_152, 256, 8_192, 131_072, 4_096, 4_000_000,
    ))
    .expect("Fix: second resident graph session should fit");

    let first = CudaResidentGraphSessionEvidence {
        backend_id: crate::CUDA_BACKEND_ID,
        device_ordinal: 0,
        device_memory_bytes: 32 * 1024 * 1024 * 1024,
        compute_capability_major: 12,
        compute_capability_minor: 0,
        graph_nodes: 10_000,
        graph_edges: 80_000,
        plan: plan_a,
        host_orchestrated_ns: 1_000_000.0,
        resident_megakernel_ns: 10_000.0,
        setup_ns: 250_000.0,
    };
    // Same device as `first`; only the graph, plan and timings differ.
    let second = CudaResidentGraphSessionEvidence {
        graph_nodes: 20_000,
        graph_edges: 160_000,
        plan: plan_b,
        host_orchestrated_ns: 2_500_000.0,
        resident_megakernel_ns: 20_000.0,
        setup_ns: 350_000.0,
        ..first
    };
    let evidence = [first, second];

    let (proof, csv) =
        format_validated_cuda_resident_graph_session_evidence_csv(&evidence, 100.0)
            .expect("Fix: resident graph release evidence should format as validated CSV");
    let reparsed = crate::validate_cuda_megakernel_speedup_evidence_csv(&csv, 100.0)
        .expect("Fix: resident graph release CSV should roundtrip through verifier");

    assert_eq!(proof, reparsed);
    assert_eq!(proof.sample_count, 2);
    // 1_000_000 / 10_000 and 2_500_000 / 20_000 respectively.
    assert_eq!(proof.min_speedup_x, 100.0);
    assert_eq!(proof.max_speedup_x, 125.0);
    assert_eq!(csv.lines().count(), 3);
}
/// A profile that asks for per-run readback is refused even though every
/// other field is valid.
#[test]
fn resident_graph_session_rejects_host_orchestration_shape() {
    let per_run_profile = CudaResidentGraphSessionProfile {
        readback: CudaResidentGraphReadback::PerRun,
        ..profile(1, 128, 2, 16, 16, 16, 1_024)
    };
    let rejection = plan_cuda_resident_graph_session(per_run_profile)
        .expect_err("per-run readback should fail");
    assert_eq!(
        rejection,
        CudaResidentGraphSessionError::PerRunReadbackRejected
    );
}
/// Every zero-input validation branch and the over-budget branch returns its
/// dedicated error.
#[test]
fn resident_graph_session_rejects_invalid_inputs_and_budget() {
    // (profile, expected error, reason shown if planning unexpectedly succeeds)
    let cases = [
        (
            profile(0, 128, 1, 16, 16, 16, 1_024),
            CudaResidentGraphSessionError::ZeroGraphHash,
            "zero hash should fail",
        ),
        (
            profile(1, 0, 1, 16, 16, 16, 1_024),
            CudaResidentGraphSessionError::ZeroGraphBytes,
            "zero graph bytes should fail",
        ),
        (
            profile(1, 128, 0, 16, 16, 16, 1_024),
            CudaResidentGraphSessionError::ZeroRuns,
            "zero runs should fail",
        ),
        (
            profile(1, 128, 1, 16, 16, 16, 0),
            CudaResidentGraphSessionError::ZeroBudget,
            "zero budget should fail",
        ),
        (
            // 128 + 16 + 16 + 16 = 176 bytes required against a 127-byte budget.
            profile(1, 128, 1, 16, 16, 16, 127),
            CudaResidentGraphSessionError::OverBudget {
                required_bytes: 176,
                budget_bytes: 127,
            },
            "over-budget session should fail",
        ),
    ];
    for (input, expected, reason) in cases {
        assert_eq!(
            plan_cuda_resident_graph_session(input).expect_err(reason),
            expected
        );
    }
}
/// Source-level guard: evidence samples must be staged through the fallible
/// reservation helper, not an infallible up-front vector allocation.
#[test]
fn resident_graph_session_evidence_uses_fallible_sample_staging() {
    let source = include_str!("resident_graph_session.rs");
    // The helper this file imports and calls is `reserved_vec`. The needle is
    // split with `concat!` so it can only match the real `use` line, never
    // this test's own text inside the included source.
    assert!(source.contains(concat!(
        "use crate::backend::staging_reserve::",
        "reserved_vec;"
    )));
    assert!(source.contains("SampleReserveFailed"));
    assert!(!source.contains(concat!("Vec", "::with_capacity(evidence.len())")));
}
/// Test helper: builds a `FinalOnly` session profile from positional values,
/// in the same order as the struct's numeric fields.
fn profile(
    graph_layout_hash: u64,
    graph_bytes: u64,
    run_count: u64,
    per_run_frontier_bytes: u64,
    reusable_scratch_bytes: u64,
    per_run_output_bytes: u64,
    budget_bytes: u64,
) -> CudaResidentGraphSessionProfile {
    // Every profile built here requests the only plannable readback policy.
    let readback = CudaResidentGraphReadback::FinalOnly;
    CudaResidentGraphSessionProfile {
        readback,
        budget_bytes,
        per_run_output_bytes,
        reusable_scratch_bytes,
        per_run_frontier_bytes,
        run_count,
        graph_bytes,
        graph_layout_hash,
    }
}
}