1use crate::backend::accounting::{
10 checked_add_u64_count as checked_add, checked_mul_u64_count as checked_mul,
11 CudaArithmeticOverflow,
12};
13use crate::backend::staging_reserve::reserved_vec;
14use crate::megakernel_speedup_gate::{
15 format_validated_cuda_megakernel_speedup_evidence_csv, CudaMegakernelSpeedupGateError,
16 CudaMegakernelSpeedupProof, CudaMegakernelSpeedupSample,
17};
18use vyre_driver::ResidentGraphReuseTelemetry;
19
/// Host readback policy requested for a resident graph session.
///
/// [`plan_cuda_resident_graph_session`] accepts only [`Self::FinalOnly`]; any
/// other policy is rejected with
/// [`CudaResidentGraphSessionError::PerRunReadbackRejected`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CudaResidentGraphReadback {
    /// Read results back to the host once, after the repeated execution.
    FinalOnly,
    /// Read results back after every run; session planning rejects this.
    PerRun,
}
28
/// Input to [`plan_cuda_resident_graph_session`]: the sizes and policy of one
/// repeated-execution session over a fixed graph.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaResidentGraphSessionProfile {
    /// Hash identifying the graph layout; must be non-zero. Copied into the plan.
    pub graph_layout_hash: u64,
    /// Byte count of the resident graph topology; must be non-zero.
    pub graph_bytes: u64,
    /// Number of runs in the session; must be non-zero.
    pub run_count: u64,
    /// Frontier bytes per run. Counted once in the peak residency and
    /// multiplied by `run_count` for the total refresh traffic.
    pub per_run_frontier_bytes: u64,
    /// Scratch bytes, counted once in the peak residency.
    pub reusable_scratch_bytes: u64,
    /// Output bytes per run. Counted once in the peak residency and reported
    /// as the plan's host readback size.
    pub per_run_output_bytes: u64,
    /// Memory budget in bytes; must be non-zero and not smaller than the
    /// computed peak residency.
    pub budget_bytes: u64,
    /// Requested readback policy; only `FinalOnly` is plannable.
    pub readback: CudaResidentGraphReadback,
}
49
/// Output of [`plan_cuda_resident_graph_session`]: byte totals and savings
/// counters derived from a [`CudaResidentGraphSessionProfile`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaResidentGraphSessionPlan {
    /// Graph layout hash copied from the profile.
    pub graph_layout_hash: u64,
    /// Equal to the profile's `graph_bytes`.
    pub one_time_graph_upload_bytes: u64,
    /// `run_count * per_run_frontier_bytes`.
    pub total_frontier_refresh_bytes: u64,
    /// Sum of graph, one frontier, scratch, and one output buffer, in bytes.
    pub peak_resident_bytes: u64,
    /// `(run_count - 1) * graph_bytes`; zero for a single-run session.
    pub avoided_graph_upload_bytes: u64,
    /// Graph reuse telemetry: `cold_upload(graph_bytes)` for a single run,
    /// otherwise `from_counters(1, run_count - 1, graph_bytes, avoided_graph_upload_bytes)`.
    pub graph_reuse: ResidentGraphReuseTelemetry,
    /// `3 * (run_count - 1)`; zero for a single-run session.
    pub avoided_device_allocations: u64,
    /// `run_count - 1`. The speedup sample builder adds one to this value to
    /// recover the repetition count.
    pub avoided_host_fences: u64,
    /// Equal to the profile's `per_run_output_bytes`.
    pub host_readback_bytes: u64,
    /// Always `true` in plans produced by the planner; the speedup sample
    /// builder rejects evidence whose plan has this flag cleared.
    pub graph_topology_resident: bool,
    /// Always `true` in plans produced by the planner; the speedup sample
    /// builder rejects evidence whose plan has this flag cleared.
    pub scratch_reused: bool,
    /// Always `true` in plans produced by the planner; the speedup sample
    /// builder rejects evidence whose plan has this flag cleared.
    pub final_only_host_readback: bool,
}
78
/// One measured session: a plan plus device identity, graph size, and timings.
///
/// [`resident_graph_session_speedup_sample`] copies every field except `plan`
/// verbatim into a `CudaMegakernelSpeedupSample`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CudaResidentGraphSessionEvidence {
    /// Backend identifier, copied into the speedup sample.
    pub backend_id: &'static str,
    /// Device ordinal, copied into the speedup sample.
    pub device_ordinal: u64,
    /// Device memory size in bytes, copied into the speedup sample.
    pub device_memory_bytes: u64,
    /// Compute capability major version, copied into the speedup sample.
    pub compute_capability_major: u32,
    /// Compute capability minor version, copied into the speedup sample.
    pub compute_capability_minor: u32,
    /// Graph node count, copied into the speedup sample.
    pub graph_nodes: u64,
    /// Graph edge count, copied into the speedup sample.
    pub graph_edges: u64,
    /// Session plan; its three residency flags are checked and its
    /// `avoided_host_fences` is used to derive the repetition count.
    pub plan: CudaResidentGraphSessionPlan,
    /// Host-orchestrated baseline timing.
    /// NOTE(review): presumably nanoseconds, going by the `_ns` suffix — confirm against the speedup gate.
    pub host_orchestrated_ns: f64,
    /// Resident megakernel timing (same unit caveat as `host_orchestrated_ns`).
    pub resident_megakernel_ns: f64,
    /// Setup timing (same unit caveat as `host_orchestrated_ns`).
    pub setup_ns: f64,
}
105
/// Failures reported by resident graph session planning and by speedup sample
/// construction.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum CudaResidentGraphSessionError {
    /// The profile's `graph_layout_hash` was zero.
    ZeroGraphHash,
    /// The profile's `graph_bytes` was zero.
    ZeroGraphBytes,
    /// The profile's `run_count` was zero.
    ZeroRuns,
    /// The profile's `budget_bytes` was zero.
    ZeroBudget,
    /// The profile requested a readback policy other than `FinalOnly`.
    PerRunReadbackRejected,
    /// A checked addition or multiplication overflowed.
    ByteCountOverflow {
        /// Label of the quantity that was being computed.
        field: &'static str,
    },
    /// The computed peak residency exceeds the budget.
    OverBudget {
        /// Peak resident bytes the session needs.
        required_bytes: u64,
        /// Budget supplied in the profile.
        budget_bytes: u64,
    },
    /// Evidence carried a plan with at least one residency flag cleared.
    NonResidentEvidence,
}
134
/// Failures reported while turning session evidence into validated speedup CSV.
#[derive(Clone, Debug, PartialEq)]
pub enum CudaResidentGraphSessionEvidenceError {
    /// Converting one evidence item into a speedup sample failed.
    Session(CudaResidentGraphSessionError),
    /// The megakernel speedup gate rejected the samples.
    Speedup(CudaMegakernelSpeedupGateError),
    /// The sample staging buffer could not be reserved.
    SampleReserveFailed {
        /// Number of sample slots requested (the evidence slice length).
        capacity: usize,
        /// String rendering of the underlying reservation error.
        message: String,
    },
}
150
151impl std::fmt::Display for CudaResidentGraphSessionError {
152 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
153 match self {
154 Self::ZeroGraphHash => write!(
155 f,
156 "CUDA resident graph session received graph_layout_hash=0. Fix: normalize and hash graph topology before session planning."
157 ),
158 Self::ZeroGraphBytes => write!(
159 f,
160 "CUDA resident graph session received graph_bytes=0. Fix: pass the concrete resident graph topology byte count."
161 ),
162 Self::ZeroRuns => write!(
163 f,
164 "CUDA resident graph session received run_count=0. Fix: plan only non-empty repeated execution sessions."
165 ),
166 Self::ZeroBudget => write!(
167 f,
168 "CUDA resident graph session received budget_bytes=0. Fix: pass an explicit CUDA memory budget."
169 ),
170 Self::PerRunReadbackRejected => write!(
171 f,
172 "CUDA resident graph session rejected per-run readback. Fix: compact final outputs on device and read back once after repeated execution."
173 ),
174 Self::ByteCountOverflow { field } => write!(
175 f,
176 "CUDA resident graph session overflowed while computing {field}. Fix: shard repeated graph execution before planning."
177 ),
178 Self::OverBudget {
179 required_bytes,
180 budget_bytes,
181 } => write!(
182 f,
183 "CUDA resident graph session requires {required_bytes} bytes but budget allows {budget_bytes}. Fix: reduce frontier/output size, reuse compact outputs, or shard the graph."
184 ),
185 Self::NonResidentEvidence => write!(
186 f,
187 "CUDA resident graph session evidence is not final-only resident execution. Fix: build evidence from a plan with resident topology, reused scratch, and one final readback."
188 ),
189 }
190 }
191}
192
// Marks the session error as a standard error type; all trait methods keep
// their defaults, so no underlying source error is reported.
impl std::error::Error for CudaResidentGraphSessionError {}
194
195impl CudaArithmeticOverflow for CudaResidentGraphSessionError {
196 fn arithmetic_overflow(field: &'static str) -> Self {
197 Self::ByteCountOverflow { field }
198 }
199}
200
201impl std::fmt::Display for CudaResidentGraphSessionEvidenceError {
202 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
203 match self {
204 Self::Session(error) => write!(f, "{error}"),
205 Self::Speedup(error) => write!(f, "{error}"),
206 Self::SampleReserveFailed { capacity, message } => write!(
207 f,
208 "CUDA resident graph session evidence could not reserve {capacity} release sample slot(s): {message}. Fix: split the release evidence batch before formatting."
209 ),
210 }
211 }
212}
213
// Marks the evidence error as a standard error type; all trait methods keep
// their defaults, so wrapped errors are exposed only through `Display`.
impl std::error::Error for CudaResidentGraphSessionEvidenceError {}
215
216impl From<CudaResidentGraphSessionError> for CudaResidentGraphSessionEvidenceError {
217 fn from(error: CudaResidentGraphSessionError) -> Self {
218 Self::Session(error)
219 }
220}
221
222impl From<CudaMegakernelSpeedupGateError> for CudaResidentGraphSessionEvidenceError {
223 fn from(error: CudaMegakernelSpeedupGateError) -> Self {
224 Self::Speedup(error)
225 }
226}
227
228pub fn plan_cuda_resident_graph_session(
230 profile: CudaResidentGraphSessionProfile,
231) -> Result<CudaResidentGraphSessionPlan, CudaResidentGraphSessionError> {
232 if profile.graph_layout_hash == 0 {
233 return Err(CudaResidentGraphSessionError::ZeroGraphHash);
234 }
235 if profile.graph_bytes == 0 {
236 return Err(CudaResidentGraphSessionError::ZeroGraphBytes);
237 }
238 if profile.run_count == 0 {
239 return Err(CudaResidentGraphSessionError::ZeroRuns);
240 }
241 if profile.budget_bytes == 0 {
242 return Err(CudaResidentGraphSessionError::ZeroBudget);
243 }
244 if profile.readback != CudaResidentGraphReadback::FinalOnly {
245 return Err(CudaResidentGraphSessionError::PerRunReadbackRejected);
246 }
247 if profile.run_count == 1 {
248 let graph_plus_frontier = checked_add(
249 profile.graph_bytes,
250 profile.per_run_frontier_bytes,
251 "graph plus frontier bytes",
252 )?;
253 let with_scratch = checked_add(
254 graph_plus_frontier,
255 profile.reusable_scratch_bytes,
256 "graph frontier scratch bytes",
257 )?;
258 let peak_resident_bytes = checked_add(
259 with_scratch,
260 profile.per_run_output_bytes,
261 "peak resident bytes",
262 )?;
263 if peak_resident_bytes > profile.budget_bytes {
264 return Err(CudaResidentGraphSessionError::OverBudget {
265 required_bytes: peak_resident_bytes,
266 budget_bytes: profile.budget_bytes,
267 });
268 }
269 return Ok(CudaResidentGraphSessionPlan {
270 graph_layout_hash: profile.graph_layout_hash,
271 one_time_graph_upload_bytes: profile.graph_bytes,
272 total_frontier_refresh_bytes: profile.per_run_frontier_bytes,
273 peak_resident_bytes,
274 avoided_graph_upload_bytes: 0,
275 graph_reuse: ResidentGraphReuseTelemetry::cold_upload(profile.graph_bytes),
276 avoided_device_allocations: 0,
277 avoided_host_fences: 0,
278 host_readback_bytes: profile.per_run_output_bytes,
279 graph_topology_resident: true,
280 scratch_reused: true,
281 final_only_host_readback: true,
282 });
283 }
284
285 let graph_plus_frontier = checked_add(
286 profile.graph_bytes,
287 profile.per_run_frontier_bytes,
288 "graph plus frontier bytes",
289 )?;
290 let with_scratch = checked_add(
291 graph_plus_frontier,
292 profile.reusable_scratch_bytes,
293 "graph frontier scratch bytes",
294 )?;
295 let peak_resident_bytes = checked_add(
296 with_scratch,
297 profile.per_run_output_bytes,
298 "peak resident bytes",
299 )?;
300 if peak_resident_bytes > profile.budget_bytes {
301 return Err(CudaResidentGraphSessionError::OverBudget {
302 required_bytes: peak_resident_bytes,
303 budget_bytes: profile.budget_bytes,
304 });
305 }
306
307 let total_frontier_refresh_bytes = checked_mul(
308 profile.run_count,
309 profile.per_run_frontier_bytes,
310 "total frontier refresh bytes",
311 )?;
312 let repeated_runs = profile.run_count - 1;
313 let avoided_graph_upload_bytes = checked_mul(
314 repeated_runs,
315 profile.graph_bytes,
316 "avoided graph upload bytes",
317 )?;
318 let avoided_device_allocations = checked_mul(repeated_runs, 3, "avoided allocations")?;
319
320 Ok(CudaResidentGraphSessionPlan {
321 graph_layout_hash: profile.graph_layout_hash,
322 one_time_graph_upload_bytes: profile.graph_bytes,
323 total_frontier_refresh_bytes,
324 peak_resident_bytes,
325 avoided_graph_upload_bytes,
326 graph_reuse: ResidentGraphReuseTelemetry::from_counters(
327 1,
328 repeated_runs,
329 profile.graph_bytes,
330 avoided_graph_upload_bytes,
331 ),
332 avoided_device_allocations,
333 avoided_host_fences: repeated_runs,
334 host_readback_bytes: profile.per_run_output_bytes,
335 graph_topology_resident: true,
336 scratch_reused: true,
337 final_only_host_readback: true,
338 })
339}
340
341pub fn resident_graph_session_speedup_sample(
344 evidence: CudaResidentGraphSessionEvidence,
345) -> Result<CudaMegakernelSpeedupSample, CudaResidentGraphSessionError> {
346 if !evidence.plan.graph_topology_resident
347 || !evidence.plan.scratch_reused
348 || !evidence.plan.final_only_host_readback
349 {
350 return Err(CudaResidentGraphSessionError::NonResidentEvidence);
351 }
352 Ok(CudaMegakernelSpeedupSample {
353 backend_id: evidence.backend_id,
354 device_ordinal: evidence.device_ordinal,
355 device_memory_bytes: evidence.device_memory_bytes,
356 compute_capability_major: evidence.compute_capability_major,
357 compute_capability_minor: evidence.compute_capability_minor,
358 graph_nodes: evidence.graph_nodes,
359 graph_edges: evidence.graph_edges,
360 repetitions: checked_add(evidence.plan.avoided_host_fences, 1, "evidence repetitions")?,
361 host_orchestrated_ns: evidence.host_orchestrated_ns,
362 resident_megakernel_ns: evidence.resident_megakernel_ns,
363 setup_ns: evidence.setup_ns,
364 timed_graph_uploads: 0,
365 timed_host_allocations: 0,
366 timed_host_syncs: 0,
367 })
368}
369
370pub fn format_validated_cuda_resident_graph_session_evidence_csv(
373 evidence: &[CudaResidentGraphSessionEvidence],
374 required_speedup_x: f64,
375) -> Result<(CudaMegakernelSpeedupProof, String), CudaResidentGraphSessionEvidenceError> {
376 let mut samples = reserved_vec(
377 evidence.len(),
378 "cuda resident graph session release samples",
379 )
380 .map_err(
381 |error| CudaResidentGraphSessionEvidenceError::SampleReserveFailed {
382 capacity: evidence.len(),
383 message: error.to_string(),
384 },
385 )?;
386 for item in evidence {
387 samples.push(resident_graph_session_speedup_sample(*item)?);
388 }
389 format_validated_cuda_megakernel_speedup_evidence_csv(&samples, required_speedup_x)
390 .map_err(CudaResidentGraphSessionEvidenceError::Speedup)
391}
392
#[cfg(test)]
mod tests {
    use super::*;

    /// Source-level guard: this file must use the shared checked-arithmetic
    /// helpers and must not define local replacements.
    ///
    /// Every needle is assembled with `concat!` so the literals in this test
    /// body cannot satisfy (or violate) the assertions by themselves.
    #[test]
    fn resident_graph_session_uses_shared_typed_cuda_arithmetic() {
        let source = include_str!("resident_graph_session.rs");

        assert!(source.contains(concat!("checked_add_u64_count as ", "checked_add")));
        assert!(source.contains(concat!("checked_mul_u64_count as ", "checked_mul")));
        assert!(source.contains(concat!(
            "impl CudaArithmeticOverflow for ",
            "CudaResidentGraphSessionError"
        )));
        assert!(!source.contains(concat!("fn checked_", "mul(")));
        assert!(!source.contains(concat!("fn checked_", "add(")));
    }

    /// A 128-run session reports one upload plus 127 runs' worth of savings.
    #[test]
    fn resident_graph_session_amortizes_fixed_graph_repeated_execution() {
        let plan = plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
            graph_layout_hash: 0xabc,
            graph_bytes: 1_048_576,
            run_count: 128,
            per_run_frontier_bytes: 4_096,
            reusable_scratch_bytes: 65_536,
            per_run_output_bytes: 2_048,
            budget_bytes: 2_000_000,
            readback: CudaResidentGraphReadback::FinalOnly,
        })
        .expect("Fix: resident graph session should fit");

        assert_eq!(plan.one_time_graph_upload_bytes, 1_048_576);
        assert_eq!(plan.total_frontier_refresh_bytes, 524_288);
        assert_eq!(plan.avoided_graph_upload_bytes, 133_169_152);
        assert_eq!(
            plan.graph_reuse,
            ResidentGraphReuseTelemetry::from_counters(1, 127, 1_048_576, 133_169_152)
        );
        assert_eq!(plan.avoided_device_allocations, 381);
        assert_eq!(plan.avoided_host_fences, 127);
        assert_eq!(plan.host_readback_bytes, 2_048);
        assert!(plan.graph_topology_resident);
        assert!(plan.scratch_reused);
        assert!(plan.final_only_host_readback);
    }

    /// The speedup sample copies evidence through, restores the repetition
    /// count, and reports zero timed uploads, allocations, and syncs.
    #[test]
    fn resident_graph_session_builds_release_speedup_sample_without_timed_pollution() {
        let plan = plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
            graph_layout_hash: 0xabc,
            graph_bytes: 1_048_576,
            run_count: 128,
            per_run_frontier_bytes: 4_096,
            reusable_scratch_bytes: 65_536,
            per_run_output_bytes: 2_048,
            budget_bytes: 2_000_000,
            readback: CudaResidentGraphReadback::FinalOnly,
        })
        .expect("Fix: resident graph session should fit");

        let sample = resident_graph_session_speedup_sample(CudaResidentGraphSessionEvidence {
            backend_id: crate::CUDA_BACKEND_ID,
            device_ordinal: 0,
            device_memory_bytes: 32 * 1024 * 1024 * 1024,
            compute_capability_major: 12,
            compute_capability_minor: 0,
            graph_nodes: 10_000,
            graph_edges: 80_000,
            plan,
            host_orchestrated_ns: 1_000_000.0,
            resident_megakernel_ns: 10_000.0,
            setup_ns: 250_000.0,
        })
        .expect("Fix: resident final-only plan should produce release evidence");

        assert_eq!(sample.backend_id, crate::CUDA_BACKEND_ID);
        assert_eq!(sample.device_memory_bytes, 32 * 1024 * 1024 * 1024);
        assert_eq!(sample.compute_capability_major, 12);
        assert_eq!(sample.graph_nodes, 10_000);
        assert_eq!(sample.graph_edges, 80_000);
        assert_eq!(sample.repetitions, 128);
        assert_eq!(sample.timed_graph_uploads, 0);
        assert_eq!(sample.timed_host_allocations, 0);
        assert_eq!(sample.timed_host_syncs, 0);
    }

    /// Two evidence records format to a header plus two CSV rows, and the CSV
    /// round-trips through the verifier to the same proof.
    #[test]
    fn resident_graph_session_formats_validated_release_speedup_csv() {
        let plan_a = plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
            graph_layout_hash: 0xabc,
            graph_bytes: 1_048_576,
            run_count: 128,
            per_run_frontier_bytes: 4_096,
            reusable_scratch_bytes: 65_536,
            per_run_output_bytes: 2_048,
            budget_bytes: 2_000_000,
            readback: CudaResidentGraphReadback::FinalOnly,
        })
        .expect("Fix: first resident graph session should fit");
        let plan_b = plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
            graph_layout_hash: 0xdef,
            graph_bytes: 2_097_152,
            run_count: 256,
            per_run_frontier_bytes: 8_192,
            reusable_scratch_bytes: 131_072,
            per_run_output_bytes: 4_096,
            budget_bytes: 4_000_000,
            readback: CudaResidentGraphReadback::FinalOnly,
        })
        .expect("Fix: second resident graph session should fit");
        let evidence = [
            CudaResidentGraphSessionEvidence {
                backend_id: crate::CUDA_BACKEND_ID,
                device_ordinal: 0,
                device_memory_bytes: 32 * 1024 * 1024 * 1024,
                compute_capability_major: 12,
                compute_capability_minor: 0,
                graph_nodes: 10_000,
                graph_edges: 80_000,
                plan: plan_a,
                host_orchestrated_ns: 1_000_000.0,
                resident_megakernel_ns: 10_000.0,
                setup_ns: 250_000.0,
            },
            CudaResidentGraphSessionEvidence {
                backend_id: crate::CUDA_BACKEND_ID,
                device_ordinal: 0,
                device_memory_bytes: 32 * 1024 * 1024 * 1024,
                compute_capability_major: 12,
                compute_capability_minor: 0,
                graph_nodes: 20_000,
                graph_edges: 160_000,
                plan: plan_b,
                host_orchestrated_ns: 2_500_000.0,
                resident_megakernel_ns: 20_000.0,
                setup_ns: 350_000.0,
            },
        ];

        let (proof, csv) =
            format_validated_cuda_resident_graph_session_evidence_csv(&evidence, 100.0)
                .expect("Fix: resident graph release evidence should format as validated CSV");
        let reparsed = crate::validate_cuda_megakernel_speedup_evidence_csv(&csv, 100.0)
            .expect("Fix: resident graph release CSV should roundtrip through verifier");

        assert_eq!(proof, reparsed);
        assert_eq!(proof.sample_count, 2);
        assert_eq!(proof.min_speedup_x, 100.0);
        assert_eq!(proof.max_speedup_x, 125.0);
        assert_eq!(csv.lines().count(), 3);
    }

    /// Per-run readback is rejected even when everything else is valid.
    #[test]
    fn resident_graph_session_rejects_host_orchestration_shape() {
        assert_eq!(
            plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
                graph_layout_hash: 1,
                graph_bytes: 128,
                run_count: 2,
                per_run_frontier_bytes: 16,
                reusable_scratch_bytes: 16,
                per_run_output_bytes: 16,
                budget_bytes: 1_024,
                readback: CudaResidentGraphReadback::PerRun,
            })
            .expect_err("per-run readback should fail"),
            CudaResidentGraphSessionError::PerRunReadbackRejected
        );
    }

    /// Zero hash, zero runs, and an undersized budget each map to their own error.
    #[test]
    fn resident_graph_session_rejects_invalid_inputs_and_budget() {
        assert_eq!(
            plan_cuda_resident_graph_session(profile(0, 128, 1, 16, 16, 16, 1_024))
                .expect_err("zero hash should fail"),
            CudaResidentGraphSessionError::ZeroGraphHash
        );
        assert_eq!(
            plan_cuda_resident_graph_session(profile(1, 128, 0, 16, 16, 16, 1_024))
                .expect_err("zero runs should fail"),
            CudaResidentGraphSessionError::ZeroRuns
        );
        assert_eq!(
            plan_cuda_resident_graph_session(profile(1, 128, 1, 16, 16, 16, 127))
                .expect_err("over-budget session should fail"),
            CudaResidentGraphSessionError::OverBudget {
                required_bytes: 176,
                budget_bytes: 127,
            }
        );
    }

    /// Source-level guard: sample staging must go through the fallible
    /// reservation helper this file imports, not an infallible preallocation.
    ///
    /// The needles are assembled with `concat!` so the literals in this test
    /// body cannot satisfy (or violate) the assertions by themselves.
    #[test]
    fn resident_graph_session_evidence_uses_fallible_sample_staging() {
        let source = include_str!("resident_graph_session.rs");

        assert!(source.contains(concat!(
            "use crate::backend::staging_reserve::",
            "reserved_vec;"
        )));
        assert!(source.contains(concat!("SampleReserve", "Failed")));
        assert!(!source.contains(concat!("Vec", "::with_capacity(evidence.len())")));
    }

    /// Builds a `FinalOnly` profile from positional sizes to keep the
    /// rejection tests compact.
    fn profile(
        graph_layout_hash: u64,
        graph_bytes: u64,
        run_count: u64,
        per_run_frontier_bytes: u64,
        reusable_scratch_bytes: u64,
        per_run_output_bytes: u64,
        budget_bytes: u64,
    ) -> CudaResidentGraphSessionProfile {
        CudaResidentGraphSessionProfile {
            graph_layout_hash,
            graph_bytes,
            run_count,
            per_run_frontier_bytes,
            reusable_scratch_bytes,
            per_run_output_bytes,
            budget_bytes,
            readback: CudaResidentGraphReadback::FinalOnly,
        }
    }
}
614