Skip to main content

vyre_driver_cuda/
resident_graph_session.rs

1//! CUDA resident graph session planning.
2//!
3//! Repeated fixed-graph execution is the release path for dataflow analyses
4//! and frontend graph passes. The graph topology must be uploaded once, kept
5//! resident, and reused across runs. This module plans the steady-state memory
6//! envelope and quantifies the upload/allocation/fence work removed by keeping
7//! graph state resident.
8
9use crate::backend::accounting::{
10    checked_add_u64_count as checked_add, checked_mul_u64_count as checked_mul,
11    CudaArithmeticOverflow,
12};
13use crate::backend::staging_reserve::reserved_vec;
14use crate::megakernel_speedup_gate::{
15    format_validated_cuda_megakernel_speedup_evidence_csv, CudaMegakernelSpeedupGateError,
16    CudaMegakernelSpeedupProof, CudaMegakernelSpeedupSample,
17};
18use vyre_driver::ResidentGraphReuseTelemetry;
19
/// Policy describing when the host reads results back from a CUDA resident
/// graph session.
///
/// The session planner in this module only accepts [`Self::FinalOnly`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaResidentGraphReadback {
    /// A single readback of the final output once every repeated run is done.
    FinalOnly,
    /// A readback after each individual run.
    PerRun,
}
28
29/// Input profile for repeated execution over one resident CUDA graph.
30#[derive(Clone, Copy, Debug, Eq, PartialEq)]
31pub struct CudaResidentGraphSessionProfile {
32    /// Stable normalized graph layout hash.
33    pub graph_layout_hash: u64,
34    /// Bytes required for resident graph topology and immutable metadata.
35    pub graph_bytes: u64,
36    /// Number of repeated executions over the same graph.
37    pub run_count: u64,
38    /// Frontier/input bytes refreshed each run.
39    pub per_run_frontier_bytes: u64,
40    /// Scratch bytes reused across runs.
41    pub reusable_scratch_bytes: u64,
42    /// Meaningful output bytes produced per run.
43    pub per_run_output_bytes: u64,
44    /// Explicit CUDA memory budget.
45    pub budget_bytes: u64,
46    /// Host readback policy.
47    pub readback: CudaResidentGraphReadback,
48}
49
50/// CUDA resident graph session plan.
51#[derive(Clone, Copy, Debug, Eq, PartialEq)]
52pub struct CudaResidentGraphSessionPlan {
53    /// Stable normalized graph layout hash.
54    pub graph_layout_hash: u64,
55    /// Bytes uploaded once at session start.
56    pub one_time_graph_upload_bytes: u64,
57    /// Bytes refreshed across all runs.
58    pub total_frontier_refresh_bytes: u64,
59    /// Peak bytes resident on device.
60    pub peak_resident_bytes: u64,
61    /// Bytes avoided versus uploading graph topology before every run.
62    pub avoided_graph_upload_bytes: u64,
63    /// Backend-neutral graph upload/reuse telemetry for the session.
64    pub graph_reuse: ResidentGraphReuseTelemetry,
65    /// Device allocations avoided versus allocating graph/scratch/output per run.
66    pub avoided_device_allocations: u64,
67    /// Host fences avoided versus per-run readback.
68    pub avoided_host_fences: u64,
69    /// Host readback bytes after session planning.
70    pub host_readback_bytes: u64,
71    /// Whether the plan keeps graph topology resident.
72    pub graph_topology_resident: bool,
73    /// Whether scratch allocation is reused across runs.
74    pub scratch_reused: bool,
75    /// Whether host readback happens once at the end.
76    pub final_only_host_readback: bool,
77}
78
79/// Release evidence profile for a measured resident graph session.
80#[derive(Clone, Copy, Debug, PartialEq)]
81pub struct CudaResidentGraphSessionEvidence {
82    /// Backend identifier that produced the measured session.
83    pub backend_id: &'static str,
84    /// Physical CUDA device ordinal that produced the measured session.
85    pub device_ordinal: u64,
86    /// Probed CUDA device memory in bytes.
87    pub device_memory_bytes: u64,
88    /// Probed CUDA compute capability major version.
89    pub compute_capability_major: u32,
90    /// Probed CUDA compute capability minor version.
91    pub compute_capability_minor: u32,
92    /// Logical graph nodes in the measured workload.
93    pub graph_nodes: u64,
94    /// Logical graph edges in the measured workload.
95    pub graph_edges: u64,
96    /// Planned resident session.
97    pub plan: CudaResidentGraphSessionPlan,
98    /// Naive host-orchestrated execution time in nanoseconds.
99    pub host_orchestrated_ns: f64,
100    /// Resident megakernel execution time in nanoseconds.
101    pub resident_megakernel_ns: f64,
102    /// Setup time measured outside the timed region.
103    pub setup_ns: f64,
104}
105
/// Reasons a CUDA resident graph session cannot be planned or turned into
/// release evidence.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CudaResidentGraphSessionError {
    /// The graph layout hash was zero.
    ZeroGraphHash,
    /// The graph reported zero resident bytes.
    ZeroGraphBytes,
    /// The run count was zero.
    ZeroRuns,
    /// The explicit CUDA memory budget was zero.
    ZeroBudget,
    /// Per-run host readback was requested; it would bring CPU orchestration
    /// back into the loop.
    PerRunReadbackRejected,
    /// A byte-count computation overflowed.
    ByteCountOverflow {
        /// Name of the quantity that was being computed.
        field: &'static str,
    },
    /// The peak resident footprint is larger than the explicit budget.
    OverBudget {
        /// Resident bytes the session needs.
        required_bytes: u64,
        /// Budget supplied by the caller.
        budget_bytes: u64,
    },
    /// The evidence's plan is not a final-only resident execution.
    NonResidentEvidence,
}
134
135/// Error while converting resident graph session evidence into release CSV.
136#[derive(Clone, Debug, PartialEq)]
137pub enum CudaResidentGraphSessionEvidenceError {
138    /// Resident session evidence was not a valid final-only resident session.
139    Session(CudaResidentGraphSessionError),
140    /// Megakernel speedup release gate rejected the converted samples.
141    Speedup(CudaMegakernelSpeedupGateError),
142    /// Resident session evidence sample staging could not reserve enough slots.
143    SampleReserveFailed {
144        /// Required sample capacity.
145        capacity: usize,
146        /// Allocator/backend error message.
147        message: String,
148    },
149}
150
151impl std::fmt::Display for CudaResidentGraphSessionError {
152    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
153        match self {
154            Self::ZeroGraphHash => write!(
155                f,
156                "CUDA resident graph session received graph_layout_hash=0. Fix: normalize and hash graph topology before session planning."
157            ),
158            Self::ZeroGraphBytes => write!(
159                f,
160                "CUDA resident graph session received graph_bytes=0. Fix: pass the concrete resident graph topology byte count."
161            ),
162            Self::ZeroRuns => write!(
163                f,
164                "CUDA resident graph session received run_count=0. Fix: plan only non-empty repeated execution sessions."
165            ),
166            Self::ZeroBudget => write!(
167                f,
168                "CUDA resident graph session received budget_bytes=0. Fix: pass an explicit CUDA memory budget."
169            ),
170            Self::PerRunReadbackRejected => write!(
171                f,
172                "CUDA resident graph session rejected per-run readback. Fix: compact final outputs on device and read back once after repeated execution."
173            ),
174            Self::ByteCountOverflow { field } => write!(
175                f,
176                "CUDA resident graph session overflowed while computing {field}. Fix: shard repeated graph execution before planning."
177            ),
178            Self::OverBudget {
179                required_bytes,
180                budget_bytes,
181            } => write!(
182                f,
183                "CUDA resident graph session requires {required_bytes} bytes but budget allows {budget_bytes}. Fix: reduce frontier/output size, reuse compact outputs, or shard the graph."
184            ),
185            Self::NonResidentEvidence => write!(
186                f,
187                "CUDA resident graph session evidence is not final-only resident execution. Fix: build evidence from a plan with resident topology, reused scratch, and one final readback."
188            ),
189        }
190    }
191}
192
/// Marks the planning error as a standard error type; no trait methods are
/// overridden here, so the defaults apply.
impl std::error::Error for CudaResidentGraphSessionError {}
194
195impl CudaArithmeticOverflow for CudaResidentGraphSessionError {
196    fn arithmetic_overflow(field: &'static str) -> Self {
197        Self::ByteCountOverflow { field }
198    }
199}
200
201impl std::fmt::Display for CudaResidentGraphSessionEvidenceError {
202    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
203        match self {
204            Self::Session(error) => write!(f, "{error}"),
205            Self::Speedup(error) => write!(f, "{error}"),
206            Self::SampleReserveFailed { capacity, message } => write!(
207                f,
208                "CUDA resident graph session evidence could not reserve {capacity} release sample slot(s): {message}. Fix: split the release evidence batch before formatting."
209            ),
210        }
211    }
212}
213
/// Marks the evidence error as a standard error type; no trait methods are
/// overridden here, so the defaults apply.
impl std::error::Error for CudaResidentGraphSessionEvidenceError {}
215
216impl From<CudaResidentGraphSessionError> for CudaResidentGraphSessionEvidenceError {
217    fn from(error: CudaResidentGraphSessionError) -> Self {
218        Self::Session(error)
219    }
220}
221
222impl From<CudaMegakernelSpeedupGateError> for CudaResidentGraphSessionEvidenceError {
223    fn from(error: CudaMegakernelSpeedupGateError) -> Self {
224        Self::Speedup(error)
225    }
226}
227
228/// Plan a repeated CUDA execution session over one resident graph.
229pub fn plan_cuda_resident_graph_session(
230    profile: CudaResidentGraphSessionProfile,
231) -> Result<CudaResidentGraphSessionPlan, CudaResidentGraphSessionError> {
232    if profile.graph_layout_hash == 0 {
233        return Err(CudaResidentGraphSessionError::ZeroGraphHash);
234    }
235    if profile.graph_bytes == 0 {
236        return Err(CudaResidentGraphSessionError::ZeroGraphBytes);
237    }
238    if profile.run_count == 0 {
239        return Err(CudaResidentGraphSessionError::ZeroRuns);
240    }
241    if profile.budget_bytes == 0 {
242        return Err(CudaResidentGraphSessionError::ZeroBudget);
243    }
244    if profile.readback != CudaResidentGraphReadback::FinalOnly {
245        return Err(CudaResidentGraphSessionError::PerRunReadbackRejected);
246    }
247    if profile.run_count == 1 {
248        let graph_plus_frontier = checked_add(
249            profile.graph_bytes,
250            profile.per_run_frontier_bytes,
251            "graph plus frontier bytes",
252        )?;
253        let with_scratch = checked_add(
254            graph_plus_frontier,
255            profile.reusable_scratch_bytes,
256            "graph frontier scratch bytes",
257        )?;
258        let peak_resident_bytes = checked_add(
259            with_scratch,
260            profile.per_run_output_bytes,
261            "peak resident bytes",
262        )?;
263        if peak_resident_bytes > profile.budget_bytes {
264            return Err(CudaResidentGraphSessionError::OverBudget {
265                required_bytes: peak_resident_bytes,
266                budget_bytes: profile.budget_bytes,
267            });
268        }
269        return Ok(CudaResidentGraphSessionPlan {
270            graph_layout_hash: profile.graph_layout_hash,
271            one_time_graph_upload_bytes: profile.graph_bytes,
272            total_frontier_refresh_bytes: profile.per_run_frontier_bytes,
273            peak_resident_bytes,
274            avoided_graph_upload_bytes: 0,
275            graph_reuse: ResidentGraphReuseTelemetry::cold_upload(profile.graph_bytes),
276            avoided_device_allocations: 0,
277            avoided_host_fences: 0,
278            host_readback_bytes: profile.per_run_output_bytes,
279            graph_topology_resident: true,
280            scratch_reused: true,
281            final_only_host_readback: true,
282        });
283    }
284
285    let graph_plus_frontier = checked_add(
286        profile.graph_bytes,
287        profile.per_run_frontier_bytes,
288        "graph plus frontier bytes",
289    )?;
290    let with_scratch = checked_add(
291        graph_plus_frontier,
292        profile.reusable_scratch_bytes,
293        "graph frontier scratch bytes",
294    )?;
295    let peak_resident_bytes = checked_add(
296        with_scratch,
297        profile.per_run_output_bytes,
298        "peak resident bytes",
299    )?;
300    if peak_resident_bytes > profile.budget_bytes {
301        return Err(CudaResidentGraphSessionError::OverBudget {
302            required_bytes: peak_resident_bytes,
303            budget_bytes: profile.budget_bytes,
304        });
305    }
306
307    let total_frontier_refresh_bytes = checked_mul(
308        profile.run_count,
309        profile.per_run_frontier_bytes,
310        "total frontier refresh bytes",
311    )?;
312    let repeated_runs = profile.run_count - 1;
313    let avoided_graph_upload_bytes = checked_mul(
314        repeated_runs,
315        profile.graph_bytes,
316        "avoided graph upload bytes",
317    )?;
318    let avoided_device_allocations = checked_mul(repeated_runs, 3, "avoided allocations")?;
319
320    Ok(CudaResidentGraphSessionPlan {
321        graph_layout_hash: profile.graph_layout_hash,
322        one_time_graph_upload_bytes: profile.graph_bytes,
323        total_frontier_refresh_bytes,
324        peak_resident_bytes,
325        avoided_graph_upload_bytes,
326        graph_reuse: ResidentGraphReuseTelemetry::from_counters(
327            1,
328            repeated_runs,
329            profile.graph_bytes,
330            avoided_graph_upload_bytes,
331        ),
332        avoided_device_allocations,
333        avoided_host_fences: repeated_runs,
334        host_readback_bytes: profile.per_run_output_bytes,
335        graph_topology_resident: true,
336        scratch_reused: true,
337        final_only_host_readback: true,
338    })
339}
340
341/// Convert a planned resident graph session measurement into the release
342/// megakernel speedup sample schema.
343pub fn resident_graph_session_speedup_sample(
344    evidence: CudaResidentGraphSessionEvidence,
345) -> Result<CudaMegakernelSpeedupSample, CudaResidentGraphSessionError> {
346    if !evidence.plan.graph_topology_resident
347        || !evidence.plan.scratch_reused
348        || !evidence.plan.final_only_host_readback
349    {
350        return Err(CudaResidentGraphSessionError::NonResidentEvidence);
351    }
352    Ok(CudaMegakernelSpeedupSample {
353        backend_id: evidence.backend_id,
354        device_ordinal: evidence.device_ordinal,
355        device_memory_bytes: evidence.device_memory_bytes,
356        compute_capability_major: evidence.compute_capability_major,
357        compute_capability_minor: evidence.compute_capability_minor,
358        graph_nodes: evidence.graph_nodes,
359        graph_edges: evidence.graph_edges,
360        repetitions: checked_add(evidence.plan.avoided_host_fences, 1, "evidence repetitions")?,
361        host_orchestrated_ns: evidence.host_orchestrated_ns,
362        resident_megakernel_ns: evidence.resident_megakernel_ns,
363        setup_ns: evidence.setup_ns,
364        timed_graph_uploads: 0,
365        timed_host_allocations: 0,
366        timed_host_syncs: 0,
367    })
368}
369
370/// Convert measured resident graph sessions into the exact validated CUDA
371/// megakernel release CSV artifact.
372pub fn format_validated_cuda_resident_graph_session_evidence_csv(
373    evidence: &[CudaResidentGraphSessionEvidence],
374    required_speedup_x: f64,
375) -> Result<(CudaMegakernelSpeedupProof, String), CudaResidentGraphSessionEvidenceError> {
376    let mut samples = reserved_vec(
377        evidence.len(),
378        "cuda resident graph session release samples",
379    )
380    .map_err(
381        |error| CudaResidentGraphSessionEvidenceError::SampleReserveFailed {
382            capacity: evidence.len(),
383            message: error.to_string(),
384        },
385    )?;
386    for item in evidence {
387        samples.push(resident_graph_session_speedup_sample(*item)?);
388    }
389    format_validated_cuda_megakernel_speedup_evidence_csv(&samples, required_speedup_x)
390        .map_err(CudaResidentGraphSessionEvidenceError::Speedup)
391}
392
#[cfg(test)]
mod tests {
    use super::*;

    // The source-scanning tests below read this very file, test module
    // included. Every needle is therefore assembled with `concat!` so the
    // needle's own literal can never satisfy (or violate) the assertion; only
    // the real production code can.

    /// The module must use the shared checked arithmetic helpers instead of
    /// defining private copies.
    #[test]
    fn resident_graph_session_uses_shared_typed_cuda_arithmetic() {
        let source = include_str!("resident_graph_session.rs");

        assert!(source.contains(concat!("checked_add_u64_count as ", "checked_add")));
        assert!(source.contains(concat!("checked_mul_u64_count as ", "checked_mul")));
        assert!(source.contains(concat!(
            "impl CudaArithmeticOverflow for ",
            "CudaResidentGraphSessionError"
        )));
        assert!(!source.contains(concat!("fn checked_", "mul(")));
        assert!(!source.contains(concat!("fn checked_", "add(")));
    }

    /// A multi-run session amortizes graph upload, allocations, and fences
    /// over every run after the first.
    #[test]
    fn resident_graph_session_amortizes_fixed_graph_repeated_execution() {
        let plan = plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
            graph_layout_hash: 0xabc,
            graph_bytes: 1_048_576,
            run_count: 128,
            per_run_frontier_bytes: 4_096,
            reusable_scratch_bytes: 65_536,
            per_run_output_bytes: 2_048,
            budget_bytes: 2_000_000,
            readback: CudaResidentGraphReadback::FinalOnly,
        })
        .expect("Fix: resident graph session should fit");

        assert_eq!(plan.one_time_graph_upload_bytes, 1_048_576);
        assert_eq!(plan.total_frontier_refresh_bytes, 524_288);
        assert_eq!(plan.avoided_graph_upload_bytes, 133_169_152);
        assert_eq!(
            plan.graph_reuse,
            ResidentGraphReuseTelemetry::from_counters(1, 127, 1_048_576, 133_169_152)
        );
        assert_eq!(plan.avoided_device_allocations, 381);
        assert_eq!(plan.avoided_host_fences, 127);
        assert_eq!(plan.host_readback_bytes, 2_048);
        assert!(plan.graph_topology_resident);
        assert!(plan.scratch_reused);
        assert!(plan.final_only_host_readback);
    }

    /// A single-run session has nothing to amortize: all savings are zero and
    /// the reuse telemetry is a cold upload.
    #[test]
    fn resident_graph_session_single_run_reports_no_amortization() {
        let plan = plan_cuda_resident_graph_session(profile(1, 128, 1, 16, 16, 16, 1_024))
            .expect("Fix: single-run resident graph session should fit");

        assert_eq!(plan.one_time_graph_upload_bytes, 128);
        assert_eq!(plan.total_frontier_refresh_bytes, 16);
        assert_eq!(plan.peak_resident_bytes, 176);
        assert_eq!(plan.avoided_graph_upload_bytes, 0);
        assert_eq!(
            plan.graph_reuse,
            ResidentGraphReuseTelemetry::cold_upload(128)
        );
        assert_eq!(plan.avoided_device_allocations, 0);
        assert_eq!(plan.avoided_host_fences, 0);
        assert_eq!(plan.host_readback_bytes, 16);
    }

    /// Converting evidence into a speedup sample keeps the workload identity
    /// and reports zero timed uploads, allocations, and syncs.
    #[test]
    fn resident_graph_session_builds_release_speedup_sample_without_timed_pollution() {
        let plan = plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
            graph_layout_hash: 0xabc,
            graph_bytes: 1_048_576,
            run_count: 128,
            per_run_frontier_bytes: 4_096,
            reusable_scratch_bytes: 65_536,
            per_run_output_bytes: 2_048,
            budget_bytes: 2_000_000,
            readback: CudaResidentGraphReadback::FinalOnly,
        })
        .expect("Fix: resident graph session should fit");

        let sample = resident_graph_session_speedup_sample(CudaResidentGraphSessionEvidence {
            backend_id: crate::CUDA_BACKEND_ID,
            device_ordinal: 0,
            device_memory_bytes: 32 * 1024 * 1024 * 1024,
            compute_capability_major: 12,
            compute_capability_minor: 0,
            graph_nodes: 10_000,
            graph_edges: 80_000,
            plan,
            host_orchestrated_ns: 1_000_000.0,
            resident_megakernel_ns: 10_000.0,
            setup_ns: 250_000.0,
        })
        .expect("Fix: resident final-only plan should produce release evidence");

        assert_eq!(sample.backend_id, crate::CUDA_BACKEND_ID);
        assert_eq!(sample.device_memory_bytes, 32 * 1024 * 1024 * 1024);
        assert_eq!(sample.compute_capability_major, 12);
        assert_eq!(sample.graph_nodes, 10_000);
        assert_eq!(sample.graph_edges, 80_000);
        assert_eq!(sample.repetitions, 128);
        assert_eq!(sample.timed_graph_uploads, 0);
        assert_eq!(sample.timed_host_allocations, 0);
        assert_eq!(sample.timed_host_syncs, 0);
    }

    /// Evidence whose plan is not fully resident must be refused.
    #[test]
    fn resident_graph_session_rejects_non_resident_evidence() {
        let mut plan = plan_cuda_resident_graph_session(profile(1, 128, 2, 16, 16, 16, 1_024))
            .expect("Fix: resident graph session should fit");
        plan.scratch_reused = false;

        let result = resident_graph_session_speedup_sample(CudaResidentGraphSessionEvidence {
            backend_id: crate::CUDA_BACKEND_ID,
            device_ordinal: 0,
            device_memory_bytes: 32 * 1024 * 1024 * 1024,
            compute_capability_major: 12,
            compute_capability_minor: 0,
            graph_nodes: 10,
            graph_edges: 20,
            plan,
            host_orchestrated_ns: 1_000_000.0,
            resident_megakernel_ns: 10_000.0,
            setup_ns: 250_000.0,
        });

        assert!(matches!(
            result,
            Err(CudaResidentGraphSessionError::NonResidentEvidence)
        ));
    }

    /// The CSV produced from evidence must round-trip through the release
    /// verifier and yield the same proof.
    #[test]
    fn resident_graph_session_formats_validated_release_speedup_csv() {
        let plan_a = plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
            graph_layout_hash: 0xabc,
            graph_bytes: 1_048_576,
            run_count: 128,
            per_run_frontier_bytes: 4_096,
            reusable_scratch_bytes: 65_536,
            per_run_output_bytes: 2_048,
            budget_bytes: 2_000_000,
            readback: CudaResidentGraphReadback::FinalOnly,
        })
        .expect("Fix: first resident graph session should fit");
        let plan_b = plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
            graph_layout_hash: 0xdef,
            graph_bytes: 2_097_152,
            run_count: 256,
            per_run_frontier_bytes: 8_192,
            reusable_scratch_bytes: 131_072,
            per_run_output_bytes: 4_096,
            budget_bytes: 4_000_000,
            readback: CudaResidentGraphReadback::FinalOnly,
        })
        .expect("Fix: second resident graph session should fit");
        let evidence = [
            CudaResidentGraphSessionEvidence {
                backend_id: crate::CUDA_BACKEND_ID,
                device_ordinal: 0,
                device_memory_bytes: 32 * 1024 * 1024 * 1024,
                compute_capability_major: 12,
                compute_capability_minor: 0,
                graph_nodes: 10_000,
                graph_edges: 80_000,
                plan: plan_a,
                host_orchestrated_ns: 1_000_000.0,
                resident_megakernel_ns: 10_000.0,
                setup_ns: 250_000.0,
            },
            CudaResidentGraphSessionEvidence {
                backend_id: crate::CUDA_BACKEND_ID,
                device_ordinal: 0,
                device_memory_bytes: 32 * 1024 * 1024 * 1024,
                compute_capability_major: 12,
                compute_capability_minor: 0,
                graph_nodes: 20_000,
                graph_edges: 160_000,
                plan: plan_b,
                host_orchestrated_ns: 2_500_000.0,
                resident_megakernel_ns: 20_000.0,
                setup_ns: 350_000.0,
            },
        ];

        let (proof, csv) =
            format_validated_cuda_resident_graph_session_evidence_csv(&evidence, 100.0)
                .expect("Fix: resident graph release evidence should format as validated CSV");
        let reparsed = crate::validate_cuda_megakernel_speedup_evidence_csv(&csv, 100.0)
            .expect("Fix: resident graph release CSV should roundtrip through verifier");

        assert_eq!(proof, reparsed);
        assert_eq!(proof.sample_count, 2);
        assert_eq!(proof.min_speedup_x, 100.0);
        assert_eq!(proof.max_speedup_x, 125.0);
        assert_eq!(csv.lines().count(), 3);
    }

    /// Per-run readback is a host-orchestration shape and must be refused.
    #[test]
    fn resident_graph_session_rejects_host_orchestration_shape() {
        assert_eq!(
            plan_cuda_resident_graph_session(CudaResidentGraphSessionProfile {
                graph_layout_hash: 1,
                graph_bytes: 128,
                run_count: 2,
                per_run_frontier_bytes: 16,
                reusable_scratch_bytes: 16,
                per_run_output_bytes: 16,
                budget_bytes: 1_024,
                readback: CudaResidentGraphReadback::PerRun,
            })
            .expect_err("per-run readback should fail"),
            CudaResidentGraphSessionError::PerRunReadbackRejected
        );
    }

    /// Every zero-valued required input and an over-budget footprint must be
    /// refused with the matching error.
    #[test]
    fn resident_graph_session_rejects_invalid_inputs_and_budget() {
        assert_eq!(
            plan_cuda_resident_graph_session(profile(0, 128, 1, 16, 16, 16, 1_024))
                .expect_err("zero hash should fail"),
            CudaResidentGraphSessionError::ZeroGraphHash
        );
        assert_eq!(
            plan_cuda_resident_graph_session(profile(1, 0, 1, 16, 16, 16, 1_024))
                .expect_err("zero graph bytes should fail"),
            CudaResidentGraphSessionError::ZeroGraphBytes
        );
        assert_eq!(
            plan_cuda_resident_graph_session(profile(1, 128, 0, 16, 16, 16, 1_024))
                .expect_err("zero runs should fail"),
            CudaResidentGraphSessionError::ZeroRuns
        );
        assert_eq!(
            plan_cuda_resident_graph_session(profile(1, 128, 1, 16, 16, 16, 0))
                .expect_err("zero budget should fail"),
            CudaResidentGraphSessionError::ZeroBudget
        );
        assert_eq!(
            plan_cuda_resident_graph_session(profile(1, 128, 1, 16, 16, 16, 127))
                .expect_err("over-budget session should fail"),
            CudaResidentGraphSessionError::OverBudget {
                required_bytes: 176,
                budget_bytes: 127,
            }
        );
    }

    /// Sample staging must go through the fallible reservation helper rather
    /// than an infallible pre-sized vector.
    #[test]
    fn resident_graph_session_evidence_uses_fallible_sample_staging() {
        let source = include_str!("resident_graph_session.rs");

        assert!(source.contains(concat!(
            "use crate::backend::staging_reserve::",
            "reserved_vec;"
        )));
        assert!(source.contains("SampleReserveFailed"));
        assert!(!source.contains(concat!("Vec", "::with_capacity(evidence.len())")));
    }

    /// Build a final-only readback profile from positional byte counts.
    fn profile(
        graph_layout_hash: u64,
        graph_bytes: u64,
        run_count: u64,
        per_run_frontier_bytes: u64,
        reusable_scratch_bytes: u64,
        per_run_output_bytes: u64,
        budget_bytes: u64,
    ) -> CudaResidentGraphSessionProfile {
        CudaResidentGraphSessionProfile {
            graph_layout_hash,
            graph_bytes,
            run_count,
            per_run_frontier_bytes,
            reusable_scratch_bytes,
            per_run_output_bytes,
            budget_bytes,
            readback: CudaResidentGraphReadback::FinalOnly,
        }
    }
}
614