Skip to main content

vyre_driver_cuda/
lib.rs

1//! # vyre-driver-cuda  -  CUDA/PTX backend for vyre
2//!
3//! Implements [`VyreBackend`] via the CUDA driver API through `cudarc`.
4//! Translates vyre `Program` IR into PTX kernels, loads them through
5//! the CUDA driver JIT, and dispatches on NVIDIA GPUs.
6//!
7//! The backend registers itself as `"cuda"` in the vyre backend registry
8//! via `inventory::submit!` so `vyre::registered_backends()` enumerates
9//! it alongside `wgpu`, `spirv`, etc.
10//!
11//! ## Architecture
12//!
13//! ```text
14//!    Program ─► PTX emitter ─► cuModuleLoadData ─► cuLaunchKernel
15//! ```
16//!
17#![deny(missing_docs)]
18// CUDA driver bindings (`cudarc::driver::sys::cu*`) are inherently unsafe FFI;
19// every call site is the boundary between safe vyre code and the CUDA driver
20// API. Allow `unsafe` here so the rest of the workspace can keep
21// `unsafe_code = "deny"` while this backend wraps cudarc properly with
22// per-call Safety: comments enforced by `check_unsafe_justifications.sh`.
23#![allow(unsafe_code)]
24
25mod aot_launcher;
26/// CUDA backend core: device management and dispatch.
27pub mod backend;
28/// Benchmark-driven CUDA optimization pass selection.
29pub mod benchmark_pass_selection;
30/// PTX code generation from vyre IR.
31pub mod codegen;
32/// CUDA device capability probing.
33pub mod device;
34/// Device-side diagnostic aggregation and compact readback planning.
35pub mod device_diagnostic_aggregation;
36/// Device-side work queue planning for dependent dataflow.
37pub mod device_work_queue;
38/// CUDA upload planning for GPU e-graph device images.
39pub mod egraph_device_image;
40/// CUDA launch-wave planning for resident e-graph device images.
41pub mod egraph_kernel_plan;
42mod egraph_readback;
43/// Adapter from frontier-typed IR plans to CUDA frontier wave envelopes.
44pub mod frontier_typed_ir_adapter;
45mod input_identity;
46mod instrumentation;
47/// Cross-process persistent CUDA JIT cache wiring (E4 + E5): configures
48/// the NVIDIA driver's built-in disk cache at backend bring-up so the
49/// JIT-compiled cuBINs persist across runs and are shared across every
50/// vyre process on the host.
51pub mod jit_cache;
52/// Actionable CUDA kernel capability diagnostics.
53pub mod kernel_failure_diagnostics;
54/// Adjacent-stage CUDA launch fusion planning.
55pub mod launch_fusion;
56/// Bounded CUDA megakernel plan cache keyed by graph, analysis, device, and
57/// runtime pressure buckets.
58pub mod megakernel_plan_cache;
59/// Multi-query CUDA execution planning over shared resident graphs.
60pub mod multi_query_execution;
61mod numeric;
62/// Occupancy-aware empirical autotuning (I4): pure estimator that picks
63/// the workgroup size with the highest predicted hardware occupancy from
64/// `(CudaDeviceCaps, KernelResourceUsage)`. The runtime feeds the result
65/// into `AutotuneStore` (I3) so subsequent dispatches reuse the choice.
66pub mod occupancy;
67/// Self-hosted optimizer GPU dispatcher  -  runs the
68/// `vyre-self-substrate::optimizer` passes (DCE, CSE, const-fold,
69/// validator) on CUDA. External parity tests reach in via the
70/// `CudaOptimizerDispatcher` re-export below.
71pub mod optimizer;
72mod pipeline;
73/// CUDA profiler range integration for Nsight/NVTX without mandatory NVTX linkage.
74pub mod profiler;
75/// Repeated execution over persistent CUDA-resident graph state.
76pub mod resident_graph_session;
77/// Compact result readback planning.
78pub mod result_compaction;
79mod stream;
80/// Synthetic CUDA device profiles for offline release-path planning.
81pub mod synthetic_device_caps;
82/// CUDA execution planning for unified token/fact graph frontier waves.
83pub mod token_fact_frontier_execution;
84/// Adapter from unified token/fact graph layouts to CUDA resident bytes.
85pub mod token_fact_graph_cuda_adapter;
86
87pub use backend::{
88    CudaBackend, CudaPtxSourceCacheSnapshot, CudaResidentBuffer, CudaTelemetrySnapshot,
89};
90pub use benchmark_pass_selection::{
91    select_cuda_benchmark_passes, select_cuda_benchmark_passes_with_scratch,
92    CudaBenchmarkPassCandidate, CudaBenchmarkPassSelectionError, CudaBenchmarkPassSelectionPlan,
93    CudaBenchmarkPassSelectionSample, CudaBenchmarkPassSelectionScratch,
94    CudaBenchmarkPassSkipReason, CudaSkippedBenchmarkPass,
95};
96pub use stream::CudaLaunchResourceCounts;
97/// CUDA megakernel global-barrier minimization for dependency-typed waves.
98pub mod megakernel_barrier_planner;
99/// CUDA megakernel convergence planning for iterative fixed-point analyses.
100pub mod megakernel_convergence;
101pub mod megakernel_scheduler;
102/// Release gate for steady-state CUDA megakernel speedup claims.
103pub mod megakernel_speedup_gate;
104pub use device::{CudaDeviceCaps, CudaDeviceHandle};
105pub use device_diagnostic_aggregation::{
106    plan_cuda_device_diagnostic_aggregation, plan_cuda_device_diagnostic_aggregation_with_scratch,
107    CudaDiagnosticAggregationError, CudaDiagnosticAggregationPlan,
108    CudaDiagnosticAggregationScratch, CudaDiagnosticCompactRange, CudaDiagnosticShard,
109};
110pub use device_work_queue::{
111    plan_cuda_device_work_queue, plan_cuda_device_work_queue_backpressure,
112    CudaDeviceWorkQueueBackpressurePlan, CudaDeviceWorkQueueDrainStrategy,
113    CudaDeviceWorkQueueError, CudaDeviceWorkQueuePlan, CudaDeviceWorkQueueProfile,
114    CudaWorkQueueHostSync,
115};
116pub use egraph_device_image::{
117    plan_cuda_egraph_device_upload, plan_cuda_egraph_device_upload_from_image,
118    plan_cuda_egraph_device_upload_from_image_ref, CudaEGraphDeviceBorrowedUploadPlan,
119    CudaEGraphDeviceByteLayout, CudaEGraphDeviceByteSpan, CudaEGraphDeviceKernelView,
120    CudaEGraphDeviceUploadError, CudaEGraphDeviceUploadPlan, CudaResidentEGraphDeviceImage,
121};
122pub use egraph_kernel_plan::{
123    collect_cuda_egraph_structural_equivalences, cuda_egraph_canonical_rewrite_kernel_ptx,
124    cuda_egraph_signature_pair_rows, cuda_egraph_signature_refresh_kernel_ptx,
125    cuda_egraph_structural_equivalence_kernel_ptx, pack_cuda_egraph_canonical_rewrite_device_image,
126    pack_cuda_egraph_signature_bucket_device_image, plan_cuda_egraph_kernel_work,
127    plan_cuda_egraph_signature_buckets, plan_cuda_egraph_signature_buckets_from_resident_snapshot,
128    plan_cuda_egraph_signature_buckets_from_signature_snapshot,
129    plan_cuda_egraph_structural_equivalence_launch_artifact,
130    plan_cuda_egraph_structural_equivalence_output, plan_cuda_egraph_structural_equivalences,
131    plan_cuda_egraph_union_compaction, CudaEGraphCanonicalRewrite,
132    CudaEGraphCanonicalRewriteDeviceImage, CudaEGraphCanonicalRewriteKernelPtx,
133    CudaEGraphCanonicalRewriteKernelResult, CudaEGraphFixedPointReadback,
134    CudaEGraphKernelLaunchConfig, CudaEGraphKernelPass, CudaEGraphKernelPlanError,
135    CudaEGraphKernelWave, CudaEGraphKernelWorkPlan, CudaEGraphResidentColumnSnapshot,
136    CudaEGraphResidentSignatureSnapshot, CudaEGraphSignatureBucket,
137    CudaEGraphSignatureBucketDeviceImage, CudaEGraphSignatureBucketPlan,
138    CudaEGraphSignaturePairWave, CudaEGraphSignatureRefreshKernelPtx,
139    CudaEGraphSignatureRefreshKernelResult, CudaEGraphStructuralCanonicalizationFixedPointReport,
140    CudaEGraphStructuralCanonicalizationFixedPointResult,
141    CudaEGraphStructuralCanonicalizationRoundResult, CudaEGraphStructuralEquivalenceKernelPtx,
142    CudaEGraphStructuralEquivalenceKernelResult, CudaEGraphStructuralEquivalenceLaunchArtifact,
143    CudaEGraphStructuralEquivalenceOutputPlan, CudaEGraphStructuralEquivalencePlan,
144    CudaEGraphUnionCompactionPass, CudaEGraphUnionCompactionPlan, CudaEGraphUnionCompactionWave,
145    CUDA_EGRAPH_CANONICAL_REWRITE_KERNEL_ENTRY, CUDA_EGRAPH_CANONICAL_REWRITE_KERNEL_PARAM_COUNT,
146    CUDA_EGRAPH_CANONICAL_REWRITE_RECORD_WORDS, CUDA_EGRAPH_SIGNATURE_BUCKET_RECORD_WORDS,
147    CUDA_EGRAPH_SIGNATURE_REFRESH_KERNEL_ENTRY, CUDA_EGRAPH_SIGNATURE_REFRESH_KERNEL_PARAM_COUNT,
148    CUDA_EGRAPH_STRUCTURAL_EQUIVALENCE_KERNEL_ENTRY,
149    CUDA_EGRAPH_STRUCTURAL_EQUIVALENCE_KERNEL_PARAM_COUNT,
150};
151pub use frontier_typed_ir_adapter::{
152    adapt_frontier_typed_ir_to_cuda, CudaFrontierTypedIrAdapterError, CudaFrontierTypedIrInput,
153};
154pub use kernel_failure_diagnostics::{
155    diagnose_cuda_kernel_launch, diagnose_cuda_kernel_launch_shape,
156    diagnose_cuda_kernel_launch_with_scratch, CudaKernelCapabilityFailure,
157    CudaKernelDeviceEnvelope, CudaKernelLaunchDiagnostic, CudaKernelLaunchDiagnosticRef,
158    CudaKernelLaunchDiagnosticScratch, CudaKernelLaunchEnvelope, CudaKernelLaunchEnvelopeError,
159    CudaKernelLaunchShape, CudaKernelRequirement,
160};
161pub use launch_fusion::{
162    plan_cuda_launch_fusion, plan_cuda_launch_fusion_with_scratch, CudaFusionStage,
163    CudaLaunchFusionError, CudaLaunchFusionGroup, CudaLaunchFusionPlan, CudaLaunchFusionScratch,
164};
165pub use megakernel_barrier_planner::{
166    plan_cuda_frontier_megakernel_execution, plan_cuda_frontier_megakernel_execution_with_scratch,
167    plan_cuda_megakernel_barriers, plan_cuda_megakernel_barriers_with_scratch,
168    CudaMegakernelBarrierGroup, CudaMegakernelBarrierPlan, CudaMegakernelBarrierPlanError,
169    CudaMegakernelBarrierScratch, CudaMegakernelFrontierExecutionPlan,
170    CudaMegakernelFrontierExecutionPlanError, CudaMegakernelFrontierWave,
171    CudaMegakernelWaveDependency,
172};
173pub use megakernel_convergence::{
174    plan_cuda_device_convergence, CudaConvergenceReadbackPolicy, CudaDeviceConvergencePlan,
175    CudaDeviceConvergencePlanError,
176};
177pub use megakernel_plan_cache::{
178    CudaMegakernelAnalysisKind, CudaMegakernelCachedPlan, CudaMegakernelDeviceKey,
179    CudaMegakernelPlanCache, CudaMegakernelPlanCacheKey, CudaMegakernelPlanCacheStats,
180};
181pub use megakernel_scheduler::{
182    plan_cuda_megakernel_execution, plan_cuda_megakernel_memory_budget,
183    schedule_megakernel_from_cuda_samples, schedule_megakernel_from_cuda_samples_into,
184    select_cuda_megakernel_topology, CudaMegakernelExecutionPlan, CudaMegakernelGraphShape,
185    CudaMegakernelMemoryBudget, CudaMegakernelMemoryError, CudaMegakernelMemoryPlan,
186    CudaMegakernelScheduleSample, CudaMegakernelTopology, CudaMegakernelTopologyDecision,
187};
188pub use megakernel_speedup_gate::{
189    format_validated_cuda_megakernel_speedup_evidence_csv,
190    validate_cuda_megakernel_speedup_evidence_csv, validate_cuda_megakernel_speedup_gate,
191    CudaMegakernelSpeedupGateError, CudaMegakernelSpeedupProof, CudaMegakernelSpeedupSample,
192    MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER,
193};
194pub use multi_query_execution::{
195    plan_cuda_multi_query_execution, plan_cuda_multi_query_execution_with_scratch, CudaMultiQuery,
196    CudaMultiQueryExecutionError, CudaMultiQueryExecutionPlan, CudaMultiQueryExecutionScratch,
197    CudaMultiQueryGroup,
198};
199pub use optimizer::CudaOptimizerDispatcher;
200pub use resident_graph_session::{
201    format_validated_cuda_resident_graph_session_evidence_csv, plan_cuda_resident_graph_session,
202    resident_graph_session_speedup_sample, CudaResidentGraphReadback,
203    CudaResidentGraphSessionError, CudaResidentGraphSessionEvidence,
204    CudaResidentGraphSessionEvidenceError, CudaResidentGraphSessionPlan,
205    CudaResidentGraphSessionProfile,
206};
207pub use result_compaction::{
208    plan_cuda_result_compaction, plan_cuda_result_compaction_with_scratch, CudaCompactResultRecord,
209    CudaResultCompactionError, CudaResultCompactionPlan, CudaResultCompactionScratch,
210    CudaResultSlot,
211};
212pub use token_fact_frontier_execution::{
213    plan_cuda_token_fact_frontier_execution, plan_cuda_token_fact_frontier_execution_with_scratch,
214    CudaTokenFactFrontierExecutionError, CudaTokenFactFrontierExecutionPlan,
215};
216pub use token_fact_graph_cuda_adapter::{
217    adapt_token_fact_graph_to_cuda_layout, CudaTokenFactGraphLayout, CudaTokenFactGraphLayoutError,
218};
219
220use std::sync::Arc;
221
222use crate::backend::staging_reserve::reserve_smallvec;
223use smallvec::SmallVec;
224use vyre_driver::{BackendError, BackendRegistration, DispatchConfig, Resource, VyreBackend};
225use vyre_foundation::ir::Program;
226
227/// Stable backend identifier for registration and conform certificates.
228pub const CUDA_BACKEND_ID: &str = "cuda";
229
230/// CUDA implementation of [`vyre_driver::DeviceBuffer`]. Wraps a
231/// [`backend::CudaResidentBuffer`] handle so consumers can hold a
232/// `Box<dyn DeviceBuffer>` against the CUDA backend without naming
233/// `CudaResidentBuffer` directly.
234///
235/// Lifecycle is explicit-free  -  call
236/// `VyreBackend::free_device_buffer(boxed_buffer)` when done. This
237/// matches the existing CUDA-resident contract and keeps the substrate
238/// free of reference-counted backend handles. A future RAII variant
239/// (Drop-managed via `Arc<CudaBackend>`) can ship as a drop-in
240/// replacement when the backend ownership model accommodates it.
241#[derive(Debug)]
242pub struct CudaDeviceBuffer {
243    backend_id: &'static str,
244    handle: backend::CudaResidentBuffer,
245}
246
247impl vyre_driver::DeviceBuffer for CudaDeviceBuffer {
248    fn backend_id(&self) -> &'static str {
249        self.backend_id
250    }
251
252    fn byte_len(&self) -> usize {
253        self.handle.byte_len
254    }
255
256    fn as_any(&self) -> &dyn std::any::Any {
257        self
258    }
259
260    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
261        self
262    }
263}
264
265/// Factory wrapper for the inventory registration path.
266///
267/// Unlike the SPIR-V backend, the CUDA backend owns a live device handle
268/// and can dispatch programs directly.
269#[derive(Debug)]
270pub struct CudaBackendRegistration {
271    inner: CudaBackend,
272}
273
274impl CudaBackendRegistration {
275    /// Wrap an already-acquired [`CudaBackend`] as a [`VyreBackend`] trait object.
276    ///
277    /// The inventory-driven path uses [`cuda_factory`] which acquires its own
278    /// device handle. Callers that already own a [`CudaBackend`] (e.g. so they
279    /// can keep the live device handle for direct API access while also handing
280    /// it to a megakernel) use this constructor instead.
281    #[must_use]
282    pub fn new(inner: CudaBackend) -> Self {
283        Self { inner }
284    }
285
286    /// Borrow the inner [`CudaBackend`] for direct device-API access.
287    #[must_use]
288    pub fn inner(&self) -> &CudaBackend {
289        &self.inner
290    }
291
292    /// Snapshot the CUDA PTX-source cache used before driver module loading.
293    #[must_use]
294    pub fn ptx_source_cache_snapshot(&self) -> CudaPtxSourceCacheSnapshot {
295        self.inner.ptx_source_cache_snapshot()
296    }
297
298    /// Runtime CUDA telemetry counters for release-path performance gates.
299    #[must_use]
300    pub fn telemetry_snapshot(&self) -> CudaTelemetrySnapshot {
301        self.inner.telemetry_snapshot()
302    }
303
304    /// Reset runtime CUDA telemetry counters without clearing backend caches.
305    pub fn reset_telemetry(&self) {
306        self.inner.reset_telemetry();
307    }
308
309    fn resolve_uploads<'a>(
310        &self,
311        uploads: &[(&Resource, &'a [u8])],
312    ) -> Result<SmallVec<[(CudaResidentBuffer, &'a [u8]); 8]>, BackendError> {
313        let mut concrete = SmallVec::<[(CudaResidentBuffer, &'a [u8]); 8]>::new();
314        reserve_smallvec(&mut concrete, uploads.len(), "CUDA resident upload handles")?;
315        for (resource, bytes) in uploads {
316            let handle = self.inner.resident_handle_from_resource(resource)?;
317            concrete.push((handle, *bytes));
318        }
319        Ok(concrete)
320    }
321
322    fn resolve_offset_uploads<'a>(
323        &self,
324        uploads: &[(&Resource, usize, &'a [u8])],
325    ) -> Result<SmallVec<[(CudaResidentBuffer, usize, &'a [u8]); 8]>, BackendError> {
326        let mut concrete = SmallVec::<[(CudaResidentBuffer, usize, &'a [u8]); 8]>::new();
327        reserve_smallvec(
328            &mut concrete,
329            uploads.len(),
330            "CUDA resident offset upload handles",
331        )?;
332        for (resource, dst_offset_bytes, bytes) in uploads {
333            let handle = self.inner.resident_handle_from_resource(resource)?;
334            concrete.push((handle, *dst_offset_bytes, *bytes));
335        }
336        Ok(concrete)
337    }
338
339    fn resolve_download_ranges(
340        &self,
341        ranges: &[(&Resource, usize, usize)],
342    ) -> Result<SmallVec<[(CudaResidentBuffer, usize, usize); 8]>, BackendError> {
343        let mut concrete = SmallVec::<[(CudaResidentBuffer, usize, usize); 8]>::new();
344        reserve_smallvec(
345            &mut concrete,
346            ranges.len(),
347            "CUDA resident download range handles",
348        )?;
349        for (resource, byte_offset, byte_len) in ranges {
350            let handle = self.inner.resident_handle_from_resource(resource)?;
351            concrete.push((handle, *byte_offset, *byte_len));
352        }
353        Ok(concrete)
354    }
355
356    fn resolve_read_ranges(
357        &self,
358        read_ranges: &[vyre_driver::backend::ResidentReadRange<'_>],
359    ) -> Result<
360        (
361            SmallVec<[CudaResidentBuffer; 8]>,
362            SmallVec<[crate::backend::output_range::CudaOutputReadback; 8]>,
363        ),
364        BackendError,
365    > {
366        let mut handles = SmallVec::<[CudaResidentBuffer; 8]>::new();
367        let mut concrete_readbacks =
368            SmallVec::<[crate::backend::output_range::CudaOutputReadback; 8]>::new();
369        reserve_smallvec(
370            &mut handles,
371            read_ranges.len(),
372            "CUDA resident read handles",
373        )?;
374        reserve_smallvec(
375            &mut concrete_readbacks,
376            read_ranges.len(),
377            "CUDA resident readback ranges",
378        )?;
379        for range in read_ranges {
380            handles.push(self.inner.resident_handle_from_resource(range.resource)?);
381            concrete_readbacks.push(crate::backend::output_range::CudaOutputReadback {
382                device_offset: range.byte_offset,
383                byte_len: range.byte_len,
384            });
385        }
386        Ok((handles, concrete_readbacks))
387    }
388
389    fn resolve_step_handle_sets(
390        &self,
391        steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
392        field: &'static str,
393    ) -> Result<SmallVec<[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>, BackendError>
394    {
395        let mut handle_sets =
396            SmallVec::<[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>::new();
397        reserve_smallvec(&mut handle_sets, steps.len(), field)?;
398        for step in steps {
399            handle_sets.push(self.inner.resident_handles_from_resources(step.resources)?);
400        }
401        Ok(handle_sets)
402    }
403
404    fn resolve_repeated_step_handle_sets(
405        &self,
406        steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
407        repeat_count: usize,
408    ) -> Result<SmallVec<[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>, BackendError>
409    {
410        let mut handle_sets =
411            SmallVec::<[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>::new();
412        let capacity = if repeat_count == 0 { 0 } else { steps.len() };
413        reserve_smallvec(
414            &mut handle_sets,
415            capacity,
416            "CUDA repeated resident repeated handle sets",
417        )?;
418        if repeat_count != 0 {
419            for step in steps {
420                handle_sets.push(self.inner.resident_handles_from_resources(step.resources)?);
421            }
422        }
423        Ok(handle_sets)
424    }
425
426    fn concrete_resident_steps<'program: 'handles, 'handles>(
427        steps: &[vyre_driver::backend::ResidentDispatchStep<'program>],
428        handle_sets: &'handles [SmallVec<[crate::backend::CudaResidentBuffer; 8]>],
429        field: &'static str,
430    ) -> Result<SmallVec<[crate::backend::CudaResidentDispatchStep<'handles>; 8]>, BackendError>
431    {
432        let mut concrete_steps =
433            SmallVec::<[crate::backend::CudaResidentDispatchStep<'handles>; 8]>::new();
434        reserve_smallvec(&mut concrete_steps, handle_sets.len(), field)?;
435        for (step, handles) in steps.iter().zip(handle_sets.iter()) {
436            let mut config = DispatchConfig::default();
437            config.grid_override = step.grid_override;
438            concrete_steps.push(crate::backend::CudaResidentDispatchStep {
439                program: step.program,
440                handles,
441                config,
442            });
443        }
444        Ok(concrete_steps)
445    }
446
447    /// Bytes of transient CUDA device memory currently owned by the transient pool.
448    ///
449    /// This includes checked-out dispatch allocations, compiled-pipeline static parameter
450    /// allocations, and cached transient blocks retained for reuse.
451    ///
452    /// # Errors
453    ///
454    /// Returns [`BackendError`] if allocation accounting cannot be read.
455    pub fn allocated_transient_allocation_bytes(&self) -> Result<usize, BackendError> {
456        self.inner.allocated_transient_allocation_bytes()
457    }
458
459    fn reject_grid_sync_without_native_lowering(
460        &self,
461        program: &Program,
462    ) -> Result<(), BackendError> {
463        if vyre_driver::grid_sync::contains_grid_sync(program) && !self.supports_grid_sync() {
464            return Err(BackendError::UnsupportedFeature {
465                name: "cuda_native_grid_sync_lowering (MemoryOrdering::GridSync requires explicit split routing or native cooperative-grid barrier lowering)"
466                    .to_string(),
467                backend: CUDA_BACKEND_ID.to_string(),
468            });
469        }
470        Ok(())
471    }
472
473    fn validate_program_for_dispatch(&self, program: &Program) -> Result<(), BackendError> {
474        let required = vyre_foundation::program_caps::scan(program);
475        vyre_foundation::program_caps::check_backend_capabilities(
476            CUDA_BACKEND_ID,
477            self.supports_subgroup_ops(),
478            self.supports_f16(),
479            self.supports_bf16(),
480            self.supports_indirect_dispatch(),
481            true,
482            self.supports_distributed_collectives(),
483            self.max_workgroup_size(),
484            &required,
485        )
486        .map_err(|error| BackendError::InvalidProgram {
487            fix: error.to_string(),
488        })?;
489        self.reject_grid_sync_without_native_lowering(program)
490    }
491
492    fn validate_resident_steps_for_dispatch(
493        &self,
494        steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
495    ) -> Result<(), BackendError> {
496        for step in steps {
497            self.validate_program_for_dispatch(step.program)?;
498        }
499        Ok(())
500    }
501}
502
503
504impl vyre_driver::backend::private::Sealed for CudaBackendRegistration {}
505
506impl VyreBackend for CudaBackendRegistration {
507    fn id(&self) -> &'static str {
508        CUDA_BACKEND_ID
509    }
510
511    fn version(&self) -> &'static str {
512        env!("CARGO_PKG_VERSION")
513    }
514
515    fn dispatch(
516        &self,
517        program: &Program,
518        inputs: &[Vec<u8>],
519        config: &DispatchConfig,
520    ) -> Result<Vec<Vec<u8>>, BackendError> {
521        self.validate_program_for_dispatch(program)?;
522        self.inner.dispatch(program, inputs, config)
523    }
524
525    fn dispatch_async(
526        &self,
527        program: &Program,
528        inputs: &[Vec<u8>],
529        config: &DispatchConfig,
530    ) -> Result<Box<dyn vyre_driver::PendingDispatch>, BackendError> {
531        self.validate_program_for_dispatch(program)?;
532        self.inner.dispatch_async(program, inputs, config)
533    }
534
535    fn dispatch_borrowed_async(
536        &self,
537        program: &Program,
538        inputs: &[&[u8]],
539        config: &DispatchConfig,
540    ) -> Result<Box<dyn vyre_driver::PendingDispatch>, BackendError> {
541        self.validate_program_for_dispatch(program)?;
542        self.inner.dispatch_borrowed_async(program, inputs, config)
543    }
544
545    fn dispatch_borrowed(
546        &self,
547        program: &Program,
548        inputs: &[&[u8]],
549        config: &DispatchConfig,
550    ) -> Result<Vec<Vec<u8>>, BackendError> {
551        self.validate_program_for_dispatch(program)?;
552        self.inner
553            .dispatch_borrowed_async(program, inputs, config)?
554            .await_result()
555    }
556
557    fn dispatch_borrowed_into(
558        &self,
559        program: &Program,
560        inputs: &[&[u8]],
561        config: &DispatchConfig,
562        outputs: &mut vyre_driver::OutputBuffers,
563    ) -> Result<(), BackendError> {
564        self.validate_program_for_dispatch(program)?;
565        self.inner
566            .dispatch_borrowed_async(program, inputs, config)?
567            .await_result_into(outputs)
568    }
569
570    fn dispatch_borrowed_timed(
571        &self,
572        program: &Program,
573        inputs: &[&[u8]],
574        config: &DispatchConfig,
575    ) -> Result<vyre_driver::TimedDispatchResult, BackendError> {
576        self.validate_program_for_dispatch(program)?;
577        self.inner.dispatch_borrowed_timed(program, inputs, config)
578    }
579
580    fn allocate_resident(&self, byte_len: usize) -> Result<Resource, BackendError> {
581        self.inner
582            .allocate_resident(byte_len)
583            .map(|handle| Resource::Resident(handle.id))
584    }
585
586    fn allocate_device_buffer(
587        &self,
588        byte_len: usize,
589    ) -> Result<Box<dyn vyre_driver::DeviceBuffer>, BackendError> {
590        let handle = self.inner.allocate_resident(byte_len)?;
591        Ok(Box::new(CudaDeviceBuffer {
592            backend_id: CUDA_BACKEND_ID,
593            handle,
594        }))
595    }
596
597    fn upload_device_buffer(
598        &self,
599        buffer: &mut dyn vyre_driver::DeviceBuffer,
600        bytes: &[u8],
601    ) -> Result<(), BackendError> {
602        let backend_id = buffer.backend_id().to_string();
603        let handle = buffer
604            .as_any_mut()
605            .downcast_mut::<CudaDeviceBuffer>()
606            .map(|cuda_buf| cuda_buf.handle)
607            .ok_or_else(|| BackendError::InvalidProgram {
608                fix: format!(
609                    "Fix: upload_device_buffer expected a CudaDeviceBuffer (allocated by `cuda` backend) but got buffer owned by `{backend_id}`."
610                ),
611            })?;
612        self.inner.upload_resident(handle, bytes)
613    }
614
615    fn download_device_buffer(
616        &self,
617        buffer: &dyn vyre_driver::DeviceBuffer,
618    ) -> Result<Vec<u8>, BackendError> {
619        let cuda_buf = buffer
620            .as_any()
621            .downcast_ref::<CudaDeviceBuffer>()
622            .ok_or_else(|| BackendError::InvalidProgram {
623                fix: format!(
624                    "Fix: download_device_buffer expected a CudaDeviceBuffer (allocated by `cuda` backend) but got buffer owned by `{}`.",
625                    buffer.backend_id()
626                ),
627            })?;
628        self.inner.download_resident(cuda_buf.handle)
629    }
630
631    fn free_device_buffer(
632        &self,
633        buffer: Box<dyn vyre_driver::DeviceBuffer>,
634    ) -> Result<(), BackendError> {
635        let backend_id = buffer.backend_id().to_string();
636        let handle = buffer
637            .as_any()
638            .downcast_ref::<CudaDeviceBuffer>()
639            .map(|cuda_buf| cuda_buf.handle)
640            .ok_or_else(|| BackendError::InvalidProgram {
641                fix: format!(
642                    "Fix: free_device_buffer expected a CudaDeviceBuffer but got buffer owned by `{backend_id}`."
643                ),
644            })?;
645        // Drop the Box (releases the wrapper allocation) before freeing
646        // the underlying CUDA-resident allocation. CudaResidentBuffer is
647        // Copy so we already captured the handle.
648        drop(buffer);
649        self.inner.free_resident(handle)
650    }
651
652    fn dispatch_with_device_buffers(
653        &self,
654        program: &Program,
655        inputs: &[&dyn vyre_driver::DeviceBuffer],
656        outputs: &mut [&mut dyn vyre_driver::DeviceBuffer],
657        config: &DispatchConfig,
658    ) -> Result<(), BackendError> {
659        self.validate_program_for_dispatch(program)?;
660        // Convert &[&dyn DeviceBuffer] into &[Resource::Resident(id)]
661        // so we can re-use the existing dispatch_resident_timed path.
662        // Outputs are bound by Resource::Resident as well  -  the kernel
663        // writes results in-place into the device-resident buffers; the
664        // caller reads them via download_device_buffer afterwards.
665        vyre_driver::validate_buffer_ownership(self.id(), inputs.iter().copied())?;
666        vyre_driver::validate_buffer_ownership(
667            self.id(),
668            outputs
669                .iter()
670                .map(|b| &**b as &dyn vyre_driver::DeviceBuffer),
671        )?;
672        let resource_capacity =
673            inputs
674                .len()
675                .checked_add(outputs.len())
676                .ok_or_else(|| BackendError::InvalidProgram {
677                    fix: format!(
678                        "Fix: CUDA borrowed dispatch resource capacity overflowed usize for {} input buffer(s) plus {} output buffer(s); split the dispatch.",
679                        inputs.len(),
680                        outputs.len()
681                    ),
682                })?;
683        let mut handles = SmallVec::<[CudaResidentBuffer; 8]>::new();
684        reserve_smallvec(
685            &mut handles,
686            resource_capacity,
687            "CUDA borrowed dispatch resource handles",
688        )?;
689        for buffer in inputs {
690            let handle = buffer
691                .as_any()
692                .downcast_ref::<CudaDeviceBuffer>()
693                .ok_or_else(|| BackendError::InvalidProgram {
694                    fix: format!(
695                        "Fix: dispatch_with_device_buffers expected CudaDeviceBuffer inputs but got buffer owned by `{}`.",
696                        buffer.backend_id()
697                    ),
698                })?
699                .handle;
700            handles.push(handle);
701        }
702        for buffer in outputs.iter() {
703            let backend_id = buffer.backend_id().to_string();
704            let handle = buffer
705                .as_any()
706                .downcast_ref::<CudaDeviceBuffer>()
707                .ok_or_else(|| BackendError::InvalidProgram {
708                    fix: format!(
709                        "Fix: dispatch_with_device_buffers expected CudaDeviceBuffer outputs but got buffer owned by `{backend_id}`."
710                    ),
711                })?
712                .handle;
713            handles.push(handle);
714        }
715        let _timed = self
716            .inner
717            .dispatch_resident_timed(program, &handles, config)?;
718        Ok(())
719    }
720
721    fn upload_resident(&self, resource: &Resource, bytes: &[u8]) -> Result<(), BackendError> {
722        let handle = self.inner.resident_handle_from_resource(resource)?;
723        self.inner.upload_resident(handle, bytes)
724    }
725
726    fn upload_resident_many(&self, uploads: &[(&Resource, &[u8])]) -> Result<(), BackendError> {
727        let concrete = self.resolve_uploads(uploads)?;
728        self.inner.upload_resident_many(&concrete)
729    }
730
731    fn upload_resident_at(
732        &self,
733        resource: &Resource,
734        dst_offset_bytes: usize,
735        bytes: &[u8],
736    ) -> Result<(), BackendError> {
737        let handle = self.inner.resident_handle_from_resource(resource)?;
738        self.inner
739            .upload_resident_at(handle, dst_offset_bytes, bytes)
740    }
741
742    fn upload_resident_at_many(
743        &self,
744        uploads: &[(&Resource, usize, &[u8])],
745    ) -> Result<(), BackendError> {
746        let concrete = self.resolve_offset_uploads(uploads)?;
747        self.inner.upload_resident_at_many(&concrete)
748    }
749
750    fn download_resident(&self, resource: &Resource) -> Result<Vec<u8>, BackendError> {
751        let handle = self.inner.resident_handle_from_resource(resource)?;
752        self.inner.download_resident(handle)
753    }
754
755    fn download_resident_into(
756        &self,
757        resource: &Resource,
758        out: &mut Vec<u8>,
759    ) -> Result<(), BackendError> {
760        let handle = self.inner.resident_handle_from_resource(resource)?;
761        self.inner.download_resident_into(handle, out)
762    }
763
764    fn download_resident_range(
765        &self,
766        resource: &Resource,
767        byte_offset: usize,
768        byte_len: usize,
769    ) -> Result<Vec<u8>, BackendError> {
770        let handle = self.inner.resident_handle_from_resource(resource)?;
771        self.inner
772            .download_resident_range(handle, byte_offset, byte_len)
773    }
774
775    fn download_resident_range_into(
776        &self,
777        resource: &Resource,
778        byte_offset: usize,
779        byte_len: usize,
780        out: &mut Vec<u8>,
781    ) -> Result<(), BackendError> {
782        let handle = self.inner.resident_handle_from_resource(resource)?;
783        self.inner
784            .download_resident_range_into(handle, byte_offset, byte_len, out)
785    }
786
787    fn download_resident_ranges_into(
788        &self,
789        ranges: &[(&Resource, usize, usize)],
790        outputs: &mut [&mut Vec<u8>],
791    ) -> Result<(), BackendError> {
792        let concrete = self.resolve_download_ranges(ranges)?;
793        self.inner.download_resident_ranges_into(&concrete, outputs)
794    }
795
796    fn free_resident(&self, resource: Resource) -> Result<(), BackendError> {
797        let handle = self.inner.resident_handle_from_resource(&resource)?;
798        self.inner.free_resident(handle)
799    }
800
801    fn dispatch_resident_timed(
802        &self,
803        program: &Program,
804        resources: &[Resource],
805        config: &DispatchConfig,
806    ) -> Result<vyre_driver::TimedDispatchResult, BackendError> {
807        self.validate_program_for_dispatch(program)?;
808        let handles = self.inner.resident_handles_from_resources(resources)?;
809        self.inner
810            .dispatch_resident_timed(program, &handles, config)
811    }
812
813    fn dispatch_resident_sequence_read_ranges_into(
814        &self,
815        steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
816        read_ranges: &[vyre_driver::backend::ResidentReadRange<'_>],
817        outputs: &mut [&mut Vec<u8>],
818    ) -> Result<(), BackendError> {
819        self.validate_resident_steps_for_dispatch(steps)?;
820        if read_ranges.len() != outputs.len() {
821            return Err(BackendError::InvalidProgram {
822                fix: format!(
823                    "Fix: CUDA resident sequence ranged readback expected matching range/output counts but got {} range(s) and {} output(s).",
824                    read_ranges.len(),
825                    outputs.len()
826                ),
827            });
828        }
829        let handle_sets =
830            self.resolve_step_handle_sets(steps, "CUDA resident sequence handle sets")?;
831        let concrete_steps =
832            Self::concrete_resident_steps(steps, &handle_sets, "CUDA resident sequence steps")?;
833
834        let (read_handles, concrete_readbacks) = self.resolve_read_ranges(read_ranges)?;
835
836        let uploads: [(crate::backend::CudaResidentBuffer, &[u8]); 0] = [];
837        self.inner
838            .upload_resident_many_sequence_read_ranges_borrowed_into(
839                &uploads,
840                &concrete_steps,
841                &read_handles,
842                &concrete_readbacks,
843                outputs,
844            )
845    }
846
847    fn dispatch_resident_repeated_sequence_read_ranges_into(
848        &self,
849        prefix_steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
850        repeated_steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
851        repeat_count: u32,
852        read_ranges: &[vyre_driver::backend::ResidentReadRange<'_>],
853        outputs: &mut [&mut Vec<u8>],
854    ) -> Result<(), BackendError> {
855        self.validate_resident_steps_for_dispatch(prefix_steps)?;
856        self.validate_resident_steps_for_dispatch(repeated_steps)?;
857        let repeat_count =
858            usize::try_from(repeat_count).map_err(|error| BackendError::InvalidProgram {
859                fix: format!(
860                    "Fix: CUDA repeated resident sequence count does not fit usize: {error}."
861                ),
862            })?;
863        if read_ranges.len() != outputs.len() {
864            return Err(BackendError::InvalidProgram {
865                fix: format!(
866                    "Fix: CUDA repeated resident sequence ranged readback expected matching range/output counts but got {} range(s) and {} output(s).",
867                    read_ranges.len(),
868                    outputs.len()
869                ),
870            });
871        }
872
873        let prefix_handle_sets = self
874            .resolve_step_handle_sets(prefix_steps, "CUDA repeated resident prefix handle sets")?;
875        let repeated_handle_sets =
876            self.resolve_repeated_step_handle_sets(repeated_steps, repeat_count)?;
877        let concrete_prefix = Self::concrete_resident_steps(
878            prefix_steps,
879            &prefix_handle_sets,
880            "CUDA repeated resident prefix steps",
881        )?;
882        let concrete_repeated = Self::concrete_resident_steps(
883            repeated_steps,
884            &repeated_handle_sets,
885            "CUDA repeated resident repeated steps",
886        )?;
887
888        let (read_handles, concrete_readbacks) = self.resolve_read_ranges(read_ranges)?;
889        let uploads: [(crate::backend::CudaResidentBuffer, &[u8]); 0] = [];
890        self.inner
891            .upload_resident_many_repeated_sequence_read_ranges_borrowed_into(
892                &uploads,
893                &concrete_prefix,
894                &concrete_repeated,
895                repeat_count,
896                &read_handles,
897                &concrete_readbacks,
898                outputs,
899            )
900    }
901
902    fn compile_native(
903        &self,
904        program: &Program,
905        config: &DispatchConfig,
906    ) -> Result<Option<Arc<dyn vyre_driver::CompiledPipeline>>, BackendError> {
907        self.validate_program_for_dispatch(program)?;
908        self.inner.compile_native(program, config).map(Some)
909    }
910
911    fn compile_native_shared(
912        &self,
913        program: Arc<Program>,
914        config: &DispatchConfig,
915    ) -> Result<Option<Arc<dyn vyre_driver::CompiledPipeline>>, BackendError> {
916        self.validate_program_for_dispatch(program.as_ref())?;
917        self.inner.compile_native_shared(program, config).map(Some)
918    }
919
920    fn pipeline_cache_snapshot(&self) -> Option<vyre_driver::pipeline::PipelineCacheSnapshot> {
921        Some(self.inner.pipeline_cache_snapshot())
922    }
923
924    fn backend_metric_snapshot(&self) -> Vec<(&'static str, u64)> {
925        let source_cache = self.inner.ptx_source_cache_snapshot();
926        let mut metrics = Vec::new();
927        match u64::try_from(source_cache.entries) {
928            Ok(entries) => metrics.push(("cuda_ptx_source_cache_entries", entries)),
929            Err(source) => {
930                tracing::error!(
931                    "CUDA PTX source cache entry count cannot fit u64: {source}. Fix: shard backend metrics before source-cache cardinality exceeds u64."
932                );
933                metrics.push(("cuda_ptx_source_cache_entries_unrepresentable", 1));
934            }
935        }
936        metrics.push(("cuda_ptx_source_cache_hits", source_cache.hits));
937        metrics.push(("cuda_ptx_source_cache_misses", source_cache.misses));
938        let telemetry = self.inner.telemetry_snapshot();
939        metrics.push(("cuda_timed_dispatches", telemetry.timed_dispatches));
940        metrics.push((
941            "cuda_timed_device_measurements",
942            telemetry.timed_device_measurements,
943        ));
944        metrics.push((
945            "cuda_timed_dispatches_missing_device_time",
946            telemetry.timed_dispatches_missing_device_time,
947        ));
948        metrics.push(("cuda_timed_wall_ns_total", telemetry.timed_wall_ns_total));
949        metrics.push((
950            "cuda_timed_device_ns_total",
951            telemetry.timed_device_ns_total,
952        ));
953        metrics.push(("cuda_timed_device_ns_max", telemetry.timed_device_ns_max));
954        metrics.push((
955            "cuda_timed_enqueue_ns_total",
956            telemetry.timed_enqueue_ns_total,
957        ));
958        metrics.push(("cuda_timed_wait_ns_total", telemetry.timed_wait_ns_total));
959        metrics
960    }
961
962    fn supports_subgroup_ops(&self) -> bool {
963        self.inner.hardware_supports_subgroup_ops()
964    }
965
966    fn supports_f16(&self) -> bool {
967        self.inner.hardware_supports_f16()
968    }
969
970    fn supports_bf16(&self) -> bool {
971        self.inner.hardware_supports_bf16()
972    }
973
974    fn supports_tensor_cores(&self) -> bool {
975        self.inner.hardware_supports_tensor_cores() && self.inner.lowers_tensor_core_ops()
976    }
977
978    fn supports_async_compute(&self) -> bool {
979        self.inner.hardware_supports_async_compute()
980    }
981
982    fn supports_grid_sync(&self) -> bool {
983        self.inner.supports_grid_sync()
984    }
985
986    fn allows_host_grid_sync_split(&self) -> bool {
987        false
988    }
989
990    fn supports_speculation(&self) -> bool {
991        false
992    }
993
994    fn max_workgroup_size(&self) -> [u32; 3] {
995        self.inner.max_block_dim()
996    }
997
998    fn max_compute_workgroups_per_dimension(&self) -> u32 {
999        self.inner.max_grid_dim()[0]
1000    }
1001
1002    fn max_compute_invocations_per_workgroup(&self) -> u32 {
1003        self.inner.max_threads_per_block()
1004    }
1005
1006    fn subgroup_size(&self) -> Option<u32> {
1007        self.inner.warp_size()
1008    }
1009
1010    fn max_storage_buffer_bytes(&self) -> u64 {
1011        self.inner.device_memory_bytes()
1012    }
1013
1014    fn device_profile(&self) -> vyre_driver::DeviceProfile {
1015        let mut profile = self.inner.caps.to_device_profile();
1016        profile.supports_tensor_cores = self.supports_tensor_cores();
1017        profile.supports_indirect_dispatch = self.supports_indirect_dispatch();
1018        profile
1019    }
1020
1021    fn prepare(&self) -> Result<(), BackendError> {
1022        self.inner.warmup()
1023    }
1024
1025    fn shutdown(&self) -> Result<(), BackendError> {
1026        self.inner.cleanup()
1027    }
1028}
1029
1030/// Factory function for inventory registration.
1031
1032pub fn cuda_factory() -> Result<Box<dyn VyreBackend>, BackendError> {
1033    let backend = CudaBackend::acquire().map_err(|e| BackendError::DispatchFailed {
1034        code: None,
1035        message: format!("CUDA backend acquisition failed: {e}"),
1036    })?;
1037    Ok(Box::new(CudaBackendRegistration { inner: backend }))
1038}
1039
1040/// Op-support set  -  CUDA supports every op the foundation IR defines
1041/// plus hardware intrinsics. Populated at runtime by the conform runner.
1042pub fn cuda_supported_ops() -> &'static std::collections::HashSet<vyre_foundation::ir::OpId> {
1043    vyre_driver::backend::validation::default_supported_ops_with_trap()
1044}
1045
1046inventory::submit! {
1047    BackendRegistration {
1048        id: CUDA_BACKEND_ID,
1049        factory: cuda_factory,
1050        supported_ops: cuda_supported_ops,
1051    }
1052}
1053
1054// rank 5 - CUDA is the canonical release dispatch backend when linked.
1055inventory::submit! {
1056    vyre_driver::backend::BackendPrecedence {
1057        id: CUDA_BACKEND_ID,
1058        rank: 5,
1059    }
1060}
1061
1062// CUDA owns a live dispatch stack, so conform can prove against it.
1063inventory::submit! {
1064    vyre_driver::backend::BackendCapability {
1065        id: CUDA_BACKEND_ID,
1066        dispatches: true,
1067    }
1068}
1069
1070fn emit_aot_bytes(program: &Program, config: &DispatchConfig) -> Result<Vec<u8>, String> {
1071    let backend = CudaBackend::acquire().map_err(|error| {
1072        format!(
1073            "CUDA PTX AOT emission could not probe the live device target: {error}. Fix: run AOT emission on a host with the CUDA driver and target GPU visible."
1074        )
1075    })?;
1076    crate::codegen::program_to_ptx_for_sm_and_subgroup(
1077        program,
1078        config,
1079        backend.ptx_target_sm(),
1080        backend.warp_size().ok_or_else(|| {
1081            "CUDA PTX AOT emission could not read a hardware warp size from the live device probe. Fix: repair CUDA capability probing before AOT emission.".to_string()
1082        })?,
1083    )
1084    .map(String::into_bytes)
1085}
1086
1087inventory::submit! {
1088    vyre_driver::aot::AotEmitter {
1089        target: "secondary_text",
1090        emit: emit_aot_bytes,
1091    }
1092}
1093
1094inventory::submit! {
1095    vyre_driver::aot::AotLauncherEmitter {
1096        target: "secondary_text",
1097        emit: aot_launcher::emit_launcher,
1098    }
1099}
1100
1101#[cfg(test)]
1102mod tests {
1103    #[test]
1104    fn public_cuda_resident_helpers_reserve_smallvecs_fallibly() {
1105        let source = include_str!("lib.rs");
1106        assert!(
1107            source.contains("use crate::backend::staging_reserve::reserve_smallvec;"),
1108            "Fix: public CUDA resident helpers must use the shared fallible staging reservation contract."
1109        );
1110        assert!(
1111            !source.contains(concat!(
1112                "SmallVec::<",
1113                "[(CudaResidentBuffer, &'a [u8]); 8]>::with_capacity"
1114            )) && !source.contains(concat!(
1115                "SmallVec::<",
1116                "[(CudaResidentBuffer, usize, &'a [u8]); 8]>::with_capacity"
1117            )) && !source.contains(concat!(
1118                "SmallVec::<",
1119                "[(CudaResidentBuffer, usize, usize); 8]>::with_capacity"
1120            )) && !source.contains(concat!(
1121                "SmallVec::<",
1122                "[CudaResidentBuffer; 8]>::with_capacity"
1123            )) && !source.contains(concat!(
1124                "SmallVec::<",
1125                "[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>::with_capacity"
1126            )) && !source.contains(concat!(
1127                "SmallVec::<",
1128                "[crate::backend::CudaResidentDispatchStep<'_>; 8]>::with_capacity"
1129            )),
1130            "Fix: public CUDA resident helpers must reserve fallibly instead of using infallible SmallVec capacity growth."
1131        );
1132        assert!(
1133            source.contains("CUDA resident sequence handle sets")
1134                && source.contains("CUDA repeated resident repeated steps")
1135                && source.contains("CUDA borrowed dispatch resource handles"),
1136            "Fix: public CUDA resident sequence and borrowed-buffer staging paths must expose specific fallible-reservation labels."
1137        );
1138        assert!(
1139            source.contains("fn resolve_step_handle_sets")
1140                && source.contains("fn resolve_repeated_step_handle_sets")
1141                && source.contains("fn concrete_resident_steps"),
1142            "Fix: public CUDA resident sequence paths must share one handle-set and concrete-step staging implementation."
1143        );
1144    }
1145
1146    #[test]
1147    fn public_cuda_execution_entrypoints_share_capability_validation() {
1148        let source = include_str!("lib.rs");
1149        assert!(
1150            source.contains("fn validate_program_for_dispatch")
1151                && source.contains("check_backend_capabilities")
1152                && source.contains("reject_grid_sync_without_native_lowering(program)"),
1153            "Fix: CUDA dispatch validation must centralize capability and grid-sync checks before launch/lowering."
1154        );
1155
1156        for (name, body) in [
1157            (
1158                "dispatch",
1159                method_region(source, "    fn dispatch(\n", "    fn dispatch_async("),
1160            ),
1161            (
1162                "dispatch_async",
1163                method_region(
1164                    source,
1165                    "    fn dispatch_async(\n",
1166                    "    fn dispatch_borrowed_async(",
1167                ),
1168            ),
1169            (
1170                "dispatch_borrowed_async",
1171                method_region(
1172                    source,
1173                    "    fn dispatch_borrowed_async(\n",
1174                    "    fn dispatch_borrowed(",
1175                ),
1176            ),
1177            (
1178                "dispatch_borrowed",
1179                method_region(
1180                    source,
1181                    "    fn dispatch_borrowed(\n",
1182                    "    fn dispatch_borrowed_into(",
1183                ),
1184            ),
1185            (
1186                "dispatch_borrowed_into",
1187                method_region(
1188                    source,
1189                    "    fn dispatch_borrowed_into(\n",
1190                    "    fn dispatch_borrowed_timed(",
1191                ),
1192            ),
1193            (
1194                "dispatch_borrowed_timed",
1195                method_region(
1196                    source,
1197                    "    fn dispatch_borrowed_timed(\n",
1198                    "    fn allocate_resident(",
1199                ),
1200            ),
1201            (
1202                "dispatch_with_device_buffers",
1203                method_region(
1204                    source,
1205                    "    fn dispatch_with_device_buffers(\n",
1206                    "    fn upload_resident(",
1207                ),
1208            ),
1209            (
1210                "dispatch_resident_timed",
1211                method_region(
1212                    source,
1213                    "    fn dispatch_resident_timed(\n",
1214                    "    fn dispatch_resident_sequence_read_ranges_into(",
1215                ),
1216            ),
1217            (
1218                "compile_native",
1219                method_region(
1220                    source,
1221                    "    fn compile_native(\n",
1222                    "    fn compile_native_shared(",
1223                ),
1224            ),
1225        ] {
1226            assert!(
1227                body.contains("validate_program_for_dispatch(program)?"),
1228                "Fix: CUDA {name} must run the shared capability/grid-sync validation gate before lowering or launch."
1229            );
1230        }
1231
1232        let compile_shared = method_region(
1233            source,
1234            "    fn compile_native_shared(\n",
1235            "    fn pipeline_cache_snapshot(",
1236        );
1237        assert!(
1238            compile_shared.contains("validate_program_for_dispatch(program.as_ref())?"),
1239            "Fix: CUDA compile_native_shared must validate the shared Program before lowering."
1240        );
1241
1242        let resident_sequence = method_region(
1243            source,
1244            "    fn dispatch_resident_sequence_read_ranges_into(\n",
1245            "    fn dispatch_resident_repeated_sequence_read_ranges_into(",
1246        );
1247        assert!(
1248            resident_sequence.contains("validate_resident_steps_for_dispatch(steps)?"),
1249            "Fix: CUDA resident sequence dispatch must validate every step Program before launch."
1250        );
1251
1252        let repeated_sequence = method_region(
1253            source,
1254            "    fn dispatch_resident_repeated_sequence_read_ranges_into(\n",
1255            "    fn compile_native(",
1256        );
1257        assert!(
1258            repeated_sequence.contains("validate_resident_steps_for_dispatch(prefix_steps)?")
1259                && repeated_sequence.contains("validate_resident_steps_for_dispatch(repeated_steps)?"),
1260            "Fix: CUDA repeated resident sequence dispatch must validate both prefix and repeated step Programs before launch."
1261        );
1262    }
1263
1264    fn method_region<'a>(source: &'a str, start: &str, end: &str) -> &'a str {
1265        source
1266            .split(start)
1267            .nth(1)
1268            .expect("Fix: replace expect with fallible API or document caller precondition; panic only on programmer error - method start must exist")
1269            .split(end)
1270            .next()
1271            .expect("Fix: replace expect with fallible API or document caller precondition; panic only on programmer error - method end must exist")
1272    }
1273}
1274