1#![deny(missing_docs)]
18#![allow(unsafe_code)]
24
25mod aot_launcher;
26pub mod backend;
28pub mod benchmark_pass_selection;
30pub mod codegen;
32pub mod device;
34pub mod device_diagnostic_aggregation;
36pub mod device_work_queue;
38pub mod egraph_device_image;
40pub mod egraph_kernel_plan;
42mod egraph_readback;
43pub mod frontier_typed_ir_adapter;
45mod input_identity;
46mod instrumentation;
47pub mod jit_cache;
52pub mod kernel_failure_diagnostics;
54pub mod launch_fusion;
56pub mod megakernel_plan_cache;
59pub mod multi_query_execution;
61mod numeric;
62pub mod occupancy;
67pub mod optimizer;
72mod pipeline;
73pub mod profiler;
75pub mod resident_graph_session;
77pub mod result_compaction;
79mod stream;
80pub mod synthetic_device_caps;
82pub mod token_fact_frontier_execution;
84pub mod token_fact_graph_cuda_adapter;
86
87pub use backend::{
88 CudaBackend, CudaPtxSourceCacheSnapshot, CudaResidentBuffer, CudaTelemetrySnapshot,
89};
90pub use benchmark_pass_selection::{
91 select_cuda_benchmark_passes, select_cuda_benchmark_passes_with_scratch,
92 CudaBenchmarkPassCandidate, CudaBenchmarkPassSelectionError, CudaBenchmarkPassSelectionPlan,
93 CudaBenchmarkPassSelectionSample, CudaBenchmarkPassSelectionScratch,
94 CudaBenchmarkPassSkipReason, CudaSkippedBenchmarkPass,
95};
96pub use stream::CudaLaunchResourceCounts;
97pub mod megakernel_barrier_planner;
99pub mod megakernel_convergence;
101pub mod megakernel_scheduler;
102pub mod megakernel_speedup_gate;
104pub use device::{CudaDeviceCaps, CudaDeviceHandle};
105pub use device_diagnostic_aggregation::{
106 plan_cuda_device_diagnostic_aggregation, plan_cuda_device_diagnostic_aggregation_with_scratch,
107 CudaDiagnosticAggregationError, CudaDiagnosticAggregationPlan,
108 CudaDiagnosticAggregationScratch, CudaDiagnosticCompactRange, CudaDiagnosticShard,
109};
110pub use device_work_queue::{
111 plan_cuda_device_work_queue, plan_cuda_device_work_queue_backpressure,
112 CudaDeviceWorkQueueBackpressurePlan, CudaDeviceWorkQueueDrainStrategy,
113 CudaDeviceWorkQueueError, CudaDeviceWorkQueuePlan, CudaDeviceWorkQueueProfile,
114 CudaWorkQueueHostSync,
115};
116pub use egraph_device_image::{
117 plan_cuda_egraph_device_upload, plan_cuda_egraph_device_upload_from_image,
118 plan_cuda_egraph_device_upload_from_image_ref, CudaEGraphDeviceBorrowedUploadPlan,
119 CudaEGraphDeviceByteLayout, CudaEGraphDeviceByteSpan, CudaEGraphDeviceKernelView,
120 CudaEGraphDeviceUploadError, CudaEGraphDeviceUploadPlan, CudaResidentEGraphDeviceImage,
121};
122pub use egraph_kernel_plan::{
123 collect_cuda_egraph_structural_equivalences, cuda_egraph_canonical_rewrite_kernel_ptx,
124 cuda_egraph_signature_pair_rows, cuda_egraph_signature_refresh_kernel_ptx,
125 cuda_egraph_structural_equivalence_kernel_ptx, pack_cuda_egraph_canonical_rewrite_device_image,
126 pack_cuda_egraph_signature_bucket_device_image, plan_cuda_egraph_kernel_work,
127 plan_cuda_egraph_signature_buckets, plan_cuda_egraph_signature_buckets_from_resident_snapshot,
128 plan_cuda_egraph_signature_buckets_from_signature_snapshot,
129 plan_cuda_egraph_structural_equivalence_launch_artifact,
130 plan_cuda_egraph_structural_equivalence_output, plan_cuda_egraph_structural_equivalences,
131 plan_cuda_egraph_union_compaction, CudaEGraphCanonicalRewrite,
132 CudaEGraphCanonicalRewriteDeviceImage, CudaEGraphCanonicalRewriteKernelPtx,
133 CudaEGraphCanonicalRewriteKernelResult, CudaEGraphFixedPointReadback,
134 CudaEGraphKernelLaunchConfig, CudaEGraphKernelPass, CudaEGraphKernelPlanError,
135 CudaEGraphKernelWave, CudaEGraphKernelWorkPlan, CudaEGraphResidentColumnSnapshot,
136 CudaEGraphResidentSignatureSnapshot, CudaEGraphSignatureBucket,
137 CudaEGraphSignatureBucketDeviceImage, CudaEGraphSignatureBucketPlan,
138 CudaEGraphSignaturePairWave, CudaEGraphSignatureRefreshKernelPtx,
139 CudaEGraphSignatureRefreshKernelResult, CudaEGraphStructuralCanonicalizationFixedPointReport,
140 CudaEGraphStructuralCanonicalizationFixedPointResult,
141 CudaEGraphStructuralCanonicalizationRoundResult, CudaEGraphStructuralEquivalenceKernelPtx,
142 CudaEGraphStructuralEquivalenceKernelResult, CudaEGraphStructuralEquivalenceLaunchArtifact,
143 CudaEGraphStructuralEquivalenceOutputPlan, CudaEGraphStructuralEquivalencePlan,
144 CudaEGraphUnionCompactionPass, CudaEGraphUnionCompactionPlan, CudaEGraphUnionCompactionWave,
145 CUDA_EGRAPH_CANONICAL_REWRITE_KERNEL_ENTRY, CUDA_EGRAPH_CANONICAL_REWRITE_KERNEL_PARAM_COUNT,
146 CUDA_EGRAPH_CANONICAL_REWRITE_RECORD_WORDS, CUDA_EGRAPH_SIGNATURE_BUCKET_RECORD_WORDS,
147 CUDA_EGRAPH_SIGNATURE_REFRESH_KERNEL_ENTRY, CUDA_EGRAPH_SIGNATURE_REFRESH_KERNEL_PARAM_COUNT,
148 CUDA_EGRAPH_STRUCTURAL_EQUIVALENCE_KERNEL_ENTRY,
149 CUDA_EGRAPH_STRUCTURAL_EQUIVALENCE_KERNEL_PARAM_COUNT,
150};
151pub use frontier_typed_ir_adapter::{
152 adapt_frontier_typed_ir_to_cuda, CudaFrontierTypedIrAdapterError, CudaFrontierTypedIrInput,
153};
154pub use kernel_failure_diagnostics::{
155 diagnose_cuda_kernel_launch, diagnose_cuda_kernel_launch_shape,
156 diagnose_cuda_kernel_launch_with_scratch, CudaKernelCapabilityFailure,
157 CudaKernelDeviceEnvelope, CudaKernelLaunchDiagnostic, CudaKernelLaunchDiagnosticRef,
158 CudaKernelLaunchDiagnosticScratch, CudaKernelLaunchEnvelope, CudaKernelLaunchEnvelopeError,
159 CudaKernelLaunchShape, CudaKernelRequirement,
160};
161pub use launch_fusion::{
162 plan_cuda_launch_fusion, plan_cuda_launch_fusion_with_scratch, CudaFusionStage,
163 CudaLaunchFusionError, CudaLaunchFusionGroup, CudaLaunchFusionPlan, CudaLaunchFusionScratch,
164};
165pub use megakernel_barrier_planner::{
166 plan_cuda_frontier_megakernel_execution, plan_cuda_frontier_megakernel_execution_with_scratch,
167 plan_cuda_megakernel_barriers, plan_cuda_megakernel_barriers_with_scratch,
168 CudaMegakernelBarrierGroup, CudaMegakernelBarrierPlan, CudaMegakernelBarrierPlanError,
169 CudaMegakernelBarrierScratch, CudaMegakernelFrontierExecutionPlan,
170 CudaMegakernelFrontierExecutionPlanError, CudaMegakernelFrontierWave,
171 CudaMegakernelWaveDependency,
172};
173pub use megakernel_convergence::{
174 plan_cuda_device_convergence, CudaConvergenceReadbackPolicy, CudaDeviceConvergencePlan,
175 CudaDeviceConvergencePlanError,
176};
177pub use megakernel_plan_cache::{
178 CudaMegakernelAnalysisKind, CudaMegakernelCachedPlan, CudaMegakernelDeviceKey,
179 CudaMegakernelPlanCache, CudaMegakernelPlanCacheKey, CudaMegakernelPlanCacheStats,
180};
181pub use megakernel_scheduler::{
182 plan_cuda_megakernel_execution, plan_cuda_megakernel_memory_budget,
183 schedule_megakernel_from_cuda_samples, schedule_megakernel_from_cuda_samples_into,
184 select_cuda_megakernel_topology, CudaMegakernelExecutionPlan, CudaMegakernelGraphShape,
185 CudaMegakernelMemoryBudget, CudaMegakernelMemoryError, CudaMegakernelMemoryPlan,
186 CudaMegakernelScheduleSample, CudaMegakernelTopology, CudaMegakernelTopologyDecision,
187};
188pub use megakernel_speedup_gate::{
189 format_validated_cuda_megakernel_speedup_evidence_csv,
190 validate_cuda_megakernel_speedup_evidence_csv, validate_cuda_megakernel_speedup_gate,
191 CudaMegakernelSpeedupGateError, CudaMegakernelSpeedupProof, CudaMegakernelSpeedupSample,
192 MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER,
193};
194pub use multi_query_execution::{
195 plan_cuda_multi_query_execution, plan_cuda_multi_query_execution_with_scratch, CudaMultiQuery,
196 CudaMultiQueryExecutionError, CudaMultiQueryExecutionPlan, CudaMultiQueryExecutionScratch,
197 CudaMultiQueryGroup,
198};
199pub use optimizer::CudaOptimizerDispatcher;
200pub use resident_graph_session::{
201 format_validated_cuda_resident_graph_session_evidence_csv, plan_cuda_resident_graph_session,
202 resident_graph_session_speedup_sample, CudaResidentGraphReadback,
203 CudaResidentGraphSessionError, CudaResidentGraphSessionEvidence,
204 CudaResidentGraphSessionEvidenceError, CudaResidentGraphSessionPlan,
205 CudaResidentGraphSessionProfile,
206};
207pub use result_compaction::{
208 plan_cuda_result_compaction, plan_cuda_result_compaction_with_scratch, CudaCompactResultRecord,
209 CudaResultCompactionError, CudaResultCompactionPlan, CudaResultCompactionScratch,
210 CudaResultSlot,
211};
212pub use token_fact_frontier_execution::{
213 plan_cuda_token_fact_frontier_execution, plan_cuda_token_fact_frontier_execution_with_scratch,
214 CudaTokenFactFrontierExecutionError, CudaTokenFactFrontierExecutionPlan,
215};
216pub use token_fact_graph_cuda_adapter::{
217 adapt_token_fact_graph_to_cuda_layout, CudaTokenFactGraphLayout, CudaTokenFactGraphLayoutError,
218};
219
220use std::sync::Arc;
221
222use crate::backend::staging_reserve::reserve_smallvec;
223use smallvec::SmallVec;
224use vyre_driver::{BackendError, BackendRegistration, DispatchConfig, Resource, VyreBackend};
225use vyre_foundation::ir::Program;
226
227pub const CUDA_BACKEND_ID: &str = "cuda";
229
230#[derive(Debug)]
242pub struct CudaDeviceBuffer {
243 backend_id: &'static str,
244 handle: backend::CudaResidentBuffer,
245}
246
247impl vyre_driver::DeviceBuffer for CudaDeviceBuffer {
248 fn backend_id(&self) -> &'static str {
249 self.backend_id
250 }
251
252 fn byte_len(&self) -> usize {
253 self.handle.byte_len
254 }
255
256 fn as_any(&self) -> &dyn std::any::Any {
257 self
258 }
259
260 fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
261 self
262 }
263}
264
265#[derive(Debug)]
270pub struct CudaBackendRegistration {
271 inner: CudaBackend,
272}
273
274impl CudaBackendRegistration {
275 #[must_use]
282 pub fn new(inner: CudaBackend) -> Self {
283 Self { inner }
284 }
285
286 #[must_use]
288 pub fn inner(&self) -> &CudaBackend {
289 &self.inner
290 }
291
292 #[must_use]
294 pub fn ptx_source_cache_snapshot(&self) -> CudaPtxSourceCacheSnapshot {
295 self.inner.ptx_source_cache_snapshot()
296 }
297
298 #[must_use]
300 pub fn telemetry_snapshot(&self) -> CudaTelemetrySnapshot {
301 self.inner.telemetry_snapshot()
302 }
303
304 pub fn reset_telemetry(&self) {
306 self.inner.reset_telemetry();
307 }
308
309 fn resolve_uploads<'a>(
310 &self,
311 uploads: &[(&Resource, &'a [u8])],
312 ) -> Result<SmallVec<[(CudaResidentBuffer, &'a [u8]); 8]>, BackendError> {
313 let mut concrete = SmallVec::<[(CudaResidentBuffer, &'a [u8]); 8]>::new();
314 reserve_smallvec(&mut concrete, uploads.len(), "CUDA resident upload handles")?;
315 for (resource, bytes) in uploads {
316 let handle = self.inner.resident_handle_from_resource(resource)?;
317 concrete.push((handle, *bytes));
318 }
319 Ok(concrete)
320 }
321
322 fn resolve_offset_uploads<'a>(
323 &self,
324 uploads: &[(&Resource, usize, &'a [u8])],
325 ) -> Result<SmallVec<[(CudaResidentBuffer, usize, &'a [u8]); 8]>, BackendError> {
326 let mut concrete = SmallVec::<[(CudaResidentBuffer, usize, &'a [u8]); 8]>::new();
327 reserve_smallvec(
328 &mut concrete,
329 uploads.len(),
330 "CUDA resident offset upload handles",
331 )?;
332 for (resource, dst_offset_bytes, bytes) in uploads {
333 let handle = self.inner.resident_handle_from_resource(resource)?;
334 concrete.push((handle, *dst_offset_bytes, *bytes));
335 }
336 Ok(concrete)
337 }
338
339 fn resolve_download_ranges(
340 &self,
341 ranges: &[(&Resource, usize, usize)],
342 ) -> Result<SmallVec<[(CudaResidentBuffer, usize, usize); 8]>, BackendError> {
343 let mut concrete = SmallVec::<[(CudaResidentBuffer, usize, usize); 8]>::new();
344 reserve_smallvec(
345 &mut concrete,
346 ranges.len(),
347 "CUDA resident download range handles",
348 )?;
349 for (resource, byte_offset, byte_len) in ranges {
350 let handle = self.inner.resident_handle_from_resource(resource)?;
351 concrete.push((handle, *byte_offset, *byte_len));
352 }
353 Ok(concrete)
354 }
355
356 fn resolve_read_ranges(
357 &self,
358 read_ranges: &[vyre_driver::backend::ResidentReadRange<'_>],
359 ) -> Result<
360 (
361 SmallVec<[CudaResidentBuffer; 8]>,
362 SmallVec<[crate::backend::output_range::CudaOutputReadback; 8]>,
363 ),
364 BackendError,
365 > {
366 let mut handles = SmallVec::<[CudaResidentBuffer; 8]>::new();
367 let mut concrete_readbacks =
368 SmallVec::<[crate::backend::output_range::CudaOutputReadback; 8]>::new();
369 reserve_smallvec(
370 &mut handles,
371 read_ranges.len(),
372 "CUDA resident read handles",
373 )?;
374 reserve_smallvec(
375 &mut concrete_readbacks,
376 read_ranges.len(),
377 "CUDA resident readback ranges",
378 )?;
379 for range in read_ranges {
380 handles.push(self.inner.resident_handle_from_resource(range.resource)?);
381 concrete_readbacks.push(crate::backend::output_range::CudaOutputReadback {
382 device_offset: range.byte_offset,
383 byte_len: range.byte_len,
384 });
385 }
386 Ok((handles, concrete_readbacks))
387 }
388
389 fn resolve_step_handle_sets(
390 &self,
391 steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
392 field: &'static str,
393 ) -> Result<SmallVec<[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>, BackendError>
394 {
395 let mut handle_sets =
396 SmallVec::<[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>::new();
397 reserve_smallvec(&mut handle_sets, steps.len(), field)?;
398 for step in steps {
399 handle_sets.push(self.inner.resident_handles_from_resources(step.resources)?);
400 }
401 Ok(handle_sets)
402 }
403
404 fn resolve_repeated_step_handle_sets(
405 &self,
406 steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
407 repeat_count: usize,
408 ) -> Result<SmallVec<[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>, BackendError>
409 {
410 let mut handle_sets =
411 SmallVec::<[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>::new();
412 let capacity = if repeat_count == 0 { 0 } else { steps.len() };
413 reserve_smallvec(
414 &mut handle_sets,
415 capacity,
416 "CUDA repeated resident repeated handle sets",
417 )?;
418 if repeat_count != 0 {
419 for step in steps {
420 handle_sets.push(self.inner.resident_handles_from_resources(step.resources)?);
421 }
422 }
423 Ok(handle_sets)
424 }
425
426 fn concrete_resident_steps<'program: 'handles, 'handles>(
427 steps: &[vyre_driver::backend::ResidentDispatchStep<'program>],
428 handle_sets: &'handles [SmallVec<[crate::backend::CudaResidentBuffer; 8]>],
429 field: &'static str,
430 ) -> Result<SmallVec<[crate::backend::CudaResidentDispatchStep<'handles>; 8]>, BackendError>
431 {
432 let mut concrete_steps =
433 SmallVec::<[crate::backend::CudaResidentDispatchStep<'handles>; 8]>::new();
434 reserve_smallvec(&mut concrete_steps, handle_sets.len(), field)?;
435 for (step, handles) in steps.iter().zip(handle_sets.iter()) {
436 let mut config = DispatchConfig::default();
437 config.grid_override = step.grid_override;
438 concrete_steps.push(crate::backend::CudaResidentDispatchStep {
439 program: step.program,
440 handles,
441 config,
442 });
443 }
444 Ok(concrete_steps)
445 }
446
447 pub fn allocated_transient_allocation_bytes(&self) -> Result<usize, BackendError> {
456 self.inner.allocated_transient_allocation_bytes()
457 }
458
459 fn reject_grid_sync_without_native_lowering(
460 &self,
461 program: &Program,
462 ) -> Result<(), BackendError> {
463 if vyre_driver::grid_sync::contains_grid_sync(program) && !self.supports_grid_sync() {
464 return Err(BackendError::UnsupportedFeature {
465 name: "cuda_native_grid_sync_lowering (MemoryOrdering::GridSync requires explicit split routing or native cooperative-grid barrier lowering)"
466 .to_string(),
467 backend: CUDA_BACKEND_ID.to_string(),
468 });
469 }
470 Ok(())
471 }
472
473 fn validate_program_for_dispatch(&self, program: &Program) -> Result<(), BackendError> {
474 let required = vyre_foundation::program_caps::scan(program);
475 vyre_foundation::program_caps::check_backend_capabilities(
476 CUDA_BACKEND_ID,
477 self.supports_subgroup_ops(),
478 self.supports_f16(),
479 self.supports_bf16(),
480 self.supports_indirect_dispatch(),
481 true,
482 self.supports_distributed_collectives(),
483 self.max_workgroup_size(),
484 &required,
485 )
486 .map_err(|error| BackendError::InvalidProgram {
487 fix: error.to_string(),
488 })?;
489 self.reject_grid_sync_without_native_lowering(program)
490 }
491
492 fn validate_resident_steps_for_dispatch(
493 &self,
494 steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
495 ) -> Result<(), BackendError> {
496 for step in steps {
497 self.validate_program_for_dispatch(step.program)?;
498 }
499 Ok(())
500 }
501}
502
503
504impl vyre_driver::backend::private::Sealed for CudaBackendRegistration {}
505
506impl VyreBackend for CudaBackendRegistration {
507 fn id(&self) -> &'static str {
508 CUDA_BACKEND_ID
509 }
510
511 fn version(&self) -> &'static str {
512 env!("CARGO_PKG_VERSION")
513 }
514
515 fn dispatch(
516 &self,
517 program: &Program,
518 inputs: &[Vec<u8>],
519 config: &DispatchConfig,
520 ) -> Result<Vec<Vec<u8>>, BackendError> {
521 self.validate_program_for_dispatch(program)?;
522 self.inner.dispatch(program, inputs, config)
523 }
524
525 fn dispatch_async(
526 &self,
527 program: &Program,
528 inputs: &[Vec<u8>],
529 config: &DispatchConfig,
530 ) -> Result<Box<dyn vyre_driver::PendingDispatch>, BackendError> {
531 self.validate_program_for_dispatch(program)?;
532 self.inner.dispatch_async(program, inputs, config)
533 }
534
535 fn dispatch_borrowed_async(
536 &self,
537 program: &Program,
538 inputs: &[&[u8]],
539 config: &DispatchConfig,
540 ) -> Result<Box<dyn vyre_driver::PendingDispatch>, BackendError> {
541 self.validate_program_for_dispatch(program)?;
542 self.inner.dispatch_borrowed_async(program, inputs, config)
543 }
544
545 fn dispatch_borrowed(
546 &self,
547 program: &Program,
548 inputs: &[&[u8]],
549 config: &DispatchConfig,
550 ) -> Result<Vec<Vec<u8>>, BackendError> {
551 self.validate_program_for_dispatch(program)?;
552 self.inner
553 .dispatch_borrowed_async(program, inputs, config)?
554 .await_result()
555 }
556
557 fn dispatch_borrowed_into(
558 &self,
559 program: &Program,
560 inputs: &[&[u8]],
561 config: &DispatchConfig,
562 outputs: &mut vyre_driver::OutputBuffers,
563 ) -> Result<(), BackendError> {
564 self.validate_program_for_dispatch(program)?;
565 self.inner
566 .dispatch_borrowed_async(program, inputs, config)?
567 .await_result_into(outputs)
568 }
569
570 fn dispatch_borrowed_timed(
571 &self,
572 program: &Program,
573 inputs: &[&[u8]],
574 config: &DispatchConfig,
575 ) -> Result<vyre_driver::TimedDispatchResult, BackendError> {
576 self.validate_program_for_dispatch(program)?;
577 self.inner.dispatch_borrowed_timed(program, inputs, config)
578 }
579
580 fn allocate_resident(&self, byte_len: usize) -> Result<Resource, BackendError> {
581 self.inner
582 .allocate_resident(byte_len)
583 .map(|handle| Resource::Resident(handle.id))
584 }
585
586 fn allocate_device_buffer(
587 &self,
588 byte_len: usize,
589 ) -> Result<Box<dyn vyre_driver::DeviceBuffer>, BackendError> {
590 let handle = self.inner.allocate_resident(byte_len)?;
591 Ok(Box::new(CudaDeviceBuffer {
592 backend_id: CUDA_BACKEND_ID,
593 handle,
594 }))
595 }
596
597 fn upload_device_buffer(
598 &self,
599 buffer: &mut dyn vyre_driver::DeviceBuffer,
600 bytes: &[u8],
601 ) -> Result<(), BackendError> {
602 let backend_id = buffer.backend_id().to_string();
603 let handle = buffer
604 .as_any_mut()
605 .downcast_mut::<CudaDeviceBuffer>()
606 .map(|cuda_buf| cuda_buf.handle)
607 .ok_or_else(|| BackendError::InvalidProgram {
608 fix: format!(
609 "Fix: upload_device_buffer expected a CudaDeviceBuffer (allocated by `cuda` backend) but got buffer owned by `{backend_id}`."
610 ),
611 })?;
612 self.inner.upload_resident(handle, bytes)
613 }
614
615 fn download_device_buffer(
616 &self,
617 buffer: &dyn vyre_driver::DeviceBuffer,
618 ) -> Result<Vec<u8>, BackendError> {
619 let cuda_buf = buffer
620 .as_any()
621 .downcast_ref::<CudaDeviceBuffer>()
622 .ok_or_else(|| BackendError::InvalidProgram {
623 fix: format!(
624 "Fix: download_device_buffer expected a CudaDeviceBuffer (allocated by `cuda` backend) but got buffer owned by `{}`.",
625 buffer.backend_id()
626 ),
627 })?;
628 self.inner.download_resident(cuda_buf.handle)
629 }
630
631 fn free_device_buffer(
632 &self,
633 buffer: Box<dyn vyre_driver::DeviceBuffer>,
634 ) -> Result<(), BackendError> {
635 let backend_id = buffer.backend_id().to_string();
636 let handle = buffer
637 .as_any()
638 .downcast_ref::<CudaDeviceBuffer>()
639 .map(|cuda_buf| cuda_buf.handle)
640 .ok_or_else(|| BackendError::InvalidProgram {
641 fix: format!(
642 "Fix: free_device_buffer expected a CudaDeviceBuffer but got buffer owned by `{backend_id}`."
643 ),
644 })?;
645 drop(buffer);
649 self.inner.free_resident(handle)
650 }
651
652 fn dispatch_with_device_buffers(
653 &self,
654 program: &Program,
655 inputs: &[&dyn vyre_driver::DeviceBuffer],
656 outputs: &mut [&mut dyn vyre_driver::DeviceBuffer],
657 config: &DispatchConfig,
658 ) -> Result<(), BackendError> {
659 self.validate_program_for_dispatch(program)?;
660 vyre_driver::validate_buffer_ownership(self.id(), inputs.iter().copied())?;
666 vyre_driver::validate_buffer_ownership(
667 self.id(),
668 outputs
669 .iter()
670 .map(|b| &**b as &dyn vyre_driver::DeviceBuffer),
671 )?;
672 let resource_capacity =
673 inputs
674 .len()
675 .checked_add(outputs.len())
676 .ok_or_else(|| BackendError::InvalidProgram {
677 fix: format!(
678 "Fix: CUDA borrowed dispatch resource capacity overflowed usize for {} input buffer(s) plus {} output buffer(s); split the dispatch.",
679 inputs.len(),
680 outputs.len()
681 ),
682 })?;
683 let mut handles = SmallVec::<[CudaResidentBuffer; 8]>::new();
684 reserve_smallvec(
685 &mut handles,
686 resource_capacity,
687 "CUDA borrowed dispatch resource handles",
688 )?;
689 for buffer in inputs {
690 let handle = buffer
691 .as_any()
692 .downcast_ref::<CudaDeviceBuffer>()
693 .ok_or_else(|| BackendError::InvalidProgram {
694 fix: format!(
695 "Fix: dispatch_with_device_buffers expected CudaDeviceBuffer inputs but got buffer owned by `{}`.",
696 buffer.backend_id()
697 ),
698 })?
699 .handle;
700 handles.push(handle);
701 }
702 for buffer in outputs.iter() {
703 let backend_id = buffer.backend_id().to_string();
704 let handle = buffer
705 .as_any()
706 .downcast_ref::<CudaDeviceBuffer>()
707 .ok_or_else(|| BackendError::InvalidProgram {
708 fix: format!(
709 "Fix: dispatch_with_device_buffers expected CudaDeviceBuffer outputs but got buffer owned by `{backend_id}`."
710 ),
711 })?
712 .handle;
713 handles.push(handle);
714 }
715 let _timed = self
716 .inner
717 .dispatch_resident_timed(program, &handles, config)?;
718 Ok(())
719 }
720
721 fn upload_resident(&self, resource: &Resource, bytes: &[u8]) -> Result<(), BackendError> {
722 let handle = self.inner.resident_handle_from_resource(resource)?;
723 self.inner.upload_resident(handle, bytes)
724 }
725
726 fn upload_resident_many(&self, uploads: &[(&Resource, &[u8])]) -> Result<(), BackendError> {
727 let concrete = self.resolve_uploads(uploads)?;
728 self.inner.upload_resident_many(&concrete)
729 }
730
731 fn upload_resident_at(
732 &self,
733 resource: &Resource,
734 dst_offset_bytes: usize,
735 bytes: &[u8],
736 ) -> Result<(), BackendError> {
737 let handle = self.inner.resident_handle_from_resource(resource)?;
738 self.inner
739 .upload_resident_at(handle, dst_offset_bytes, bytes)
740 }
741
742 fn upload_resident_at_many(
743 &self,
744 uploads: &[(&Resource, usize, &[u8])],
745 ) -> Result<(), BackendError> {
746 let concrete = self.resolve_offset_uploads(uploads)?;
747 self.inner.upload_resident_at_many(&concrete)
748 }
749
750 fn download_resident(&self, resource: &Resource) -> Result<Vec<u8>, BackendError> {
751 let handle = self.inner.resident_handle_from_resource(resource)?;
752 self.inner.download_resident(handle)
753 }
754
755 fn download_resident_into(
756 &self,
757 resource: &Resource,
758 out: &mut Vec<u8>,
759 ) -> Result<(), BackendError> {
760 let handle = self.inner.resident_handle_from_resource(resource)?;
761 self.inner.download_resident_into(handle, out)
762 }
763
764 fn download_resident_range(
765 &self,
766 resource: &Resource,
767 byte_offset: usize,
768 byte_len: usize,
769 ) -> Result<Vec<u8>, BackendError> {
770 let handle = self.inner.resident_handle_from_resource(resource)?;
771 self.inner
772 .download_resident_range(handle, byte_offset, byte_len)
773 }
774
775 fn download_resident_range_into(
776 &self,
777 resource: &Resource,
778 byte_offset: usize,
779 byte_len: usize,
780 out: &mut Vec<u8>,
781 ) -> Result<(), BackendError> {
782 let handle = self.inner.resident_handle_from_resource(resource)?;
783 self.inner
784 .download_resident_range_into(handle, byte_offset, byte_len, out)
785 }
786
787 fn download_resident_ranges_into(
788 &self,
789 ranges: &[(&Resource, usize, usize)],
790 outputs: &mut [&mut Vec<u8>],
791 ) -> Result<(), BackendError> {
792 let concrete = self.resolve_download_ranges(ranges)?;
793 self.inner.download_resident_ranges_into(&concrete, outputs)
794 }
795
796 fn free_resident(&self, resource: Resource) -> Result<(), BackendError> {
797 let handle = self.inner.resident_handle_from_resource(&resource)?;
798 self.inner.free_resident(handle)
799 }
800
801 fn dispatch_resident_timed(
802 &self,
803 program: &Program,
804 resources: &[Resource],
805 config: &DispatchConfig,
806 ) -> Result<vyre_driver::TimedDispatchResult, BackendError> {
807 self.validate_program_for_dispatch(program)?;
808 let handles = self.inner.resident_handles_from_resources(resources)?;
809 self.inner
810 .dispatch_resident_timed(program, &handles, config)
811 }
812
813 fn dispatch_resident_sequence_read_ranges_into(
814 &self,
815 steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
816 read_ranges: &[vyre_driver::backend::ResidentReadRange<'_>],
817 outputs: &mut [&mut Vec<u8>],
818 ) -> Result<(), BackendError> {
819 self.validate_resident_steps_for_dispatch(steps)?;
820 if read_ranges.len() != outputs.len() {
821 return Err(BackendError::InvalidProgram {
822 fix: format!(
823 "Fix: CUDA resident sequence ranged readback expected matching range/output counts but got {} range(s) and {} output(s).",
824 read_ranges.len(),
825 outputs.len()
826 ),
827 });
828 }
829 let handle_sets =
830 self.resolve_step_handle_sets(steps, "CUDA resident sequence handle sets")?;
831 let concrete_steps =
832 Self::concrete_resident_steps(steps, &handle_sets, "CUDA resident sequence steps")?;
833
834 let (read_handles, concrete_readbacks) = self.resolve_read_ranges(read_ranges)?;
835
836 let uploads: [(crate::backend::CudaResidentBuffer, &[u8]); 0] = [];
837 self.inner
838 .upload_resident_many_sequence_read_ranges_borrowed_into(
839 &uploads,
840 &concrete_steps,
841 &read_handles,
842 &concrete_readbacks,
843 outputs,
844 )
845 }
846
847 fn dispatch_resident_repeated_sequence_read_ranges_into(
848 &self,
849 prefix_steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
850 repeated_steps: &[vyre_driver::backend::ResidentDispatchStep<'_>],
851 repeat_count: u32,
852 read_ranges: &[vyre_driver::backend::ResidentReadRange<'_>],
853 outputs: &mut [&mut Vec<u8>],
854 ) -> Result<(), BackendError> {
855 self.validate_resident_steps_for_dispatch(prefix_steps)?;
856 self.validate_resident_steps_for_dispatch(repeated_steps)?;
857 let repeat_count =
858 usize::try_from(repeat_count).map_err(|error| BackendError::InvalidProgram {
859 fix: format!(
860 "Fix: CUDA repeated resident sequence count does not fit usize: {error}."
861 ),
862 })?;
863 if read_ranges.len() != outputs.len() {
864 return Err(BackendError::InvalidProgram {
865 fix: format!(
866 "Fix: CUDA repeated resident sequence ranged readback expected matching range/output counts but got {} range(s) and {} output(s).",
867 read_ranges.len(),
868 outputs.len()
869 ),
870 });
871 }
872
873 let prefix_handle_sets = self
874 .resolve_step_handle_sets(prefix_steps, "CUDA repeated resident prefix handle sets")?;
875 let repeated_handle_sets =
876 self.resolve_repeated_step_handle_sets(repeated_steps, repeat_count)?;
877 let concrete_prefix = Self::concrete_resident_steps(
878 prefix_steps,
879 &prefix_handle_sets,
880 "CUDA repeated resident prefix steps",
881 )?;
882 let concrete_repeated = Self::concrete_resident_steps(
883 repeated_steps,
884 &repeated_handle_sets,
885 "CUDA repeated resident repeated steps",
886 )?;
887
888 let (read_handles, concrete_readbacks) = self.resolve_read_ranges(read_ranges)?;
889 let uploads: [(crate::backend::CudaResidentBuffer, &[u8]); 0] = [];
890 self.inner
891 .upload_resident_many_repeated_sequence_read_ranges_borrowed_into(
892 &uploads,
893 &concrete_prefix,
894 &concrete_repeated,
895 repeat_count,
896 &read_handles,
897 &concrete_readbacks,
898 outputs,
899 )
900 }
901
902 fn compile_native(
903 &self,
904 program: &Program,
905 config: &DispatchConfig,
906 ) -> Result<Option<Arc<dyn vyre_driver::CompiledPipeline>>, BackendError> {
907 self.validate_program_for_dispatch(program)?;
908 self.inner.compile_native(program, config).map(Some)
909 }
910
911 fn compile_native_shared(
912 &self,
913 program: Arc<Program>,
914 config: &DispatchConfig,
915 ) -> Result<Option<Arc<dyn vyre_driver::CompiledPipeline>>, BackendError> {
916 self.validate_program_for_dispatch(program.as_ref())?;
917 self.inner.compile_native_shared(program, config).map(Some)
918 }
919
920 fn pipeline_cache_snapshot(&self) -> Option<vyre_driver::pipeline::PipelineCacheSnapshot> {
921 Some(self.inner.pipeline_cache_snapshot())
922 }
923
924 fn backend_metric_snapshot(&self) -> Vec<(&'static str, u64)> {
925 let source_cache = self.inner.ptx_source_cache_snapshot();
926 let mut metrics = Vec::new();
927 match u64::try_from(source_cache.entries) {
928 Ok(entries) => metrics.push(("cuda_ptx_source_cache_entries", entries)),
929 Err(source) => {
930 tracing::error!(
931 "CUDA PTX source cache entry count cannot fit u64: {source}. Fix: shard backend metrics before source-cache cardinality exceeds u64."
932 );
933 metrics.push(("cuda_ptx_source_cache_entries_unrepresentable", 1));
934 }
935 }
936 metrics.push(("cuda_ptx_source_cache_hits", source_cache.hits));
937 metrics.push(("cuda_ptx_source_cache_misses", source_cache.misses));
938 let telemetry = self.inner.telemetry_snapshot();
939 metrics.push(("cuda_timed_dispatches", telemetry.timed_dispatches));
940 metrics.push((
941 "cuda_timed_device_measurements",
942 telemetry.timed_device_measurements,
943 ));
944 metrics.push((
945 "cuda_timed_dispatches_missing_device_time",
946 telemetry.timed_dispatches_missing_device_time,
947 ));
948 metrics.push(("cuda_timed_wall_ns_total", telemetry.timed_wall_ns_total));
949 metrics.push((
950 "cuda_timed_device_ns_total",
951 telemetry.timed_device_ns_total,
952 ));
953 metrics.push(("cuda_timed_device_ns_max", telemetry.timed_device_ns_max));
954 metrics.push((
955 "cuda_timed_enqueue_ns_total",
956 telemetry.timed_enqueue_ns_total,
957 ));
958 metrics.push(("cuda_timed_wait_ns_total", telemetry.timed_wait_ns_total));
959 metrics
960 }
961
962 fn supports_subgroup_ops(&self) -> bool {
963 self.inner.hardware_supports_subgroup_ops()
964 }
965
966 fn supports_f16(&self) -> bool {
967 self.inner.hardware_supports_f16()
968 }
969
970 fn supports_bf16(&self) -> bool {
971 self.inner.hardware_supports_bf16()
972 }
973
974 fn supports_tensor_cores(&self) -> bool {
975 self.inner.hardware_supports_tensor_cores() && self.inner.lowers_tensor_core_ops()
976 }
977
978 fn supports_async_compute(&self) -> bool {
979 self.inner.hardware_supports_async_compute()
980 }
981
982 fn supports_grid_sync(&self) -> bool {
983 self.inner.supports_grid_sync()
984 }
985
986 fn allows_host_grid_sync_split(&self) -> bool {
987 false
988 }
989
990 fn supports_speculation(&self) -> bool {
991 false
992 }
993
994 fn max_workgroup_size(&self) -> [u32; 3] {
995 self.inner.max_block_dim()
996 }
997
998 fn max_compute_workgroups_per_dimension(&self) -> u32 {
999 self.inner.max_grid_dim()[0]
1000 }
1001
1002 fn max_compute_invocations_per_workgroup(&self) -> u32 {
1003 self.inner.max_threads_per_block()
1004 }
1005
1006 fn subgroup_size(&self) -> Option<u32> {
1007 self.inner.warp_size()
1008 }
1009
1010 fn max_storage_buffer_bytes(&self) -> u64 {
1011 self.inner.device_memory_bytes()
1012 }
1013
1014 fn device_profile(&self) -> vyre_driver::DeviceProfile {
1015 let mut profile = self.inner.caps.to_device_profile();
1016 profile.supports_tensor_cores = self.supports_tensor_cores();
1017 profile.supports_indirect_dispatch = self.supports_indirect_dispatch();
1018 profile
1019 }
1020
1021 fn prepare(&self) -> Result<(), BackendError> {
1022 self.inner.warmup()
1023 }
1024
1025 fn shutdown(&self) -> Result<(), BackendError> {
1026 self.inner.cleanup()
1027 }
1028}
1029
1030pub fn cuda_factory() -> Result<Box<dyn VyreBackend>, BackendError> {
1033 let backend = CudaBackend::acquire().map_err(|e| BackendError::DispatchFailed {
1034 code: None,
1035 message: format!("CUDA backend acquisition failed: {e}"),
1036 })?;
1037 Ok(Box::new(CudaBackendRegistration { inner: backend }))
1038}
1039
1040pub fn cuda_supported_ops() -> &'static std::collections::HashSet<vyre_foundation::ir::OpId> {
1043 vyre_driver::backend::validation::default_supported_ops_with_trap()
1044}
1045
1046inventory::submit! {
1047 BackendRegistration {
1048 id: CUDA_BACKEND_ID,
1049 factory: cuda_factory,
1050 supported_ops: cuda_supported_ops,
1051 }
1052}
1053
1054inventory::submit! {
1056 vyre_driver::backend::BackendPrecedence {
1057 id: CUDA_BACKEND_ID,
1058 rank: 5,
1059 }
1060}
1061
1062inventory::submit! {
1064 vyre_driver::backend::BackendCapability {
1065 id: CUDA_BACKEND_ID,
1066 dispatches: true,
1067 }
1068}
1069
1070fn emit_aot_bytes(program: &Program, config: &DispatchConfig) -> Result<Vec<u8>, String> {
1071 let backend = CudaBackend::acquire().map_err(|error| {
1072 format!(
1073 "CUDA PTX AOT emission could not probe the live device target: {error}. Fix: run AOT emission on a host with the CUDA driver and target GPU visible."
1074 )
1075 })?;
1076 crate::codegen::program_to_ptx_for_sm_and_subgroup(
1077 program,
1078 config,
1079 backend.ptx_target_sm(),
1080 backend.warp_size().ok_or_else(|| {
1081 "CUDA PTX AOT emission could not read a hardware warp size from the live device probe. Fix: repair CUDA capability probing before AOT emission.".to_string()
1082 })?,
1083 )
1084 .map(String::into_bytes)
1085}
1086
1087inventory::submit! {
1088 vyre_driver::aot::AotEmitter {
1089 target: "secondary_text",
1090 emit: emit_aot_bytes,
1091 }
1092}
1093
1094inventory::submit! {
1095 vyre_driver::aot::AotLauncherEmitter {
1096 target: "secondary_text",
1097 emit: aot_launcher::emit_launcher,
1098 }
1099}
1100
1101#[cfg(test)]
1102mod tests {
1103 #[test]
1104 fn public_cuda_resident_helpers_reserve_smallvecs_fallibly() {
1105 let source = include_str!("lib.rs");
1106 assert!(
1107 source.contains("use crate::backend::staging_reserve::reserve_smallvec;"),
1108 "Fix: public CUDA resident helpers must use the shared fallible staging reservation contract."
1109 );
1110 assert!(
1111 !source.contains(concat!(
1112 "SmallVec::<",
1113 "[(CudaResidentBuffer, &'a [u8]); 8]>::with_capacity"
1114 )) && !source.contains(concat!(
1115 "SmallVec::<",
1116 "[(CudaResidentBuffer, usize, &'a [u8]); 8]>::with_capacity"
1117 )) && !source.contains(concat!(
1118 "SmallVec::<",
1119 "[(CudaResidentBuffer, usize, usize); 8]>::with_capacity"
1120 )) && !source.contains(concat!(
1121 "SmallVec::<",
1122 "[CudaResidentBuffer; 8]>::with_capacity"
1123 )) && !source.contains(concat!(
1124 "SmallVec::<",
1125 "[SmallVec<[crate::backend::CudaResidentBuffer; 8]>; 8]>::with_capacity"
1126 )) && !source.contains(concat!(
1127 "SmallVec::<",
1128 "[crate::backend::CudaResidentDispatchStep<'_>; 8]>::with_capacity"
1129 )),
1130 "Fix: public CUDA resident helpers must reserve fallibly instead of using infallible SmallVec capacity growth."
1131 );
1132 assert!(
1133 source.contains("CUDA resident sequence handle sets")
1134 && source.contains("CUDA repeated resident repeated steps")
1135 && source.contains("CUDA borrowed dispatch resource handles"),
1136 "Fix: public CUDA resident sequence and borrowed-buffer staging paths must expose specific fallible-reservation labels."
1137 );
1138 assert!(
1139 source.contains("fn resolve_step_handle_sets")
1140 && source.contains("fn resolve_repeated_step_handle_sets")
1141 && source.contains("fn concrete_resident_steps"),
1142 "Fix: public CUDA resident sequence paths must share one handle-set and concrete-step staging implementation."
1143 );
1144 }
1145
1146 #[test]
1147 fn public_cuda_execution_entrypoints_share_capability_validation() {
1148 let source = include_str!("lib.rs");
1149 assert!(
1150 source.contains("fn validate_program_for_dispatch")
1151 && source.contains("check_backend_capabilities")
1152 && source.contains("reject_grid_sync_without_native_lowering(program)"),
1153 "Fix: CUDA dispatch validation must centralize capability and grid-sync checks before launch/lowering."
1154 );
1155
1156 for (name, body) in [
1157 (
1158 "dispatch",
1159 method_region(source, " fn dispatch(\n", " fn dispatch_async("),
1160 ),
1161 (
1162 "dispatch_async",
1163 method_region(
1164 source,
1165 " fn dispatch_async(\n",
1166 " fn dispatch_borrowed_async(",
1167 ),
1168 ),
1169 (
1170 "dispatch_borrowed_async",
1171 method_region(
1172 source,
1173 " fn dispatch_borrowed_async(\n",
1174 " fn dispatch_borrowed(",
1175 ),
1176 ),
1177 (
1178 "dispatch_borrowed",
1179 method_region(
1180 source,
1181 " fn dispatch_borrowed(\n",
1182 " fn dispatch_borrowed_into(",
1183 ),
1184 ),
1185 (
1186 "dispatch_borrowed_into",
1187 method_region(
1188 source,
1189 " fn dispatch_borrowed_into(\n",
1190 " fn dispatch_borrowed_timed(",
1191 ),
1192 ),
1193 (
1194 "dispatch_borrowed_timed",
1195 method_region(
1196 source,
1197 " fn dispatch_borrowed_timed(\n",
1198 " fn allocate_resident(",
1199 ),
1200 ),
1201 (
1202 "dispatch_with_device_buffers",
1203 method_region(
1204 source,
1205 " fn dispatch_with_device_buffers(\n",
1206 " fn upload_resident(",
1207 ),
1208 ),
1209 (
1210 "dispatch_resident_timed",
1211 method_region(
1212 source,
1213 " fn dispatch_resident_timed(\n",
1214 " fn dispatch_resident_sequence_read_ranges_into(",
1215 ),
1216 ),
1217 (
1218 "compile_native",
1219 method_region(
1220 source,
1221 " fn compile_native(\n",
1222 " fn compile_native_shared(",
1223 ),
1224 ),
1225 ] {
1226 assert!(
1227 body.contains("validate_program_for_dispatch(program)?"),
1228 "Fix: CUDA {name} must run the shared capability/grid-sync validation gate before lowering or launch."
1229 );
1230 }
1231
1232 let compile_shared = method_region(
1233 source,
1234 " fn compile_native_shared(\n",
1235 " fn pipeline_cache_snapshot(",
1236 );
1237 assert!(
1238 compile_shared.contains("validate_program_for_dispatch(program.as_ref())?"),
1239 "Fix: CUDA compile_native_shared must validate the shared Program before lowering."
1240 );
1241
1242 let resident_sequence = method_region(
1243 source,
1244 " fn dispatch_resident_sequence_read_ranges_into(\n",
1245 " fn dispatch_resident_repeated_sequence_read_ranges_into(",
1246 );
1247 assert!(
1248 resident_sequence.contains("validate_resident_steps_for_dispatch(steps)?"),
1249 "Fix: CUDA resident sequence dispatch must validate every step Program before launch."
1250 );
1251
1252 let repeated_sequence = method_region(
1253 source,
1254 " fn dispatch_resident_repeated_sequence_read_ranges_into(\n",
1255 " fn compile_native(",
1256 );
1257 assert!(
1258 repeated_sequence.contains("validate_resident_steps_for_dispatch(prefix_steps)?")
1259 && repeated_sequence.contains("validate_resident_steps_for_dispatch(repeated_steps)?"),
1260 "Fix: CUDA repeated resident sequence dispatch must validate both prefix and repeated step Programs before launch."
1261 );
1262 }
1263
1264 fn method_region<'a>(source: &'a str, start: &str, end: &str) -> &'a str {
1265 source
1266 .split(start)
1267 .nth(1)
1268 .expect("Fix: replace expect with fallible API or document caller precondition; panic only on programmer error - method start must exist")
1269 .split(end)
1270 .next()
1271 .expect("Fix: replace expect with fallible API or document caller precondition; panic only on programmer error - method end must exist")
1272 }
1273}
1274