Expand description
§vyre-driver-cuda - CUDA/PTX backend for vyre
Implements VyreBackend via the CUDA driver API through cudarc.
Translates vyre Program IR into PTX kernels, loads them through
the CUDA driver JIT, and dispatches on NVIDIA GPUs.
The backend registers itself as "cuda" in the vyre backend registry
via inventory::submit! so vyre::registered_backends() enumerates
it alongside wgpu, spirv, etc.
§Architecture
Program ─► PTX emitter ─► cuModuleLoadData ─► cuLaunchKernel
Re-exports§
pub use backend::CudaBackend;pub use backend::CudaPtxSourceCacheSnapshot;pub use backend::CudaResidentBuffer;pub use backend::CudaTelemetrySnapshot;pub use benchmark_pass_selection::select_cuda_benchmark_passes;pub use benchmark_pass_selection::select_cuda_benchmark_passes_with_scratch;pub use benchmark_pass_selection::CudaBenchmarkPassCandidate;pub use benchmark_pass_selection::CudaBenchmarkPassSelectionError;pub use benchmark_pass_selection::CudaBenchmarkPassSelectionPlan;pub use benchmark_pass_selection::CudaBenchmarkPassSelectionSample;pub use benchmark_pass_selection::CudaBenchmarkPassSelectionScratch;pub use benchmark_pass_selection::CudaBenchmarkPassSkipReason;pub use benchmark_pass_selection::CudaSkippedBenchmarkPass;pub use device::CudaDeviceCaps;pub use device::CudaDeviceHandle;pub use device_diagnostic_aggregation::plan_cuda_device_diagnostic_aggregation;pub use device_diagnostic_aggregation::plan_cuda_device_diagnostic_aggregation_with_scratch;pub use device_diagnostic_aggregation::CudaDiagnosticAggregationError;pub use device_diagnostic_aggregation::CudaDiagnosticAggregationPlan;pub use device_diagnostic_aggregation::CudaDiagnosticAggregationScratch;pub use device_diagnostic_aggregation::CudaDiagnosticCompactRange;pub use device_diagnostic_aggregation::CudaDiagnosticShard;pub use device_work_queue::plan_cuda_device_work_queue;pub use device_work_queue::plan_cuda_device_work_queue_backpressure;pub use device_work_queue::CudaDeviceWorkQueueBackpressurePlan;pub use device_work_queue::CudaDeviceWorkQueueDrainStrategy;pub use device_work_queue::CudaDeviceWorkQueueError;pub use device_work_queue::CudaDeviceWorkQueuePlan;pub use device_work_queue::CudaDeviceWorkQueueProfile;pub use device_work_queue::CudaWorkQueueHostSync;pub use egraph_device_image::plan_cuda_egraph_device_upload;pub use egraph_device_image::plan_cuda_egraph_device_upload_from_image;pub use egraph_device_image::plan_cuda_egraph_device_upload_from_image_ref;pub use 
egraph_device_image::CudaEGraphDeviceBorrowedUploadPlan;pub use egraph_device_image::CudaEGraphDeviceByteLayout;pub use egraph_device_image::CudaEGraphDeviceByteSpan;pub use egraph_device_image::CudaEGraphDeviceKernelView;pub use egraph_device_image::CudaEGraphDeviceUploadError;pub use egraph_device_image::CudaEGraphDeviceUploadPlan;pub use egraph_device_image::CudaResidentEGraphDeviceImage;pub use egraph_kernel_plan::collect_cuda_egraph_structural_equivalences;pub use egraph_kernel_plan::cuda_egraph_canonical_rewrite_kernel_ptx;pub use egraph_kernel_plan::cuda_egraph_signature_pair_rows;pub use egraph_kernel_plan::cuda_egraph_signature_refresh_kernel_ptx;pub use egraph_kernel_plan::cuda_egraph_structural_equivalence_kernel_ptx;pub use egraph_kernel_plan::pack_cuda_egraph_canonical_rewrite_device_image;pub use egraph_kernel_plan::pack_cuda_egraph_signature_bucket_device_image;pub use egraph_kernel_plan::plan_cuda_egraph_kernel_work;pub use egraph_kernel_plan::plan_cuda_egraph_signature_buckets;pub use egraph_kernel_plan::plan_cuda_egraph_signature_buckets_from_resident_snapshot;pub use egraph_kernel_plan::plan_cuda_egraph_signature_buckets_from_signature_snapshot;pub use egraph_kernel_plan::plan_cuda_egraph_structural_equivalence_launch_artifact;pub use egraph_kernel_plan::plan_cuda_egraph_structural_equivalence_output;pub use egraph_kernel_plan::plan_cuda_egraph_structural_equivalences;pub use egraph_kernel_plan::plan_cuda_egraph_union_compaction;pub use egraph_kernel_plan::CudaEGraphCanonicalRewrite;pub use egraph_kernel_plan::CudaEGraphCanonicalRewriteDeviceImage;pub use egraph_kernel_plan::CudaEGraphCanonicalRewriteKernelPtx;pub use egraph_kernel_plan::CudaEGraphCanonicalRewriteKernelResult;pub use egraph_kernel_plan::CudaEGraphFixedPointReadback;pub use egraph_kernel_plan::CudaEGraphKernelLaunchConfig;pub use egraph_kernel_plan::CudaEGraphKernelPass;pub use egraph_kernel_plan::CudaEGraphKernelPlanError;pub use egraph_kernel_plan::CudaEGraphKernelWave;pub use 
egraph_kernel_plan::CudaEGraphKernelWorkPlan;pub use egraph_kernel_plan::CudaEGraphResidentColumnSnapshot;pub use egraph_kernel_plan::CudaEGraphResidentSignatureSnapshot;pub use egraph_kernel_plan::CudaEGraphSignatureBucket;pub use egraph_kernel_plan::CudaEGraphSignatureBucketDeviceImage;pub use egraph_kernel_plan::CudaEGraphSignatureBucketPlan;pub use egraph_kernel_plan::CudaEGraphSignaturePairWave;pub use egraph_kernel_plan::CudaEGraphSignatureRefreshKernelPtx;pub use egraph_kernel_plan::CudaEGraphSignatureRefreshKernelResult;pub use egraph_kernel_plan::CudaEGraphStructuralCanonicalizationFixedPointReport;pub use egraph_kernel_plan::CudaEGraphStructuralCanonicalizationFixedPointResult;pub use egraph_kernel_plan::CudaEGraphStructuralCanonicalizationRoundResult;pub use egraph_kernel_plan::CudaEGraphStructuralEquivalenceKernelPtx;pub use egraph_kernel_plan::CudaEGraphStructuralEquivalenceKernelResult;pub use egraph_kernel_plan::CudaEGraphStructuralEquivalenceLaunchArtifact;pub use egraph_kernel_plan::CudaEGraphStructuralEquivalenceOutputPlan;pub use egraph_kernel_plan::CudaEGraphStructuralEquivalencePlan;pub use egraph_kernel_plan::CudaEGraphUnionCompactionPass;pub use egraph_kernel_plan::CudaEGraphUnionCompactionPlan;pub use egraph_kernel_plan::CudaEGraphUnionCompactionWave;pub use egraph_kernel_plan::CUDA_EGRAPH_CANONICAL_REWRITE_KERNEL_ENTRY;pub use egraph_kernel_plan::CUDA_EGRAPH_CANONICAL_REWRITE_KERNEL_PARAM_COUNT;pub use egraph_kernel_plan::CUDA_EGRAPH_CANONICAL_REWRITE_RECORD_WORDS;pub use egraph_kernel_plan::CUDA_EGRAPH_SIGNATURE_BUCKET_RECORD_WORDS;pub use egraph_kernel_plan::CUDA_EGRAPH_SIGNATURE_REFRESH_KERNEL_ENTRY;pub use egraph_kernel_plan::CUDA_EGRAPH_SIGNATURE_REFRESH_KERNEL_PARAM_COUNT;pub use egraph_kernel_plan::CUDA_EGRAPH_STRUCTURAL_EQUIVALENCE_KERNEL_ENTRY;pub use egraph_kernel_plan::CUDA_EGRAPH_STRUCTURAL_EQUIVALENCE_KERNEL_PARAM_COUNT;pub use frontier_typed_ir_adapter::adapt_frontier_typed_ir_to_cuda;pub use 
frontier_typed_ir_adapter::CudaFrontierTypedIrAdapterError;pub use frontier_typed_ir_adapter::CudaFrontierTypedIrInput;pub use kernel_failure_diagnostics::diagnose_cuda_kernel_launch;pub use kernel_failure_diagnostics::diagnose_cuda_kernel_launch_shape;pub use kernel_failure_diagnostics::diagnose_cuda_kernel_launch_with_scratch;pub use kernel_failure_diagnostics::CudaKernelCapabilityFailure;pub use kernel_failure_diagnostics::CudaKernelDeviceEnvelope;pub use kernel_failure_diagnostics::CudaKernelLaunchDiagnostic;pub use kernel_failure_diagnostics::CudaKernelLaunchDiagnosticRef;pub use kernel_failure_diagnostics::CudaKernelLaunchDiagnosticScratch;pub use kernel_failure_diagnostics::CudaKernelLaunchEnvelope;pub use kernel_failure_diagnostics::CudaKernelLaunchEnvelopeError;pub use kernel_failure_diagnostics::CudaKernelLaunchShape;pub use kernel_failure_diagnostics::CudaKernelRequirement;pub use launch_fusion::plan_cuda_launch_fusion;pub use launch_fusion::plan_cuda_launch_fusion_with_scratch;pub use launch_fusion::CudaFusionStage;pub use launch_fusion::CudaLaunchFusionError;pub use launch_fusion::CudaLaunchFusionGroup;pub use launch_fusion::CudaLaunchFusionPlan;pub use launch_fusion::CudaLaunchFusionScratch;pub use megakernel_barrier_planner::plan_cuda_frontier_megakernel_execution;pub use megakernel_barrier_planner::plan_cuda_frontier_megakernel_execution_with_scratch;pub use megakernel_barrier_planner::plan_cuda_megakernel_barriers;pub use megakernel_barrier_planner::plan_cuda_megakernel_barriers_with_scratch;pub use megakernel_barrier_planner::CudaMegakernelBarrierGroup;pub use megakernel_barrier_planner::CudaMegakernelBarrierPlan;pub use megakernel_barrier_planner::CudaMegakernelBarrierPlanError;pub use megakernel_barrier_planner::CudaMegakernelBarrierScratch;pub use megakernel_barrier_planner::CudaMegakernelFrontierExecutionPlan;pub use megakernel_barrier_planner::CudaMegakernelFrontierExecutionPlanError;pub use 
megakernel_barrier_planner::CudaMegakernelFrontierWave;pub use megakernel_barrier_planner::CudaMegakernelWaveDependency;pub use megakernel_convergence::plan_cuda_device_convergence;pub use megakernel_convergence::CudaConvergenceReadbackPolicy;pub use megakernel_convergence::CudaDeviceConvergencePlan;pub use megakernel_convergence::CudaDeviceConvergencePlanError;pub use megakernel_plan_cache::CudaMegakernelAnalysisKind;pub use megakernel_plan_cache::CudaMegakernelCachedPlan;pub use megakernel_plan_cache::CudaMegakernelDeviceKey;pub use megakernel_plan_cache::CudaMegakernelPlanCache;pub use megakernel_plan_cache::CudaMegakernelPlanCacheKey;pub use megakernel_plan_cache::CudaMegakernelPlanCacheStats;pub use megakernel_scheduler::plan_cuda_megakernel_execution;pub use megakernel_scheduler::plan_cuda_megakernel_memory_budget;pub use megakernel_scheduler::schedule_megakernel_from_cuda_samples;pub use megakernel_scheduler::schedule_megakernel_from_cuda_samples_into;pub use megakernel_scheduler::select_cuda_megakernel_topology;pub use megakernel_scheduler::CudaMegakernelExecutionPlan;pub use megakernel_scheduler::CudaMegakernelGraphShape;pub use megakernel_scheduler::CudaMegakernelMemoryBudget;pub use megakernel_scheduler::CudaMegakernelMemoryError;pub use megakernel_scheduler::CudaMegakernelMemoryPlan;pub use megakernel_scheduler::CudaMegakernelScheduleSample;pub use megakernel_scheduler::CudaMegakernelTopology;pub use megakernel_scheduler::CudaMegakernelTopologyDecision;pub use megakernel_speedup_gate::format_validated_cuda_megakernel_speedup_evidence_csv;pub use megakernel_speedup_gate::validate_cuda_megakernel_speedup_evidence_csv;pub use megakernel_speedup_gate::validate_cuda_megakernel_speedup_gate;pub use megakernel_speedup_gate::CudaMegakernelSpeedupGateError;pub use megakernel_speedup_gate::CudaMegakernelSpeedupProof;pub use megakernel_speedup_gate::CudaMegakernelSpeedupSample;pub use megakernel_speedup_gate::MEGAKERNEL_SPEEDUP_EVIDENCE_CSV_HEADER;pub use 
multi_query_execution::plan_cuda_multi_query_execution;pub use multi_query_execution::plan_cuda_multi_query_execution_with_scratch;pub use multi_query_execution::CudaMultiQuery;pub use multi_query_execution::CudaMultiQueryExecutionError;pub use multi_query_execution::CudaMultiQueryExecutionPlan;pub use multi_query_execution::CudaMultiQueryExecutionScratch;pub use multi_query_execution::CudaMultiQueryGroup;pub use optimizer::CudaOptimizerDispatcher;pub use resident_graph_session::format_validated_cuda_resident_graph_session_evidence_csv;pub use resident_graph_session::plan_cuda_resident_graph_session;pub use resident_graph_session::resident_graph_session_speedup_sample;pub use resident_graph_session::CudaResidentGraphReadback;pub use resident_graph_session::CudaResidentGraphSessionError;pub use resident_graph_session::CudaResidentGraphSessionEvidence;pub use resident_graph_session::CudaResidentGraphSessionEvidenceError;pub use resident_graph_session::CudaResidentGraphSessionPlan;pub use resident_graph_session::CudaResidentGraphSessionProfile;pub use result_compaction::plan_cuda_result_compaction;pub use result_compaction::plan_cuda_result_compaction_with_scratch;pub use result_compaction::CudaCompactResultRecord;pub use result_compaction::CudaResultCompactionError;pub use result_compaction::CudaResultCompactionPlan;pub use result_compaction::CudaResultCompactionScratch;pub use result_compaction::CudaResultSlot;pub use token_fact_frontier_execution::plan_cuda_token_fact_frontier_execution;pub use token_fact_frontier_execution::plan_cuda_token_fact_frontier_execution_with_scratch;pub use token_fact_frontier_execution::CudaTokenFactFrontierExecutionError;pub use token_fact_frontier_execution::CudaTokenFactFrontierExecutionPlan;pub use token_fact_graph_cuda_adapter::adapt_token_fact_graph_to_cuda_layout;pub use token_fact_graph_cuda_adapter::CudaTokenFactGraphLayout;pub use token_fact_graph_cuda_adapter::CudaTokenFactGraphLayoutError;
Modules§
- backend
- CUDA backend core: device management and dispatch. CUDA backend module: device lifecycle, allocation pools, and kernel dispatch.
- benchmark_pass_selection
- Benchmark-driven CUDA optimization pass selection. CUDA adapter for benchmark-driven optimization pass selection.
- codegen
- PTX code generation from vyre IR.
PTX code generation from vyre Program IR.
- device
- CUDA device capability probing. CUDA device probing and capability snapshots.
- device_diagnostic_aggregation
- Device-side diagnostic aggregation and compact readback planning. CUDA device-side diagnostic aggregation planning adapter.
- device_work_queue
- Device-side work queue planning for dependent dataflow. CUDA adapter for backend-neutral device-side work queue planning.
- egraph_device_image
- CUDA upload planning for GPU e-graph device images. CUDA upload planning for GPU e-graph device images.
- egraph_kernel_plan
- CUDA launch-wave planning for resident e-graph device images. CUDA launch-wave planning for resident e-graph device images.
- frontier_typed_ir_adapter
- Adapter from frontier-typed IR plans to CUDA frontier wave envelopes. Adapter from substrate frontier-typed IR plans to CUDA frontier waves.
- jit_cache
- Cross-process persistent CUDA JIT cache wiring (E4 + E5): configures the NVIDIA driver’s built-in disk cache at backend bring-up so the JIT-compiled cuBINs persist across runs and are shared across every vyre process on the host. E4 + E5 substrate: cross-process persistent CUDA JIT cache wiring.
- kernel_failure_diagnostics
- Actionable CUDA kernel capability diagnostics. Actionable CUDA kernel capability diagnostics.
- launch_fusion
- Adjacent-stage CUDA launch fusion planning. CUDA-facing adjacent-stage launch fusion adapter.
- megakernel_barrier_planner
- CUDA megakernel global-barrier minimization for dependency-typed waves. CUDA megakernel barrier planning for dependency-typed dataflow waves.
- megakernel_convergence
- CUDA megakernel convergence planning for iterative fixed-point analyses. CUDA megakernel convergence planning adapter.
- megakernel_plan_cache
- Bounded CUDA megakernel plan cache keyed by graph, analysis, device, and runtime pressure buckets. Bounded CUDA megakernel plan cache.
- megakernel_scheduler
- CUDA telemetry adapter for the scale-aware megakernel scheduler.
- megakernel_speedup_gate
- Release gate for steady-state CUDA megakernel speedup claims. CUDA megakernel steady-state speedup release gate.
- multi_query_execution
- Multi-query CUDA execution planning over shared resident graphs. CUDA adapter for backend-neutral multi-query execution planning.
- occupancy
- Occupancy-aware empirical autotuning (I4): pure estimator that picks
the workgroup size with the highest predicted hardware occupancy from
(CudaDeviceCaps, KernelResourceUsage). The runtime feeds the result into AutotuneStore (I3) so subsequent dispatches reuse the choice. I4 substrate: occupancy-aware empirical autotuning.
- optimizer
- Self-hosted optimizer GPU dispatcher - runs the
vyre-self-substrate::optimizer passes (DCE, CSE, const-fold, validator) on CUDA. External parity tests reach in via the CudaOptimizerDispatcher re-export below. CUDA-resident OptimizerDispatcher - the fast path for the self-hosted optimizer.
- profiler
- CUDA profiler range integration for Nsight/NVTX without mandatory NVTX linkage. Optional CUDA profiler range integration for Nsight Systems.
- resident_graph_session
- Repeated execution over persistent CUDA-resident graph state. CUDA resident graph session planning.
- result_compaction
- Compact result readback planning. CUDA compact result readback planning adapter.
- synthetic_device_caps
- Synthetic CUDA device profiles for offline release-path planning. Synthetic CUDA device capability profiles for offline planning.
- token_fact_frontier_execution
- CUDA execution planning for unified token/fact graph frontier waves. CUDA execution planner for unified token/fact graph frontier waves.
- token_fact_graph_cuda_adapter
- Adapter from unified token/fact graph layouts to CUDA resident bytes. CUDA adapter for the unified resident token/fact graph.
Structs§
- CudaBackendRegistration
- Factory wrapper for the inventory registration path.
- CudaDeviceBuffer
- CUDA implementation of vyre_driver::DeviceBuffer. Wraps a backend::CudaResidentBuffer handle so consumers can hold a Box<dyn DeviceBuffer> against the CUDA backend without naming CudaResidentBuffer directly.
- CudaLaunchResourceCounts
- Cached CUDA launch-resource counts retained for dispatch reuse.
Constants§
- CUDA_BACKEND_ID
- Stable backend identifier for registration and conform certificates.
Functions§
- cuda_factory
- Factory function for inventory registration.
- cuda_supported_ops
- Op-support set - CUDA supports every op the foundation IR defines plus hardware intrinsics. Populated at runtime by the conform runner.