ringkernel_cuda/
lib.rs

1//! CUDA Backend for RingKernel
2//!
3//! This crate provides NVIDIA CUDA GPU support for RingKernel using cudarc.
4//!
5//! # Features
6//!
7//! - Persistent kernel execution (cooperative groups)
8//! - Lock-free message queues in GPU global memory
9//! - PTX compilation via NVRTC
10//! - Multi-GPU support
11//!
12//! # Requirements
13//!
14//! - NVIDIA GPU with Compute Capability 7.0+
15//! - CUDA Toolkit 11.0+
16//! - Native Linux (persistent kernels) or WSL2 (event-driven fallback)
17//!
18//! # Example
19//!
20//! ```ignore
21//! use ringkernel_cuda::CudaRuntime;
22//!
23//! #[tokio::main]
24//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
25//!     let runtime = CudaRuntime::new().await?;
26//!     let kernel = runtime.launch("vector_add", Default::default()).await?;
27//!     kernel.activate().await?;
28//!     Ok(())
29//! }
30//! ```
31
32#![warn(missing_docs)]
33
34#[cfg(feature = "cuda")]
35mod device;
36#[cfg(feature = "cuda")]
37mod kernel;
38#[cfg(feature = "cuda")]
39mod memory;
40#[cfg(feature = "cuda")]
41mod runtime;
42#[cfg(feature = "cuda")]
43mod stencil;
44
45#[cfg(feature = "cuda")]
46pub use device::CudaDevice;
47#[cfg(feature = "cuda")]
48pub use kernel::CudaKernel;
49#[cfg(feature = "cuda")]
50pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
51#[cfg(feature = "cuda")]
52pub use runtime::CudaRuntime;
53#[cfg(feature = "cuda")]
54pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
55
/// Namespaced access to the memory types, intended for advanced usage.
///
/// Each item is a plain re-export from the private `memory` module; the same
/// four types are also re-exported at the crate root.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    pub use super::memory::CudaBuffer;
    pub use super::memory::CudaControlBlock;
    pub use super::memory::CudaMemoryPool;
    pub use super::memory::CudaMessageQueue;
}
61
62// Placeholder implementations when CUDA is not available
63#[cfg(not(feature = "cuda"))]
64mod stub {
65    use async_trait::async_trait;
66    use ringkernel_core::error::{Result, RingKernelError};
67    use ringkernel_core::runtime::{
68        Backend, KernelHandle, KernelId, LaunchOptions, RingKernelRuntime, RuntimeMetrics,
69    };
70
71    /// Stub CUDA runtime when CUDA feature is disabled.
72    pub struct CudaRuntime;
73
74    impl CudaRuntime {
75        /// Create fails when CUDA is not available.
76        pub async fn new() -> Result<Self> {
77            Err(RingKernelError::BackendUnavailable(
78                "CUDA feature not enabled".to_string(),
79            ))
80        }
81    }
82
83    #[async_trait]
84    impl RingKernelRuntime for CudaRuntime {
85        fn backend(&self) -> Backend {
86            Backend::Cuda
87        }
88
89        fn is_backend_available(&self, _backend: Backend) -> bool {
90            false
91        }
92
93        async fn launch(&self, _kernel_id: &str, _options: LaunchOptions) -> Result<KernelHandle> {
94            Err(RingKernelError::BackendUnavailable("CUDA".to_string()))
95        }
96
97        fn get_kernel(&self, _kernel_id: &KernelId) -> Option<KernelHandle> {
98            None
99        }
100
101        fn list_kernels(&self) -> Vec<KernelId> {
102            vec![]
103        }
104
105        fn metrics(&self) -> RuntimeMetrics {
106            RuntimeMetrics::default()
107        }
108
109        async fn shutdown(&self) -> Result<()> {
110            Ok(())
111        }
112    }
113}
114
115#[cfg(not(feature = "cuda"))]
116pub use stub::CudaRuntime;
117
/// Report whether at least one CUDA device can be used at runtime.
///
/// The answer is `false` when any of the following holds:
/// - the crate was built without the `cuda` feature,
/// - querying the device count fails or panics (for example because the CUDA
///   libraries are not installed),
/// - the reported device count is not greater than zero.
///
/// The device query runs inside [`std::panic::catch_unwind`], so a panic
/// raised by cudarc is converted into `false` rather than propagated. This
/// relies on panics unwinding; it cannot intercept a panic in a build that
/// aborts on panic.
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // cudarc panics when the CUDA libraries cannot be found, hence the
        // probe is isolated in a closure that can be run under catch_unwind.
        let has_device = || matches!(cudarc::driver::CudaDevice::count(), Ok(n) if n > 0);
        std::panic::catch_unwind(has_device).unwrap_or(false)
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
142
/// Get the number of CUDA devices visible to this process.
///
/// Returns `0` when:
/// - the crate was built without the `cuda` feature,
/// - querying the device count fails or panics (for example because the CUDA
///   libraries are not installed),
/// - the reported count is negative.
///
/// The query runs inside [`std::panic::catch_unwind`], so a panic raised by
/// cudarc yields `0` instead of propagating. This relies on panics unwinding;
/// it cannot intercept a panic in a build that aborts on panic.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        // cudarc panics if CUDA libraries are not found, so we catch that.
        std::panic::catch_unwind(|| {
            cudarc::driver::CudaDevice::count()
                // The count appears to be a signed integer; clamp it to zero
                // before casting so a negative value cannot wrap around into
                // an enormous `usize`.
                .map_or(0, |count| count.max(0) as usize)
        })
        .unwrap_or(0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
158
/// PTX source for a minimal `ring_kernel_main` entry point.
///
/// The entry point declares four 64-bit parameters (`control_block_ptr`,
/// `input_queue_ptr`, `output_queue_ptr`, `shared_state_ptr`) but only reads
/// the first. It stores the 32-bit value `1` at byte offset 8 from the control
/// block pointer and returns immediately. The comment inside the PTX describes
/// that store as marking the kernel terminated; the control block layout that
/// gives offset 8 this meaning is defined outside this constant.
///
/// The module header declares `.version 8.0` and `.target sm_89`, chosen for
/// Ada Lovelace GPUs (RTX 40xx series).
///
/// NOTE(review): the crate-level docs list Compute Capability 7.0+ and CUDA
/// Toolkit 11.0+ as requirements, while this header asks for PTX ISA 8.0 and
/// an `sm_89` target. Presumably this PTX will not load on older GPUs or
/// drivers - confirm whether a lower `.version` / `.target` is intended.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_89
.address_size 64

.visible .entry ring_kernel_main(
    .param .u64 control_block_ptr,
    .param .u64 input_queue_ptr,
    .param .u64 output_queue_ptr,
    .param .u64 shared_state_ptr
) {
    .reg .u64 %cb_ptr;
    .reg .u32 %one;

    // Load control block pointer
    ld.param.u64 %cb_ptr, [control_block_ptr];

    // Mark as terminated immediately (offset 8)
    mov.u32 %one, 1;
    st.global.u32 [%cb_ptr + 8], %one;

    ret;
}
"#;