ringkernel_cuda/
lib.rs

1//! CUDA Backend for RingKernel
2//!
3//! This crate provides NVIDIA CUDA GPU support for RingKernel using cudarc.
4//!
5//! # Features
6//!
7//! - Persistent kernel execution (cooperative groups)
8//! - Lock-free message queues in GPU global memory
9//! - PTX compilation via NVRTC
10//! - Multi-GPU support
11//!
12//! # Requirements
13//!
14//! - NVIDIA GPU with Compute Capability 7.0+
15//! - CUDA Toolkit 11.0+
16//! - Native Linux (persistent kernels) or WSL2 (event-driven fallback)
17//!
18//! # Example
19//!
20//! ```ignore
21//! use ringkernel_cuda::CudaRuntime;
22//! use ringkernel_core::runtime::RingKernelRuntime;
23//!
24//! #[tokio::main]
25//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
26//!     let runtime = CudaRuntime::new().await?;
27//!     let kernel = runtime.launch("vector_add", Default::default()).await?;
28//!     kernel.activate().await?;
29//!     Ok(())
30//! }
31//! ```
32
33#![warn(missing_docs)]
34
35#[cfg(feature = "cooperative")]
36pub mod cooperative;
37#[cfg(feature = "cuda")]
38mod device;
39#[cfg(feature = "cuda")]
40pub mod driver_api;
41#[cfg(feature = "cuda")]
42pub mod k2k_gpu;
43#[cfg(feature = "cuda")]
44mod kernel;
45#[cfg(feature = "cuda")]
46mod memory;
47#[cfg(feature = "cuda")]
48pub mod persistent;
49#[cfg(feature = "cuda")]
50mod runtime;
51#[cfg(feature = "cuda")]
52mod stencil;
53
54#[cfg(feature = "cuda")]
55pub use device::CudaDevice;
56#[cfg(feature = "cuda")]
57pub use kernel::CudaKernel;
58#[cfg(feature = "cuda")]
59pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
60#[cfg(feature = "cuda")]
61pub use runtime::CudaRuntime;
62#[cfg(feature = "cuda")]
63pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
64
/// Re-export memory module for advanced usage.
///
/// Mirrors the memory re-exports available at the crate root
/// ([`CudaBuffer`], [`CudaControlBlock`], [`CudaMemoryPool`],
/// [`CudaMessageQueue`]) under a dedicated namespace.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    // Same items as the crate-root re-exports; grouped here so downstream
    // code can import them via `ringkernel_cuda::memory_exports::*`.
    pub use super::memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
}
70
71// Placeholder implementations when CUDA is not available
72#[cfg(not(feature = "cuda"))]
73mod stub {
74    use async_trait::async_trait;
75    use ringkernel_core::error::{Result, RingKernelError};
76    use ringkernel_core::runtime::{
77        Backend, KernelHandle, KernelId, LaunchOptions, RingKernelRuntime, RuntimeMetrics,
78    };
79
80    /// Stub CUDA runtime when CUDA feature is disabled.
81    pub struct CudaRuntime;
82
83    impl CudaRuntime {
84        /// Create fails when CUDA is not available.
85        pub async fn new() -> Result<Self> {
86            Err(RingKernelError::BackendUnavailable(
87                "CUDA feature not enabled".to_string(),
88            ))
89        }
90    }
91
92    #[async_trait]
93    impl RingKernelRuntime for CudaRuntime {
94        fn backend(&self) -> Backend {
95            Backend::Cuda
96        }
97
98        fn is_backend_available(&self, _backend: Backend) -> bool {
99            false
100        }
101
102        async fn launch(&self, _kernel_id: &str, _options: LaunchOptions) -> Result<KernelHandle> {
103            Err(RingKernelError::BackendUnavailable("CUDA".to_string()))
104        }
105
106        fn get_kernel(&self, _kernel_id: &KernelId) -> Option<KernelHandle> {
107            None
108        }
109
110        fn list_kernels(&self) -> Vec<KernelId> {
111            vec![]
112        }
113
114        fn metrics(&self) -> RuntimeMetrics {
115            RuntimeMetrics::default()
116        }
117
118        async fn shutdown(&self) -> Result<()> {
119            Ok(())
120        }
121    }
122}
123
124#[cfg(not(feature = "cuda"))]
125pub use stub::CudaRuntime;
126
/// Check if CUDA is available at runtime.
///
/// Returns `true` only when all of the following hold:
/// - the `cuda` feature is enabled,
/// - the CUDA driver libraries can be loaded, and
/// - the driver reports at least one CUDA device.
///
/// cudarc panics when the CUDA libraries are not installed; that panic is
/// caught here and treated as "not available".
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // Probe the driver inside catch_unwind: a panic (missing libraries)
        // or a driver error both collapse to `false` instead of aborting.
        let probe = || matches!(cudarc::driver::CudaContext::device_count(), Ok(n) if n > 0);
        std::panic::catch_unwind(probe).unwrap_or(false)
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
151
/// Get CUDA device count.
///
/// Returns 0 if CUDA is not available or libraries are not installed.
/// cudarc panics when the CUDA libraries cannot be loaded; that panic is
/// caught and reported as a count of zero.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        // Both a panic from cudarc (missing libraries) and a driver-level
        // error are mapped to zero devices.
        match std::panic::catch_unwind(cudarc::driver::CudaContext::device_count) {
            Ok(Ok(count)) => count as usize,
            _ => 0,
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
169
/// PTX kernel source template for persistent ring kernel.
///
/// This is a minimal kernel that immediately marks itself as terminated.
/// Uses PTX 8.0 / sm_89 for Ada Lovelace GPU compatibility (RTX 40xx series).
///
/// The entry point `ring_kernel_main` accepts four pointer parameters
/// (control block, input queue, output queue, shared state). This template
/// touches only the control block: it stores the 32-bit value `1` at byte
/// offset 8 — presumably the control block's "terminated" flag; confirm
/// against the control-block layout — and then returns.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_89
.address_size 64

.visible .entry ring_kernel_main(
    .param .u64 control_block_ptr,
    .param .u64 input_queue_ptr,
    .param .u64 output_queue_ptr,
    .param .u64 shared_state_ptr
) {
    .reg .u64 %cb_ptr;
    .reg .u32 %one;

    // Load control block pointer
    ld.param.u64 %cb_ptr, [control_block_ptr];

    // Mark as terminated immediately (offset 8)
    mov.u32 %one, 1;
    st.global.u32 [%cb_ptr + 8], %one;

    ret;
}
"#;