// ringkernel_cuda/src/lib.rs

1//! CUDA Backend for RingKernel
2//!
3//! This crate provides NVIDIA CUDA GPU support for RingKernel using cudarc.
4//!
5//! # Features
6//!
7//! - Persistent kernel execution (cooperative groups)
8//! - Lock-free message queues in GPU global memory
9//! - PTX compilation via NVRTC
10//! - Multi-GPU support
11//!
12//! # Requirements
13//!
14//! - NVIDIA GPU with Compute Capability 7.0+
15//! - CUDA Toolkit 11.0+
16//! - Native Linux (persistent kernels) or WSL2 (event-driven fallback)
17//!
18//! # Example
19//!
20//! ```ignore
21//! use ringkernel_cuda::CudaRuntime;
22//! use ringkernel_core::runtime::RingKernelRuntime;
23//!
24//! #[tokio::main]
25//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
26//!     let runtime = CudaRuntime::new().await?;
27//!     let kernel = runtime.launch("vector_add", Default::default()).await?;
28//!     kernel.activate().await?;
29//!     Ok(())
30//! }
31//! ```
32
33#![warn(missing_docs)]
34
35#[cfg(feature = "cooperative")]
36pub mod cooperative;
37#[cfg(feature = "cuda")]
38mod device;
39#[cfg(feature = "cuda")]
40pub mod driver_api;
41#[cfg(feature = "cuda")]
42pub mod k2k_gpu;
43#[cfg(feature = "cuda")]
44mod kernel;
45#[cfg(feature = "cuda")]
46mod memory;
47#[cfg(feature = "cuda")]
48pub mod persistent;
49#[cfg(feature = "cuda")]
50pub mod phases;
51#[cfg(feature = "cuda")]
52pub mod reduction;
53#[cfg(feature = "cuda")]
54mod runtime;
55#[cfg(feature = "cuda")]
56mod stencil;
57
58#[cfg(feature = "cuda")]
59pub use device::CudaDevice;
60#[cfg(feature = "cuda")]
61pub use kernel::CudaKernel;
62#[cfg(feature = "cuda")]
63pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
64#[cfg(feature = "cuda")]
65pub use persistent::CudaMappedBuffer;
66#[cfg(feature = "cuda")]
67pub use phases::{
68    InterPhaseReduction, KernelPhase, MultiPhaseConfig, MultiPhaseExecutor, PhaseExecutionStats,
69    SyncMode,
70};
71#[cfg(feature = "cuda")]
72pub use reduction::{
73    generate_block_reduce_code, generate_grid_reduce_code, generate_reduce_and_broadcast_code,
74    CacheKey, CacheStats, CachedReductionBuffer, ReductionBuffer, ReductionBufferBuilder,
75    ReductionBufferCache,
76};
77#[cfg(feature = "cuda")]
78pub use runtime::CudaRuntime;
79#[cfg(feature = "cuda")]
80pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
81
/// Re-export memory module for advanced usage.
///
/// Exposes the same types as the crate-root `pub use memory::{...}` above,
/// but under a dedicated namespace so downstream code can opt into
/// `use ringkernel_cuda::memory_exports::*` without pulling in the rest of
/// the crate-root re-exports.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    // Identical item set to the crate-root memory re-export.
    pub use super::memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
}
87
88// Placeholder implementations when CUDA is not available
89#[cfg(not(feature = "cuda"))]
90mod stub {
91    use async_trait::async_trait;
92    use ringkernel_core::error::{Result, RingKernelError};
93    use ringkernel_core::runtime::{
94        Backend, KernelHandle, KernelId, LaunchOptions, RingKernelRuntime, RuntimeMetrics,
95    };
96
97    /// Stub CUDA runtime when CUDA feature is disabled.
98    pub struct CudaRuntime;
99
100    impl CudaRuntime {
101        /// Create fails when CUDA is not available.
102        pub async fn new() -> Result<Self> {
103            Err(RingKernelError::BackendUnavailable(
104                "CUDA feature not enabled".to_string(),
105            ))
106        }
107    }
108
109    #[async_trait]
110    impl RingKernelRuntime for CudaRuntime {
111        fn backend(&self) -> Backend {
112            Backend::Cuda
113        }
114
115        fn is_backend_available(&self, _backend: Backend) -> bool {
116            false
117        }
118
119        async fn launch(&self, _kernel_id: &str, _options: LaunchOptions) -> Result<KernelHandle> {
120            Err(RingKernelError::BackendUnavailable("CUDA".to_string()))
121        }
122
123        fn get_kernel(&self, _kernel_id: &KernelId) -> Option<KernelHandle> {
124            None
125        }
126
127        fn list_kernels(&self) -> Vec<KernelId> {
128            vec![]
129        }
130
131        fn metrics(&self) -> RuntimeMetrics {
132            RuntimeMetrics::default()
133        }
134
135        async fn shutdown(&self) -> Result<()> {
136            Ok(())
137        }
138    }
139}
140
141#[cfg(not(feature = "cuda"))]
142pub use stub::CudaRuntime;
143
/// Check if CUDA is available at runtime.
///
/// Returns `false` when any of the following hold:
/// - the `cuda` feature is not enabled
/// - the CUDA libraries are not installed on the system
/// - no CUDA devices are present
///
/// Panics raised by cudarc when CUDA is not installed are caught and
/// treated as "unavailable".
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // cudarc panics when the CUDA driver library cannot be loaded, so the
        // probe runs inside catch_unwind; any panic means "not available".
        let probe = std::panic::catch_unwind(|| {
            matches!(cudarc::driver::CudaContext::device_count(), Ok(n) if n > 0)
        });
        probe.unwrap_or(false)
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
168
/// Get CUDA device count.
///
/// Returns 0 if CUDA is not available, the libraries are not installed, or
/// the driver reports a count that cannot be represented as `usize`.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        // cudarc panics if CUDA libraries are not found, so we catch that.
        // Use a checked conversion instead of `as`: a negative count from the
        // driver would otherwise wrap to an enormous usize.
        std::panic::catch_unwind(|| {
            cudarc::driver::CudaContext::device_count()
                .ok()
                .and_then(|c| usize::try_from(c).ok())
                .unwrap_or(0)
        })
        .unwrap_or(0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
186
/// Compile CUDA C source code to PTX using NVRTC.
///
/// Thin wrapper over `cudarc::nvrtc::compile_ptx` so downstream crates can
/// compile kernels without depending on cudarc directly.
///
/// # Arguments
///
/// * `cuda_source` - CUDA C source code string
///
/// # Returns
///
/// PTX assembly as a string, or an error if compilation fails.
///
/// # Example
///
/// ```ignore
/// use ringkernel_cuda::compile_ptx;
///
/// let cuda_source = r#"
///     extern "C" __global__ void add(float* a, float* b, float* c, int n) {
///         int i = blockIdx.x * blockDim.x + threadIdx.x;
///         if (i < n) c[i] = a[i] + b[i];
///     }
/// "#;
///
/// let ptx = compile_ptx(cuda_source)?;
/// ```
#[cfg(feature = "cuda")]
pub fn compile_ptx(cuda_source: &str) -> ringkernel_core::error::Result<String> {
    use ringkernel_core::error::RingKernelError;

    // Surface NVRTC failures as the crate's CompilationError variant.
    match cudarc::nvrtc::compile_ptx(cuda_source) {
        Ok(ptx) => Ok(ptx.to_src().to_string()),
        Err(e) => Err(RingKernelError::CompilationError(format!(
            "NVRTC compilation failed: {}",
            e
        ))),
    }
}
224
225/// Stub compile_ptx when CUDA is not available.
226#[cfg(not(feature = "cuda"))]
227pub fn compile_ptx(_cuda_source: &str) -> ringkernel_core::error::Result<String> {
228    Err(ringkernel_core::error::RingKernelError::BackendUnavailable(
229        "CUDA feature not enabled".to_string(),
230    ))
231}
232
/// PTX kernel source template for persistent ring kernel.
///
/// This is a minimal no-op kernel: it loads the control-block pointer from
/// its first parameter, stores `1` into the 32-bit word at byte offset 8
/// (the terminated flag, per the in-source comment) and returns, so a launch
/// completes immediately instead of entering a persistent loop. The queue and
/// shared-state parameters are accepted but unused.
///
/// Declared as PTX ISA 8.0 targeting sm_89 for Ada Lovelace GPU
/// compatibility (RTX 40xx series).
// NOTE(review): offset 8 must match the control block's terminated-flag field
// layout (presumably CudaControlBlock in `memory`) — confirm against that
// struct's definition.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_89
.address_size 64

.visible .entry ring_kernel_main(
    .param .u64 control_block_ptr,
    .param .u64 input_queue_ptr,
    .param .u64 output_queue_ptr,
    .param .u64 shared_state_ptr
) {
    .reg .u64 %cb_ptr;
    .reg .u32 %one;

    // Load control block pointer
    ld.param.u64 %cb_ptr, [control_block_ptr];

    // Mark as terminated immediately (offset 8)
    mov.u32 %one, 1;
    st.global.u32 [%cb_ptr + 8], %one;

    ret;
}
"#;