// ringkernel_cuda — crate root (src/lib.rs)

//! CUDA Backend for RingKernel
//!
//! This crate provides NVIDIA CUDA GPU support for RingKernel using cudarc.
//!
//! # Features
//!
//! - Persistent kernel execution (cooperative groups)
//! - Lock-free message queues in GPU global memory
//! - PTX compilation via NVRTC
//! - Multi-GPU support
//!
//! # Requirements
//!
//! - NVIDIA GPU with Compute Capability 7.0+
//! - CUDA Toolkit 11.0+
//! - Native Linux (persistent kernels) or WSL2 (event-driven fallback)
//!
//! # Example
//!
//! ```ignore
//! use ringkernel_cuda::CudaRuntime;
//! use ringkernel_core::runtime::RingKernelRuntime;
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let runtime = CudaRuntime::new().await?;
//!     let kernel = runtime.launch("vector_add", Default::default()).await?;
//!     kernel.activate().await?;
//!     Ok(())
//! }
//! ```

#![warn(missing_docs)]
// Feature-gated module declarations.
//
// Modules that require the CUDA toolchain are compiled only when the
// corresponding Cargo feature is enabled, so the crate still builds on
// machines without CUDA installed. (Fixed: scraped line numbers fused into
// the source made every item here a syntax error.)
#[cfg(feature = "ptx-cache")]
pub mod compile;
#[cfg(feature = "cooperative")]
pub mod cooperative;
#[cfg(feature = "cuda")]
mod device;
#[cfg(feature = "cuda")]
pub mod driver_api;
#[cfg(feature = "cuda")]
pub mod k2k_gpu;
#[cfg(feature = "cuda")]
mod kernel;
#[cfg(feature = "cuda")]
pub mod launch_config;
#[cfg(feature = "cuda")]
mod memory;
#[cfg(feature = "cuda")]
pub mod memory_pool;
#[cfg(feature = "cuda")]
pub mod persistent;
#[cfg(feature = "cuda")]
pub mod phases;
#[cfg(feature = "profiling")]
pub mod profiling;
#[cfg(feature = "cuda")]
pub mod reduction;
#[cfg(feature = "cuda")]
mod runtime;
#[cfg(feature = "cuda")]
mod stencil;
#[cfg(feature = "cuda")]
pub mod stream;
// Crate-root re-exports: flatten the most commonly used types so callers can
// write `ringkernel_cuda::CudaRuntime` instead of reaching into submodules.
// (Fixed: scraped line numbers fused into the source made these `use` items
// syntax errors.)
#[cfg(feature = "cuda")]
pub use device::CudaDevice;
#[cfg(feature = "cuda")]
pub use kernel::CudaKernel;
#[cfg(feature = "cuda")]
pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
#[cfg(feature = "cuda")]
pub use persistent::CudaMappedBuffer;
#[cfg(feature = "cuda")]
pub use phases::{
    InterPhaseReduction, KernelPhase, MultiPhaseConfig, MultiPhaseExecutor, PhaseExecutionStats,
    SyncMode,
};
#[cfg(feature = "cuda")]
pub use reduction::{
    generate_block_reduce_code, generate_grid_reduce_code, generate_reduce_and_broadcast_code,
    CacheKey, CacheStats, CachedReductionBuffer, ReductionBuffer, ReductionBufferBuilder,
    ReductionBufferCache,
};
#[cfg(feature = "cuda")]
pub use runtime::CudaRuntime;
#[cfg(feature = "cuda")]
pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};

// Profiling re-exports
#[cfg(feature = "profiling")]
pub use profiling::{
    CudaEvent, CudaEventFlags, CudaMemoryKind, CudaMemoryTracker, CudaNvtxProfiler,
    GpuChromeTraceBuilder, GpuEventArgs, GpuTimer, GpuTimerPool, GpuTraceEvent, KernelMetrics,
    ProfilingSession, TrackedAllocation, TransferDirection, TransferMetrics,
};

// PTX cache re-exports
#[cfg(feature = "ptx-cache")]
pub use compile::{PtxCache, PtxCacheError, PtxCacheResult, PtxCacheStats, CACHE_VERSION};

// GPU memory pool re-exports
#[cfg(feature = "cuda")]
pub use memory_pool::{
    GpuBucketStats, GpuPoolConfig, GpuPoolDiagnostics, GpuSizeClass, GpuStratifiedPool,
};

// Stream manager re-exports
#[cfg(feature = "cuda")]
pub use stream::{
    OverlapMetrics, StreamConfig, StreamConfigBuilder, StreamError, StreamId, StreamManager,
    StreamPool, StreamPoolStats, StreamResult,
};
/// Re-export of the private `memory` module's types for advanced usage.
///
/// Kept as a namespaced module (in addition to the crate-root re-exports) so
/// callers can opt into the qualified path `memory_exports::CudaBuffer`.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    pub use super::memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
}
123// Placeholder implementations when CUDA is not available
124#[cfg(not(feature = "cuda"))]
125mod stub {
126    use async_trait::async_trait;
127    use ringkernel_core::error::{Result, RingKernelError};
128    use ringkernel_core::runtime::{
129        Backend, KernelHandle, KernelId, LaunchOptions, RingKernelRuntime, RuntimeMetrics,
130    };
131
132    /// Stub CUDA runtime when CUDA feature is disabled.
133    pub struct CudaRuntime;
134
135    impl CudaRuntime {
136        /// Create fails when CUDA is not available.
137        pub async fn new() -> Result<Self> {
138            Err(RingKernelError::BackendUnavailable(
139                "CUDA feature not enabled".to_string(),
140            ))
141        }
142    }
143
144    #[async_trait]
145    impl RingKernelRuntime for CudaRuntime {
146        fn backend(&self) -> Backend {
147            Backend::Cuda
148        }
149
150        fn is_backend_available(&self, _backend: Backend) -> bool {
151            false
152        }
153
154        async fn launch(&self, _kernel_id: &str, _options: LaunchOptions) -> Result<KernelHandle> {
155            Err(RingKernelError::BackendUnavailable("CUDA".to_string()))
156        }
157
158        fn get_kernel(&self, _kernel_id: &KernelId) -> Option<KernelHandle> {
159            None
160        }
161
162        fn list_kernels(&self) -> Vec<KernelId> {
163            vec![]
164        }
165
166        fn metrics(&self) -> RuntimeMetrics {
167            RuntimeMetrics::default()
168        }
169
170        async fn shutdown(&self) -> Result<()> {
171            Ok(())
172        }
173    }
174}
175
176#[cfg(not(feature = "cuda"))]
177pub use stub::CudaRuntime;
178
/// Check if CUDA is available at runtime.
///
/// Returns `false` if:
/// - the `cuda` feature is not enabled,
/// - CUDA libraries are not installed on the system, or
/// - no CUDA devices are present.
///
/// cudarc panics when the CUDA driver library cannot be loaded, so the probe
/// is wrapped in [`std::panic::catch_unwind`] and a panic is reported as
/// "not available" rather than propagated.
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // Any error from device_count (and any panic from a missing driver
        // library) is treated as "no CUDA".
        std::panic::catch_unwind(|| {
            cudarc::driver::CudaContext::device_count()
                .map(|c| c > 0)
                .unwrap_or(false)
        })
        .unwrap_or(false)
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
/// Get the number of CUDA devices visible to the driver.
///
/// Returns 0 if the `cuda` feature is disabled, the CUDA libraries are not
/// installed, or the driver reports an error. As in [`is_cuda_available`],
/// cudarc's panic on a missing driver library is caught and mapped to 0.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        std::panic::catch_unwind(|| {
            // device_count() yields a signed count; errors collapse to 0
            // before the widening cast to usize.
            cudarc::driver::CudaContext::device_count().unwrap_or(0) as usize
        })
        .unwrap_or(0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
/// Compile CUDA C source code to PTX using NVRTC.
///
/// This wraps `cudarc::nvrtc::compile_ptx` to provide PTX compilation
/// without requiring downstream crates to depend on cudarc directly.
///
/// # Arguments
///
/// * `cuda_source` - CUDA C source code string
///
/// # Returns
///
/// PTX assembly as a string.
///
/// # Errors
///
/// Returns [`ringkernel_core::error::RingKernelError::CompilationError`] if
/// NVRTC rejects the source.
///
/// # Example
///
/// ```ignore
/// use ringkernel_cuda::compile_ptx;
///
/// let cuda_source = r#"
///     extern "C" __global__ void add(float* a, float* b, float* c, int n) {
///         int i = blockIdx.x * blockDim.x + threadIdx.x;
///         if (i < n) c[i] = a[i] + b[i];
///     }
/// "#;
///
/// let ptx = compile_ptx(cuda_source)?;
/// ```
#[cfg(feature = "cuda")]
pub fn compile_ptx(cuda_source: &str) -> ringkernel_core::error::Result<String> {
    use ringkernel_core::error::RingKernelError;

    // NVRTC errors carry the compiler log; surface it in the error message.
    let ptx = cudarc::nvrtc::compile_ptx(cuda_source).map_err(|e| {
        RingKernelError::CompilationError(format!("NVRTC compilation failed: {}", e))
    })?;

    Ok(ptx.to_src().to_string())
}
260/// Stub compile_ptx when CUDA is not available.
261#[cfg(not(feature = "cuda"))]
262pub fn compile_ptx(_cuda_source: &str) -> ringkernel_core::error::Result<String> {
263    Err(ringkernel_core::error::RingKernelError::BackendUnavailable(
264        "CUDA feature not enabled".to_string(),
265    ))
266}
267
/// PTX kernel source template for the persistent ring kernel.
///
/// This is a minimal kernel that immediately marks itself as terminated by
/// writing 1 to the 32-bit word at byte offset 8 of the control block.
/// Uses PTX 8.0 / sm_89 for Ada Lovelace GPU compatibility (RTX 40xx series).
///
/// (Fixed: scraped line numbers had been fused into the string literal —
/// e.g. `273.version 8.0` — which would never assemble as PTX.)
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_89
.address_size 64

.visible .entry ring_kernel_main(
    .param .u64 control_block_ptr,
    .param .u64 input_queue_ptr,
    .param .u64 output_queue_ptr,
    .param .u64 shared_state_ptr
) {
    .reg .u64 %cb_ptr;
    .reg .u32 %one;

    // Load control block pointer
    ld.param.u64 %cb_ptr, [control_block_ptr];

    // Mark as terminated immediately (offset 8)
    mov.u32 %one, 1;
    st.global.u32 [%cb_ptr + 8], %one;

    ret;
}
"#;