1#![warn(missing_docs)]
34
35#[cfg(feature = "cooperative")]
36pub mod cooperative;
37#[cfg(feature = "cuda")]
38mod device;
39#[cfg(feature = "cuda")]
40pub mod driver_api;
41#[cfg(feature = "cuda")]
42pub mod k2k_gpu;
43#[cfg(feature = "cuda")]
44mod kernel;
45#[cfg(feature = "cuda")]
46mod memory;
47#[cfg(feature = "cuda")]
48pub mod persistent;
49#[cfg(feature = "cuda")]
50pub mod phases;
51#[cfg(feature = "cuda")]
52pub mod reduction;
53#[cfg(feature = "cuda")]
54mod runtime;
55#[cfg(feature = "cuda")]
56mod stencil;
57
58#[cfg(feature = "cuda")]
59pub use device::CudaDevice;
60#[cfg(feature = "cuda")]
61pub use kernel::CudaKernel;
62#[cfg(feature = "cuda")]
63pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
64#[cfg(feature = "cuda")]
65pub use persistent::CudaMappedBuffer;
66#[cfg(feature = "cuda")]
67pub use phases::{
68 InterPhaseReduction, KernelPhase, MultiPhaseConfig, MultiPhaseExecutor, PhaseExecutionStats,
69 SyncMode,
70};
71#[cfg(feature = "cuda")]
72pub use reduction::{
73 generate_block_reduce_code, generate_grid_reduce_code, generate_reduce_and_broadcast_code,
74 CacheKey, CacheStats, CachedReductionBuffer, ReductionBuffer, ReductionBufferBuilder,
75 ReductionBufferCache,
76};
77#[cfg(feature = "cuda")]
78pub use runtime::CudaRuntime;
79#[cfg(feature = "cuda")]
80pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
81
/// Namespaced re-exports of the items provided by the crate's `memory` module.
///
/// Exposes the same four names that are re-exported at the crate root, grouped
/// under a single path. Only compiled when the `cuda` feature is enabled.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    pub use super::memory::CudaBuffer;
    pub use super::memory::CudaControlBlock;
    pub use super::memory::CudaMemoryPool;
    pub use super::memory::CudaMessageQueue;
}
87
88#[cfg(not(feature = "cuda"))]
90mod stub {
91 use async_trait::async_trait;
92 use ringkernel_core::error::{Result, RingKernelError};
93 use ringkernel_core::runtime::{
94 Backend, KernelHandle, KernelId, LaunchOptions, RingKernelRuntime, RuntimeMetrics,
95 };
96
97 pub struct CudaRuntime;
99
100 impl CudaRuntime {
101 pub async fn new() -> Result<Self> {
103 Err(RingKernelError::BackendUnavailable(
104 "CUDA feature not enabled".to_string(),
105 ))
106 }
107 }
108
109 #[async_trait]
110 impl RingKernelRuntime for CudaRuntime {
111 fn backend(&self) -> Backend {
112 Backend::Cuda
113 }
114
115 fn is_backend_available(&self, _backend: Backend) -> bool {
116 false
117 }
118
119 async fn launch(&self, _kernel_id: &str, _options: LaunchOptions) -> Result<KernelHandle> {
120 Err(RingKernelError::BackendUnavailable("CUDA".to_string()))
121 }
122
123 fn get_kernel(&self, _kernel_id: &KernelId) -> Option<KernelHandle> {
124 None
125 }
126
127 fn list_kernels(&self) -> Vec<KernelId> {
128 vec![]
129 }
130
131 fn metrics(&self) -> RuntimeMetrics {
132 RuntimeMetrics::default()
133 }
134
135 async fn shutdown(&self) -> Result<()> {
136 Ok(())
137 }
138 }
139}
140
141#[cfg(not(feature = "cuda"))]
142pub use stub::CudaRuntime;
143
/// Reports whether at least one CUDA device can be used.
///
/// Without the `cuda` feature this is always `false`. With the feature
/// enabled, the driver is asked for its device count; the answer is `true`
/// only when that query succeeds with a count greater than zero.
///
/// The query runs inside `catch_unwind`, so a panic raised while probing the
/// driver is reported as "not available" instead of propagating to the caller.
pub fn is_cuda_available() -> bool {
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
    #[cfg(feature = "cuda")]
    {
        // An error from the driver and a zero count both mean "no usable device".
        let probe = || matches!(cudarc::driver::CudaContext::device_count(), Ok(n) if n > 0);
        std::panic::catch_unwind(probe).unwrap_or(false)
    }
}
168
/// Returns the number of CUDA devices reported by the driver.
///
/// Without the `cuda` feature this is always `0`. With the feature enabled,
/// the result is `0` whenever the driver query fails, panics, or yields a
/// value that cannot be represented as a `usize` (for example a negative
/// count), so callers never see a wrapped-around, bogus device count.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        // `catch_unwind` turns a panic raised while probing the driver into 0.
        std::panic::catch_unwind(|| {
            cudarc::driver::CudaContext::device_count()
                .ok()
                // Checked conversion: a plain `as usize` cast would turn a
                // negative count into a huge positive number.
                .and_then(|count| usize::try_from(count).ok())
                .unwrap_or(0)
        })
        .unwrap_or(0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
186
/// Compiles CUDA C source text to PTX using NVRTC.
///
/// # Arguments
///
/// * `cuda_source` - the CUDA source code to compile.
///
/// # Returns
///
/// The generated PTX as a `String`.
///
/// # Errors
///
/// Returns `RingKernelError::CompilationError` carrying the NVRTC error text
/// when compilation fails.
#[cfg(feature = "cuda")]
pub fn compile_ptx(cuda_source: &str) -> ringkernel_core::error::Result<String> {
    use ringkernel_core::error::RingKernelError;

    cudarc::nvrtc::compile_ptx(cuda_source)
        .map(|compiled| compiled.to_src().to_string())
        .map_err(|err| {
            RingKernelError::CompilationError(format!("NVRTC compilation failed: {}", err))
        })
}
224
225#[cfg(not(feature = "cuda"))]
227pub fn compile_ptx(_cuda_source: &str) -> ringkernel_core::error::Result<String> {
228 Err(ringkernel_core::error::RingKernelError::BackendUnavailable(
229 "CUDA feature not enabled".to_string(),
230 ))
231}
232
/// Minimal PTX source for a ring kernel entry point.
///
/// The text declares PTX version 8.0 for target `sm_89` with 64-bit
/// addressing and defines one visible entry, `ring_kernel_main`, taking four
/// `u64` parameters: `control_block_ptr`, `input_queue_ptr`,
/// `output_queue_ptr` and `shared_state_ptr`.
///
/// The body only loads `control_block_ptr`, stores the 32-bit value `1` at
/// byte offset 8 from it (described by the embedded comment as marking the
/// kernel terminated) and returns; the other three parameters are not read.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_89
.address_size 64

.visible .entry ring_kernel_main(
 .param .u64 control_block_ptr,
 .param .u64 input_queue_ptr,
 .param .u64 output_queue_ptr,
 .param .u64 shared_state_ptr
) {
 .reg .u64 %cb_ptr;
 .reg .u32 %one;

 // Load control block pointer
 ld.param.u64 %cb_ptr, [control_block_ptr];

 // Mark as terminated immediately (offset 8)
 mov.u32 %one, 1;
 st.global.u32 [%cb_ptr + 8], %one;

 ret;
}
"#;