1#![warn(missing_docs)]
34
35#[cfg(feature = "cooperative")]
36pub mod cooperative;
37#[cfg(feature = "cuda")]
38mod device;
39#[cfg(feature = "cuda")]
40pub mod driver_api;
41#[cfg(feature = "cuda")]
42pub mod k2k_gpu;
43#[cfg(feature = "cuda")]
44mod kernel;
45#[cfg(feature = "cuda")]
46mod memory;
47#[cfg(feature = "cuda")]
48pub mod persistent;
49#[cfg(feature = "cuda")]
50pub mod phases;
51#[cfg(feature = "profiling")]
52pub mod profiling;
53#[cfg(feature = "cuda")]
54pub mod reduction;
55#[cfg(feature = "cuda")]
56mod runtime;
57#[cfg(feature = "cuda")]
58mod stencil;
59
60#[cfg(feature = "cuda")]
61pub use device::CudaDevice;
62#[cfg(feature = "cuda")]
63pub use kernel::CudaKernel;
64#[cfg(feature = "cuda")]
65pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
66#[cfg(feature = "cuda")]
67pub use persistent::CudaMappedBuffer;
68#[cfg(feature = "cuda")]
69pub use phases::{
70 InterPhaseReduction, KernelPhase, MultiPhaseConfig, MultiPhaseExecutor, PhaseExecutionStats,
71 SyncMode,
72};
73#[cfg(feature = "cuda")]
74pub use reduction::{
75 generate_block_reduce_code, generate_grid_reduce_code, generate_reduce_and_broadcast_code,
76 CacheKey, CacheStats, CachedReductionBuffer, ReductionBuffer, ReductionBufferBuilder,
77 ReductionBufferCache,
78};
79#[cfg(feature = "cuda")]
80pub use runtime::CudaRuntime;
81#[cfg(feature = "cuda")]
82pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
83
84#[cfg(feature = "profiling")]
86pub use profiling::{
87 CudaEvent, CudaEventFlags, CudaMemoryKind, CudaMemoryTracker, CudaNvtxProfiler,
88 GpuChromeTraceBuilder, GpuEventArgs, GpuTimer, GpuTimerPool, GpuTraceEvent, KernelMetrics,
89 ProfilingSession, TrackedAllocation, TransferDirection, TransferMetrics,
90};
91
/// Namespaced re-exports of the CUDA memory types.
///
/// The `memory` module itself is private; this module exposes its buffer,
/// control-block, pool and message-queue types under a dedicated path.
#[cfg(feature = "cuda")]
pub mod memory_exports {
    pub use super::memory::CudaBuffer;
    pub use super::memory::CudaControlBlock;
    pub use super::memory::CudaMemoryPool;
    pub use super::memory::CudaMessageQueue;
}
97
98#[cfg(not(feature = "cuda"))]
100mod stub {
101 use async_trait::async_trait;
102 use ringkernel_core::error::{Result, RingKernelError};
103 use ringkernel_core::runtime::{
104 Backend, KernelHandle, KernelId, LaunchOptions, RingKernelRuntime, RuntimeMetrics,
105 };
106
107 pub struct CudaRuntime;
109
110 impl CudaRuntime {
111 pub async fn new() -> Result<Self> {
113 Err(RingKernelError::BackendUnavailable(
114 "CUDA feature not enabled".to_string(),
115 ))
116 }
117 }
118
119 #[async_trait]
120 impl RingKernelRuntime for CudaRuntime {
121 fn backend(&self) -> Backend {
122 Backend::Cuda
123 }
124
125 fn is_backend_available(&self, _backend: Backend) -> bool {
126 false
127 }
128
129 async fn launch(&self, _kernel_id: &str, _options: LaunchOptions) -> Result<KernelHandle> {
130 Err(RingKernelError::BackendUnavailable("CUDA".to_string()))
131 }
132
133 fn get_kernel(&self, _kernel_id: &KernelId) -> Option<KernelHandle> {
134 None
135 }
136
137 fn list_kernels(&self) -> Vec<KernelId> {
138 vec![]
139 }
140
141 fn metrics(&self) -> RuntimeMetrics {
142 RuntimeMetrics::default()
143 }
144
145 async fn shutdown(&self) -> Result<()> {
146 Ok(())
147 }
148 }
149}
150
151#[cfg(not(feature = "cuda"))]
152pub use stub::CudaRuntime;
153
/// Reports whether at least one CUDA device can be used.
///
/// With the `cuda` feature enabled, this asks cudarc for the device count
/// and returns `true` only when the query succeeds with a count greater
/// than zero. A driver error, or a panic raised while querying, is treated
/// as "not available". Without the `cuda` feature it always returns `false`.
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // The query runs under `catch_unwind` so that a panic inside cudarc
        // is reported as `false` instead of unwinding into the caller.
        let probe = std::panic::catch_unwind(cudarc::driver::CudaContext::device_count);
        matches!(probe, Ok(Ok(count)) if count > 0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
178
/// Returns the number of CUDA devices visible to the driver.
///
/// With the `cuda` feature enabled, this asks cudarc for the device count.
/// The result is `0` when the query returns an error, when it panics, or
/// when the reported count cannot be represented as a `usize` (for example
/// a negative value). Without the `cuda` feature it always returns `0`.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        // The query runs under `catch_unwind` so that a panic inside cudarc
        // degrades to "no devices" instead of unwinding into the caller.
        std::panic::catch_unwind(|| {
            cudarc::driver::CudaContext::device_count()
                .ok()
                // Checked conversion: a plain `as usize` cast would turn a
                // negative count into a huge bogus device count.
                .and_then(|count| usize::try_from(count).ok())
                .unwrap_or(0)
        })
        .unwrap_or(0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
196
/// Compiles CUDA C source text to PTX using NVRTC (via cudarc).
///
/// # Errors
///
/// Returns `RingKernelError::CompilationError` with the NVRTC error text
/// when compilation fails.
#[cfg(feature = "cuda")]
pub fn compile_ptx(cuda_source: &str) -> ringkernel_core::error::Result<String> {
    use ringkernel_core::error::RingKernelError;

    match cudarc::nvrtc::compile_ptx(cuda_source) {
        // Hand the PTX back as an owned source string.
        Ok(compiled) => Ok(compiled.to_src().to_string()),
        Err(err) => Err(RingKernelError::CompilationError(format!(
            "NVRTC compilation failed: {}",
            err
        ))),
    }
}
234
235#[cfg(not(feature = "cuda"))]
237pub fn compile_ptx(_cuda_source: &str) -> ringkernel_core::error::Result<String> {
238 Err(ringkernel_core::error::RingKernelError::BackendUnavailable(
239 "CUDA feature not enabled".to_string(),
240 ))
241}
242
/// Minimal PTX source for a ring-kernel entry point.
///
/// The text declares PTX ISA version 8.0, target `sm_89` and 64-bit
/// addressing, and defines one visible entry, `ring_kernel_main`, taking
/// four `u64` parameters: `control_block_ptr`, `input_queue_ptr`,
/// `output_queue_ptr` and `shared_state_ptr`.
///
/// The body only loads `control_block_ptr`, stores the 32-bit value `1` at
/// byte offset 8 from it (described in the PTX comment as marking the kernel
/// terminated) and returns; the other three parameters are not read.
///
/// NOTE(review): presumably a placeholder to be replaced by generated
/// kernels — confirm how callers use this template.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_89
.address_size 64

.visible .entry ring_kernel_main(
 .param .u64 control_block_ptr,
 .param .u64 input_queue_ptr,
 .param .u64 output_queue_ptr,
 .param .u64 shared_state_ptr
) {
 .reg .u64 %cb_ptr;
 .reg .u32 %one;

 // Load control block pointer
 ld.param.u64 %cb_ptr, [control_block_ptr];

 // Mark as terminated immediately (offset 8)
 mov.u32 %one, 1;
 st.global.u32 [%cb_ptr + 8], %one;

 ret;
}
"#;