1#![warn(missing_docs)]
34
35#[cfg(feature = "ptx-cache")]
36pub mod compile;
37#[cfg(feature = "cooperative")]
38pub mod cooperative;
39#[cfg(feature = "cuda")]
40mod device;
41#[cfg(feature = "cuda")]
42pub mod driver_api;
43#[cfg(feature = "cuda")]
44pub mod k2k_gpu;
45#[cfg(feature = "cuda")]
46mod kernel;
47#[cfg(feature = "cuda")]
48pub mod launch_config;
49#[cfg(feature = "cuda")]
50mod memory;
51#[cfg(feature = "cuda")]
52pub mod memory_pool;
53#[cfg(feature = "cuda")]
54pub mod persistent;
55#[cfg(feature = "cuda")]
56pub mod phases;
57#[cfg(feature = "profiling")]
58pub mod profiling;
59#[cfg(feature = "cuda")]
60pub mod reduction;
61#[cfg(feature = "cuda")]
62mod runtime;
63#[cfg(feature = "cuda")]
64mod stencil;
65#[cfg(feature = "cuda")]
66pub mod stream;
67
68#[cfg(feature = "cuda")]
69pub use device::CudaDevice;
70#[cfg(feature = "cuda")]
71pub use kernel::CudaKernel;
72#[cfg(feature = "cuda")]
73pub use memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
74#[cfg(feature = "cuda")]
75pub use persistent::CudaMappedBuffer;
76#[cfg(feature = "cuda")]
77pub use phases::{
78 InterPhaseReduction, KernelPhase, MultiPhaseConfig, MultiPhaseExecutor, PhaseExecutionStats,
79 SyncMode,
80};
81#[cfg(feature = "cuda")]
82pub use reduction::{
83 generate_block_reduce_code, generate_grid_reduce_code, generate_reduce_and_broadcast_code,
84 CacheKey, CacheStats, CachedReductionBuffer, ReductionBuffer, ReductionBufferBuilder,
85 ReductionBufferCache,
86};
87#[cfg(feature = "cuda")]
88pub use runtime::CudaRuntime;
89#[cfg(feature = "cuda")]
90pub use stencil::{CompiledStencilKernel, LaunchConfig, StencilKernelLoader};
91
92#[cfg(feature = "profiling")]
94pub use profiling::{
95 CudaEvent, CudaEventFlags, CudaMemoryKind, CudaMemoryTracker, CudaNvtxProfiler,
96 GpuChromeTraceBuilder, GpuEventArgs, GpuTimer, GpuTimerPool, GpuTraceEvent, KernelMetrics,
97 ProfilingSession, TrackedAllocation, TransferDirection, TransferMetrics,
98};
99
100#[cfg(feature = "ptx-cache")]
102pub use compile::{PtxCache, PtxCacheError, PtxCacheResult, PtxCacheStats, CACHE_VERSION};
103
104#[cfg(feature = "cuda")]
106pub use memory_pool::{
107 GpuBucketStats, GpuPoolConfig, GpuPoolDiagnostics, GpuSizeClass, GpuStratifiedPool,
108};
109
110#[cfg(feature = "cuda")]
112pub use stream::{
113 OverlapMetrics, StreamConfig, StreamConfigBuilder, StreamError, StreamId, StreamManager,
114 StreamPool, StreamPoolStats, StreamResult,
115};
116
#[cfg(feature = "cuda")]
/// Namespaced re-exports of the CUDA memory primitives.
///
/// Mirrors the flat re-export of `memory::*` above; useful when a caller wants
/// to make the CUDA origin of these types explicit at the use site.
pub mod memory_exports {
    /// Core CUDA memory types re-exported from the private `memory` module.
    pub use super::memory::{CudaBuffer, CudaControlBlock, CudaMemoryPool, CudaMessageQueue};
}
122
#[cfg(not(feature = "cuda"))]
mod stub {
    //! Fallback implementations compiled when the `cuda` feature is disabled.
    //!
    //! Provides a `CudaRuntime` with the same public surface as the real one
    //! so downstream code compiles regardless of feature flags; every fallible
    //! entry point reports `BackendUnavailable`.

    use async_trait::async_trait;
    use ringkernel_core::error::{Result, RingKernelError};
    use ringkernel_core::runtime::{
        Backend, KernelHandle, KernelId, LaunchOptions, RingKernelRuntime, RuntimeMetrics,
    };

    /// Placeholder CUDA runtime used when the `cuda` feature is not enabled.
    pub struct CudaRuntime;

    impl CudaRuntime {
        /// Attempt to create the runtime.
        ///
        /// # Errors
        ///
        /// Always returns [`RingKernelError::BackendUnavailable`], since this
        /// stub is only compiled when CUDA support is disabled.
        pub async fn new() -> Result<Self> {
            Err(RingKernelError::BackendUnavailable(
                "CUDA feature not enabled".to_string(),
            ))
        }
    }

    #[async_trait]
    impl RingKernelRuntime for CudaRuntime {
        // Still identifies as the CUDA backend so callers can report which
        // backend was requested, even though it is unavailable.
        fn backend(&self) -> Backend {
            Backend::Cuda
        }

        // No backend is ever available from the stub.
        fn is_backend_available(&self, _backend: Backend) -> bool {
            false
        }

        // Launching always fails: there is no device to launch on.
        async fn launch(&self, _kernel_id: &str, _options: LaunchOptions) -> Result<KernelHandle> {
            Err(RingKernelError::BackendUnavailable("CUDA".to_string()))
        }

        // No kernels can exist, so lookups always miss.
        fn get_kernel(&self, _kernel_id: &KernelId) -> Option<KernelHandle> {
            None
        }

        // No kernels can exist, so the listing is always empty.
        fn list_kernels(&self) -> Vec<KernelId> {
            vec![]
        }

        // Report default (zeroed) metrics.
        fn metrics(&self) -> RuntimeMetrics {
            RuntimeMetrics::default()
        }

        // Nothing to tear down; shutdown trivially succeeds.
        async fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }
}
175
176#[cfg(not(feature = "cuda"))]
177pub use stub::CudaRuntime;
178
/// Returns `true` if the `cuda` feature is enabled and at least one CUDA
/// device is visible to the driver.
///
/// Builds without the `cuda` feature always report `false`.
pub fn is_cuda_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        // Probe inside catch_unwind: a missing or broken CUDA installation
        // may panic deep inside the driver bindings rather than return Err.
        let probe = std::panic::catch_unwind(|| {
            matches!(
                cudarc::driver::CudaContext::device_count(),
                Ok(count) if count > 0
            )
        });
        probe.unwrap_or(false)
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
203
/// Returns the number of CUDA devices visible to the driver.
///
/// Builds without the `cuda` feature always report `0`, as do builds where
/// the driver query fails or panics.
pub fn cuda_device_count() -> usize {
    #[cfg(feature = "cuda")]
    {
        // Guard with catch_unwind: the driver bindings may panic when the
        // CUDA runtime is absent or misconfigured.
        let counted = std::panic::catch_unwind(|| {
            match cudarc::driver::CudaContext::device_count() {
                Ok(count) => count as usize,
                Err(_) => 0,
            }
        });
        counted.unwrap_or(0)
    }
    #[cfg(not(feature = "cuda"))]
    {
        0
    }
}
221
/// Compile CUDA C++ source to PTX text using NVRTC.
///
/// Returns the generated PTX as a `String` on success.
///
/// # Errors
///
/// Returns [`ringkernel_core::error::RingKernelError::CompilationError`] with
/// the NVRTC diagnostic message if compilation fails.
#[cfg(feature = "cuda")]
pub fn compile_ptx(cuda_source: &str) -> ringkernel_core::error::Result<String> {
    use ringkernel_core::error::RingKernelError;

    let ptx = cudarc::nvrtc::compile_ptx(cuda_source).map_err(|e| {
        RingKernelError::CompilationError(format!("NVRTC compilation failed: {}", e))
    })?;

    Ok(ptx.to_src().to_string())
}
259
/// Compile CUDA C++ source to PTX text (stub for builds without CUDA).
///
/// # Errors
///
/// Always returns [`ringkernel_core::error::RingKernelError::BackendUnavailable`],
/// since NVRTC is only linked when the `cuda` feature is enabled.
#[cfg(not(feature = "cuda"))]
pub fn compile_ptx(_cuda_source: &str) -> ringkernel_core::error::Result<String> {
    Err(ringkernel_core::error::RingKernelError::BackendUnavailable(
        "CUDA feature not enabled".to_string(),
    ))
}
267
/// Minimal hand-written PTX kernel used as a template / smoke test.
///
/// The entry point `ring_kernel_main` takes the standard four ring-kernel
/// parameters (control block, input queue, output queue, shared state) and
/// immediately writes `1` at byte offset 8 of the control block — per the
/// inline comment, this marks the kernel as terminated — then returns.
/// Targets `sm_89` with PTX ISA 8.0 and 64-bit addressing.
pub const RING_KERNEL_PTX_TEMPLATE: &str = r#"
.version 8.0
.target sm_89
.address_size 64

.visible .entry ring_kernel_main(
    .param .u64 control_block_ptr,
    .param .u64 input_queue_ptr,
    .param .u64 output_queue_ptr,
    .param .u64 shared_state_ptr
) {
    .reg .u64 %cb_ptr;
    .reg .u32 %one;

    // Load control block pointer
    ld.param.u64 %cb_ptr, [control_block_ptr];

    // Mark as terminated immediately (offset 8)
    mov.u32 %one, 1;
    st.global.u32 [%cb_ptr + 8], %one;

    ret;
}
"#;