numr 0.6.0-beta.1

High-performance numerical computing with multi-backend GPU acceleration (CPU/CUDA/WebGPU)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
//! CUDA Client implementation
//!
//! CudaClient owns stream and context for direct cudarc access.
//!
//! # Thread Safety
//!
//! `CudaClient` is `Clone` and can be shared across threads. The underlying
//! CUDA context and stream are reference-counted via `Arc`. However, CUDA
//! operations must be performed on the thread that owns the context or after
//! calling `context.bind_to_thread()`.

use cudarc::cublas::CudaBlas;
use cudarc::driver::safe::{CudaContext, CudaStream};
use std::sync::Arc;

use super::CudaRuntime;
use super::allocator::CudaAllocator;
use super::device::{CudaDevice, CudaError};
use super::sobol_cache::SobolDvCache;
use crate::runtime::RuntimeClient;

// ============================================================================
// CudaClient
// ============================================================================

/// CUDA Runtime Client
///
/// Owns CUDA context and stream for direct kernel launches.
/// All tensor operations launch on this stream.
///
/// # Stream Ownership
///
/// The key insight: All ops MUST launch on `self.stream()` for correct ordering.
/// Operations launched on different streams may execute out of order.
///
/// # Cloning
///
/// The context, both streams, the cuBLAS handle and the Sobol cache are held
/// behind `Arc`, so cloning a client shares those resources rather than
/// creating new ones.
///
/// # Panics
///
/// Memory allocation via the allocator may panic on CUDA OOM conditions.
/// See the module-level documentation for details.
#[derive(Clone)]
pub struct CudaClient {
    /// Device descriptor; its `index` field identifies the GPU this client targets
    pub(crate) device: CudaDevice,

    /// CUDA context for this device (owns GPU context)
    pub(crate) context: Arc<CudaContext>,

    /// Stream on which all kernels launch (compute stream)
    pub(crate) stream: Arc<CudaStream>,

    /// Dedicated stream for D2H copies (overlaps with compute stream).
    ///
    /// Created from the same context as the compute stream.
    pub(crate) copy_stream: Arc<CudaStream>,

    /// cuBLAS handle for GEMM operations (initialised on the compute stream)
    pub(crate) cublas: Arc<CudaBlas>,

    /// Allocator for memory management (constructed with the compute stream
    /// and the device's default memory-pool handle)
    pub(crate) allocator: CudaAllocator,

    /// Raw handle for custom kernel launching (clones of `context` and `stream`)
    pub(crate) raw_handle: CudaRawHandle,

    /// Persistent cache of Sobol direction-vector device buffers.
    ///
    /// Buffers are allocated once per unique dimension count and reused on
    /// every subsequent call, including inside CUDA graph capture regions.
    /// This avoids H2D memcpy nodes with stack-local source pointers.
    pub(crate) sobol_dv_cache: Arc<SobolDvCache>,
}

impl std::fmt::Debug for CudaClient {
    /// Formats the client as `CudaClient { device: .., .. }`.
    ///
    /// Only the device is printed; the remaining fields (context, streams,
    /// cuBLAS handle, allocator, caches) are elided via the non-exhaustive
    /// struct formatter.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("CudaClient");
        repr.field("device", &self.device);
        repr.finish_non_exhaustive()
    }
}

// ============================================================================
// CudaClient Implementation
// ============================================================================

impl CudaClient {
    /// Obtain the canonical CUDA client for `device`.
    ///
    /// The global client cache is consulted first, keyed by the device index.
    /// On a hit the cached client is returned as-is. On a miss a fresh client
    /// is built with `new_uncached`, handed to the cache for registration, and
    /// whatever the cache returns becomes the result. Every call site —
    /// including those that bypass the runtime's `get_or_create_client` (e.g.
    /// `build_device_client` in embedding pipelines) — therefore ends up with
    /// the same underlying streams, context, and allocator for a given device.
    ///
    /// Sharing one client per device keeps tensor allocations (routed through
    /// `CudaRuntime::allocate` → `get_or_create_client`) and caller-driven
    /// kernel launches on the same CUDA stream, which is the stream-ordering
    /// invariant `cuMemAllocAsync`/`cuMemFreeAsync` rely on.
    ///
    /// # Errors
    ///
    /// Propagates any failure from device initialisation (invalid device
    /// index, driver error, cuBLAS init failure, etc.).
    pub fn new(device: CudaDevice) -> Result<Self, CudaError> {
        match super::cache::try_get_cached_client(device.index) {
            // Cache hit: reuse the already-registered client.
            Some(existing) => Ok(existing),
            // Cache miss: build one, then let the cache decide which instance wins.
            None => {
                let fresh = Self::new_uncached(device)?;
                let index = fresh.device.index;
                Ok(super::cache::register_or_get_client(index, fresh))
            }
        }
    }

    /// Construct a brand-new client without consulting the cache.
    ///
    /// Used internally by `get_or_create_client` and `new`. External callers
    /// should always use `CudaClient::new`.
    ///
    /// Initialisation order: create the context, bind it to the calling
    /// thread, create the compute and copy streams, initialise cuBLAS on the
    /// compute stream, configure the default memory pool's release threshold,
    /// then build the allocator and raw handle.
    ///
    /// # Errors
    ///
    /// - `CudaError::ContextError` if context creation, thread binding, or
    ///   either stream creation fails.
    /// - `CudaError::CublasError` if the cuBLAS handle cannot be created.
    ///
    /// Failure to look up or configure the default memory pool is *not* an
    /// error: the allocator is then constructed with a pool handle of `0`.
    pub(super) fn new_uncached(device: CudaDevice) -> Result<Self, CudaError> {
        // Create CUDA context for this device
        let context = CudaContext::new(device.index).map_err(|e| {
            CudaError::ContextError(format!(
                "Failed to create CUDA context for device {}: {:?}",
                device.index, e
            ))
        })?;

        // Bind context to current thread for proper cuBLAS operation
        context.bind_to_thread().map_err(|e| {
            CudaError::ContextError(format!("Failed to bind CUDA context to thread: {:?}", e))
        })?;

        // Create compute stream
        let stream = context.new_stream().map_err(|e| {
            CudaError::ContextError(format!("Failed to create CUDA stream: {:?}", e))
        })?;

        // Create dedicated copy stream for overlapped D2H transfers
        let copy_stream = context.new_stream().map_err(|e| {
            CudaError::ContextError(format!("Failed to create CUDA copy stream: {:?}", e))
        })?;

        // Initialize cuBLAS handle for GEMM operations
        let cublas = CudaBlas::new(stream.clone())
            .map_err(|e| CudaError::CublasError(format!("Failed to initialize cuBLAS: {:?}", e)))?;

        // Configure the default memory pool with a bounded release threshold.
        // `u64::MAX` (cache everything forever) is great for tight decode loops
        // but causes fragmentation: after loading many small tensors then trying
        // to allocate a multi-GB contiguous block (e.g. a large model's weight),
        // the pool reports OOM despite ample free VRAM because its address space
        // is fragmented. A 512 MiB threshold lets the pool keep moderate caches
        // for hot reuse but reclaim larger freed segments back to the OS.
        // The OOM retry path additionally trims the pool to 0 to recover
        // from fragmentation when a large request would otherwise fail.
        //
        // The threshold is tunable per workload/hardware via the
        // `NUMR_CUDA_POOL_RELEASE_THRESHOLD_MB` env var (value in MiB); many-shape
        // workloads (e.g. variable-length embedding ingest) may prefer a smaller
        // threshold to reclaim aggressively, while tight decode loops may raise it.
        //
        // `pool_handle` stays 0 when the default pool cannot be obtained.
        let mut pool_handle: u64 = 0;
        // SAFETY: `pool` and `threshold` are locals that outlive the driver calls
        // they are passed to. The `*const` -> `*mut` cast on `threshold` only
        // satisfies the FFI signature; presumably the driver only reads the value
        // when setting an attribute — TODO confirm against the CUDA driver docs.
        unsafe {
            let mut pool: cudarc::driver::sys::CUmemoryPool = std::ptr::null_mut();
            let result =
                cudarc::driver::sys::cuDeviceGetDefaultMemPool(&mut pool, device.index as i32);
            if result == cudarc::driver::sys::CUresult::CUDA_SUCCESS && !pool.is_null() {
                // Second argument is the fallback used by `env_mib_to_bytes`
                // (512 MiB expressed in bytes).
                let threshold: u64 = super::env_config::env_mib_to_bytes(
                    "NUMR_CUDA_POOL_RELEASE_THRESHOLD_MB",
                    512 * 1024 * 1024,
                );
                // Best-effort: the status of the attribute update is discarded,
                // so a failure here leaves the pool at its existing threshold.
                let _ = cudarc::driver::sys::cuMemPoolSetAttribute(
                    pool,
                    cudarc::driver::sys::CUmemPool_attribute::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
                    &threshold as *const u64 as *mut std::ffi::c_void,
                );
                pool_handle = pool as u64;
            }
        }

        // The allocator shares the compute stream.
        let allocator = CudaAllocator::new(stream.clone(), pool_handle);

        // The raw handle exposes the same context and compute stream to callers
        // that launch their own kernels.
        let raw_handle = CudaRawHandle {
            context: context.clone(),
            stream: stream.clone(),
        };

        Ok(Self {
            device,
            context,
            stream,
            copy_stream,
            cublas: Arc::new(cublas),
            allocator,
            raw_handle,
            sobol_dv_cache: SobolDvCache::new(),
        })
    }

    /// Borrow the compute stream.
    ///
    /// **CRITICAL**: every kernel launch MUST be issued on this stream so that
    /// operations stay correctly ordered.
    #[inline]
    pub fn stream(&self) -> &CudaStream {
        self.stream.as_ref()
    }

    /// Get the Arc-wrapped CUDA stream for operations that need ownership.
    ///
    /// This is the same compute stream that [`CudaClient::stream`] borrows;
    /// callers that need an owned handle can `Arc::clone` the returned
    /// reference.
    #[inline]
    pub fn stream_arc(&self) -> &Arc<CudaStream> {
        &self.stream
    }

    /// Get reference to the CUDA context.
    ///
    /// Returns the `Arc` itself, so callers may clone it to keep the context
    /// alive independently of this client.
    #[inline]
    pub fn context(&self) -> &Arc<CudaContext> {
        &self.context
    }

    /// Borrow the dedicated copy stream (used for overlapped D2H transfers).
    #[inline]
    pub fn copy_stream(&self) -> &CudaStream {
        self.copy_stream.as_ref()
    }

    /// Borrow the cuBLAS handle shared by this client.
    #[inline]
    pub fn cublas(&self) -> &CudaBlas {
        self.cublas.as_ref()
    }

    /// Create a timing-disabled CUDA event and record it on the compute stream.
    ///
    /// The returned `u64` is the raw event handle; pass it to
    /// `copy_stream_wait_event` to order the copy stream after the work
    /// recorded so far, and release it with `destroy_event` when done.
    ///
    /// # Errors
    ///
    /// Returns `CudaError::ContextError` if the event cannot be created or
    /// recorded. If recording fails, the freshly created event is destroyed
    /// before the error is returned.
    pub fn record_event_on_compute(&self) -> Result<u64, CudaError> {
        use cudarc::driver::sys::{
            CUevent_flags, CUresult, cuEventCreate, cuEventDestroy_v2, cuEventRecord,
        };

        let mut event = std::ptr::null_mut();

        // SAFETY: `event` is a valid out-pointer to a local handle slot.
        let created =
            unsafe { cuEventCreate(&mut event, CUevent_flags::CU_EVENT_DISABLE_TIMING as u32) };
        if created != CUresult::CUDA_SUCCESS {
            return Err(CudaError::ContextError(format!(
                "cuEventCreate failed: {:?}",
                created
            )));
        }

        let compute_stream = self.stream.cu_stream();
        // SAFETY: `event` was just created successfully and the stream handle
        // belongs to this client.
        let recorded = unsafe { cuEventRecord(event, compute_stream) };
        if recorded != CUresult::CUDA_SUCCESS {
            // Do not leak the event when it could not be recorded.
            // SAFETY: `event` is a live handle created above and not yet shared.
            unsafe {
                cuEventDestroy_v2(event);
            }
            return Err(CudaError::ContextError(format!(
                "cuEventRecord failed: {:?}",
                recorded
            )));
        }

        Ok(event as u64)
    }

    /// Enqueue a wait on the copy stream for `event`.
    ///
    /// `event` is a raw handle as returned by `record_event_on_compute`. Work
    /// submitted to the copy stream after this call is ordered behind the
    /// event.
    ///
    /// # Errors
    ///
    /// Returns `CudaError::ContextError` if the driver rejects the wait.
    pub fn copy_stream_wait_event(&self, event: u64) -> Result<(), CudaError> {
        use cudarc::driver::sys::{CUevent, CUresult, cuStreamWaitEvent};

        let copy_stream = self.copy_stream.cu_stream();
        // SAFETY: the stream handle belongs to this client; the caller supplies
        // an event handle obtained from `record_event_on_compute`.
        let status = unsafe { cuStreamWaitEvent(copy_stream, event as CUevent, 0) };

        if status == CUresult::CUDA_SUCCESS {
            Ok(())
        } else {
            Err(CudaError::ContextError(format!(
                "cuStreamWaitEvent failed: {:?}",
                status
            )))
        }
    }

    /// Eagerly load the named CUDA PTX modules for this client's device.
    ///
    /// Intended for warmup: pass the numr kernel module names that inference
    /// will use so that JIT compilation latency is paid up front rather than
    /// on first use. The work is delegated to the kernel module loader with
    /// this client's context and device index.
    pub fn preload_modules(&self, module_names: &[&'static str]) -> crate::error::Result<()> {
        let device_index = self.device.index;
        crate::runtime::cuda::kernels::preload_modules(&self.context, device_index, module_names)
    }

    /// Pre-populate the Sobol direction-vector device buffer for `dimension`.
    ///
    /// Computes `dimension * 32` direction vectors on the host, uploads them to
    /// a persistent device buffer, and stores the pointer in the per-client
    /// cache. Subsequent calls to `sobol(…, dimension, …)` — including those
    /// executed inside a CUDA graph capture region — will use the cached pointer
    /// instead of performing a new H2D copy, which would embed a memcpy node
    /// with a freed host-pointer source in the captured graph.
    ///
    /// # Contract for CUDA graph capture
    ///
    /// Call `warmup_sobol(dimension)` **once**, outside any capture region,
    /// before using `sobol(…, dimension, …)` inside `capture_graph_into`. The
    /// upload stream is synchronised before this method returns, so the device
    /// buffer is fully ready when the next capture begins.
    ///
    /// Calling `warmup_sobol` with the same `dimension` multiple times is safe
    /// and cheap (the cache entry already exists; no second allocation is made).
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - `dimension` exceeds the maximum supported by the Joe & Kuo dataset
    ///   (21 201).
    /// - Device memory allocation fails.
    /// - The H2D copy fails.
    pub fn warmup_sobol(&self, dimension: usize) -> crate::error::Result<()> {
        use crate::ops::common::quasirandom::{SOBOL_BITS, SOBOL_MAX_DIMENSIONS};

        if dimension == 0 {
            return Err(crate::error::Error::InvalidArgument {
                arg: "dimension",
                reason: "Sobol dimension must be at least 1".into(),
            });
        }
        if dimension > SOBOL_MAX_DIMENSIONS {
            return Err(crate::error::Error::InvalidArgument {
                arg: "dimension",
                reason: format!(
                    "Sobol dimension {} exceeds maximum supported value {}",
                    dimension, SOBOL_MAX_DIMENSIONS
                ),
            });
        }

        let dim_u32 = dimension as u32;

        // Fast path: already cached.
        if self.sobol_dv_cache.get(dim_u32).is_some() {
            return Ok(());
        }

        // Compute direction vectors on the host.
        let direction_vectors =
            crate::ops::common::quasirandom::compute_all_direction_vectors(dimension);
        let num_u32s = direction_vectors.len();
        debug_assert_eq!(num_u32s, dimension * SOBOL_BITS);

        let dv_bytes = bytemuck::cast_slice::<u32, u8>(&direction_vectors);

        // Allocate a device buffer directly via the driver (bypassing the
        // caching allocator's frozen path) so the address survives across
        // graph replays.  We use `cuMemAllocAsync` on the compute stream
        // for proper pool membership and synchronise before returning.
        let dv_ptr: u64 = unsafe {
            let mut ptr: u64 = 0;
            let r = cudarc::driver::sys::cuMemAllocAsync(
                &mut ptr,
                dv_bytes.len(),
                self.stream.cu_stream(),
            );
            if r != cudarc::driver::sys::CUresult::CUDA_SUCCESS {
                return Err(crate::error::Error::OutOfMemory {
                    size: dv_bytes.len(),
                });
            }
            ptr
        };

        // H2D copy.
        unsafe {
            let r = cudarc::driver::sys::cuMemcpyHtoDAsync_v2(
                dv_ptr,
                dv_bytes.as_ptr() as *const std::ffi::c_void,
                dv_bytes.len(),
                self.stream.cu_stream(),
            );
            if r != cudarc::driver::sys::CUresult::CUDA_SUCCESS {
                // Free the buffer we just allocated before returning the error.
                let _ = cudarc::driver::sys::cuMemFreeAsync(dv_ptr, self.stream.cu_stream());
                return Err(crate::error::Error::Backend(format!(
                    "Sobol warmup H2D copy failed: {:?}",
                    r
                )));
            }
        }

        // Synchronise: the buffer must be fully uploaded before any subsequent
        // capture region references the pointer.
        self.stream
            .synchronize()
            .map_err(|e| crate::error::Error::Internal(format!("stream sync failed: {:?}", e)))?;

        // Store in cache. Ownership of the device pointer is transferred.
        // SAFETY: ptr is a valid, fully-uploaded device buffer.
        unsafe { self.sobol_dv_cache.insert(dim_u32, dv_ptr, num_u32s) };

        Ok(())
    }

    /// Destroy a CUDA event handle returned by `record_event_on_compute`.
    ///
    /// Call this once the copy stream no longer needs the event (for example
    /// after `copy_stream.synchronize()`).
    ///
    /// # Handle validity
    ///
    /// `event` must be a handle obtained from `record_event_on_compute` that
    /// has not already been destroyed; each handle must be destroyed at most
    /// once. A zero (null) handle is accepted and ignored without calling into
    /// the driver. Any error status reported by the driver is discarded, since
    /// this is a best-effort cleanup routine with no return value.
    pub fn destroy_event(&self, event: u64) {
        // Null handle: nothing to release, so skip the driver call entirely.
        if event == 0 {
            return;
        }
        // SAFETY: the caller guarantees `event` is a live handle produced by
        // `record_event_on_compute` (see "Handle validity" above).
        unsafe {
            let _ = cudarc::driver::sys::cuEventDestroy_v2(event as cudarc::driver::sys::CUevent);
        }
    }
}

impl RuntimeClient<CudaRuntime> for CudaClient {
    /// The device this client was created for.
    fn device(&self) -> &CudaDevice {
        &self.device
    }

    /// Block until all work queued on the compute stream has finished.
    ///
    /// A synchronisation failure is reported on stderr rather than returned,
    /// because this method has no error channel.
    fn synchronize(&self) {
        match self.stream.synchronize() {
            Ok(_) => {}
            Err(e) => eprintln!("[numr::cuda] Stream synchronization failed: {:?}", e),
        }
    }

    /// The allocator owned by this client.
    fn allocator(&self) -> &CudaAllocator {
        &self.allocator
    }

    /// The raw compute-stream handle, widened to `u64`. Always `Some`.
    fn compute_stream_handle(&self) -> Option<u64> {
        let raw_stream = self.stream.cu_stream();
        Some(raw_stream as u64)
    }
}

// ============================================================================
// CudaRawHandle
// ============================================================================

/// Raw handle for custom kernel launching.
///
/// Provides access to the CUDA context and stream for users who want
/// to launch their own kernels outside of numr's operation system.
///
/// Both fields are reference-counted, so cloning the handle shares the
/// underlying context and stream. When the handle is built by `CudaClient`,
/// `stream` is the client's compute stream, so kernels launched through it are
/// ordered with numr's own operations.
///
/// # Example
///
/// ```ignore
/// let client = CudaRuntime::default_client(&device);
/// let handle = CudaRuntime::raw_handle(&client);
///
/// // Use handle.stream for custom kernel launches
/// // Use handle.context for context management
/// ```
#[derive(Clone)]
pub struct CudaRawHandle {
    /// CUDA context for device management
    pub context: Arc<CudaContext>,
    /// CUDA stream for kernel execution
    pub stream: Arc<CudaStream>,
}