// dynamo_memory/pool/cuda.rs
1// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//! CUDA memory pool for efficient device memory allocation in hot paths.
5//!
6//! This module provides a safe wrapper around CUDA's memory pool APIs, enabling
7//! fast async allocations that avoid the overhead of cudaMalloc/cudaFree per call.
8//! Memory is returned to the pool on free and reused for subsequent allocations.
9//!
10//! # Thread Safety
11//!
12//! [`CudaMemPool`] uses internal locking to serialize host-side calls to the CUDA
13//! driver. This is required because `cuMemAllocFromPoolAsync` is not host-thread
14//! reentrant. The GPU-side operations remain stream-ordered and asynchronous.
15
16use anyhow::{Result, anyhow};
17use cudarc::driver::sys::{
18 self, CUmemAllocationType, CUmemLocationType, CUmemPool_attribute, CUmemPoolProps,
19 CUmemoryPool, CUresult, CUstream,
20};
21use cudarc::driver::{CudaContext, CudaStream};
22use std::ptr;
23use std::sync::{Arc, Mutex};
24
/// Builder for creating a CUDA memory pool with configurable parameters.
///
/// Construct via [`CudaMemPoolBuilder::new`] or [`CudaMemPool::builder`], then
/// call [`build`](Self::build) to create the pool.
///
/// # Example
/// ```ignore
/// let pool = CudaMemPoolBuilder::new(context, 64 * 1024 * 1024) // 64 MiB reserve
///     .release_threshold(32 * 1024 * 1024) // 32 MiB release threshold
///     .build()?;
/// ```
pub struct CudaMemPoolBuilder {
    /// CUDA context for the target device.
    context: Arc<CudaContext>,
    /// Bytes to pre-allocate (and immediately free) during `build` to warm
    /// the pool; `0` skips the warming step entirely.
    reserve_size: usize,
    /// Optional `CU_MEMPOOL_ATTR_RELEASE_THRESHOLD` value. Memory above this
    /// threshold is returned to the system on free; `None` leaves the CUDA
    /// default behavior in place.
    release_threshold: Option<u64>,
}
41
42impl CudaMemPoolBuilder {
43 /// Create a new builder with the required reserve size.
44 ///
45 /// # Arguments
46 /// * `context` - CUDA context for the device
47 /// * `reserve_size` - Number of bytes to pre-allocate to warm the pool
48 pub fn new(context: Arc<CudaContext>, reserve_size: usize) -> Self {
49 Self {
50 context,
51 reserve_size,
52 release_threshold: None,
53 }
54 }
55
56 /// Set the release threshold for the pool.
57 ///
58 /// Memory above this threshold is returned to the system when freed.
59 /// If not set, no release threshold is configured (CUDA default behavior).
60 pub fn release_threshold(mut self, threshold: u64) -> Self {
61 self.release_threshold = Some(threshold);
62 self
63 }
64
65 /// Build the CUDA memory pool.
66 ///
67 /// This will:
68 /// 1. Create the pool
69 /// 2. Set the release threshold if configured
70 /// 3. Pre-allocate and free memory to warm the pool
71 pub fn build(self) -> Result<CudaMemPool> {
72 // Initialize pool properties
73 let mut props: CUmemPoolProps = unsafe { std::mem::zeroed() };
74 props.allocType = CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED;
75 props.location.type_ = CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE;
76 props.location.id = self.context.cu_device();
77
78 let mut pool: CUmemoryPool = ptr::null_mut();
79
80 // Create the pool
81 let result = unsafe { sys::cuMemPoolCreate(&mut pool, &props) };
82 if result != CUresult::CUDA_SUCCESS {
83 return Err(anyhow!("cuMemPoolCreate failed with error: {:?}", result));
84 }
85
86 // Set release threshold if configured
87 if let Some(threshold) = self.release_threshold {
88 let result = unsafe {
89 sys::cuMemPoolSetAttribute(
90 pool,
91 CUmemPool_attribute::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
92 &threshold as *const u64 as *mut std::ffi::c_void,
93 )
94 };
95 if result != CUresult::CUDA_SUCCESS {
96 // Clean up on failure
97 unsafe { sys::cuMemPoolDestroy(pool) };
98 return Err(anyhow!(
99 "cuMemPoolSetAttribute failed with error: {:?}",
100 result
101 ));
102 }
103 }
104
105 let cuda_pool = CudaMemPool {
106 inner: Mutex::new(pool),
107 };
108
109 // Warm the pool by pre-allocating and freeing memory
110 if self.reserve_size > 0 {
111 // Create a temporary stream for warming
112 let stream = self.context.new_stream()?;
113
114 // Allocate to warm the pool (using safe variant)
115 let ptr = cuda_pool.alloc_async(self.reserve_size, &stream)?;
116
117 // Free back to pool (memory stays reserved)
118 cuda_pool.free_async(ptr, &stream)?;
119
120 // Synchronize to ensure operations complete
121 // SAFETY: stream.cu_stream() is valid for the lifetime of `stream`
122 let result = unsafe { sys::cuStreamSynchronize(stream.cu_stream()) };
123 if result != CUresult::CUDA_SUCCESS {
124 return Err(anyhow!(
125 "cuStreamSynchronize failed with error: {:?}",
126 result
127 ));
128 }
129 }
130
131 Ok(cuda_pool)
132 }
133}
134
/// Safe wrapper around a CUDA memory pool.
///
/// The pool amortizes allocation overhead by maintaining a reservoir of device memory.
/// Allocations are fast sub-allocations from this reservoir, and frees return memory
/// to the pool rather than the OS (until the release threshold is exceeded).
///
/// The underlying `CUmemoryPool` handle is destroyed when this value is dropped.
///
/// # Thread Safety
///
/// This type uses internal locking to serialize host-side calls to CUDA driver APIs.
/// `cuMemAllocFromPoolAsync` is not host-thread reentrant, so concurrent calls from
/// multiple threads must be serialized. The GPU-side operations remain asynchronous
/// and stream-ordered.
///
/// Use [`CudaMemPoolBuilder`] for configurable pool creation with pre-allocation.
pub struct CudaMemPool {
    /// Mutex protecting the pool handle for host-thread serialization.
    ///
    /// CUDA's `cuMemAllocFromPoolAsync` does not guarantee host-thread reentrancy,
    /// so all allocation calls must be serialized on the host side. Frees do not
    /// take this lock (see the note on `free_async_raw`).
    inner: Mutex<CUmemoryPool>,
}
156
// Send/Sync are not auto-derived because CUmemoryPool is a raw pointer type.
//
// SAFETY: CudaMemPool is Send because the Mutex serializes all host-side access
// to the pool handle, and CUDA driver state is thread-safe when properly serialized.
unsafe impl Send for CudaMemPool {}

// SAFETY: CudaMemPool is Sync because all access to the pool handle goes through
// the Mutex, which serializes host-thread access. The CUDA driver requires this
// serialization because cuMemAllocFromPoolAsync is not host-thread reentrant.
unsafe impl Sync for CudaMemPool {}
165
166impl CudaMemPool {
167 /// Create a builder for a new CUDA memory pool.
168 ///
169 /// # Arguments
170 /// * `context` - CUDA context for the device
171 /// * `reserve_size` - Number of bytes to pre-allocate to warm the pool
172 pub fn builder(context: Arc<CudaContext>, reserve_size: usize) -> CudaMemPoolBuilder {
173 CudaMemPoolBuilder::new(context, reserve_size)
174 }
175
176 /// Allocate memory from the pool asynchronously.
177 ///
178 /// This is the safe variant that takes a `&CudaStream` reference, ensuring
179 /// the stream is valid for the duration of the call.
180 ///
181 /// The allocation is stream-ordered; the memory is available for use
182 /// after all preceding operations on the stream complete.
183 ///
184 /// # Host Serialization
185 ///
186 /// This method acquires an internal mutex because `cuMemAllocFromPoolAsync`
187 /// is not host-thread reentrant. The allocation itself is stream-ordered on
188 /// the GPU side.
189 ///
190 /// # Arguments
191 /// * `size` - Size in bytes to allocate
192 /// * `stream` - CUDA stream for async ordering
193 ///
194 /// # Returns
195 /// Device pointer to the allocated memory
196 pub fn alloc_async(&self, size: usize, stream: &CudaStream) -> Result<u64> {
197 // SAFETY: stream.cu_stream() returns a valid handle owned by the CudaStream,
198 // and the borrow ensures the stream lives for the duration of this call.
199 unsafe { self.alloc_async_raw(size, stream.cu_stream()) }
200 }
201
202 /// Allocate memory from the pool asynchronously (raw stream handle variant).
203 ///
204 /// This is the unsafe variant for use when you have a raw `CUstream` handle
205 /// from sources other than cudarc's `CudaStream`.
206 ///
207 /// # Host Serialization
208 ///
209 /// This method acquires an internal mutex because `cuMemAllocFromPoolAsync`
210 /// is not host-thread reentrant.
211 ///
212 /// # Arguments
213 /// * `size` - Size in bytes to allocate
214 /// * `stream` - Raw CUDA stream handle for async ordering
215 ///
216 /// # Returns
217 /// Device pointer to the allocated memory
218 ///
219 /// # Safety
220 ///
221 /// The caller must ensure that `stream` is a valid CUDA stream handle that
222 /// will remain valid for the duration of this call.
223 pub unsafe fn alloc_async_raw(&self, size: usize, stream: CUstream) -> Result<u64> {
224 let pool = self
225 .inner
226 .lock()
227 .map_err(|e| anyhow!("mutex poisoned: {}", e))?;
228
229 let mut ptr: u64 = 0;
230
231 let result = unsafe { sys::cuMemAllocFromPoolAsync(&mut ptr, size, *pool, stream) };
232
233 if result != CUresult::CUDA_SUCCESS {
234 return Err(anyhow!(
235 "cuMemAllocFromPoolAsync failed with error: {:?}",
236 result
237 ));
238 }
239
240 Ok(ptr)
241 }
242
243 /// Free memory back to the pool asynchronously.
244 ///
245 /// This is the safe variant that takes a `&CudaStream` reference.
246 ///
247 /// The memory is returned to the pool's reservoir (not the OS) and can be
248 /// reused by subsequent allocations. The free is stream-ordered.
249 ///
250 /// # Arguments
251 /// * `ptr` - Device pointer previously allocated from this pool
252 /// * `stream` - CUDA stream for async ordering
253 pub fn free_async(&self, ptr: u64, stream: &CudaStream) -> Result<()> {
254 // SAFETY: stream.cu_stream() returns a valid handle owned by the CudaStream,
255 // and the borrow ensures the stream lives for the duration of this call.
256 unsafe { self.free_async_raw(ptr, stream.cu_stream()) }
257 }
258
259 // NOTE: Unlike alloc_async_raw, this method does NOT acquire the pool mutex.
260 // The mutex in alloc_async_raw ensures each allocation returns a unique pointer.
261 // cuMemFreeAsync only enqueues a stream-ordered free operation for that unique
262 // pointer - multiple threads can safely enqueue frees for different unique pointers
263 // concurrently. The actual return-to-pool happens asynchronously on the GPU side.
264
265 /// Free memory back to the pool asynchronously (raw stream handle variant).
266 ///
267 /// This is the unsafe variant for use when you have a raw `CUstream` handle.
268 ///
269 /// The memory is returned to the pool's reservoir (not the OS) and can be
270 /// reused by subsequent allocations. The free is stream-ordered.
271 ///
272 /// # Arguments
273 /// * `ptr` - Device pointer previously allocated from this pool
274 /// * `stream` - Raw CUDA stream handle for async ordering
275 ///
276 /// # Safety
277 ///
278 /// The caller must ensure that:
279 /// - `ptr` is a valid device pointer previously allocated from this pool
280 /// - `stream` is a valid CUDA stream handle
281 pub unsafe fn free_async_raw(&self, ptr: u64, stream: CUstream) -> Result<()> {
282 let result = unsafe { sys::cuMemFreeAsync(ptr, stream) };
283
284 if result != CUresult::CUDA_SUCCESS {
285 return Err(anyhow!("cuMemFreeAsync failed with error: {:?}", result));
286 }
287
288 Ok(())
289 }
290}
291
292impl Drop for CudaMemPool {
293 fn drop(&mut self) {
294 // No need to lock - we have &mut self so exclusive access is guaranteed
295 let pool = self
296 .inner
297 .get_mut()
298 .expect("mutex should not be poisoned during drop");
299
300 // Destroy the pool, releasing all memory back to the system
301 let result = unsafe { sys::cuMemPoolDestroy(*pool) };
302 if result != CUresult::CUDA_SUCCESS {
303 tracing::warn!("cuMemPoolDestroy failed with error: {:?}", result);
304 }
305 }
306}
307
#[cfg(all(test, feature = "testing-cuda"))]
mod tests {
    use super::*;

    /// Acquire the default CUDA context, or `None` when no device is
    /// available. Shared by every test so the skip-without-GPU boilerplate
    /// lives in one place.
    fn try_context() -> Option<Arc<CudaContext>> {
        match CudaContext::new(0) {
            Ok(ctx) => Some(ctx),
            Err(e) => {
                eprintln!("Skipping test - no CUDA device: {:?}", e);
                None
            }
        }
    }

    #[test]
    fn test_pool_creation_with_builder() {
        let Some(context) = try_context() else {
            return;
        };

        // Test builder with reserve size and release threshold.
        let result = CudaMemPool::builder(context, 1024 * 1024) // 1 MiB reserve
            .release_threshold(64 * 1024 * 1024) // 64 MiB threshold
            .build();

        match result {
            // Dropping exercises the cuMemPoolDestroy path.
            Ok(pool) => drop(pool),
            Err(e) => eprintln!("Skipping test - pool creation failed: {:?}", e),
        }
    }

    #[test]
    fn test_pool_creation_no_threshold() {
        let Some(context) = try_context() else {
            return;
        };

        // Test builder without release threshold and without warming.
        match CudaMemPool::builder(context, 0).build() {
            // Dropping exercises the cuMemPoolDestroy path.
            Ok(pool) => drop(pool),
            Err(e) => eprintln!("Skipping test - pool creation failed: {:?}", e),
        }
    }
}