Skip to main content

oxicuda_driver/
stream_ordered_alloc.rs

1//! Stream-ordered memory allocation (CUDA 11.2+ / 12.x+).
2//!
3//! Stream-ordered memory allocation allows memory operations (`alloc` / `free`)
4//! to participate in the stream execution order, eliminating the need for
5//! explicit synchronisation between allocation and kernel launch.
6//!
7//! This module provides:
8//!
9//! * [`StreamMemoryPool`] — a memory pool bound to a specific device.
10//! * [`StreamAllocation`] — a handle to a stream-ordered allocation.
11//! * [`StreamOrderedAllocConfig`] — pool configuration (sizes, thresholds).
12//! * [`PoolAttribute`] / [`PoolUsageStats`] — attribute queries and statistics.
13//! * [`PoolExportDescriptor`] / [`ShareableHandleType`] — IPC sharing metadata.
14//! * [`stream_alloc`] / [`stream_free`] — convenience free functions.
15//!
16//! # Platform behaviour
17//!
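//! On macOS (where NVIDIA dropped CUDA support) there is no driver to call:
//! allocation and free are emulated with synthetic pointers so the API can
//! be exercised, while trimming, peer access, and the reuse-policy
//! attributes return `Err(CudaError::NotSupported)`.  Config validation,
//! statistics tracking, and accessor methods work everywhere.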
21//!
22//! # Example
23//!
24//! ```rust,no_run
25//! use oxicuda_driver::stream_ordered_alloc::*;
26//!
27//! let config = StreamOrderedAllocConfig::default_for_device(0);
28//! let mut pool = StreamMemoryPool::new(config)?;
29//!
30//! let stream_handle = 0u64; // placeholder
31//! let mut alloc = pool.alloc_async(1024, stream_handle)?;
32//! assert_eq!(alloc.size(), 1024);
33//! assert!(!alloc.is_freed());
34//!
35//! pool.free_async(&mut alloc)?;
36//! assert!(alloc.is_freed());
37//! # Ok::<(), oxicuda_driver::CudaError>(())
38//! ```
39
40use std::fmt;
41
42use crate::error::{CudaError, CudaResult};
43use crate::ffi::CUdeviceptr;
44
45// ---------------------------------------------------------------------------
46// Constants — CUmemPoolAttribute (mirrors CUDA header values)
47// ---------------------------------------------------------------------------
48
/// Pool reuse policy: follow event dependencies.
pub const CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: u32 = 1;
/// Pool reuse policy: allow opportunistic reuse.
pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: u32 = 2;
/// Pool reuse policy: allow internal dependency insertion.
pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: u32 = 3;
/// Release threshold in bytes: the amount of reserved memory the pool holds
/// on to before it tries to release memory back to the OS.
pub const CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: u32 = 4;
/// Current reserved memory (bytes) — read-only.
pub const CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: u32 = 5;
/// High-water mark of reserved memory (bytes) — resettable.
pub const CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: u32 = 6;
/// Current used memory (bytes) — read-only.
pub const CU_MEMPOOL_ATTR_USED_MEM_CURRENT: u32 = 7;
/// High-water mark of used memory (bytes) — resettable.
pub const CU_MEMPOOL_ATTR_USED_MEM_HIGH: u32 = 8;
65
66// ---------------------------------------------------------------------------
67// StreamOrderedAllocConfig
68// ---------------------------------------------------------------------------
69
/// Configuration for a stream-ordered memory pool.
///
/// All sizes are in bytes.  A value of `0` for `max_pool_size` means
/// "no limit"; the other size fields are only checked against the limit
/// when one is set.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StreamOrderedAllocConfig {
    /// Initial pool size in bytes.  The pool pre-reserves this amount of
    /// device memory when created.
    // NOTE(review): the pool-creation helper visible in this file forwards
    // only `device` and `max_pool_size` to the driver — confirm where (or
    // whether) the pre-reservation is actually applied.
    pub initial_pool_size: usize,

    /// Maximum pool size in bytes.  `0` means unlimited — the pool will grow
    /// as needed (subject to device memory limits).
    pub max_pool_size: usize,

    /// Release threshold in bytes: the amount of reserved memory the pool is
    /// meant to hold on to rather than return to the OS.
    // NOTE(review): this value is not forwarded to the driver at creation
    // time in the visible code; it is only updated through `set_attribute`.
    pub release_threshold: usize,

    /// The device ordinal to create the pool on.  Must be non-negative.
    pub device: i32,
}
90
91impl StreamOrderedAllocConfig {
92    /// Validate that the configuration is internally consistent.
93    ///
94    /// # Rules
95    ///
96    /// * `initial_pool_size` must not exceed `max_pool_size` (when
97    ///   `max_pool_size > 0`).
98    /// * `release_threshold` must not exceed `max_pool_size` (when
99    ///   `max_pool_size > 0`).
100    /// * `device` must be non-negative.
101    ///
102    /// # Errors
103    ///
104    /// Returns [`CudaError::InvalidValue`] if any rule is violated.
105    pub fn validate(&self) -> CudaResult<()> {
106        if self.device < 0 {
107            return Err(CudaError::InvalidValue);
108        }
109
110        if self.max_pool_size > 0 {
111            if self.initial_pool_size > self.max_pool_size {
112                return Err(CudaError::InvalidValue);
113            }
114            if self.release_threshold > self.max_pool_size {
115                return Err(CudaError::InvalidValue);
116            }
117        }
118
119        Ok(())
120    }
121
122    /// Returns a sensible default configuration for the given device.
123    ///
124    /// * `initial_pool_size` = 0 (grow on demand)
125    /// * `max_pool_size` = 0 (unlimited)
126    /// * `release_threshold` = 0 (release everything on trim)
127    pub fn default_for_device(device: i32) -> Self {
128        Self {
129            initial_pool_size: 0,
130            max_pool_size: 0,
131            release_threshold: 0,
132            device,
133        }
134    }
135}
136
137// ---------------------------------------------------------------------------
138// PoolAttribute
139// ---------------------------------------------------------------------------
140
/// Attributes that can be queried or set on a [`StreamMemoryPool`].
///
/// Only [`PoolAttribute::ReleaseThreshold`] carries a payload; every other
/// variant is a bare selector.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PoolAttribute {
    /// Whether freed blocks can be reused by following event dependencies.
    ReuseFollowEventDependencies,
    /// Whether freed blocks can be opportunistically reused (without ordering).
    ReuseAllowOpportunistic,
    /// Whether the pool may insert internal dependencies for reuse.
    ReuseAllowInternalDependencies,
    /// The release threshold; the payload is the threshold in bytes.
    ReleaseThreshold(u64),
    /// Current reserved memory (read-only query).
    ReservedMemCurrent,
    /// High-water mark of reserved memory.
    ReservedMemHigh,
    /// Current used memory (read-only query).
    UsedMemCurrent,
    /// High-water mark of used memory.
    UsedMemHigh,
}
161
162impl PoolAttribute {
163    /// Convert to the raw CUDA attribute constant.
164    pub fn to_raw(self) -> u32 {
165        match self {
166            Self::ReuseFollowEventDependencies => CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES,
167            Self::ReuseAllowOpportunistic => CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
168            Self::ReuseAllowInternalDependencies => {
169                CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES
170            }
171            Self::ReleaseThreshold(_) => CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
172            Self::ReservedMemCurrent => CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
173            Self::ReservedMemHigh => CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
174            Self::UsedMemCurrent => CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
175            Self::UsedMemHigh => CU_MEMPOOL_ATTR_USED_MEM_HIGH,
176        }
177    }
178}
179
180// ---------------------------------------------------------------------------
181// PoolUsageStats
182// ---------------------------------------------------------------------------
183
/// Snapshot of pool memory usage.
///
/// A plain value type: all fields are public and `Default` yields an
/// all-zero snapshot.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct PoolUsageStats {
    /// Bytes currently reserved from the device allocator.
    pub reserved_current: u64,
    /// Peak bytes reserved (since creation or last reset).
    pub reserved_high: u64,
    /// Bytes currently in use by outstanding allocations.
    pub used_current: u64,
    /// Peak bytes in use (since creation or last reset).
    pub used_high: u64,
    /// Number of active (not-yet-freed) allocations.
    pub active_allocations: usize,
    /// Peak number of concurrent allocations.
    pub peak_allocations: usize,
}
200
201// ---------------------------------------------------------------------------
202// ShareableHandleType / PoolExportDescriptor
203// ---------------------------------------------------------------------------
204
/// Handle type used for IPC sharing of memory pools.
///
/// Defaults to [`ShareableHandleType::None`] (no sharing).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ShareableHandleType {
    /// No sharing.
    #[default]
    None,
    /// POSIX file descriptor (Linux).
    PosixFileDescriptor,
    /// Win32 handle (Windows).
    Win32Handle,
    /// Win32 KMT handle (Windows, legacy).
    Win32KmtHandle,
}
218
/// Descriptor for exporting a pool for IPC sharing.
///
/// Pure metadata: it pairs the kind of OS handle to export with the device
/// that owns the pool.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PoolExportDescriptor {
    /// The handle type to use for sharing.
    pub shareable_handle_type: ShareableHandleType,
    /// The device ordinal that owns the pool.
    pub pool_device: i32,
}
227
228// ---------------------------------------------------------------------------
229// StreamAllocation
230// ---------------------------------------------------------------------------
231
/// Handle to a stream-ordered memory allocation.
///
/// An allocation lives on the GPU and is associated with a specific stream
/// and memory pool.  It becomes available when all preceding work on the
/// stream has completed, and is returned to the pool when freed (also
/// stream-ordered).
///
/// All fields are private; values are read through the accessor methods
/// and the `freed` flag is flipped by the pool when the allocation is
/// released.
pub struct StreamAllocation {
    /// Device pointer (`CUdeviceptr`).
    ptr: CUdeviceptr,
    /// Size of the allocation in bytes, as requested at allocation time.
    size: usize,
    /// The stream this allocation is ordered on (raw handle value).
    stream: u64,
    /// Raw handle of the pool that produced this allocation (`0` when the
    /// pool is not backed by a driver pool).
    pool: u64,
    /// Whether this allocation has already been freed.
    freed: bool,
}
250
impl StreamAllocation {
    /// Returns the device pointer as a raw `u64` (`CUdeviceptr`).
    ///
    /// The value is returned even after the allocation has been freed;
    /// check [`StreamAllocation::is_freed`] before using it.
    #[inline]
    pub fn as_ptr(&self) -> u64 {
        self.ptr
    }

    /// Returns the allocation size in bytes.
    #[inline]
    pub fn size(&self) -> usize {
        self.size
    }

    /// Returns `true` if this allocation has been freed.
    #[inline]
    pub fn is_freed(&self) -> bool {
        self.freed
    }

    /// Returns the stream handle this allocation is ordered on.
    #[inline]
    pub fn stream(&self) -> u64 {
        self.stream
    }

    /// Returns the raw handle of the pool this allocation came from.
    #[inline]
    pub fn pool(&self) -> u64 {
        self.pool
    }
}
282
283impl fmt::Debug for StreamAllocation {
284    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
285        f.debug_struct("StreamAllocation")
286            .field("ptr", &format_args!("0x{:016x}", self.ptr))
287            .field("size", &self.size)
288            .field("stream", &format_args!("0x{:016x}", self.stream))
289            .field("freed", &self.freed)
290            .finish()
291    }
292}
293
294// ---------------------------------------------------------------------------
295// StreamMemoryPool
296// ---------------------------------------------------------------------------
297
/// A memory pool for stream-ordered allocations.
///
/// On platforms with a real CUDA driver (Linux, Windows), creating a pool
/// calls `cuMemPoolCreate` under the hood.  On macOS (where there is no
/// NVIDIA driver), the pool is a local-only object: allocation and free
/// are emulated with synthetic pointers, while trimming, peer access, and
/// the reuse-policy attributes return `Err(CudaError::NotSupported)`.
///
/// # Allocation tracking
///
/// The pool tracks allocation counts and byte totals locally for
/// diagnostics.  These statistics are maintained even on macOS so that
/// the API surface can be exercised in tests.
pub struct StreamMemoryPool {
    /// Raw `CUmemoryPool` handle (0 if not backed by a real driver pool).
    handle: u64,
    /// Device ordinal.
    device: i32,
    /// Configuration used to create this pool.  `release_threshold` is
    /// updated when the release-threshold attribute is set.
    config: StreamOrderedAllocConfig,
    /// Number of currently active (not freed) allocations.
    active_allocations: usize,
    /// Total bytes currently allocated.
    total_allocated: usize,
    /// Peak bytes ever allocated concurrently (since creation or the last
    /// peak reset).
    peak_allocated: usize,
    /// Peak number of concurrent allocations (since creation or the last
    /// peak reset).
    peak_allocation_count: usize,
    /// Monotonically increasing allocation id for generating unique pointers
    /// in non-GPU mode.
    // Only read on macOS, hence the dead-code allowance elsewhere.
    #[cfg_attr(not(target_os = "macos"), allow(dead_code))]
    next_alloc_id: u64,
}
330
331impl fmt::Debug for StreamMemoryPool {
332    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
333        f.debug_struct("StreamMemoryPool")
334            .field("handle", &format_args!("0x{:016x}", self.handle))
335            .field("device", &self.device)
336            .field("active_allocations", &self.active_allocations)
337            .field("total_allocated", &self.total_allocated)
338            .field("peak_allocated", &self.peak_allocated)
339            .finish()
340    }
341}
342
343impl StreamMemoryPool {
344    /// Create a new memory pool for the given device.
345    ///
346    /// The configuration is validated before the pool is created.  On
347    /// platforms with a real CUDA driver, `cuMemPoolCreate` is invoked.
348    /// On macOS, a local-only pool is created for testing purposes.
349    ///
350    /// # Errors
351    ///
352    /// * [`CudaError::InvalidValue`] if the config fails validation.
353    /// * [`CudaError::NotSupported`] on macOS (pool metadata is still created
354    ///   so that tests can exercise the API).
355    pub fn new(config: StreamOrderedAllocConfig) -> CudaResult<Self> {
356        config.validate()?;
357
358        #[cfg_attr(target_os = "macos", allow(unused_mut))]
359        let mut pool = Self {
360            handle: 0,
361            device: config.device,
362            config,
363            active_allocations: 0,
364            total_allocated: 0,
365            peak_allocated: 0,
366            peak_allocation_count: 0,
367            next_alloc_id: 1,
368        };
369
370        // On real GPU platforms, create the driver-side pool via
371        // `cuMemPoolCreate` and store the returned handle.  When the driver
372        // is absent the call returns `Err` and pool creation fails cleanly.
373        #[cfg(not(target_os = "macos"))]
374        {
375            pool.handle = Self::gpu_create_pool(&pool.config)?;
376        }
377
378        Ok(pool)
379    }
380
381    /// Allocate memory on a stream (stream-ordered).
382    ///
383    /// The allocation becomes available when all prior work on the stream
384    /// has completed.  The returned [`StreamAllocation`] tracks the pointer,
385    /// size, and ownership.
386    ///
387    /// # Errors
388    ///
389    /// * [`CudaError::InvalidValue`] if `size` is zero.
390    /// * [`CudaError::OutOfMemory`] if `max_pool_size` would be exceeded.
391    /// * [`CudaError::NotSupported`] on macOS.
392    pub fn alloc_async(&mut self, size: usize, stream: u64) -> CudaResult<StreamAllocation> {
393        if size == 0 {
394            return Err(CudaError::InvalidValue);
395        }
396
397        // Check max pool size constraint.
398        if self.config.max_pool_size > 0
399            && self.total_allocated.saturating_add(size) > self.config.max_pool_size
400        {
401            return Err(CudaError::OutOfMemory);
402        }
403
404        let ptr = self.platform_alloc_async(size, stream)?;
405
406        // Update bookkeeping.
407        self.active_allocations += 1;
408        self.total_allocated = self.total_allocated.saturating_add(size);
409        if self.total_allocated > self.peak_allocated {
410            self.peak_allocated = self.total_allocated;
411        }
412        if self.active_allocations > self.peak_allocation_count {
413            self.peak_allocation_count = self.active_allocations;
414        }
415
416        Ok(StreamAllocation {
417            ptr,
418            size,
419            stream,
420            pool: self.handle,
421            freed: false,
422        })
423    }
424
425    /// Free memory on a stream (stream-ordered).
426    ///
427    /// The memory is returned to the pool when all prior work on the
428    /// stream has completed.  The allocation is marked as freed and
429    /// cannot be freed again.
430    ///
431    /// # Errors
432    ///
433    /// * [`CudaError::InvalidValue`] if the allocation is already freed.
434    /// * [`CudaError::NotSupported`] on macOS.
435    pub fn free_async(&mut self, alloc: &mut StreamAllocation) -> CudaResult<()> {
436        if alloc.freed {
437            return Err(CudaError::InvalidValue);
438        }
439
440        self.platform_free_async(alloc)?;
441
442        alloc.freed = true;
443        self.active_allocations = self.active_allocations.saturating_sub(1);
444        self.total_allocated = self.total_allocated.saturating_sub(alloc.size);
445
446        Ok(())
447    }
448
    /// Trim the pool, releasing unused memory back to the OS.
    ///
    /// `min_bytes_to_keep` is forwarded unchanged to the platform
    /// implementation (`cuMemPoolTrimTo` on non-macOS targets) as the
    /// amount of reserved memory the pool should hold on to.  The locally
    /// tracked statistics are not modified by this call.
    ///
    /// # Errors
    ///
    /// * [`CudaError::NotSupported`] on macOS.
    /// * On other targets, any error from the driver trim call.
    pub fn trim(&mut self, min_bytes_to_keep: usize) -> CudaResult<()> {
        self.platform_trim(min_bytes_to_keep)
    }
460
461    /// Get pool usage statistics.
462    ///
463    /// The returned [`PoolUsageStats`] combines locally tracked allocation
464    /// counts with byte-level information.  On macOS, the reserved/used
465    /// byte fields mirror the local bookkeeping since no driver is available.
466    pub fn stats(&self) -> PoolUsageStats {
467        PoolUsageStats {
468            reserved_current: self.total_allocated as u64,
469            reserved_high: self.peak_allocated as u64,
470            used_current: self.total_allocated as u64,
471            used_high: self.peak_allocated as u64,
472            active_allocations: self.active_allocations,
473            peak_allocations: self.peak_allocation_count,
474        }
475    }
476
477    /// Set a pool attribute.
478    ///
479    /// Only attributes that carry a value (e.g. [`PoolAttribute::ReleaseThreshold`])
480    /// modify pool state.  Read-only attributes (e.g. `ReservedMemCurrent`)
481    /// return [`CudaError::InvalidValue`].
482    ///
483    /// # Errors
484    ///
485    /// * [`CudaError::InvalidValue`] for read-only attributes.
486    /// * [`CudaError::NotSupported`] on macOS.
487    pub fn set_attribute(&mut self, attr: PoolAttribute) -> CudaResult<()> {
488        // Read-only attributes cannot be set.
489        match attr {
490            PoolAttribute::ReservedMemCurrent
491            | PoolAttribute::UsedMemCurrent
492            | PoolAttribute::ReservedMemHigh
493            | PoolAttribute::UsedMemHigh => {
494                return Err(CudaError::InvalidValue);
495            }
496            _ => {}
497        }
498
499        // Apply locally-meaningful attributes.
500        if let PoolAttribute::ReleaseThreshold(val) = attr {
501            self.config.release_threshold = val as usize;
502        }
503
504        self.platform_set_attribute(attr)
505    }
506
507    /// Enable peer access from another device to allocations in this pool.
508    ///
509    /// After this call, kernels running on `peer_device` can access memory
510    /// allocated from this pool.
511    ///
512    /// # Errors
513    ///
514    /// * [`CudaError::InvalidDevice`] if `peer_device` equals this pool's device.
515    /// * [`CudaError::NotSupported`] on macOS.
516    pub fn enable_peer_access(&self, peer_device: i32) -> CudaResult<()> {
517        if peer_device == self.device {
518            return Err(CudaError::InvalidDevice);
519        }
520
521        self.platform_enable_peer_access(peer_device)
522    }
523
524    /// Disable peer access from another device to allocations in this pool.
525    ///
526    /// # Errors
527    ///
528    /// * [`CudaError::InvalidDevice`] if `peer_device` equals this pool's device.
529    /// * [`CudaError::NotSupported`] on macOS.
530    pub fn disable_peer_access(&self, peer_device: i32) -> CudaResult<()> {
531        if peer_device == self.device {
532            return Err(CudaError::InvalidDevice);
533        }
534
535        self.platform_disable_peer_access(peer_device)
536    }
537
538    /// Reset peak statistics (peak allocated bytes and peak allocation count).
539    pub fn reset_peak_stats(&mut self) {
540        self.peak_allocated = self.total_allocated;
541        self.peak_allocation_count = self.active_allocations;
542    }
543
544    /// Get the default memory pool for a device.
545    ///
546    /// CUDA provides a default pool per device, queried via
547    /// `cuDeviceGetDefaultMemPool`.  The returned pool is owned by the
548    /// driver and is *not* destroyed when the [`StreamMemoryPool`] wrapper
549    /// is dropped.  On macOS, this returns a local-only pool with default
550    /// configuration.
551    ///
552    /// # Errors
553    ///
554    /// * [`CudaError::InvalidValue`] if `device` is negative.
555    /// * [`CudaError::NotInitialized`] if the CUDA driver is not loaded.
556    /// * Any [`CudaError`] mapped from `cuDeviceGetDefaultMemPool`.
557    pub fn default_pool(device: i32) -> CudaResult<Self> {
558        if device < 0 {
559            return Err(CudaError::InvalidValue);
560        }
561
562        let config = StreamOrderedAllocConfig::default_for_device(device);
563
564        // On macOS there is no driver — fall back to a local-only pool.
565        #[cfg(target_os = "macos")]
566        {
567            Self::new(config)
568        }
569
570        // On real GPU platforms, resolve the device's default pool handle.
571        #[cfg(not(target_os = "macos"))]
572        {
573            let handle = Self::gpu_default_pool(device)?;
574            Ok(Self {
575                handle,
576                device,
577                config,
578                active_allocations: 0,
579                total_allocated: 0,
580                peak_allocated: 0,
581                peak_allocation_count: 0,
582                next_alloc_id: 1,
583            })
584        }
585    }
586
    /// Returns the raw pool handle (`0` when the pool is not backed by a
    /// driver-side pool).
    #[inline]
    pub fn handle(&self) -> u64 {
        self.handle
    }
592
    /// Returns the ordinal of the device this pool belongs to.
    #[inline]
    pub fn device(&self) -> i32 {
        self.device
    }
598
    /// Returns the pool configuration, including any release threshold
    /// applied later through `set_attribute`.
    #[inline]
    pub fn config(&self) -> &StreamOrderedAllocConfig {
        &self.config
    }
604
605    // -----------------------------------------------------------------------
606    // Platform-specific helpers
607    // -----------------------------------------------------------------------
608
609    /// Perform the actual allocation.  On macOS, generates a synthetic pointer.
610    fn platform_alloc_async(&mut self, size: usize, stream: u64) -> CudaResult<CUdeviceptr> {
611        #[cfg(target_os = "macos")]
612        {
613            let _ = stream;
614            // Generate a synthetic, non-zero device pointer for testing.
615            // Each allocation gets a unique "address" based on the pool's
616            // monotonic counter, with a base offset to avoid null.
617            let synthetic_ptr = 0x1000_0000_0000_u64 + self.next_alloc_id * 0x1000;
618            self.next_alloc_id = self.next_alloc_id.wrapping_add(1);
619            let _ = size;
620            Ok(synthetic_ptr)
621        }
622
623        #[cfg(not(target_os = "macos"))]
624        {
625            Self::gpu_alloc_async(self.handle, size, stream)
626        }
627    }
628
    /// Platform back end for `trim`.
    ///
    /// macOS has no driver, so the request is rejected with
    /// `CudaError::NotSupported`; other targets forward it to the driver
    /// with this pool's handle.
    fn platform_trim(&mut self, min_bytes_to_keep: usize) -> CudaResult<()> {
        #[cfg(target_os = "macos")]
        {
            // Silence the unused-parameter warning on this target.
            let _ = min_bytes_to_keep;
            Err(CudaError::NotSupported)
        }

        #[cfg(not(target_os = "macos"))]
        {
            Self::gpu_trim(self.handle, min_bytes_to_keep)
        }
    }
642
    /// Platform back end for `set_attribute`.
    ///
    /// On macOS only the release threshold is accepted (it is purely local
    /// state there); every other attribute yields
    /// `CudaError::NotSupported`.  Other targets forward the attribute to
    /// the driver with this pool's handle.
    fn platform_set_attribute(&self, attr: PoolAttribute) -> CudaResult<()> {
        #[cfg(target_os = "macos")]
        {
            match attr {
                // Nothing to do: the caller records the value locally.
                PoolAttribute::ReleaseThreshold(_) => Ok(()),
                _ => Err(CudaError::NotSupported),
            }
        }

        #[cfg(not(target_os = "macos"))]
        {
            Self::gpu_set_attribute(self.handle, attr)
        }
    }
658
    /// Platform back end for `enable_peer_access`.
    ///
    /// Returns `CudaError::NotSupported` on macOS; other targets forward
    /// the request to the driver-side helper with this pool's handle.
    fn platform_enable_peer_access(&self, peer_device: i32) -> CudaResult<()> {
        #[cfg(target_os = "macos")]
        {
            // Silence the unused-parameter warning on this target.
            let _ = peer_device;
            Err(CudaError::NotSupported)
        }

        #[cfg(not(target_os = "macos"))]
        {
            Self::gpu_enable_peer_access(self.handle, peer_device)
        }
    }
672
    /// Platform back end for `disable_peer_access`.
    ///
    /// Returns `CudaError::NotSupported` on macOS; other targets forward
    /// the request to the driver-side helper with this pool's handle.
    fn platform_disable_peer_access(&self, peer_device: i32) -> CudaResult<()> {
        #[cfg(target_os = "macos")]
        {
            // Silence the unused-parameter warning on this target.
            let _ = peer_device;
            Err(CudaError::NotSupported)
        }

        #[cfg(not(target_os = "macos"))]
        {
            Self::gpu_disable_peer_access(self.handle, peer_device)
        }
    }
686
    /// Platform back end for `free_async`.
    ///
    /// On macOS this is a successful no-op, since the pointers handed out
    /// there are synthetic.  On other targets the allocation's pointer is
    /// released on the allocation's own stream; the pool handle is not
    /// needed for that call.
    fn platform_free_async(&self, alloc: &StreamAllocation) -> CudaResult<()> {
        #[cfg(target_os = "macos")]
        {
            // Silence the unused-parameter warning on this target.
            let _ = alloc;
            Ok(())
        }

        #[cfg(not(target_os = "macos"))]
        {
            Self::gpu_free_async(alloc.ptr, alloc.stream)
        }
    }
700
701    // -----------------------------------------------------------------------
702    // GPU-only driver bindings (compiled out on macOS)
703    // -----------------------------------------------------------------------
704
    /// Create the pool on the GPU via `cuMemPoolCreate`.
    ///
    /// Builds a `CUmemPoolProps` from the pool configuration (pinned device
    /// memory on `config.device`, `max_size` from `config.max_pool_size`,
    /// no shareable handle types), invokes the driver, and returns the raw
    /// `CUmemoryPool` handle encoded as a `u64`.
    ///
    /// `config.initial_pool_size` and `config.release_threshold` are not
    /// part of the properties passed to the driver here.
    ///
    /// # Errors
    ///
    /// When the driver is absent, [`try_driver`](crate::loader::try_driver)
    /// returns `Err(CudaError::NotInitialized)` and pool creation fails
    /// cleanly.  When the driver is present but predates CUDA 11.2 (no
    /// `cuMemPoolCreate`), [`CudaError::NotSupported`] is returned.  A
    /// failing driver call is reported through `crate::error::check`.
    #[cfg(not(target_os = "macos"))]
    fn gpu_create_pool(config: &StreamOrderedAllocConfig) -> CudaResult<u64> {
        use crate::ffi::{
            CUmemAllocationType, CUmemLocation, CUmemLocationType, CUmemPoolProps, CUmemoryPool,
        };

        let api = crate::loader::try_driver()?;
        // The entry point is optional in the loaded API table.
        let create = api.cu_mem_pool_create.ok_or(CudaError::NotSupported)?;

        let props = CUmemPoolProps {
            alloc_type: CUmemAllocationType::Pinned as u32,
            handle_types: 0,
            location: CUmemLocation {
                loc_type: CUmemLocationType::Device as u32,
                id: config.device,
            },
            win32_security_attributes: std::ptr::null_mut(),
            max_size: config.max_pool_size,
            reserved: [0u8; 56],
        };

        let mut pool = CUmemoryPool::default();
        // SAFETY: `create` was just resolved from the driver; `props` and
        // `pool` are valid, correctly-typed local variables, and the CUDA
        // ABI's reserved padding is zeroed.
        let rc = unsafe { create(&mut pool, &props) };
        crate::error::check(rc)?;

        // Flatten the opaque pointer-sized handle into the `u64` stored on
        // the wrapper.
        Ok(pool.0 as usize as u64)
    }
746
    /// Resolve a device's default memory pool via `cuDeviceGetDefaultMemPool`.
    ///
    /// Returns the raw pool handle encoded as a `u64`.
    ///
    /// # Errors
    ///
    /// * Whatever `crate::loader::try_driver` reports when the driver is
    ///   unavailable.
    /// * [`CudaError::NotSupported`] when the driver does not expose
    ///   `cuDeviceGetDefaultMemPool`.
    /// * A failing driver call, reported through `crate::error::check`.
    #[cfg(not(target_os = "macos"))]
    fn gpu_default_pool(device: i32) -> CudaResult<u64> {
        use crate::ffi::CUmemoryPool;

        let api = crate::loader::try_driver()?;
        // The entry point is optional in the loaded API table.
        let get_default = api
            .cu_device_get_default_mem_pool
            .ok_or(CudaError::NotSupported)?;

        let mut pool = CUmemoryPool::default();
        // SAFETY: `get_default` was just resolved from the driver; `pool` is
        // a valid local and `device` is a plain device ordinal.
        let rc = unsafe { get_default(&mut pool, device) };
        crate::error::check(rc)?;

        // Flatten the opaque pointer-sized handle into a `u64`.
        Ok(pool.0 as usize as u64)
    }
765
    /// Allocate stream-ordered memory.
    ///
    /// When `pool_handle` is non-zero, allocates from that explicit pool via
    /// `cuMemAllocFromPoolAsync`; when it is zero (default-pool semantics),
    /// uses the context-wide `cuMemAllocAsync`.  `stream` is the raw stream
    /// handle value and is turned back into a `CUstream` for the call.
    ///
    /// Returns the device pointer written by the driver.
    ///
    /// # Errors
    ///
    /// * Whatever `crate::loader::try_driver` reports when the driver is
    ///   unavailable.
    /// * [`CudaError::NotSupported`] when the needed entry point is missing.
    /// * A failing driver call, reported through `crate::error::check`.
    #[cfg(not(target_os = "macos"))]
    fn gpu_alloc_async(pool_handle: u64, size: usize, stream: u64) -> CudaResult<CUdeviceptr> {
        use crate::ffi::{CUmemoryPool, CUstream};

        let api = crate::loader::try_driver()?;
        // Rebuild the opaque stream handle from its integer encoding.
        let cu_stream = CUstream(stream as usize as *mut std::ffi::c_void);
        let mut dptr: CUdeviceptr = 0;

        if pool_handle != 0 {
            let alloc_from_pool = api
                .cu_mem_alloc_from_pool_async
                .ok_or(CudaError::NotSupported)?;
            let pool = CUmemoryPool(pool_handle as usize as *mut std::ffi::c_void);
            // SAFETY: `alloc_from_pool` was just resolved; `dptr` is a valid
            // out-pointer and `pool`/`cu_stream` are reconstructed handles.
            let rc = unsafe { alloc_from_pool(&mut dptr, size, pool, cu_stream) };
            crate::error::check(rc)?;
        } else {
            let alloc_async = api.cu_mem_alloc_async.ok_or(CudaError::NotSupported)?;
            // SAFETY: `alloc_async` was just resolved; `dptr` is a valid
            // out-pointer and `cu_stream` is a reconstructed handle.
            let rc = unsafe { alloc_async(&mut dptr, size, cu_stream) };
            crate::error::check(rc)?;
        }

        Ok(dptr)
    }
798
    /// Free stream-ordered memory via `cuMemFreeAsync`.
    ///
    /// `ptr` is the device pointer to release and `stream` is the raw
    /// handle value of the stream the free is ordered on.
    ///
    /// # Errors
    ///
    /// * Whatever `crate::loader::try_driver` reports when the driver is
    ///   unavailable.
    /// * [`CudaError::NotSupported`] when `cuMemFreeAsync` is missing.
    /// * A failing driver call, reported through `crate::error::check`.
    #[cfg(not(target_os = "macos"))]
    fn gpu_free_async(ptr: CUdeviceptr, stream: u64) -> CudaResult<()> {
        use crate::ffi::CUstream;

        let api = crate::loader::try_driver()?;
        let free_async = api.cu_mem_free_async.ok_or(CudaError::NotSupported)?;
        // Rebuild the opaque stream handle from its integer encoding.
        let cu_stream = CUstream(stream as usize as *mut std::ffi::c_void);
        // SAFETY: `free_async` was just resolved from the driver; `ptr` is a
        // device pointer previously returned by an async allocation and
        // `cu_stream` is a reconstructed handle.
        crate::error::check(unsafe { free_async(ptr, cu_stream) })
    }
812
    /// Trim the pool via `cuMemPoolTrimTo`.
    ///
    /// `pool_handle` is the raw pool handle value and `min_bytes_to_keep`
    /// is passed to the driver unchanged.
    ///
    /// # Errors
    ///
    /// * Whatever `crate::loader::try_driver` reports when the driver is
    ///   unavailable.
    /// * [`CudaError::NotSupported`] when `cuMemPoolTrimTo` is missing.
    /// * A failing driver call, reported through `crate::error::check`.
    #[cfg(not(target_os = "macos"))]
    fn gpu_trim(pool_handle: u64, min_bytes_to_keep: usize) -> CudaResult<()> {
        use crate::ffi::CUmemoryPool;

        let api = crate::loader::try_driver()?;
        let trim = api.cu_mem_pool_trim_to.ok_or(CudaError::NotSupported)?;
        // Rebuild the opaque pool handle from its integer encoding.
        let pool = CUmemoryPool(pool_handle as usize as *mut std::ffi::c_void);
        // SAFETY: `trim` was just resolved from the driver; `pool` is a
        // reconstructed pool handle and `min_bytes_to_keep` is a plain count.
        crate::error::check(unsafe { trim(pool, min_bytes_to_keep) })
    }
825
826    /// Set a pool attribute via `cuMemPoolSetAttribute`.
827    ///
828    /// The reuse-policy attributes carry an `int` value; the release
829    /// threshold carries a `cuuint64_t`.  The value buffer is sized
830    /// accordingly and passed to the driver.
831    #[cfg(not(target_os = "macos"))]
832    fn gpu_set_attribute(pool_handle: u64, attr: PoolAttribute) -> CudaResult<()> {
833        use crate::ffi::CUmemoryPool;
834
835        let api = crate::loader::try_driver()?;
836        let set_attr = api
837            .cu_mem_pool_set_attribute
838            .ok_or(CudaError::NotSupported)?;
839        let pool = CUmemoryPool(pool_handle as usize as *mut std::ffi::c_void);
840        let raw_attr = Self::map_pool_attribute(attr)?;
841
842        // The driver dereferences `value` as either `int` or `cuuint64_t`
843        // depending on the attribute.  Stack-allocate the correct width.
844        match attr {
845            PoolAttribute::ReuseFollowEventDependencies
846            | PoolAttribute::ReuseAllowOpportunistic
847            | PoolAttribute::ReuseAllowInternalDependencies => {
848                // Boolean-style reuse policies: enable (1) the policy.
849                let mut value: std::ffi::c_int = 1;
850                // SAFETY: `set_attr` was just resolved; `pool` is a
851                // reconstructed handle and `value` is a valid `int` matching
852                // the attribute's documented value type.
853                let rc = unsafe {
854                    set_attr(pool, raw_attr, (&mut value as *mut std::ffi::c_int).cast())
855                };
856                crate::error::check(rc)
857            }
858            PoolAttribute::ReleaseThreshold(threshold) => {
859                let mut value: u64 = threshold;
860                // SAFETY: `set_attr` was just resolved; `pool` is a
861                // reconstructed handle and `value` is a valid `cuuint64_t`
862                // matching the release-threshold value type.
863                let rc = unsafe { set_attr(pool, raw_attr, (&mut value as *mut u64).cast()) };
864                crate::error::check(rc)
865            }
866            // Read-only attributes are rejected before reaching this point.
867            PoolAttribute::ReservedMemCurrent
868            | PoolAttribute::ReservedMemHigh
869            | PoolAttribute::UsedMemCurrent
870            | PoolAttribute::UsedMemHigh => Err(CudaError::InvalidValue),
871        }
872    }
873
874    /// Map a [`PoolAttribute`] to the driver's [`CUmemPoolAttribute`].
875    #[cfg(not(target_os = "macos"))]
876    fn map_pool_attribute(attr: PoolAttribute) -> CudaResult<crate::ffi::CUmemPoolAttribute> {
877        use crate::ffi::CUmemPoolAttribute;
878        Ok(match attr {
879            PoolAttribute::ReuseFollowEventDependencies => {
880                CUmemPoolAttribute::ReuseFollowEventDependencies
881            }
882            PoolAttribute::ReuseAllowOpportunistic => CUmemPoolAttribute::ReuseAllowOpportunistic,
883            PoolAttribute::ReuseAllowInternalDependencies => {
884                CUmemPoolAttribute::ReuseAllowInternalDependencies
885            }
886            PoolAttribute::ReleaseThreshold(_) => CUmemPoolAttribute::ReleaseThreshold,
887            PoolAttribute::ReservedMemCurrent => CUmemPoolAttribute::ReservedMemCurrent,
888            PoolAttribute::ReservedMemHigh => CUmemPoolAttribute::ReservedMemHigh,
889            PoolAttribute::UsedMemCurrent => CUmemPoolAttribute::UsedMemCurrent,
890            PoolAttribute::UsedMemHigh => CUmemPoolAttribute::UsedMemHigh,
891        })
892    }
893
894    /// Enable peer access from `peer_device` via `cuMemPoolSetAccess`.
895    ///
896    /// Builds a [`CUmemAccessDesc`] granting read-write access to the peer
897    /// device and applies it to the pool.
898    #[cfg(not(target_os = "macos"))]
899    fn gpu_enable_peer_access(pool_handle: u64, peer_device: i32) -> CudaResult<()> {
900        Self::gpu_set_pool_access(pool_handle, peer_device, true)
901    }
902
903    /// Disable peer access from `peer_device` via `cuMemPoolSetAccess`.
904    #[cfg(not(target_os = "macos"))]
905    fn gpu_disable_peer_access(pool_handle: u64, peer_device: i32) -> CudaResult<()> {
906        Self::gpu_set_pool_access(pool_handle, peer_device, false)
907    }
908
909    /// Shared implementation for enabling / disabling pool peer access.
910    #[cfg(not(target_os = "macos"))]
911    fn gpu_set_pool_access(pool_handle: u64, peer_device: i32, enable: bool) -> CudaResult<()> {
912        use crate::ffi::{
913            CUmemAccessDesc, CUmemAccessFlags, CUmemLocation, CUmemLocationType, CUmemoryPool,
914        };
915
916        let api = crate::loader::try_driver()?;
917        let set_access = api.cu_mem_pool_set_access.ok_or(CudaError::NotSupported)?;
918        let pool = CUmemoryPool(pool_handle as usize as *mut std::ffi::c_void);
919
920        let flags = if enable {
921            CUmemAccessFlags::ReadWrite
922        } else {
923            CUmemAccessFlags::None
924        };
925        let desc = CUmemAccessDesc {
926            location: CUmemLocation {
927                loc_type: CUmemLocationType::Device as u32,
928                id: peer_device,
929            },
930            flags: flags as u32,
931        };
932
933        // SAFETY: `set_access` was just resolved from the driver; `pool` is a
934        // reconstructed handle and `desc` is a single valid descriptor.
935        let rc = unsafe { set_access(pool, &desc, 1) };
936        crate::error::check(rc)
937    }
938}
939
940// ---------------------------------------------------------------------------
941// Convenience free functions
942// ---------------------------------------------------------------------------
943
944/// Allocate memory on a stream using the default pool for device 0.
945///
946/// This is a convenience wrapper around [`StreamMemoryPool::default_pool`]
947/// and [`StreamMemoryPool::alloc_async`].
948///
949/// # Errors
950///
951/// Propagates errors from pool creation and allocation.
952pub fn stream_alloc(size: usize, stream: u64) -> CudaResult<StreamAllocation> {
953    let mut pool = StreamMemoryPool::default_pool(0)?;
954    pool.alloc_async(size, stream)
955}
956
957/// Free a stream-ordered allocation using a temporary default pool.
958///
959/// # Errors
960///
961/// * [`CudaError::InvalidValue`] if the allocation is already freed.
962pub fn stream_free(alloc: &mut StreamAllocation) -> CudaResult<()> {
963    if alloc.freed {
964        return Err(CudaError::InvalidValue);
965    }
966
967    // On macOS, just mark as freed (no real GPU work).
968    #[cfg(target_os = "macos")]
969    {
970        alloc.freed = true;
971        Ok(())
972    }
973
974    #[cfg(not(target_os = "macos"))]
975    {
976        StreamMemoryPool::gpu_free_async(alloc.ptr, alloc.stream)?;
977        alloc.freed = true;
978        Ok(())
979    }
980}
981
982// ---------------------------------------------------------------------------
983// Tests
984// ---------------------------------------------------------------------------
985
986#[cfg(test)]
987mod tests {
988    use super::*;
989
990    /// Returns `true` when a real CUDA driver is loadable on this host.
991    ///
992    /// Pool creation on non-macOS platforms now performs a genuine
993    /// `cuMemPoolCreate`; without a driver it must fail with a clean typed
994    /// error rather than succeeding or panicking.  Tests that need a live
995    /// pool gate on this helper.
996    #[cfg(not(target_os = "macos"))]
997    fn driver_present() -> bool {
998        crate::loader::try_driver().is_ok()
999    }
1000
1001    // -- Config validation -------------------------------------------------
1002
1003    #[test]
1004    fn config_validate_valid_sizes() {
1005        let config = StreamOrderedAllocConfig {
1006            initial_pool_size: 1024,
1007            max_pool_size: 4096,
1008            release_threshold: 512,
1009            device: 0,
1010        };
1011        assert!(config.validate().is_ok());
1012    }
1013
1014    #[test]
1015    fn config_validate_unlimited_max() {
1016        let config = StreamOrderedAllocConfig {
1017            initial_pool_size: 1024 * 1024,
1018            max_pool_size: 0, // unlimited
1019            release_threshold: 512,
1020            device: 0,
1021        };
1022        assert!(config.validate().is_ok());
1023    }
1024
1025    #[test]
1026    fn config_validate_initial_exceeds_max() {
1027        let config = StreamOrderedAllocConfig {
1028            initial_pool_size: 8192,
1029            max_pool_size: 4096,
1030            release_threshold: 0,
1031            device: 0,
1032        };
1033        assert_eq!(config.validate(), Err(CudaError::InvalidValue));
1034    }
1035
1036    #[test]
1037    fn config_validate_negative_device() {
1038        let config = StreamOrderedAllocConfig {
1039            initial_pool_size: 0,
1040            max_pool_size: 0,
1041            release_threshold: 0,
1042            device: -1,
1043        };
1044        assert_eq!(config.validate(), Err(CudaError::InvalidValue));
1045    }
1046
1047    #[test]
1048    fn config_validate_threshold_exceeds_max() {
1049        let config = StreamOrderedAllocConfig {
1050            initial_pool_size: 0,
1051            max_pool_size: 1024,
1052            release_threshold: 2048,
1053            device: 0,
1054        };
1055        assert_eq!(config.validate(), Err(CudaError::InvalidValue));
1056    }
1057
1058    // -- Default config ----------------------------------------------------
1059
1060    #[test]
1061    fn default_config_for_device() {
1062        let config = StreamOrderedAllocConfig::default_for_device(2);
1063        assert_eq!(config.device, 2);
1064        assert_eq!(config.initial_pool_size, 0);
1065        assert_eq!(config.max_pool_size, 0);
1066        assert_eq!(config.release_threshold, 0);
1067        assert!(config.validate().is_ok());
1068    }
1069
1070    // -- Pool creation -----------------------------------------------------
1071
1072    /// On macOS, pool creation always succeeds with a local-only pool.
1073    #[cfg(target_os = "macos")]
1074    #[test]
1075    fn pool_creation() {
1076        let config = StreamOrderedAllocConfig::default_for_device(0);
1077        let pool = StreamMemoryPool::new(config);
1078        assert!(pool.is_ok());
1079        let pool = pool.ok();
1080        assert!(pool.is_some());
1081        let pool = pool.map(|p| {
1082            assert_eq!(p.device(), 0);
1083            assert_eq!(p.active_allocations, 0);
1084            assert_eq!(p.total_allocated, 0);
1085        });
1086        let _ = pool;
1087    }
1088
1089    /// On non-macOS, pool creation performs a real `cuMemPoolCreate`: it
1090    /// succeeds when a driver is present and otherwise fails with a clean
1091    /// typed error (never a panic).
1092    #[cfg(not(target_os = "macos"))]
1093    #[test]
1094    fn pool_creation() {
1095        let config = StreamOrderedAllocConfig::default_for_device(0);
1096        let pool = StreamMemoryPool::new(config);
1097        if driver_present() {
1098            // A live driver may still reject the pool (e.g. no device);
1099            // either way the result must be a typed Result, not a panic.
1100            if let Ok(p) = pool {
1101                assert_eq!(p.device(), 0);
1102                assert_eq!(p.active_allocations, 0);
1103                assert_eq!(p.total_allocated, 0);
1104            } else {
1105                assert!(matches!(
1106                    pool,
1107                    Err(CudaError::NotSupported)
1108                        | Err(CudaError::NoDevice)
1109                        | Err(CudaError::InvalidDevice)
1110                        | Err(CudaError::InvalidContext)
1111                        | Err(CudaError::NotInitialized)
1112                ));
1113            }
1114        } else {
1115            // No driver: must surface a clean NotInitialized error.
1116            assert_eq!(pool.err(), Some(CudaError::NotInitialized));
1117        }
1118    }
1119
1120    #[test]
1121    fn pool_creation_invalid_config() {
1122        let config = StreamOrderedAllocConfig {
1123            initial_pool_size: 0,
1124            max_pool_size: 0,
1125            release_threshold: 0,
1126            device: -1,
1127        };
1128        let result = StreamMemoryPool::new(config);
1129        assert!(matches!(result, Err(CudaError::InvalidValue)));
1130    }
1131
1132    // -- alloc_async / free_async -----------------------------------------
1133
1134    #[cfg(target_os = "macos")]
1135    #[test]
1136    fn alloc_async_creates_allocation() {
1137        let config = StreamOrderedAllocConfig::default_for_device(0);
1138        let mut pool = StreamMemoryPool::new(config).ok();
1139        assert!(pool.is_some());
1140        let pool = pool.as_mut().map(|p| {
1141            let alloc = p.alloc_async(1024, 0);
1142            assert!(alloc.is_ok());
1143            let alloc = alloc.ok();
1144            assert!(alloc.is_some());
1145            if let Some(a) = &alloc {
1146                assert_eq!(a.size(), 1024);
1147                assert!(!a.is_freed());
1148                assert_ne!(a.as_ptr(), 0);
1149                assert_eq!(a.stream(), 0);
1150            }
1151        });
1152        let _ = pool;
1153    }
1154
1155    #[cfg(target_os = "macos")]
1156    #[test]
1157    fn free_async_marks_freed() {
1158        let config = StreamOrderedAllocConfig::default_for_device(0);
1159        let mut pool =
1160            StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1161        let mut alloc = pool
1162            .alloc_async(2048, 0)
1163            .expect("alloc should succeed on macOS");
1164        assert!(!alloc.is_freed());
1165        assert!(pool.free_async(&mut alloc).is_ok());
1166        assert!(alloc.is_freed());
1167        assert_eq!(pool.active_allocations, 0);
1168    }
1169
1170    #[cfg(target_os = "macos")]
1171    #[test]
1172    fn double_free_returns_error() {
1173        let config = StreamOrderedAllocConfig::default_for_device(0);
1174        let mut pool =
1175            StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1176        let mut alloc = pool
1177            .alloc_async(512, 0)
1178            .expect("alloc should succeed on macOS");
1179        assert!(pool.free_async(&mut alloc).is_ok());
1180        assert_eq!(pool.free_async(&mut alloc), Err(CudaError::InvalidValue));
1181    }
1182
1183    // -- Trim --------------------------------------------------------------
1184
1185    #[cfg(target_os = "macos")]
1186    #[test]
1187    fn trim_returns_not_supported_on_macos() {
1188        let config = StreamOrderedAllocConfig::default_for_device(0);
1189        let mut pool =
1190            StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1191        assert_eq!(pool.trim(0), Err(CudaError::NotSupported));
1192    }
1193
1194    // -- Stats tracking ----------------------------------------------------
1195
1196    #[cfg(target_os = "macos")]
1197    #[test]
1198    fn stats_tracking() {
1199        let config = StreamOrderedAllocConfig::default_for_device(0);
1200        let mut pool =
1201            StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1202
1203        let mut a1 = pool.alloc_async(1024, 0).expect("alloc should succeed");
1204        let _a2 = pool.alloc_async(2048, 0).expect("alloc should succeed");
1205
1206        let stats = pool.stats();
1207        assert_eq!(stats.active_allocations, 2);
1208        assert_eq!(stats.used_current, 3072);
1209        assert_eq!(stats.used_high, 3072);
1210        assert_eq!(stats.peak_allocations, 2);
1211
1212        pool.free_async(&mut a1).expect("free should succeed");
1213        let stats = pool.stats();
1214        assert_eq!(stats.active_allocations, 1);
1215        assert_eq!(stats.used_current, 2048);
1216        // Peak should remain at 3072.
1217        assert_eq!(stats.used_high, 3072);
1218    }
1219
1220    // -- Pool attribute setting --------------------------------------------
1221
1222    #[cfg(target_os = "macos")]
1223    #[test]
1224    fn set_attribute_release_threshold() {
1225        let config = StreamOrderedAllocConfig::default_for_device(0);
1226        let mut pool =
1227            StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1228        let result = pool.set_attribute(PoolAttribute::ReleaseThreshold(4096));
1229        assert!(result.is_ok());
1230        assert_eq!(pool.config().release_threshold, 4096);
1231    }
1232
1233    /// Read-only attributes are rejected by pre-flight validation, before
1234    /// any driver call.  On macOS the pool is always available.
1235    #[cfg(target_os = "macos")]
1236    #[test]
1237    fn set_attribute_readonly_returns_error() {
1238        let config = StreamOrderedAllocConfig::default_for_device(0);
1239        let mut pool = StreamMemoryPool::new(config).expect("pool creation should succeed");
1240        assert_eq!(
1241            pool.set_attribute(PoolAttribute::ReservedMemCurrent),
1242            Err(CudaError::InvalidValue)
1243        );
1244        assert_eq!(
1245            pool.set_attribute(PoolAttribute::UsedMemCurrent),
1246            Err(CudaError::InvalidValue)
1247        );
1248    }
1249
1250    /// On non-macOS, this can only be exercised when a driver is present
1251    /// (pool creation requires `cuMemPoolCreate`).  The read-only check
1252    /// itself runs before the driver is touched.
1253    #[cfg(not(target_os = "macos"))]
1254    #[test]
1255    fn set_attribute_readonly_returns_error() {
1256        let config = StreamOrderedAllocConfig::default_for_device(0);
1257        let pool = StreamMemoryPool::new(config);
1258        let mut pool = match pool {
1259            Ok(p) => p,
1260            Err(e) => {
1261                // No usable driver/device: pool creation must fail cleanly.
1262                assert!(matches!(
1263                    e,
1264                    CudaError::NotInitialized
1265                        | CudaError::NotSupported
1266                        | CudaError::NoDevice
1267                        | CudaError::InvalidDevice
1268                        | CudaError::InvalidContext
1269                ));
1270                return;
1271            }
1272        };
1273        assert_eq!(
1274            pool.set_attribute(PoolAttribute::ReservedMemCurrent),
1275            Err(CudaError::InvalidValue)
1276        );
1277        assert_eq!(
1278            pool.set_attribute(PoolAttribute::UsedMemCurrent),
1279            Err(CudaError::InvalidValue)
1280        );
1281    }
1282
1283    // -- StreamAllocation accessors ----------------------------------------
1284
1285    #[cfg(target_os = "macos")]
1286    #[test]
1287    fn allocation_accessors() {
1288        let config = StreamOrderedAllocConfig::default_for_device(0);
1289        let mut pool =
1290            StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1291        let alloc = pool.alloc_async(4096, 42).expect("alloc should succeed");
1292        assert_eq!(alloc.size(), 4096);
1293        assert_eq!(alloc.stream(), 42);
1294        assert!(!alloc.is_freed());
1295        assert_ne!(alloc.as_ptr(), 0);
1296        // Debug formatting should not panic.
1297        let _debug = format!("{alloc:?}");
1298    }
1299
1300    // -- Convenience functions ---------------------------------------------
1301
1302    #[cfg(target_os = "macos")]
1303    #[test]
1304    fn convenience_stream_alloc() {
1305        let result = stream_alloc(256, 0);
1306        assert!(result.is_ok());
1307        let alloc = result.expect("should succeed on macOS");
1308        assert_eq!(alloc.size(), 256);
1309        assert!(!alloc.is_freed());
1310    }
1311
1312    #[cfg(target_os = "macos")]
1313    #[test]
1314    fn convenience_stream_free() {
1315        let mut alloc = stream_alloc(128, 0).expect("alloc should succeed on macOS");
1316        assert!(stream_free(&mut alloc).is_ok());
1317        assert!(alloc.is_freed());
1318        // Double free via convenience function.
1319        assert_eq!(stream_free(&mut alloc), Err(CudaError::InvalidValue));
1320    }
1321
1322    // -- Large allocation size ---------------------------------------------
1323
1324    #[cfg(target_os = "macos")]
1325    #[test]
1326    fn large_allocation_size() {
1327        let config = StreamOrderedAllocConfig {
1328            initial_pool_size: 0,
1329            max_pool_size: 0, // unlimited
1330            release_threshold: 0,
1331            device: 0,
1332        };
1333        let mut pool =
1334            StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1335        // 16 GiB allocation (large but valid).
1336        let size = 16 * 1024 * 1024 * 1024_usize;
1337        let alloc = pool.alloc_async(size, 0);
1338        assert!(alloc.is_ok());
1339        let alloc = alloc.expect("should succeed");
1340        assert_eq!(alloc.size(), size);
1341    }
1342
1343    #[cfg(target_os = "macos")]
1344    #[test]
1345    fn alloc_exceeds_max_pool_size() {
1346        let config = StreamOrderedAllocConfig {
1347            initial_pool_size: 0,
1348            max_pool_size: 1024,
1349            release_threshold: 0,
1350            device: 0,
1351        };
1352        let mut pool = StreamMemoryPool::new(config).expect("pool creation should succeed");
1353        assert!(matches!(
1354            pool.alloc_async(2048, 0),
1355            Err(CudaError::OutOfMemory)
1356        ));
1357    }
1358
1359    // -- Peer access -------------------------------------------------------
1360
1361    /// Same-device peer access is rejected by pre-flight validation, before
1362    /// any driver call.  On macOS the pool is always available.
1363    #[cfg(target_os = "macos")]
1364    #[test]
1365    fn peer_access_same_device_error() {
1366        let config = StreamOrderedAllocConfig::default_for_device(0);
1367        let pool = StreamMemoryPool::new(config).expect("pool creation should succeed");
1368        assert_eq!(pool.enable_peer_access(0), Err(CudaError::InvalidDevice));
1369        assert_eq!(pool.disable_peer_access(0), Err(CudaError::InvalidDevice));
1370    }
1371
1372    /// On non-macOS, the same-device check runs before the driver is
1373    /// touched; it is only reachable when a pool could be created.
1374    #[cfg(not(target_os = "macos"))]
1375    #[test]
1376    fn peer_access_same_device_error() {
1377        let config = StreamOrderedAllocConfig::default_for_device(0);
1378        let pool = match StreamMemoryPool::new(config) {
1379            Ok(p) => p,
1380            Err(e) => {
1381                assert!(matches!(
1382                    e,
1383                    CudaError::NotInitialized
1384                        | CudaError::NotSupported
1385                        | CudaError::NoDevice
1386                        | CudaError::InvalidDevice
1387                        | CudaError::InvalidContext
1388                ));
1389                return;
1390            }
1391        };
1392        assert_eq!(pool.enable_peer_access(0), Err(CudaError::InvalidDevice));
1393        assert_eq!(pool.disable_peer_access(0), Err(CudaError::InvalidDevice));
1394    }
1395
1396    #[cfg(target_os = "macos")]
1397    #[test]
1398    fn peer_access_not_supported_on_macos() {
1399        let config = StreamOrderedAllocConfig::default_for_device(0);
1400        let pool = StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1401        assert_eq!(pool.enable_peer_access(1), Err(CudaError::NotSupported));
1402        assert_eq!(pool.disable_peer_access(1), Err(CudaError::NotSupported));
1403    }
1404
1405    // -- Reset peak stats --------------------------------------------------
1406
1407    #[cfg(target_os = "macos")]
1408    #[test]
1409    fn reset_peak_stats() {
1410        let config = StreamOrderedAllocConfig::default_for_device(0);
1411        let mut pool =
1412            StreamMemoryPool::new(config).expect("pool creation should succeed on macOS");
1413
1414        let mut a1 = pool.alloc_async(1024, 0).expect("alloc ok");
1415        let _a2 = pool.alloc_async(2048, 0).expect("alloc ok");
1416        assert_eq!(pool.stats().peak_allocations, 2);
1417        assert_eq!(pool.stats().used_high, 3072);
1418
1419        pool.free_async(&mut a1).expect("free ok");
1420        pool.reset_peak_stats();
1421
1422        let stats = pool.stats();
1423        assert_eq!(stats.used_high, 2048); // reset to current
1424        assert_eq!(stats.peak_allocations, 1); // reset to current
1425    }
1426
1427    // -- Zero-size alloc ---------------------------------------------------
1428
1429    /// Zero-size allocation is rejected by pre-flight validation, before any
1430    /// driver call.  On macOS the pool is always available.
1431    #[cfg(target_os = "macos")]
1432    #[test]
1433    fn alloc_zero_size_returns_error() {
1434        let config = StreamOrderedAllocConfig::default_for_device(0);
1435        let mut pool = StreamMemoryPool::new(config).expect("pool creation should succeed");
1436        assert!(matches!(
1437            pool.alloc_async(0, 0),
1438            Err(CudaError::InvalidValue)
1439        ));
1440    }
1441
1442    /// On non-macOS, the zero-size check runs before the driver is touched;
1443    /// it is only reachable when a pool could be created.
1444    #[cfg(not(target_os = "macos"))]
1445    #[test]
1446    fn alloc_zero_size_returns_error() {
1447        let config = StreamOrderedAllocConfig::default_for_device(0);
1448        let mut pool = match StreamMemoryPool::new(config) {
1449            Ok(p) => p,
1450            Err(e) => {
1451                assert!(matches!(
1452                    e,
1453                    CudaError::NotInitialized
1454                        | CudaError::NotSupported
1455                        | CudaError::NoDevice
1456                        | CudaError::InvalidDevice
1457                        | CudaError::InvalidContext
1458                ));
1459                return;
1460            }
1461        };
1462        assert!(matches!(
1463            pool.alloc_async(0, 0),
1464            Err(CudaError::InvalidValue)
1465        ));
1466    }
1467
1468    // -- Default pool ------------------------------------------------------
1469
1470    /// On macOS, the default pool is a local-only pool and always succeeds.
1471    #[cfg(target_os = "macos")]
1472    #[test]
1473    fn default_pool_valid_device() {
1474        let pool = StreamMemoryPool::default_pool(0);
1475        assert!(pool.is_ok());
1476    }
1477
1478    /// On non-macOS, `default_pool` performs a real
1479    /// `cuDeviceGetDefaultMemPool`: success with a driver, a clean typed
1480    /// error without one — never a panic.
1481    #[cfg(not(target_os = "macos"))]
1482    #[test]
1483    fn default_pool_valid_device() {
1484        let pool = StreamMemoryPool::default_pool(0);
1485        if driver_present() {
1486            if let Ok(p) = pool {
1487                assert_eq!(p.device(), 0);
1488            } else {
1489                assert!(matches!(
1490                    pool,
1491                    Err(CudaError::NotSupported)
1492                        | Err(CudaError::NoDevice)
1493                        | Err(CudaError::InvalidDevice)
1494                        | Err(CudaError::InvalidContext)
1495                        | Err(CudaError::NotInitialized)
1496                ));
1497            }
1498        } else {
1499            assert_eq!(pool.err(), Some(CudaError::NotInitialized));
1500        }
1501    }
1502
1503    #[test]
1504    fn default_pool_negative_device() {
1505        assert!(matches!(
1506            StreamMemoryPool::default_pool(-1),
1507            Err(CudaError::InvalidValue)
1508        ));
1509    }
1510
1511    // -- PoolAttribute::to_raw ---------------------------------------------
1512
1513    #[test]
1514    fn pool_attribute_to_raw() {
1515        assert_eq!(
1516            PoolAttribute::ReuseFollowEventDependencies.to_raw(),
1517            CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES
1518        );
1519        assert_eq!(
1520            PoolAttribute::ReuseAllowOpportunistic.to_raw(),
1521            CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC
1522        );
1523        assert_eq!(
1524            PoolAttribute::ReuseAllowInternalDependencies.to_raw(),
1525            CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES
1526        );
1527        assert_eq!(
1528            PoolAttribute::ReleaseThreshold(0).to_raw(),
1529            CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
1530        );
1531        assert_eq!(
1532            PoolAttribute::ReservedMemCurrent.to_raw(),
1533            CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT
1534        );
1535        assert_eq!(
1536            PoolAttribute::ReservedMemHigh.to_raw(),
1537            CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH
1538        );
1539        assert_eq!(
1540            PoolAttribute::UsedMemCurrent.to_raw(),
1541            CU_MEMPOOL_ATTR_USED_MEM_CURRENT
1542        );
1543        assert_eq!(
1544            PoolAttribute::UsedMemHigh.to_raw(),
1545            CU_MEMPOOL_ATTR_USED_MEM_HIGH
1546        );
1547    }
1548
1549    // -- ShareableHandleType default ---------------------------------------
1550
1551    #[test]
1552    fn shareable_handle_type_default() {
1553        assert_eq!(ShareableHandleType::default(), ShareableHandleType::None);
1554    }
1555
1556    // -- PoolExportDescriptor construction ---------------------------------
1557
1558    #[test]
1559    fn pool_export_descriptor() {
1560        let desc = PoolExportDescriptor {
1561            shareable_handle_type: ShareableHandleType::PosixFileDescriptor,
1562            pool_device: 0,
1563        };
1564        assert_eq!(
1565            desc.shareable_handle_type,
1566            ShareableHandleType::PosixFileDescriptor
1567        );
1568        assert_eq!(desc.pool_device, 0);
1569    }
1570
1571    // -- GPU driver bindings: real-FFI / absent-driver path ----------------
1572    //
1573    // These tests exercise the `gpu_*` bindings against whatever driver the
1574    // host provides.
1575    //
1576    // * Without a driver, every binding must surface a clean typed error
1577    //   (`NotInitialized`) — never a panic, never a fake `Ok`.
1578    // * With a driver, the bindings reach the real CUDA FFI.  The driver
1579    //   *dereferences* pool / device-pointer handles without validating
1580    //   them, so a fabricated handle would segfault.  Handle-consuming
1581    //   bindings are therefore only ever called with handles obtained from
1582    //   a genuine `cuMemPoolCreate` / `cuMemAllocAsync`.
1583
1584    /// Create a real driver-backed pool, or `None` when the host cannot
1585    /// (no driver, no device, or a graphless/poolless driver).
1586    #[cfg(not(target_os = "macos"))]
1587    fn make_real_pool() -> Option<StreamMemoryPool> {
1588        let config = StreamOrderedAllocConfig::default_for_device(0);
1589        StreamMemoryPool::new(config).ok()
1590    }
1591
/// `gpu_create_pool` takes only a configuration, never a caller-supplied
/// handle.  When no driver is present it must report `NotInitialized`;
/// otherwise it must yield either a non-zero handle or one of the
/// expected typed driver errors.
#[cfg(not(target_os = "macos"))]
#[test]
fn gpu_create_pool_real_or_clean_error() {
    let config = StreamOrderedAllocConfig::default_for_device(0);
    // Call the binding first, then classify the outcome by driver presence.
    let outcome = StreamMemoryPool::gpu_create_pool(&config);
    match (driver_present(), outcome) {
        (false, outcome) => assert_eq!(outcome.err(), Some(CudaError::NotInitialized)),
        (true, Ok(handle)) => {
            assert_ne!(handle, 0, "a created pool has a non-null handle")
        }
        (true, Err(e)) => assert!(matches!(
            e,
            CudaError::NotSupported
                | CudaError::NoDevice
                | CudaError::InvalidDevice
                | CudaError::InvalidContext
        )),
    }
}
1615
/// `gpu_default_pool` is driven purely by a device ordinal.  With no
/// driver the error must be `NotInitialized`; with a driver the result is
/// a non-zero handle or one of the expected typed errors.
#[cfg(not(target_os = "macos"))]
#[test]
fn gpu_default_pool_real_or_clean_error() {
    let outcome = StreamMemoryPool::gpu_default_pool(0);

    // Driverless host: exactly one acceptable outcome.
    if !driver_present() {
        assert_eq!(outcome.err(), Some(CudaError::NotInitialized));
        return;
    }

    // Live driver: a real handle, or a typed refusal.
    match outcome {
        Ok(handle) => assert_ne!(handle, 0, "the default pool has a non-null handle"),
        Err(e) => assert!(matches!(
            e,
            CudaError::NotSupported | CudaError::NoDevice | CudaError::InvalidDevice
        )),
    }
}
1633
/// Exercises `gpu_alloc_async` with a zero pool handle (the default-pool
/// path), a 1024-byte request and stream 0.  With no driver the error must
/// be `NotInitialized`; with a driver the call must produce either a
/// non-zero device pointer (which is handed straight back via
/// `gpu_free_async`) or one of the expected typed errors.
#[cfg(not(target_os = "macos"))]
#[test]
fn gpu_alloc_async_default_pool_is_clean() {
    let outcome = StreamMemoryPool::gpu_alloc_async(0, 1024, 0);
    match (driver_present(), outcome) {
        (false, outcome) => assert_eq!(outcome.err(), Some(CudaError::NotInitialized)),
        (true, Ok(device_ptr)) => {
            // A live allocation: release it on the same null stream.  The
            // free is best-effort, so its result is deliberately ignored.
            assert_ne!(device_ptr, 0);
            let _ = StreamMemoryPool::gpu_free_async(device_ptr, 0);
        }
        (true, Err(e)) => assert!(matches!(
            e,
            CudaError::InvalidContext
                | CudaError::NotSupported
                | CudaError::NoDevice
                | CudaError::InvalidDevice
                | CudaError::OutOfMemory
        )),
    }
}
1663
/// `gpu_trim` against a *real* pool handle, or a clean error without a
/// driver.
///
/// * No driver: the binding must report `NotInitialized`.  A placeholder
///   handle is acceptable here because the sibling no-driver test makes
///   the same call and expects the same error.
/// * Driver present: trimming a pool obtained from `make_real_pool` must
///   either succeed or fail with `NotSupported`.  A fabricated handle is
///   never passed to a live driver.
#[cfg(not(target_os = "macos"))]
#[test]
fn gpu_trim_on_real_pool_or_clean_error() {
    if !driver_present() {
        // Actually check the "clean error" half of the contract instead of
        // silently skipping it.
        assert_eq!(
            StreamMemoryPool::gpu_trim(0x1, 0).err(),
            Some(CudaError::NotInitialized)
        );
        return;
    }
    let Some(pool) = make_real_pool() else {
        return; // driver present but no usable pool — nothing to trim.
    };
    // Reaching this point at all proves the call did not panic, so the
    // only meaningful check is on the error variant when the trim fails.
    if let Err(e) = StreamMemoryPool::gpu_trim(pool.handle(), 0) {
        assert!(matches!(e, CudaError::NotSupported));
    }
}
1688
/// Sets two attributes on a *real* pool handle: the opportunistic-reuse
/// policy and a release threshold of 8192.  Each call must either succeed
/// or fail with `NotSupported`.  Skipped when there is no driver or no
/// pool can be created.
#[cfg(not(target_os = "macos"))]
#[test]
fn gpu_set_attribute_on_real_pool_or_clean_error() {
    if !driver_present() {
        return;
    }
    let Some(pool) = make_real_pool() else {
        return;
    };
    // Array elements are evaluated in order: reuse policy first, then the
    // release threshold.
    let outcomes = [
        StreamMemoryPool::gpu_set_attribute(pool.handle(), PoolAttribute::ReuseAllowOpportunistic),
        StreamMemoryPool::gpu_set_attribute(pool.handle(), PoolAttribute::ReleaseThreshold(8192)),
    ];
    for outcome in outcomes {
        match outcome {
            Ok(_) => {}
            // The only tolerated failure is a clean `NotSupported`.
            Err(e) => assert!(matches!(e, CudaError::NotSupported)),
        }
    }
}
1716
/// Enables and then disables peer access for device 1 on a *real* pool.
/// Device 1 may not exist, so each call may fail, but only with
/// `InvalidDevice`, `InvalidValue` or `NotSupported`.  Skipped when there
/// is no driver or no pool can be created.
#[cfg(not(target_os = "macos"))]
#[test]
fn gpu_peer_access_on_real_pool_or_clean_error() {
    if !driver_present() {
        return;
    }
    let Some(pool) = make_real_pool() else {
        return;
    };
    // Only the genuine pool handle is ever handed to the driver; enable is
    // issued before disable.
    let outcomes = [
        StreamMemoryPool::gpu_enable_peer_access(pool.handle(), 1),
        StreamMemoryPool::gpu_disable_peer_access(pool.handle(), 1),
    ];
    for outcome in outcomes {
        match outcome {
            Ok(_) => {}
            Err(e) => assert!(matches!(
                e,
                CudaError::InvalidDevice | CudaError::InvalidValue | CudaError::NotSupported
            )),
        }
    }
}
1743
/// On a host with no loadable driver, every `gpu_*` binding must report
/// `NotInitialized`.  When a driver *is* present this test does nothing;
/// the live-driver behaviour is exercised by the per-binding tests.
#[cfg(not(target_os = "macos"))]
#[test]
fn gpu_bindings_clean_error_without_driver() {
    /// Assert that a binding's error (if any) is exactly `NotInitialized`.
    /// `#[track_caller]` keeps the failure location at the call site.
    #[track_caller]
    fn expect_not_initialized(outcome: Option<CudaError>) {
        assert_eq!(outcome, Some(CudaError::NotInitialized));
    }

    if driver_present() {
        return;
    }

    // The checks run strictly one after another, so a failing assertion
    // stops the test before any later binding is called.
    let config = StreamOrderedAllocConfig::default_for_device(0);
    expect_not_initialized(StreamMemoryPool::gpu_create_pool(&config).err());
    expect_not_initialized(StreamMemoryPool::gpu_default_pool(0).err());
    expect_not_initialized(StreamMemoryPool::gpu_alloc_async(0, 1024, 0).err());

    // Bindings that take a handle: a placeholder value is passed, and the
    // expected outcome is still `NotInitialized`.
    expect_not_initialized(StreamMemoryPool::gpu_alloc_async(0x1, 1024, 0).err());
    expect_not_initialized(StreamMemoryPool::gpu_free_async(0x1, 0).err());
    expect_not_initialized(StreamMemoryPool::gpu_trim(0x1, 0).err());
    expect_not_initialized(
        StreamMemoryPool::gpu_set_attribute(0x1, PoolAttribute::ReleaseThreshold(1)).err(),
    );
    expect_not_initialized(StreamMemoryPool::gpu_enable_peer_access(0x1, 1).err());
    expect_not_initialized(StreamMemoryPool::gpu_disable_peer_access(0x1, 1).err());
}
1793
/// Checks that `map_pool_attribute` translates each listed
/// `PoolAttribute` into the same-named `CUmemPoolAttribute`.
#[cfg(not(target_os = "macos"))]
#[test]
fn map_pool_attribute_covers_all_variants() {
    use crate::ffi::CUmemPoolAttribute as Driver;

    // (crate-level attribute, expected driver-level attribute)
    let expected_mappings = [
        (PoolAttribute::ReuseFollowEventDependencies, Driver::ReuseFollowEventDependencies),
        (PoolAttribute::ReuseAllowOpportunistic, Driver::ReuseAllowOpportunistic),
        (PoolAttribute::ReuseAllowInternalDependencies, Driver::ReuseAllowInternalDependencies),
        (PoolAttribute::ReleaseThreshold(0), Driver::ReleaseThreshold),
        (PoolAttribute::ReservedMemCurrent, Driver::ReservedMemCurrent),
        (PoolAttribute::ReservedMemHigh, Driver::ReservedMemHigh),
        (PoolAttribute::UsedMemCurrent, Driver::UsedMemCurrent),
        (PoolAttribute::UsedMemHigh, Driver::UsedMemHigh),
    ];

    for (pool_attr, driver_attr) in expected_mappings {
        assert_eq!(StreamMemoryPool::map_pool_attribute(pool_attr), Ok(driver_attr));
    }
}
1835
/// The `stream_alloc` convenience function, asked for 256 bytes on
/// stream 0.  With no driver the error must be `NotInitialized`; with a
/// driver the result is either an allocation reporting a size of 256
/// (released again through `stream_free`) or one of the expected typed
/// errors.
#[cfg(not(target_os = "macos"))]
#[test]
fn convenience_stream_alloc_real_or_clean_error() {
    let outcome = stream_alloc(256, 0);

    if !driver_present() {
        assert_eq!(outcome.err(), Some(CudaError::NotInitialized));
        return;
    }

    match outcome {
        Ok(mut allocation) => {
            assert_eq!(allocation.size(), 256);
            // Best-effort release; the result is deliberately ignored.
            let _ = stream_free(&mut allocation);
        }
        Err(e) => assert!(matches!(
            e,
            CudaError::InvalidContext
                | CudaError::NotSupported
                | CudaError::NoDevice
                | CudaError::InvalidDevice
                | CudaError::OutOfMemory
        )),
    }
}
1862}