oxicuda-memory 0.1.0

//! Stream-ordered memory pool for efficient async allocation.
//!
//! Requires CUDA 11.2+ driver.  Gated behind the `pool` feature.
//!
//! Stream-ordered memory pools allow allocation and deallocation to be
//! ordered relative to other operations on a CUDA stream, enabling the
//! driver to reuse memory more aggressively and avoid synchronisation
//! barriers that would otherwise be needed for conventional
//! `cuMemAlloc` / `cuMemFree` calls.
//!
//! # Status
//!
//! This module is a **stub**.  The underlying `cuMemPoolCreate`,
//! `cuMemAllocFromPoolAsync`, and related function pointers are not yet
//! loaded by `oxicuda-driver`.  All methods currently return
//! [`CudaError::NotSupported`] as a placeholder.
//!
//! # Planned API
//!
//! ```rust,ignore
//! let pool = MemoryPool::new(device)?;
//! let buf = PooledBuffer::<f32>::alloc_async(&pool, 1024, &stream)?;
//! // … use buf in kernels on `stream` …
//! // buf is freed asynchronously when dropped (enqueued on the pool's stream).
//! ```

#![cfg(feature = "pool")]

use std::marker::PhantomData;

use oxicuda_driver::error::{CudaError, CudaResult};
use oxicuda_driver::ffi::CUdeviceptr;
use oxicuda_driver::stream::Stream;

// ---------------------------------------------------------------------------
// MemoryPool
// ---------------------------------------------------------------------------

/// A stream-ordered memory pool (CUDA 11.2+).
///
/// Memory pools allow the driver to reuse freed allocations without
/// returning them to the OS, reducing allocation latency and avoiding
/// the implicit synchronisation of `cuMemFree`.
///
/// # Status
///
/// This type is a placeholder.  The pool-related driver function pointers
/// (`cuMemPoolCreate`, `cuMemPoolDestroy`, etc.) are not yet available in
/// `oxicuda-driver`.
///
/// TODO: Add `cu_mem_pool_create`, `cu_mem_pool_destroy`,
/// `cu_mem_alloc_from_pool_async`, `cu_mem_free_async` to `DriverApi`.
/// Statistics for a memory pool's allocation behaviour.
///
/// These statistics track the total bytes allocated, peak usage,
/// allocation count, and free count for a given pool.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct PoolStats {
    /// Total bytes currently allocated from the pool.
    pub allocated_bytes: usize,
    /// Peak bytes allocated at any point during the pool's lifetime.
    pub peak_bytes: usize,
    /// Total number of allocations performed.
    pub allocation_count: u64,
    /// Total number of frees performed.
    pub free_count: u64,
}

/// A stream-ordered memory pool (CUDA 11.2+).
pub struct MemoryPool {
    /// Placeholder for `CUmemoryPool` handle.
    _handle: u64,
    /// Statistics tracking.
    stats: PoolStats,
    /// Trim threshold in bytes.
    threshold_bytes: usize,
}

impl MemoryPool {
    /// Creates a new memory pool on the given device.
    ///
    /// # Errors
    ///
    /// Currently always returns [`CudaError::NotSupported`] because the
    /// pool driver function pointers are not yet loaded.
    ///
    /// TODO: Implement once `cu_mem_pool_create` is added to `DriverApi`.
    pub fn new(_device_ordinal: i32) -> CudaResult<Self> {
        // TODO: call (api.cu_mem_pool_create)(...) when available
        Err(CudaError::NotSupported)
    }

    /// Returns the raw pool handle.
    ///
    /// # Status
    ///
    /// Returns `0` until the pool is properly initialised.
    #[inline]
    pub fn raw_handle(&self) -> u64 {
        self._handle
    }

    /// Returns current pool statistics.
    ///
    /// The statistics track allocation behaviour over the pool's lifetime.
    #[inline]
    pub fn stats(&self) -> PoolStats {
        self.stats
    }

    /// Trims the pool, releasing unused memory back to the OS.
    ///
    /// Attempts to release memory such that the pool retains at most
    /// `min_bytes` of unused memory.
    ///
    /// # Errors
    ///
    /// Currently returns [`CudaError::NotSupported`] because the
    /// pool driver function pointers are not yet loaded.
    ///
    /// TODO: Implement once `cu_mem_pool_trim_to` is added to `DriverApi`.
    pub fn trim(&mut self, _min_bytes: usize) -> CudaResult<()> {
        // TODO: call (api.cu_mem_pool_trim_to)(self._handle, min_bytes)
        Err(CudaError::NotSupported)
    }

    /// Sets the threshold at which the pool will automatically release
    /// memory back to the OS.
    ///
    /// When the pool's unused memory exceeds `bytes`, subsequent frees
    /// will trigger automatic trimming.
    ///
    /// # Errors
    ///
    /// Currently returns [`CudaError::NotSupported`] because the
    /// pool driver function pointers are not yet loaded.
    ///
    /// TODO: Implement once `cu_mem_pool_set_attribute` is added to `DriverApi`.
    pub fn set_threshold(&mut self, bytes: usize) -> CudaResult<()> {
        // TODO: call (api.cu_mem_pool_set_attribute)(...)
        self.threshold_bytes = bytes;
        Err(CudaError::NotSupported)
    }
}

impl Drop for MemoryPool {
    fn drop(&mut self) {
        // TODO: call (api.cu_mem_pool_destroy)(self._handle) when available.
        // For now, nothing to free since construction always fails.
    }
}

// ---------------------------------------------------------------------------
// PooledBuffer<T>
// ---------------------------------------------------------------------------

/// A device buffer allocated from a [`MemoryPool`].
///
/// Unlike [`DeviceBuffer`](crate::DeviceBuffer), a `PooledBuffer` is freed
/// asynchronously — the free operation is enqueued on the stream rather
/// than blocking the CPU.  This enables overlap of allocation, computation,
/// and deallocation across multiple stream operations.
///
/// # Status
///
/// This type is a placeholder.  All allocation methods currently return
/// [`CudaError::NotSupported`].
///
/// TODO: Implement once `cu_mem_alloc_from_pool_async` and
/// `cu_mem_free_async` are added to `DriverApi`.
pub struct PooledBuffer<T: Copy> {
    /// Raw device pointer to the pooled allocation.
    _ptr: CUdeviceptr,
    /// Number of `T` elements.
    _len: usize,
    /// Marker for the element type.
    _phantom: PhantomData<T>,
}

impl<T: Copy> PooledBuffer<T> {
    /// Asynchronously allocates a buffer of `n` elements from the given pool.
    ///
    /// The allocation is ordered relative to other operations on `stream`.
    ///
    /// # Errors
    ///
    /// Currently always returns [`CudaError::NotSupported`].
    ///
    /// TODO: Implement once `cu_mem_alloc_from_pool_async` is available.
    pub fn alloc_async(_pool: &MemoryPool, _n: usize, _stream: &Stream) -> CudaResult<Self> {
        // TODO: call (api.cu_mem_alloc_from_pool_async)(...) when available
        Err(CudaError::NotSupported)
    }

    /// Returns the number of `T` elements in this buffer.
    #[inline]
    pub fn len(&self) -> usize {
        self._len
    }

    /// Returns `true` if the buffer contains zero elements.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self._len == 0
    }

    /// Returns the total size of the allocation in bytes.
    #[inline]
    pub fn byte_size(&self) -> usize {
        self._len * std::mem::size_of::<T>()
    }

    /// Returns the raw [`CUdeviceptr`] handle.
    #[inline]
    pub fn as_device_ptr(&self) -> CUdeviceptr {
        self._ptr
    }
}

impl<T: Copy> Drop for PooledBuffer<T> {
    fn drop(&mut self) {
        // TODO: call (api.cu_mem_free_async)(self._ptr, stream) when available.
        // For now, nothing to free since construction always fails.
    }
}