candle_cuda_vmm/
lib.rs

//! # candle-cuda-vmm
//!
//! CUDA Virtual Memory Management bindings for elastic KV cache allocation in Candle.
//!
//! This crate provides safe Rust bindings to CUDA's Virtual Memory Management (VMM) APIs,
//! enabling elastic memory allocation for LLM inference workloads. It integrates with the
//! Candle deep learning framework and supports:
//!
//! - **Elastic KV Cache Allocation**: Allocate physical memory on demand rather than
//!   pre-allocating large static buffers (see the sketch after this list)
//! - **Multi-Model Serving**: Share GPU memory pools across multiple models with dynamic
//!   allocation
//! - **Reduced TTFT**: 1.2-28× faster time-to-first-token in multi-model scenarios vs
//!   static allocation
//! - **Memory Efficiency**: Physical usage tracks actual demand, which suits bursty
//!   multi-tenant workloads
//!
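//! The elasticity comes from CUDA VMM's separation of virtual address reservation from
//! physical page creation. A minimal conceptual sketch of that flow follows; the exact
//! signatures are illustrative (the real bindings live in [`cuda_ffi`], [`physical_memory`],
//! and [`mapping`], and the `AccessFlags::ReadWrite` variant is assumed), but the driver
//! calls named in the comments are the actual CUDA VMM entry points:
//!
//! ```ignore
//! // 1. Reserve a large virtual range up front (cuMemAddressReserve). This costs
//! //    no physical memory; it only claims address space.
//! let va = reserve_virtual(128 * 1024 * 1024 * 1024)?;
//! // 2. Create one physical page only when it is needed (cuMemCreate).
//! let page = create_physical_page(2 * 1024 * 1024, /* device */ 0)?;
//! // 3. Map the page into the reserved range (cuMemMap) ...
//! map_memory(va, &page)?;
//! // 4. ... and grant the device read/write access (cuMemSetAccess).
//! set_memory_access(va, 2 * 1024 * 1024, AccessFlags::ReadWrite)?;
//! ```
//!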
//! ## Architecture
//!
//! The crate is organized into several modules:
//!
//! - [`error`]: Error types for VMM operations
//! - [`cuda_ffi`]: Low-level CUDA VMM FFI bindings
//! - [`physical_memory`]: Physical GPU memory allocation with RAII
//! - [`mapping`]: Virtual address space reservation and mapping operations
//! - [`virtual_memory`]: High-level elastic memory pool abstractions
//!
//! ## Quick Start
//!
//! ```no_run
//! use candle_cuda_vmm::{VirtualMemoryPool, Result};
//! use candle_core::Device;
//!
//! fn main() -> Result<()> {
//!     let device = Device::new_cuda(0)?;
//!
//!     // Create a pool with 128GB of virtual capacity backed by 2MB pages
//!     let mut pool = VirtualMemoryPool::new(
//!         128 * 1024 * 1024 * 1024, // 128GB virtual
//!         2 * 1024 * 1024,          // 2MB pages
//!         device,
//!     )?;
//!
//!     // Allocate 1GB of physical memory on demand
//!     let addr = pool.allocate(0, 1024 * 1024 * 1024)?;
//!     println!("Allocated at virtual address: 0x{:x}", addr);
//!
//!     // Physical memory usage: ~1GB
//!     println!("Physical usage: {} bytes", pool.physical_memory_usage());
//!
//!     // Deallocate when done
//!     pool.deallocate(0, 1024 * 1024 * 1024)?;
//!
//!     Ok(())
//! }
//! ```
//!
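//! Because the pool reserves its virtual range up front, addresses handed out by
//! `allocate` stay stable while the physical backing grows and shrinks. A minimal
//! sketch, assuming `allocate` takes an `(offset, size)` pair as the example above
//! suggests: growing the same region later commits only the new pages.
//!
//! ```ignore
//! // Illustrative only: commit another 512MB of physical pages starting at the
//! // 1GB offset, without moving the base address returned earlier.
//! let _ = pool.allocate(1024 * 1024 * 1024, 512 * 1024 * 1024)?;
//! assert!(pool.physical_memory_usage() >= 1536 * 1024 * 1024);
//! ```
//!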
//! ## Multi-Model Serving
//!
//! ```no_run
//! use candle_cuda_vmm::{SharedMemoryPool, Result};
//! use candle_core::Device;
//!
//! fn main() -> Result<()> {
//!     let device = Device::new_cuda(0)?;
//!     let mut shared_pool = SharedMemoryPool::new(
//!         32 * 1024 * 1024 * 1024, // 32GB global physical limit
//!         device,
//!     )?;
//!
//!     // Register models with per-model virtual capacity
//!     shared_pool.register_model("llama-7b", 64 * 1024 * 1024 * 1024)?;
//!     shared_pool.register_model("gpt2", 32 * 1024 * 1024 * 1024)?;
//!
//!     // Allocate for a specific model
//!     let _addr = shared_pool.allocate_for_model("llama-7b", 1024 * 1024 * 1024)?;
//!
//!     Ok(())
//! }
//! ```
//!
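//! The crate also re-exports [`MemoryStats`] and [`GlobalMemoryStats`] for observing
//! pool usage. A minimal sketch, assuming accessor names along these lines (the actual
//! shape is defined in [`virtual_memory`]):
//!
//! ```ignore
//! // Illustrative only: query aggregate usage across all registered models
//! // to decide whether a new model can be admitted under the physical cap.
//! let stats: GlobalMemoryStats = shared_pool.global_stats();
//! println!("physical bytes in use: {}", stats.physical_in_use);
//! ```
//!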
//! ## Requirements
//!
//! - CUDA 11.2 or later (the underlying VMM APIs date back to CUDA 10.2)
//! - NVIDIA GPU with Compute Capability 6.0+ (Pascal or newer)
//! - Rust 1.70+
//!
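//! VMM support can be probed at runtime with [`is_vmm_supported`] before constructing
//! a pool:
//!
//! ```no_run
//! // Fail fast (or fall back to static allocation) when VMM is unavailable.
//! if !candle_cuda_vmm::is_vmm_supported() {
//!     eprintln!("CUDA VMM unavailable; falling back to static allocation");
//! }
//! ```
//!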
//! ## Performance
//!
//! Based on KVCached benchmarks:
//!
//! - **Allocation Latency**: <100μs per 2MB page
//! - **TTFT Improvement**: 1.2-28× faster than static allocation (multi-model scenarios)
//! - **Memory Overhead**: <5% metadata overhead
//! - **Throughput**: no degradation vs static allocation for single-model workloads

pub mod error;
pub mod cuda_ffi;
pub mod physical_memory;
pub mod mapping;
pub mod virtual_memory;

// Re-export main types
pub use error::{Result, VmmError};
pub use physical_memory::PhysicalMemoryHandle;
pub use mapping::{VirtualAddressRange, map_memory, unmap_memory, set_memory_access};
pub use virtual_memory::{
    VirtualMemoryPool, SharedMemoryPool, MemoryStats, GlobalMemoryStats,
};
pub use cuda_ffi::AccessFlags;

/// Library version.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// Check if CUDA VMM is supported on the current system.
///
/// # Returns
/// `true` if CUDA VMM is available, `false` otherwise.
pub fn is_vmm_supported() -> bool {
    // Try to query the allocation granularity for device 0; if this fails,
    // VMM is not supported on this system.
    cuda_ffi::get_recommended_granularity(0).is_ok()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_version() {
        assert!(!VERSION.is_empty());
    }
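
    // Minimal smoke test for the support probe. It assumes the underlying FFI
    // call returns an Err (rather than aborting) on hosts without a CUDA driver.
    #[test]
    fn test_vmm_support_probe() {
        // Only assert that the probe runs; its result depends on the host GPU.
        let _ = is_vmm_supported();
    }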
}