// candle_cuda_vmm/lib.rs
//! # candle-cuda-vmm
//!
//! CUDA Virtual Memory Management bindings for elastic KV cache allocation in Candle.
//!
//! This crate provides safe Rust bindings to CUDA's Virtual Memory Management (VMM) APIs,
//! enabling elastic memory allocation for LLM inference workloads. It integrates with the
//! Candle deep learning framework and supports:
//!
//! - **Elastic KV Cache Allocation**: Allocate memory on-demand rather than pre-allocating
//!   large static buffers
//! - **Multi-Model Serving**: Share GPU memory pools across multiple models with dynamic
//!   allocation
//! - **Reduced TTFT**: Faster time-to-first-token (1.2-28×) in multi-model scenarios vs
//!   static allocation
//! - **Memory Efficiency**: Optimal memory usage for bursty multi-tenant workloads
//!
//! ## Architecture
//!
//! The crate is organized into several modules:
//!
//! - [`error`]: Error types for VMM operations
//! - [`cuda_ffi`]: Low-level CUDA VMM FFI bindings
//! - [`physical_memory`]: Physical GPU memory allocation with RAII
//! - [`mapping`]: Virtual address space reservation and mapping operations
//! - [`virtual_memory`]: High-level elastic memory pool abstractions
//!
//! ## Quick Start
//!
//! ```no_run
//! use candle_cuda_vmm::{VirtualMemoryPool, Result};
//! use candle_core::Device;
//!
//! fn main() -> Result<()> {
//!     let device = Device::new_cuda(0)?;
//!
//!     // Create a pool with 128GB virtual capacity, 2MB pages
//!     let mut pool = VirtualMemoryPool::new(
//!         128 * 1024 * 1024 * 1024, // 128GB virtual
//!         2 * 1024 * 1024,          // 2MB pages
//!         device,
//!     )?;
//!
//!     // Allocate 1GB of physical memory on-demand
//!     let addr = pool.allocate(0, 1024 * 1024 * 1024)?;
//!     println!("Allocated at virtual address: 0x{:x}", addr);
//!
//!     // Physical memory usage: ~1GB
//!     println!("Physical usage: {} bytes", pool.physical_memory_usage());
//!
//!     // Deallocate when done
//!     pool.deallocate(0, 1024 * 1024 * 1024)?;
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Multi-Model Serving
//!
//! ```no_run
//! use candle_cuda_vmm::{SharedMemoryPool, Result};
//! use candle_core::Device;
//!
//! fn main() -> Result<()> {
//!     let device = Device::new_cuda(0)?;
//!     let mut shared_pool = SharedMemoryPool::new(
//!         32 * 1024 * 1024 * 1024, // 32GB global physical limit
//!         device,
//!     )?;
//!
//!     // Register models
//!     shared_pool.register_model("llama-7b", 64 * 1024 * 1024 * 1024)?;
//!     shared_pool.register_model("gpt2", 32 * 1024 * 1024 * 1024)?;
//!
//!     // Allocate for a specific model
//!     let addr = shared_pool.allocate_for_model("llama-7b", 1024 * 1024 * 1024)?;
//!
//!     Ok(())
//! }
//! ```
//!
//! ## Requirements
//!
//! - CUDA 11.2 or later (CUDA VMM APIs introduced in 11.2)
//! - NVIDIA GPU with Compute Capability 6.0+ (Pascal or newer)
//! - Rust 1.70+
//!
//! ## Performance
//!
//! Based on KVCached benchmarks:
//!
//! - **Allocation Latency**: <100μs per 2MB page
//! - **TTFT Improvement**: 1.2-28× faster vs static allocation (multi-model scenarios)
//! - **Memory Overhead**: <5% metadata overhead
//! - **Throughput**: No degradation vs static allocation for single-model workloads

96pub mod error;
97pub mod cuda_ffi;
98pub mod physical_memory;
99pub mod mapping;
100pub mod virtual_memory;
101
102// Re-export main types
103pub use error::{Result, VmmError};
104pub use physical_memory::PhysicalMemoryHandle;
105pub use mapping::{VirtualAddressRange, map_memory, unmap_memory, set_memory_access};
106pub use virtual_memory::{
107 VirtualMemoryPool, SharedMemoryPool, MemoryStats, GlobalMemoryStats
108};
109pub use cuda_ffi::AccessFlags;
110
111/// Library version.
112pub const VERSION: &str = env!("CARGO_PKG_VERSION");
113
114/// Check if CUDA VMM is supported on the current system.
115///
116/// # Returns
117/// True if CUDA VMM is available, false otherwise.
118pub fn is_vmm_supported() -> bool {
119 // Try to get granularity for device 0 - if this fails, VMM is not supported
120 cuda_ffi::get_recommended_granularity(0).is_ok()
121}
122
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_version() {
        // CARGO_PKG_VERSION is always non-empty and dotted (e.g. "0.1.0").
        assert!(!VERSION.is_empty());
        assert!(VERSION.split('.').count() >= 2);
    }
}
131}