Skip to main content

baracuda_driver/
vmm.rs

1//! CUDA Virtual Memory Management (VMM) — the fine-grained alternative to
2//! `cuMemAlloc`.
3//!
4//! The Driver-API malloc (`DeviceBuffer`) hides allocation + virtual
5//! address mapping behind one call. The VMM API splits them:
6//!
7//! 1. **Reserve** a virtual address range ([`AddressRange`]).
8//! 2. **Create** a physical allocation ([`PhysicalMemory`]).
9//! 3. **Map** the physical memory into the reserved address range.
10//! 4. **Grant access** to one or more devices.
11//! 5. ... use the memory like any device pointer ...
12//! 6. On drop: unmap, release physical, free virtual range.
13//!
14//! This buys three things:
15//!
16//! - Safe remapping (resize-in-place for tensor libraries).
17//! - Explicit peer-access control (per-device `READ` / `READWRITE`).
18//! - IPC / external-resource export via `CUmemAllocationHandleType` (future).
19//!
20//! Availability: CUDA 10.2+, Linux + Windows with WDDM 2.0 driver model
21//! (which is all modern NVIDIA Windows setups).
22
23use std::sync::Arc;
24
25use baracuda_cuda_sys::types::{
26    CUmemAccessDesc, CUmemAccess_flags, CUmemAllocationGranularity_flags, CUmemAllocationProp,
27    CUmemAllocationPropFlags, CUmemAllocationType, CUmemLocation, CUmemLocationType,
28};
29use baracuda_cuda_sys::{driver, CUdevice, CUdeviceptr, CUmemGenericAllocationHandle};
30
31use crate::context::Context;
32use crate::device::Device;
33use crate::error::{check, Result};
34
35/// Query the minimum or recommended allocation granularity for a device.
36/// VMM allocations must be sized (and address ranges aligned) to this value.
37pub fn allocation_granularity(device: &Device, recommended: bool) -> Result<usize> {
38    let d = driver()?;
39    let cu = d.cu_mem_get_allocation_granularity()?;
40    let prop = device_prop(device.as_raw());
41    let mut gran: usize = 0;
42    let option = if recommended {
43        CUmemAllocationGranularity_flags::RECOMMENDED
44    } else {
45        CUmemAllocationGranularity_flags::MINIMUM
46    };
47    check(unsafe { cu(&mut gran, &prop, option) })?;
48    Ok(gran)
49}
50
51fn device_prop(dev: CUdevice) -> CUmemAllocationProp {
52    CUmemAllocationProp {
53        type_: CUmemAllocationType::PINNED,
54        requested_handle_types: 0, // no IPC export
55        location: CUmemLocation {
56            type_: CUmemLocationType::DEVICE,
57            id: dev.0,
58        },
59        win32_handle_meta_data: core::ptr::null_mut(),
60        alloc_flags: CUmemAllocationPropFlags::default(),
61    }
62}
63
/// A reserved virtual address range (not yet backed by physical memory).
///
/// This is a cheap, reference-counted handle: clones share one reservation.
/// The range is released via `cuMemAddressFree` when the last clone (including
/// any held by a [`MappedRange`]) is dropped.
pub struct AddressRange {
    // Shared state; the `Drop` impl on the inner type performs the release.
    inner: Arc<AddressRangeInner>,
}
69
/// Shared state behind an [`AddressRange`] handle.
struct AddressRangeInner {
    // Base address returned by the reserve call; 0 is treated as "nothing to free" on drop.
    ptr: CUdeviceptr,
    // Length of the reservation in bytes, as requested at reserve time.
    size: usize,
    // Never read. NOTE(review): presumably held so the context outlives the
    // reservation — confirm against `Context`'s ownership semantics.
    #[allow(dead_code)]
    context: Context,
}
76
// SAFETY / NOTE(review): no justification is recorded for these impls.
// Presumably the stored device pointer is only an address handed back to the
// driver (never dereferenced on the host) and `Context` may be shared across
// threads — confirm both before relying on this.
unsafe impl Send for AddressRangeInner {}
unsafe impl Sync for AddressRangeInner {}
79
80impl core::fmt::Debug for AddressRangeInner {
81    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
82        f.debug_struct("AddressRange")
83            .field("ptr", &format_args!("{:#x}", self.ptr.0))
84            .field("size", &self.size)
85            .finish()
86    }
87}
88
89impl core::fmt::Debug for AddressRange {
90    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
91        self.inner.fmt(f)
92    }
93}
94
95impl Clone for AddressRange {
96    fn clone(&self) -> Self {
97        Self {
98            inner: self.inner.clone(),
99        }
100    }
101}
102
103impl AddressRange {
104    /// Reserve `size` bytes of device virtual address space.
105    pub fn reserve(context: &Context, size: usize, alignment: usize) -> Result<Self> {
106        context.set_current()?;
107        let d = driver()?;
108        let cu = d.cu_mem_address_reserve()?;
109        let mut ptr = CUdeviceptr(0);
110        check(unsafe { cu(&mut ptr, size, alignment, CUdeviceptr(0), 0) })?;
111        Ok(Self {
112            inner: Arc::new(AddressRangeInner {
113                ptr,
114                size,
115                context: context.clone(),
116            }),
117        })
118    }
119
120    #[inline]
121    pub fn as_raw(&self) -> CUdeviceptr {
122        self.inner.ptr
123    }
124    #[inline]
125    pub fn size(&self) -> usize {
126        self.inner.size
127    }
128}
129
130impl Drop for AddressRangeInner {
131    fn drop(&mut self) {
132        if self.ptr.0 == 0 {
133            return;
134        }
135        if let Ok(d) = driver() {
136            if let Ok(cu) = d.cu_mem_address_free() {
137                let _ = unsafe { cu(self.ptr, self.size) };
138            }
139        }
140    }
141}
142
/// A physical device-memory allocation. Not usable until mapped into an
/// [`AddressRange`] via [`MappedRange::new`].
///
/// This is a cheap, reference-counted handle: clones share one allocation,
/// which is released when the last clone (including any held by a
/// [`MappedRange`]) is dropped.
pub struct PhysicalMemory {
    // Shared state; the `Drop` impl on the inner type performs the release.
    inner: Arc<PhysicalMemoryInner>,
}
148
/// Shared state behind a [`PhysicalMemory`] handle.
struct PhysicalMemoryInner {
    // Driver allocation handle; 0 is treated as "nothing to release" on drop.
    handle: CUmemGenericAllocationHandle,
    // Allocation size in bytes, as requested at creation time.
    size: usize,
    // Device the allocation was created on; only surfaced through `Debug`.
    device: CUdevice,
    // Never read. NOTE(review): presumably held so the context outlives the
    // allocation — confirm against `Context`'s ownership semantics.
    #[allow(dead_code)]
    context: Context,
}
156
// SAFETY / NOTE(review): no justification is recorded for these impls.
// Presumably the allocation handle is an opaque value that any thread may
// pass to the driver, and `Context` may be shared across threads — confirm
// both before relying on this.
unsafe impl Send for PhysicalMemoryInner {}
unsafe impl Sync for PhysicalMemoryInner {}
159
160impl core::fmt::Debug for PhysicalMemoryInner {
161    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
162        f.debug_struct("PhysicalMemory")
163            .field("handle", &self.handle)
164            .field("size", &self.size)
165            .field("device", &self.device.0)
166            .finish()
167    }
168}
169
170impl core::fmt::Debug for PhysicalMemory {
171    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
172        self.inner.fmt(f)
173    }
174}
175
176impl Clone for PhysicalMemory {
177    fn clone(&self) -> Self {
178        Self {
179            inner: self.inner.clone(),
180        }
181    }
182}
183
184impl PhysicalMemory {
185    /// Create a physical allocation of `size` bytes on `device`. `size` must
186    /// be a multiple of [`allocation_granularity`].
187    pub fn create(context: &Context, device: &Device, size: usize) -> Result<Self> {
188        context.set_current()?;
189        let d = driver()?;
190        let cu = d.cu_mem_create()?;
191        let prop = device_prop(device.as_raw());
192        let mut handle: CUmemGenericAllocationHandle = 0;
193        check(unsafe { cu(&mut handle, size, &prop, 0) })?;
194        Ok(Self {
195            inner: Arc::new(PhysicalMemoryInner {
196                handle,
197                size,
198                device: device.as_raw(),
199                context: context.clone(),
200            }),
201        })
202    }
203
204    #[inline]
205    pub fn as_raw(&self) -> CUmemGenericAllocationHandle {
206        self.inner.handle
207    }
208    #[inline]
209    pub fn size(&self) -> usize {
210        self.inner.size
211    }
212}
213
214impl Drop for PhysicalMemoryInner {
215    fn drop(&mut self) {
216        if self.handle == 0 {
217            return;
218        }
219        if let Ok(d) = driver() {
220            if let Ok(cu) = d.cu_mem_release() {
221                let _ = unsafe { cu(self.handle) };
222            }
223        }
224    }
225}
226
/// Access rights granted to a device by [`MappedRange::set_access`].
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum AccessFlags {
    /// No access; converts to `CUmemAccess_flags::NONE`.
    None,
    /// Read-only access; converts to `CUmemAccess_flags::READ`.
    Read,
    /// Read and write access; converts to `CUmemAccess_flags::READWRITE`.
    ReadWrite,
}
234
235impl AccessFlags {
236    #[doc(hidden)]
237    #[inline]
238    pub fn raw(self) -> core::ffi::c_int {
239        match self {
240            AccessFlags::None => CUmemAccess_flags::NONE,
241            AccessFlags::Read => CUmemAccess_flags::READ,
242            AccessFlags::ReadWrite => CUmemAccess_flags::READWRITE,
243        }
244    }
245}
246
/// An address range with physical backing mapped in. `cuMemUnmap` is called
/// on drop; the underlying [`AddressRange`] and [`PhysicalMemory`] remain
/// live through their own refcounts.
///
/// Field order matters: Rust drops fields in declaration order after
/// `Drop::drop` has run, so this mapping's reference to `range` is dropped
/// before its reference to `physical`.
pub struct MappedRange {
    // Handle to the reservation this mapping lives in.
    range: AddressRange,
    // Handle to the physical allocation backing the mapping.
    physical: PhysicalMemory,
    // Byte offset of the mapping from the start of `range`.
    offset: usize,
    // Mapped length in bytes (the physical allocation's size at map time).
    size: usize,
}
256
257impl core::fmt::Debug for MappedRange {
258    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
259        f.debug_struct("MappedRange")
260            .field("ptr", &format_args!("{:#x}", self.range.as_raw().0))
261            .field("offset", &self.offset)
262            .field("size", &self.size)
263            .finish()
264    }
265}
266
267impl MappedRange {
268    /// Map `physical` into `range` at `offset` within the range.
269    /// `offset + physical.size() <= range.size()`.
270    pub fn new(range: &AddressRange, physical: &PhysicalMemory, offset: usize) -> Result<Self> {
271        assert!(
272            offset + physical.size() <= range.size(),
273            "physical size + offset ({} + {}) overflows the reserved range ({})",
274            offset,
275            physical.size(),
276            range.size()
277        );
278        let d = driver()?;
279        let cu = d.cu_mem_map()?;
280        let target = CUdeviceptr(range.as_raw().0 + offset as u64);
281        check(unsafe { cu(target, physical.size(), 0, physical.as_raw(), 0) })?;
282        Ok(Self {
283            range: range.clone(),
284            physical: physical.clone(),
285            offset,
286            size: physical.size(),
287        })
288    }
289
290    /// Grant `flags` access to `device` for this mapping. Must be called at
291    /// least once (typically as [`AccessFlags::ReadWrite`]) before the
292    /// mapping becomes usable — a fresh `cuMemMap` defaults to no access.
293    pub fn set_access(&self, device: &Device, flags: AccessFlags) -> Result<()> {
294        let d = driver()?;
295        let cu = d.cu_mem_set_access()?;
296        let desc = CUmemAccessDesc {
297            location: CUmemLocation {
298                type_: CUmemLocationType::DEVICE,
299                id: device.as_raw().0,
300            },
301            flags: flags.raw(),
302        };
303        check(unsafe { cu(self.as_raw(), self.size, &desc, 1) })
304    }
305
306    /// The device pointer at which the physical memory is now accessible.
307    #[inline]
308    pub fn as_raw(&self) -> CUdeviceptr {
309        CUdeviceptr(self.range.as_raw().0 + self.offset as u64)
310    }
311
312    #[inline]
313    pub fn size(&self) -> usize {
314        self.size
315    }
316}
317
318impl Drop for MappedRange {
319    fn drop(&mut self) {
320        if self.range.as_raw().0 == 0 {
321            return;
322        }
323        if let Ok(d) = driver() {
324            if let Ok(cu) = d.cu_mem_unmap() {
325                let _ = unsafe { cu(self.as_raw(), self.size) };
326            }
327        }
328        // keep `physical` alive here so the above unmap precedes release
329        let _ = &self.physical;
330    }
331}