singe_cuda/
memory.rs

1use std::{
2    marker::PhantomData,
3    mem::{self, MaybeUninit},
4    ptr, slice,
5};
6
7use num_enum::{IntoPrimitive, TryFromPrimitive};
8use singe_core::{impl_enum_conversion, impl_enum_display};
9use singe_cuda_sys::{driver, runtime};
10
11use crate::{
12    error::{Error, Result},
13    ipc::IpcMemoryHandle,
14    stream::{GraphRecordable, Stream, StreamCaptureScope, StreamScope},
15    try_ffi,
16    types::DevicePtr,
17    view::{
18        DeviceRepr, DeviceSlice, DeviceSliceMut, DeviceView, DeviceViewMut, ZeroableDeviceRepr,
19    },
20};
21
22#[derive(Debug, Clone, Copy)]
23pub struct MemoryCopyOperation<'a, T> {
24    dst: *mut T,
25    src: *const T,
26    count: usize,
27    kind: MemoryCopyKind,
28    _marker: PhantomData<&'a mut [T]>,
29}
30
31impl<'a, T> MemoryCopyOperation<'a, T> {
32    /// Creates a stream-capture-safe memcpy operation from raw pointers.
33    ///
34    /// # Safety
35    ///
36    /// Capturing this operation stores `dst` and `src` pointer addresses in the
37    /// resulting CUDA graph. The caller must ensure both pointers are valid for
38    /// `count` elements whenever a captured graph using this operation is
39    /// launched. If `dst` is mutable memory, it must also remain exclusive for
40    /// the work ordered by those launches.
41    pub const unsafe fn new(
42        dst: *mut T,
43        src: *const T,
44        count: usize,
45        kind: MemoryCopyKind,
46    ) -> Self {
47        Self {
48            dst,
49            src,
50            count,
51            kind,
52            _marker: PhantomData,
53        }
54    }
55}
56
57unsafe impl<T> GraphRecordable for MemoryCopyOperation<'_, T> {
58    type Output = ();
59
60    fn record(self, scope: &StreamCaptureScope<'_>) -> Result<()> {
61        unsafe {
62            DeviceMemory::<T>::copy_async(self.dst, self.src, self.count, self.kind, scope.stream())
63        }
64    }
65}
66
67#[derive(Debug, Clone, Copy)]
68pub struct MemorySetOperation<'a, T> {
69    dst: *mut T,
70    value: u8,
71    count: usize,
72    _marker: PhantomData<&'a mut [T]>,
73}
74
75impl<'a, T> MemorySetOperation<'a, T> {
76    /// Creates a stream-capture-safe memset operation from a raw pointer.
77    ///
78    /// # Safety
79    ///
80    /// Capturing this operation stores `dst` in the resulting CUDA graph. The
81    /// caller must ensure `dst` is valid for writes of
82    /// `count * size_of::<T>()` bytes whenever a captured graph using this
83    /// operation is launched, and that it remains exclusive for the work ordered
84    /// by those launches.
85    pub const unsafe fn new(dst: *mut T, value: u8, count: usize) -> Self {
86        Self {
87            dst,
88            value,
89            count,
90            _marker: PhantomData,
91        }
92    }
93}
94
95unsafe impl<T> GraphRecordable for MemorySetOperation<'_, T> {
96    type Output = ();
97
98    fn record(self, scope: &StreamCaptureScope<'_>) -> Result<()> {
99        unsafe { DeviceMemory::<T>::set_async(self.dst, self.value, self.count, scope.stream()) }
100    }
101}
102
103/// CUDA memory copy types.
104#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
105#[repr(u32)]
106#[non_exhaustive]
107pub enum MemoryCopyKind {
108    /// Host -&gt; Host.
109    HostToHost = runtime::cudaMemcpyKind::cudaMemcpyHostToHost as _,
110    /// Host -&gt; Device.
111    HostToDevice = runtime::cudaMemcpyKind::cudaMemcpyHostToDevice as _,
112    /// Device -&gt; Host.
113    DeviceToHost = runtime::cudaMemcpyKind::cudaMemcpyDeviceToHost as _,
114    /// Device -&gt; Device.
115    DeviceToDevice = runtime::cudaMemcpyKind::cudaMemcpyDeviceToDevice as _,
116    /// Direction of the transfer is inferred from the pointer values.
117    /// Requires unified virtual addressing.
118    Default = runtime::cudaMemcpyKind::cudaMemcpyDefault as _,
119}
120
121impl_enum_conversion!(runtime::cudaMemcpyKind, MemoryCopyKind);
122
123impl_enum_display!(MemoryCopyKind, {
124    Self::HostToHost => "cudaMemcpyHostToHost",
125    Self::HostToDevice => "cudaMemcpyHostToDevice",
126    Self::DeviceToHost => "cudaMemcpyDeviceToHost",
127    Self::DeviceToDevice => "cudaMemcpyDeviceToDevice",
128    Self::Default => "cudaMemcpyDefault",
129});
130
131#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
132#[repr(transparent)]
133pub struct ArrayHandle(runtime::cudaArray_t);
134
135impl ArrayHandle {
136    pub const unsafe fn from_raw(handle: runtime::cudaArray_t) -> Self {
137        Self(handle)
138    }
139
140    pub const fn as_raw(self) -> runtime::cudaArray_t {
141        self.0
142    }
143}
144
145bitflags::bitflags! {
146    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
147    pub struct MemoryAttachFlags: u32 {
148        const GLOBAL = driver::CUmemAttach_flags::CU_MEM_ATTACH_GLOBAL as _;
149        const HOST = driver::CUmemAttach_flags::CU_MEM_ATTACH_HOST as _;
150        const SINGLE = driver::CUmemAttach_flags::CU_MEM_ATTACH_SINGLE as _;
151    }
152}
153
154#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
155#[repr(u32)]
156#[non_exhaustive]
157pub enum MemoryAllocationType {
158    Invalid = driver::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_INVALID as _,
159    Pinned = driver::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED as _,
160    Managed = driver::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_MANAGED as _,
161    Max = driver::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_MAX as _,
162}
163
164impl_enum_conversion!(u32, driver::CUmemAllocationType, MemoryAllocationType);
165
166#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
167#[repr(u32)]
168#[non_exhaustive]
169pub enum MemoryAllocationHandleType {
170    None = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_NONE as _,
171    PosixFileDescriptor =
172        driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR as _,
173    Win32 = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_WIN32 as _,
174    Win32Kmt = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_WIN32_KMT as _,
175    Fabric = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_FABRIC as _,
176    Max = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_MAX as _,
177}
178
179impl_enum_conversion!(
180    u32,
181    driver::CUmemAllocationHandleType,
182    MemoryAllocationHandleType
183);
184
185#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
186#[repr(u32)]
187#[non_exhaustive]
188pub enum MemoryAccessFlag {
189    None = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_NONE as _,
190    Read = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READ as _,
191    ReadWrite = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READWRITE as _,
192    Max = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_MAX as _,
193}
194
195impl_enum_conversion!(u32, driver::CUmemAccess_flags, MemoryAccessFlag);
196
197#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
198#[repr(u32)]
199#[non_exhaustive]
200pub enum MemoryPoolAttribute {
201    ReuseFollowEventDependencies =
202        driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES as _,
203    ReuseAllowOpportunistic =
204        driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC as _,
205    ReuseAllowInternalDependencies =
206        driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES as _,
207    ReleaseThreshold = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD as _,
208    ReservedMemoryCurrent = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT as _,
209    ReservedMemoryHigh = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH as _,
210    UsedMemoryCurrent = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_USED_MEM_CURRENT as _,
211    UsedMemoryHigh = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_USED_MEM_HIGH as _,
212}
213
214impl_enum_conversion!(u32, driver::CUmemPool_attribute, MemoryPoolAttribute);
215
216#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
217#[non_exhaustive]
218pub enum MemoryPoolAttributeValue {
219    Bool(bool),
220    Bytes(u64),
221}
222
223#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
224pub struct MemoryAccessDescriptor {
225    pub location: MemoryLocation,
226    pub flags: MemoryAccessFlag,
227}
228
229#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
230pub struct MemoryPoolProps {
231    pub alloc_type: MemoryAllocationType,
232    pub handle_type: MemoryAllocationHandleType,
233    pub location: MemoryLocation,
234    pub max_size: usize,
235    pub usage: u16,
236}
237
238#[derive(Debug)]
239pub struct MemoryPool {
240    handle: driver::CUmemoryPool,
241}
242
243impl From<MemoryAccessDescriptor> for driver::CUmemAccessDesc {
244    fn from(value: MemoryAccessDescriptor) -> Self {
245        Self {
246            location: value.location.into(),
247            flags: value.flags.into(),
248        }
249    }
250}
251
252impl From<MemoryPoolProps> for driver::CUmemPoolProps {
253    fn from(value: MemoryPoolProps) -> Self {
254        Self {
255            allocType: value.alloc_type.into(),
256            handleTypes: value.handle_type.into(),
257            location: value.location.into(),
258            win32SecurityAttributes: ptr::null_mut(),
259            maxSize: value.max_size as _,
260            usage: value.usage,
261            reserved: [0; 54],
262        }
263    }
264}
265
266bitflags::bitflags! {
267    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
268    pub struct HostAllocationFlags: u32 {
269        const DEFAULT = runtime::cudaHostAllocDefault;
270        const PORTABLE = runtime::cudaHostAllocPortable;
271        const MAPPED = runtime::cudaHostAllocMapped;
272        const WRITE_COMBINED = runtime::cudaHostAllocWriteCombined;
273    }
274}
275
276bitflags::bitflags! {
277    /// Flags for [`DeviceMemory::register_host`].
278    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
279    pub struct HostRegisterFlags: u32 {
280        const DEFAULT = runtime::cudaHostRegisterDefault;
281        const PORTABLE = runtime::cudaHostRegisterPortable;
282        const MAPPED = runtime::cudaHostRegisterMapped;
283        const IO_MEMORY = runtime::cudaHostRegisterIoMemory;
284        const READ_ONLY = runtime::cudaHostRegisterReadOnly;
285    }
286}
287
288/// CUDA memory types.
289#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
290#[repr(u32)]
291#[non_exhaustive]
292pub enum MemoryType {
293    /// Unregistered memory.
294    Unregistered = runtime::cudaMemoryType::cudaMemoryTypeUnregistered as _,
295    /// Host memory.
296    Host = runtime::cudaMemoryType::cudaMemoryTypeHost as _,
297    /// Device memory.
298    Device = runtime::cudaMemoryType::cudaMemoryTypeDevice as _,
299    /// Managed memory.
300    Managed = runtime::cudaMemoryType::cudaMemoryTypeManaged as _,
301}
302
303impl_enum_conversion!(runtime::cudaMemoryType, MemoryType);
304
305impl_enum_display!(MemoryType, {
306    Self::Unregistered => "cudaMemoryTypeUnregistered",
307    Self::Host => "cudaMemoryTypeHost",
308    Self::Device => "cudaMemoryTypeDevice",
309    Self::Managed => "cudaMemoryTypeManaged",
310});
311
312#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
313pub struct PointerAttributes {
314    pub memory_type: MemoryType,
315    pub device: i32,
316    pub device_pointer: DevicePtr,
317    pub host_pointer: *mut (),
318}
319
320impl From<runtime::cudaPointerAttributes> for PointerAttributes {
321    fn from(attr: runtime::cudaPointerAttributes) -> Self {
322        Self {
323            memory_type: attr.type_.into(),
324            device: attr.device,
325            device_pointer: unsafe { DevicePtr::from_raw(attr.devicePointer.cast()) },
326            host_pointer: attr.hostPointer.cast(),
327        }
328    }
329}
330
331#[repr(u32)]
332#[derive(
333    Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq, TryFromPrimitive, IntoPrimitive,
334)]
335#[non_exhaustive]
336pub enum MemoryLocationKind {
337    Invalid = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_INVALID as _,
338    Device = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_DEVICE as _,
339    Host = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_HOST as _,
340    Numa = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_HOST_NUMA as _,
341    NumaCurrent = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT as _,
342    Max = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_MAX as _,
343}
344
345impl_enum_conversion!(driver::CUmemLocationType_enum, MemoryLocationKind);
346
347impl_enum_display!(MemoryLocationKind, {
348    Self::Invalid => "CU_MEM_LOCATION_TYPE_INVALID",
349    Self::Device => "CU_MEM_LOCATION_TYPE_DEVICE",
350    Self::Host => "CU_MEM_LOCATION_TYPE_HOST",
351    Self::Numa => "CU_MEM_LOCATION_TYPE_HOST_NUMA",
352    Self::NumaCurrent => "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT",
353    Self::Max => "CU_MEM_LOCATION_TYPE_MAX",
354});
355
356#[derive(Debug, Clone, Copy, Hash, PartialOrd, Ord, PartialEq, Eq)]
357pub struct MemoryLocation {
358    pub kind: MemoryLocationKind,
359    pub id: i32,
360}
361
362impl From<driver::CUmemLocation_st> for MemoryLocation {
363    fn from(s: driver::CUmemLocation_st) -> Self {
364        Self {
365            kind: s.type_.into(),
366            id: unsafe { s.__bindgen_anon_1.id },
367        }
368    }
369}
370
371impl From<MemoryLocation> for driver::CUmemLocation_st {
372    fn from(m: MemoryLocation) -> Self {
373        Self {
374            type_: m.kind.into(),
375            __bindgen_anon_1: driver::CUmemLocation_st__bindgen_ty_1 { id: m.id as _ },
376        }
377    }
378}
379
380impl Default for MemoryLocation {
381    fn default() -> Self {
382        driver::CUmemLocation_st::default().into()
383    }
384}
385
386impl MemoryPool {
387    /// Creates a CUDA memory pool.
388    /// `props` determines the properties of the pool such as the backing device and IPC capabilities.
389    ///
390    /// To create a memory pool for host memory not targeting a specific NUMA node, applications must set [`MemoryPoolProps::location`] to [`MemoryLocationKind::Host`].
391    /// [`MemoryLocation::id`] is ignored for such pools.
392    /// Pools created with [`MemoryLocationKind::Host`] are not IPC-capable and [`MemoryPoolProps::handle_type`] must be [`MemoryAllocationHandleType::None`]; any other value returns [`crate::error::Status::InvalidValue`].
393    /// To create a memory pool targeting a specific host NUMA node, applications must set [`MemoryLocation::kind`] to [`MemoryLocationKind::Numa`] and [`MemoryLocation::id`] must specify the NUMA ID of the host memory node.
394    /// Specifying [`MemoryLocationKind::NumaCurrent`] as [`MemoryLocation::kind`] returns [`crate::error::Status::InvalidValue`].
395    /// By default, the pool's memory is accessible from the device where it is allocated.
396    /// Pools created with [`MemoryLocationKind::Numa`] or [`MemoryLocationKind::Host`] are accessible from the host CPU by default.
397    /// Applications can control the maximum size of the pool by specifying a non-zero value for [`MemoryPoolProps::max_size`].
398    /// A value of 0 uses a system-dependent maximum pool size.
399    ///
400    /// Callers that intend to use [`MemoryAllocationHandleType::Fabric`] based memory sharing must ensure: (1) the `nvidia-caps-imex-channels` character device is created by the driver and is listed under `/proc/devices`; (2) at least one IMEX channel file is accessible to the process.
401    ///
402    /// When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely share memory.
403    ///
404    /// The IMEX channel security model works per operating-system account.
405    /// All processes for an account can share memory if that account has access to a valid IMEX channel.
406    /// When isolation between accounts is desired, each account needs a separate IMEX channel.
407    ///
408    /// These channel files exist in `/dev/nvidia-caps-imex-channels/channel*` and can be created using standard OS native calls like `mknod` on Linux.
409    ///
410    /// To create a managed memory pool, applications must set [`MemoryPoolProps::alloc_type`] to [`MemoryAllocationType::Managed`].
411    /// [`MemoryPoolProps::handle_type`] must also be [`MemoryAllocationHandleType::None`] because IPC is not supported.
412    /// For managed memory pools, [`MemoryPoolProps::location`] is treated as the preferred location for all allocations created from the pool.
413    /// An application can also set [`MemoryLocationKind::Invalid`] to indicate no preferred location.
414    /// [`MemoryPoolProps::max_size`] must be set to zero for managed memory pools.
415    /// [`MemoryPoolProps::usage`] must be zero because decompression for managed memory is not supported.
416    /// For managed memory pools, all devices on the system must have non-zero concurrentManagedAccess.
417    /// If not, this call returns [`crate::error::Status::NotSupported`].
418    ///
419    /// Specifying [`MemoryAllocationHandleType::None`] creates a memory pool that does not support IPC.
420    ///
421    /// # Errors
422    ///
423    /// Returns an error if `props` describes an unsupported pool, CUDA cannot
424    /// create the pool, or CUDA returns a null memory-pool handle.
425    pub fn create(props: MemoryPoolProps) -> Result<Self> {
426        let mut handle = ptr::null_mut();
427        let props = driver::CUmemPoolProps::from(props);
428        unsafe {
429            try_ffi!(driver::cuMemPoolCreate(&raw mut handle, &raw const props))?;
430        }
431        if handle.is_null() {
432            return Err(Error::NullHandle);
433        }
434        Ok(Self { handle })
435    }
436
437    /// Supported attributes are:
438    ///
439    /// * [`MemoryPoolAttribute::ReleaseThreshold`]: amount of reserved memory, in bytes, to keep before trying to release memory back to the OS.
440    ///   When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory
441    ///   back to the OS on the next call to stream, event or context synchronize.
442    ///   (default 0)
443    /// * [`MemoryPoolAttribute::ReuseFollowEventDependencies`]: allows [`sys::cuMemAllocAsync`](singe_cuda_sys::driver::cuMemAllocAsync) to use memory asynchronously freed in another stream as long as a stream ordering dependency of the allocating stream on
444    ///   the free action exists.
445    ///   CUDA events and null stream interactions can create the required stream ordered dependencies.
446    ///   (default
447    ///   enabled)
448    /// * [`MemoryPoolAttribute::ReuseAllowOpportunistic`]: allows reuse of already completed frees when there is no dependency between the free and allocation.
449    ///   (default
450    ///   enabled)
451    /// * [`MemoryPoolAttribute::ReuseAllowInternalDependencies`]: allows [`sys::cuMemAllocAsync`](singe_cuda_sys::driver::cuMemAllocAsync) to insert new stream dependencies to establish the stream ordering required to reuse a piece of memory released
452    ///   by [`sys::cuMemFreeAsync`](singe_cuda_sys::driver::cuMemFreeAsync) (default enabled).
453    /// * [`MemoryPoolAttribute::ReservedMemoryHigh`]: resets the high watermark that tracks the amount of backing memory allocated for the memory
454    ///   pool.
455    ///   It is illegal to set this attribute to a non-zero value.
456    /// * [`MemoryPoolAttribute::UsedMemoryHigh`]: resets the high watermark that tracks the amount of used memory allocated for the memory
457    ///   pool.
458    pub fn set_attribute(
459        &mut self,
460        attribute: MemoryPoolAttribute,
461        value: MemoryPoolAttributeValue,
462    ) -> Result<()> {
463        unsafe {
464            match (attribute, value) {
465                (
466                    MemoryPoolAttribute::ReuseFollowEventDependencies
467                    | MemoryPoolAttribute::ReuseAllowOpportunistic
468                    | MemoryPoolAttribute::ReuseAllowInternalDependencies,
469                    MemoryPoolAttributeValue::Bool(value),
470                ) => {
471                    let mut value = u32::from(value);
472                    try_ffi!(driver::cuMemPoolSetAttribute(
473                        self.handle,
474                        attribute.into(),
475                        ptr::from_mut(&mut value).cast(),
476                    ))?;
477                }
478                (
479                    MemoryPoolAttribute::ReleaseThreshold
480                    | MemoryPoolAttribute::ReservedMemoryCurrent
481                    | MemoryPoolAttribute::ReservedMemoryHigh
482                    | MemoryPoolAttribute::UsedMemoryCurrent
483                    | MemoryPoolAttribute::UsedMemoryHigh,
484                    MemoryPoolAttributeValue::Bytes(value),
485                ) => {
486                    let mut value = value;
487                    try_ffi!(driver::cuMemPoolSetAttribute(
488                        self.handle,
489                        attribute.into(),
490                        ptr::from_mut(&mut value).cast(),
491                    ))?;
492                }
493                _ => return Err(Error::InvalidValue),
494            }
495        }
496        Ok(())
497    }
498
499    /// Supported attributes are:
500    ///
501    /// * [`MemoryPoolAttribute::ReleaseThreshold`]: amount of reserved memory, in bytes, to keep before trying to release memory back to the OS.
502    ///   When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory
503    ///   back to the OS on the next call to stream, event or context synchronize.
504    ///   (default 0)
505    /// * [`MemoryPoolAttribute::ReuseFollowEventDependencies`]: allows [`sys::cuMemAllocAsync`](singe_cuda_sys::driver::cuMemAllocAsync) to use memory asynchronously freed in another stream as long as a stream ordering dependency of the allocating stream on
506    ///   the free action exists.
507    ///   CUDA events and null stream interactions can create the required stream ordered dependencies.
508    ///   (default
509    ///   enabled)
510    /// * [`MemoryPoolAttribute::ReuseAllowOpportunistic`]: allows reuse of already completed frees when there is no dependency between the free and allocation.
511    ///   (default
512    ///   enabled)
513    /// * [`MemoryPoolAttribute::ReuseAllowInternalDependencies`]: allows [`sys::cuMemAllocAsync`](singe_cuda_sys::driver::cuMemAllocAsync) to insert new stream dependencies to establish the stream ordering required to reuse a piece of memory released
514    ///   by [`sys::cuMemFreeAsync`](singe_cuda_sys::driver::cuMemFreeAsync) (default enabled).
515    /// * [`MemoryPoolAttribute::ReservedMemoryCurrent`]: backing memory currently allocated for the memory pool.
516    /// * [`MemoryPoolAttribute::ReservedMemoryHigh`]: high watermark of backing memory allocated for the memory pool since the last reset.
517    /// * [`MemoryPoolAttribute::UsedMemoryCurrent`]: memory from the pool that is currently in use by the application.
518    /// * [`MemoryPoolAttribute::UsedMemoryHigh`]: high watermark of memory from the pool that was in use by the application.
519    ///
520    /// # Errors
521    ///
522    /// Returns an error if CUDA Driver cannot report the requested pool attribute.
523    pub fn attribute(&self, attribute: MemoryPoolAttribute) -> Result<MemoryPoolAttributeValue> {
524        unsafe {
525            match attribute {
526                MemoryPoolAttribute::ReuseFollowEventDependencies
527                | MemoryPoolAttribute::ReuseAllowOpportunistic
528                | MemoryPoolAttribute::ReuseAllowInternalDependencies => {
529                    let mut value = 0u32;
530                    try_ffi!(driver::cuMemPoolGetAttribute(
531                        self.handle,
532                        attribute.into(),
533                        ptr::from_mut(&mut value).cast(),
534                    ))?;
535                    Ok(MemoryPoolAttributeValue::Bool(value != 0))
536                }
537                MemoryPoolAttribute::ReleaseThreshold
538                | MemoryPoolAttribute::ReservedMemoryCurrent
539                | MemoryPoolAttribute::ReservedMemoryHigh
540                | MemoryPoolAttribute::UsedMemoryCurrent
541                | MemoryPoolAttribute::UsedMemoryHigh => {
542                    let mut value = 0u64;
543                    try_ffi!(driver::cuMemPoolGetAttribute(
544                        self.handle,
545                        attribute.into(),
546                        ptr::from_mut(&mut value).cast(),
547                    ))?;
548                    Ok(MemoryPoolAttributeValue::Bytes(value))
549                }
550            }
551        }
552    }
553
554    /// Controls visibility of pools between devices.
555    ///
556    /// # Errors
557    ///
558    /// Returns an error if CUDA Driver rejects the access descriptors.
559    pub fn set_access(&mut self, access_descs: &[MemoryAccessDescriptor]) -> Result<()> {
560        let access_descs: Vec<_> = access_descs.iter().copied().map(Into::into).collect();
561        unsafe {
562            try_ffi!(driver::cuMemPoolSetAccess(
563                self.handle,
564                access_descs.as_ptr(),
565                access_descs.len() as _,
566            ))?;
567        }
568        Ok(())
569    }
570
571    /// Returns the accessibility of the pool's memory from the specified location.
572    ///
573    /// # Errors
574    ///
575    /// Returns an error if CUDA Driver cannot report access from `location`.
576    pub fn access(&self, location: MemoryLocation) -> Result<MemoryAccessFlag> {
577        let mut flags = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_NONE;
578        let mut location = driver::CUmemLocation_st::from(location);
579        unsafe {
580            try_ffi!(driver::cuMemPoolGetAccess(
581                &raw mut flags,
582                self.handle,
583                &raw mut location,
584            ))?;
585        }
586        Ok(flags.into())
587    }
588
589    /// Releases memory back to the OS until the pool contains fewer than `min_bytes_to_keep` reserved bytes, or there is no more memory that the allocator can safely release.
590    /// The allocator cannot release OS allocations that back outstanding asynchronous allocations.
591    /// The OS allocations may happen at different granularity from the caller's allocations.
592    ///
593    /// * Allocations that have not been freed count as outstanding.
594    /// * Allocations that have been asynchronously freed but whose completion has not been observed on the host, for example by synchronization, can count as outstanding.
595    ///
596    /// # Errors
597    ///
598    /// Returns an error if CUDA cannot trim the pool.
599    pub fn trim_to(&mut self, min_bytes_to_keep: usize) -> Result<()> {
600        unsafe {
601            try_ffi!(driver::cuMemPoolTrimTo(self.handle, min_bytes_to_keep as _))?;
602        }
603        Ok(())
604    }
605
606    pub const fn as_raw(&self) -> driver::CUmemoryPool {
607        self.handle
608    }
609}
610
611impl Drop for MemoryPool {
612    fn drop(&mut self) {
613        unsafe {
614            if let Err(err) = try_ffi!(driver::cuMemPoolDestroy(self.handle)) {
615                #[cfg(debug_assertions)]
616                eprintln!("failed to destroy cuda memory pool: {err}");
617            }
618        }
619    }
620}
621
622/// Represents a region of owned CUDA device memory for elements of type `T`.
623#[derive(Debug)]
624pub struct DeviceMemory<T> {
625    /// Raw pointer to the allocated device memory.
626    ptr: *mut T,
627    /// Number of elements of type `T` allocated.
628    length: usize,
629    /// Marker for the type `T`.
630    _phantom: PhantomData<T>,
631}
632
633impl<T> PartialEq for DeviceMemory<T> {
634    fn eq(&self, other: &Self) -> bool {
635        self.ptr == other.ptr && self.length == other.length
636    }
637}
638
639impl<T> Eq for DeviceMemory<T> {}
640
641#[derive(Debug)]
642pub struct ManagedMemory<T: DeviceRepr> {
643    ptr: *mut T,
644    length: usize,
645    // CUDA tracks the current visibility policy for managed memory separately
646    // from the pointer. Store the last policy requested through this wrapper so
647    // callers can reason about stream attachment without another FFI query.
648    attach_flags: MemoryAttachFlags,
649    _phantom: PhantomData<T>,
650}
651
652/// Associated utility functions.
653impl<T> DeviceMemory<T> {
654    /// Allocates size bytes of linear memory on the device and returns a pointer to the allocated memory.
655    /// The allocated memory is suitably aligned for any kind of variable.
656    /// The memory is not cleared.
657    /// [`DeviceMemory::alloc`] returns [`crate::error::Status::OutOfMemory`] on allocation failure.
658    ///
659    /// The device version of [`DeviceMemory::free`] cannot be used with a pointer allocated using the host API, and vice versa.
660    ///
661    /// # Errors
662    ///
663    /// Returns an error if the requested byte size overflows, CUDA cannot
664    /// allocate device memory, a previous asynchronous launch reports an error,
665    /// or CUDA reports runtime initialization diagnostics such as
666    /// [`crate::error::Status::NotInitialized`], [`crate::error::Status::CallRequiresNewerDriver`],
667    /// or [`crate::error::Status::NoDevice`].
668    ///
669    /// # Safety
670    ///
671    /// The returned pointer is uninitialized device memory. The caller must use
672    /// it only for `count` elements of `T` and eventually free it with a
673    /// compatible CUDA free function.
674    pub unsafe fn alloc(count: usize) -> Result<*mut T> {
675        let Some(bytes) = count.checked_mul(size_of::<T>()) else {
676            return Err(Error::InvalidMemoryAllocationRequest);
677        };
678        let mut p = ptr::null_mut();
679        unsafe {
680            try_ffi!(runtime::cudaMalloc(&raw mut p, bytes as _))?;
681        }
682        Ok(p.cast())
683    }
684
685    pub unsafe fn alloc_managed(count: usize, flags: MemoryAttachFlags) -> Result<*mut T> {
686        let Some(bytes) = count.checked_mul(size_of::<T>()) else {
687            return Err(Error::InvalidMemoryAllocationRequest);
688        };
689        if bytes == 0 {
690            return Ok(ptr::null_mut());
691        }
692        let mut p = ptr::null_mut();
693        unsafe {
694            try_ffi!(runtime::cudaMallocManaged(
695                &raw mut p,
696                bytes as _,
697                flags.bits(),
698            ))?;
699        }
700        Ok(p.cast::<T>())
701    }
702
703    /// Frees the memory space pointed to by `ptr`, which must have been returned by a previous call to one of these allocation functions: [`DeviceMemory::alloc`], [`sys::cudaMallocPitch`](singe_cuda_sys::runtime::cudaMallocPitch), [`DeviceMemory::alloc_managed`], [`DeviceMemory::alloc_async`], or [`sys::cudaMallocFromPoolAsync`](singe_cuda_sys::runtime::cudaMallocFromPoolAsync).
704    ///
705    /// This does not perform implicit synchronization when the pointer was allocated with [`DeviceMemory::alloc_async`] or [`sys::cudaMallocFromPoolAsync`](singe_cuda_sys::runtime::cudaMallocFromPoolAsync).
706    /// Callers must ensure that all accesses to this pointer have completed before invoking [`DeviceMemory::free`].
707    /// For best performance and memory reuse, use [`DeviceMemory::free_async`] to free memory allocated via the stream ordered memory allocator.
708    /// For all other pointers, this call may perform implicit synchronization.
709    ///
710    /// If [`DeviceMemory::free`] has already been called before, an error is returned.
711    /// If `ptr` is null, no operation is performed.
712    /// [`DeviceMemory::free`] returns an error on failure.
713    ///
714    /// The device version of [`DeviceMemory::free`] cannot be used with a pointer allocated using the host API, and vice versa.
715    ///
716    /// # Errors
717    ///
718    /// Returns an error if CUDA cannot free `ptr`, `ptr` has already been
719    /// freed, a previous asynchronous launch reports an error, or CUDA reports
720    /// runtime initialization diagnostics.
721    ///
722    /// # Safety
723    ///
724    /// `ptr` must be null or a live allocation returned by a compatible CUDA
725    /// device allocation function, and no work may access it after it is freed.
726    pub unsafe fn free(ptr: *mut T) -> Result<()> {
727        unsafe {
728            try_ffi!(runtime::cudaFree(ptr.cast()))?;
729        }
730        Ok(())
731    }
732
733    /// Copies `count` elements from `src` to `dst`.
734    /// The transfer direction is specified by [`MemoryCopyKind`].
735    /// [`MemoryCopyKind::Default`] is recommended when unified virtual addressing is available, in which case the transfer direction is inferred from the pointer values.
736    /// Calling [`DeviceMemory::copy`] with `dst` and `src` pointers that do not match the direction of the copy results in undefined behavior.
737    ///
738    /// * Exhibits `synchronous` behavior for most use cases.
739    /// * Memory regions requested must be either entirely registered with CUDA, or in the case of host pageable transfers, not registered
740    ///   at all.
741    ///   Memory regions spanning over allocations that are both registered and not registered with CUDA are not supported and
742    ///   return [`crate::error::Status::InvalidValue`].
743    ///
744    /// # Errors
745    ///
746    /// Returns an error if the requested byte count overflows, CUDA rejects the
747    /// pointer combination or copy kind, a previous asynchronous launch reports
748    /// an error, or CUDA reports runtime initialization diagnostics.
749    ///
750    /// # Safety
751    ///
752    /// `src` and `dst` must be valid for `count` elements of `T` according to
753    /// `kind`, and the source and destination regions must not overlap unless
754    /// CUDA permits that transfer.
755    pub unsafe fn copy(
756        dst: *mut T,
757        src: *const T,
758        count: usize,
759        kind: MemoryCopyKind,
760    ) -> Result<()> {
761        let Some(bytes) = count.checked_mul(size_of::<T>()) else {
762            return Err(Error::InvalidMemoryAllocationRequest);
763        };
764        unsafe {
765            try_ffi!(runtime::cudaMemcpy(
766                dst.cast(),
767                src.cast(),
768                bytes as _,
769                kind.into(),
770            ))?;
771        }
772        Ok(())
773    }
774
775    /// Fills the first `count` bytes of the memory area pointed to by `ptr` with the constant byte `value`.
776    ///
777    /// This call is asynchronous with respect to the host unless `ptr` refers to pinned host memory.
778    ///
779    /// See the CUDA memset synchronization rules for when this operation blocks
780    /// the host.
781    ///
782    /// # Errors
783    ///
784    /// Returns an error if the requested byte count overflows, CUDA rejects the
785    /// pointer or size, a previous asynchronous launch reports an error, or CUDA
786    /// reports runtime initialization diagnostics.
787    ///
788    /// # Safety
789    ///
790    /// `dst` must be valid for writes of `count * size_of::<T>()` bytes and
791    /// must refer to memory that CUDA can memset.
792    pub unsafe fn set(dst: *mut T, value: u8, count: usize) -> Result<()> {
793        let Some(bytes) = count.checked_mul(size_of::<T>()) else {
794            return Err(Error::InvalidMemoryAllocationRequest);
795        };
796        unsafe {
797            try_ffi!(runtime::cudaMemset(dst.cast(), value.into(), bytes as _))?;
798        }
799        Ok(())
800    }
801
802    pub unsafe fn alloc_host(size: usize) -> Result<*mut ()> {
803        let mut ptr = ptr::null_mut();
804        unsafe {
805            try_ffi!(runtime::cudaMallocHost(
806                &raw mut ptr,
807                size as runtime::size_t
808            ))?;
809        }
810        Ok(ptr.cast())
811    }
812
813    /// Frees host memory returned by [`DeviceMemory::alloc_host`] or [`DeviceMemory::alloc_pinned`].
814    ///
815    /// # Errors
816    ///
817    /// Returns an error if CUDA cannot free the host allocation, a previous
818    /// asynchronous launch reports an error, or CUDA reports runtime
819    /// initialization diagnostics.
820    ///
821    /// # Safety
822    ///
823    /// `ptr` must be null or a live host allocation returned by a compatible
824    /// CUDA host allocation function.
825    pub unsafe fn free_host(ptr: *mut ()) -> Result<()> {
826        unsafe { try_ffi!(runtime::cudaFreeHost(ptr.cast())) }
827    }
828
829    /// Allocates size bytes of host memory that is page-locked and accessible to the device.
830    /// The driver tracks the allocated virtual memory ranges and automatically accelerates calls such as [`DeviceMemory::copy`].
831    /// Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory obtained with functions such as `malloc()`.
832    /// Allocating excessive amounts of pinned memory may degrade system performance, since it reduces the amount of memory available to the system for paging.
833    /// As a result, use this sparingly to allocate staging areas for data exchange between host and device.
834    ///
835    /// `flags` selects allocation options:
836    ///
837    /// * [`HostAllocationFlags::DEFAULT`]: equivalent to [`DeviceMemory::alloc_host`].
838    /// * [`HostAllocationFlags::PORTABLE`]: the memory returned by this call is considered pinned memory by all CUDA contexts, not just the one that performed
839    ///   the allocation.
840    /// * [`HostAllocationFlags::MAPPED`]: maps the allocation into the CUDA address space.
841    ///   The device pointer to the memory may be obtained by calling [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer).
842    /// * [`HostAllocationFlags::WRITE_COMBINED`]: allocates the memory as write-combined (WC).
843    ///   WC memory can be transferred across the PCI Express bus more quickly on some
844    ///   system configurations, but cannot be read efficiently by most CPUs.
845    ///   WC memory is a good option for buffers written
846    ///   by the CPU and read by the device via mapped pinned memory or host-&gt;device transfers.
847    ///
848    /// All of these flags are orthogonal to one another: a developer may allocate memory that is portable, mapped and/or write-combined with no restrictions.
849    ///
850    /// For [`HostAllocationFlags::MAPPED`] to have any effect, the CUDA context must support [`ContextFlags::MAP_HOST`](crate::context::ContextFlags::MAP_HOST), which can be checked via [`Device::flags`](crate::device::Device::flags).
851    /// [`ContextFlags::MAP_HOST`](crate::context::ContextFlags::MAP_HOST) is implicitly set for contexts created via the runtime API.
852    ///
853    /// [`HostAllocationFlags::MAPPED`] may be specified on CUDA contexts for devices that do not support mapped pinned memory.
854    /// The failure is deferred to [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) because the memory may be mapped into other CUDA contexts via [`HostAllocationFlags::PORTABLE`].
855    ///
856    /// Memory allocated by this method must be freed with [`DeviceMemory::free_host`].
857    ///
858    /// # Errors
859    ///
860    /// Returns an error if CUDA cannot allocate pinned host memory, a previous
861    /// asynchronous launch reports an error, or CUDA reports runtime
862    /// initialization diagnostics.
863    ///
864    /// # Safety
865    ///
866    /// The returned pointer is uninitialized host memory. The caller must ensure
867    /// it is accessed within `size` bytes and freed with [`DeviceMemory::free_host`].
868    pub unsafe fn alloc_pinned(size: usize, flags: HostAllocationFlags) -> Result<*mut ()> {
869        let mut ptr = ptr::null_mut();
870        unsafe {
871            try_ffi!(runtime::cudaHostAlloc(
872                &raw mut ptr,
873                size as _,
874                flags.bits()
875            ))?;
876        }
877        Ok(ptr.cast())
878    }
879
880    /// Page-locks the memory range specified by `ptr` and `size`, and maps it for the devices selected by `flags`.
881    /// This memory range also is added to the same tracking mechanism as [`DeviceMemory::alloc_pinned`] to automatically accelerate calls to functions such as [`DeviceMemory::copy`].
882    /// Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory that has not been registered.
883    /// Page-locking excessive amounts of memory may degrade system performance, since it reduces the amount of memory available to the system for paging.
884    /// As a result, use this sparingly to register staging areas for data exchange between host and device.
885    ///
886    /// On systems where [`DeviceProperties::pageable_memory_access_uses_host_page_tables`](crate::device::DeviceProperties::pageable_memory_access_uses_host_page_tables) is enabled, [`DeviceMemory::register_host`] does not page-lock the memory range specified by `ptr` and instead only populates unpopulated pages.
887    ///
888    /// [`DeviceMemory::register_host`] is supported only on I/O coherent devices where [`DeviceProperties::host_register_supported`](crate::device::DeviceProperties::host_register_supported) is enabled.
889    ///
890    /// `flags` selects registration options:
891    ///
892    /// * [`HostRegisterFlags::DEFAULT`]: on a system with unified virtual addressing, the memory is both mapped and portable.
893    ///   On a system with no unified virtual addressing, the memory is neither mapped nor portable.
894    ///
895    /// * [`HostRegisterFlags::PORTABLE`]: the memory returned by this call is considered pinned memory by all CUDA contexts, not just the one that performed
896    ///   the allocation.
897    ///
898    /// * [`HostRegisterFlags::MAPPED`]: maps the allocation into the CUDA address space.
899    ///   The device pointer to the memory may be obtained by calling [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer).
900    ///
901    /// * [`HostRegisterFlags::IO_MEMORY`]: the passed memory pointer is treated as pointing to some memory-mapped I/O space, for example belonging to a third-party PCIe device,
902    ///   and it is marked as non-cache-coherent and contiguous.
903    ///
904    /// * [`HostRegisterFlags::READ_ONLY`]: the passed memory pointer is treated as pointing to memory that is considered read-only by the device.
905    ///   On platforms without
906    ///   [`DeviceProperties::pageable_memory_access_uses_host_page_tables`](crate::device::DeviceProperties::pageable_memory_access_uses_host_page_tables), this flag is required to register memory mapped to the CPU as read-only.
907    ///   Query support with [`DeviceProperties::host_register_read_only_supported`](crate::device::DeviceProperties::host_register_read_only_supported).
908    ///   Using this flag with a current context associated with a device that does not have this attribute set makes [`DeviceMemory::register_host`] return [`crate::error::Status::NotSupported`].
909    ///
910    /// All of these flags are orthogonal to one another: a developer may page-lock memory that is portable or mapped with no restrictions.
911    ///
912    /// The CUDA context must have been created with [`ContextFlags::MAP_HOST`](crate::context::ContextFlags::MAP_HOST) for [`HostRegisterFlags::MAPPED`] to have any effect.
913    ///
914    /// [`HostRegisterFlags::MAPPED`] may be specified on CUDA contexts for devices that do not support mapped pinned memory.
915    /// The failure is deferred to [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) because the memory may be mapped into other CUDA contexts via [`HostRegisterFlags::PORTABLE`].
916    ///
917    /// On devices where [`DeviceProperties::can_use_host_pointer_for_registered_mem`](crate::device::DeviceProperties::can_use_host_pointer_for_registered_mem) is enabled, the memory can also be accessed from the device using the original host pointer.
918    /// The device pointer returned by [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) may or may not match the original host pointer and depends on the devices visible to the application.
919    /// If all devices visible to the application have a non-zero value for the device attribute, the device pointer returned by [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) matches the original pointer.
920    /// If any device visible to the application has a zero value for the device attribute, the device pointer returned by [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) does not match the original host pointer, but is suitable for use on all devices provided Unified Virtual Addressing is enabled.
921    /// In such systems, it is valid to access the memory using either pointer on devices that have a non-zero value for the device attribute.
922    /// Such devices must access the memory through only one of the two pointers, not both.
923    ///
924    /// The memory page-locked by this method must be unregistered with [`DeviceMemory::unregister_host`].
925    ///
926    /// # Errors
927    ///
928    /// Returns an error if CUDA cannot register the host range, the pointer,
929    /// size, or flags are invalid, a previous asynchronous launch reports an
930    /// error, or CUDA reports runtime initialization diagnostics.
931    ///
932    /// # Safety
933    ///
934    /// `ptr..ptr + size` must be a valid host memory range and must remain valid
935    /// until it is unregistered.
936    pub unsafe fn register_host(ptr: *mut (), size: usize, flags: HostRegisterFlags) -> Result<()> {
937        unsafe {
938            try_ffi!(runtime::cudaHostRegister(
939                ptr.cast(),
940                size as _,
941                flags.bits()
942            ))?;
943        }
944        Ok(())
945    }
946
947    /// Unmaps the memory range whose base address is specified by `ptr`, and makes it pageable again.
948    ///
949    /// The base address must be the same one specified to [`DeviceMemory::register_host`].
950    ///
951    /// # Errors
952    ///
953    /// Returns an error if CUDA cannot unregister the host range, `ptr` is not
954    /// the base address of a registered range, a previous asynchronous launch
955    /// reports an error, or CUDA reports runtime initialization diagnostics.
956    ///
957    /// # Safety
958    ///
959    /// `ptr` must be the base address of a host range registered with
960    /// [`DeviceMemory::register_host`] and must not be unregistered twice.
961    pub unsafe fn unregister_host(ptr: *mut ()) -> Result<()> {
962        unsafe { try_ffi!(runtime::cudaHostUnregister(ptr.cast())) }
963    }
964
965    /// Returns the total amount of memory available to the current context and the amount of memory free on the device.
966    /// CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
967    /// In a multi-tenant situation, the free-memory estimate is prone to a race condition: an allocation or free by another process or thread between estimation and reporting can make the reported free value differ from actual free memory.
968    ///
969    /// The integrated GPU on Tegra shares memory with CPU and other component of the SoC.
970    /// The free and total values returned by this call exclude the SWAP memory space maintained by the OS on some platforms.
971    /// The OS may move some of the memory pages into swap area as the GPU or CPU allocate or access memory.
972    /// See Tegra app note on how to calculate total and free memory on Tegra.
973    ///
974    /// # Errors
975    ///
976    /// Returns an error if CUDA cannot query memory information, a previous
977    /// asynchronous launch reports an error, or CUDA reports runtime
978    /// initialization diagnostics.
979    pub fn memory_info() -> Result<(usize, usize)> {
980        let mut free: runtime::size_t = 0;
981        let mut total: runtime::size_t = 0;
982        unsafe {
983            try_ffi!(runtime::cudaMemGetInfo(&raw mut free, &raw mut total))?;
984        }
985        Ok((free as usize, total as usize))
986    }
987
988    /// Returns the attributes of `ptr`.
989    /// If `ptr` was not allocated in, mapped by, or registered with a context that supports unified addressing, [`crate::error::Status::InvalidValue`] is returned.
990    ///
991    /// In CUDA 11.0 and later, passing a host pointer reports [`MemoryType::Unregistered`] in [`PointerAttributes::memory_type`].
992    ///
993    /// * [`PointerAttributes::memory_type`] identifies the type of memory.
994    ///   It can be [`MemoryType::Unregistered`] for unregistered host memory, [`MemoryType::Host`] for registered host memory, [`MemoryType::Device`] for device memory, or [`MemoryType::Managed`] for managed memory.
995    ///
996    /// * [`PointerAttributes::device`] is the device against which `ptr` was allocated.
997    ///   If `ptr` has memory type [`MemoryType::Device`], this identifies the device on which the memory physically resides.
998    ///   If `ptr` has memory type [`MemoryType::Host`], this identifies the device that was current when the allocation was made, and if that device is deinitialized then
999    ///   this allocation will vanish with that device's state.
1000    ///
1001    /// * [`PointerAttributes::device_pointer`] is the device pointer alias through which the memory referred to by `ptr` may be accessed on the current device.
1002    ///   If the memory referred to by `ptr` cannot be accessed directly by the current device then this is null.
1003    ///
1004    /// * [`PointerAttributes::host_pointer`] is the host pointer alias through which the memory referred to by `ptr` may be accessed on the host.
1005    ///   If the memory referred to by `ptr` cannot be accessed directly by the host then this is null.
1006    ///
1007    /// # Errors
1008    ///
1009    /// Returns an error if CUDA cannot query attributes for `ptr`, `ptr` is not
1010    /// known to a unified-addressing context, or CUDA reports runtime
1011    /// initialization diagnostics.
1012    pub fn pointer_attributes(ptr: *const T) -> Result<PointerAttributes> {
1013        let mut attr_ffi = MaybeUninit::<runtime::cudaPointerAttributes>::uninit();
1014        unsafe {
1015            try_ffi!(runtime::cudaPointerGetAttributes(
1016                attr_ffi.as_mut_ptr(),
1017                ptr.cast(),
1018            ))?;
1019            // Safety: FFI call successful, attr_ffi is initialized.
1020            Ok(attr_ffi.assume_init().into())
1021        }
1022    }
1023
1024    pub unsafe fn alloc_async(count: usize, stream: &Stream) -> Result<*mut T> {
1025        let Some(bytes) = count.checked_mul(size_of::<T>()) else {
1026            return Err(Error::InvalidMemoryAllocationRequest);
1027        };
1028        if bytes == 0 {
1029            return Ok(ptr::null_mut());
1030        }
1031        let mut p = ptr::null_mut();
1032        unsafe {
1033            try_ffi!(runtime::cudaMallocAsync(
1034                &raw mut p,
1035                bytes as _,
1036                stream.as_raw()
1037            ))?;
1038        }
1039        Ok(p.cast::<T>())
1040    }
1041
1042    /// Inserts a free operation into `stream`.
1043    /// The allocation must not be accessed after stream execution reaches the free.
1044    /// After this call returns, accessing the memory from any subsequent work launched on the GPU or querying its pointer attributes results in undefined behavior.
1045    ///
1046    /// During stream capture, this creates a free node and must therefore be passed the address of a graph allocation.
1047    ///
1048    /// # Errors
1049    ///
1050    /// Returns an error if CUDA cannot enqueue the free on `stream`, `ptr` is
1051    /// invalid for asynchronous freeing, a previous asynchronous launch reports
1052    /// an error, or CUDA reports runtime initialization diagnostics.
1053    ///
1054    /// # Safety
1055    ///
1056    /// `ptr` must be null or a live stream-ordered CUDA allocation. No work may
1057    /// access it after `stream` reaches the enqueued free.
1058    pub unsafe fn free_async(ptr: *mut T, stream: &Stream) -> Result<()> {
1059        if ptr.is_null() {
1060            return Ok(());
1061        }
1062        unsafe { try_ffi!(runtime::cudaFreeAsync(ptr.cast(), stream.as_raw())) }
1063    }
1064
1065    pub unsafe fn copy_async(
1066        dst: *mut T,
1067        src: *const T,
1068        count: usize,
1069        kind: MemoryCopyKind,
1070        stream: &Stream,
1071    ) -> Result<()> {
1072        if count == 0 {
1073            return Ok(());
1074        }
1075        let Some(bytes) = count.checked_mul(size_of::<T>()) else {
1076            return Err(Error::InvalidMemoryAllocationRequest);
1077        };
1078        unsafe {
1079            try_ffi!(runtime::cudaMemcpyAsync(
1080                dst.cast(),
1081                src.cast(),
1082                bytes as _,
1083                kind.into(),
1084                stream.as_raw(),
1085            ))?;
1086        }
1087        Ok(())
1088    }
1089
1090    /// Fills the first `count` bytes of the memory area pointed to by `ptr` with the constant byte `value`.
1091    ///
1092    /// [`DeviceMemory::set_async`] is asynchronous with respect to the host, so the call may return before the memset is complete.
1093    /// The operation can optionally be associated with a stream by passing a non-zero stream argument.
1094    /// If `stream` is non-zero, the operation may overlap with operations in other streams.
1095    ///
1096    /// The device version only handles device-to-device copies and cannot be given local or shared pointers.
1097    ///
1098    /// See the CUDA memset synchronization rules for when this operation blocks
1099    /// the host.
1100    ///
1101    /// # Errors
1102    ///
1103    /// Returns an error if the requested byte count overflows, CUDA cannot
1104    /// enqueue the memset on `stream`, a previous asynchronous launch reports an
1105    /// error, or CUDA reports runtime initialization diagnostics.
1106    ///
1107    /// # Safety
1108    ///
1109    /// `dst` must be valid for writes of `count * size_of::<T>()` bytes until
1110    /// `stream` reaches the enqueued memset.
1111    pub unsafe fn set_async(dst: *mut T, value: u8, count: usize, stream: &Stream) -> Result<()> {
1112        if count == 0 {
1113            return Ok(());
1114        }
1115        let Some(bytes) = count.checked_mul(size_of::<T>()) else {
1116            return Err(Error::InvalidMemoryAllocationRequest);
1117        };
1118        unsafe {
1119            try_ffi!(runtime::cudaMemsetAsync(
1120                dst.cast(),
1121                value.into(),
1122                bytes as _,
1123                stream.as_raw(),
1124            ))?;
1125        }
1126        Ok(())
1127    }
1128
1129    /// Prefetches memory to the specified destination location.
1130    /// `ptr` is the base device pointer of the memory to be prefetched, `location` specifies the destination location, `count` specifies the number of bytes to copy, and `stream` is the stream in which the operation is enqueued.
1131    /// The memory range must refer to managed memory allocated via [`DeviceMemory::alloc_managed`] or declared via `__managed__` variables. It may also refer to memory allocated from a managed memory pool, or to system-allocated memory on systems where [`DeviceProperties::pageable_memory_access`](crate::device::DeviceProperties::pageable_memory_access) is enabled.
1132    ///
1133    /// Setting [`MemoryLocation::kind`](crate::memory::MemoryLocation::kind) to [`MemoryLocationKind::Device`] prefetches memory to the GPU identified by [`MemoryLocation::id`](crate::memory::MemoryLocation::id). That device, and the device associated with `stream`, must support concurrent managed access.
1134    /// Setting [`MemoryLocation::kind`](crate::memory::MemoryLocation::kind) to [`MemoryLocationKind::Host`] prefetches data to host memory.
1135    /// Applications can request prefetching memory to a specific host NUMA node by using [`MemoryLocationKind::Numa`] with a valid NUMA node identifier, or to the NUMA node closest to the current thread's CPU by using [`MemoryLocationKind::NumaCurrent`].
1136    /// When [`MemoryLocation::kind`](crate::memory::MemoryLocation::kind) is [`MemoryLocationKind::Host`] or [`MemoryLocationKind::NumaCurrent`], [`MemoryLocation::id`](crate::memory::MemoryLocation::id) is ignored.
1137    ///
1138    /// The start and end addresses of the memory range are rounded down and up, respectively, to CPU page-size alignment before the prefetch operation is enqueued in the stream.
1139    ///
1140    /// If no physical memory has been allocated for this region, CUDA populates and maps it on the destination device.
1141    /// If there is insufficient memory to prefetch the desired region, the Unified Memory driver may evict pages from other [`DeviceMemory::alloc_managed`] allocations to host memory to make room.
1142    /// Device memory allocated using [`DeviceMemory::alloc`] or [`sys::cudaMallocArray`](singe_cuda_sys::runtime::cudaMallocArray) is not evicted.
1143    ///
1144    /// By default, mappings to the previous location of the migrated pages are removed and mappings for the new location are only set up at the destination.
1145    /// The exact behavior also depends on the settings applied to this memory range via `cuMemAdvise` as described below:
1146    ///
1147    /// If read-mostly advice was set on any subset of this memory range, then that subset will create a read-only copy of the pages at the destination location.
1148    /// If the destination location is a host NUMA node, any pages of that subset that are already in another host NUMA node are transferred to the destination.
1149    ///
1150    /// If preferred-location advice was set on any subset of this memory range, then the pages will migrate to `location` even if it is not the preferred location of every page in the range.
1151    ///
1152    /// If accessed-by advice was set on any subset of this memory range, then mappings to those pages from all appropriate processors are updated to refer to the new location if establishing such a mapping is possible.
1153    /// Otherwise, those mappings are cleared.
1154    ///
1155    /// This is not required for correctness; it improves performance by allowing the application to migrate data to a suitable location before access.
1156    /// Memory accesses to this range are always coherent and are allowed even when the data is actively being migrated.
1157    ///
1158    /// This call is asynchronous with respect to the host and all work on other devices.
1159    ///
1160    /// # Errors
1161    ///
1162    /// Returns an error if CUDA cannot enqueue the prefetch on `stream`, the
1163    /// memory range or destination location is invalid, a previous asynchronous
1164    /// launch reports an error, or CUDA reports runtime initialization
1165    /// diagnostics.
1166    pub fn prefetch_async(
1167        ptr: DevicePtr,
1168        count: usize,
1169        location: MemoryLocation,
1170        stream: &Stream,
1171    ) -> Result<()> {
1172        if count == 0 {
1173            return Ok(());
1174        }
1175        unsafe {
1176            try_ffi!(runtime::cudaMemPrefetchAsync(
1177                ptr.as_ptr() as _,
1178                count as _,
1179                location.into(),
1180                0, // flags
1181                stream.as_raw()
1182            ))?;
1183        }
1184        Ok(())
1185    }
1186}
1187
1188// Safety: DeviceMemory owns a CUDA allocation and does not provide CPU
1189// references into the allocation. Moving the owner to another thread is safe
1190// when `T: Send` because CUDA allocation/free operations are synchronized by
1191// the CUDA context APIs used by this wrapper. Shared references are safe when
1192// `T: Sync` because `&DeviceMemory<T>` only exposes raw device pointers and
1193// read-only metadata; mutation of device memory still requires CUDA operations
1194// whose safety contracts are enforced by the calling APIs.
1195unsafe impl<T: Send> Send for DeviceMemory<T> {}
1196unsafe impl<T: Sync> Sync for DeviceMemory<T> {}
1197
1198// Safety: managed memory has the same Rust-side ownership properties as
1199// DeviceMemory. CUDA maintains coherence for managed allocations across host
1200// threads and devices; Rust references into the allocation are not exposed by
1201// this wrapper.
1202unsafe impl<T: DeviceRepr + Send> Send for ManagedMemory<T> {}
1203unsafe impl<T: DeviceRepr + Sync> Sync for ManagedMemory<T> {}
1204
1205impl<T> DeviceMemory<T> {
1206    /// Takes ownership of an existing device allocation.
1207    ///
1208    /// # Safety
1209    ///
1210    /// `ptr` must be null for an empty allocation or point to `length` live
1211    /// elements allocated by `cudaMallocManaged` or another CUDA allocation
1212    /// function compatible with `cudaFree`. `length * size_of::<T>()` must fit
1213    /// in `usize`.
1214    /// No other owner may free the pointer while the returned value is alive.
1215    pub unsafe fn from_raw_parts(ptr: *mut T, length: usize) -> Self {
1216        Self {
1217            ptr,
1218            length,
1219            _phantom: PhantomData,
1220        }
1221    }
1222
1223    pub fn into_raw_parts(self) -> (*mut T, usize) {
1224        let ptr = self.ptr;
1225        let length = self.length;
1226        mem::forget(self);
1227        (ptr, length)
1228    }
1229
1230    pub fn create(length: usize) -> Result<Self> {
1231        let size_t = size_of::<T>();
1232
1233        if size_t == 0 {
1234            if length == 0 {
1235                return Ok(Self {
1236                    ptr: ptr::null_mut(), // No allocation needed for ZSTs with count 0
1237                    length: 0,
1238                    _phantom: PhantomData,
1239                });
1240            }
1241            return Err(Error::InvalidMemoryAllocationRequest);
1242        }
1243
1244        // Ensure allocation size doesn't overflow usize when calculating bytes internally in `alloc`.
1245        if length > (usize::MAX / size_t) {
1246            return Err(Error::InvalidMemoryAllocationRequest);
1247        }
1248
1249        if length == 0 {
1250            Ok(Self {
1251                ptr: ptr::null_mut(),
1252                length: 0,
1253                _phantom: PhantomData,
1254            })
1255        } else {
1256            let device_ptr = unsafe { Self::alloc(length)? };
1257
1258            Ok(Self {
1259                ptr: device_ptr,
1260                length,
1261                _phantom: PhantomData,
1262            })
1263        }
1264    }
1265
1266    pub fn zeroes(length: usize) -> Result<Self> {
1267        let mut mem = Self::create(length)?;
1268        mem.set_zeroes()?;
1269        Ok(mem)
1270    }
1271
1272    pub fn from_slice(v: &[T]) -> Result<Self> {
1273        let mut mem = Self::create(v.len())?;
1274        mem.copy_from_host(v)?;
1275        Ok(mem)
1276    }
1277
1278    /// # Safety
1279    ///
1280    /// The caller must ensure `v` remains valid and unmodified until `stream`
1281    /// has completed the transfer.
1282    ///
1283    /// # Errors
1284    ///
1285    /// Returns an error if CUDA cannot allocate device memory or enqueue the
1286    /// host-to-device copy.
1287    pub unsafe fn from_slice_async(v: &[T], stream: &Stream) -> Result<Self> {
1288        let mut mem = Self::create(v.len())?;
1289        unsafe {
1290            mem.copy_from_host_async_unchecked(v, stream)?;
1291        }
1292        Ok(mem)
1293    }
1294
1295    pub const fn len(&self) -> usize {
1296        self.length
1297    }
1298
1299    pub const fn is_empty(&self) -> bool {
1300        self.length == 0
1301    }
1302
1303    pub fn byte_len(&self) -> usize {
1304        self.length
1305            .checked_mul(size_of::<T>())
1306            .expect("device memory byte length overflow")
1307    }
1308
1309    pub const fn as_ptr(&self) -> *const T {
1310        self.ptr
1311    }
1312
1313    pub const fn as_mut_ptr(&self) -> *mut T {
1314        self.ptr
1315    }
1316
1317    pub fn copy_from_host(&mut self, host_slice: &[T]) -> Result<()> {
1318        if host_slice.len() != self.length {
1319            return Err(Error::InvalidMemoryAccess);
1320        }
1321        if self.length == 0 {
1322            return Ok(());
1323        }
1324        unsafe {
1325            Self::copy(
1326                self.ptr,
1327                host_slice.as_ptr(),
1328                self.length,
1329                MemoryCopyKind::HostToDevice,
1330            )
1331        }
1332    }
1333
1334    pub fn copy_from_host_async<'scope, 'env>(
1335        &mut self,
1336        host_slice: &'env [T],
1337        stream: &StreamScope<'scope, 'env>,
1338    ) -> Result<()> {
1339        unsafe { self.copy_from_host_async_unchecked(host_slice, stream.stream()) }
1340    }
1341
1342    /// # Safety
1343    ///
1344    /// The caller must ensure `self` and `host_slice` both remain valid until
1345    /// `stream` has completed the transfer.
1346    pub unsafe fn copy_from_host_async_unchecked(
1347        &mut self,
1348        host_slice: &[T],
1349        stream: &Stream,
1350    ) -> Result<()> {
1351        if host_slice.len() != self.len() {
1352            return Err(Error::InvalidMemoryAccess);
1353        }
1354        if self.is_empty() {
1355            return Ok(());
1356        }
1357        unsafe {
1358            Self::copy_async(
1359                self.as_mut_ptr(),
1360                host_slice.as_ptr(),
1361                self.len(),
1362                MemoryCopyKind::HostToDevice,
1363                stream,
1364            )
1365        }
1366    }
1367
1368    /// Returns a capture operation that copies from host memory into this device allocation.
1369    ///
1370    /// # Safety
1371    ///
1372    /// Capturing this operation stores the host and device pointer addresses in
1373    /// the resulting CUDA graph. The caller must ensure `self` and `host_slice`
1374    /// remain valid whenever a captured graph using this operation is launched.
1375    /// The destination allocation must remain exclusive for the work ordered by
1376    /// those launches.
1377    pub unsafe fn copy_from_host_operation<'a>(
1378        &'a mut self,
1379        host_slice: &'a [T],
1380    ) -> Result<MemoryCopyOperation<'a, T>> {
1381        if host_slice.len() != self.len() {
1382            return Err(Error::InvalidMemoryAccess);
1383        }
1384        Ok(unsafe {
1385            MemoryCopyOperation::new(
1386                self.as_mut_ptr(),
1387                host_slice.as_ptr(),
1388                self.len(),
1389                MemoryCopyKind::HostToDevice,
1390            )
1391        })
1392    }
1393
1394    pub fn copy_to_host(&self, host_slice: &mut [T]) -> Result<()> {
1395        if host_slice.len() != self.length {
1396            return Err(Error::InvalidMemoryAccess);
1397        }
1398        if self.length == 0 {
1399            return Ok(());
1400        }
1401        unsafe {
1402            Self::copy(
1403                host_slice.as_mut_ptr(),
1404                self.ptr,
1405                self.length,
1406                MemoryCopyKind::DeviceToHost,
1407            )
1408        }
1409    }
1410
1411    pub fn copy_to_host_async<'scope, 'env>(
1412        &self,
1413        host_slice: &'env mut [T],
1414        stream: &StreamScope<'scope, 'env>,
1415    ) -> Result<()> {
1416        unsafe { self.copy_to_host_async_unchecked(host_slice, stream.stream()) }
1417    }
1418
1419    /// # Safety
1420    ///
1421    /// The caller must ensure `self` and `host_slice` both remain valid until
1422    /// `stream` has completed the transfer.
1423    pub unsafe fn copy_to_host_async_unchecked(
1424        &self,
1425        host_slice: &mut [T],
1426        stream: &Stream,
1427    ) -> Result<()> {
1428        if host_slice.len() != self.len() {
1429            return Err(Error::InvalidMemoryAccess);
1430        }
1431        if self.is_empty() {
1432            return Ok(());
1433        }
1434        unsafe {
1435            Self::copy_async(
1436                host_slice.as_mut_ptr(),
1437                self.as_ptr(),
1438                self.len(),
1439                MemoryCopyKind::DeviceToHost,
1440                stream,
1441            )
1442        }
1443    }
1444
1445    /// Returns a capture operation that copies this allocation into host memory.
1446    ///
1447    /// # Safety
1448    ///
1449    /// Capturing this operation stores the device and host pointer addresses in
1450    /// the resulting CUDA graph. The caller must ensure `self` and `host_slice`
1451    /// remain valid whenever a captured graph using this operation is launched.
1452    /// The host destination must remain exclusive for the work ordered by those
1453    /// launches.
1454    pub unsafe fn copy_to_host_operation<'a>(
1455        &'a self,
1456        host_slice: &'a mut [T],
1457    ) -> Result<MemoryCopyOperation<'a, T>> {
1458        if host_slice.len() != self.len() {
1459            return Err(Error::InvalidMemoryAccess);
1460        }
1461        Ok(unsafe {
1462            MemoryCopyOperation::new(
1463                host_slice.as_mut_ptr(),
1464                self.as_ptr(),
1465                self.len(),
1466                MemoryCopyKind::DeviceToHost,
1467            )
1468        })
1469    }
1470
1471    pub fn copy_to_host_vec(&self) -> Result<Vec<T>> {
1472        if size_of::<T>() == 0 {
1473            return Err(Error::InvalidMemoryAllocationRequest);
1474        }
1475
1476        if self.length == 0 {
1477            return Ok(Vec::new());
1478        }
1479
1480        let mut host_vec = Vec::<T>::with_capacity(self.length);
1481
1482        unsafe {
1483            Self::copy(
1484                host_vec.as_mut_ptr(),
1485                self.ptr,
1486                self.length,
1487                MemoryCopyKind::DeviceToHost,
1488            )?;
1489
1490            host_vec.set_len(self.length);
1491        }
1492
1493        Ok(host_vec)
1494    }
1495
1496    pub fn copy_from_device(&mut self, src: &Self) -> Result<()> {
1497        if src.len() != self.length {
1498            return Err(Error::InvalidMemoryAccess);
1499        }
1500        if self.length == 0 {
1501            return Ok(());
1502        }
1503        unsafe {
1504            Self::copy(
1505                self.ptr,
1506                src.as_ptr(),
1507                self.length,
1508                MemoryCopyKind::DeviceToDevice,
1509            )
1510        }
1511    }
1512
1513    pub fn copy_from_device_async<'scope, 'env>(
1514        &mut self,
1515        src: &Self,
1516        stream: &StreamScope<'scope, 'env>,
1517    ) -> Result<()> {
1518        unsafe { self.copy_from_device_async_unchecked(src, stream.stream()) }
1519    }
1520
1521    /// # Safety
1522    ///
1523    /// The caller must ensure `self` and `src` both remain valid until
1524    /// `stream` has completed the transfer.
1525    pub unsafe fn copy_from_device_async_unchecked(
1526        &mut self,
1527        src: &Self,
1528        stream: &Stream,
1529    ) -> Result<()> {
1530        if src.len() != self.len() {
1531            return Err(Error::InvalidMemoryAccess);
1532        }
1533        if self.is_empty() {
1534            return Ok(());
1535        }
1536        unsafe {
1537            Self::copy_async(
1538                self.as_mut_ptr(),
1539                src.as_ptr(),
1540                self.len(),
1541                MemoryCopyKind::DeviceToDevice,
1542                stream,
1543            )
1544        }
1545    }
1546
1547    /// Returns a capture operation that copies from another device allocation into this allocation.
1548    ///
1549    /// # Safety
1550    ///
1551    /// Capturing this operation stores both device pointer addresses in the
1552    /// resulting CUDA graph. The caller must ensure `self` and `src` remain
1553    /// valid whenever a captured graph using this operation is launched. The
1554    /// destination allocation must remain exclusive for the work ordered by
1555    /// those launches.
1556    pub unsafe fn copy_from_device_operation<'a>(
1557        &'a mut self,
1558        src: &'a Self,
1559    ) -> Result<MemoryCopyOperation<'a, T>> {
1560        if src.len() != self.len() {
1561            return Err(Error::InvalidMemoryAccess);
1562        }
1563        Ok(unsafe {
1564            MemoryCopyOperation::new(
1565                self.as_mut_ptr(),
1566                src.as_ptr(),
1567                self.len(),
1568                MemoryCopyKind::DeviceToDevice,
1569            )
1570        })
1571    }
1572
1573    pub fn set_zeroes(&mut self) -> Result<()> {
1574        if self.length == 0 {
1575            return Ok(());
1576        }
1577        unsafe { Self::set(self.ptr, 0, self.length) }
1578    }
1579
1580    pub fn set_value(&mut self, value: u8) -> Result<()> {
1581        if self.length == 0 {
1582            return Ok(());
1583        }
1584        unsafe { Self::set(self.ptr, value, self.length) }
1585    }
1586
1587    pub fn set_value_async<'scope, 'env>(
1588        &mut self,
1589        value: u8,
1590        stream: &StreamScope<'scope, 'env>,
1591    ) -> Result<()> {
1592        unsafe { self.set_value_async_unchecked(value, stream.stream()) }
1593    }
1594
1595    /// # Safety
1596    ///
1597    /// The caller must ensure `self` remains valid until `stream` has
1598    /// completed the memset.
1599    ///
1600    /// # Errors
1601    ///
1602    /// Returns an error if CUDA cannot enqueue the memset on `stream`.
1603    pub unsafe fn set_value_async_unchecked(&mut self, value: u8, stream: &Stream) -> Result<()> {
1604        if self.is_empty() {
1605            return Ok(());
1606        }
1607        unsafe { Self::set_async(self.as_mut_ptr(), value, self.len(), stream) }
1608    }
1609
1610    /// Returns a capture operation that fills this device allocation with `value`.
1611    ///
1612    /// # Safety
1613    ///
1614    /// Capturing this operation stores this allocation's pointer address in the
1615    /// resulting CUDA graph. The caller must ensure `self` remains valid and
1616    /// exclusive whenever a captured graph using this operation is launched.
1617    pub unsafe fn set_value_operation<'a>(&'a mut self, value: u8) -> MemorySetOperation<'a, T> {
1618        unsafe { MemorySetOperation::new(self.as_mut_ptr(), value, self.len()) }
1619    }
1620
1621    /// Takes a pointer to the base of an existing device memory allocation created with [`DeviceMemory::alloc`] and exports it for use in another process.
1622    /// This is a lightweight operation and may be called multiple times on an allocation without adverse effects.
1623    ///
1624    /// If a region of memory is freed with [`DeviceMemory::free`] and a subsequent call to [`DeviceMemory::alloc`] returns memory with the same device address, [`DeviceMemory::ipc_handle`] returns a unique handle for the new memory.
1625    ///
1626    /// IPC is restricted to devices with unified-addressing support on Linux and Windows.
1627    /// IPC on Windows is supported for compatibility but is not recommended because of its performance cost.
1628    /// Check device IPC support through the device properties exposed by this crate, for example [`DeviceProperties::ipc_event_supported`](crate::device::DeviceProperties::ipc_event_supported).
1629    ///
1630    /// # Errors
1631    ///
1632    /// Returns an error if the allocation is empty, CUDA cannot export an IPC
1633    /// handle for the allocation, or CUDA reports runtime initialization
1634    /// diagnostics.
1635    pub fn ipc_handle(&self) -> Result<IpcMemoryHandle> {
1636        if self.is_empty() {
1637            // Cannot get handle for null pointer / zero size? Check docs.
1638            return Err(Error::InvalidMemoryAccess);
1639        }
1640        let mut handle = MaybeUninit::uninit();
1641        unsafe {
1642            try_ffi!(runtime::cudaIpcGetMemHandle(
1643                handle.as_mut_ptr(),
1644                self.as_ptr().cast_mut().cast(),
1645            ))?;
1646            Ok(IpcMemoryHandle::from_raw(handle.assume_init()))
1647        }
1648    }
1649
1650    pub fn try_clone(&self) -> Result<Self> {
1651        if self.length == 0 || size_of::<T>() == 0 {
1652            return Ok(Self {
1653                ptr: ptr::null_mut(),
1654                length: self.length,
1655                _phantom: PhantomData,
1656            });
1657        }
1658
1659        let new_mem = Self::create(self.length)?;
1660
1661        unsafe {
1662            Self::copy(
1663                new_mem.as_mut_ptr(),
1664                self.as_ptr(),
1665                self.length,
1666                MemoryCopyKind::DeviceToDevice,
1667            )?;
1668        }
1669
1670        Ok(new_mem)
1671    }
1672}
1673
1674impl<T> Clone for DeviceMemory<T> {
1675    fn clone(&self) -> Self {
1676        match self.try_clone() {
1677            Ok(new_mem) => new_mem,
1678            Err(err) => {
1679                #[cfg(debug_assertions)]
1680                eprintln!("device memory clone failed: {err}");
1681                Self {
1682                    ptr: ptr::null_mut(),
1683                    length: 0,
1684                    _phantom: PhantomData,
1685                }
1686            }
1687        }
1688    }
1689}
1690
1691impl<T> Drop for DeviceMemory<T> {
1692    fn drop(&mut self) {
1693        if self.ptr.is_null() {
1694            return;
1695        }
1696
1697        // debug_assert!(
1698        //     unsafe { free(self.ptr) }.is_ok(),
1699        //     "failed to free device memory at {:#x}",
1700        //     self.ptr as usize
1701        // );
1702        if let Err(err) = unsafe { Self::free(self.ptr) } {
1703            #[cfg(debug_assertions)]
1704            eprintln!("failed to free device memory: {err}");
1705            return;
1706        }
1707
1708        self.ptr = ptr::null_mut();
1709        self.length = 0;
1710    }
1711}
1712
1713impl<T: DeviceRepr> ManagedMemory<T> {
1714    /// Allocates typed CUDA managed memory and records its initial attach mode.
1715    ///
1716    /// # Errors
1717    ///
1718    /// Returns an error if the requested byte size overflows, CUDA cannot
1719    /// allocate managed memory, or a non-empty zero-sized allocation is
1720    /// requested.
1721    pub fn create(length: usize, attach_flags: MemoryAttachFlags) -> Result<Self> {
1722        if size_of::<T>() == 0 {
1723            return if length == 0 {
1724                Ok(Self {
1725                    ptr: ptr::null_mut(),
1726                    length,
1727                    attach_flags,
1728                    _phantom: PhantomData,
1729                })
1730            } else {
1731                Err(Error::InvalidMemoryAllocationRequest)
1732            };
1733        }
1734
1735        let ptr = unsafe { DeviceMemory::<T>::alloc_managed(length, attach_flags)? };
1736        Ok(Self {
1737            ptr,
1738            length,
1739            attach_flags,
1740            _phantom: PhantomData,
1741        })
1742    }
1743
1744    pub fn zeroes(length: usize, attach_flags: MemoryAttachFlags) -> Result<Self>
1745    where
1746        T: ZeroableDeviceRepr,
1747    {
1748        let mut memory = Self::create(length, attach_flags)?;
1749        memory.set_zeroes()?;
1750        Ok(memory)
1751    }
1752
1753    /// Takes ownership of an existing managed allocation.
1754    ///
1755    /// # Safety
1756    ///
1757    /// `ptr` must be null for an empty allocation or point to `length` live
1758    /// elements allocated by a CUDA allocation function compatible with
1759    /// [`DeviceMemory::free`]. `length * size_of::<T>()` must fit in `usize`.
1760    /// No other owner may free the pointer while the returned value is alive.
1761    pub unsafe fn from_raw_parts(
1762        ptr: *mut T,
1763        length: usize,
1764        attach_flags: MemoryAttachFlags,
1765    ) -> Self {
1766        Self {
1767            ptr,
1768            length,
1769            attach_flags,
1770            _phantom: PhantomData,
1771        }
1772    }
1773
1774    pub fn into_raw_parts(self) -> (*mut T, usize, MemoryAttachFlags) {
1775        let ptr = self.ptr;
1776        let length = self.length;
1777        let attach_flags = self.attach_flags;
1778        mem::forget(self);
1779        (ptr, length, attach_flags)
1780    }
1781
1782    pub const fn len(&self) -> usize {
1783        self.length
1784    }
1785
1786    pub const fn is_empty(&self) -> bool {
1787        self.length == 0
1788    }
1789
1790    pub fn byte_len(&self) -> usize {
1791        self.length
1792            .checked_mul(size_of::<T>())
1793            .expect("managed memory byte length overflow")
1794    }
1795
1796    pub const fn attach_flags(&self) -> MemoryAttachFlags {
1797        self.attach_flags
1798    }
1799
1800    pub const fn as_ptr(&self) -> *const T {
1801        self.ptr
1802    }
1803
1804    pub const fn as_mut_ptr(&mut self) -> *mut T {
1805        self.ptr
1806    }
1807
1808    pub fn view(&self) -> DeviceView<'_, T> {
1809        // The ManagedMemory owner guarantees the pointer remains live for the
1810        // borrowed view lifetime.
1811        unsafe { DeviceView::from_raw_parts(self.ptr, self.length) }
1812    }
1813
1814    pub fn view_mut(&mut self) -> DeviceViewMut<'_, T> {
1815        // &mut self guarantees unique access to the represented range.
1816        unsafe { DeviceViewMut::from_raw_parts(self.ptr, self.length) }
1817    }
1818
1819    /// Returns a host slice over this managed allocation.
1820    ///
1821    /// # Safety
1822    ///
1823    /// The caller must ensure no GPU work or other CPU reference can
1824    /// concurrently mutate the same memory for the returned lifetime, and that
1825    /// the allocation is accessible from the host at the point of access.
1826    pub unsafe fn as_host_slice(&self) -> &[T] {
1827        if self.is_empty() {
1828            return &[];
1829        }
1830        unsafe { slice::from_raw_parts(self.ptr, self.length) }
1831    }
1832
1833    /// Returns a mutable host slice over this managed allocation.
1834    ///
1835    /// # Safety
1836    ///
1837    /// The caller must ensure no GPU work or other CPU reference can
1838    /// concurrently access the same memory for the returned lifetime, and that
1839    /// the allocation is accessible from the host at the point of access.
1840    pub unsafe fn as_host_slice_mut(&mut self) -> &mut [T] {
1841        if self.is_empty() {
1842            return &mut [];
1843        }
1844        unsafe { slice::from_raw_parts_mut(self.ptr, self.length) }
1845    }
1846
1847    pub fn set_zeroes(&mut self) -> Result<()>
1848    where
1849        T: ZeroableDeviceRepr,
1850    {
1851        if self.is_empty() {
1852            return Ok(());
1853        }
1854        unsafe { DeviceMemory::<T>::set(self.ptr, 0, self.length) }
1855    }
1856
1857    pub fn prefetch_to(&self, location: MemoryLocation, stream: &Stream) -> Result<()> {
1858        DeviceMemory::<T>::prefetch_async(
1859            unsafe { DevicePtr::from_raw(self.ptr.cast::<()>()) },
1860            self.byte_len(),
1861            location,
1862            stream,
1863        )
1864    }
1865
1866    pub fn attach_to_stream(&mut self, stream: &Stream, flags: MemoryAttachFlags) -> Result<()> {
1867        stream.context().bind()?;
1868        unsafe {
1869            try_ffi!(runtime::cudaStreamAttachMemAsync(
1870                stream.as_raw(),
1871                self.ptr.cast(),
1872                self.byte_len() as _,
1873                flags.bits(),
1874            ))?;
1875        }
1876        self.attach_flags = flags;
1877        Ok(())
1878    }
1879}
1880
1881impl<T: DeviceRepr> DeviceSlice<T> for ManagedMemory<T> {
1882    fn as_device_ptr(&self) -> *const T {
1883        self.ptr
1884    }
1885
1886    fn len(&self) -> usize {
1887        self.length
1888    }
1889}
1890
1891impl<T: DeviceRepr> DeviceSliceMut<T> for ManagedMemory<T> {
1892    fn as_device_mut_ptr(&mut self) -> *mut T {
1893        self.ptr
1894    }
1895}
1896
1897impl<T: DeviceRepr> Drop for ManagedMemory<T> {
1898    fn drop(&mut self) {
1899        if self.ptr.is_null() {
1900            return;
1901        }
1902
1903        if let Err(err) = unsafe { DeviceMemory::<T>::free(self.ptr) } {
1904            #[cfg(debug_assertions)]
1905            eprintln!("failed to free managed memory: {err}");
1906            return;
1907        }
1908
1909        self.ptr = ptr::null_mut();
1910        self.length = 0;
1911    }
1912}
1913
1914#[cfg(all(test, feature = "testing"))]
1915mod tests {
1916    use super::*;
1917    use crate::testing;
1918
1919    #[test]
1920    fn it_works() -> Result<()> {
1921        unsafe {
1922            let host_in = [1, 2, 3];
1923
1924            let device_ptr = DeviceMemory::alloc(3)?;
1925
1926            DeviceMemory::copy(
1927                device_ptr,
1928                host_in.as_ptr(),
1929                3,
1930                MemoryCopyKind::HostToDevice,
1931            )?;
1932            let mut host_out = [0, 0, 0];
1933            DeviceMemory::copy(
1934                host_out.as_mut_ptr(),
1935                device_ptr,
1936                3,
1937                MemoryCopyKind::DeviceToHost,
1938            )?;
1939            assert_eq!(host_out, host_in);
1940
1941            DeviceMemory::free(device_ptr)?;
1942        }
1943        Ok(())
1944    }
1945
1946    #[test]
1947    fn test_scoped_async_copy_round_trip() -> Result<()> {
1948        let (_lock, ctx) = testing::bootstrap()?;
1949        let stream = ctx.create_stream()?;
1950
1951        let host_in = [4_i32, 5, 6];
1952        let mut device = DeviceMemory::create(host_in.len())?;
1953        let mut host_out = [0_i32; 3];
1954
1955        stream.sync_scope(|scope| {
1956            device.copy_from_host_async(&host_in, scope)?;
1957            device.copy_to_host_async(&mut host_out, scope)
1958        })?;
1959
1960        assert_eq!(host_out, host_in);
1961
1962        Ok(())
1963    }
1964
1965    #[test]
1966    fn managed_memory_tracks_metadata_and_views() -> Result<()> {
1967        let mut backing = [1_u32, 2, 3, 4];
1968        let mut managed = unsafe {
1969            ManagedMemory::from_raw_parts(
1970                backing.as_mut_ptr(),
1971                backing.len(),
1972                MemoryAttachFlags::HOST,
1973            )
1974        };
1975
1976        assert_eq!(managed.len(), backing.len());
1977        assert_eq!(managed.byte_len(), backing.len() * size_of::<u32>());
1978        assert_eq!(managed.attach_flags(), MemoryAttachFlags::HOST);
1979        assert_eq!(managed.view().len(), backing.len());
1980        assert_eq!(managed.view_mut().len(), backing.len());
1981
1982        unsafe {
1983            assert_eq!(managed.as_host_slice(), &[1, 2, 3, 4]);
1984            managed.as_host_slice_mut()[2] = 9;
1985        }
1986        assert_eq!(backing[2], 9);
1987
1988        let (ptr, length, flags) = managed.into_raw_parts();
1989        assert_eq!(ptr, backing.as_mut_ptr());
1990        assert_eq!(length, backing.len());
1991        assert_eq!(flags, MemoryAttachFlags::HOST);
1992
1993        Ok(())
1994    }
1995}
singe_cuda/memory.rs

singe_cuda/
memory.rs