singe_cuda/memory.rs
1use std::{
2 marker::PhantomData,
3 mem::{self, MaybeUninit},
4 ptr, slice,
5};
6
7use num_enum::{IntoPrimitive, TryFromPrimitive};
8use singe_core::{impl_enum_conversion, impl_enum_display};
9use singe_cuda_sys::{driver, runtime};
10
11use crate::{
12 error::{Error, Result},
13 ipc::IpcMemoryHandle,
14 stream::{GraphRecordable, Stream, StreamCaptureScope, StreamScope},
15 try_ffi,
16 types::DevicePtr,
17 view::{
18 DeviceRepr, DeviceSlice, DeviceSliceMut, DeviceView, DeviceViewMut, ZeroableDeviceRepr,
19 },
20};
21
22#[derive(Debug, Clone, Copy)]
23pub struct MemoryCopyOperation<'a, T> {
24 dst: *mut T,
25 src: *const T,
26 count: usize,
27 kind: MemoryCopyKind,
28 _marker: PhantomData<&'a mut [T]>,
29}
30
31impl<'a, T> MemoryCopyOperation<'a, T> {
32 /// Creates a stream-capture-safe memcpy operation from raw pointers.
33 ///
34 /// # Safety
35 ///
36 /// Capturing this operation stores `dst` and `src` pointer addresses in the
37 /// resulting CUDA graph. The caller must ensure both pointers are valid for
38 /// `count` elements whenever a captured graph using this operation is
39 /// launched. If `dst` is mutable memory, it must also remain exclusive for
40 /// the work ordered by those launches.
41 pub const unsafe fn new(
42 dst: *mut T,
43 src: *const T,
44 count: usize,
45 kind: MemoryCopyKind,
46 ) -> Self {
47 Self {
48 dst,
49 src,
50 count,
51 kind,
52 _marker: PhantomData,
53 }
54 }
55}
56
57unsafe impl<T> GraphRecordable for MemoryCopyOperation<'_, T> {
58 type Output = ();
59
60 fn record(self, scope: &StreamCaptureScope<'_>) -> Result<()> {
61 unsafe {
62 DeviceMemory::<T>::copy_async(self.dst, self.src, self.count, self.kind, scope.stream())
63 }
64 }
65}
66
67#[derive(Debug, Clone, Copy)]
68pub struct MemorySetOperation<'a, T> {
69 dst: *mut T,
70 value: u8,
71 count: usize,
72 _marker: PhantomData<&'a mut [T]>,
73}
74
75impl<'a, T> MemorySetOperation<'a, T> {
76 /// Creates a stream-capture-safe memset operation from a raw pointer.
77 ///
78 /// # Safety
79 ///
80 /// Capturing this operation stores `dst` in the resulting CUDA graph. The
81 /// caller must ensure `dst` is valid for writes of
82 /// `count * size_of::<T>()` bytes whenever a captured graph using this
83 /// operation is launched, and that it remains exclusive for the work ordered
84 /// by those launches.
85 pub const unsafe fn new(dst: *mut T, value: u8, count: usize) -> Self {
86 Self {
87 dst,
88 value,
89 count,
90 _marker: PhantomData,
91 }
92 }
93}
94
95unsafe impl<T> GraphRecordable for MemorySetOperation<'_, T> {
96 type Output = ();
97
98 fn record(self, scope: &StreamCaptureScope<'_>) -> Result<()> {
99 unsafe { DeviceMemory::<T>::set_async(self.dst, self.value, self.count, scope.stream()) }
100 }
101}
102
103/// CUDA memory copy types.
104#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
105#[repr(u32)]
106#[non_exhaustive]
107pub enum MemoryCopyKind {
108 /// Host -> Host.
109 HostToHost = runtime::cudaMemcpyKind::cudaMemcpyHostToHost as _,
110 /// Host -> Device.
111 HostToDevice = runtime::cudaMemcpyKind::cudaMemcpyHostToDevice as _,
112 /// Device -> Host.
113 DeviceToHost = runtime::cudaMemcpyKind::cudaMemcpyDeviceToHost as _,
114 /// Device -> Device.
115 DeviceToDevice = runtime::cudaMemcpyKind::cudaMemcpyDeviceToDevice as _,
116 /// Direction of the transfer is inferred from the pointer values.
117 /// Requires unified virtual addressing.
118 Default = runtime::cudaMemcpyKind::cudaMemcpyDefault as _,
119}
120
121impl_enum_conversion!(runtime::cudaMemcpyKind, MemoryCopyKind);
122
123impl_enum_display!(MemoryCopyKind, {
124 Self::HostToHost => "cudaMemcpyHostToHost",
125 Self::HostToDevice => "cudaMemcpyHostToDevice",
126 Self::DeviceToHost => "cudaMemcpyDeviceToHost",
127 Self::DeviceToDevice => "cudaMemcpyDeviceToDevice",
128 Self::Default => "cudaMemcpyDefault",
129});
130
131#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
132#[repr(transparent)]
133pub struct ArrayHandle(runtime::cudaArray_t);
134
135impl ArrayHandle {
136 pub const unsafe fn from_raw(handle: runtime::cudaArray_t) -> Self {
137 Self(handle)
138 }
139
140 pub const fn as_raw(self) -> runtime::cudaArray_t {
141 self.0
142 }
143}
144
145bitflags::bitflags! {
146 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
147 pub struct MemoryAttachFlags: u32 {
148 const GLOBAL = driver::CUmemAttach_flags::CU_MEM_ATTACH_GLOBAL as _;
149 const HOST = driver::CUmemAttach_flags::CU_MEM_ATTACH_HOST as _;
150 const SINGLE = driver::CUmemAttach_flags::CU_MEM_ATTACH_SINGLE as _;
151 }
152}
153
154#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
155#[repr(u32)]
156#[non_exhaustive]
157pub enum MemoryAllocationType {
158 Invalid = driver::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_INVALID as _,
159 Pinned = driver::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED as _,
160 Managed = driver::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_MANAGED as _,
161 Max = driver::CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_MAX as _,
162}
163
164impl_enum_conversion!(u32, driver::CUmemAllocationType, MemoryAllocationType);
165
166#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
167#[repr(u32)]
168#[non_exhaustive]
169pub enum MemoryAllocationHandleType {
170 None = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_NONE as _,
171 PosixFileDescriptor =
172 driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR as _,
173 Win32 = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_WIN32 as _,
174 Win32Kmt = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_WIN32_KMT as _,
175 Fabric = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_FABRIC as _,
176 Max = driver::CUmemAllocationHandleType::CU_MEM_HANDLE_TYPE_MAX as _,
177}
178
179impl_enum_conversion!(
180 u32,
181 driver::CUmemAllocationHandleType,
182 MemoryAllocationHandleType
183);
184
185#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
186#[repr(u32)]
187#[non_exhaustive]
188pub enum MemoryAccessFlag {
189 None = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_NONE as _,
190 Read = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READ as _,
191 ReadWrite = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_READWRITE as _,
192 Max = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_MAX as _,
193}
194
195impl_enum_conversion!(u32, driver::CUmemAccess_flags, MemoryAccessFlag);
196
197#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
198#[repr(u32)]
199#[non_exhaustive]
200pub enum MemoryPoolAttribute {
201 ReuseFollowEventDependencies =
202 driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES as _,
203 ReuseAllowOpportunistic =
204 driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC as _,
205 ReuseAllowInternalDependencies =
206 driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES as _,
207 ReleaseThreshold = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD as _,
208 ReservedMemoryCurrent = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT as _,
209 ReservedMemoryHigh = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH as _,
210 UsedMemoryCurrent = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_USED_MEM_CURRENT as _,
211 UsedMemoryHigh = driver::CUmemPool_attribute::CU_MEMPOOL_ATTR_USED_MEM_HIGH as _,
212}
213
214impl_enum_conversion!(u32, driver::CUmemPool_attribute, MemoryPoolAttribute);
215
216#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
217#[non_exhaustive]
218pub enum MemoryPoolAttributeValue {
219 Bool(bool),
220 Bytes(u64),
221}
222
223#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
224pub struct MemoryAccessDescriptor {
225 pub location: MemoryLocation,
226 pub flags: MemoryAccessFlag,
227}
228
229#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
230pub struct MemoryPoolProps {
231 pub alloc_type: MemoryAllocationType,
232 pub handle_type: MemoryAllocationHandleType,
233 pub location: MemoryLocation,
234 pub max_size: usize,
235 pub usage: u16,
236}
237
238#[derive(Debug)]
239pub struct MemoryPool {
240 handle: driver::CUmemoryPool,
241}
242
243impl From<MemoryAccessDescriptor> for driver::CUmemAccessDesc {
244 fn from(value: MemoryAccessDescriptor) -> Self {
245 Self {
246 location: value.location.into(),
247 flags: value.flags.into(),
248 }
249 }
250}
251
252impl From<MemoryPoolProps> for driver::CUmemPoolProps {
253 fn from(value: MemoryPoolProps) -> Self {
254 Self {
255 allocType: value.alloc_type.into(),
256 handleTypes: value.handle_type.into(),
257 location: value.location.into(),
258 win32SecurityAttributes: ptr::null_mut(),
259 maxSize: value.max_size as _,
260 usage: value.usage,
261 reserved: [0; 54],
262 }
263 }
264}
265
266bitflags::bitflags! {
267 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
268 pub struct HostAllocationFlags: u32 {
269 const DEFAULT = runtime::cudaHostAllocDefault;
270 const PORTABLE = runtime::cudaHostAllocPortable;
271 const MAPPED = runtime::cudaHostAllocMapped;
272 const WRITE_COMBINED = runtime::cudaHostAllocWriteCombined;
273 }
274}
275
276bitflags::bitflags! {
277 /// Flags for [`DeviceMemory::register_host`].
278 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
279 pub struct HostRegisterFlags: u32 {
280 const DEFAULT = runtime::cudaHostRegisterDefault;
281 const PORTABLE = runtime::cudaHostRegisterPortable;
282 const MAPPED = runtime::cudaHostRegisterMapped;
283 const IO_MEMORY = runtime::cudaHostRegisterIoMemory;
284 const READ_ONLY = runtime::cudaHostRegisterReadOnly;
285 }
286}
287
288/// CUDA memory types.
289#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
290#[repr(u32)]
291#[non_exhaustive]
292pub enum MemoryType {
293 /// Unregistered memory.
294 Unregistered = runtime::cudaMemoryType::cudaMemoryTypeUnregistered as _,
295 /// Host memory.
296 Host = runtime::cudaMemoryType::cudaMemoryTypeHost as _,
297 /// Device memory.
298 Device = runtime::cudaMemoryType::cudaMemoryTypeDevice as _,
299 /// Managed memory.
300 Managed = runtime::cudaMemoryType::cudaMemoryTypeManaged as _,
301}
302
303impl_enum_conversion!(runtime::cudaMemoryType, MemoryType);
304
305impl_enum_display!(MemoryType, {
306 Self::Unregistered => "cudaMemoryTypeUnregistered",
307 Self::Host => "cudaMemoryTypeHost",
308 Self::Device => "cudaMemoryTypeDevice",
309 Self::Managed => "cudaMemoryTypeManaged",
310});
311
312#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
313pub struct PointerAttributes {
314 pub memory_type: MemoryType,
315 pub device: i32,
316 pub device_pointer: DevicePtr,
317 pub host_pointer: *mut (),
318}
319
320impl From<runtime::cudaPointerAttributes> for PointerAttributes {
321 fn from(attr: runtime::cudaPointerAttributes) -> Self {
322 Self {
323 memory_type: attr.type_.into(),
324 device: attr.device,
325 device_pointer: unsafe { DevicePtr::from_raw(attr.devicePointer.cast()) },
326 host_pointer: attr.hostPointer.cast(),
327 }
328 }
329}
330
331#[repr(u32)]
332#[derive(
333 Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq, TryFromPrimitive, IntoPrimitive,
334)]
335#[non_exhaustive]
336pub enum MemoryLocationKind {
337 Invalid = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_INVALID as _,
338 Device = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_DEVICE as _,
339 Host = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_HOST as _,
340 Numa = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_HOST_NUMA as _,
341 NumaCurrent = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT as _,
342 Max = driver::CUmemLocationType_enum::CU_MEM_LOCATION_TYPE_MAX as _,
343}
344
345impl_enum_conversion!(driver::CUmemLocationType_enum, MemoryLocationKind);
346
347impl_enum_display!(MemoryLocationKind, {
348 Self::Invalid => "CU_MEM_LOCATION_TYPE_INVALID",
349 Self::Device => "CU_MEM_LOCATION_TYPE_DEVICE",
350 Self::Host => "CU_MEM_LOCATION_TYPE_HOST",
351 Self::Numa => "CU_MEM_LOCATION_TYPE_HOST_NUMA",
352 Self::NumaCurrent => "CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT",
353 Self::Max => "CU_MEM_LOCATION_TYPE_MAX",
354});
355
356#[derive(Debug, Clone, Copy, Hash, PartialOrd, Ord, PartialEq, Eq)]
357pub struct MemoryLocation {
358 pub kind: MemoryLocationKind,
359 pub id: i32,
360}
361
362impl From<driver::CUmemLocation_st> for MemoryLocation {
363 fn from(s: driver::CUmemLocation_st) -> Self {
364 Self {
365 kind: s.type_.into(),
366 id: unsafe { s.__bindgen_anon_1.id },
367 }
368 }
369}
370
371impl From<MemoryLocation> for driver::CUmemLocation_st {
372 fn from(m: MemoryLocation) -> Self {
373 Self {
374 type_: m.kind.into(),
375 __bindgen_anon_1: driver::CUmemLocation_st__bindgen_ty_1 { id: m.id as _ },
376 }
377 }
378}
379
380impl Default for MemoryLocation {
381 fn default() -> Self {
382 driver::CUmemLocation_st::default().into()
383 }
384}
385
386impl MemoryPool {
387 /// Creates a CUDA memory pool.
388 /// `props` determines the properties of the pool such as the backing device and IPC capabilities.
389 ///
390 /// To create a memory pool for host memory not targeting a specific NUMA node, applications must set [`MemoryPoolProps::location`] to [`MemoryLocationKind::Host`].
391 /// [`MemoryLocation::id`] is ignored for such pools.
392 /// Pools created with [`MemoryLocationKind::Host`] are not IPC-capable and [`MemoryPoolProps::handle_type`] must be [`MemoryAllocationHandleType::None`]; any other value returns [`crate::error::Status::InvalidValue`].
393 /// To create a memory pool targeting a specific host NUMA node, applications must set [`MemoryLocation::kind`] to [`MemoryLocationKind::Numa`] and [`MemoryLocation::id`] must specify the NUMA ID of the host memory node.
394 /// Specifying [`MemoryLocationKind::NumaCurrent`] as [`MemoryLocation::kind`] returns [`crate::error::Status::InvalidValue`].
395 /// By default, the pool's memory is accessible from the device where it is allocated.
396 /// Pools created with [`MemoryLocationKind::Numa`] or [`MemoryLocationKind::Host`] are accessible from the host CPU by default.
397 /// Applications can control the maximum size of the pool by specifying a non-zero value for [`MemoryPoolProps::max_size`].
398 /// A value of 0 uses a system-dependent maximum pool size.
399 ///
400 /// Callers that intend to use [`MemoryAllocationHandleType::Fabric`] based memory sharing must ensure: (1) the `nvidia-caps-imex-channels` character device is created by the driver and is listed under `/proc/devices`; (2) at least one IMEX channel file is accessible to the process.
401 ///
402 /// When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely share memory.
403 ///
404 /// The IMEX channel security model works per operating-system account.
405 /// All processes for an account can share memory if that account has access to a valid IMEX channel.
406 /// When isolation between accounts is desired, each account needs a separate IMEX channel.
407 ///
408 /// These channel files exist in `/dev/nvidia-caps-imex-channels/channel*` and can be created using standard OS native calls like `mknod` on Linux.
409 ///
410 /// To create a managed memory pool, applications must set [`MemoryPoolProps::alloc_type`] to [`MemoryAllocationType::Managed`].
411 /// [`MemoryPoolProps::handle_type`] must also be [`MemoryAllocationHandleType::None`] because IPC is not supported.
412 /// For managed memory pools, [`MemoryPoolProps::location`] is treated as the preferred location for all allocations created from the pool.
413 /// An application can also set [`MemoryLocationKind::Invalid`] to indicate no preferred location.
414 /// [`MemoryPoolProps::max_size`] must be set to zero for managed memory pools.
415 /// [`MemoryPoolProps::usage`] must be zero because decompression for managed memory is not supported.
416 /// For managed memory pools, all devices on the system must have non-zero concurrentManagedAccess.
417 /// If not, this call returns [`crate::error::Status::NotSupported`].
418 ///
419 /// Specifying [`MemoryAllocationHandleType::None`] creates a memory pool that does not support IPC.
420 ///
421 /// # Errors
422 ///
423 /// Returns an error if `props` describes an unsupported pool, CUDA cannot
424 /// create the pool, or CUDA returns a null memory-pool handle.
425 pub fn create(props: MemoryPoolProps) -> Result<Self> {
426 let mut handle = ptr::null_mut();
427 let props = driver::CUmemPoolProps::from(props);
428 unsafe {
429 try_ffi!(driver::cuMemPoolCreate(&raw mut handle, &raw const props))?;
430 }
431 if handle.is_null() {
432 return Err(Error::NullHandle);
433 }
434 Ok(Self { handle })
435 }
436
437 /// Supported attributes are:
438 ///
439 /// * [`MemoryPoolAttribute::ReleaseThreshold`]: amount of reserved memory, in bytes, to keep before trying to release memory back to the OS.
440 /// When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory
441 /// back to the OS on the next call to stream, event or context synchronize.
442 /// (default 0)
443 /// * [`MemoryPoolAttribute::ReuseFollowEventDependencies`]: allows [`sys::cuMemAllocAsync`](singe_cuda_sys::driver::cuMemAllocAsync) to use memory asynchronously freed in another stream as long as a stream ordering dependency of the allocating stream on
444 /// the free action exists.
445 /// CUDA events and null stream interactions can create the required stream ordered dependencies.
446 /// (default
447 /// enabled)
448 /// * [`MemoryPoolAttribute::ReuseAllowOpportunistic`]: allows reuse of already completed frees when there is no dependency between the free and allocation.
449 /// (default
450 /// enabled)
451 /// * [`MemoryPoolAttribute::ReuseAllowInternalDependencies`]: allows [`sys::cuMemAllocAsync`](singe_cuda_sys::driver::cuMemAllocAsync) to insert new stream dependencies to establish the stream ordering required to reuse a piece of memory released
452 /// by [`sys::cuMemFreeAsync`](singe_cuda_sys::driver::cuMemFreeAsync) (default enabled).
453 /// * [`MemoryPoolAttribute::ReservedMemoryHigh`]: resets the high watermark that tracks the amount of backing memory allocated for the memory
454 /// pool.
455 /// It is illegal to set this attribute to a non-zero value.
456 /// * [`MemoryPoolAttribute::UsedMemoryHigh`]: resets the high watermark that tracks the amount of used memory allocated for the memory
457 /// pool.
458 pub fn set_attribute(
459 &mut self,
460 attribute: MemoryPoolAttribute,
461 value: MemoryPoolAttributeValue,
462 ) -> Result<()> {
463 unsafe {
464 match (attribute, value) {
465 (
466 MemoryPoolAttribute::ReuseFollowEventDependencies
467 | MemoryPoolAttribute::ReuseAllowOpportunistic
468 | MemoryPoolAttribute::ReuseAllowInternalDependencies,
469 MemoryPoolAttributeValue::Bool(value),
470 ) => {
471 let mut value = u32::from(value);
472 try_ffi!(driver::cuMemPoolSetAttribute(
473 self.handle,
474 attribute.into(),
475 ptr::from_mut(&mut value).cast(),
476 ))?;
477 }
478 (
479 MemoryPoolAttribute::ReleaseThreshold
480 | MemoryPoolAttribute::ReservedMemoryCurrent
481 | MemoryPoolAttribute::ReservedMemoryHigh
482 | MemoryPoolAttribute::UsedMemoryCurrent
483 | MemoryPoolAttribute::UsedMemoryHigh,
484 MemoryPoolAttributeValue::Bytes(value),
485 ) => {
486 let mut value = value;
487 try_ffi!(driver::cuMemPoolSetAttribute(
488 self.handle,
489 attribute.into(),
490 ptr::from_mut(&mut value).cast(),
491 ))?;
492 }
493 _ => return Err(Error::InvalidValue),
494 }
495 }
496 Ok(())
497 }
498
499 /// Supported attributes are:
500 ///
501 /// * [`MemoryPoolAttribute::ReleaseThreshold`]: amount of reserved memory, in bytes, to keep before trying to release memory back to the OS.
502 /// When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory
503 /// back to the OS on the next call to stream, event or context synchronize.
504 /// (default 0)
505 /// * [`MemoryPoolAttribute::ReuseFollowEventDependencies`]: allows [`sys::cuMemAllocAsync`](singe_cuda_sys::driver::cuMemAllocAsync) to use memory asynchronously freed in another stream as long as a stream ordering dependency of the allocating stream on
506 /// the free action exists.
507 /// CUDA events and null stream interactions can create the required stream ordered dependencies.
508 /// (default
509 /// enabled)
510 /// * [`MemoryPoolAttribute::ReuseAllowOpportunistic`]: allows reuse of already completed frees when there is no dependency between the free and allocation.
511 /// (default
512 /// enabled)
513 /// * [`MemoryPoolAttribute::ReuseAllowInternalDependencies`]: allows [`sys::cuMemAllocAsync`](singe_cuda_sys::driver::cuMemAllocAsync) to insert new stream dependencies to establish the stream ordering required to reuse a piece of memory released
514 /// by [`sys::cuMemFreeAsync`](singe_cuda_sys::driver::cuMemFreeAsync) (default enabled).
515 /// * [`MemoryPoolAttribute::ReservedMemoryCurrent`]: backing memory currently allocated for the memory pool.
516 /// * [`MemoryPoolAttribute::ReservedMemoryHigh`]: high watermark of backing memory allocated for the memory pool since the last reset.
517 /// * [`MemoryPoolAttribute::UsedMemoryCurrent`]: memory from the pool that is currently in use by the application.
518 /// * [`MemoryPoolAttribute::UsedMemoryHigh`]: high watermark of memory from the pool that was in use by the application.
519 ///
520 /// # Errors
521 ///
522 /// Returns an error if CUDA Driver cannot report the requested pool attribute.
523 pub fn attribute(&self, attribute: MemoryPoolAttribute) -> Result<MemoryPoolAttributeValue> {
524 unsafe {
525 match attribute {
526 MemoryPoolAttribute::ReuseFollowEventDependencies
527 | MemoryPoolAttribute::ReuseAllowOpportunistic
528 | MemoryPoolAttribute::ReuseAllowInternalDependencies => {
529 let mut value = 0u32;
530 try_ffi!(driver::cuMemPoolGetAttribute(
531 self.handle,
532 attribute.into(),
533 ptr::from_mut(&mut value).cast(),
534 ))?;
535 Ok(MemoryPoolAttributeValue::Bool(value != 0))
536 }
537 MemoryPoolAttribute::ReleaseThreshold
538 | MemoryPoolAttribute::ReservedMemoryCurrent
539 | MemoryPoolAttribute::ReservedMemoryHigh
540 | MemoryPoolAttribute::UsedMemoryCurrent
541 | MemoryPoolAttribute::UsedMemoryHigh => {
542 let mut value = 0u64;
543 try_ffi!(driver::cuMemPoolGetAttribute(
544 self.handle,
545 attribute.into(),
546 ptr::from_mut(&mut value).cast(),
547 ))?;
548 Ok(MemoryPoolAttributeValue::Bytes(value))
549 }
550 }
551 }
552 }
553
554 /// Controls visibility of pools between devices.
555 ///
556 /// # Errors
557 ///
558 /// Returns an error if CUDA Driver rejects the access descriptors.
559 pub fn set_access(&mut self, access_descs: &[MemoryAccessDescriptor]) -> Result<()> {
560 let access_descs: Vec<_> = access_descs.iter().copied().map(Into::into).collect();
561 unsafe {
562 try_ffi!(driver::cuMemPoolSetAccess(
563 self.handle,
564 access_descs.as_ptr(),
565 access_descs.len() as _,
566 ))?;
567 }
568 Ok(())
569 }
570
571 /// Returns the accessibility of the pool's memory from the specified location.
572 ///
573 /// # Errors
574 ///
575 /// Returns an error if CUDA Driver cannot report access from `location`.
576 pub fn access(&self, location: MemoryLocation) -> Result<MemoryAccessFlag> {
577 let mut flags = driver::CUmemAccess_flags::CU_MEM_ACCESS_FLAGS_PROT_NONE;
578 let mut location = driver::CUmemLocation_st::from(location);
579 unsafe {
580 try_ffi!(driver::cuMemPoolGetAccess(
581 &raw mut flags,
582 self.handle,
583 &raw mut location,
584 ))?;
585 }
586 Ok(flags.into())
587 }
588
589 /// Releases memory back to the OS until the pool contains fewer than `min_bytes_to_keep` reserved bytes, or there is no more memory that the allocator can safely release.
590 /// The allocator cannot release OS allocations that back outstanding asynchronous allocations.
591 /// The OS allocations may happen at different granularity from the caller's allocations.
592 ///
593 /// * Allocations that have not been freed count as outstanding.
594 /// * Allocations that have been asynchronously freed but whose completion has not been observed on the host, for example by synchronization, can count as outstanding.
595 ///
596 /// # Errors
597 ///
598 /// Returns an error if CUDA cannot trim the pool.
599 pub fn trim_to(&mut self, min_bytes_to_keep: usize) -> Result<()> {
600 unsafe {
601 try_ffi!(driver::cuMemPoolTrimTo(self.handle, min_bytes_to_keep as _))?;
602 }
603 Ok(())
604 }
605
606 pub const fn as_raw(&self) -> driver::CUmemoryPool {
607 self.handle
608 }
609}
610
611impl Drop for MemoryPool {
612 fn drop(&mut self) {
613 unsafe {
614 if let Err(err) = try_ffi!(driver::cuMemPoolDestroy(self.handle)) {
615 #[cfg(debug_assertions)]
616 eprintln!("failed to destroy cuda memory pool: {err}");
617 }
618 }
619 }
620}
621
622/// Represents a region of owned CUDA device memory for elements of type `T`.
623#[derive(Debug)]
624pub struct DeviceMemory<T> {
625 /// Raw pointer to the allocated device memory.
626 ptr: *mut T,
627 /// Number of elements of type `T` allocated.
628 length: usize,
629 /// Marker for the type `T`.
630 _phantom: PhantomData<T>,
631}
632
633impl<T> PartialEq for DeviceMemory<T> {
634 fn eq(&self, other: &Self) -> bool {
635 self.ptr == other.ptr && self.length == other.length
636 }
637}
638
639impl<T> Eq for DeviceMemory<T> {}
640
641#[derive(Debug)]
642pub struct ManagedMemory<T: DeviceRepr> {
643 ptr: *mut T,
644 length: usize,
645 // CUDA tracks the current visibility policy for managed memory separately
646 // from the pointer. Store the last policy requested through this wrapper so
647 // callers can reason about stream attachment without another FFI query.
648 attach_flags: MemoryAttachFlags,
649 _phantom: PhantomData<T>,
650}
651
652/// Associated utility functions.
653impl<T> DeviceMemory<T> {
654 /// Allocates size bytes of linear memory on the device and returns a pointer to the allocated memory.
655 /// The allocated memory is suitably aligned for any kind of variable.
656 /// The memory is not cleared.
657 /// [`DeviceMemory::alloc`] returns [`crate::error::Status::OutOfMemory`] on allocation failure.
658 ///
659 /// The device version of [`DeviceMemory::free`] cannot be used with a pointer allocated using the host API, and vice versa.
660 ///
661 /// # Errors
662 ///
663 /// Returns an error if the requested byte size overflows, CUDA cannot
664 /// allocate device memory, a previous asynchronous launch reports an error,
665 /// or CUDA reports runtime initialization diagnostics such as
666 /// [`crate::error::Status::NotInitialized`], [`crate::error::Status::CallRequiresNewerDriver`],
667 /// or [`crate::error::Status::NoDevice`].
668 ///
669 /// # Safety
670 ///
671 /// The returned pointer is uninitialized device memory. The caller must use
672 /// it only for `count` elements of `T` and eventually free it with a
673 /// compatible CUDA free function.
674 pub unsafe fn alloc(count: usize) -> Result<*mut T> {
675 let Some(bytes) = count.checked_mul(size_of::<T>()) else {
676 return Err(Error::InvalidMemoryAllocationRequest);
677 };
678 let mut p = ptr::null_mut();
679 unsafe {
680 try_ffi!(runtime::cudaMalloc(&raw mut p, bytes as _))?;
681 }
682 Ok(p.cast())
683 }
684
685 pub unsafe fn alloc_managed(count: usize, flags: MemoryAttachFlags) -> Result<*mut T> {
686 let Some(bytes) = count.checked_mul(size_of::<T>()) else {
687 return Err(Error::InvalidMemoryAllocationRequest);
688 };
689 if bytes == 0 {
690 return Ok(ptr::null_mut());
691 }
692 let mut p = ptr::null_mut();
693 unsafe {
694 try_ffi!(runtime::cudaMallocManaged(
695 &raw mut p,
696 bytes as _,
697 flags.bits(),
698 ))?;
699 }
700 Ok(p.cast::<T>())
701 }
702
703 /// Frees the memory space pointed to by `ptr`, which must have been returned by a previous call to one of these allocation functions: [`DeviceMemory::alloc`], [`sys::cudaMallocPitch`](singe_cuda_sys::runtime::cudaMallocPitch), [`DeviceMemory::alloc_managed`], [`DeviceMemory::alloc_async`], or [`sys::cudaMallocFromPoolAsync`](singe_cuda_sys::runtime::cudaMallocFromPoolAsync).
704 ///
705 /// This does not perform implicit synchronization when the pointer was allocated with [`DeviceMemory::alloc_async`] or [`sys::cudaMallocFromPoolAsync`](singe_cuda_sys::runtime::cudaMallocFromPoolAsync).
706 /// Callers must ensure that all accesses to this pointer have completed before invoking [`DeviceMemory::free`].
707 /// For best performance and memory reuse, use [`DeviceMemory::free_async`] to free memory allocated via the stream ordered memory allocator.
708 /// For all other pointers, this call may perform implicit synchronization.
709 ///
710 /// If [`DeviceMemory::free`] has already been called before, an error is returned.
711 /// If `ptr` is null, no operation is performed.
712 /// [`DeviceMemory::free`] returns an error on failure.
713 ///
714 /// The device version of [`DeviceMemory::free`] cannot be used with a pointer allocated using the host API, and vice versa.
715 ///
716 /// # Errors
717 ///
718 /// Returns an error if CUDA cannot free `ptr`, `ptr` has already been
719 /// freed, a previous asynchronous launch reports an error, or CUDA reports
720 /// runtime initialization diagnostics.
721 ///
722 /// # Safety
723 ///
724 /// `ptr` must be null or a live allocation returned by a compatible CUDA
725 /// device allocation function, and no work may access it after it is freed.
726 pub unsafe fn free(ptr: *mut T) -> Result<()> {
727 unsafe {
728 try_ffi!(runtime::cudaFree(ptr.cast()))?;
729 }
730 Ok(())
731 }
732
733 /// Copies `count` elements from `src` to `dst`.
734 /// The transfer direction is specified by [`MemoryCopyKind`].
735 /// [`MemoryCopyKind::Default`] is recommended when unified virtual addressing is available, in which case the transfer direction is inferred from the pointer values.
736 /// Calling [`DeviceMemory::copy`] with `dst` and `src` pointers that do not match the direction of the copy results in undefined behavior.
737 ///
738 /// * Exhibits `synchronous` behavior for most use cases.
739 /// * Memory regions requested must be either entirely registered with CUDA, or in the case of host pageable transfers, not registered
740 /// at all.
741 /// Memory regions spanning over allocations that are both registered and not registered with CUDA are not supported and
742 /// return [`crate::error::Status::InvalidValue`].
743 ///
744 /// # Errors
745 ///
746 /// Returns an error if the requested byte count overflows, CUDA rejects the
747 /// pointer combination or copy kind, a previous asynchronous launch reports
748 /// an error, or CUDA reports runtime initialization diagnostics.
749 ///
750 /// # Safety
751 ///
752 /// `src` and `dst` must be valid for `count` elements of `T` according to
753 /// `kind`, and the source and destination regions must not overlap unless
754 /// CUDA permits that transfer.
755 pub unsafe fn copy(
756 dst: *mut T,
757 src: *const T,
758 count: usize,
759 kind: MemoryCopyKind,
760 ) -> Result<()> {
761 let Some(bytes) = count.checked_mul(size_of::<T>()) else {
762 return Err(Error::InvalidMemoryAllocationRequest);
763 };
764 unsafe {
765 try_ffi!(runtime::cudaMemcpy(
766 dst.cast(),
767 src.cast(),
768 bytes as _,
769 kind.into(),
770 ))?;
771 }
772 Ok(())
773 }
774
775 /// Fills the first `count` bytes of the memory area pointed to by `ptr` with the constant byte `value`.
776 ///
777 /// This call is asynchronous with respect to the host unless `ptr` refers to pinned host memory.
778 ///
779 /// See the CUDA memset synchronization rules for when this operation blocks
780 /// the host.
781 ///
782 /// # Errors
783 ///
784 /// Returns an error if the requested byte count overflows, CUDA rejects the
785 /// pointer or size, a previous asynchronous launch reports an error, or CUDA
786 /// reports runtime initialization diagnostics.
787 ///
788 /// # Safety
789 ///
790 /// `dst` must be valid for writes of `count * size_of::<T>()` bytes and
791 /// must refer to memory that CUDA can memset.
792 pub unsafe fn set(dst: *mut T, value: u8, count: usize) -> Result<()> {
793 let Some(bytes) = count.checked_mul(size_of::<T>()) else {
794 return Err(Error::InvalidMemoryAllocationRequest);
795 };
796 unsafe {
797 try_ffi!(runtime::cudaMemset(dst.cast(), value.into(), bytes as _))?;
798 }
799 Ok(())
800 }
801
802 pub unsafe fn alloc_host(size: usize) -> Result<*mut ()> {
803 let mut ptr = ptr::null_mut();
804 unsafe {
805 try_ffi!(runtime::cudaMallocHost(
806 &raw mut ptr,
807 size as runtime::size_t
808 ))?;
809 }
810 Ok(ptr.cast())
811 }
812
813 /// Frees host memory returned by [`DeviceMemory::alloc_host`] or [`DeviceMemory::alloc_pinned`].
814 ///
815 /// # Errors
816 ///
817 /// Returns an error if CUDA cannot free the host allocation, a previous
818 /// asynchronous launch reports an error, or CUDA reports runtime
819 /// initialization diagnostics.
820 ///
821 /// # Safety
822 ///
823 /// `ptr` must be null or a live host allocation returned by a compatible
824 /// CUDA host allocation function.
825 pub unsafe fn free_host(ptr: *mut ()) -> Result<()> {
826 unsafe { try_ffi!(runtime::cudaFreeHost(ptr.cast())) }
827 }
828
829 /// Allocates size bytes of host memory that is page-locked and accessible to the device.
830 /// The driver tracks the allocated virtual memory ranges and automatically accelerates calls such as [`DeviceMemory::copy`].
831 /// Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory obtained with functions such as `malloc()`.
832 /// Allocating excessive amounts of pinned memory may degrade system performance, since it reduces the amount of memory available to the system for paging.
833 /// As a result, use this sparingly to allocate staging areas for data exchange between host and device.
834 ///
835 /// `flags` selects allocation options:
836 ///
837 /// * [`HostAllocationFlags::DEFAULT`]: equivalent to [`DeviceMemory::alloc_host`].
838 /// * [`HostAllocationFlags::PORTABLE`]: the memory returned by this call is considered pinned memory by all CUDA contexts, not just the one that performed
839 /// the allocation.
840 /// * [`HostAllocationFlags::MAPPED`]: maps the allocation into the CUDA address space.
841 /// The device pointer to the memory may be obtained by calling [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer).
842 /// * [`HostAllocationFlags::WRITE_COMBINED`]: allocates the memory as write-combined (WC).
843 /// WC memory can be transferred across the PCI Express bus more quickly on some
844 /// system configurations, but cannot be read efficiently by most CPUs.
845 /// WC memory is a good option for buffers written
846 /// by the CPU and read by the device via mapped pinned memory or host->device transfers.
847 ///
848 /// All of these flags are orthogonal to one another: a developer may allocate memory that is portable, mapped and/or write-combined with no restrictions.
849 ///
850 /// For [`HostAllocationFlags::MAPPED`] to have any effect, the CUDA context must support [`ContextFlags::MAP_HOST`](crate::context::ContextFlags::MAP_HOST), which can be checked via [`Device::flags`](crate::device::Device::flags).
851 /// [`ContextFlags::MAP_HOST`](crate::context::ContextFlags::MAP_HOST) is implicitly set for contexts created via the runtime API.
852 ///
853 /// [`HostAllocationFlags::MAPPED`] may be specified on CUDA contexts for devices that do not support mapped pinned memory.
854 /// The failure is deferred to [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) because the memory may be mapped into other CUDA contexts via [`HostAllocationFlags::PORTABLE`].
855 ///
856 /// Memory allocated by this method must be freed with [`DeviceMemory::free_host`].
857 ///
858 /// # Errors
859 ///
860 /// Returns an error if CUDA cannot allocate pinned host memory, a previous
861 /// asynchronous launch reports an error, or CUDA reports runtime
862 /// initialization diagnostics.
863 ///
864 /// # Safety
865 ///
866 /// The returned pointer is uninitialized host memory. The caller must ensure
867 /// it is accessed within `size` bytes and freed with [`DeviceMemory::free_host`].
868 pub unsafe fn alloc_pinned(size: usize, flags: HostAllocationFlags) -> Result<*mut ()> {
869 let mut ptr = ptr::null_mut();
870 unsafe {
871 try_ffi!(runtime::cudaHostAlloc(
872 &raw mut ptr,
873 size as _,
874 flags.bits()
875 ))?;
876 }
877 Ok(ptr.cast())
878 }
879
880 /// Page-locks the memory range specified by `ptr` and `size`, and maps it for the devices selected by `flags`.
881 /// This memory range also is added to the same tracking mechanism as [`DeviceMemory::alloc_pinned`] to automatically accelerate calls to functions such as [`DeviceMemory::copy`].
882 /// Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory that has not been registered.
883 /// Page-locking excessive amounts of memory may degrade system performance, since it reduces the amount of memory available to the system for paging.
884 /// As a result, use this sparingly to register staging areas for data exchange between host and device.
885 ///
886 /// On systems where [`DeviceProperties::pageable_memory_access_uses_host_page_tables`](crate::device::DeviceProperties::pageable_memory_access_uses_host_page_tables) is enabled, [`DeviceMemory::register_host`] does not page-lock the memory range specified by `ptr` and instead only populates unpopulated pages.
887 ///
888 /// [`DeviceMemory::register_host`] is supported only on I/O coherent devices where [`DeviceProperties::host_register_supported`](crate::device::DeviceProperties::host_register_supported) is enabled.
889 ///
890 /// `flags` selects registration options:
891 ///
892 /// * [`HostRegisterFlags::DEFAULT`]: on a system with unified virtual addressing, the memory is both mapped and portable.
893 /// On a system with no unified virtual addressing, the memory is neither mapped nor portable.
894 ///
895 /// * [`HostRegisterFlags::PORTABLE`]: the memory returned by this call is considered pinned memory by all CUDA contexts, not just the one that performed
896 /// the allocation.
897 ///
898 /// * [`HostRegisterFlags::MAPPED`]: maps the allocation into the CUDA address space.
899 /// The device pointer to the memory may be obtained by calling [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer).
900 ///
901 /// * [`HostRegisterFlags::IO_MEMORY`]: the passed memory pointer is treated as pointing to some memory-mapped I/O space, for example belonging to a third-party PCIe device,
902 /// and it is marked as non-cache-coherent and contiguous.
903 ///
904 /// * [`HostRegisterFlags::READ_ONLY`]: the passed memory pointer is treated as pointing to memory that is considered read-only by the device.
905 /// On platforms without
906 /// [`DeviceProperties::pageable_memory_access_uses_host_page_tables`](crate::device::DeviceProperties::pageable_memory_access_uses_host_page_tables), this flag is required to register memory mapped to the CPU as read-only.
907 /// Query support with [`DeviceProperties::host_register_read_only_supported`](crate::device::DeviceProperties::host_register_read_only_supported).
908 /// Using this flag with a current context associated with a device that does not have this attribute set makes [`DeviceMemory::register_host`] return [`crate::error::Status::NotSupported`].
909 ///
910 /// All of these flags are orthogonal to one another: a developer may page-lock memory that is portable or mapped with no restrictions.
911 ///
912 /// The CUDA context must have been created with [`ContextFlags::MAP_HOST`](crate::context::ContextFlags::MAP_HOST) for [`HostRegisterFlags::MAPPED`] to have any effect.
913 ///
914 /// [`HostRegisterFlags::MAPPED`] may be specified on CUDA contexts for devices that do not support mapped pinned memory.
915 /// The failure is deferred to [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) because the memory may be mapped into other CUDA contexts via [`HostRegisterFlags::PORTABLE`].
916 ///
917 /// On devices where [`DeviceProperties::can_use_host_pointer_for_registered_mem`](crate::device::DeviceProperties::can_use_host_pointer_for_registered_mem) is enabled, the memory can also be accessed from the device using the original host pointer.
918 /// The device pointer returned by [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) may or may not match the original host pointer and depends on the devices visible to the application.
919 /// If all devices visible to the application have a non-zero value for the device attribute, the device pointer returned by [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) matches the original pointer.
920 /// If any device visible to the application has a zero value for the device attribute, the device pointer returned by [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) does not match the original host pointer, but is suitable for use on all devices provided Unified Virtual Addressing is enabled.
921 /// In such systems, it is valid to access the memory using either pointer on devices that have a non-zero value for the device attribute.
922 /// Such devices must access the memory through only one of the two pointers, not both.
923 ///
924 /// The memory page-locked by this method must be unregistered with [`DeviceMemory::unregister_host`].
925 ///
926 /// # Errors
927 ///
928 /// Returns an error if CUDA cannot register the host range, the pointer,
929 /// size, or flags are invalid, a previous asynchronous launch reports an
930 /// error, or CUDA reports runtime initialization diagnostics.
931 ///
932 /// # Safety
933 ///
934 /// `ptr..ptr + size` must be a valid host memory range and must remain valid
935 /// until it is unregistered.
936 pub unsafe fn register_host(ptr: *mut (), size: usize, flags: HostRegisterFlags) -> Result<()> {
937 unsafe {
938 try_ffi!(runtime::cudaHostRegister(
939 ptr.cast(),
940 size as _,
941 flags.bits()
942 ))?;
943 }
944 Ok(())
945 }
946
947 /// Unmaps the memory range whose base address is specified by `ptr`, and makes it pageable again.
948 ///
949 /// The base address must be the same one specified to [`DeviceMemory::register_host`].
950 ///
951 /// # Errors
952 ///
953 /// Returns an error if CUDA cannot unregister the host range, `ptr` is not
954 /// the base address of a registered range, a previous asynchronous launch
955 /// reports an error, or CUDA reports runtime initialization diagnostics.
956 ///
957 /// # Safety
958 ///
959 /// `ptr` must be the base address of a host range registered with
960 /// [`DeviceMemory::register_host`] and must not be unregistered twice.
961 pub unsafe fn unregister_host(ptr: *mut ()) -> Result<()> {
962 unsafe { try_ffi!(runtime::cudaHostUnregister(ptr.cast())) }
963 }
964
965 /// Returns the total amount of memory available to the current context and the amount of memory free on the device.
966 /// CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
967 /// In a multi-tenant situation, the free-memory estimate is prone to a race condition: an allocation or free by another process or thread between estimation and reporting can make the reported free value differ from actual free memory.
968 ///
969 /// The integrated GPU on Tegra shares memory with CPU and other component of the SoC.
970 /// The free and total values returned by this call exclude the SWAP memory space maintained by the OS on some platforms.
971 /// The OS may move some of the memory pages into swap area as the GPU or CPU allocate or access memory.
972 /// See Tegra app note on how to calculate total and free memory on Tegra.
973 ///
974 /// # Errors
975 ///
976 /// Returns an error if CUDA cannot query memory information, a previous
977 /// asynchronous launch reports an error, or CUDA reports runtime
978 /// initialization diagnostics.
979 pub fn memory_info() -> Result<(usize, usize)> {
980 let mut free: runtime::size_t = 0;
981 let mut total: runtime::size_t = 0;
982 unsafe {
983 try_ffi!(runtime::cudaMemGetInfo(&raw mut free, &raw mut total))?;
984 }
985 Ok((free as usize, total as usize))
986 }
987
988 /// Returns the attributes of `ptr`.
989 /// If `ptr` was not allocated in, mapped by, or registered with a context that supports unified addressing, [`crate::error::Status::InvalidValue`] is returned.
990 ///
991 /// In CUDA 11.0 and later, passing a host pointer reports [`MemoryType::Unregistered`] in [`PointerAttributes::memory_type`].
992 ///
993 /// * [`PointerAttributes::memory_type`] identifies the type of memory.
994 /// It can be [`MemoryType::Unregistered`] for unregistered host memory, [`MemoryType::Host`] for registered host memory, [`MemoryType::Device`] for device memory, or [`MemoryType::Managed`] for managed memory.
995 ///
996 /// * [`PointerAttributes::device`] is the device against which `ptr` was allocated.
997 /// If `ptr` has memory type [`MemoryType::Device`], this identifies the device on which the memory physically resides.
998 /// If `ptr` has memory type [`MemoryType::Host`], this identifies the device that was current when the allocation was made, and if that device is deinitialized then
999 /// this allocation will vanish with that device's state.
1000 ///
1001 /// * [`PointerAttributes::device_pointer`] is the device pointer alias through which the memory referred to by `ptr` may be accessed on the current device.
1002 /// If the memory referred to by `ptr` cannot be accessed directly by the current device then this is null.
1003 ///
1004 /// * [`PointerAttributes::host_pointer`] is the host pointer alias through which the memory referred to by `ptr` may be accessed on the host.
1005 /// If the memory referred to by `ptr` cannot be accessed directly by the host then this is null.
1006 ///
1007 /// # Errors
1008 ///
1009 /// Returns an error if CUDA cannot query attributes for `ptr`, `ptr` is not
1010 /// known to a unified-addressing context, or CUDA reports runtime
1011 /// initialization diagnostics.
1012 pub fn pointer_attributes(ptr: *const T) -> Result<PointerAttributes> {
1013 let mut attr_ffi = MaybeUninit::<runtime::cudaPointerAttributes>::uninit();
1014 unsafe {
1015 try_ffi!(runtime::cudaPointerGetAttributes(
1016 attr_ffi.as_mut_ptr(),
1017 ptr.cast(),
1018 ))?;
1019 // Safety: FFI call successful, attr_ffi is initialized.
1020 Ok(attr_ffi.assume_init().into())
1021 }
1022 }
1023
1024 pub unsafe fn alloc_async(count: usize, stream: &Stream) -> Result<*mut T> {
1025 let Some(bytes) = count.checked_mul(size_of::<T>()) else {
1026 return Err(Error::InvalidMemoryAllocationRequest);
1027 };
1028 if bytes == 0 {
1029 return Ok(ptr::null_mut());
1030 }
1031 let mut p = ptr::null_mut();
1032 unsafe {
1033 try_ffi!(runtime::cudaMallocAsync(
1034 &raw mut p,
1035 bytes as _,
1036 stream.as_raw()
1037 ))?;
1038 }
1039 Ok(p.cast::<T>())
1040 }
1041
1042 /// Inserts a free operation into `stream`.
1043 /// The allocation must not be accessed after stream execution reaches the free.
1044 /// After this call returns, accessing the memory from any subsequent work launched on the GPU or querying its pointer attributes results in undefined behavior.
1045 ///
1046 /// During stream capture, this creates a free node and must therefore be passed the address of a graph allocation.
1047 ///
1048 /// # Errors
1049 ///
1050 /// Returns an error if CUDA cannot enqueue the free on `stream`, `ptr` is
1051 /// invalid for asynchronous freeing, a previous asynchronous launch reports
1052 /// an error, or CUDA reports runtime initialization diagnostics.
1053 ///
1054 /// # Safety
1055 ///
1056 /// `ptr` must be null or a live stream-ordered CUDA allocation. No work may
1057 /// access it after `stream` reaches the enqueued free.
1058 pub unsafe fn free_async(ptr: *mut T, stream: &Stream) -> Result<()> {
1059 if ptr.is_null() {
1060 return Ok(());
1061 }
1062 unsafe { try_ffi!(runtime::cudaFreeAsync(ptr.cast(), stream.as_raw())) }
1063 }
1064
1065 pub unsafe fn copy_async(
1066 dst: *mut T,
1067 src: *const T,
1068 count: usize,
1069 kind: MemoryCopyKind,
1070 stream: &Stream,
1071 ) -> Result<()> {
1072 if count == 0 {
1073 return Ok(());
1074 }
1075 let Some(bytes) = count.checked_mul(size_of::<T>()) else {
1076 return Err(Error::InvalidMemoryAllocationRequest);
1077 };
1078 unsafe {
1079 try_ffi!(runtime::cudaMemcpyAsync(
1080 dst.cast(),
1081 src.cast(),
1082 bytes as _,
1083 kind.into(),
1084 stream.as_raw(),
1085 ))?;
1086 }
1087 Ok(())
1088 }
1089
1090 /// Fills the first `count` bytes of the memory area pointed to by `ptr` with the constant byte `value`.
1091 ///
1092 /// [`DeviceMemory::set_async`] is asynchronous with respect to the host, so the call may return before the memset is complete.
1093 /// The operation can optionally be associated with a stream by passing a non-zero stream argument.
1094 /// If `stream` is non-zero, the operation may overlap with operations in other streams.
1095 ///
1096 /// The device version only handles device-to-device copies and cannot be given local or shared pointers.
1097 ///
1098 /// See the CUDA memset synchronization rules for when this operation blocks
1099 /// the host.
1100 ///
1101 /// # Errors
1102 ///
1103 /// Returns an error if the requested byte count overflows, CUDA cannot
1104 /// enqueue the memset on `stream`, a previous asynchronous launch reports an
1105 /// error, or CUDA reports runtime initialization diagnostics.
1106 ///
1107 /// # Safety
1108 ///
1109 /// `dst` must be valid for writes of `count * size_of::<T>()` bytes until
1110 /// `stream` reaches the enqueued memset.
1111 pub unsafe fn set_async(dst: *mut T, value: u8, count: usize, stream: &Stream) -> Result<()> {
1112 if count == 0 {
1113 return Ok(());
1114 }
1115 let Some(bytes) = count.checked_mul(size_of::<T>()) else {
1116 return Err(Error::InvalidMemoryAllocationRequest);
1117 };
1118 unsafe {
1119 try_ffi!(runtime::cudaMemsetAsync(
1120 dst.cast(),
1121 value.into(),
1122 bytes as _,
1123 stream.as_raw(),
1124 ))?;
1125 }
1126 Ok(())
1127 }
1128
1129 /// Prefetches memory to the specified destination location.
1130 /// `ptr` is the base device pointer of the memory to be prefetched, `location` specifies the destination location, `count` specifies the number of bytes to copy, and `stream` is the stream in which the operation is enqueued.
1131 /// The memory range must refer to managed memory allocated via [`DeviceMemory::alloc_managed`] or declared via `__managed__` variables. It may also refer to memory allocated from a managed memory pool, or to system-allocated memory on systems where [`DeviceProperties::pageable_memory_access`](crate::device::DeviceProperties::pageable_memory_access) is enabled.
1132 ///
1133 /// Setting [`MemoryLocation::kind`](crate::memory::MemoryLocation::kind) to [`MemoryLocationKind::Device`] prefetches memory to the GPU identified by [`MemoryLocation::id`](crate::memory::MemoryLocation::id). That device, and the device associated with `stream`, must support concurrent managed access.
1134 /// Setting [`MemoryLocation::kind`](crate::memory::MemoryLocation::kind) to [`MemoryLocationKind::Host`] prefetches data to host memory.
1135 /// Applications can request prefetching memory to a specific host NUMA node by using [`MemoryLocationKind::Numa`] with a valid NUMA node identifier, or to the NUMA node closest to the current thread's CPU by using [`MemoryLocationKind::NumaCurrent`].
1136 /// When [`MemoryLocation::kind`](crate::memory::MemoryLocation::kind) is [`MemoryLocationKind::Host`] or [`MemoryLocationKind::NumaCurrent`], [`MemoryLocation::id`](crate::memory::MemoryLocation::id) is ignored.
1137 ///
1138 /// The start and end addresses of the memory range are rounded down and up, respectively, to CPU page-size alignment before the prefetch operation is enqueued in the stream.
1139 ///
1140 /// If no physical memory has been allocated for this region, CUDA populates and maps it on the destination device.
1141 /// If there is insufficient memory to prefetch the desired region, the Unified Memory driver may evict pages from other [`DeviceMemory::alloc_managed`] allocations to host memory to make room.
1142 /// Device memory allocated using [`DeviceMemory::alloc`] or [`sys::cudaMallocArray`](singe_cuda_sys::runtime::cudaMallocArray) is not evicted.
1143 ///
1144 /// By default, mappings to the previous location of the migrated pages are removed and mappings for the new location are only set up at the destination.
1145 /// The exact behavior also depends on the settings applied to this memory range via `cuMemAdvise` as described below:
1146 ///
1147 /// If read-mostly advice was set on any subset of this memory range, then that subset will create a read-only copy of the pages at the destination location.
1148 /// If the destination location is a host NUMA node, any pages of that subset that are already in another host NUMA node are transferred to the destination.
1149 ///
1150 /// If preferred-location advice was set on any subset of this memory range, then the pages will migrate to `location` even if it is not the preferred location of every page in the range.
1151 ///
1152 /// If accessed-by advice was set on any subset of this memory range, then mappings to those pages from all appropriate processors are updated to refer to the new location if establishing such a mapping is possible.
1153 /// Otherwise, those mappings are cleared.
1154 ///
1155 /// This is not required for correctness; it improves performance by allowing the application to migrate data to a suitable location before access.
1156 /// Memory accesses to this range are always coherent and are allowed even when the data is actively being migrated.
1157 ///
1158 /// This call is asynchronous with respect to the host and all work on other devices.
1159 ///
1160 /// # Errors
1161 ///
1162 /// Returns an error if CUDA cannot enqueue the prefetch on `stream`, the
1163 /// memory range or destination location is invalid, a previous asynchronous
1164 /// launch reports an error, or CUDA reports runtime initialization
1165 /// diagnostics.
1166 pub fn prefetch_async(
1167 ptr: DevicePtr,
1168 count: usize,
1169 location: MemoryLocation,
1170 stream: &Stream,
1171 ) -> Result<()> {
1172 if count == 0 {
1173 return Ok(());
1174 }
1175 unsafe {
1176 try_ffi!(runtime::cudaMemPrefetchAsync(
1177 ptr.as_ptr() as _,
1178 count as _,
1179 location.into(),
1180 0, // flags
1181 stream.as_raw()
1182 ))?;
1183 }
1184 Ok(())
1185 }
1186}
1187
1188// Safety: DeviceMemory owns a CUDA allocation and does not provide CPU
1189// references into the allocation. Moving the owner to another thread is safe
1190// when `T: Send` because CUDA allocation/free operations are synchronized by
1191// the CUDA context APIs used by this wrapper. Shared references are safe when
1192// `T: Sync` because `&DeviceMemory<T>` only exposes raw device pointers and
1193// read-only metadata; mutation of device memory still requires CUDA operations
1194// whose safety contracts are enforced by the calling APIs.
1195unsafe impl<T: Send> Send for DeviceMemory<T> {}
1196unsafe impl<T: Sync> Sync for DeviceMemory<T> {}
1197
1198// Safety: managed memory has the same Rust-side ownership properties as
1199// DeviceMemory. CUDA maintains coherence for managed allocations across host
1200// threads and devices; Rust references into the allocation are not exposed by
1201// this wrapper.
1202unsafe impl<T: DeviceRepr + Send> Send for ManagedMemory<T> {}
1203unsafe impl<T: DeviceRepr + Sync> Sync for ManagedMemory<T> {}
1204
1205impl<T> DeviceMemory<T> {
1206 /// Takes ownership of an existing device allocation.
1207 ///
1208 /// # Safety
1209 ///
1210 /// `ptr` must be null for an empty allocation or point to `length` live
1211 /// elements allocated by `cudaMallocManaged` or another CUDA allocation
1212 /// function compatible with `cudaFree`. `length * size_of::<T>()` must fit
1213 /// in `usize`.
1214 /// No other owner may free the pointer while the returned value is alive.
1215 pub unsafe fn from_raw_parts(ptr: *mut T, length: usize) -> Self {
1216 Self {
1217 ptr,
1218 length,
1219 _phantom: PhantomData,
1220 }
1221 }
1222
1223 pub fn into_raw_parts(self) -> (*mut T, usize) {
1224 let ptr = self.ptr;
1225 let length = self.length;
1226 mem::forget(self);
1227 (ptr, length)
1228 }
1229
1230 pub fn create(length: usize) -> Result<Self> {
1231 let size_t = size_of::<T>();
1232
1233 if size_t == 0 {
1234 if length == 0 {
1235 return Ok(Self {
1236 ptr: ptr::null_mut(), // No allocation needed for ZSTs with count 0
1237 length: 0,
1238 _phantom: PhantomData,
1239 });
1240 }
1241 return Err(Error::InvalidMemoryAllocationRequest);
1242 }
1243
1244 // Ensure allocation size doesn't overflow usize when calculating bytes internally in `alloc`.
1245 if length > (usize::MAX / size_t) {
1246 return Err(Error::InvalidMemoryAllocationRequest);
1247 }
1248
1249 if length == 0 {
1250 Ok(Self {
1251 ptr: ptr::null_mut(),
1252 length: 0,
1253 _phantom: PhantomData,
1254 })
1255 } else {
1256 let device_ptr = unsafe { Self::alloc(length)? };
1257
1258 Ok(Self {
1259 ptr: device_ptr,
1260 length,
1261 _phantom: PhantomData,
1262 })
1263 }
1264 }
1265
1266 pub fn zeroes(length: usize) -> Result<Self> {
1267 let mut mem = Self::create(length)?;
1268 mem.set_zeroes()?;
1269 Ok(mem)
1270 }
1271
1272 pub fn from_slice(v: &[T]) -> Result<Self> {
1273 let mut mem = Self::create(v.len())?;
1274 mem.copy_from_host(v)?;
1275 Ok(mem)
1276 }
1277
1278 /// # Safety
1279 ///
1280 /// The caller must ensure `v` remains valid and unmodified until `stream`
1281 /// has completed the transfer.
1282 ///
1283 /// # Errors
1284 ///
1285 /// Returns an error if CUDA cannot allocate device memory or enqueue the
1286 /// host-to-device copy.
1287 pub unsafe fn from_slice_async(v: &[T], stream: &Stream) -> Result<Self> {
1288 let mut mem = Self::create(v.len())?;
1289 unsafe {
1290 mem.copy_from_host_async_unchecked(v, stream)?;
1291 }
1292 Ok(mem)
1293 }
1294
1295 pub const fn len(&self) -> usize {
1296 self.length
1297 }
1298
1299 pub const fn is_empty(&self) -> bool {
1300 self.length == 0
1301 }
1302
1303 pub fn byte_len(&self) -> usize {
1304 self.length
1305 .checked_mul(size_of::<T>())
1306 .expect("device memory byte length overflow")
1307 }
1308
1309 pub const fn as_ptr(&self) -> *const T {
1310 self.ptr
1311 }
1312
1313 pub const fn as_mut_ptr(&self) -> *mut T {
1314 self.ptr
1315 }
1316
1317 pub fn copy_from_host(&mut self, host_slice: &[T]) -> Result<()> {
1318 if host_slice.len() != self.length {
1319 return Err(Error::InvalidMemoryAccess);
1320 }
1321 if self.length == 0 {
1322 return Ok(());
1323 }
1324 unsafe {
1325 Self::copy(
1326 self.ptr,
1327 host_slice.as_ptr(),
1328 self.length,
1329 MemoryCopyKind::HostToDevice,
1330 )
1331 }
1332 }
1333
1334 pub fn copy_from_host_async<'scope, 'env>(
1335 &mut self,
1336 host_slice: &'env [T],
1337 stream: &StreamScope<'scope, 'env>,
1338 ) -> Result<()> {
1339 unsafe { self.copy_from_host_async_unchecked(host_slice, stream.stream()) }
1340 }
1341
1342 /// # Safety
1343 ///
1344 /// The caller must ensure `self` and `host_slice` both remain valid until
1345 /// `stream` has completed the transfer.
1346 pub unsafe fn copy_from_host_async_unchecked(
1347 &mut self,
1348 host_slice: &[T],
1349 stream: &Stream,
1350 ) -> Result<()> {
1351 if host_slice.len() != self.len() {
1352 return Err(Error::InvalidMemoryAccess);
1353 }
1354 if self.is_empty() {
1355 return Ok(());
1356 }
1357 unsafe {
1358 Self::copy_async(
1359 self.as_mut_ptr(),
1360 host_slice.as_ptr(),
1361 self.len(),
1362 MemoryCopyKind::HostToDevice,
1363 stream,
1364 )
1365 }
1366 }
1367
1368 /// Returns a capture operation that copies from host memory into this device allocation.
1369 ///
1370 /// # Safety
1371 ///
1372 /// Capturing this operation stores the host and device pointer addresses in
1373 /// the resulting CUDA graph. The caller must ensure `self` and `host_slice`
1374 /// remain valid whenever a captured graph using this operation is launched.
1375 /// The destination allocation must remain exclusive for the work ordered by
1376 /// those launches.
1377 pub unsafe fn copy_from_host_operation<'a>(
1378 &'a mut self,
1379 host_slice: &'a [T],
1380 ) -> Result<MemoryCopyOperation<'a, T>> {
1381 if host_slice.len() != self.len() {
1382 return Err(Error::InvalidMemoryAccess);
1383 }
1384 Ok(unsafe {
1385 MemoryCopyOperation::new(
1386 self.as_mut_ptr(),
1387 host_slice.as_ptr(),
1388 self.len(),
1389 MemoryCopyKind::HostToDevice,
1390 )
1391 })
1392 }
1393
1394 pub fn copy_to_host(&self, host_slice: &mut [T]) -> Result<()> {
1395 if host_slice.len() != self.length {
1396 return Err(Error::InvalidMemoryAccess);
1397 }
1398 if self.length == 0 {
1399 return Ok(());
1400 }
1401 unsafe {
1402 Self::copy(
1403 host_slice.as_mut_ptr(),
1404 self.ptr,
1405 self.length,
1406 MemoryCopyKind::DeviceToHost,
1407 )
1408 }
1409 }
1410
1411 pub fn copy_to_host_async<'scope, 'env>(
1412 &self,
1413 host_slice: &'env mut [T],
1414 stream: &StreamScope<'scope, 'env>,
1415 ) -> Result<()> {
1416 unsafe { self.copy_to_host_async_unchecked(host_slice, stream.stream()) }
1417 }
1418
1419 /// # Safety
1420 ///
1421 /// The caller must ensure `self` and `host_slice` both remain valid until
1422 /// `stream` has completed the transfer.
1423 pub unsafe fn copy_to_host_async_unchecked(
1424 &self,
1425 host_slice: &mut [T],
1426 stream: &Stream,
1427 ) -> Result<()> {
1428 if host_slice.len() != self.len() {
1429 return Err(Error::InvalidMemoryAccess);
1430 }
1431 if self.is_empty() {
1432 return Ok(());
1433 }
1434 unsafe {
1435 Self::copy_async(
1436 host_slice.as_mut_ptr(),
1437 self.as_ptr(),
1438 self.len(),
1439 MemoryCopyKind::DeviceToHost,
1440 stream,
1441 )
1442 }
1443 }
1444
1445 /// Returns a capture operation that copies this allocation into host memory.
1446 ///
1447 /// # Safety
1448 ///
1449 /// Capturing this operation stores the device and host pointer addresses in
1450 /// the resulting CUDA graph. The caller must ensure `self` and `host_slice`
1451 /// remain valid whenever a captured graph using this operation is launched.
1452 /// The host destination must remain exclusive for the work ordered by those
1453 /// launches.
1454 pub unsafe fn copy_to_host_operation<'a>(
1455 &'a self,
1456 host_slice: &'a mut [T],
1457 ) -> Result<MemoryCopyOperation<'a, T>> {
1458 if host_slice.len() != self.len() {
1459 return Err(Error::InvalidMemoryAccess);
1460 }
1461 Ok(unsafe {
1462 MemoryCopyOperation::new(
1463 host_slice.as_mut_ptr(),
1464 self.as_ptr(),
1465 self.len(),
1466 MemoryCopyKind::DeviceToHost,
1467 )
1468 })
1469 }
1470
1471 pub fn copy_to_host_vec(&self) -> Result<Vec<T>> {
1472 if size_of::<T>() == 0 {
1473 return Err(Error::InvalidMemoryAllocationRequest);
1474 }
1475
1476 if self.length == 0 {
1477 return Ok(Vec::new());
1478 }
1479
1480 let mut host_vec = Vec::<T>::with_capacity(self.length);
1481
1482 unsafe {
1483 Self::copy(
1484 host_vec.as_mut_ptr(),
1485 self.ptr,
1486 self.length,
1487 MemoryCopyKind::DeviceToHost,
1488 )?;
1489
1490 host_vec.set_len(self.length);
1491 }
1492
1493 Ok(host_vec)
1494 }
1495
1496 pub fn copy_from_device(&mut self, src: &Self) -> Result<()> {
1497 if src.len() != self.length {
1498 return Err(Error::InvalidMemoryAccess);
1499 }
1500 if self.length == 0 {
1501 return Ok(());
1502 }
1503 unsafe {
1504 Self::copy(
1505 self.ptr,
1506 src.as_ptr(),
1507 self.length,
1508 MemoryCopyKind::DeviceToDevice,
1509 )
1510 }
1511 }
1512
1513 pub fn copy_from_device_async<'scope, 'env>(
1514 &mut self,
1515 src: &Self,
1516 stream: &StreamScope<'scope, 'env>,
1517 ) -> Result<()> {
1518 unsafe { self.copy_from_device_async_unchecked(src, stream.stream()) }
1519 }
1520
1521 /// # Safety
1522 ///
1523 /// The caller must ensure `self` and `src` both remain valid until
1524 /// `stream` has completed the transfer.
1525 pub unsafe fn copy_from_device_async_unchecked(
1526 &mut self,
1527 src: &Self,
1528 stream: &Stream,
1529 ) -> Result<()> {
1530 if src.len() != self.len() {
1531 return Err(Error::InvalidMemoryAccess);
1532 }
1533 if self.is_empty() {
1534 return Ok(());
1535 }
1536 unsafe {
1537 Self::copy_async(
1538 self.as_mut_ptr(),
1539 src.as_ptr(),
1540 self.len(),
1541 MemoryCopyKind::DeviceToDevice,
1542 stream,
1543 )
1544 }
1545 }
1546
1547 /// Returns a capture operation that copies from another device allocation into this allocation.
1548 ///
1549 /// # Safety
1550 ///
1551 /// Capturing this operation stores both device pointer addresses in the
1552 /// resulting CUDA graph. The caller must ensure `self` and `src` remain
1553 /// valid whenever a captured graph using this operation is launched. The
1554 /// destination allocation must remain exclusive for the work ordered by
1555 /// those launches.
1556 pub unsafe fn copy_from_device_operation<'a>(
1557 &'a mut self,
1558 src: &'a Self,
1559 ) -> Result<MemoryCopyOperation<'a, T>> {
1560 if src.len() != self.len() {
1561 return Err(Error::InvalidMemoryAccess);
1562 }
1563 Ok(unsafe {
1564 MemoryCopyOperation::new(
1565 self.as_mut_ptr(),
1566 src.as_ptr(),
1567 self.len(),
1568 MemoryCopyKind::DeviceToDevice,
1569 )
1570 })
1571 }
1572
1573 pub fn set_zeroes(&mut self) -> Result<()> {
1574 if self.length == 0 {
1575 return Ok(());
1576 }
1577 unsafe { Self::set(self.ptr, 0, self.length) }
1578 }
1579
1580 pub fn set_value(&mut self, value: u8) -> Result<()> {
1581 if self.length == 0 {
1582 return Ok(());
1583 }
1584 unsafe { Self::set(self.ptr, value, self.length) }
1585 }
1586
1587 pub fn set_value_async<'scope, 'env>(
1588 &mut self,
1589 value: u8,
1590 stream: &StreamScope<'scope, 'env>,
1591 ) -> Result<()> {
1592 unsafe { self.set_value_async_unchecked(value, stream.stream()) }
1593 }
1594
1595 /// # Safety
1596 ///
1597 /// The caller must ensure `self` remains valid until `stream` has
1598 /// completed the memset.
1599 ///
1600 /// # Errors
1601 ///
1602 /// Returns an error if CUDA cannot enqueue the memset on `stream`.
1603 pub unsafe fn set_value_async_unchecked(&mut self, value: u8, stream: &Stream) -> Result<()> {
1604 if self.is_empty() {
1605 return Ok(());
1606 }
1607 unsafe { Self::set_async(self.as_mut_ptr(), value, self.len(), stream) }
1608 }
1609
1610 /// Returns a capture operation that fills this device allocation with `value`.
1611 ///
1612 /// # Safety
1613 ///
1614 /// Capturing this operation stores this allocation's pointer address in the
1615 /// resulting CUDA graph. The caller must ensure `self` remains valid and
1616 /// exclusive whenever a captured graph using this operation is launched.
1617 pub unsafe fn set_value_operation<'a>(&'a mut self, value: u8) -> MemorySetOperation<'a, T> {
1618 unsafe { MemorySetOperation::new(self.as_mut_ptr(), value, self.len()) }
1619 }
1620
1621 /// Takes a pointer to the base of an existing device memory allocation created with [`DeviceMemory::alloc`] and exports it for use in another process.
1622 /// This is a lightweight operation and may be called multiple times on an allocation without adverse effects.
1623 ///
1624 /// If a region of memory is freed with [`DeviceMemory::free`] and a subsequent call to [`DeviceMemory::alloc`] returns memory with the same device address, [`DeviceMemory::ipc_handle`] returns a unique handle for the new memory.
1625 ///
1626 /// IPC is restricted to devices with unified-addressing support on Linux and Windows.
1627 /// IPC on Windows is supported for compatibility but is not recommended because of its performance cost.
1628 /// Check device IPC support through the device properties exposed by this crate, for example [`DeviceProperties::ipc_event_supported`](crate::device::DeviceProperties::ipc_event_supported).
1629 ///
1630 /// # Errors
1631 ///
1632 /// Returns an error if the allocation is empty, CUDA cannot export an IPC
1633 /// handle for the allocation, or CUDA reports runtime initialization
1634 /// diagnostics.
1635 pub fn ipc_handle(&self) -> Result<IpcMemoryHandle> {
1636 if self.is_empty() {
1637 // Cannot get handle for null pointer / zero size? Check docs.
1638 return Err(Error::InvalidMemoryAccess);
1639 }
1640 let mut handle = MaybeUninit::uninit();
1641 unsafe {
1642 try_ffi!(runtime::cudaIpcGetMemHandle(
1643 handle.as_mut_ptr(),
1644 self.as_ptr().cast_mut().cast(),
1645 ))?;
1646 Ok(IpcMemoryHandle::from_raw(handle.assume_init()))
1647 }
1648 }
1649
1650 pub fn try_clone(&self) -> Result<Self> {
1651 if self.length == 0 || size_of::<T>() == 0 {
1652 return Ok(Self {
1653 ptr: ptr::null_mut(),
1654 length: self.length,
1655 _phantom: PhantomData,
1656 });
1657 }
1658
1659 let new_mem = Self::create(self.length)?;
1660
1661 unsafe {
1662 Self::copy(
1663 new_mem.as_mut_ptr(),
1664 self.as_ptr(),
1665 self.length,
1666 MemoryCopyKind::DeviceToDevice,
1667 )?;
1668 }
1669
1670 Ok(new_mem)
1671 }
1672}
1673
1674impl<T> Clone for DeviceMemory<T> {
1675 fn clone(&self) -> Self {
1676 match self.try_clone() {
1677 Ok(new_mem) => new_mem,
1678 Err(err) => {
1679 #[cfg(debug_assertions)]
1680 eprintln!("device memory clone failed: {err}");
1681 Self {
1682 ptr: ptr::null_mut(),
1683 length: 0,
1684 _phantom: PhantomData,
1685 }
1686 }
1687 }
1688 }
1689}
1690
1691impl<T> Drop for DeviceMemory<T> {
1692 fn drop(&mut self) {
1693 if self.ptr.is_null() {
1694 return;
1695 }
1696
1697 // debug_assert!(
1698 // unsafe { free(self.ptr) }.is_ok(),
1699 // "failed to free device memory at {:#x}",
1700 // self.ptr as usize
1701 // );
1702 if let Err(err) = unsafe { Self::free(self.ptr) } {
1703 #[cfg(debug_assertions)]
1704 eprintln!("failed to free device memory: {err}");
1705 return;
1706 }
1707
1708 self.ptr = ptr::null_mut();
1709 self.length = 0;
1710 }
1711}
1712
1713impl<T: DeviceRepr> ManagedMemory<T> {
1714 /// Allocates typed CUDA managed memory and records its initial attach mode.
1715 ///
1716 /// # Errors
1717 ///
1718 /// Returns an error if the requested byte size overflows, CUDA cannot
1719 /// allocate managed memory, or a non-empty zero-sized allocation is
1720 /// requested.
1721 pub fn create(length: usize, attach_flags: MemoryAttachFlags) -> Result<Self> {
1722 if size_of::<T>() == 0 {
1723 return if length == 0 {
1724 Ok(Self {
1725 ptr: ptr::null_mut(),
1726 length,
1727 attach_flags,
1728 _phantom: PhantomData,
1729 })
1730 } else {
1731 Err(Error::InvalidMemoryAllocationRequest)
1732 };
1733 }
1734
1735 let ptr = unsafe { DeviceMemory::<T>::alloc_managed(length, attach_flags)? };
1736 Ok(Self {
1737 ptr,
1738 length,
1739 attach_flags,
1740 _phantom: PhantomData,
1741 })
1742 }
1743
1744 pub fn zeroes(length: usize, attach_flags: MemoryAttachFlags) -> Result<Self>
1745 where
1746 T: ZeroableDeviceRepr,
1747 {
1748 let mut memory = Self::create(length, attach_flags)?;
1749 memory.set_zeroes()?;
1750 Ok(memory)
1751 }
1752
1753 /// Takes ownership of an existing managed allocation.
1754 ///
1755 /// # Safety
1756 ///
1757 /// `ptr` must be null for an empty allocation or point to `length` live
1758 /// elements allocated by a CUDA allocation function compatible with
1759 /// [`DeviceMemory::free`]. `length * size_of::<T>()` must fit in `usize`.
1760 /// No other owner may free the pointer while the returned value is alive.
1761 pub unsafe fn from_raw_parts(
1762 ptr: *mut T,
1763 length: usize,
1764 attach_flags: MemoryAttachFlags,
1765 ) -> Self {
1766 Self {
1767 ptr,
1768 length,
1769 attach_flags,
1770 _phantom: PhantomData,
1771 }
1772 }
1773
1774 pub fn into_raw_parts(self) -> (*mut T, usize, MemoryAttachFlags) {
1775 let ptr = self.ptr;
1776 let length = self.length;
1777 let attach_flags = self.attach_flags;
1778 mem::forget(self);
1779 (ptr, length, attach_flags)
1780 }
1781
1782 pub const fn len(&self) -> usize {
1783 self.length
1784 }
1785
1786 pub const fn is_empty(&self) -> bool {
1787 self.length == 0
1788 }
1789
1790 pub fn byte_len(&self) -> usize {
1791 self.length
1792 .checked_mul(size_of::<T>())
1793 .expect("managed memory byte length overflow")
1794 }
1795
1796 pub const fn attach_flags(&self) -> MemoryAttachFlags {
1797 self.attach_flags
1798 }
1799
1800 pub const fn as_ptr(&self) -> *const T {
1801 self.ptr
1802 }
1803
1804 pub const fn as_mut_ptr(&mut self) -> *mut T {
1805 self.ptr
1806 }
1807
1808 pub fn view(&self) -> DeviceView<'_, T> {
1809 // The ManagedMemory owner guarantees the pointer remains live for the
1810 // borrowed view lifetime.
1811 unsafe { DeviceView::from_raw_parts(self.ptr, self.length) }
1812 }
1813
1814 pub fn view_mut(&mut self) -> DeviceViewMut<'_, T> {
1815 // &mut self guarantees unique access to the represented range.
1816 unsafe { DeviceViewMut::from_raw_parts(self.ptr, self.length) }
1817 }
1818
1819 /// Returns a host slice over this managed allocation.
1820 ///
1821 /// # Safety
1822 ///
1823 /// The caller must ensure no GPU work or other CPU reference can
1824 /// concurrently mutate the same memory for the returned lifetime, and that
1825 /// the allocation is accessible from the host at the point of access.
1826 pub unsafe fn as_host_slice(&self) -> &[T] {
1827 if self.is_empty() {
1828 return &[];
1829 }
1830 unsafe { slice::from_raw_parts(self.ptr, self.length) }
1831 }
1832
1833 /// Returns a mutable host slice over this managed allocation.
1834 ///
1835 /// # Safety
1836 ///
1837 /// The caller must ensure no GPU work or other CPU reference can
1838 /// concurrently access the same memory for the returned lifetime, and that
1839 /// the allocation is accessible from the host at the point of access.
1840 pub unsafe fn as_host_slice_mut(&mut self) -> &mut [T] {
1841 if self.is_empty() {
1842 return &mut [];
1843 }
1844 unsafe { slice::from_raw_parts_mut(self.ptr, self.length) }
1845 }
1846
1847 pub fn set_zeroes(&mut self) -> Result<()>
1848 where
1849 T: ZeroableDeviceRepr,
1850 {
1851 if self.is_empty() {
1852 return Ok(());
1853 }
1854 unsafe { DeviceMemory::<T>::set(self.ptr, 0, self.length) }
1855 }
1856
1857 pub fn prefetch_to(&self, location: MemoryLocation, stream: &Stream) -> Result<()> {
1858 DeviceMemory::<T>::prefetch_async(
1859 unsafe { DevicePtr::from_raw(self.ptr.cast::<()>()) },
1860 self.byte_len(),
1861 location,
1862 stream,
1863 )
1864 }
1865
1866 pub fn attach_to_stream(&mut self, stream: &Stream, flags: MemoryAttachFlags) -> Result<()> {
1867 stream.context().bind()?;
1868 unsafe {
1869 try_ffi!(runtime::cudaStreamAttachMemAsync(
1870 stream.as_raw(),
1871 self.ptr.cast(),
1872 self.byte_len() as _,
1873 flags.bits(),
1874 ))?;
1875 }
1876 self.attach_flags = flags;
1877 Ok(())
1878 }
1879}
1880
1881impl<T: DeviceRepr> DeviceSlice<T> for ManagedMemory<T> {
1882 fn as_device_ptr(&self) -> *const T {
1883 self.ptr
1884 }
1885
1886 fn len(&self) -> usize {
1887 self.length
1888 }
1889}
1890
1891impl<T: DeviceRepr> DeviceSliceMut<T> for ManagedMemory<T> {
1892 fn as_device_mut_ptr(&mut self) -> *mut T {
1893 self.ptr
1894 }
1895}
1896
1897impl<T: DeviceRepr> Drop for ManagedMemory<T> {
1898 fn drop(&mut self) {
1899 if self.ptr.is_null() {
1900 return;
1901 }
1902
1903 if let Err(err) = unsafe { DeviceMemory::<T>::free(self.ptr) } {
1904 #[cfg(debug_assertions)]
1905 eprintln!("failed to free managed memory: {err}");
1906 return;
1907 }
1908
1909 self.ptr = ptr::null_mut();
1910 self.length = 0;
1911 }
1912}
1913
1914#[cfg(all(test, feature = "testing"))]
1915mod tests {
1916 use super::*;
1917 use crate::testing;
1918
1919 #[test]
1920 fn it_works() -> Result<()> {
1921 unsafe {
1922 let host_in = [1, 2, 3];
1923
1924 let device_ptr = DeviceMemory::alloc(3)?;
1925
1926 DeviceMemory::copy(
1927 device_ptr,
1928 host_in.as_ptr(),
1929 3,
1930 MemoryCopyKind::HostToDevice,
1931 )?;
1932 let mut host_out = [0, 0, 0];
1933 DeviceMemory::copy(
1934 host_out.as_mut_ptr(),
1935 device_ptr,
1936 3,
1937 MemoryCopyKind::DeviceToHost,
1938 )?;
1939 assert_eq!(host_out, host_in);
1940
1941 DeviceMemory::free(device_ptr)?;
1942 }
1943 Ok(())
1944 }
1945
1946 #[test]
1947 fn test_scoped_async_copy_round_trip() -> Result<()> {
1948 let (_lock, ctx) = testing::bootstrap()?;
1949 let stream = ctx.create_stream()?;
1950
1951 let host_in = [4_i32, 5, 6];
1952 let mut device = DeviceMemory::create(host_in.len())?;
1953 let mut host_out = [0_i32; 3];
1954
1955 stream.sync_scope(|scope| {
1956 device.copy_from_host_async(&host_in, scope)?;
1957 device.copy_to_host_async(&mut host_out, scope)
1958 })?;
1959
1960 assert_eq!(host_out, host_in);
1961
1962 Ok(())
1963 }
1964
1965 #[test]
1966 fn managed_memory_tracks_metadata_and_views() -> Result<()> {
1967 let mut backing = [1_u32, 2, 3, 4];
1968 let mut managed = unsafe {
1969 ManagedMemory::from_raw_parts(
1970 backing.as_mut_ptr(),
1971 backing.len(),
1972 MemoryAttachFlags::HOST,
1973 )
1974 };
1975
1976 assert_eq!(managed.len(), backing.len());
1977 assert_eq!(managed.byte_len(), backing.len() * size_of::<u32>());
1978 assert_eq!(managed.attach_flags(), MemoryAttachFlags::HOST);
1979 assert_eq!(managed.view().len(), backing.len());
1980 assert_eq!(managed.view_mut().len(), backing.len());
1981
1982 unsafe {
1983 assert_eq!(managed.as_host_slice(), &[1, 2, 3, 4]);
1984 managed.as_host_slice_mut()[2] = 9;
1985 }
1986 assert_eq!(backing[2], 9);
1987
1988 let (ptr, length, flags) = managed.into_raw_parts();
1989 assert_eq!(ptr, backing.as_mut_ptr());
1990 assert_eq!(length, backing.len());
1991 assert_eq!(flags, MemoryAttachFlags::HOST);
1992
1993 Ok(())
1994 }
1995}