oxicuda_memory/
peer_copy.rs

1//! Peer-to-peer (P2P) memory copy operations for multi-GPU workloads.
2//!
3//! This module provides functions to check, enable, and disable peer access
4//! between CUDA devices, as well as copy data between device buffers on
5//! different GPUs.
6//!
7//! Peer access enables direct GPU-to-GPU memory transfers over PCIe or
8//! NVLink without staging through host memory, significantly improving
9//! transfer bandwidth in multi-GPU configurations.
10//!
11//! # Platform note
12//!
//! The underlying `cuDeviceCanAccessPeer`, `cuCtxEnablePeerAccess`,
//! `cuCtxDisablePeerAccess`, `cuMemcpyPeer`, and `cuMemcpyPeerAsync` driver
//! functions are not yet loaded by `oxicuda-driver`.  Until they are, every
//! function in this module returns [`CudaError::NotSupported`] as a
//! placeholder (the copy functions first report mismatched buffer lengths
//! as [`CudaError::InvalidValue`]).  The API surface is established here so
//! that downstream crates can program against it.
18//!
19//! # Example
20//!
21//! ```rust,no_run
22//! use oxicuda_driver::device::Device;
23//! use oxicuda_memory::peer_copy;
24//!
25//! oxicuda_driver::init()?;
26//! let dev0 = Device::get(0)?;
27//! let dev1 = Device::get(1)?;
28//!
29//! if peer_copy::can_access_peer(&dev0, &dev1)? {
30//!     peer_copy::enable_peer_access(&dev0, &dev1)?;
31//!     // Now D2D copies between dev0 and dev1 can go over NVLink/PCIe
32//!     // peer_copy::copy_peer(&mut dst_buf, &dev1, &src_buf, &dev0)?;
33//! }
34//! # Ok::<(), oxicuda_driver::error::CudaError>(())
35//! ```
36
37use oxicuda_driver::device::Device;
38use oxicuda_driver::error::{CudaError, CudaResult};
39use oxicuda_driver::stream::Stream;
40
41use crate::device_buffer::DeviceBuffer;
42
43/// Checks whether `device` can directly access memory on `peer`.
44///
45/// Returns `true` if peer access is supported between the two devices
46/// (e.g., over NVLink or PCIe).  Returns `false` if the devices are the
47/// same or if the hardware topology does not support peer access.
48///
49/// # Errors
50///
51/// Currently returns [`CudaError::NotSupported`] because the underlying
52/// driver function pointer (`cuDeviceCanAccessPeer`) is not yet loaded.
53pub fn can_access_peer(_device: &Device, _peer: &Device) -> CudaResult<bool> {
54    // TODO: load `cuDeviceCanAccessPeer` in DriverApi and call it here.
55    // let api = oxicuda_driver::loader::try_driver()?;
56    // let mut can_access: i32 = 0;
57    // oxicuda_driver::check(unsafe {
58    //     (api.cu_device_can_access_peer)(&mut can_access, device.raw(), peer.raw())
59    // })?;
60    // Ok(can_access != 0)
61    Err(CudaError::NotSupported)
62}
63
64/// Enables peer access from the current context's device to `peer`.
65///
66/// After enabling, memory on `peer` can be directly accessed from
67/// kernels and copy operations in the current context.
68///
69/// # Errors
70///
71/// * [`CudaError::PeerAccessAlreadyEnabled`] if peer access is already
72///   enabled.
73/// * [`CudaError::PeerAccessUnsupported`] if the hardware does not
74///   support peer access.
75/// * [`CudaError::NotSupported`] (current stub).
76pub fn enable_peer_access(_device: &Device, _peer: &Device) -> CudaResult<()> {
77    // TODO: load `cuCtxEnablePeerAccess` in DriverApi and call it.
78    // Need to set the device's context as current first, then enable
79    // access to peer's context.
80    Err(CudaError::NotSupported)
81}
82
83/// Disables peer access from the current context's device to `peer`.
84///
85/// # Errors
86///
87/// * [`CudaError::PeerAccessNotEnabled`] if peer access was not enabled.
88/// * [`CudaError::NotSupported`] (current stub).
89pub fn disable_peer_access(_device: &Device, _peer: &Device) -> CudaResult<()> {
90    // TODO: load `cuCtxDisablePeerAccess` in DriverApi and call it.
91    Err(CudaError::NotSupported)
92}
93
94/// Copies data between device buffers on different GPUs (synchronous).
95///
96/// Both buffers must have the same length.  Peer access should be enabled
97/// between the source and destination devices before calling this function.
98///
99/// # Errors
100///
101/// * [`CudaError::InvalidValue`] if buffer lengths do not match.
102/// * [`CudaError::PeerAccessNotEnabled`] if peer access has not been
103///   enabled.
104/// * [`CudaError::NotSupported`] (current stub).
105pub fn copy_peer<T: Copy>(
106    dst: &mut DeviceBuffer<T>,
107    _dst_device: &Device,
108    src: &DeviceBuffer<T>,
109    _src_device: &Device,
110) -> CudaResult<()> {
111    if dst.len() != src.len() {
112        return Err(CudaError::InvalidValue);
113    }
114    // TODO: load `cuMemcpyPeer` in DriverApi and call it.
115    // let byte_size = src.byte_size();
116    // let api = oxicuda_driver::loader::try_driver()?;
117    // oxicuda_driver::check(unsafe {
118    //     (api.cu_memcpy_peer)(
119    //         dst.as_device_ptr(), dst_ctx, src.as_device_ptr(), src_ctx, byte_size
120    //     )
121    // })
122    Err(CudaError::NotSupported)
123}
124
125/// Copies data between device buffers on different GPUs (asynchronous).
126///
127/// The copy is enqueued on `stream` and may not be complete when this
128/// function returns.  Both buffers must have the same length.
129///
130/// # Errors
131///
132/// * [`CudaError::InvalidValue`] if buffer lengths do not match.
133/// * [`CudaError::NotSupported`] (current stub).
134pub fn copy_peer_async<T: Copy>(
135    dst: &mut DeviceBuffer<T>,
136    _dst_device: &Device,
137    src: &DeviceBuffer<T>,
138    _src_device: &Device,
139    _stream: &Stream,
140) -> CudaResult<()> {
141    if dst.len() != src.len() {
142        return Err(CudaError::InvalidValue);
143    }
144    // TODO: load `cuMemcpyPeerAsync` in DriverApi and call it.
145    Err(CudaError::NotSupported)
146}
147
148// ---------------------------------------------------------------------------
149// Tests
150// ---------------------------------------------------------------------------
151
#[cfg(test)]
mod tests {
    use super::*;

    // The tests below cannot obtain real `Device` values on machines
    // without CUDA hardware (e.g. macOS), where `Device::get` fails.  They
    // therefore only pin each public function to its expected signature by
    // coercing it to a function pointer; the stubbed bodies are not run.

    /// Signature of [`can_access_peer`].
    type PeerQueryFn = fn(&Device, &Device) -> CudaResult<bool>;

    /// Signature shared by [`enable_peer_access`] and [`disable_peer_access`].
    type PeerToggleFn = fn(&Device, &Device) -> CudaResult<()>;

    /// Signature of [`copy_peer`] instantiated for `f32` elements.
    type CopyPeerF32Fn =
        fn(&mut DeviceBuffer<f32>, &Device, &DeviceBuffer<f32>, &Device) -> CudaResult<()>;

    /// Signature of [`copy_peer_async`] instantiated for `f32` elements.
    type CopyPeerAsyncF32Fn = fn(
        &mut DeviceBuffer<f32>,
        &Device,
        &DeviceBuffer<f32>,
        &Device,
        &Stream,
    ) -> CudaResult<()>;

    #[test]
    fn can_access_peer_returns_not_supported() {
        let _f: PeerQueryFn = can_access_peer;
    }

    #[test]
    fn enable_peer_access_returns_not_supported() {
        let _f: PeerToggleFn = enable_peer_access;
    }

    #[test]
    fn disable_peer_access_returns_not_supported() {
        let _f: PeerToggleFn = disable_peer_access;
    }

    #[test]
    fn copy_peer_signature_compiles() {
        let _f: CopyPeerF32Fn = copy_peer;
    }

    #[test]
    fn copy_peer_async_signature_compiles() {
        let _f: CopyPeerAsyncF32Fn = copy_peer_async;
    }

    #[cfg(feature = "gpu-tests")]
    #[test]
    fn peer_access_with_real_devices() {
        // Initialisation failure is tolerated: the device count below then
        // falls back to zero and the test becomes a no-op.
        let _ = oxicuda_driver::init();
        let device_count = Device::count().unwrap_or(0);
        if device_count < 2 {
            // Peer access needs at least two GPUs; nothing to exercise.
            return;
        }

        let dev0 = Device::get(0).expect("device 0");
        let dev1 = Device::get(1).expect("device 1");
        // These may or may not succeed depending on hardware.
        let _ = can_access_peer(&dev0, &dev1);
    }
}
oxicuda_memory/peer_copy.rs

oxicuda_memory/
peer_copy.rs