oxicuda_memory/peer_copy.rs
1//! Peer-to-peer (P2P) memory copy operations for multi-GPU workloads.
2//!
3//! This module provides functions to check, enable, and disable peer access
4//! between CUDA devices, as well as copy data between device buffers on
5//! different GPUs.
6//!
7//! Peer access enables direct GPU-to-GPU memory transfers over PCIe or
8//! NVLink without staging through host memory, significantly improving
9//! transfer bandwidth in multi-GPU configurations.
10//!
11//! # Platform note
12//!
//! The underlying `cuDeviceCanAccessPeer`, `cuCtxEnablePeerAccess`,
//! `cuCtxDisablePeerAccess`, `cuMemcpyPeer`, and `cuMemcpyPeerAsync` driver
//! functions are not yet loaded by `oxicuda-driver`. Until they are, every
//! function in this module is a placeholder that returns
//! [`CudaError::NotSupported`]; the copy functions validate buffer lengths
//! first and return [`CudaError::InvalidValue`] on a mismatch. The API
//! surface is established here so that downstream crates can program
//! against it.
18//!
19//! # Example
20//!
21//! ```rust,no_run
22//! use oxicuda_driver::device::Device;
23//! use oxicuda_memory::peer_copy;
24//!
25//! oxicuda_driver::init()?;
26//! let dev0 = Device::get(0)?;
27//! let dev1 = Device::get(1)?;
28//!
29//! if peer_copy::can_access_peer(&dev0, &dev1)? {
30//! peer_copy::enable_peer_access(&dev0, &dev1)?;
31//! // Now D2D copies between dev0 and dev1 can go over NVLink/PCIe
32//! // peer_copy::copy_peer(&mut dst_buf, &dev1, &src_buf, &dev0)?;
33//! }
34//! # Ok::<(), oxicuda_driver::error::CudaError>(())
35//! ```
36
37use oxicuda_driver::device::Device;
38use oxicuda_driver::error::{CudaError, CudaResult};
39use oxicuda_driver::stream::Stream;
40
41use crate::device_buffer::DeviceBuffer;
42
43/// Checks whether `device` can directly access memory on `peer`.
44///
45/// Returns `true` if peer access is supported between the two devices
46/// (e.g., over NVLink or PCIe). Returns `false` if the devices are the
47/// same or if the hardware topology does not support peer access.
48///
49/// # Errors
50///
51/// Currently returns [`CudaError::NotSupported`] because the underlying
52/// driver function pointer (`cuDeviceCanAccessPeer`) is not yet loaded.
53pub fn can_access_peer(_device: &Device, _peer: &Device) -> CudaResult<bool> {
54 // TODO: load `cuDeviceCanAccessPeer` in DriverApi and call it here.
55 // let api = oxicuda_driver::loader::try_driver()?;
56 // let mut can_access: i32 = 0;
57 // oxicuda_driver::check(unsafe {
58 // (api.cu_device_can_access_peer)(&mut can_access, device.raw(), peer.raw())
59 // })?;
60 // Ok(can_access != 0)
61 Err(CudaError::NotSupported)
62}
63
64/// Enables peer access from the current context's device to `peer`.
65///
66/// After enabling, memory on `peer` can be directly accessed from
67/// kernels and copy operations in the current context.
68///
69/// # Errors
70///
71/// * [`CudaError::PeerAccessAlreadyEnabled`] if peer access is already
72/// enabled.
73/// * [`CudaError::PeerAccessUnsupported`] if the hardware does not
74/// support peer access.
75/// * [`CudaError::NotSupported`] (current stub).
76pub fn enable_peer_access(_device: &Device, _peer: &Device) -> CudaResult<()> {
77 // TODO: load `cuCtxEnablePeerAccess` in DriverApi and call it.
78 // Need to set the device's context as current first, then enable
79 // access to peer's context.
80 Err(CudaError::NotSupported)
81}
82
83/// Disables peer access from the current context's device to `peer`.
84///
85/// # Errors
86///
87/// * [`CudaError::PeerAccessNotEnabled`] if peer access was not enabled.
88/// * [`CudaError::NotSupported`] (current stub).
89pub fn disable_peer_access(_device: &Device, _peer: &Device) -> CudaResult<()> {
90 // TODO: load `cuCtxDisablePeerAccess` in DriverApi and call it.
91 Err(CudaError::NotSupported)
92}
93
94/// Copies data between device buffers on different GPUs (synchronous).
95///
96/// Both buffers must have the same length. Peer access should be enabled
97/// between the source and destination devices before calling this function.
98///
99/// # Errors
100///
101/// * [`CudaError::InvalidValue`] if buffer lengths do not match.
102/// * [`CudaError::PeerAccessNotEnabled`] if peer access has not been
103/// enabled.
104/// * [`CudaError::NotSupported`] (current stub).
105pub fn copy_peer<T: Copy>(
106 dst: &mut DeviceBuffer<T>,
107 _dst_device: &Device,
108 src: &DeviceBuffer<T>,
109 _src_device: &Device,
110) -> CudaResult<()> {
111 if dst.len() != src.len() {
112 return Err(CudaError::InvalidValue);
113 }
114 // TODO: load `cuMemcpyPeer` in DriverApi and call it.
115 // let byte_size = src.byte_size();
116 // let api = oxicuda_driver::loader::try_driver()?;
117 // oxicuda_driver::check(unsafe {
118 // (api.cu_memcpy_peer)(
119 // dst.as_device_ptr(), dst_ctx, src.as_device_ptr(), src_ctx, byte_size
120 // )
121 // })
122 Err(CudaError::NotSupported)
123}
124
125/// Copies data between device buffers on different GPUs (asynchronous).
126///
127/// The copy is enqueued on `stream` and may not be complete when this
128/// function returns. Both buffers must have the same length.
129///
130/// # Errors
131///
132/// * [`CudaError::InvalidValue`] if buffer lengths do not match.
133/// * [`CudaError::NotSupported`] (current stub).
134pub fn copy_peer_async<T: Copy>(
135 dst: &mut DeviceBuffer<T>,
136 _dst_device: &Device,
137 src: &DeviceBuffer<T>,
138 _src_device: &Device,
139 _stream: &Stream,
140) -> CudaResult<()> {
141 if dst.len() != src.len() {
142 return Err(CudaError::InvalidValue);
143 }
144 // TODO: load `cuMemcpyPeerAsync` in DriverApi and call it.
145 Err(CudaError::NotSupported)
146}
147
148// ---------------------------------------------------------------------------
149// Tests
150// ---------------------------------------------------------------------------
151
#[cfg(test)]
mod tests {
    use super::*;

    // The first five tests never construct a `Device`, so none of the stubs
    // is actually invoked (the original note explains that devices cannot be
    // enumerated on macOS). Coercing each function to an explicitly typed
    // function pointer only pins its public signature at compile time.

    #[test]
    fn can_access_peer_returns_not_supported() {
        let _: fn(&Device, &Device) -> CudaResult<bool> = can_access_peer;
    }

    #[test]
    fn enable_peer_access_returns_not_supported() {
        let _: fn(&Device, &Device) -> CudaResult<()> = enable_peer_access;
    }

    #[test]
    fn disable_peer_access_returns_not_supported() {
        let _: fn(&Device, &Device) -> CudaResult<()> = disable_peer_access;
    }

    #[test]
    fn copy_peer_signature_compiles() {
        // The pointer type fixes the generic parameter to `f32`.
        let _: fn(&mut DeviceBuffer<f32>, &Device, &DeviceBuffer<f32>, &Device) -> CudaResult<()> =
            copy_peer;
    }

    #[test]
    #[allow(clippy::type_complexity)]
    fn copy_peer_async_signature_compiles() {
        // The pointer type fixes the generic parameter to `f32`.
        let _: fn(
            &mut DeviceBuffer<f32>,
            &Device,
            &DeviceBuffer<f32>,
            &Device,
            &Stream,
        ) -> CudaResult<()> = copy_peer_async;
    }

    /// Smoke test that only runs with the `gpu-tests` feature; it does
    /// nothing unless at least two devices are reported.
    #[cfg(feature = "gpu-tests")]
    #[test]
    fn peer_access_with_real_devices() {
        // Best effort: an initialisation failure is deliberately ignored, and
        // a failed device count is treated as "no devices".
        oxicuda_driver::init().ok();
        let device_count = Device::count().unwrap_or(0);
        if device_count < 2 {
            return;
        }

        let dev0 = Device::get(0).expect("device 0");
        let dev1 = Device::get(1).expect("device 1");
        // The outcome depends on the hardware, so it is intentionally discarded.
        let _ = can_access_peer(&dev0, &dev1);
    }
}
205}