oxicuda-memory 0.1.8

OxiCUDA Memory - Type-safe GPU memory management with Rust ownership semantics
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
//! Explicit memory copy operations between host and device.
//!
//! This module provides freestanding functions for copying data between
//! host memory, device memory, and pinned host memory.  Each function
//! validates the element counts of its source and destination (equal
//! lengths for whole-buffer copies, in-bounds sub-ranges for the
//! `*_region_async` copies) before issuing the underlying CUDA driver call.
//!
//! For simple cases, the methods on [`DeviceBuffer`]
//! (e.g. [`DeviceBuffer::copy_from_host`]) are more
//! ergonomic.  These freestanding functions are useful when you want to be
//! explicit about the direction of the transfer or when working with
//! [`PinnedBuffer`] for async operations.
//!
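//! # Length validation
//!
//! Whole-buffer copy functions return [`CudaError::InvalidValue`] if the
//! element counts of source and destination do not match.  The
//! `*_region_async` functions return the same error if the requested
//! sub-range does not fit within both buffers.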

use std::ffi::c_void;

use oxicuda_driver::error::{CudaError, CudaResult};
use oxicuda_driver::loader::try_driver;
use oxicuda_driver::stream::Stream;

use crate::device_buffer::DeviceBuffer;
use crate::host_buffer::PinnedBuffer;

// ---------------------------------------------------------------------------
// Synchronous copies
// ---------------------------------------------------------------------------

/// Performs a blocking host-to-device transfer of `src` into `dst`.
///
/// The calling thread is held until the driver reports that the transfer
/// has finished (or failed).
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] when `src` and `dst` do not hold the same
///   number of elements.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyHtoD_v2`.
pub fn copy_htod<T: Copy>(dst: &mut DeviceBuffer<T>, src: &[T]) -> CudaResult<()> {
    // Reject mismatched element counts before touching the driver.
    if dst.len() != src.len() {
        return Err(CudaError::InvalidValue);
    }
    let num_bytes = dst.byte_size();
    let driver = try_driver()?;
    let device_ptr = dst.as_device_ptr();
    let host_ptr = src.as_ptr().cast::<c_void>();
    // SAFETY: `host_ptr` comes from a live host slice, `device_ptr` from the
    // device allocation owned by `dst`, and both sides hold the same number
    // of elements, so `num_bytes` is in range for each.
    let status = unsafe { (driver.cu_memcpy_htod_v2)(device_ptr, host_ptr, num_bytes) };
    oxicuda_driver::check(status)
}

/// Performs a blocking device-to-host transfer of `src` into `dst`.
///
/// The calling thread is held until the driver reports that the transfer
/// has finished (or failed).
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] when `dst` and `src` do not hold the same
///   number of elements.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyDtoH_v2`.
pub fn copy_dtoh<T: Copy>(dst: &mut [T], src: &DeviceBuffer<T>) -> CudaResult<()> {
    // Reject mismatched element counts before touching the driver.
    if src.len() != dst.len() {
        return Err(CudaError::InvalidValue);
    }
    let num_bytes = src.byte_size();
    let driver = try_driver()?;
    let host_ptr = dst.as_mut_ptr().cast::<c_void>();
    let device_ptr = src.as_device_ptr();
    // SAFETY: `host_ptr` comes from a live, exclusively borrowed host slice,
    // `device_ptr` from the device allocation owned by `src`, and both sides
    // hold the same number of elements, so `num_bytes` is in range for each.
    let status = unsafe { (driver.cu_memcpy_dtoh_v2)(host_ptr, device_ptr, num_bytes) };
    oxicuda_driver::check(status)
}

/// Performs a blocking device-to-device transfer of `src` into `dst`.
///
/// The call returns only after the driver reports that the copy has
/// finished (or failed).
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] when `dst` and `src` do not hold the same
///   number of elements.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyDtoD_v2`.
pub fn copy_dtod<T: Copy>(dst: &mut DeviceBuffer<T>, src: &DeviceBuffer<T>) -> CudaResult<()> {
    // Reject mismatched element counts before touching the driver.
    if src.len() != dst.len() {
        return Err(CudaError::InvalidValue);
    }
    let num_bytes = src.byte_size();
    let driver = try_driver()?;
    let (to, from) = (dst.as_device_ptr(), src.as_device_ptr());
    // SAFETY: `to` and `from` each refer to a device allocation owned by the
    // respective buffer, and the two buffers have the same element count.
    let status = unsafe { (driver.cu_memcpy_dtod_v2)(to, from, num_bytes) };
    oxicuda_driver::check(status)
}

// ---------------------------------------------------------------------------
// Asynchronous copies (raw slice and device-to-device variants)
// ---------------------------------------------------------------------------

/// Enqueues a host-to-device copy of `src` into `dst` on `stream`.
///
/// The transfer may still be in flight when this function returns.  The
/// caller is responsible for keeping `src` alive and in place (not moved,
/// not dropped) until `stream` has been synchronised.  When DMA correctness
/// must be guaranteed, prefer a [`PinnedBuffer`] source.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] when `src` and `dst` do not hold the same
///   number of elements.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyHtoDAsync_v2`.
pub fn copy_htod_async_raw<T: Copy>(
    dst: &mut DeviceBuffer<T>,
    src: &[T],
    stream: &Stream,
) -> CudaResult<()> {
    // Reject mismatched element counts before touching the driver.
    if dst.len() != src.len() {
        return Err(CudaError::InvalidValue);
    }
    let num_bytes = dst.byte_size();
    let driver = try_driver()?;
    let device_ptr = dst.as_device_ptr();
    let host_ptr = src.as_ptr().cast::<c_void>();
    // SAFETY: at the time of the call `host_ptr` refers to a live host slice
    // and `device_ptr` to the allocation owned by `dst`, with equal element
    // counts.  Keeping `src` valid until the stream is synchronised is the
    // caller's obligation, as stated in the function documentation.
    let status = unsafe {
        (driver.cu_memcpy_htod_async_v2)(device_ptr, host_ptr, num_bytes, stream.raw())
    };
    oxicuda_driver::check(status)
}

/// Enqueues a device-to-host copy of `src` into `dst` on `stream`.
///
/// The transfer may still be in flight when this function returns.  The
/// caller is responsible for keeping `dst` valid, and for not reading it,
/// until `stream` has been synchronised.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] when `dst` and `src` do not hold the same
///   number of elements.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyDtoHAsync_v2`.
pub fn copy_dtoh_async_raw<T: Copy>(
    dst: &mut [T],
    src: &DeviceBuffer<T>,
    stream: &Stream,
) -> CudaResult<()> {
    // Reject mismatched element counts before touching the driver.
    if src.len() != dst.len() {
        return Err(CudaError::InvalidValue);
    }
    let num_bytes = src.byte_size();
    let driver = try_driver()?;
    let host_ptr = dst.as_mut_ptr().cast::<c_void>();
    let device_ptr = src.as_device_ptr();
    // SAFETY: at the time of the call `host_ptr` refers to a live,
    // exclusively borrowed host slice and `device_ptr` to the allocation
    // owned by `src`, with equal element counts.  Keeping `dst` valid and
    // unread until the stream is synchronised is the caller's obligation,
    // as stated in the function documentation.
    let status = unsafe {
        (driver.cu_memcpy_dtoh_async_v2)(host_ptr, device_ptr, num_bytes, stream.raw())
    };
    oxicuda_driver::check(status)
}

/// Copies data from one device buffer to another on behalf of `stream`.
///
/// Both buffers must have the same length.
///
/// # Current behaviour
///
/// Despite its name, this function does **not** enqueue any work on
/// `stream`: the `stream` argument is accepted for API symmetry with the
/// other `*_async` functions but is currently ignored, and the transfer is
/// carried out by the blocking [`copy_dtod`].  The copy has therefore
/// completed (or failed) by the time this function returns.  Callers that
/// need the copy ordered after work previously queued on `stream` must
/// arrange that ordering themselves.
///
/// NOTE(review): the CUDA Driver API documents an asynchronous
/// device-to-device entry point (`cuMemcpyDtoDAsync`).  Once the driver
/// loader exposes it, this function should call it with `stream.raw()`
/// instead of delegating to the synchronous copy — confirm against the
/// loader's function table.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] if `dst.len() != src.len()`.
/// * Any other error returned by [`copy_dtod`].
pub fn copy_dtod_async<T: Copy>(
    dst: &mut DeviceBuffer<T>,
    src: &DeviceBuffer<T>,
    stream: &Stream,
) -> CudaResult<()> {
    // The stream is deliberately unused for now (see "Current behaviour");
    // binding it to `_` keeps the public signature stable without an
    // unused-variable warning.
    let _ = stream;
    // `copy_dtod` rejects mismatched lengths with `CudaError::InvalidValue`
    // before contacting the driver, so no separate check is needed here.
    copy_dtod(dst, src)
}

// ---------------------------------------------------------------------------
// Asynchronous copies (pinned buffer variants)
// ---------------------------------------------------------------------------

/// Enqueues a copy from a pinned host buffer into a device buffer on
/// `stream`.
///
/// The transfer may still be in flight when this function returns: until
/// `stream` has been synchronised the caller must neither modify `src` nor
/// read `dst`.
///
/// Taking a [`PinnedBuffer`] as the source means the host side of the
/// transfer is page-locked memory, which asynchronous DMA transfers need in
/// order to be correct.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] when `src` and `dst` do not hold the same
///   number of elements.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyHtoDAsync_v2`.
pub fn copy_htod_async<T: Copy>(
    dst: &mut DeviceBuffer<T>,
    src: &PinnedBuffer<T>,
    stream: &Stream,
) -> CudaResult<()> {
    // Reject mismatched element counts before touching the driver.
    if dst.len() != src.len() {
        return Err(CudaError::InvalidValue);
    }
    let num_bytes = dst.byte_size();
    let driver = try_driver()?;
    let device_ptr = dst.as_device_ptr();
    // SAFETY: the source is pinned host memory, the destination is the
    // device allocation owned by `dst`, the element counts are equal, and
    // the transfer is ordered by `stream`.
    let status = unsafe {
        let host_ptr = src.as_ptr().cast::<c_void>();
        (driver.cu_memcpy_htod_async_v2)(device_ptr, host_ptr, num_bytes, stream.raw())
    };
    oxicuda_driver::check(status)
}

/// Enqueues a copy from a device buffer into a pinned host buffer on
/// `stream`.
///
/// The transfer may still be in flight when this function returns: the
/// caller must not read `dst` until `stream` has been synchronised.
///
/// Taking a [`PinnedBuffer`] as the destination means the host side of the
/// transfer is page-locked memory, which asynchronous DMA transfers need in
/// order to be correct.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] when `dst` and `src` do not hold the same
///   number of elements.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyDtoHAsync_v2`.
pub fn copy_dtoh_async<T: Copy>(
    dst: &mut PinnedBuffer<T>,
    src: &DeviceBuffer<T>,
    stream: &Stream,
) -> CudaResult<()> {
    // Reject mismatched element counts before touching the driver.
    if src.len() != dst.len() {
        return Err(CudaError::InvalidValue);
    }
    let num_bytes = src.byte_size();
    let driver = try_driver()?;
    let device_ptr = src.as_device_ptr();
    // SAFETY: the destination is pinned host memory, the source is the
    // device allocation owned by `src`, the element counts are equal, and
    // the transfer is ordered by `stream`.
    let status = unsafe {
        let host_ptr = dst.as_mut_ptr().cast::<c_void>();
        (driver.cu_memcpy_dtoh_async_v2)(host_ptr, device_ptr, num_bytes, stream.raw())
    };
    oxicuda_driver::check(status)
}

// ---------------------------------------------------------------------------
// Asynchronous sub-region copies (pinned buffer staging)
// ---------------------------------------------------------------------------

/// Enqueues a copy of a contiguous slice of a device buffer into the front
/// of a pinned host buffer.
///
/// The `count` elements `src[src_offset..src_offset + count]` are written to
/// `dst[0..count]`; `dst` may be longer than `count` but not shorter.  A
/// `count` of zero succeeds without contacting the driver.
///
/// This is the device→host half of a host-staged inter-device transfer: a
/// slab slice is staged into pinned memory here and can then be pushed to
/// another device with [`copy_htod_region_async`].
///
/// The copy is enqueued on `stream`; synchronise the stream before reading
/// `dst`.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] if `src_offset + count` overflows or
///   exceeds `src.len()`, if `count` exceeds `dst.len()`, or if a byte
///   offset/length computation overflows.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyDtoHAsync_v2`.
pub fn copy_dtoh_region_async<T: Copy>(
    dst: &mut PinnedBuffer<T>,
    src: &DeviceBuffer<T>,
    src_offset: usize,
    count: usize,
    stream: &Stream,
) -> CudaResult<()> {
    // The requested window must lie inside `src` and fit into `dst`; an
    // overflowing end index is treated as out of bounds.
    let in_bounds = match src_offset.checked_add(count) {
        Some(end) => end <= src.len() && count <= dst.len(),
        None => false,
    };
    if !in_bounds {
        return Err(CudaError::InvalidValue);
    }
    // Nothing to transfer: succeed without loading the driver.
    if count == 0 {
        return Ok(());
    }

    let stride = std::mem::size_of::<T>();
    let num_bytes = count.checked_mul(stride).ok_or(CudaError::InvalidValue)?;
    let offset_bytes = src_offset
        .checked_mul(stride)
        .ok_or(CudaError::InvalidValue)?;

    let driver = try_driver()?;
    // Device addresses are offset in bytes; the `usize` byte offset is
    // widened to `u64` for the address arithmetic.
    let device_src = src.as_device_ptr() + offset_bytes as u64;
    // SAFETY: `dst` is pinned host memory with room for `count` elements,
    // the source window was checked to lie within `src`, and `num_bytes`
    // is exactly `count` elements.
    let status = unsafe {
        (driver.cu_memcpy_dtoh_async_v2)(
            dst.as_mut_ptr().cast::<c_void>(),
            device_src,
            num_bytes,
            stream.raw(),
        )
    };
    oxicuda_driver::check(status)
}

/// Enqueues a copy from the front of a pinned host buffer into a contiguous
/// slice of a device buffer.
///
/// The elements `src[0..count]` are written to
/// `dst[dst_offset..dst_offset + count]`; `src` may be longer than `count`
/// but not shorter.  A `count` of zero succeeds without contacting the
/// driver.
///
/// This is the host→device half of a host-staged inter-device transfer;
/// [`copy_dtoh_region_async`] is the device→host half.
///
/// The copy is enqueued on `stream`; synchronise the stream before reusing
/// `src`.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] if `dst_offset + count` overflows or
///   exceeds `dst.len()`, if `count` exceeds `src.len()`, or if a byte
///   offset/length computation overflows.
/// * Any error produced while obtaining the driver or returned by
///   `cuMemcpyHtoDAsync_v2`.
pub fn copy_htod_region_async<T: Copy>(
    dst: &mut DeviceBuffer<T>,
    dst_offset: usize,
    src: &PinnedBuffer<T>,
    count: usize,
    stream: &Stream,
) -> CudaResult<()> {
    // The target window must lie inside `dst` and be fully backed by `src`;
    // an overflowing end index is treated as out of bounds.
    let in_bounds = match dst_offset.checked_add(count) {
        Some(end) => end <= dst.len() && count <= src.len(),
        None => false,
    };
    if !in_bounds {
        return Err(CudaError::InvalidValue);
    }
    // Nothing to transfer: succeed without loading the driver.
    if count == 0 {
        return Ok(());
    }

    let stride = std::mem::size_of::<T>();
    let num_bytes = count.checked_mul(stride).ok_or(CudaError::InvalidValue)?;
    let offset_bytes = dst_offset
        .checked_mul(stride)
        .ok_or(CudaError::InvalidValue)?;

    let driver = try_driver()?;
    // Device addresses are offset in bytes; the `usize` byte offset is
    // widened to `u64` for the address arithmetic.
    let device_dst = dst.as_device_ptr() + offset_bytes as u64;
    // SAFETY: `src` is pinned host memory holding at least `count`
    // elements, the destination window was checked to lie within `dst`,
    // and `num_bytes` is exactly `count` elements.
    let status = unsafe {
        (driver.cu_memcpy_htod_async_v2)(
            device_dst,
            src.as_ptr().cast::<c_void>(),
            num_bytes,
            stream.raw(),
        )
    };
    oxicuda_driver::check(status)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

// These tests never call into the driver.  Each one coerces a public copy
// function to an explicit `fn` pointer type, so an accidental change to a
// public signature becomes a compile error in the test build.
#[cfg(test)]
mod tests {
    /// Signatures of the blocking host↔device copies.
    #[test]
    fn copy_htod_signature_compiles() {
        let _f: fn(&mut super::DeviceBuffer<f32>, &[f32]) -> super::CudaResult<()> =
            super::copy_htod;
        let _f2: fn(&mut [f32], &super::DeviceBuffer<f32>) -> super::CudaResult<()> =
            super::copy_dtoh;
    }

    /// Signature of the blocking device→device copy.
    #[test]
    fn copy_dtod_signature_compiles() {
        let _f: fn(
            &mut super::DeviceBuffer<f32>,
            &super::DeviceBuffer<f32>,
        ) -> super::CudaResult<()> = super::copy_dtod;
    }

    /// Signature of the raw-slice async host→device copy.
    #[test]
    fn async_raw_htod_signature_compiles() {
        let _f: fn(
            &mut super::DeviceBuffer<f32>,
            &[f32],
            &oxicuda_driver::stream::Stream,
        ) -> super::CudaResult<()> = super::copy_htod_async_raw;
    }

    /// Signature of the raw-slice async device→host copy.
    #[test]
    fn async_raw_dtoh_signature_compiles() {
        let _f: fn(
            &mut [f32],
            &super::DeviceBuffer<f32>,
            &oxicuda_driver::stream::Stream,
        ) -> super::CudaResult<()> = super::copy_dtoh_async_raw;
    }

    /// Signature of the stream-taking device→device copy.
    #[test]
    fn async_dtod_signature_compiles() {
        let _f: fn(
            &mut super::DeviceBuffer<f32>,
            &super::DeviceBuffer<f32>,
            &oxicuda_driver::stream::Stream,
        ) -> super::CudaResult<()> = super::copy_dtod_async;
    }

    /// Signature of the pinned-buffer async host→device copy.
    #[test]
    fn async_pinned_htod_signature_compiles() {
        let _f: fn(
            &mut super::DeviceBuffer<f32>,
            &super::PinnedBuffer<f32>,
            &oxicuda_driver::stream::Stream,
        ) -> super::CudaResult<()> = super::copy_htod_async;
    }

    /// Signature of the pinned-buffer async device→host copy.
    #[test]
    fn async_pinned_dtoh_signature_compiles() {
        let _f: fn(
            &mut super::PinnedBuffer<f32>,
            &super::DeviceBuffer<f32>,
            &oxicuda_driver::stream::Stream,
        ) -> super::CudaResult<()> = super::copy_dtoh_async;
    }

    /// Signature of the async device→pinned sub-region copy.
    #[test]
    fn region_dtoh_signature_compiles() {
        type RegionDtohFn = fn(
            &mut super::PinnedBuffer<f32>,
            &super::DeviceBuffer<f32>,
            usize,
            usize,
            &oxicuda_driver::stream::Stream,
        ) -> super::CudaResult<()>;
        let _f: RegionDtohFn = super::copy_dtoh_region_async;
    }

    /// Signature of the async pinned→device sub-region copy.
    #[test]
    fn region_htod_signature_compiles() {
        type RegionHtodFn = fn(
            &mut super::DeviceBuffer<f32>,
            usize,
            &super::PinnedBuffer<f32>,
            usize,
            &oxicuda_driver::stream::Stream,
        ) -> super::CudaResult<()>;
        let _f: RegionHtodFn = super::copy_htod_region_async;
    }
}