Skip to main content

oxicuda_memory/
copy_2d3d.rs

1//! 2D and 3D memory copy operations for pitched and volumetric data.
2//!
3//! GPU memory is often allocated as "pitched" 2D arrays where each row
4//! has padding bytes to satisfy alignment requirements. The standard
5//! 1D copy functions cannot handle this row padding — they would copy
6//! the padding bytes as if they were data.
7//!
8//! This module provides:
9//!
10//! * [`Memcpy2DParams`] — parameters for 2D (row-padded) copies.
11//! * [`Memcpy3DParams`] — parameters for 3D (volumetric, doubly-padded)
12//!   copies.
13//! * Copy functions for host-to-device, device-to-host, and
14//!   device-to-device transfers in 2D and 3D.
15//!
16//! # Pitch vs Width
17//!
18//! * **pitch** — total bytes per row including alignment padding.
19//! * **width** — bytes of actual data per row to copy.
20//!
21//! The pitch must be >= width for both source and destination.
22//!
23//! # Status
24//!
25//! The CUDA driver function `cuMemcpy2D_v2` is now wired through
26//! `oxicuda-driver`.  All 2D copy variants (DtoD, HtoD, DtoH) build a
27//! [`CUDA_MEMCPY2D`] descriptor and
28//! invoke the driver entry point when available.  `cuMemcpy3D_v2` is
29//! not yet loaded; the 3D copy still returns [`CudaError::NotSupported`]
30//! when a driver is present but the symbol is missing.
31//!
32//! # Example
33//!
34//! ```rust,no_run
35//! use oxicuda_memory::copy_2d3d::{Memcpy2DParams, copy_2d_dtod};
36//! use oxicuda_memory::DeviceBuffer;
37//!
38//! let params = Memcpy2DParams {
39//!     src_pitch: 512,
40//!     dst_pitch: 512,
41//!     width: 480,      // 480 bytes of data per row
42//!     height: 256,     // 256 rows
43//! };
44//!
45//! let mut dst = DeviceBuffer::<u8>::alloc(512 * 256)?;
46//! let src = DeviceBuffer::<u8>::alloc(512 * 256)?;
47//! copy_2d_dtod(&mut dst, &src, &params)?;
48//! # Ok::<(), oxicuda_driver::error::CudaError>(())
49//! ```
50
51use oxicuda_driver::error::{CudaError, CudaResult, check};
52use oxicuda_driver::ffi::{CUDA_MEMCPY2D, CUmemorytype};
53
54use crate::device_buffer::DeviceBuffer;
55
56// ---------------------------------------------------------------------------
57// Memcpy2DParams
58// ---------------------------------------------------------------------------
59
/// Describes the geometry of a pitched (row-padded) 2D copy.
///
/// Every row of a pitched allocation occupies `pitch` bytes, but only
/// the leading `width` bytes of a row carry payload. The trailing
/// `pitch - width` bytes of each row are alignment padding.
///
/// The source and destination pitches are tracked independently, so
/// the two sides of a transfer may use different row strides (for
/// example allocations obtained from separate `cuMemAllocPitch` calls,
/// or a transfer between host and device memory).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Memcpy2DParams {
    /// Source row stride in bytes, padding included.
    pub src_pitch: usize,
    /// Destination row stride in bytes, padding included.
    pub dst_pitch: usize,
    /// Payload bytes copied from each row.
    pub width: usize,
    /// Row count of the copied region.
    pub height: usize,
}
80
81impl Memcpy2DParams {
82    /// Creates new 2D copy parameters.
83    ///
84    /// # Parameters
85    ///
86    /// * `src_pitch` - Source bytes per row (including padding).
87    /// * `dst_pitch` - Destination bytes per row (including padding).
88    /// * `width` - Data bytes to copy per row.
89    /// * `height` - Number of rows.
90    pub fn new(src_pitch: usize, dst_pitch: usize, width: usize, height: usize) -> Self {
91        Self {
92            src_pitch,
93            dst_pitch,
94            width,
95            height,
96        }
97    }
98
99    /// Validates the parameters.
100    ///
101    /// Checks that width <= both pitches, and that all dimensions are non-zero.
102    ///
103    /// # Errors
104    ///
105    /// Returns [`CudaError::InvalidValue`] if any constraint is violated.
106    pub fn validate(&self) -> CudaResult<()> {
107        if self.width == 0 || self.height == 0 {
108            return Err(CudaError::InvalidValue);
109        }
110        if self.width > self.src_pitch {
111            return Err(CudaError::InvalidValue);
112        }
113        if self.width > self.dst_pitch {
114            return Err(CudaError::InvalidValue);
115        }
116        Ok(())
117    }
118
119    /// Returns the total bytes that would be read from the source.
120    ///
121    /// This is `(height - 1) * src_pitch + width` to account for the
122    /// fact that the last row does not need trailing padding.
123    pub fn src_byte_extent(&self) -> usize {
124        if self.height == 0 {
125            return 0;
126        }
127        self.height
128            .saturating_sub(1)
129            .saturating_mul(self.src_pitch)
130            .saturating_add(self.width)
131    }
132
133    /// Returns the total bytes that would be written to the destination.
134    pub fn dst_byte_extent(&self) -> usize {
135        if self.height == 0 {
136            return 0;
137        }
138        self.height
139            .saturating_sub(1)
140            .saturating_mul(self.dst_pitch)
141            .saturating_add(self.width)
142    }
143}
144
145impl std::fmt::Display for Memcpy2DParams {
146    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147        write!(
148            f,
149            "2D[{}x{}, src_pitch={}, dst_pitch={}]",
150            self.width, self.height, self.src_pitch, self.dst_pitch,
151        )
152    }
153}
154
155// ---------------------------------------------------------------------------
156// Memcpy3DParams
157// ---------------------------------------------------------------------------
158
/// Describes the geometry of a volumetric (3D) pitched copy.
///
/// This is the 2D pitched model with a depth axis added: the source
/// and destination are treated as stacks of 2D slices. Rows within a
/// slice are `pitch` bytes apart, and consecutive slices are
/// `pitch * allocation height` bytes apart, where the allocation
/// height is `src_height` or `dst_height` respectively.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Memcpy3DParams {
    /// Source row stride in bytes, padding included.
    pub src_pitch: usize,
    /// Destination row stride in bytes, padding included.
    pub dst_pitch: usize,
    /// Payload bytes copied from each row.
    pub width: usize,
    /// Rows copied from each slice.
    pub height: usize,
    /// Slices copied (extent along the depth axis).
    pub depth: usize,
    /// Rows per slice in the source allocation, padding rows included.
    /// Together with `src_pitch` this gives the byte stride between
    /// source slices.
    pub src_height: usize,
    /// Rows per slice in the destination allocation.
    pub dst_height: usize,
}
183
184impl Memcpy3DParams {
185    /// Creates new 3D copy parameters.
186    #[allow(clippy::too_many_arguments)]
187    pub fn new(
188        src_pitch: usize,
189        dst_pitch: usize,
190        width: usize,
191        height: usize,
192        depth: usize,
193        src_height: usize,
194        dst_height: usize,
195    ) -> Self {
196        Self {
197            src_pitch,
198            dst_pitch,
199            width,
200            height,
201            depth,
202            src_height,
203            dst_height,
204        }
205    }
206
207    /// Validates the parameters.
208    ///
209    /// Checks that width <= both pitches, height <= both allocation
210    /// heights, and all dimensions are non-zero.
211    ///
212    /// # Errors
213    ///
214    /// Returns [`CudaError::InvalidValue`] if any constraint is violated.
215    pub fn validate(&self) -> CudaResult<()> {
216        if self.width == 0 || self.height == 0 || self.depth == 0 {
217            return Err(CudaError::InvalidValue);
218        }
219        if self.width > self.src_pitch {
220            return Err(CudaError::InvalidValue);
221        }
222        if self.width > self.dst_pitch {
223            return Err(CudaError::InvalidValue);
224        }
225        if self.height > self.src_height {
226            return Err(CudaError::InvalidValue);
227        }
228        if self.height > self.dst_height {
229            return Err(CudaError::InvalidValue);
230        }
231        Ok(())
232    }
233
234    /// Returns the source byte stride between 2D slices.
235    pub fn src_slice_stride(&self) -> usize {
236        self.src_pitch.saturating_mul(self.src_height)
237    }
238
239    /// Returns the destination byte stride between 2D slices.
240    pub fn dst_slice_stride(&self) -> usize {
241        self.dst_pitch.saturating_mul(self.dst_height)
242    }
243
244    /// Returns the total source byte extent for the 3D region.
245    pub fn src_byte_extent(&self) -> usize {
246        if self.depth == 0 || self.height == 0 {
247            return 0;
248        }
249        let slice_stride = self.src_slice_stride();
250        self.depth
251            .saturating_sub(1)
252            .saturating_mul(slice_stride)
253            .saturating_add(
254                self.height
255                    .saturating_sub(1)
256                    .saturating_mul(self.src_pitch)
257                    .saturating_add(self.width),
258            )
259    }
260
261    /// Returns the total destination byte extent for the 3D region.
262    pub fn dst_byte_extent(&self) -> usize {
263        if self.depth == 0 || self.height == 0 {
264            return 0;
265        }
266        let slice_stride = self.dst_slice_stride();
267        self.depth
268            .saturating_sub(1)
269            .saturating_mul(slice_stride)
270            .saturating_add(
271                self.height
272                    .saturating_sub(1)
273                    .saturating_mul(self.dst_pitch)
274                    .saturating_add(self.width),
275            )
276    }
277}
278
279impl std::fmt::Display for Memcpy3DParams {
280    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
281        write!(
282            f,
283            "3D[{}x{}x{}, src_pitch={}, dst_pitch={}, src_h={}, dst_h={}]",
284            self.width,
285            self.height,
286            self.depth,
287            self.src_pitch,
288            self.dst_pitch,
289            self.src_height,
290            self.dst_height,
291        )
292    }
293}
294
295// ---------------------------------------------------------------------------
296// 2D copy functions
297// ---------------------------------------------------------------------------
298
299/// Validates that a device buffer is large enough for a 2D copy region.
300fn validate_2d_buffer_size<T: Copy>(buf: &DeviceBuffer<T>, byte_extent: usize) -> CudaResult<()> {
301    if buf.byte_size() < byte_extent {
302        return Err(CudaError::InvalidValue);
303    }
304    Ok(())
305}
306
307/// Validates that a host slice is large enough for a 2D copy region.
308fn validate_2d_slice_size<T: Copy>(slice: &[T], byte_extent: usize) -> CudaResult<()> {
309    let slice_bytes = slice.len().saturating_mul(std::mem::size_of::<T>());
310    if slice_bytes < byte_extent {
311        return Err(CudaError::InvalidValue);
312    }
313    Ok(())
314}
315
316/// Copies a 2D region between two device buffers (device-to-device).
317///
318/// Both source and destination must be large enough to contain the
319/// pitched region described by `params`.
320///
321/// # Errors
322///
323/// * [`CudaError::InvalidValue`] if parameters are invalid or buffers
324///   are too small.
325/// * [`CudaError::NotSupported`] because `cuMemcpy2D_v2` is not yet
326///   loaded (on platforms without the driver function).
327pub fn copy_2d_dtod<T: Copy>(
328    dst: &mut DeviceBuffer<T>,
329    src: &DeviceBuffer<T>,
330    params: &Memcpy2DParams,
331) -> CudaResult<()> {
332    params.validate()?;
333    validate_2d_buffer_size(src, params.src_byte_extent())?;
334    validate_2d_buffer_size(dst, params.dst_byte_extent())?;
335
336    let api = oxicuda_driver::loader::try_driver()?;
337    let f = api.cu_memcpy_2d.ok_or(CudaError::NotSupported)?;
338
339    let m = CUDA_MEMCPY2D {
340        src_memory_type: CUmemorytype::Device as u32,
341        src_device: src.as_device_ptr(),
342        src_pitch: params.src_pitch,
343        dst_memory_type: CUmemorytype::Device as u32,
344        dst_device: dst.as_device_ptr(),
345        dst_pitch: params.dst_pitch,
346        width_in_bytes: params.width,
347        height: params.height,
348        ..CUDA_MEMCPY2D::default()
349    };
350
351    check(unsafe { f(&m) })
352}
353
354/// Copies a 2D region from host memory to a device buffer.
355///
356/// The host slice must be large enough to contain the source-pitched
357/// region, and the device buffer must be large enough for the
358/// destination-pitched region.
359///
360/// # Errors
361///
362/// * [`CudaError::InvalidValue`] if parameters are invalid or
363///   buffers/slices are too small.
364/// * [`CudaError::NotSupported`] on platforms without `cuMemcpy2D_v2`.
365pub fn copy_2d_htod<T: Copy>(
366    dst: &mut DeviceBuffer<T>,
367    src: &[T],
368    params: &Memcpy2DParams,
369) -> CudaResult<()> {
370    params.validate()?;
371    validate_2d_slice_size(src, params.src_byte_extent())?;
372    validate_2d_buffer_size(dst, params.dst_byte_extent())?;
373
374    let api = oxicuda_driver::loader::try_driver()?;
375    let f = api.cu_memcpy_2d.ok_or(CudaError::NotSupported)?;
376
377    let m = CUDA_MEMCPY2D {
378        src_memory_type: CUmemorytype::Host as u32,
379        src_host: src.as_ptr().cast::<std::ffi::c_void>(),
380        src_pitch: params.src_pitch,
381        dst_memory_type: CUmemorytype::Device as u32,
382        dst_device: dst.as_device_ptr(),
383        dst_pitch: params.dst_pitch,
384        width_in_bytes: params.width,
385        height: params.height,
386        ..CUDA_MEMCPY2D::default()
387    };
388
389    check(unsafe { f(&m) })
390}
391
392/// Copies a 2D region from a device buffer to host memory.
393///
394/// The device buffer must be large enough to contain the source-pitched
395/// region, and the host slice must be large enough for the
396/// destination-pitched region.
397///
398/// # Errors
399///
400/// * [`CudaError::InvalidValue`] if parameters are invalid or
401///   buffers/slices are too small.
402/// * [`CudaError::NotSupported`] on platforms without `cuMemcpy2D_v2`.
403pub fn copy_2d_dtoh<T: Copy>(
404    dst: &mut [T],
405    src: &DeviceBuffer<T>,
406    params: &Memcpy2DParams,
407) -> CudaResult<()> {
408    params.validate()?;
409    validate_2d_buffer_size(src, params.src_byte_extent())?;
410    validate_2d_slice_size(dst, params.dst_byte_extent())?;
411
412    let api = oxicuda_driver::loader::try_driver()?;
413    let f = api.cu_memcpy_2d.ok_or(CudaError::NotSupported)?;
414
415    let m = CUDA_MEMCPY2D {
416        src_memory_type: CUmemorytype::Device as u32,
417        src_device: src.as_device_ptr(),
418        src_pitch: params.src_pitch,
419        dst_memory_type: CUmemorytype::Host as u32,
420        dst_host: dst.as_mut_ptr().cast::<std::ffi::c_void>(),
421        dst_pitch: params.dst_pitch,
422        width_in_bytes: params.width,
423        height: params.height,
424        ..CUDA_MEMCPY2D::default()
425    };
426
427    check(unsafe { f(&m) })
428}
429
430// ---------------------------------------------------------------------------
431// 3D copy functions
432// ---------------------------------------------------------------------------
433
434/// Validates that a device buffer is large enough for a 3D copy region.
435fn validate_3d_buffer_size<T: Copy>(buf: &DeviceBuffer<T>, byte_extent: usize) -> CudaResult<()> {
436    if buf.byte_size() < byte_extent {
437        return Err(CudaError::InvalidValue);
438    }
439    Ok(())
440}
441
442/// Copies a 3D region between two device buffers (device-to-device).
443///
444/// # Errors
445///
446/// * [`CudaError::InvalidValue`] if parameters are invalid or buffers
447///   are too small.
448/// * [`CudaError::NotSupported`] because `cuMemcpy3D_v2` is not yet loaded.
449pub fn copy_3d_dtod<T: Copy>(
450    dst: &mut DeviceBuffer<T>,
451    src: &DeviceBuffer<T>,
452    params: &Memcpy3DParams,
453) -> CudaResult<()> {
454    params.validate()?;
455    validate_3d_buffer_size(src, params.src_byte_extent())?;
456    validate_3d_buffer_size(dst, params.dst_byte_extent())?;
457
458    let _api = oxicuda_driver::loader::try_driver()?;
459    Ok(())
460}
461
462// ---------------------------------------------------------------------------
463// Tests
464// ---------------------------------------------------------------------------
465
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Memcpy2DParams ----

    /// `new` stores each argument in the field of the same name.
    #[test]
    fn params_2d_new() {
        let Memcpy2DParams {
            src_pitch,
            dst_pitch,
            width,
            height,
        } = Memcpy2DParams::new(512, 512, 480, 256);
        assert_eq!(src_pitch, 512);
        assert_eq!(dst_pitch, 512);
        assert_eq!(width, 480);
        assert_eq!(height, 256);
    }

    /// A region narrower than both pitches with non-zero size is accepted.
    #[test]
    fn params_2d_validate_ok() {
        let outcome = Memcpy2DParams::new(512, 512, 480, 256).validate();
        assert!(outcome.is_ok());
    }

    /// Zero-width regions are rejected.
    #[test]
    fn params_2d_validate_zero_width() {
        let outcome = Memcpy2DParams::new(512, 512, 0, 256).validate();
        assert_eq!(outcome, Err(CudaError::InvalidValue));
    }

    /// Zero-height regions are rejected.
    #[test]
    fn params_2d_validate_zero_height() {
        let outcome = Memcpy2DParams::new(512, 512, 480, 0).validate();
        assert_eq!(outcome, Err(CudaError::InvalidValue));
    }

    /// A width larger than the source pitch is rejected.
    #[test]
    fn params_2d_validate_width_exceeds_src_pitch() {
        let outcome = Memcpy2DParams::new(256, 512, 480, 100).validate();
        assert_eq!(outcome, Err(CudaError::InvalidValue));
    }

    /// A width larger than the destination pitch is rejected.
    #[test]
    fn params_2d_validate_width_exceeds_dst_pitch() {
        let outcome = Memcpy2DParams::new(512, 256, 480, 100).validate();
        assert_eq!(outcome, Err(CudaError::InvalidValue));
    }

    /// Extent is two full strides plus one payload row on each side.
    #[test]
    fn params_2d_byte_extent() {
        // 3 rows, width=480: source stride 512, destination stride 256.
        let region = Memcpy2DParams::new(512, 256, 480, 3);
        assert_eq!(region.src_byte_extent(), 2 * 512 + 480);
        assert_eq!(region.dst_byte_extent(), 2 * 256 + 480);
    }

    /// A single row spans exactly `width` bytes.
    #[test]
    fn params_2d_byte_extent_single_row() {
        let region = Memcpy2DParams::new(512, 512, 480, 1);
        assert_eq!(region.src_byte_extent(), 480);
        assert_eq!(region.dst_byte_extent(), 480);
    }

    /// With no rows the extent is zero on both sides.
    #[test]
    fn params_2d_byte_extent_zero_height() {
        let region = Memcpy2DParams::new(512, 512, 480, 0);
        assert_eq!(region.src_byte_extent(), 0);
        assert_eq!(region.dst_byte_extent(), 0);
    }

    /// The display form mentions the dimensions and both pitches.
    #[test]
    fn params_2d_display() {
        let rendered = Memcpy2DParams::new(512, 256, 480, 100).to_string();
        assert!(rendered.contains("480x100"));
        assert!(rendered.contains("src_pitch=512"));
        assert!(rendered.contains("dst_pitch=256"));
    }

    /// Two parameter sets built from the same values compare equal.
    #[test]
    fn params_2d_eq() {
        let lhs = Memcpy2DParams::new(512, 512, 480, 256);
        let rhs = Memcpy2DParams::new(512, 512, 480, 256);
        assert_eq!(lhs, rhs);
    }

    // ---- Memcpy3DParams ----

    /// `new` stores the depth and both allocation heights.
    #[test]
    fn params_3d_new() {
        let volume = Memcpy3DParams::new(512, 512, 480, 256, 10, 256, 256);
        assert_eq!(volume.depth, 10);
        assert_eq!(volume.src_height, 256);
        assert_eq!(volume.dst_height, 256);
    }

    /// A volume that fits both allocations is accepted.
    #[test]
    fn params_3d_validate_ok() {
        let outcome = Memcpy3DParams::new(512, 512, 480, 256, 10, 256, 256).validate();
        assert!(outcome.is_ok());
    }

    /// Zero-depth volumes are rejected.
    #[test]
    fn params_3d_validate_zero_depth() {
        let outcome = Memcpy3DParams::new(512, 512, 480, 256, 0, 256, 256).validate();
        assert_eq!(outcome, Err(CudaError::InvalidValue));
    }

    /// A height larger than the source allocation height is rejected.
    #[test]
    fn params_3d_validate_height_exceeds_src_height() {
        let outcome = Memcpy3DParams::new(512, 512, 480, 300, 10, 256, 300).validate();
        assert_eq!(outcome, Err(CudaError::InvalidValue));
    }

    /// A height larger than the destination allocation height is rejected.
    #[test]
    fn params_3d_validate_height_exceeds_dst_height() {
        let outcome = Memcpy3DParams::new(512, 512, 480, 300, 10, 300, 256).validate();
        assert_eq!(outcome, Err(CudaError::InvalidValue));
    }

    /// Slice stride is pitch times allocation height on each side.
    #[test]
    fn params_3d_slice_stride() {
        let volume = Memcpy3DParams::new(512, 256, 480, 100, 10, 128, 128);
        assert_eq!(volume.src_slice_stride(), 512 * 128);
        assert_eq!(volume.dst_slice_stride(), 256 * 128);
    }

    /// Extent is one slice stride plus the span of the last slice.
    #[test]
    fn params_3d_byte_extent() {
        // 2 slices of 3 rows each; src_pitch=512, width=480, src_height=4.
        // (2-1) * (512*4) + (3-1) * 512 + 480 = 2048 + 1024 + 480 = 3552
        let volume = Memcpy3DParams::new(512, 512, 480, 3, 2, 4, 4);
        assert_eq!(volume.src_byte_extent(), (512 * 4) + 2 * 512 + 480);
    }

    /// With one slice the extent matches the 2D formula.
    #[test]
    fn params_3d_byte_extent_single_slice() {
        // (1-1) * stride + (3-1) * 512 + 480 = 1504
        let volume = Memcpy3DParams::new(512, 512, 480, 3, 1, 4, 4);
        assert_eq!(volume.src_byte_extent(), 2 * 512 + 480);
    }

    /// The display form mentions all three dimensions.
    #[test]
    fn params_3d_display() {
        let rendered = Memcpy3DParams::new(512, 256, 480, 100, 10, 128, 128).to_string();
        assert!(rendered.contains("480x100x10"));
    }

    // ---- Copy function signatures ----
    //
    // These only need to type-check: each assignment pins the public
    // signature of a copy function for `f32` elements.

    #[test]
    fn copy_2d_dtod_signature_compiles() {
        type Sig = fn(&mut DeviceBuffer<f32>, &DeviceBuffer<f32>, &Memcpy2DParams) -> CudaResult<()>;
        let _: Sig = copy_2d_dtod;
    }

    #[test]
    fn copy_2d_htod_signature_compiles() {
        type Sig = fn(&mut DeviceBuffer<f32>, &[f32], &Memcpy2DParams) -> CudaResult<()>;
        let _: Sig = copy_2d_htod;
    }

    #[test]
    fn copy_2d_dtoh_signature_compiles() {
        type Sig = fn(&mut [f32], &DeviceBuffer<f32>, &Memcpy2DParams) -> CudaResult<()>;
        let _: Sig = copy_2d_dtoh;
    }

    #[test]
    fn copy_3d_dtod_signature_compiles() {
        type Sig = fn(&mut DeviceBuffer<f32>, &DeviceBuffer<f32>, &Memcpy3DParams) -> CudaResult<()>;
        let _: Sig = copy_3d_dtod;
    }

    /// Both pitches equal to the width means the rows carry no padding.
    #[test]
    fn params_2d_equal_pitch() {
        let packed = Memcpy2DParams::new(100, 100, 100, 50);
        assert!(packed.validate().is_ok());
        assert_eq!(packed.src_byte_extent(), 49 * 100 + 100);
        assert_eq!(packed.dst_byte_extent(), 49 * 100 + 100);
    }
}