Skip to main content

oxicuda_memory/
copy_2d3d.rs

//! 2D and 3D memory copy operations for pitched and volumetric data.
//!
//! GPU memory is often allocated as "pitched" 2D arrays where each row
//! has padding bytes to satisfy alignment requirements. The standard
//! 1D copy functions cannot handle this row padding — they would copy
//! the padding bytes as if they were data.
//!
//! This module provides:
//!
//! * [`Memcpy2DParams`] — parameters for 2D (row-padded) copies.
//! * [`Memcpy3DParams`] — parameters for 3D (volumetric, doubly-padded)
//!   copies.
//! * 2D copy functions for host-to-device, device-to-host, and
//!   device-to-device transfers, plus a 3D device-to-device copy.
//!
//! # Pitch vs Width
//!
//! * **pitch** — total bytes per row including alignment padding.
//! * **width** — bytes of actual data per row to copy.
//!
//! The pitch must be >= width for both source and destination.
//!
//! # Status
//!
//! The CUDA driver functions `cuMemcpy2D_v2` and `cuMemcpy3D_v2` are
//! not yet loaded in `oxicuda-driver`. The validation logic is fully
//! functional, but no data is transferred yet: after validating their
//! arguments, the copy functions only check that a GPU driver can be
//! loaded. They propagate the loader's error (such as
//! [`CudaError::NotSupported`]) when a GPU driver is not available, and
//! otherwise return `Ok(())` without copying anything.
//!
//! # Example
//!
//! ```rust,no_run
//! use oxicuda_memory::copy_2d3d::{Memcpy2DParams, copy_2d_dtod};
//! use oxicuda_memory::DeviceBuffer;
//!
//! let params = Memcpy2DParams {
//!     src_pitch: 512,
//!     dst_pitch: 512,
//!     width: 480,      // 480 bytes of data per row
//!     height: 256,     // 256 rows
//! };
//!
//! let mut dst = DeviceBuffer::<u8>::alloc(512 * 256)?;
//! let src = DeviceBuffer::<u8>::alloc(512 * 256)?;
//! copy_2d_dtod(&mut dst, &src, &params)?;
//! # Ok::<(), oxicuda_driver::error::CudaError>(())
//! ```
48
49use oxicuda_driver::error::{CudaError, CudaResult};
50
51use crate::device_buffer::DeviceBuffer;
52
53// ---------------------------------------------------------------------------
54// Memcpy2DParams
55// ---------------------------------------------------------------------------
56
57/// Parameters for a 2D (pitched) memory copy.
58///
59/// A "pitched" allocation stores 2D data where each row occupies
60/// `pitch` bytes, of which only `width` bytes contain actual data.
61/// The remaining `pitch - width` bytes per row are alignment padding.
62///
63/// Both source and destination may have different pitches (e.g., when
64/// copying between allocations created by different `cuMemAllocPitch`
65/// calls or between host and device memory).
66#[derive(Debug, Clone, Copy, PartialEq, Eq)]
67pub struct Memcpy2DParams {
68    /// Bytes per row in the source (including padding).
69    pub src_pitch: usize,
70    /// Bytes per row in the destination (including padding).
71    pub dst_pitch: usize,
72    /// Bytes of actual data to copy per row.
73    pub width: usize,
74    /// Number of rows to copy.
75    pub height: usize,
76}
77
78impl Memcpy2DParams {
79    /// Creates new 2D copy parameters.
80    ///
81    /// # Parameters
82    ///
83    /// * `src_pitch` - Source bytes per row (including padding).
84    /// * `dst_pitch` - Destination bytes per row (including padding).
85    /// * `width` - Data bytes to copy per row.
86    /// * `height` - Number of rows.
87    pub fn new(src_pitch: usize, dst_pitch: usize, width: usize, height: usize) -> Self {
88        Self {
89            src_pitch,
90            dst_pitch,
91            width,
92            height,
93        }
94    }
95
96    /// Validates the parameters.
97    ///
98    /// Checks that width <= both pitches, and that all dimensions are non-zero.
99    ///
100    /// # Errors
101    ///
102    /// Returns [`CudaError::InvalidValue`] if any constraint is violated.
103    pub fn validate(&self) -> CudaResult<()> {
104        if self.width == 0 || self.height == 0 {
105            return Err(CudaError::InvalidValue);
106        }
107        if self.width > self.src_pitch {
108            return Err(CudaError::InvalidValue);
109        }
110        if self.width > self.dst_pitch {
111            return Err(CudaError::InvalidValue);
112        }
113        Ok(())
114    }
115
116    /// Returns the total bytes that would be read from the source.
117    ///
118    /// This is `(height - 1) * src_pitch + width` to account for the
119    /// fact that the last row does not need trailing padding.
120    pub fn src_byte_extent(&self) -> usize {
121        if self.height == 0 {
122            return 0;
123        }
124        self.height
125            .saturating_sub(1)
126            .saturating_mul(self.src_pitch)
127            .saturating_add(self.width)
128    }
129
130    /// Returns the total bytes that would be written to the destination.
131    pub fn dst_byte_extent(&self) -> usize {
132        if self.height == 0 {
133            return 0;
134        }
135        self.height
136            .saturating_sub(1)
137            .saturating_mul(self.dst_pitch)
138            .saturating_add(self.width)
139    }
140}
141
142impl std::fmt::Display for Memcpy2DParams {
143    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
144        write!(
145            f,
146            "2D[{}x{}, src_pitch={}, dst_pitch={}]",
147            self.width, self.height, self.src_pitch, self.dst_pitch,
148        )
149    }
150}
151
152// ---------------------------------------------------------------------------
153// Memcpy3DParams
154// ---------------------------------------------------------------------------
155
156/// Parameters for a 3D (volumetric) memory copy.
157///
158/// 3D copies extend the 2D pitched model with a depth dimension.
159/// The source and destination are conceptually 3D arrays where each
160/// 2D "slice" has its own pitch, and slices are separated by
161/// `pitch * slice_height` bytes.
162#[derive(Debug, Clone, Copy, PartialEq, Eq)]
163pub struct Memcpy3DParams {
164    /// Bytes per row in the source (including padding).
165    pub src_pitch: usize,
166    /// Bytes per row in the destination (including padding).
167    pub dst_pitch: usize,
168    /// Bytes of actual data to copy per row.
169    pub width: usize,
170    /// Number of rows per slice to copy.
171    pub height: usize,
172    /// Number of slices (depth) to copy.
173    pub depth: usize,
174    /// Height of the source allocation (rows per slice, including any
175    /// padding rows). Used to compute the byte stride between slices.
176    pub src_height: usize,
177    /// Height of the destination allocation (rows per slice).
178    pub dst_height: usize,
179}
180
181impl Memcpy3DParams {
182    /// Creates new 3D copy parameters.
183    #[allow(clippy::too_many_arguments)]
184    pub fn new(
185        src_pitch: usize,
186        dst_pitch: usize,
187        width: usize,
188        height: usize,
189        depth: usize,
190        src_height: usize,
191        dst_height: usize,
192    ) -> Self {
193        Self {
194            src_pitch,
195            dst_pitch,
196            width,
197            height,
198            depth,
199            src_height,
200            dst_height,
201        }
202    }
203
204    /// Validates the parameters.
205    ///
206    /// Checks that width <= both pitches, height <= both allocation
207    /// heights, and all dimensions are non-zero.
208    ///
209    /// # Errors
210    ///
211    /// Returns [`CudaError::InvalidValue`] if any constraint is violated.
212    pub fn validate(&self) -> CudaResult<()> {
213        if self.width == 0 || self.height == 0 || self.depth == 0 {
214            return Err(CudaError::InvalidValue);
215        }
216        if self.width > self.src_pitch {
217            return Err(CudaError::InvalidValue);
218        }
219        if self.width > self.dst_pitch {
220            return Err(CudaError::InvalidValue);
221        }
222        if self.height > self.src_height {
223            return Err(CudaError::InvalidValue);
224        }
225        if self.height > self.dst_height {
226            return Err(CudaError::InvalidValue);
227        }
228        Ok(())
229    }
230
231    /// Returns the source byte stride between 2D slices.
232    pub fn src_slice_stride(&self) -> usize {
233        self.src_pitch.saturating_mul(self.src_height)
234    }
235
236    /// Returns the destination byte stride between 2D slices.
237    pub fn dst_slice_stride(&self) -> usize {
238        self.dst_pitch.saturating_mul(self.dst_height)
239    }
240
241    /// Returns the total source byte extent for the 3D region.
242    pub fn src_byte_extent(&self) -> usize {
243        if self.depth == 0 || self.height == 0 {
244            return 0;
245        }
246        let slice_stride = self.src_slice_stride();
247        self.depth
248            .saturating_sub(1)
249            .saturating_mul(slice_stride)
250            .saturating_add(
251                self.height
252                    .saturating_sub(1)
253                    .saturating_mul(self.src_pitch)
254                    .saturating_add(self.width),
255            )
256    }
257
258    /// Returns the total destination byte extent for the 3D region.
259    pub fn dst_byte_extent(&self) -> usize {
260        if self.depth == 0 || self.height == 0 {
261            return 0;
262        }
263        let slice_stride = self.dst_slice_stride();
264        self.depth
265            .saturating_sub(1)
266            .saturating_mul(slice_stride)
267            .saturating_add(
268                self.height
269                    .saturating_sub(1)
270                    .saturating_mul(self.dst_pitch)
271                    .saturating_add(self.width),
272            )
273    }
274}
275
276impl std::fmt::Display for Memcpy3DParams {
277    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
278        write!(
279            f,
280            "3D[{}x{}x{}, src_pitch={}, dst_pitch={}, src_h={}, dst_h={}]",
281            self.width,
282            self.height,
283            self.depth,
284            self.src_pitch,
285            self.dst_pitch,
286            self.src_height,
287            self.dst_height,
288        )
289    }
290}
291
292// ---------------------------------------------------------------------------
293// 2D copy functions
294// ---------------------------------------------------------------------------
295
296/// Validates that a device buffer is large enough for a 2D copy region.
297fn validate_2d_buffer_size<T: Copy>(buf: &DeviceBuffer<T>, byte_extent: usize) -> CudaResult<()> {
298    if buf.byte_size() < byte_extent {
299        return Err(CudaError::InvalidValue);
300    }
301    Ok(())
302}
303
304/// Validates that a host slice is large enough for a 2D copy region.
305fn validate_2d_slice_size<T: Copy>(slice: &[T], byte_extent: usize) -> CudaResult<()> {
306    let slice_bytes = slice.len().saturating_mul(std::mem::size_of::<T>());
307    if slice_bytes < byte_extent {
308        return Err(CudaError::InvalidValue);
309    }
310    Ok(())
311}
312
313/// Copies a 2D region between two device buffers (device-to-device).
314///
315/// Both source and destination must be large enough to contain the
316/// pitched region described by `params`.
317///
318/// # Errors
319///
320/// * [`CudaError::InvalidValue`] if parameters are invalid or buffers
321///   are too small.
322/// * [`CudaError::NotSupported`] because `cuMemcpy2D_v2` is not yet
323///   loaded (on platforms without the driver function).
324pub fn copy_2d_dtod<T: Copy>(
325    dst: &mut DeviceBuffer<T>,
326    src: &DeviceBuffer<T>,
327    params: &Memcpy2DParams,
328) -> CudaResult<()> {
329    params.validate()?;
330    validate_2d_buffer_size(src, params.src_byte_extent())?;
331    validate_2d_buffer_size(dst, params.dst_byte_extent())?;
332
333    // TODO: call cuMemcpy2D_v2 when available in DriverApi.
334    // For now, verify the driver is available (will fail on macOS).
335    let _api = oxicuda_driver::loader::try_driver()?;
336
337    // On a real implementation we would construct a CUDA_MEMCPY2D struct
338    // and call the driver. For now, return Ok to indicate validation passed.
339    Ok(())
340}
341
342/// Copies a 2D region from host memory to a device buffer.
343///
344/// The host slice must be large enough to contain the source-pitched
345/// region, and the device buffer must be large enough for the
346/// destination-pitched region.
347///
348/// # Errors
349///
350/// * [`CudaError::InvalidValue`] if parameters are invalid or
351///   buffers/slices are too small.
352/// * [`CudaError::NotSupported`] on platforms without `cuMemcpy2D_v2`.
353pub fn copy_2d_htod<T: Copy>(
354    dst: &mut DeviceBuffer<T>,
355    src: &[T],
356    params: &Memcpy2DParams,
357) -> CudaResult<()> {
358    params.validate()?;
359    validate_2d_slice_size(src, params.src_byte_extent())?;
360    validate_2d_buffer_size(dst, params.dst_byte_extent())?;
361
362    let _api = oxicuda_driver::loader::try_driver()?;
363    Ok(())
364}
365
366/// Copies a 2D region from a device buffer to host memory.
367///
368/// The device buffer must be large enough to contain the source-pitched
369/// region, and the host slice must be large enough for the
370/// destination-pitched region.
371///
372/// # Errors
373///
374/// * [`CudaError::InvalidValue`] if parameters are invalid or
375///   buffers/slices are too small.
376/// * [`CudaError::NotSupported`] on platforms without `cuMemcpy2D_v2`.
377pub fn copy_2d_dtoh<T: Copy>(
378    dst: &mut [T],
379    src: &DeviceBuffer<T>,
380    params: &Memcpy2DParams,
381) -> CudaResult<()> {
382    params.validate()?;
383    validate_2d_buffer_size(src, params.src_byte_extent())?;
384    validate_2d_slice_size(dst, params.dst_byte_extent())?;
385
386    let _api = oxicuda_driver::loader::try_driver()?;
387    Ok(())
388}
389
390// ---------------------------------------------------------------------------
391// 3D copy functions
392// ---------------------------------------------------------------------------
393
394/// Validates that a device buffer is large enough for a 3D copy region.
395fn validate_3d_buffer_size<T: Copy>(buf: &DeviceBuffer<T>, byte_extent: usize) -> CudaResult<()> {
396    if buf.byte_size() < byte_extent {
397        return Err(CudaError::InvalidValue);
398    }
399    Ok(())
400}
401
402/// Copies a 3D region between two device buffers (device-to-device).
403///
404/// # Errors
405///
406/// * [`CudaError::InvalidValue`] if parameters are invalid or buffers
407///   are too small.
408/// * [`CudaError::NotSupported`] because `cuMemcpy3D_v2` is not yet loaded.
409pub fn copy_3d_dtod<T: Copy>(
410    dst: &mut DeviceBuffer<T>,
411    src: &DeviceBuffer<T>,
412    params: &Memcpy3DParams,
413) -> CudaResult<()> {
414    params.validate()?;
415    validate_3d_buffer_size(src, params.src_byte_extent())?;
416    validate_3d_buffer_size(dst, params.dst_byte_extent())?;
417
418    let _api = oxicuda_driver::loader::try_driver()?;
419    Ok(())
420}
421
422// ---------------------------------------------------------------------------
423// Tests
424// ---------------------------------------------------------------------------
425
/// Unit tests for parameter validation, extent arithmetic, formatting and
/// the public copy-function signatures. None of them needs a GPU.
#[cfg(test)]
mod tests {
    use super::*;

    // -- Memcpy2DParams tests --

    #[test]
    fn params_2d_new() {
        let p = Memcpy2DParams::new(512, 512, 480, 256);
        assert_eq!(p.src_pitch, 512);
        assert_eq!(p.dst_pitch, 512);
        assert_eq!(p.width, 480);
        assert_eq!(p.height, 256);
    }

    #[test]
    fn params_2d_validate_ok() {
        let p = Memcpy2DParams::new(512, 512, 480, 256);
        assert!(p.validate().is_ok());
    }

    #[test]
    fn params_2d_validate_zero_width() {
        let p = Memcpy2DParams::new(512, 512, 0, 256);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_2d_validate_zero_height() {
        let p = Memcpy2DParams::new(512, 512, 480, 0);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_2d_validate_width_exceeds_src_pitch() {
        let p = Memcpy2DParams::new(256, 512, 480, 100);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_2d_validate_width_exceeds_dst_pitch() {
        let p = Memcpy2DParams::new(512, 256, 480, 100);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_2d_byte_extent() {
        // 3 rows, src_pitch=512, dst_pitch=256, width=480
        // src extent = 2 * 512 + 480 = 1504
        // dst extent = 2 * 256 + 480 = 992
        let p = Memcpy2DParams::new(512, 256, 480, 3);
        assert_eq!(p.src_byte_extent(), 2 * 512 + 480);
        assert_eq!(p.dst_byte_extent(), 2 * 256 + 480);
    }

    #[test]
    fn params_2d_byte_extent_single_row() {
        let p = Memcpy2DParams::new(512, 512, 480, 1);
        assert_eq!(p.src_byte_extent(), 480);
        assert_eq!(p.dst_byte_extent(), 480);
    }

    #[test]
    fn params_2d_byte_extent_zero_height() {
        let p = Memcpy2DParams::new(512, 512, 480, 0);
        assert_eq!(p.src_byte_extent(), 0);
        assert_eq!(p.dst_byte_extent(), 0);
    }

    #[test]
    fn params_2d_equal_pitch() {
        // When src and dst pitches are equal to width, no padding.
        let p = Memcpy2DParams::new(100, 100, 100, 50);
        assert!(p.validate().is_ok());
        assert_eq!(p.src_byte_extent(), 49 * 100 + 100);
        assert_eq!(p.dst_byte_extent(), 49 * 100 + 100);
    }

    #[test]
    fn params_2d_display() {
        let p = Memcpy2DParams::new(512, 256, 480, 100);
        let disp = format!("{p}");
        assert!(disp.contains("480x100"));
        assert!(disp.contains("src_pitch=512"));
        assert!(disp.contains("dst_pitch=256"));
    }

    #[test]
    fn params_2d_eq() {
        let a = Memcpy2DParams::new(512, 512, 480, 256);
        let b = Memcpy2DParams::new(512, 512, 480, 256);
        assert_eq!(a, b);
    }

    // -- Memcpy3DParams tests --

    #[test]
    fn params_3d_new() {
        let p = Memcpy3DParams::new(512, 512, 480, 256, 10, 256, 256);
        assert_eq!(p.depth, 10);
        assert_eq!(p.src_height, 256);
        assert_eq!(p.dst_height, 256);
    }

    #[test]
    fn params_3d_validate_ok() {
        let p = Memcpy3DParams::new(512, 512, 480, 256, 10, 256, 256);
        assert!(p.validate().is_ok());
    }

    #[test]
    fn params_3d_validate_zero_width() {
        let p = Memcpy3DParams::new(512, 512, 0, 256, 10, 256, 256);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_3d_validate_zero_height() {
        let p = Memcpy3DParams::new(512, 512, 480, 0, 10, 256, 256);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_3d_validate_zero_depth() {
        let p = Memcpy3DParams::new(512, 512, 480, 256, 0, 256, 256);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_3d_validate_width_exceeds_src_pitch() {
        let p = Memcpy3DParams::new(256, 512, 480, 100, 10, 128, 128);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_3d_validate_width_exceeds_dst_pitch() {
        let p = Memcpy3DParams::new(512, 256, 480, 100, 10, 128, 128);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_3d_validate_height_exceeds_src_height() {
        let p = Memcpy3DParams::new(512, 512, 480, 300, 10, 256, 300);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_3d_validate_height_exceeds_dst_height() {
        let p = Memcpy3DParams::new(512, 512, 480, 300, 10, 300, 256);
        assert_eq!(p.validate(), Err(CudaError::InvalidValue));
    }

    #[test]
    fn params_3d_slice_stride() {
        let p = Memcpy3DParams::new(512, 256, 480, 100, 10, 128, 128);
        assert_eq!(p.src_slice_stride(), 512 * 128);
        assert_eq!(p.dst_slice_stride(), 256 * 128);
    }

    #[test]
    fn params_3d_byte_extent() {
        // 2 slices, each 3 rows, src_pitch=512, width=480, src_height=4
        let p = Memcpy3DParams::new(512, 512, 480, 3, 2, 4, 4);
        // extent = (2-1) * (512*4) + (3-1)*512 + 480
        // = 2048 + 1024 + 480 = 3552
        assert_eq!(p.src_byte_extent(), (512 * 4) + 2 * 512 + 480);
    }

    #[test]
    fn params_3d_byte_extent_dst_uses_dst_layout() {
        // dst_pitch=256, dst_height=8, width=100, 3 rows, 2 slices
        let p = Memcpy3DParams::new(512, 256, 100, 3, 2, 4, 8);
        // extent = (2-1) * (256*8) + (3-1)*256 + 100
        // = 2048 + 512 + 100 = 2660
        assert_eq!(p.dst_byte_extent(), (256 * 8) + 2 * 256 + 100);
    }

    #[test]
    fn params_3d_byte_extent_single_slice() {
        let p = Memcpy3DParams::new(512, 512, 480, 3, 1, 4, 4);
        // Single slice: (1-1)*stride + (3-1)*512 + 480 = 1504
        assert_eq!(p.src_byte_extent(), 2 * 512 + 480);
    }

    #[test]
    fn params_3d_byte_extent_zero_depth() {
        let p = Memcpy3DParams::new(512, 512, 480, 3, 0, 4, 4);
        assert_eq!(p.src_byte_extent(), 0);
        assert_eq!(p.dst_byte_extent(), 0);
    }

    #[test]
    fn params_3d_display() {
        let p = Memcpy3DParams::new(512, 256, 480, 100, 10, 128, 128);
        let disp = format!("{p}");
        assert!(disp.contains("480x100x10"));
    }

    // -- Host slice validation tests --

    #[test]
    fn slice_size_counts_bytes_not_elements() {
        // 4 elements of 4 bytes each = 16 bytes.
        let data = [0u32; 4];
        assert!(validate_2d_slice_size(&data, 16).is_ok());
        assert_eq!(
            validate_2d_slice_size(&data, 17),
            Err(CudaError::InvalidValue)
        );
    }

    #[test]
    fn slice_size_empty_slice() {
        let empty: [u8; 0] = [];
        assert!(validate_2d_slice_size(&empty, 0).is_ok());
        assert_eq!(
            validate_2d_slice_size(&empty, 1),
            Err(CudaError::InvalidValue)
        );
    }

    // -- Copy function signature tests --

    #[test]
    fn copy_2d_dtod_signature_compiles() {
        let _: fn(&mut DeviceBuffer<f32>, &DeviceBuffer<f32>, &Memcpy2DParams) -> CudaResult<()> =
            copy_2d_dtod;
    }

    #[test]
    fn copy_2d_htod_signature_compiles() {
        let _: fn(&mut DeviceBuffer<f32>, &[f32], &Memcpy2DParams) -> CudaResult<()> = copy_2d_htod;
    }

    #[test]
    fn copy_2d_dtoh_signature_compiles() {
        let _: fn(&mut [f32], &DeviceBuffer<f32>, &Memcpy2DParams) -> CudaResult<()> = copy_2d_dtoh;
    }

    #[test]
    fn copy_3d_dtod_signature_compiles() {
        let _: fn(&mut DeviceBuffer<f32>, &DeviceBuffer<f32>, &Memcpy3DParams) -> CudaResult<()> =
            copy_3d_dtod;
    }
}
601        let p = Memcpy2DParams::new(100, 100, 100, 50);
602        assert!(p.validate().is_ok());
603        assert_eq!(p.src_byte_extent(), 49 * 100 + 100);
604        assert_eq!(p.dst_byte_extent(), 49 * 100 + 100);
605    }
606}