Skip to main content

baracuda_runtime/
memcpy2d.rs

//! Runtime-API 2-D memory copies, 2-D memsets, and pitched device allocations.
//!
//! Mirrors [`baracuda_driver::memcpy2d`]. `PitchedBuffer<T>` owns a
//! 2-D device allocation with driver-chosen row stride; free functions
//! handle host ↔ pitched-device copies and byte fills of pitched regions.
6
7use core::ffi::c_void;
8use core::mem::size_of;
9
10use baracuda_cuda_sys::runtime::{cudaMemcpyKind, runtime};
11use baracuda_types::DeviceRepr;
12
13use crate::error::{check, Result};
14use crate::stream::Stream;
15
16/// A pitched device allocation — `height × width_elems` grid of `T`s
17/// with driver-chosen `pitch_bytes ≥ width_elems * size_of::<T>()`.
18pub struct PitchedBuffer<T: DeviceRepr> {
19    ptr: *mut c_void,
20    pitch_bytes: usize,
21    width_elems: usize,
22    height: usize,
23    _marker: core::marker::PhantomData<T>,
24}
25
26unsafe impl<T: DeviceRepr + Send> Send for PitchedBuffer<T> {}
27
28impl<T: DeviceRepr> core::fmt::Debug for PitchedBuffer<T> {
29    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
30        f.debug_struct("PitchedBuffer")
31            .field("ptr", &self.ptr)
32            .field("width_elems", &self.width_elems)
33            .field("height", &self.height)
34            .field("pitch_bytes", &self.pitch_bytes)
35            .field("type", &core::any::type_name::<T>())
36            .finish()
37    }
38}
39
40impl<T: DeviceRepr> PitchedBuffer<T> {
41    /// Allocate a `height × width_elems` grid with driver-chosen pitch.
42    pub fn new(width_elems: usize, height: usize) -> Result<Self> {
43        let r = runtime()?;
44        let cu = r.cuda_malloc_pitch()?;
45        let width_bytes = width_elems
46            .checked_mul(size_of::<T>())
47            .expect("overflow in 2D allocation width");
48        let mut ptr: *mut c_void = core::ptr::null_mut();
49        let mut pitch: usize = 0;
50        check(unsafe { cu(&mut ptr, &mut pitch, width_bytes, height) })?;
51        Ok(Self {
52            ptr,
53            pitch_bytes: pitch,
54            width_elems,
55            height,
56            _marker: core::marker::PhantomData,
57        })
58    }
59
60    #[inline]
61    pub fn width_elems(&self) -> usize {
62        self.width_elems
63    }
64    #[inline]
65    pub fn height(&self) -> usize {
66        self.height
67    }
68    /// Row stride in bytes as chosen by the runtime.
69    #[inline]
70    pub fn pitch_bytes(&self) -> usize {
71        self.pitch_bytes
72    }
73    #[inline]
74    pub fn as_raw(&self) -> *mut c_void {
75        self.ptr
76    }
77}
78
79impl<T: DeviceRepr> Drop for PitchedBuffer<T> {
80    fn drop(&mut self) {
81        if self.ptr.is_null() {
82            return;
83        }
84        if let Ok(r) = runtime() {
85            if let Ok(cu) = r.cuda_free() {
86                let _ = unsafe { cu(self.ptr) };
87            }
88        }
89    }
90}
91
92/// Synchronous host → pitched-device 2-D copy. `src` holds `height`
93/// rows of `width_elems` `T`s starting `src_host_pitch_bytes` apart.
94pub fn copy_h_to_d_2d<T: DeviceRepr>(
95    src: &[T],
96    src_host_pitch_bytes: usize,
97    dst: &PitchedBuffer<T>,
98    width_elems: usize,
99    height: usize,
100) -> Result<()> {
101    assert!(width_elems <= dst.width_elems);
102    assert!(height <= dst.height);
103    let r = runtime()?;
104    let cu = r.cuda_memcpy_2d()?;
105    check(unsafe {
106        cu(
107            dst.ptr,
108            dst.pitch_bytes,
109            src.as_ptr() as *const c_void,
110            src_host_pitch_bytes,
111            width_elems * size_of::<T>(),
112            height,
113            cudaMemcpyKind::HostToDevice,
114        )
115    })
116}
117
118/// Synchronous pitched-device → host 2-D copy.
119pub fn copy_d_to_h_2d<T: DeviceRepr>(
120    src: &PitchedBuffer<T>,
121    dst: &mut [T],
122    dst_host_pitch_bytes: usize,
123    width_elems: usize,
124    height: usize,
125) -> Result<()> {
126    assert!(width_elems <= src.width_elems);
127    assert!(height <= src.height);
128    let r = runtime()?;
129    let cu = r.cuda_memcpy_2d()?;
130    check(unsafe {
131        cu(
132            dst.as_mut_ptr() as *mut c_void,
133            dst_host_pitch_bytes,
134            src.ptr,
135            src.pitch_bytes,
136            width_elems * size_of::<T>(),
137            height,
138            cudaMemcpyKind::DeviceToHost,
139        )
140    })
141}
142
143/// Async variant of [`copy_h_to_d_2d`].
144pub fn copy_h_to_d_2d_async<T: DeviceRepr>(
145    src: &[T],
146    src_host_pitch_bytes: usize,
147    dst: &PitchedBuffer<T>,
148    width_elems: usize,
149    height: usize,
150    stream: &Stream,
151) -> Result<()> {
152    assert!(width_elems <= dst.width_elems);
153    assert!(height <= dst.height);
154    let r = runtime()?;
155    let cu = r.cuda_memcpy_2d_async()?;
156    check(unsafe {
157        cu(
158            dst.ptr,
159            dst.pitch_bytes,
160            src.as_ptr() as *const c_void,
161            src_host_pitch_bytes,
162            width_elems * size_of::<T>(),
163            height,
164            cudaMemcpyKind::HostToDevice,
165            stream.as_raw(),
166        )
167    })
168}
169
170/// 2-D memset: fill a pitched region with byte `value`.
171pub fn memset_2d<T: DeviceRepr>(
172    dst: &PitchedBuffer<T>,
173    value: u8,
174    width_elems: usize,
175    height: usize,
176) -> Result<()> {
177    let r = runtime()?;
178    let cu = r.cuda_memset_2d()?;
179    check(unsafe {
180        cu(
181            dst.ptr,
182            dst.pitch_bytes,
183            value as core::ffi::c_int,
184            width_elems * size_of::<T>(),
185            height,
186        )
187    })
188}
189
190/// Async 2-D memset on `stream`.
191pub fn memset_2d_async<T: DeviceRepr>(
192    dst: &PitchedBuffer<T>,
193    value: u8,
194    width_elems: usize,
195    height: usize,
196    stream: &Stream,
197) -> Result<()> {
198    let r = runtime()?;
199    let cu = r.cuda_memset_2d_async()?;
200    check(unsafe {
201        cu(
202            dst.ptr,
203            dst.pitch_bytes,
204            value as core::ffi::c_int,
205            width_elems * size_of::<T>(),
206            height,
207            stream.as_raw(),
208        )
209    })
210}