j2k_cuda/
surface.rs

1// SPDX-License-Identifier: Apache-2.0
2
3#[cfg(feature = "cuda-runtime")]
4use std::sync::Arc;
5
6use j2k_core::{
7    copy_tight_pixels_to_strided_output, BackendKind, BufferError, DeviceMemoryRange,
8    DeviceSurface, ExecutionStats, PixelFormat,
9};
10#[cfg(feature = "cuda-runtime")]
11use j2k_cuda_runtime::CudaDeviceBuffer;
12
13#[cfg(feature = "cuda-runtime")]
14use crate::runtime::cuda_error;
15use crate::Error;
16
/// Backing storage for the pixel bytes of a surface.
#[derive(Debug)]
pub(crate) enum Storage {
    /// Pixel bytes held in host memory.
    Host(Vec<u8>),
    /// A CUDA device buffer owned by this storage.
    #[cfg(feature = "cuda-runtime")]
    Cuda(CudaDeviceBuffer),
    /// A byte range inside a CUDA device buffer that is shared through an `Arc`.
    #[cfg(feature = "cuda-runtime")]
    CudaRange {
        /// Shared device allocation that contains the range.
        buffer: Arc<CudaDeviceBuffer>,
        /// Byte offset of the range from the start of `buffer`.
        offset: usize,
        /// Length of the range in bytes.
        len: usize,
    },
}
29
/// Kernel dispatch counters recorded for a CUDA surface.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct CudaSurfaceStats {
    /// All kernel dispatches.
    pub(crate) total: usize,
    /// Copy/upload kernel dispatches.
    pub(crate) copy: usize,
    /// Codestream decode kernel dispatches.
    pub(crate) decode: usize,
}

impl CudaSurfaceStats {
    /// Number of CUDA kernel dispatches of any kind attributed to the surface.
    pub fn kernel_dispatches(self) -> usize {
        let Self { total, .. } = self;
        total
    }

    /// Number of CUDA copy/upload kernel dispatches attributed to the surface.
    pub fn copy_kernel_dispatches(self) -> usize {
        let Self { copy, .. } = self;
        copy
    }

    /// Number of CUDA codestream decode kernel dispatches attributed to the surface.
    pub fn decode_kernel_dispatches(self) -> usize {
        let Self { decode, .. } = self;
        decode
    }
}
54
55/// Borrowed view of a CUDA-resident surface.
56#[derive(Clone, Copy, Debug)]
57pub struct CudaSurface<'a> {
58    #[cfg(feature = "cuda-runtime")]
59    buffer: &'a CudaDeviceBuffer,
60    #[cfg(feature = "cuda-runtime")]
61    offset: usize,
62    #[cfg(not(feature = "cuda-runtime"))]
63    _marker: core::marker::PhantomData<&'a ()>,
64    pub(crate) stats: CudaSurfaceStats,
65}
66
67impl CudaSurface<'_> {
68    /// Raw CUDA device pointer value.
69    pub fn device_ptr(&self) -> u64 {
70        #[cfg(feature = "cuda-runtime")]
71        {
72            self.buffer.device_ptr().saturating_add(self.offset as u64)
73        }
74        #[cfg(not(feature = "cuda-runtime"))]
75        {
76            unreachable!("CudaSurface cannot be constructed without cuda-runtime support")
77        }
78    }
79
80    /// Execution counters for this surface.
81    pub fn stats(&self) -> CudaSurfaceStats {
82        self.stats
83    }
84}
85
/// Where the pixels of a decoded J2K CUDA adapter surface live and how they got there.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
#[non_exhaustive]
pub enum SurfaceResidency {
    /// The pixels live in host memory.
    #[default]
    Host,
    /// A CUDA codestream decode path produced the pixels directly.
    CudaResidentDecode,
    /// The pixels were decoded on the CPU and then uploaded into a CUDA buffer.
    CpuStagedCudaUpload,
}
98
99/// Host- or CUDA-backed decoded surface.
100#[derive(Debug)]
101pub struct Surface {
102    pub(crate) backend: BackendKind,
103    pub(crate) residency: SurfaceResidency,
104    pub(crate) dimensions: (u32, u32),
105    pub(crate) fmt: PixelFormat,
106    pub(crate) pitch_bytes: usize,
107    pub(crate) stats: CudaSurfaceStats,
108    pub(crate) storage: Storage,
109}
110
111impl Surface {
112    /// Return where the surface's pixels currently reside.
113    pub fn residency(&self) -> SurfaceResidency {
114        self.residency
115    }
116
117    /// Row pitch in bytes.
118    pub fn pitch_bytes(&self) -> usize {
119        self.pitch_bytes
120    }
121
122    /// Borrow host bytes when the surface is host-backed.
123    pub fn as_host_bytes(&self) -> Option<&[u8]> {
124        match &self.storage {
125            Storage::Host(bytes) => Some(bytes),
126            #[cfg(feature = "cuda-runtime")]
127            Storage::Cuda(_) | Storage::CudaRange { .. } => None,
128        }
129    }
130
131    /// Download or copy the surface into caller-owned strided output.
132    pub fn download_into(&self, out: &mut [u8], stride: usize) -> Result<(), Error> {
133        match &self.storage {
134            Storage::Host(bytes) => {
135                copy_tight_pixels_to_strided_output(bytes, self.dimensions, self.fmt, out, stride)
136                    .map_err(Error::from)
137            }
138            #[cfg(feature = "cuda-runtime")]
139            Storage::Cuda(buffer) => {
140                let byte_len = self.byte_len();
141                if let Some(len) =
142                    tight_cuda_download_len(byte_len, self.pitch_bytes, stride, out.len())
143                {
144                    return buffer.copy_to_host(&mut out[..len]).map_err(cuda_error);
145                }
146                let mut tight = vec![0u8; byte_len];
147                buffer.copy_to_host(&mut tight).map_err(cuda_error)?;
148                copy_tight_pixels_to_strided_output(&tight, self.dimensions, self.fmt, out, stride)
149                    .map_err(Error::from)
150            }
151            #[cfg(feature = "cuda-runtime")]
152            Storage::CudaRange {
153                buffer,
154                offset,
155                len,
156            } => {
157                let byte_len = self.byte_len();
158                debug_assert_eq!(*len, byte_len);
159                if let Some(len) =
160                    tight_cuda_download_len(byte_len, self.pitch_bytes, stride, out.len())
161                {
162                    return buffer
163                        .copy_range_to_host(*offset, &mut out[..len])
164                        .map_err(cuda_error);
165                }
166                let mut tight = vec![0u8; byte_len];
167                buffer
168                    .copy_range_to_host(*offset, &mut tight)
169                    .map_err(cuda_error)?;
170                copy_tight_pixels_to_strided_output(&tight, self.dimensions, self.fmt, out, stride)
171                    .map_err(Error::from)
172            }
173        }
174    }
175
176    /// Download the surface and return elapsed host copy time in microseconds.
177    pub fn download_into_profiled(&self, out: &mut [u8], stride: usize) -> Result<u128, Error> {
178        let started = std::time::Instant::now();
179        self.download_into(out, stride)?;
180        Ok(started.elapsed().as_micros())
181    }
182
183    /// Borrow CUDA metadata when the surface is CUDA-backed.
184    pub fn cuda_surface(&self) -> Option<CudaSurface<'_>> {
185        #[cfg(feature = "cuda-runtime")]
186        match &self.storage {
187            Storage::Cuda(buffer) => Some(CudaSurface {
188                buffer,
189                offset: 0,
190                stats: self.stats,
191            }),
192            Storage::CudaRange { buffer, offset, .. } => Some(CudaSurface {
193                buffer,
194                offset: *offset,
195                stats: self.stats,
196            }),
197            Storage::Host(_) => None,
198        }
199        #[cfg(not(feature = "cuda-runtime"))]
200        {
201            let _ = self.stats;
202            None
203        }
204    }
205
206    /// Download a sequence of surfaces into a tightly concatenated output buffer.
207    ///
208    /// CUDA surfaces produced from one contiguous batch allocation are copied
209    /// with one device-to-host transfer. Other layouts fall back to downloading
210    /// each surface tightly in order.
211    pub fn download_batch_tight(surfaces: &[Self]) -> Result<Vec<u8>, Error> {
212        let required = batch_tight_required_len(surfaces)?;
213        if required == 0 {
214            return Ok(Vec::new());
215        }
216
217        #[cfg(feature = "cuda-runtime")]
218        if let Some((buffer, offset)) = contiguous_cuda_batch_range(surfaces) {
219            let mut out = Vec::with_capacity(required);
220            buffer
221                .copy_range_to_host_uninit(offset, out.spare_capacity_mut())
222                .map_err(cuda_error)?;
223            // SAFETY: the CUDA copy above initialized exactly `required`
224            // bytes in this Vec's spare capacity and returned success.
225            unsafe {
226                out.set_len(required);
227            }
228            return Ok(out);
229        }
230
231        let mut out = vec![0u8; required];
232        Self::download_batch_tight_into(surfaces, &mut out)?;
233        Ok(out)
234    }
235
236    /// Download a sequence of surfaces into a tightly concatenated output buffer.
237    ///
238    /// CUDA surfaces produced from one contiguous batch allocation are copied
239    /// with one device-to-host transfer. Other layouts fall back to downloading
240    /// each surface tightly in order.
241    pub fn download_batch_tight_into(surfaces: &[Self], out: &mut [u8]) -> Result<(), Error> {
242        let required = batch_tight_required_len(surfaces)?;
243        if out.len() < required {
244            return Err(BufferError::OutputTooSmall {
245                required,
246                have: out.len(),
247            }
248            .into());
249        }
250        if required == 0 {
251            return Ok(());
252        }
253
254        #[cfg(feature = "cuda-runtime")]
255        if let Some((buffer, offset)) = contiguous_cuda_batch_range(surfaces) {
256            return buffer
257                .copy_range_to_host(offset, &mut out[..required])
258                .map_err(cuda_error);
259        }
260
261        let mut cursor = 0usize;
262        for surface in surfaces {
263            let len = surface.byte_len();
264            surface.download_into(&mut out[cursor..cursor + len], surface.pitch_bytes)?;
265            cursor += len;
266        }
267        Ok(())
268    }
269}
270
271fn batch_tight_required_len(surfaces: &[Surface]) -> Result<usize, Error> {
272    surfaces
273        .iter()
274        .try_fold(0usize, |sum, surface| sum.checked_add(surface.byte_len()))
275        .ok_or(BufferError::SizeOverflow {
276            what: "tight batch surface output",
277        })
278        .map_err(Error::from)
279}
280
/// Build a [`Storage::CudaRange`] from a shared device buffer and a byte range.
///
/// The arguments are stored as given; no validation of `offset` or `len`
/// against the buffer happens here.
#[cfg(feature = "cuda-runtime")]
pub(crate) fn cuda_range_storage(
    buffer: Arc<CudaDeviceBuffer>,
    offset: usize,
    len: usize,
) -> Storage {
    Storage::CudaRange {
        buffer,
        offset,
        len,
    }
}
293
/// Detect a batch whose surfaces sit back to back inside one shared CUDA buffer.
///
/// Returns the shared buffer and the byte offset of the first surface when every
/// surface is a `Storage::CudaRange` over the same `Arc` allocation and each
/// range starts exactly where the previous one ends. Returns `None` for an empty
/// slice, for any other storage kind, for a gap or a different buffer, or when
/// the running end offset overflows `usize`.
#[cfg(feature = "cuda-runtime")]
fn contiguous_cuda_batch_range(surfaces: &[Surface]) -> Option<(&CudaDeviceBuffer, usize)> {
    let (head, tail) = surfaces.split_first()?;
    let Storage::CudaRange {
        buffer: base,
        offset: start,
        len: head_len,
    } = &head.storage
    else {
        return None;
    };
    let start = *start;

    // End of the range seen so far; the next surface must begin right here.
    let mut next_offset = start.checked_add(*head_len)?;
    for surface in tail {
        match &surface.storage {
            Storage::CudaRange {
                buffer,
                offset,
                len,
            } if Arc::ptr_eq(base, buffer) && *offset == next_offset => {
                next_offset = next_offset.checked_add(*len)?;
            }
            _ => return None,
        }
    }
    Some((base.as_ref(), start))
}
324
/// Decide whether a CUDA surface can be copied directly into the caller's buffer.
///
/// Returns `Some(byte_len)` when the requested `stride` equals the surface
/// `pitch_bytes` and the output holds at least `byte_len` bytes; otherwise
/// returns `None`.
#[cfg(any(feature = "cuda-runtime", test))]
fn tight_cuda_download_len(
    byte_len: usize,
    pitch_bytes: usize,
    stride: usize,
    out_len: usize,
) -> Option<usize> {
    if stride != pitch_bytes {
        return None;
    }
    if out_len < byte_len {
        return None;
    }
    Some(byte_len)
}
334
335impl DeviceSurface for Surface {
336    fn backend_kind(&self) -> BackendKind {
337        self.backend
338    }
339
340    fn residency(&self) -> j2k_core::SurfaceResidency {
341        match self.residency {
342            SurfaceResidency::Host => j2k_core::SurfaceResidency::Host,
343            SurfaceResidency::CudaResidentDecode => j2k_core::SurfaceResidency::CudaResidentDecode,
344            SurfaceResidency::CpuStagedCudaUpload => {
345                j2k_core::SurfaceResidency::CpuStagedCudaUpload
346            }
347        }
348    }
349
350    fn dimensions(&self) -> (u32, u32) {
351        self.dimensions
352    }
353
354    fn pixel_format(&self) -> PixelFormat {
355        self.fmt
356    }
357
358    fn byte_len(&self) -> usize {
359        self.pitch_bytes * self.dimensions.1 as usize
360    }
361
362    fn execution_stats(&self) -> ExecutionStats {
363        ExecutionStats {
364            kernel_dispatches: self.stats.total as u64,
365            ..ExecutionStats::default()
366        }
367    }
368
369    fn memory_range(&self) -> Option<DeviceMemoryRange> {
370        match &self.storage {
371            Storage::Host(_) => None,
372            #[cfg(feature = "cuda-runtime")]
373            Storage::Cuda(buffer) => Some(DeviceMemoryRange::new(
374                BackendKind::Cuda,
375                buffer.device_ptr(),
376                0,
377                self.byte_len(),
378            )),
379            #[cfg(feature = "cuda-runtime")]
380            Storage::CudaRange {
381                buffer,
382                offset,
383                len,
384            } => Some(DeviceMemoryRange::new(
385                BackendKind::Cuda,
386                buffer.device_ptr(),
387                *offset,
388                *len,
389            )),
390        }
391    }
392}
393
#[cfg(test)]
mod tests {
    use super::{tight_cuda_download_len, CudaSurfaceStats, Storage, Surface, SurfaceResidency};
    use j2k_core::{BackendKind, PixelFormat};

    /// Build a one-row, host-backed CPU surface whose pitch equals its byte count.
    fn host_row(width: u32, fmt: PixelFormat, bytes: &[u8]) -> Surface {
        Surface {
            backend: BackendKind::Cpu,
            residency: SurfaceResidency::Host,
            dimensions: (width, 1),
            fmt,
            pitch_bytes: bytes.len(),
            stats: CudaSurfaceStats::default(),
            storage: Storage::Host(bytes.to_vec()),
        }
    }

    /// A matching stride with an exactly sized output selects the direct copy.
    #[test]
    fn tight_cuda_download_len_accepts_exact_tight_output() {
        let picked = tight_cuda_download_len(32, 8, 8, 32);

        assert_eq!(picked, Some(32));
    }

    /// Host surfaces of different formats are concatenated in order with no padding.
    #[test]
    fn download_batch_tight_returns_tightly_concatenated_host_surfaces() {
        let surfaces = [
            host_row(2, PixelFormat::Gray8, &[1, 2]),
            host_row(1, PixelFormat::Rgb8, &[3, 4, 5]),
        ];

        let tight = Surface::download_batch_tight(&surfaces).expect("batch download");

        assert_eq!(tight, vec![1, 2, 3, 4, 5]);
    }
}
j2k_cuda/surface.rs

j2k_cuda/
surface.rs