1#[cfg(feature = "cuda-runtime")]
4use std::sync::Arc;
5
6use j2k_core::{
7 copy_tight_pixels_to_strided_output, BackendKind, BufferError, DeviceMemoryRange,
8 DeviceSurface, ExecutionStats, PixelFormat,
9};
10#[cfg(feature = "cuda-runtime")]
11use j2k_cuda_runtime::CudaDeviceBuffer;
12
13#[cfg(feature = "cuda-runtime")]
14use crate::runtime::cuda_error;
15use crate::Error;
16
/// Backing storage for a surface's pixel bytes.
#[derive(Debug)]
pub(crate) enum Storage {
    /// Pixel bytes held in a host-side byte vector.
    Host(Vec<u8>),
    /// Pixel bytes held in a CUDA device buffer owned by this value.
    #[cfg(feature = "cuda-runtime")]
    Cuda(CudaDeviceBuffer),
    /// Pixel bytes held in a sub-range of a reference-counted CUDA device buffer.
    #[cfg(feature = "cuda-runtime")]
    CudaRange {
        /// Shared device buffer that contains the range.
        buffer: Arc<CudaDeviceBuffer>,
        /// Byte offset of the range from the start of `buffer`.
        offset: usize,
        /// Length of the range in bytes.
        len: usize,
    },
}
29
/// Kernel-dispatch counters attached to a surface.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct CudaSurfaceStats {
    /// Counter reported by [`CudaSurfaceStats::kernel_dispatches`].
    pub(crate) total: usize,
    /// Counter reported by [`CudaSurfaceStats::copy_kernel_dispatches`].
    pub(crate) copy: usize,
    /// Counter reported by [`CudaSurfaceStats::decode_kernel_dispatches`].
    pub(crate) decode: usize,
}

impl CudaSurfaceStats {
    /// Returns the overall kernel dispatch counter.
    pub fn kernel_dispatches(self) -> usize {
        let Self { total, .. } = self;
        total
    }

    /// Returns the copy-kernel dispatch counter.
    pub fn copy_kernel_dispatches(self) -> usize {
        let Self { copy, .. } = self;
        copy
    }

    /// Returns the decode-kernel dispatch counter.
    pub fn decode_kernel_dispatches(self) -> usize {
        let Self { decode, .. } = self;
        decode
    }
}
54
/// Borrowed, copyable view of a surface's CUDA device memory together with its stats.
///
/// Without the `cuda-runtime` feature the struct only carries a lifetime marker and the stats.
#[derive(Clone, Copy, Debug)]
pub struct CudaSurface<'a> {
    /// Device buffer that the view points into.
    #[cfg(feature = "cuda-runtime")]
    buffer: &'a CudaDeviceBuffer,
    /// Byte offset added to the buffer's device pointer by `device_ptr`.
    #[cfg(feature = "cuda-runtime")]
    offset: usize,
    /// Keeps the `'a` lifetime parameter in use when no buffer reference is stored.
    #[cfg(not(feature = "cuda-runtime"))]
    _marker: core::marker::PhantomData<&'a ()>,
    /// Dispatch counters copied from the surface this view was created from.
    pub(crate) stats: CudaSurfaceStats,
}
66
67impl CudaSurface<'_> {
68 pub fn device_ptr(&self) -> u64 {
70 #[cfg(feature = "cuda-runtime")]
71 {
72 self.buffer.device_ptr().saturating_add(self.offset as u64)
73 }
74 #[cfg(not(feature = "cuda-runtime"))]
75 {
76 unreachable!("CudaSurface cannot be constructed without cuda-runtime support")
77 }
78 }
79
80 pub fn stats(&self) -> CudaSurfaceStats {
82 self.stats
83 }
84}
85
/// Residency tag carried by a [`Surface`].
///
/// Each variant is mapped onto the same-named `j2k_core::SurfaceResidency` variant by this
/// file's `DeviceSurface` implementation.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum SurfaceResidency {
    /// Default residency.
    /// NOTE(review): presumably the pixels live in host memory — confirm.
    #[default]
    Host,
    /// NOTE(review): presumably the pixels were decoded directly on the CUDA device — confirm.
    CudaResidentDecode,
    /// NOTE(review): presumably the pixels were decoded on the CPU and then uploaded to CUDA —
    /// confirm.
    CpuStagedCudaUpload,
}
98
/// A pixel surface backed by host or CUDA storage.
#[derive(Debug)]
pub struct Surface {
    /// Backend reported through `DeviceSurface::backend_kind`.
    pub(crate) backend: BackendKind,
    /// Residency tag reported by `residency`.
    pub(crate) residency: SurfaceResidency,
    /// Surface dimensions; the second component is the row count used by `byte_len`.
    /// NOTE(review): presumably `(width, height)` in pixels — confirm.
    pub(crate) dimensions: (u32, u32),
    /// Pixel format of the stored data.
    pub(crate) fmt: PixelFormat,
    /// Bytes per row; `byte_len` is this value times the second component of `dimensions`.
    pub(crate) pitch_bytes: usize,
    /// Kernel-dispatch counters associated with this surface.
    pub(crate) stats: CudaSurfaceStats,
    /// Where the pixel bytes actually live.
    pub(crate) storage: Storage,
}
110
111impl Surface {
112 pub fn residency(&self) -> SurfaceResidency {
114 self.residency
115 }
116
117 pub fn pitch_bytes(&self) -> usize {
119 self.pitch_bytes
120 }
121
122 pub fn as_host_bytes(&self) -> Option<&[u8]> {
124 match &self.storage {
125 Storage::Host(bytes) => Some(bytes),
126 #[cfg(feature = "cuda-runtime")]
127 Storage::Cuda(_) | Storage::CudaRange { .. } => None,
128 }
129 }
130
131 pub fn download_into(&self, out: &mut [u8], stride: usize) -> Result<(), Error> {
133 match &self.storage {
134 Storage::Host(bytes) => {
135 copy_tight_pixels_to_strided_output(bytes, self.dimensions, self.fmt, out, stride)
136 .map_err(Error::from)
137 }
138 #[cfg(feature = "cuda-runtime")]
139 Storage::Cuda(buffer) => {
140 let byte_len = self.byte_len();
141 if let Some(len) =
142 tight_cuda_download_len(byte_len, self.pitch_bytes, stride, out.len())
143 {
144 return buffer.copy_to_host(&mut out[..len]).map_err(cuda_error);
145 }
146 let mut tight = vec![0u8; byte_len];
147 buffer.copy_to_host(&mut tight).map_err(cuda_error)?;
148 copy_tight_pixels_to_strided_output(&tight, self.dimensions, self.fmt, out, stride)
149 .map_err(Error::from)
150 }
151 #[cfg(feature = "cuda-runtime")]
152 Storage::CudaRange {
153 buffer,
154 offset,
155 len,
156 } => {
157 let byte_len = self.byte_len();
158 debug_assert_eq!(*len, byte_len);
159 if let Some(len) =
160 tight_cuda_download_len(byte_len, self.pitch_bytes, stride, out.len())
161 {
162 return buffer
163 .copy_range_to_host(*offset, &mut out[..len])
164 .map_err(cuda_error);
165 }
166 let mut tight = vec![0u8; byte_len];
167 buffer
168 .copy_range_to_host(*offset, &mut tight)
169 .map_err(cuda_error)?;
170 copy_tight_pixels_to_strided_output(&tight, self.dimensions, self.fmt, out, stride)
171 .map_err(Error::from)
172 }
173 }
174 }
175
176 pub fn download_into_profiled(&self, out: &mut [u8], stride: usize) -> Result<u128, Error> {
178 let started = std::time::Instant::now();
179 self.download_into(out, stride)?;
180 Ok(started.elapsed().as_micros())
181 }
182
183 pub fn cuda_surface(&self) -> Option<CudaSurface<'_>> {
185 #[cfg(feature = "cuda-runtime")]
186 match &self.storage {
187 Storage::Cuda(buffer) => Some(CudaSurface {
188 buffer,
189 offset: 0,
190 stats: self.stats,
191 }),
192 Storage::CudaRange { buffer, offset, .. } => Some(CudaSurface {
193 buffer,
194 offset: *offset,
195 stats: self.stats,
196 }),
197 Storage::Host(_) => None,
198 }
199 #[cfg(not(feature = "cuda-runtime"))]
200 {
201 let _ = self.stats;
202 None
203 }
204 }
205
206 pub fn download_batch_tight(surfaces: &[Self]) -> Result<Vec<u8>, Error> {
212 let required = batch_tight_required_len(surfaces)?;
213 if required == 0 {
214 return Ok(Vec::new());
215 }
216
217 #[cfg(feature = "cuda-runtime")]
218 if let Some((buffer, offset)) = contiguous_cuda_batch_range(surfaces) {
219 let mut out = Vec::with_capacity(required);
220 buffer
221 .copy_range_to_host_uninit(offset, out.spare_capacity_mut())
222 .map_err(cuda_error)?;
223 unsafe {
226 out.set_len(required);
227 }
228 return Ok(out);
229 }
230
231 let mut out = vec![0u8; required];
232 Self::download_batch_tight_into(surfaces, &mut out)?;
233 Ok(out)
234 }
235
236 pub fn download_batch_tight_into(surfaces: &[Self], out: &mut [u8]) -> Result<(), Error> {
242 let required = batch_tight_required_len(surfaces)?;
243 if out.len() < required {
244 return Err(BufferError::OutputTooSmall {
245 required,
246 have: out.len(),
247 }
248 .into());
249 }
250 if required == 0 {
251 return Ok(());
252 }
253
254 #[cfg(feature = "cuda-runtime")]
255 if let Some((buffer, offset)) = contiguous_cuda_batch_range(surfaces) {
256 return buffer
257 .copy_range_to_host(offset, &mut out[..required])
258 .map_err(cuda_error);
259 }
260
261 let mut cursor = 0usize;
262 for surface in surfaces {
263 let len = surface.byte_len();
264 surface.download_into(&mut out[cursor..cursor + len], surface.pitch_bytes)?;
265 cursor += len;
266 }
267 Ok(())
268 }
269}
270
271fn batch_tight_required_len(surfaces: &[Surface]) -> Result<usize, Error> {
272 surfaces
273 .iter()
274 .try_fold(0usize, |sum, surface| sum.checked_add(surface.byte_len()))
275 .ok_or(BufferError::SizeOverflow {
276 what: "tight batch surface output",
277 })
278 .map_err(Error::from)
279}
280
/// Wraps a shared device buffer and a byte range into `Storage::CudaRange`.
///
/// The arguments are stored as given; no bounds checking is performed here.
#[cfg(feature = "cuda-runtime")]
pub(crate) fn cuda_range_storage(
    buffer: Arc<CudaDeviceBuffer>,
    offset: usize,
    len: usize,
) -> Storage {
    let range = Storage::CudaRange {
        buffer,
        offset,
        len,
    };
    range
}
293
/// Detects a batch whose surfaces are back-to-back ranges of one shared CUDA buffer.
///
/// Returns the shared buffer and the byte offset of the first surface when every surface is a
/// `Storage::CudaRange` on the same `Arc` allocation and each range starts exactly where the
/// previous one ends. Returns `None` for an empty batch, for any other storage kind, for a
/// gap or mismatch between ranges, or if an offset computation would overflow.
#[cfg(feature = "cuda-runtime")]
fn contiguous_cuda_batch_range(surfaces: &[Surface]) -> Option<(&CudaDeviceBuffer, usize)> {
    let (head, tail) = surfaces.split_first()?;
    let Storage::CudaRange {
        buffer: shared,
        offset: start,
        len: head_len,
    } = &head.storage
    else {
        return None;
    };

    // Offset at which the next surface must begin for the batch to stay contiguous.
    let mut next_offset = start.checked_add(*head_len)?;
    for surface in tail {
        match &surface.storage {
            Storage::CudaRange {
                buffer,
                offset,
                len,
            } if Arc::ptr_eq(shared, buffer) && *offset == next_offset => {
                next_offset = next_offset.checked_add(*len)?;
            }
            _ => return None,
        }
    }
    Some((shared.as_ref(), *start))
}
324
/// Decides whether a CUDA surface can be copied straight into the caller's buffer.
///
/// Returns `Some(byte_len)` when the requested `stride` equals the surface's `pitch_bytes` and
/// the output holds at least `byte_len` bytes; otherwise returns `None`.
#[cfg(any(feature = "cuda-runtime", test))]
fn tight_cuda_download_len(
    byte_len: usize,
    pitch_bytes: usize,
    stride: usize,
    out_len: usize,
) -> Option<usize> {
    if stride != pitch_bytes || out_len < byte_len {
        return None;
    }
    Some(byte_len)
}
334
335impl DeviceSurface for Surface {
336 fn backend_kind(&self) -> BackendKind {
337 self.backend
338 }
339
340 fn residency(&self) -> j2k_core::SurfaceResidency {
341 match self.residency {
342 SurfaceResidency::Host => j2k_core::SurfaceResidency::Host,
343 SurfaceResidency::CudaResidentDecode => j2k_core::SurfaceResidency::CudaResidentDecode,
344 SurfaceResidency::CpuStagedCudaUpload => {
345 j2k_core::SurfaceResidency::CpuStagedCudaUpload
346 }
347 }
348 }
349
350 fn dimensions(&self) -> (u32, u32) {
351 self.dimensions
352 }
353
354 fn pixel_format(&self) -> PixelFormat {
355 self.fmt
356 }
357
358 fn byte_len(&self) -> usize {
359 self.pitch_bytes * self.dimensions.1 as usize
360 }
361
362 fn execution_stats(&self) -> ExecutionStats {
363 ExecutionStats {
364 kernel_dispatches: self.stats.total as u64,
365 ..ExecutionStats::default()
366 }
367 }
368
369 fn memory_range(&self) -> Option<DeviceMemoryRange> {
370 match &self.storage {
371 Storage::Host(_) => None,
372 #[cfg(feature = "cuda-runtime")]
373 Storage::Cuda(buffer) => Some(DeviceMemoryRange::new(
374 BackendKind::Cuda,
375 buffer.device_ptr(),
376 0,
377 self.byte_len(),
378 )),
379 #[cfg(feature = "cuda-runtime")]
380 Storage::CudaRange {
381 buffer,
382 offset,
383 len,
384 } => Some(DeviceMemoryRange::new(
385 BackendKind::Cuda,
386 buffer.device_ptr(),
387 *offset,
388 *len,
389 )),
390 }
391 }
392}
393
#[cfg(test)]
mod tests {
    use super::{tight_cuda_download_len, CudaSurfaceStats, Storage, Surface, SurfaceResidency};
    use j2k_core::{BackendKind, PixelFormat};

    /// Builds a single-row, host-backed CPU surface whose pitch equals its payload length.
    fn host_row(width: u32, fmt: PixelFormat, pixels: Vec<u8>) -> Surface {
        Surface {
            backend: BackendKind::Cpu,
            residency: SurfaceResidency::Host,
            dimensions: (width, 1),
            fmt,
            pitch_bytes: pixels.len(),
            stats: CudaSurfaceStats::default(),
            storage: Storage::Host(pixels),
        }
    }

    /// A matching stride and an exactly sized output select the direct-copy path.
    #[test]
    fn tight_cuda_download_len_accepts_exact_tight_output() {
        let direct = tight_cuda_download_len(32, 8, 8, 32);
        assert_eq!(direct, Some(32));
    }

    /// Host surfaces of different formats are concatenated without padding, in order.
    #[test]
    fn download_batch_tight_returns_tightly_concatenated_host_surfaces() {
        let surfaces = [
            host_row(2, PixelFormat::Gray8, vec![1, 2]),
            host_row(1, PixelFormat::Rgb8, vec![3, 4, 5]),
        ];

        let tight = Surface::download_batch_tight(&surfaces).expect("batch download");

        assert_eq!(tight, [1, 2, 3, 4, 5]);
    }
}