#[cfg(feature = "cuda-runtime")]
use std::sync::Arc;
use j2k_core::{
copy_tight_pixels_to_strided_output, BackendKind, BufferError, DeviceMemoryRange,
DeviceSurface, ExecutionStats, PixelFormat,
};
#[cfg(feature = "cuda-runtime")]
use j2k_cuda_runtime::CudaDeviceBuffer;
#[cfg(feature = "cuda-runtime")]
use crate::runtime::cuda_error;
use crate::Error;
/// Backing memory for a `Surface`'s pixel data.
#[derive(Debug)]
pub(crate) enum Storage {
/// Pixel bytes held in a host-side vector.
Host(Vec<u8>),
/// A CUDA device buffer owned by this storage value.
#[cfg(feature = "cuda-runtime")]
Cuda(CudaDeviceBuffer),
/// A sub-range of a reference-counted CUDA device buffer.
#[cfg(feature = "cuda-runtime")]
CudaRange {
// Device allocation shared through `Arc`.
buffer: Arc<CudaDeviceBuffer>,
// Byte offset of this range from the start of `buffer`.
offset: usize,
// Length of the range in bytes; expected to equal the surface's `byte_len()`.
len: usize,
},
}
/// Kernel-dispatch counters attached to a surface.
///
/// All counters default to zero.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct CudaSurfaceStats {
    /// Value reported by [`CudaSurfaceStats::kernel_dispatches`].
    pub(crate) total: usize,
    /// Value reported by [`CudaSurfaceStats::copy_kernel_dispatches`].
    pub(crate) copy: usize,
    /// Value reported by [`CudaSurfaceStats::decode_kernel_dispatches`].
    pub(crate) decode: usize,
}

impl CudaSurfaceStats {
    /// Returns the overall kernel-dispatch counter.
    pub fn kernel_dispatches(self) -> usize {
        let Self { total, .. } = self;
        total
    }

    /// Returns the copy-kernel dispatch counter.
    pub fn copy_kernel_dispatches(self) -> usize {
        let Self { copy, .. } = self;
        copy
    }

    /// Returns the decode-kernel dispatch counter.
    pub fn decode_kernel_dispatches(self) -> usize {
        let Self { decode, .. } = self;
        decode
    }
}
/// Borrowed view of a surface's CUDA-resident memory.
///
/// With the `cuda-runtime` feature the view holds a reference to the device
/// buffer plus a byte offset into it; without the feature only a lifetime
/// marker and the stats are present.
#[derive(Clone, Copy, Debug)]
pub struct CudaSurface<'a> {
    #[cfg(feature = "cuda-runtime")]
    buffer: &'a CudaDeviceBuffer,
    #[cfg(feature = "cuda-runtime")]
    offset: usize,
    #[cfg(not(feature = "cuda-runtime"))]
    _marker: core::marker::PhantomData<&'a ()>,
    pub(crate) stats: CudaSurfaceStats,
}

impl CudaSurface<'_> {
    /// Returns the device address of the first byte of this view.
    ///
    /// The address is the buffer's base pointer advanced by the stored
    /// offset, saturating at `u64::MAX` instead of wrapping.
    ///
    /// # Panics
    ///
    /// Panics when the crate is built without the `cuda-runtime` feature.
    pub fn device_ptr(&self) -> u64 {
        #[cfg(feature = "cuda-runtime")]
        {
            let base = self.buffer.device_ptr();
            let delta = self.offset as u64;
            base.saturating_add(delta)
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            unreachable!("CudaSurface cannot be constructed without cuda-runtime support")
        }
    }

    /// Returns a copy of the dispatch counters carried by this view.
    pub fn stats(&self) -> CudaSurfaceStats {
        let Self { stats, .. } = self;
        *stats
    }
}
/// Describes where a surface's pixel data resides and how it got there.
///
/// Marked `#[non_exhaustive]`, so code outside this crate must include a
/// wildcard arm when matching.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum SurfaceResidency {
/// Host residency; this is the `Default` value.
#[default]
Host,
// NOTE(review): name suggests the decode itself ran on the CUDA device — confirm against the decoder.
CudaResidentDecode,
// NOTE(review): name suggests a CPU decode followed by an upload to the device — confirm against the decoder.
CpuStagedCudaUpload,
}
/// A block of pixel data together with its layout and backing storage.
#[derive(Debug)]
pub struct Surface {
// Backend reported through `DeviceSurface::backend_kind`.
pub(crate) backend: BackendKind,
// Residency reported through `Surface::residency`.
pub(crate) residency: SurfaceResidency,
// `(width, height)`; the second component is the row count used by `byte_len`.
pub(crate) dimensions: (u32, u32),
// Pixel format of the stored data.
pub(crate) fmt: PixelFormat,
// Bytes per stored row; `byte_len` is `pitch_bytes * height`.
pub(crate) pitch_bytes: usize,
// Kernel-dispatch counters associated with this surface.
pub(crate) stats: CudaSurfaceStats,
// Host or CUDA memory that holds the pixel bytes.
pub(crate) storage: Storage,
}
impl Surface {
    /// Returns the residency recorded for this surface.
    pub fn residency(&self) -> SurfaceResidency {
        self.residency
    }

    /// Returns the number of bytes per stored row.
    pub fn pitch_bytes(&self) -> usize {
        self.pitch_bytes
    }

    /// Borrows the pixel bytes when the surface is host-backed.
    ///
    /// Returns `None` for CUDA-backed storage.
    pub fn as_host_bytes(&self) -> Option<&[u8]> {
        match &self.storage {
            Storage::Host(bytes) => Some(bytes),
            #[cfg(feature = "cuda-runtime")]
            Storage::Cuda(_) | Storage::CudaRange { .. } => None,
        }
    }

    /// Copies the surface's pixels into `out`, using `stride` bytes per output row.
    ///
    /// Host storage is copied with `copy_tight_pixels_to_strided_output`.
    /// CUDA storage is copied straight into `out` when `stride` equals the
    /// surface pitch and `out` is large enough; otherwise the device data is
    /// first staged in a temporary host buffer and then copied row by row.
    ///
    /// # Errors
    ///
    /// Returns an error when the strided host copy rejects the output buffer
    /// or when a device-to-host copy fails.
    pub fn download_into(&self, out: &mut [u8], stride: usize) -> Result<(), Error> {
        match &self.storage {
            Storage::Host(bytes) => {
                copy_tight_pixels_to_strided_output(bytes, self.dimensions, self.fmt, out, stride)
                    .map_err(Error::from)
            }
            #[cfg(feature = "cuda-runtime")]
            Storage::Cuda(buffer) => {
                let byte_len = self.byte_len();
                // Fast path: layouts match, so one device-to-host copy suffices.
                if let Some(len) =
                    tight_cuda_download_len(byte_len, self.pitch_bytes, stride, out.len())
                {
                    return buffer.copy_to_host(&mut out[..len]).map_err(cuda_error);
                }
                // Slow path: stage on the host, then re-stride into `out`.
                let mut tight = vec![0u8; byte_len];
                buffer.copy_to_host(&mut tight).map_err(cuda_error)?;
                copy_tight_pixels_to_strided_output(&tight, self.dimensions, self.fmt, out, stride)
                    .map_err(Error::from)
            }
            #[cfg(feature = "cuda-runtime")]
            Storage::CudaRange {
                buffer,
                offset,
                len,
            } => {
                let byte_len = self.byte_len();
                debug_assert_eq!(*len, byte_len);
                // Fast path: layouts match, so one ranged copy suffices.
                if let Some(len) =
                    tight_cuda_download_len(byte_len, self.pitch_bytes, stride, out.len())
                {
                    return buffer
                        .copy_range_to_host(*offset, &mut out[..len])
                        .map_err(cuda_error);
                }
                // Slow path: stage on the host, then re-stride into `out`.
                let mut tight = vec![0u8; byte_len];
                buffer
                    .copy_range_to_host(*offset, &mut tight)
                    .map_err(cuda_error)?;
                copy_tight_pixels_to_strided_output(&tight, self.dimensions, self.fmt, out, stride)
                    .map_err(Error::from)
            }
        }
    }

    /// Runs [`Surface::download_into`] and returns the elapsed wall-clock time in microseconds.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Surface::download_into`].
    pub fn download_into_profiled(&self, out: &mut [u8], stride: usize) -> Result<u128, Error> {
        let started = std::time::Instant::now();
        self.download_into(out, stride)?;
        Ok(started.elapsed().as_micros())
    }

    /// Returns a borrowed CUDA view of this surface, or `None` when it is host-backed.
    ///
    /// Always returns `None` when built without the `cuda-runtime` feature.
    pub fn cuda_surface(&self) -> Option<CudaSurface<'_>> {
        #[cfg(feature = "cuda-runtime")]
        match &self.storage {
            Storage::Cuda(buffer) => Some(CudaSurface {
                buffer,
                offset: 0,
                stats: self.stats,
            }),
            Storage::CudaRange { buffer, offset, .. } => Some(CudaSurface {
                buffer,
                offset: *offset,
                stats: self.stats,
            }),
            Storage::Host(_) => None,
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = self.stats;
            None
        }
    }

    /// Downloads every surface and returns their bytes concatenated back to back.
    ///
    /// When all surfaces are consecutive ranges of one shared CUDA buffer the
    /// whole batch is fetched with a single device-to-host copy.
    ///
    /// # Errors
    ///
    /// Returns a size-overflow error when the combined length does not fit in
    /// `usize`, or any error produced by the underlying copies.
    pub fn download_batch_tight(surfaces: &[Self]) -> Result<Vec<u8>, Error> {
        let required = batch_tight_required_len(surfaces)?;
        if required == 0 {
            return Ok(Vec::new());
        }
        #[cfg(feature = "cuda-runtime")]
        if let Some((buffer, offset)) = contiguous_cuda_batch_range(surfaces) {
            let mut out: Vec<u8> = Vec::with_capacity(required);
            // `with_capacity` may allocate more than requested, so the spare
            // region is trimmed to exactly `required` bytes; otherwise the
            // device copy could be asked for more bytes than the batch covers.
            buffer
                .copy_range_to_host_uninit(offset, &mut out.spare_capacity_mut()[..required])
                .map_err(cuda_error)?;
            // SAFETY: the capacity is at least `required`, and the copy above
            // returned `Ok`, which is taken to mean every byte of the
            // `required`-byte destination slice has been written.
            unsafe {
                out.set_len(required);
            }
            return Ok(out);
        }
        let mut out = vec![0u8; required];
        Self::download_batch_tight_into(surfaces, &mut out)?;
        Ok(out)
    }

    /// Downloads every surface into `out`, concatenated back to back.
    ///
    /// Only the leading bytes needed by the batch are written; any remainder
    /// of `out` is left untouched.
    ///
    /// # Errors
    ///
    /// Returns `BufferError::OutputTooSmall` when `out` is shorter than the
    /// combined surface length, a size-overflow error when that length does
    /// not fit in `usize`, or any error produced by the underlying copies.
    pub fn download_batch_tight_into(surfaces: &[Self], out: &mut [u8]) -> Result<(), Error> {
        let required = batch_tight_required_len(surfaces)?;
        if out.len() < required {
            return Err(BufferError::OutputTooSmall {
                required,
                have: out.len(),
            }
            .into());
        }
        if required == 0 {
            return Ok(());
        }
        #[cfg(feature = "cuda-runtime")]
        if let Some((buffer, offset)) = contiguous_cuda_batch_range(surfaces) {
            return buffer
                .copy_range_to_host(offset, &mut out[..required])
                .map_err(cuda_error);
        }
        // General case: download each surface into its own slot, using the
        // surface's own pitch as the output stride.
        let mut cursor = 0usize;
        for surface in surfaces {
            let len = surface.byte_len();
            surface.download_into(&mut out[cursor..cursor + len], surface.pitch_bytes)?;
            cursor += len;
        }
        Ok(())
    }
}
/// Sums the byte lengths of `surfaces`, failing if the total overflows `usize`.
///
/// # Errors
///
/// Returns `BufferError::SizeOverflow` (converted into `Error`) when the
/// running total cannot be represented.
fn batch_tight_required_len(surfaces: &[Surface]) -> Result<usize, Error> {
    let mut total = 0usize;
    for surface in surfaces {
        total = total
            .checked_add(surface.byte_len())
            .ok_or(BufferError::SizeOverflow {
                what: "tight batch surface output",
            })?;
    }
    Ok(total)
}
/// Builds a `Storage::CudaRange` covering `len` bytes of `buffer` starting at `offset`.
///
/// The arguments are stored as given; no bounds check is performed here.
#[cfg(feature = "cuda-runtime")]
pub(crate) fn cuda_range_storage(
buffer: Arc<CudaDeviceBuffer>,
offset: usize,
len: usize,
) -> Storage {
Storage::CudaRange {
buffer,
offset,
len,
}
}
/// Detects a batch whose surfaces are consecutive ranges of one shared CUDA buffer.
///
/// Returns the shared buffer and the offset of the first range when every
/// surface is a `Storage::CudaRange` over the same `Arc` allocation and each
/// range begins exactly where the previous one ends. Returns `None` for an
/// empty batch, for any other storage kind, for a gap or overlap, or when an
/// end offset overflows `usize`.
#[cfg(feature = "cuda-runtime")]
fn contiguous_cuda_batch_range(surfaces: &[Surface]) -> Option<(&CudaDeviceBuffer, usize)> {
    let (head, tail) = surfaces.split_first()?;
    let (shared, start, head_len) = match &head.storage {
        Storage::CudaRange {
            buffer,
            offset,
            len,
        } => (buffer, *offset, *len),
        _ => return None,
    };
    // Offset at which the next surface has to begin for the batch to be gap-free.
    let mut next = start.checked_add(head_len)?;
    for surface in tail {
        match &surface.storage {
            Storage::CudaRange {
                buffer,
                offset,
                len,
            } if Arc::ptr_eq(shared, buffer) && *offset == next => {
                next = next.checked_add(*len)?;
            }
            _ => return None,
        }
    }
    Some((shared.as_ref(), start))
}
/// Decides whether a device surface can be copied straight into the caller's buffer.
///
/// Returns `Some(byte_len)` when the requested `stride` equals the surface
/// pitch and `out_len` can hold `byte_len` bytes; otherwise `None`, which
/// tells the caller to go through a staging buffer.
#[cfg(any(feature = "cuda-runtime", test))]
fn tight_cuda_download_len(
    byte_len: usize,
    pitch_bytes: usize,
    stride: usize,
    out_len: usize,
) -> Option<usize> {
    let same_layout = stride == pitch_bytes;
    let fits = out_len >= byte_len;
    if same_layout && fits {
        Some(byte_len)
    } else {
        None
    }
}
impl DeviceSurface for Surface {
fn backend_kind(&self) -> BackendKind {
self.backend
}
fn residency(&self) -> j2k_core::SurfaceResidency {
match self.residency {
SurfaceResidency::Host => j2k_core::SurfaceResidency::Host,
SurfaceResidency::CudaResidentDecode => j2k_core::SurfaceResidency::CudaResidentDecode,
SurfaceResidency::CpuStagedCudaUpload => {
j2k_core::SurfaceResidency::CpuStagedCudaUpload
}
}
}
fn dimensions(&self) -> (u32, u32) {
self.dimensions
}
fn pixel_format(&self) -> PixelFormat {
self.fmt
}
fn byte_len(&self) -> usize {
self.pitch_bytes * self.dimensions.1 as usize
}
fn execution_stats(&self) -> ExecutionStats {
ExecutionStats {
kernel_dispatches: self.stats.total as u64,
..ExecutionStats::default()
}
}
fn memory_range(&self) -> Option<DeviceMemoryRange> {
match &self.storage {
Storage::Host(_) => None,
#[cfg(feature = "cuda-runtime")]
Storage::Cuda(buffer) => Some(DeviceMemoryRange::new(
BackendKind::Cuda,
buffer.device_ptr(),
0,
self.byte_len(),
)),
#[cfg(feature = "cuda-runtime")]
Storage::CudaRange {
buffer,
offset,
len,
} => Some(DeviceMemoryRange::new(
BackendKind::Cuda,
buffer.device_ptr(),
*offset,
*len,
)),
}
}
}
#[cfg(test)]
mod tests {
    use super::{tight_cuda_download_len, CudaSurfaceStats, Storage, Surface, SurfaceResidency};
    use j2k_core::{BackendKind, PixelFormat};

    /// Builds a one-row, CPU-backed host surface whose pitch equals `pixels.len()`.
    fn host_row(width: u32, fmt: PixelFormat, pixels: &[u8]) -> Surface {
        Surface {
            backend: BackendKind::Cpu,
            residency: SurfaceResidency::Host,
            dimensions: (width, 1),
            fmt,
            pitch_bytes: pixels.len(),
            stats: CudaSurfaceStats::default(),
            storage: Storage::Host(pixels.to_vec()),
        }
    }

    #[test]
    fn tight_cuda_download_len_accepts_exact_tight_output() {
        let direct = tight_cuda_download_len(32, 8, 8, 32);
        assert_eq!(direct, Some(32));
    }

    #[test]
    fn download_batch_tight_returns_tightly_concatenated_host_surfaces() {
        // Two surfaces with different formats and sizes, packed back to back.
        let surfaces = [
            host_row(2, PixelFormat::Gray8, &[1, 2]),
            host_row(1, PixelFormat::Rgb8, &[3, 4, 5]),
        ];
        let tight = Surface::download_batch_tight(&surfaces).expect("batch download");
        assert_eq!(tight, vec![1, 2, 3, 4, 5]);
    }
}