use crate::{
bytes::{f32_slice_as_bytes, f32_slice_as_bytes_mut, i32_slice_as_bytes_mut},
context::CudaContext,
driver::CuDevicePtr,
error::CudaError,
execution::{cuda_kernel_param, CudaExecutionStats},
j2k_decode::{active_dwt53_buffers, CudaJ2kStridedInterleavedPixels},
kernels::{j2k_dwt53_launch_geometry, j2k_forward_rct_launch_geometry, CudaKernel},
memory::{checked_image_words, CudaDeviceBuffer},
};
impl CudaContext {
pub fn j2k_deinterleave_to_f32(
&self,
pixels: &[u8],
num_pixels: usize,
num_components: u8,
bit_depth: u8,
signed: bool,
) -> Result<CudaJ2kDeinterleavedComponents, CudaError> {
let resident = self.j2k_deinterleave_to_f32_resident(
pixels,
num_pixels,
num_components,
bit_depth,
signed,
)?;
let execution = resident.execution();
let components = resident.download_components()?;
Ok(CudaJ2kDeinterleavedComponents {
components,
execution,
})
}
pub fn j2k_deinterleave_to_f32_resident(
&self,
pixels: &[u8],
num_pixels: usize,
num_components: u8,
bit_depth: u8,
signed: bool,
) -> Result<CudaJ2kResidentComponents, CudaError> {
if num_components == 0 || num_components > 4 {
return Err(CudaError::InvalidArgument {
message: "component count must be between 1 and 4".to_string(),
});
}
if bit_depth == 0 || bit_depth > 16 {
return Err(CudaError::InvalidArgument {
message: "bit depth must be between 1 and 16".to_string(),
});
}
let bytes_per_sample = if bit_depth <= 8 { 1usize } else { 2usize };
let expected_len = num_pixels
.checked_mul(usize::from(num_components))
.and_then(|len| len.checked_mul(bytes_per_sample))
.ok_or(CudaError::LengthTooLarge { len: num_pixels })?;
if pixels.len() < expected_len {
return Err(CudaError::InvalidArgument {
message: "pixel buffer is shorter than the requested image".to_string(),
});
}
self.inner.set_current()?;
let sample_count = num_pixels
.checked_mul(usize::from(num_components))
.ok_or(CudaError::LengthTooLarge { len: num_pixels })?;
let output_bytes = sample_count
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: sample_count })?;
let output = self.allocate(output_bytes)?;
if num_pixels == 0 {
return Ok(CudaJ2kResidentComponents {
buffer: output,
num_pixels,
num_components,
execution: CudaExecutionStats::default(),
});
}
let pixels = self.upload(&pixels[..expected_len])?;
self.launch_j2k_deinterleave_to_f32(
&pixels,
&output,
num_pixels,
num_components,
bit_depth,
signed,
)?;
Ok(CudaJ2kResidentComponents {
buffer: output,
num_pixels,
num_components,
execution: CudaExecutionStats {
kernel_dispatches: 1,
copy_kernel_dispatches: 0,
decode_kernel_dispatches: 0,
hardware_decode: false,
},
})
}
pub fn j2k_deinterleave_strided_to_f32_resident(
&self,
image: CudaJ2kStridedInterleavedPixels<'_>,
) -> Result<CudaJ2kResidentComponents, CudaError> {
let CudaJ2kStridedInterleavedPixels {
buffer: pixels,
byte_offset,
width,
height,
pitch_bytes,
num_components,
bit_depth,
signed,
} = image;
if width == 0 || height == 0 {
return Err(CudaError::InvalidArgument {
message: "image dimensions must be nonzero".to_string(),
});
}
if num_components == 0 || num_components > 4 {
return Err(CudaError::InvalidArgument {
message: "component count must be between 1 and 4".to_string(),
});
}
if bit_depth == 0 || bit_depth > 16 {
return Err(CudaError::InvalidArgument {
message: "bit depth must be between 1 and 16".to_string(),
});
}
let bytes_per_sample = if bit_depth <= 8 { 1usize } else { 2usize };
let bytes_per_pixel = usize::from(num_components)
.checked_mul(bytes_per_sample)
.ok_or(CudaError::LengthTooLarge {
len: usize::from(num_components),
})?;
let row_bytes =
(width as usize)
.checked_mul(bytes_per_pixel)
.ok_or(CudaError::ImageTooLarge {
width,
height,
channels: usize::from(num_components),
})?;
if pitch_bytes < row_bytes {
return Err(CudaError::InvalidArgument {
message: "pitch is shorter than one row".to_string(),
});
}
let required_end = byte_offset
.checked_add(
pitch_bytes
.checked_mul(height.saturating_sub(1) as usize)
.and_then(|prefix| prefix.checked_add(row_bytes))
.ok_or(CudaError::LengthTooLarge { len: pitch_bytes })?,
)
.ok_or(CudaError::LengthTooLarge { len: byte_offset })?;
if required_end > pixels.byte_len() {
return Err(CudaError::OutputTooSmall {
required: required_end,
have: pixels.byte_len(),
});
}
self.inner.set_current()?;
let num_pixels =
(width as usize)
.checked_mul(height as usize)
.ok_or(CudaError::ImageTooLarge {
width,
height,
channels: usize::from(num_components),
})?;
let sample_count = num_pixels
.checked_mul(usize::from(num_components))
.ok_or(CudaError::LengthTooLarge { len: num_pixels })?;
let output_bytes = sample_count
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: sample_count })?;
let output = self.allocate(output_bytes)?;
self.launch_j2k_deinterleave_strided_to_f32(
pixels,
&output,
width,
height,
byte_offset,
pitch_bytes,
num_components,
bit_depth,
signed,
)?;
Ok(CudaJ2kResidentComponents {
buffer: output,
num_pixels,
num_components,
execution: CudaExecutionStats {
kernel_dispatches: 1,
copy_kernel_dispatches: 0,
decode_kernel_dispatches: 0,
hardware_decode: false,
},
})
}
pub fn j2k_forward_rct_resident(
&self,
components: &mut CudaJ2kResidentComponents,
) -> Result<CudaExecutionStats, CudaError> {
if components.num_components < 3 {
return Err(CudaError::InvalidArgument {
message: "forward RCT requires at least three resident component planes"
.to_string(),
});
}
if components.num_pixels == 0 {
return Ok(CudaExecutionStats::default());
}
self.inner.set_current()?;
let plane0 = components.component_plane_device_ptr(0)?;
let plane1 = components.component_plane_device_ptr(1)?;
let plane2 = components.component_plane_device_ptr(2)?;
self.launch_j2k_forward_rct_ptrs(plane0, plane1, plane2, components.num_pixels)?;
Ok(CudaExecutionStats {
kernel_dispatches: 1,
copy_kernel_dispatches: 0,
decode_kernel_dispatches: 0,
hardware_decode: false,
})
}
pub fn j2k_forward_ict_resident(
&self,
components: &mut CudaJ2kResidentComponents,
) -> Result<CudaExecutionStats, CudaError> {
if components.num_components < 3 {
return Err(CudaError::InvalidArgument {
message: "forward ICT requires at least three resident component planes"
.to_string(),
});
}
if components.num_pixels == 0 {
return Ok(CudaExecutionStats::default());
}
self.inner.set_current()?;
let plane0 = components.component_plane_device_ptr(0)?;
let plane1 = components.component_plane_device_ptr(1)?;
let plane2 = components.component_plane_device_ptr(2)?;
self.launch_j2k_forward_ict_ptrs(plane0, plane1, plane2, components.num_pixels)?;
Ok(CudaExecutionStats {
kernel_dispatches: 1,
copy_kernel_dispatches: 0,
decode_kernel_dispatches: 0,
hardware_decode: false,
})
}
pub fn j2k_forward_rct(
&self,
plane0: &mut [f32],
plane1: &mut [f32],
plane2: &mut [f32],
) -> Result<CudaExecutionStats, CudaError> {
if plane0.len() != plane1.len() || plane0.len() != plane2.len() {
return Err(CudaError::ImageTooLarge {
width: u32::try_from(plane0.len()).unwrap_or(u32::MAX),
height: 1,
channels: 3,
});
}
if plane0.is_empty() {
return Ok(CudaExecutionStats::default());
}
self.inner.set_current()?;
let buffer0 = self.upload(f32_slice_as_bytes(plane0))?;
let buffer1 = self.upload(f32_slice_as_bytes(plane1))?;
let buffer2 = self.upload(f32_slice_as_bytes(plane2))?;
self.launch_j2k_forward_rct_buffers(&buffer0, &buffer1, &buffer2, plane0.len())?;
buffer0.copy_to_host(f32_slice_as_bytes_mut(plane0))?;
buffer1.copy_to_host(f32_slice_as_bytes_mut(plane1))?;
buffer2.copy_to_host(f32_slice_as_bytes_mut(plane2))?;
Ok(CudaExecutionStats {
kernel_dispatches: 1,
copy_kernel_dispatches: 0,
decode_kernel_dispatches: 0,
hardware_decode: false,
})
}
pub fn j2k_forward_ict(
&self,
plane0: &mut [f32],
plane1: &mut [f32],
plane2: &mut [f32],
) -> Result<CudaExecutionStats, CudaError> {
if plane0.len() != plane1.len() || plane0.len() != plane2.len() {
return Err(CudaError::ImageTooLarge {
width: u32::try_from(plane0.len()).unwrap_or(u32::MAX),
height: 1,
channels: 3,
});
}
if plane0.is_empty() {
return Ok(CudaExecutionStats::default());
}
self.inner.set_current()?;
let buffer0 = self.upload(f32_slice_as_bytes(plane0))?;
let buffer1 = self.upload(f32_slice_as_bytes(plane1))?;
let buffer2 = self.upload(f32_slice_as_bytes(plane2))?;
self.launch_j2k_forward_ict_buffers(&buffer0, &buffer1, &buffer2, plane0.len())?;
buffer0.copy_to_host(f32_slice_as_bytes_mut(plane0))?;
buffer1.copy_to_host(f32_slice_as_bytes_mut(plane1))?;
buffer2.copy_to_host(f32_slice_as_bytes_mut(plane2))?;
Ok(CudaExecutionStats {
kernel_dispatches: 1,
copy_kernel_dispatches: 0,
decode_kernel_dispatches: 0,
hardware_decode: false,
})
}
pub fn j2k_forward_dwt53(
&self,
samples: &[f32],
width: u32,
height: u32,
num_levels: u8,
) -> Result<CudaDwt53Output, CudaError> {
let expected_len =
(width as usize)
.checked_mul(height as usize)
.ok_or(CudaError::ImageTooLarge {
width,
height,
channels: 1,
})?;
if expected_len != samples.len() {
return Err(CudaError::ImageTooLarge {
width,
height,
channels: 1,
});
}
if samples.is_empty() || num_levels == 0 {
return Ok(CudaDwt53Output {
transformed: samples.to_vec(),
levels: Vec::new(),
ll_width: width,
ll_height: height,
execution: CudaExecutionStats::default(),
});
}
self.inner.set_current()?;
let buffer_a = self.upload(f32_slice_as_bytes(samples))?;
let resident = self.j2k_forward_dwt53_resident_buffer(
buffer_a,
samples.len(),
width,
height,
num_levels,
0,
)?;
let transformed = resident.download_transformed()?;
Ok(CudaDwt53Output {
transformed,
levels: resident.levels().to_vec(),
ll_width: resident.ll_dimensions().0,
ll_height: resident.ll_dimensions().1,
execution: resident.execution(),
})
}
pub fn j2k_forward_dwt53_resident_component(
&self,
components: &CudaJ2kResidentComponents,
component: u8,
width: u32,
height: u32,
num_levels: u8,
) -> Result<CudaResidentDwt53Output, CudaError> {
let expected_len =
(width as usize)
.checked_mul(height as usize)
.ok_or(CudaError::ImageTooLarge {
width,
height,
channels: 1,
})?;
if expected_len != components.num_pixels {
return Err(CudaError::ImageTooLarge {
width,
height,
channels: 1,
});
}
self.inner.set_current()?;
let plane_ptr = components.component_plane_device_ptr(component)?;
let byte_len = expected_len
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: expected_len })?;
let buffer_a = self.copy_device_ptr_to_device_with_kernel(plane_ptr, byte_len)?;
let copy_dispatches = usize::from(byte_len != 0);
self.j2k_forward_dwt53_resident_buffer(
buffer_a,
expected_len,
width,
height,
num_levels,
copy_dispatches,
)
}
fn j2k_forward_dwt53_resident_buffer(
&self,
buffer_a: CudaDeviceBuffer,
sample_count: usize,
width: u32,
height: u32,
num_levels: u8,
initial_copy_dispatches: usize,
) -> Result<CudaResidentDwt53Output, CudaError> {
if sample_count == 0 || num_levels == 0 {
return Ok(CudaResidentDwt53Output {
buffer: buffer_a,
sample_count,
levels: Vec::new(),
ll_width: width,
ll_height: height,
execution: CudaExecutionStats {
kernel_dispatches: initial_copy_dispatches,
copy_kernel_dispatches: initial_copy_dispatches,
decode_kernel_dispatches: 0,
hardware_decode: false,
},
});
}
let buffer_b = self.allocate(
sample_count
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: sample_count })?,
)?;
let mut current_width = width;
let mut current_height = height;
let mut levels = Vec::new();
let mut dispatches = 0usize;
let mut active_is_a = true;
for _ in 0..num_levels {
if current_width < 2 && current_height < 2 {
break;
}
let (level_dispatches, level_shape) = self.launch_j2k_forward_dwt53_level(
&buffer_a,
&buffer_b,
&mut active_is_a,
CudaDwt53LevelPass {
full_width: width,
current_width,
current_height,
},
)?;
dispatches = dispatches.saturating_add(level_dispatches);
levels.push(level_shape);
current_width = level_shape.low_width;
current_height = level_shape.low_height;
}
let buffer = if active_is_a { buffer_a } else { buffer_b };
Ok(CudaResidentDwt53Output {
buffer,
sample_count,
levels,
ll_width: current_width,
ll_height: current_height,
execution: CudaExecutionStats {
kernel_dispatches: initial_copy_dispatches.saturating_add(dispatches),
copy_kernel_dispatches: initial_copy_dispatches,
decode_kernel_dispatches: 0,
hardware_decode: false,
},
})
}
pub fn j2k_forward_dwt97(
&self,
samples: &[f32],
width: u32,
height: u32,
num_levels: u8,
) -> Result<CudaDwt97Output, CudaError> {
let expected_len =
(width as usize)
.checked_mul(height as usize)
.ok_or(CudaError::ImageTooLarge {
width,
height,
channels: 1,
})?;
if expected_len != samples.len() {
return Err(CudaError::ImageTooLarge {
width,
height,
channels: 1,
});
}
if samples.is_empty() || num_levels == 0 {
return Ok(CudaDwt97Output {
transformed: samples.to_vec(),
levels: Vec::new(),
ll_width: width,
ll_height: height,
execution: CudaExecutionStats::default(),
});
}
self.inner.set_current()?;
let buffer_a = self.upload(f32_slice_as_bytes(samples))?;
let resident = self.j2k_forward_dwt97_resident_buffer(
buffer_a,
samples.len(),
width,
height,
num_levels,
0,
)?;
let transformed = resident.download_transformed()?;
Ok(CudaDwt97Output {
transformed,
levels: resident.levels().to_vec(),
ll_width: resident.ll_dimensions().0,
ll_height: resident.ll_dimensions().1,
execution: resident.execution(),
})
}
pub fn j2k_forward_dwt97_resident_component(
&self,
components: &CudaJ2kResidentComponents,
component: u8,
width: u32,
height: u32,
num_levels: u8,
) -> Result<CudaResidentDwt97Output, CudaError> {
let expected_len =
(width as usize)
.checked_mul(height as usize)
.ok_or(CudaError::ImageTooLarge {
width,
height,
channels: 1,
})?;
if expected_len != components.num_pixels {
return Err(CudaError::ImageTooLarge {
width,
height,
channels: 1,
});
}
self.inner.set_current()?;
let plane_ptr = components.component_plane_device_ptr(component)?;
let byte_len = expected_len
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: expected_len })?;
let buffer_a = self.copy_device_ptr_to_device_with_kernel(plane_ptr, byte_len)?;
let copy_dispatches = usize::from(byte_len != 0);
self.j2k_forward_dwt97_resident_buffer(
buffer_a,
expected_len,
width,
height,
num_levels,
copy_dispatches,
)
}
fn j2k_forward_dwt97_resident_buffer(
&self,
buffer_a: CudaDeviceBuffer,
sample_count: usize,
width: u32,
height: u32,
num_levels: u8,
initial_copy_dispatches: usize,
) -> Result<CudaResidentDwt97Output, CudaError> {
if sample_count == 0 || num_levels == 0 {
return Ok(CudaResidentDwt97Output {
buffer: buffer_a,
sample_count,
levels: Vec::new(),
ll_width: width,
ll_height: height,
execution: CudaExecutionStats {
kernel_dispatches: initial_copy_dispatches,
copy_kernel_dispatches: initial_copy_dispatches,
decode_kernel_dispatches: 0,
hardware_decode: false,
},
});
}
let buffer_b = self.allocate(
sample_count
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: sample_count })?,
)?;
let mut current_width = width;
let mut current_height = height;
let mut levels = Vec::new();
let mut dispatches = 0usize;
let mut active_is_a = true;
for _ in 0..num_levels {
if current_width < 2 && current_height < 2 {
break;
}
let (level_dispatches, level_shape) = self.launch_j2k_forward_dwt97_level(
&buffer_a,
&buffer_b,
&mut active_is_a,
CudaDwt53LevelPass {
full_width: width,
current_width,
current_height,
},
)?;
dispatches = dispatches.saturating_add(level_dispatches);
levels.push(level_shape);
current_width = level_shape.low_width;
current_height = level_shape.low_height;
}
let buffer = if active_is_a { buffer_a } else { buffer_b };
Ok(CudaResidentDwt97Output {
buffer,
sample_count,
levels,
ll_width: current_width,
ll_height: current_height,
execution: CudaExecutionStats {
kernel_dispatches: initial_copy_dispatches.saturating_add(dispatches),
copy_kernel_dispatches: initial_copy_dispatches,
decode_kernel_dispatches: 0,
hardware_decode: false,
},
})
}
pub fn j2k_quantize_subband(
&self,
samples: &[f32],
job: CudaJ2kQuantizeJob,
) -> Result<CudaJ2kQuantizedSubband, CudaError> {
let sample_buffer = self.upload(f32_slice_as_bytes(samples))?;
let resident = self.j2k_quantize_subband_resident(&sample_buffer, samples.len(), job)?;
let coefficients = resident.download_coefficients()?;
Ok(CudaJ2kQuantizedSubband {
coefficients,
execution: resident.execution(),
})
}
pub fn j2k_quantize_subband_resident(
&self,
samples: &CudaDeviceBuffer,
sample_count: usize,
job: CudaJ2kQuantizeJob,
) -> Result<CudaJ2kResidentQuantizedSubband, CudaError> {
if sample_count == 0 {
return Ok(CudaJ2kResidentQuantizedSubband {
coefficients: self.allocate(0)?,
coefficient_count: 0,
execution: CudaExecutionStats::default(),
});
}
let available_samples = samples.typed_view::<f32>()?.len();
if available_samples < sample_count {
return Err(CudaError::OutputTooSmall {
required: sample_count
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: sample_count })?,
have: samples.byte_len(),
});
}
self.inner.set_current()?;
let coefficient_buffer = self.allocate(
sample_count
.checked_mul(std::mem::size_of::<i32>())
.ok_or(CudaError::LengthTooLarge { len: sample_count })?,
)?;
self.launch_j2k_quantize_subband(samples, &coefficient_buffer, sample_count, job)?;
Ok(CudaJ2kResidentQuantizedSubband {
coefficients: coefficient_buffer,
coefficient_count: sample_count,
execution: CudaExecutionStats {
kernel_dispatches: 1,
copy_kernel_dispatches: 0,
decode_kernel_dispatches: 0,
hardware_decode: false,
},
})
}
pub fn j2k_quantize_subband_region_resident(
&self,
samples: &CudaDeviceBuffer,
job: CudaJ2kQuantizeSubbandRegionJob,
) -> Result<CudaJ2kResidentQuantizedSubband, CudaError> {
let coefficient_count = checked_image_words(job.width, job.height, 1)?;
if coefficient_count == 0 {
return Ok(CudaJ2kResidentQuantizedSubband {
coefficients: self.allocate(0)?,
coefficient_count: 0,
execution: CudaExecutionStats::default(),
});
}
let available_samples = samples.typed_view::<f32>()?.len();
validate_quantize_region(job, available_samples)?;
self.inner.set_current()?;
let coefficient_buffer = self.allocate(
coefficient_count
.checked_mul(std::mem::size_of::<i32>())
.ok_or(CudaError::LengthTooLarge {
len: coefficient_count,
})?,
)?;
self.launch_j2k_quantize_subband_region(samples, &coefficient_buffer, job)?;
Ok(CudaJ2kResidentQuantizedSubband {
coefficients: coefficient_buffer,
coefficient_count,
execution: CudaExecutionStats {
kernel_dispatches: 1,
copy_kernel_dispatches: 0,
decode_kernel_dispatches: 0,
hardware_decode: false,
},
})
}
fn launch_j2k_forward_dwt53_level(
&self,
buffer_a: &CudaDeviceBuffer,
buffer_b: &CudaDeviceBuffer,
active_is_a: &mut bool,
pass: CudaDwt53LevelPass,
) -> Result<(usize, CudaDwt53LevelShape), CudaError> {
let low_width = pass.current_width.div_ceil(2);
let low_height = pass.current_height.div_ceil(2);
let mut dispatches = 0usize;
if pass.current_height >= 2 {
let (input, output) = active_dwt53_buffers(buffer_a, buffer_b, *active_is_a);
self.launch_j2k_forward_dwt53_pass(
CudaKernel::J2kForwardDwt53Vertical,
input,
output,
CudaDwt53Pass {
full_width: pass.full_width,
current_width: pass.current_width,
current_height: pass.current_height,
low_extent: low_height,
},
)?;
*active_is_a = !*active_is_a;
dispatches = dispatches.saturating_add(1);
}
if pass.current_width >= 2 {
let (input, output) = active_dwt53_buffers(buffer_a, buffer_b, *active_is_a);
self.launch_j2k_forward_dwt53_pass(
CudaKernel::J2kForwardDwt53Horizontal,
input,
output,
CudaDwt53Pass {
full_width: pass.full_width,
current_width: pass.current_width,
current_height: pass.current_height,
low_extent: low_width,
},
)?;
*active_is_a = !*active_is_a;
dispatches = dispatches.saturating_add(1);
}
Ok((
dispatches,
CudaDwt53LevelShape {
width: pass.current_width,
height: pass.current_height,
low_width,
low_height,
high_width: pass.current_width / 2,
high_height: pass.current_height / 2,
},
))
}
fn launch_j2k_forward_dwt97_level(
&self,
buffer_a: &CudaDeviceBuffer,
buffer_b: &CudaDeviceBuffer,
active_is_a: &mut bool,
pass: CudaDwt53LevelPass,
) -> Result<(usize, CudaDwt53LevelShape), CudaError> {
let low_width = pass.current_width.div_ceil(2);
let low_height = pass.current_height.div_ceil(2);
let mut dispatches = 0usize;
if pass.current_height >= 2 {
let (input, output) = active_dwt53_buffers(buffer_a, buffer_b, *active_is_a);
self.launch_j2k_forward_dwt53_pass(
CudaKernel::J2kForwardDwt97Vertical,
input,
output,
CudaDwt53Pass {
full_width: pass.full_width,
current_width: pass.current_width,
current_height: pass.current_height,
low_extent: low_height,
},
)?;
*active_is_a = !*active_is_a;
dispatches = dispatches.saturating_add(1);
}
if pass.current_width >= 2 {
let (input, output) = active_dwt53_buffers(buffer_a, buffer_b, *active_is_a);
self.launch_j2k_forward_dwt53_pass(
CudaKernel::J2kForwardDwt97Horizontal,
input,
output,
CudaDwt53Pass {
full_width: pass.full_width,
current_width: pass.current_width,
current_height: pass.current_height,
low_extent: low_width,
},
)?;
*active_is_a = !*active_is_a;
dispatches = dispatches.saturating_add(1);
}
Ok((
dispatches,
CudaDwt53LevelShape {
width: pass.current_width,
height: pass.current_height,
low_width,
low_height,
high_width: pass.current_width / 2,
high_height: pass.current_height / 2,
},
))
}
fn launch_j2k_forward_rct_buffers(
&self,
plane0: &CudaDeviceBuffer,
plane1: &CudaDeviceBuffer,
plane2: &CudaDeviceBuffer,
len: usize,
) -> Result<(), CudaError> {
self.launch_j2k_forward_rct_ptrs(
plane0.device_ptr(),
plane1.device_ptr(),
plane2.device_ptr(),
len,
)
}
fn launch_j2k_forward_rct_ptrs(
&self,
plane0: CuDevicePtr,
plane1: CuDevicePtr,
plane2: CuDevicePtr,
len: usize,
) -> Result<(), CudaError> {
let function = self.inner.kernel_function(CudaKernel::J2kForwardRct)?;
let mut plane0_ptr = plane0;
let mut plane1_ptr = plane1;
let mut plane2_ptr = plane2;
let mut len_u64 = u64::try_from(len).map_err(|_| CudaError::LengthTooLarge { len })?;
let mut params = cuda_kernel_params!(plane0_ptr, plane1_ptr, plane2_ptr, len_u64);
let geometry =
j2k_forward_rct_launch_geometry(len).ok_or(CudaError::LengthTooLarge { len })?;
self.launch_kernel(function, geometry, &mut params)
}
fn launch_j2k_deinterleave_to_f32(
&self,
pixels: &CudaDeviceBuffer,
output: &CudaDeviceBuffer,
num_pixels: usize,
num_components: u8,
bit_depth: u8,
signed: bool,
) -> Result<(), CudaError> {
let function = self
.inner
.kernel_function(CudaKernel::J2kDeinterleaveToF32)?;
let mut pixels_ptr = pixels.device_ptr();
let mut output_ptr = output.device_ptr();
let mut num_pixels_u64 =
u64::try_from(num_pixels).map_err(|_| CudaError::LengthTooLarge { len: num_pixels })?;
let mut num_components_u32 = u32::from(num_components);
let mut bit_depth_u32 = u32::from(bit_depth);
let mut signed_u32 = u32::from(signed);
let mut params = cuda_kernel_params!(
pixels_ptr,
output_ptr,
num_pixels_u64,
num_components_u32,
bit_depth_u32,
signed_u32
);
let geometry = j2k_forward_rct_launch_geometry(num_pixels)
.ok_or(CudaError::LengthTooLarge { len: num_pixels })?;
self.launch_kernel(function, geometry, &mut params)
}
#[allow(clippy::too_many_arguments)]
fn launch_j2k_deinterleave_strided_to_f32(
&self,
pixels: &CudaDeviceBuffer,
output: &CudaDeviceBuffer,
width: u32,
height: u32,
byte_offset: usize,
pitch_bytes: usize,
num_components: u8,
bit_depth: u8,
signed: bool,
) -> Result<(), CudaError> {
let function = self
.inner
.kernel_function(CudaKernel::J2kDeinterleaveStridedToF32)?;
let mut pixels_ptr = pixels.device_ptr();
let mut output_ptr = output.device_ptr();
let mut width_u64 = u64::from(width);
let mut height_u64 = u64::from(height);
let mut byte_offset_u64 = u64::try_from(byte_offset)
.map_err(|_| CudaError::LengthTooLarge { len: byte_offset })?;
let mut pitch_bytes_u64 = u64::try_from(pitch_bytes)
.map_err(|_| CudaError::LengthTooLarge { len: pitch_bytes })?;
let mut num_components_u32 = u32::from(num_components);
let mut bit_depth_u32 = u32::from(bit_depth);
let mut signed_u32 = u32::from(signed);
let mut params = cuda_kernel_params!(
pixels_ptr,
output_ptr,
width_u64,
height_u64,
byte_offset_u64,
pitch_bytes_u64,
num_components_u32,
bit_depth_u32,
signed_u32
);
let num_pixels =
(width as usize)
.checked_mul(height as usize)
.ok_or(CudaError::ImageTooLarge {
width,
height,
channels: usize::from(num_components),
})?;
let geometry = j2k_forward_rct_launch_geometry(num_pixels)
.ok_or(CudaError::LengthTooLarge { len: num_pixels })?;
self.launch_kernel(function, geometry, &mut params)
}
fn launch_j2k_forward_ict_buffers(
&self,
plane0: &CudaDeviceBuffer,
plane1: &CudaDeviceBuffer,
plane2: &CudaDeviceBuffer,
len: usize,
) -> Result<(), CudaError> {
self.launch_j2k_forward_ict_ptrs(
plane0.device_ptr(),
plane1.device_ptr(),
plane2.device_ptr(),
len,
)
}
fn launch_j2k_forward_ict_ptrs(
&self,
plane0: CuDevicePtr,
plane1: CuDevicePtr,
plane2: CuDevicePtr,
len: usize,
) -> Result<(), CudaError> {
let function = self.inner.kernel_function(CudaKernel::J2kForwardIct)?;
let mut plane0_ptr = plane0;
let mut plane1_ptr = plane1;
let mut plane2_ptr = plane2;
let mut len_u64 = u64::try_from(len).map_err(|_| CudaError::LengthTooLarge { len })?;
let mut params = cuda_kernel_params!(plane0_ptr, plane1_ptr, plane2_ptr, len_u64);
let geometry =
j2k_forward_rct_launch_geometry(len).ok_or(CudaError::LengthTooLarge { len })?;
self.launch_kernel(function, geometry, &mut params)
}
fn launch_j2k_forward_dwt53_pass(
&self,
kernel: CudaKernel,
input: &CudaDeviceBuffer,
output: &CudaDeviceBuffer,
pass: CudaDwt53Pass,
) -> Result<(), CudaError> {
let function = self.inner.kernel_function(kernel)?;
let mut input_ptr = input.device_ptr();
let mut output_ptr = output.device_ptr();
let mut full_width = pass.full_width;
let mut current_width = pass.current_width;
let mut current_height = pass.current_height;
let mut low_extent = pass.low_extent;
let mut params = cuda_kernel_params!(
input_ptr,
output_ptr,
full_width,
current_width,
current_height,
low_extent
);
let geometry = j2k_dwt53_launch_geometry(current_width, current_height).ok_or(
CudaError::ImageTooLarge {
width: pass.current_width,
height: pass.current_height,
channels: 1,
},
)?;
self.launch_kernel(function, geometry, &mut params)
}
fn launch_j2k_quantize_subband(
&self,
samples: &CudaDeviceBuffer,
coefficients: &CudaDeviceBuffer,
len: usize,
job: CudaJ2kQuantizeJob,
) -> Result<(), CudaError> {
let function = self.inner.kernel_function(CudaKernel::J2kQuantizeSubband)?;
let mut samples_ptr = samples.device_ptr();
let mut coefficients_ptr = coefficients.device_ptr();
let mut len_u64 = u64::try_from(len).map_err(|_| CudaError::LengthTooLarge { len })?;
let mut step_exponent = u32::from(job.step_exponent);
let mut step_mantissa = u32::from(job.step_mantissa);
let mut range_bits = u32::from(job.range_bits);
let mut reversible = u32::from(job.reversible);
let mut params = cuda_kernel_params!(
samples_ptr,
coefficients_ptr,
len_u64,
step_exponent,
step_mantissa,
range_bits,
reversible
);
let geometry =
j2k_forward_rct_launch_geometry(len).ok_or(CudaError::LengthTooLarge { len })?;
self.launch_kernel(function, geometry, &mut params)
}
fn launch_j2k_quantize_subband_region(
&self,
samples: &CudaDeviceBuffer,
coefficients: &CudaDeviceBuffer,
job: CudaJ2kQuantizeSubbandRegionJob,
) -> Result<(), CudaError> {
let function = self
.inner
.kernel_function(CudaKernel::J2kQuantizeSubbandStrided)?;
let mut samples_ptr = samples.device_ptr();
let mut coefficients_ptr = coefficients.device_ptr();
let mut x0 = job.x0;
let mut y0 = job.y0;
let mut width = job.width;
let mut height = job.height;
let mut stride = job.stride;
let mut step_exponent = u32::from(job.quantization.step_exponent);
let mut step_mantissa = u32::from(job.quantization.step_mantissa);
let mut range_bits = u32::from(job.quantization.range_bits);
let mut reversible = u32::from(job.quantization.reversible);
let mut params = cuda_kernel_params!(
samples_ptr,
coefficients_ptr,
x0,
y0,
width,
height,
stride,
step_exponent,
step_mantissa,
range_bits,
reversible
);
let geometry =
j2k_dwt53_launch_geometry(job.width, job.height).ok_or(CudaError::ImageTooLarge {
width: job.width,
height: job.height,
channels: 1,
})?;
self.launch_kernel(function, geometry, &mut params)
}
}
#[derive(Debug)]
pub struct CudaJ2kResidentComponents {
pub(crate) buffer: CudaDeviceBuffer,
pub(crate) num_pixels: usize,
pub(crate) num_components: u8,
pub(crate) execution: CudaExecutionStats,
}
impl CudaJ2kResidentComponents {
pub fn buffer(&self) -> &CudaDeviceBuffer {
&self.buffer
}
pub fn num_pixels(&self) -> usize {
self.num_pixels
}
pub fn num_components(&self) -> u8 {
self.num_components
}
pub fn execution(&self) -> CudaExecutionStats {
self.execution
}
pub fn download_components(&self) -> Result<Vec<Vec<f32>>, CudaError> {
if self.num_pixels == 0 {
return Ok(vec![Vec::new(); usize::from(self.num_components)]);
}
let sample_count = self
.num_pixels
.checked_mul(usize::from(self.num_components))
.ok_or(CudaError::LengthTooLarge {
len: self.num_pixels,
})?;
let mut flattened = vec![0.0f32; sample_count];
self.buffer
.copy_to_host(f32_slice_as_bytes_mut(&mut flattened))?;
Ok(flattened
.chunks_exact(self.num_pixels)
.map(<[f32]>::to_vec)
.collect())
}
fn component_plane_device_ptr(&self, component: u8) -> Result<CuDevicePtr, CudaError> {
if component >= self.num_components {
return Err(CudaError::InvalidArgument {
message: "component plane index is out of range".to_string(),
});
}
let plane_bytes = self
.num_pixels
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge {
len: self.num_pixels,
})?;
let offset = plane_bytes
.checked_mul(usize::from(component))
.ok_or(CudaError::LengthTooLarge { len: plane_bytes })?;
let end = offset
.checked_add(plane_bytes)
.ok_or(CudaError::LengthTooLarge { len: offset })?;
if end > self.buffer.byte_len() {
return Err(CudaError::OutputTooSmall {
required: end,
have: self.buffer.byte_len(),
});
}
let offset =
u64::try_from(offset).map_err(|_| CudaError::LengthTooLarge { len: offset })?;
self.buffer
.device_ptr()
.checked_add(offset)
.ok_or(CudaError::LengthTooLarge {
len: self.buffer.byte_len(),
})
}
}
#[derive(Debug)]
pub struct CudaJ2kDeinterleavedComponents {
pub(crate) components: Vec<Vec<f32>>,
pub(crate) execution: CudaExecutionStats,
}
impl CudaJ2kDeinterleavedComponents {
pub fn components(&self) -> &[Vec<f32>] {
&self.components
}
pub fn execution(&self) -> CudaExecutionStats {
self.execution
}
pub fn into_components(self) -> Vec<Vec<f32>> {
self.components
}
}
#[derive(Debug)]
pub struct CudaDwt53Output {
pub(crate) transformed: Vec<f32>,
pub(crate) levels: Vec<CudaDwt53LevelShape>,
pub(crate) ll_width: u32,
pub(crate) ll_height: u32,
pub(crate) execution: CudaExecutionStats,
}
impl CudaDwt53Output {
pub fn transformed(&self) -> &[f32] {
&self.transformed
}
pub fn levels(&self) -> &[CudaDwt53LevelShape] {
&self.levels
}
pub fn ll_dimensions(&self) -> (u32, u32) {
(self.ll_width, self.ll_height)
}
pub fn execution(&self) -> CudaExecutionStats {
self.execution
}
}
#[derive(Debug)]
pub struct CudaResidentDwt53Output {
pub(crate) buffer: CudaDeviceBuffer,
pub(crate) sample_count: usize,
pub(crate) levels: Vec<CudaDwt53LevelShape>,
pub(crate) ll_width: u32,
pub(crate) ll_height: u32,
pub(crate) execution: CudaExecutionStats,
}
impl CudaResidentDwt53Output {
pub fn buffer(&self) -> &CudaDeviceBuffer {
&self.buffer
}
pub fn sample_count(&self) -> usize {
self.sample_count
}
pub fn download_transformed(&self) -> Result<Vec<f32>, CudaError> {
let mut transformed = vec![0f32; self.sample_count];
self.buffer
.copy_to_host(f32_slice_as_bytes_mut(&mut transformed))?;
Ok(transformed)
}
pub fn levels(&self) -> &[CudaDwt53LevelShape] {
&self.levels
}
pub fn ll_dimensions(&self) -> (u32, u32) {
(self.ll_width, self.ll_height)
}
pub fn execution(&self) -> CudaExecutionStats {
self.execution
}
}
#[derive(Debug)]
pub struct CudaDwt97Output {
pub(crate) transformed: Vec<f32>,
pub(crate) levels: Vec<CudaDwt53LevelShape>,
pub(crate) ll_width: u32,
pub(crate) ll_height: u32,
pub(crate) execution: CudaExecutionStats,
}
impl CudaDwt97Output {
pub fn transformed(&self) -> &[f32] {
&self.transformed
}
pub fn levels(&self) -> &[CudaDwt53LevelShape] {
&self.levels
}
pub fn ll_dimensions(&self) -> (u32, u32) {
(self.ll_width, self.ll_height)
}
pub fn execution(&self) -> CudaExecutionStats {
self.execution
}
}
#[derive(Debug)]
pub struct CudaResidentDwt97Output {
pub(crate) buffer: CudaDeviceBuffer,
pub(crate) sample_count: usize,
pub(crate) levels: Vec<CudaDwt53LevelShape>,
pub(crate) ll_width: u32,
pub(crate) ll_height: u32,
pub(crate) execution: CudaExecutionStats,
}
impl CudaResidentDwt97Output {
pub fn buffer(&self) -> &CudaDeviceBuffer {
&self.buffer
}
pub fn sample_count(&self) -> usize {
self.sample_count
}
pub fn download_transformed(&self) -> Result<Vec<f32>, CudaError> {
let mut transformed = vec![0f32; self.sample_count];
self.buffer
.copy_to_host(f32_slice_as_bytes_mut(&mut transformed))?;
Ok(transformed)
}
pub fn levels(&self) -> &[CudaDwt53LevelShape] {
&self.levels
}
pub fn ll_dimensions(&self) -> (u32, u32) {
(self.ll_width, self.ll_height)
}
pub fn execution(&self) -> CudaExecutionStats {
self.execution
}
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaJ2kQuantizeJob {
pub step_exponent: u16,
pub step_mantissa: u16,
pub range_bits: u8,
pub reversible: bool,
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaJ2kQuantizeSubbandRegionJob {
pub x0: u32,
pub y0: u32,
pub width: u32,
pub height: u32,
pub stride: u32,
pub quantization: CudaJ2kQuantizeJob,
}
#[derive(Debug)]
pub struct CudaJ2kQuantizedSubband {
pub(crate) coefficients: Vec<i32>,
pub(crate) execution: CudaExecutionStats,
}
impl CudaJ2kQuantizedSubband {
pub fn coefficients(&self) -> &[i32] {
&self.coefficients
}
pub fn execution(&self) -> CudaExecutionStats {
self.execution
}
}
#[derive(Debug)]
pub struct CudaJ2kResidentQuantizedSubband {
pub(crate) coefficients: CudaDeviceBuffer,
pub(crate) coefficient_count: usize,
pub(crate) execution: CudaExecutionStats,
}
impl CudaJ2kResidentQuantizedSubband {
pub fn buffer(&self) -> &CudaDeviceBuffer {
&self.coefficients
}
pub fn coefficient_count(&self) -> usize {
self.coefficient_count
}
pub fn download_coefficients(&self) -> Result<Vec<i32>, CudaError> {
let mut coefficients = vec![0i32; self.coefficient_count];
self.coefficients
.copy_to_host(i32_slice_as_bytes_mut(&mut coefficients))?;
Ok(coefficients)
}
pub fn execution(&self) -> CudaExecutionStats {
self.execution
}
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CudaDwt53LevelShape {
pub width: u32,
pub height: u32,
pub low_width: u32,
pub low_height: u32,
pub high_width: u32,
pub high_height: u32,
}
#[derive(Clone, Copy, Debug)]
pub(crate) struct CudaDwt53Pass {
pub(crate) full_width: u32,
pub(crate) current_width: u32,
pub(crate) current_height: u32,
pub(crate) low_extent: u32,
}
#[derive(Clone, Copy, Debug)]
pub(crate) struct CudaDwt53LevelPass {
pub(crate) full_width: u32,
pub(crate) current_width: u32,
pub(crate) current_height: u32,
}
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct CudaDwt97BatchStageTimings {
pub pack_upload_us: u128,
pub idct_row_lift_us: u128,
pub column_lift_us: u128,
pub quantize_codeblock_us: u128,
pub ht_encode_us: u128,
pub ht_codeblock_dispatches: usize,
pub readback_us: u128,
}
pub(crate) fn validate_quantize_region(
job: CudaJ2kQuantizeSubbandRegionJob,
available_samples: usize,
) -> Result<(), CudaError> {
if job.width == 0 || job.height == 0 {
return Ok(());
}
if job.stride == 0
|| job
.x0
.checked_add(job.width)
.is_none_or(|end_x| end_x > job.stride)
{
return Err(CudaError::LengthTooLarge {
len: available_samples,
});
}
let last_sample = (job.y0 as usize)
.checked_add(job.height as usize - 1)
.and_then(|row| row.checked_mul(job.stride as usize))
.and_then(|row| row.checked_add(job.x0 as usize))
.and_then(|row| row.checked_add(job.width as usize))
.ok_or(CudaError::LengthTooLarge {
len: available_samples,
})?;
if last_sample > available_samples {
return Err(CudaError::OutputTooSmall {
required: last_sample
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge { len: last_sample })?,
have: available_samples
.checked_mul(std::mem::size_of::<f32>())
.ok_or(CudaError::LengthTooLarge {
len: available_samples,
})?,
});
}
Ok(())
}