#[cfg(all(test, feature = "cuda-runtime"))]
use core::cell::Cell;
use core::convert::Infallible;
#[cfg(feature = "cuda-runtime")]
use std::sync::Arc;
use j2k::{
adapter::device_plan::{DeviceDecodePlan, DeviceDecodeRequest},
J2kDecoder as CpuDecoder, J2kError, J2kScratchPool as CpuJ2kScratchPool, J2kView,
};
#[cfg(feature = "cuda-runtime")]
use j2k_core::BackendKind;
use j2k_core::{
submit_ready_device, BackendRequest, DecodeOutcome, Downscale, ImageCodec, ImageDecode,
ImageDecodeDevice, ImageDecodeSubmit, PixelFormat, ReadySubmission, Rect,
};
#[cfg(feature = "cuda-runtime")]
use j2k_cuda_runtime::{
CudaBufferPool, CudaBufferPoolTakeTrace, CudaDeviceBuffer, CudaError, CudaHtj2kCleanupTarget,
CudaHtj2kCodeBlockJob, CudaHtj2kDecodeResources, CudaHtj2kDecodeTableResources,
CudaHtj2kDequantizeTarget, CudaJ2kIdwtJob, CudaJ2kIdwtTarget, CudaJ2kInverseMctJob,
CudaJ2kRect, CudaJ2kStoreGray16Job, CudaJ2kStoreGray8Job, CudaJ2kStoreRgb16Job,
CudaJ2kStoreRgb16MctJob, CudaJ2kStoreRgb8Job, CudaJ2kStoreRgb8MctJob,
CudaJ2kStoreRgb8MctTarget, CudaPooledDeviceBuffer, CudaQueuedExecution, CudaQueuedHtj2kCleanup,
};
use j2k_native::{DecodeSettings, DecoderContext as NativeDecoderContext, Image as NativeImage};
#[cfg(feature = "cuda-runtime")]
use crate::runtime::cuda_error;
use crate::runtime::{validate_surface_request, wrap_cpu_staged_cuda_surface, wrap_surface};
#[cfg(feature = "cuda-runtime")]
use crate::surface::{cuda_range_storage, Storage};
use crate::{
profile, CudaHtj2kDecodePlan, CudaHtj2kDecodeProfileDetail, CudaHtj2kProfileReport,
CudaSession, Error, Surface,
};
#[cfg(feature = "cuda-runtime")]
use crate::{
CudaHtj2kBandId, CudaHtj2kIdwtStep, CudaHtj2kStoreStep, CudaHtj2kTransform, CudaSurfaceStats,
SurfaceResidency,
};
#[cfg(feature = "cuda-runtime")]
const CUDA_HTJ2K_KERNELS_NOT_READY: &str =
"strict CUDA HTJ2K resident codestream decode kernels are not available in this build";
#[cfg(feature = "cuda-runtime")]
const CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED: &str =
"strict CUDA HTJ2K resident decode currently accepts Gray8, Gray16, Rgb8, Rgba8, Rgb16, and Rgba16 output";
#[cfg(feature = "cuda-runtime")]
const CUDA_HTJ2K_PLAN_INVARIANT_FAILED: &str =
"strict CUDA HTJ2K resident decode plan has invalid internal ranges";
#[cfg(feature = "cuda-runtime")]
const CUDA_HTJ2K_STORE_UNSUPPORTED: &str =
"strict CUDA HTJ2K resident decode requires a single grayscale store step";
#[cfg(feature = "cuda-runtime")]
const CUDA_HTJ2K_BATCH_PAYLOAD_TOO_LARGE: &str =
"strict CUDA HTJ2K resident batch decode payload is too large";
#[cfg(feature = "cuda-runtime")]
const CUDA_IDWT_TRACE_ENV_VAR: &str = "J2K_CUDA_IDWT_TRACE";
#[cfg(all(test, feature = "cuda-runtime"))]
std::thread_local! {
static CUDA_HTJ2K_BATCH_DECODE_CALLS: Cell<usize> = const { Cell::new(0) };
}
#[cfg(all(test, feature = "cuda-runtime"))]
pub(crate) fn testing_reset_cuda_htj2k_batch_decode_calls() {
CUDA_HTJ2K_BATCH_DECODE_CALLS.with(|calls| calls.set(0));
}
#[cfg(all(test, feature = "cuda-runtime"))]
pub(crate) fn testing_cuda_htj2k_batch_decode_calls() -> usize {
CUDA_HTJ2K_BATCH_DECODE_CALLS.with(Cell::get)
}
#[cfg(any(test, feature = "cuda-runtime"))]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CudaIdwtBatchHostTraceRow {
component_count: usize,
step_count: usize,
output_alloc_us: u128,
target_build_us: u128,
enqueue_us: u128,
output_take_count: usize,
output_pool_reuse_count: usize,
output_pool_alloc_count: usize,
output_pool_scanned_count: usize,
output_pool_max_free_count: usize,
output_requested_bytes: usize,
}
#[cfg(any(test, feature = "cuda-runtime"))]
fn format_cuda_idwt_batch_host_trace_row(row: CudaIdwtBatchHostTraceRow) -> String {
format!(
"j2k_profile codec=j2k op=cuda_idwt_batch_host path=decode \
component_count={} step_count={} output_alloc_us={} target_build_us={} enqueue_us={} \
output_take_count={} output_pool_reuse_count={} output_pool_alloc_count={} \
output_pool_scanned_count={} output_pool_max_free_count={} output_requested_bytes={}",
row.component_count,
row.step_count,
row.output_alloc_us,
row.target_build_us,
row.enqueue_us,
row.output_take_count,
row.output_pool_reuse_count,
row.output_pool_alloc_count,
row.output_pool_scanned_count,
row.output_pool_max_free_count,
row.output_requested_bytes
)
}
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
struct CudaIdwtOutputPoolTraceTotals {
take_count: usize,
reuse_count: usize,
alloc_count: usize,
scanned_count: usize,
max_free_count: usize,
requested_bytes: usize,
}
#[cfg(feature = "cuda-runtime")]
impl CudaIdwtOutputPoolTraceTotals {
fn add_take(&mut self, trace: CudaBufferPoolTakeTrace) {
self.take_count = self.take_count.saturating_add(1);
if trace.reused {
self.reuse_count = self.reuse_count.saturating_add(1);
} else {
self.alloc_count = self.alloc_count.saturating_add(1);
}
self.scanned_count = self.scanned_count.saturating_add(trace.scanned_count);
self.max_free_count = self.max_free_count.max(trace.free_count_before);
self.requested_bytes = self.requested_bytes.saturating_add(trace.requested_len);
}
}
#[cfg(feature = "cuda-runtime")]
fn cuda_idwt_trace_enabled() -> bool {
std::env::var_os(CUDA_IDWT_TRACE_ENV_VAR).is_some()
}
#[cfg(feature = "cuda-runtime")]
fn elapsed_host_us(start: Option<std::time::Instant>) -> u128 {
start.map_or(0, |start| start.elapsed().as_micros())
}
pub struct J2kDecoder<'a> {
inner: CpuDecoder<'a>,
pool: CpuJ2kScratchPool,
}
impl<'a> J2kDecoder<'a> {
pub fn new(input: &'a [u8]) -> Result<Self, Error> {
Ok(Self {
inner: CpuDecoder::new(input)?,
pool: CpuJ2kScratchPool::new(),
})
}
fn decode_to_surface_impl(
&mut self,
session: &mut CudaSession,
fmt: PixelFormat,
backend: BackendRequest,
) -> Result<Surface, Error> {
validate_surface_request(backend)?;
if matches!(backend, BackendRequest::Cuda) {
return self.decode_to_cuda_resident_surface_impl(session, fmt);
}
let dims = self.inner.info().dimensions;
let stride = dims.0 as usize * fmt.bytes_per_pixel();
let mut out = vec![0u8; stride * dims.1 as usize];
if j2k_profile::gpu_route_profile_enabled() {
let request_s = format!("{backend:?}");
let fmt_s = format!("{fmt:?}");
let width_s = dims.0.to_string();
let height_s = dims.1.to_string();
j2k_profile::emit_gpu_route_profile(
"j2k",
"cuda",
&[
("op", "full"),
("request", request_s.as_str()),
("fmt", fmt_s.as_str()),
("width", width_s.as_str()),
("height", height_s.as_str()),
("decision", "cpu_decode_then_wrap"),
],
);
}
self.inner
.decode_into_with_scratch(&mut self.pool, &mut out, stride, fmt)?;
wrap_surface(out, dims, fmt, backend, session)
}
fn decode_to_cuda_resident_surface_impl(
&mut self,
session: &mut CudaSession,
fmt: PixelFormat,
) -> Result<Surface, Error> {
decode_to_cuda_resident_surface_impl(self, session, fmt)
}
fn decode_region_to_cuda_resident_surface_impl(
&mut self,
session: &mut CudaSession,
fmt: PixelFormat,
roi: Rect,
) -> Result<Surface, Error> {
decode_region_to_cuda_resident_surface_impl(self, session, fmt, roi)
}
fn decode_scaled_to_cuda_resident_surface_impl(
&mut self,
session: &mut CudaSession,
fmt: PixelFormat,
scale: Downscale,
) -> Result<Surface, Error> {
decode_scaled_to_cuda_resident_surface_impl(self, session, fmt, scale)
}
fn decode_region_scaled_to_cuda_resident_surface_impl(
&mut self,
session: &mut CudaSession,
fmt: PixelFormat,
roi: Rect,
scale: Downscale,
) -> Result<Surface, Error> {
decode_region_scaled_to_cuda_resident_surface_impl(self, session, fmt, roi, scale)
}
fn decode_region_to_surface_impl(
&mut self,
session: &mut CudaSession,
fmt: PixelFormat,
roi: Rect,
backend: BackendRequest,
) -> Result<Surface, Error> {
validate_surface_request(backend)?;
if matches!(backend, BackendRequest::Cuda) {
return self.decode_region_to_cuda_resident_surface_impl(session, fmt, roi);
}
let plan = DeviceDecodePlan::for_image(
self.inner.info().dimensions,
DeviceDecodeRequest::Region { roi },
)?;
let dims = plan.output_dims();
let stride = dims.0 as usize * fmt.bytes_per_pixel();
let mut out = vec![0u8; stride * dims.1 as usize];
self.inner
.decode_region_into(&mut self.pool, &mut out, stride, fmt, plan.source_rect())?;
wrap_surface(out, dims, fmt, backend, session)
}
fn decode_scaled_to_surface_impl(
&mut self,
session: &mut CudaSession,
fmt: PixelFormat,
scale: Downscale,
backend: BackendRequest,
) -> Result<Surface, Error> {
validate_surface_request(backend)?;
if matches!(backend, BackendRequest::Cuda) {
return self.decode_scaled_to_cuda_resident_surface_impl(session, fmt, scale);
}
let dims = DeviceDecodePlan::for_image(
self.inner.info().dimensions,
DeviceDecodeRequest::Scaled { scale },
)?
.output_dims();
let stride = dims.0 as usize * fmt.bytes_per_pixel();
let mut out = vec![0u8; stride * dims.1 as usize];
self.inner
.decode_scaled_into(&mut self.pool, &mut out, stride, fmt, scale)?;
wrap_surface(out, dims, fmt, backend, session)
}
fn decode_region_scaled_to_surface_impl(
&mut self,
session: &mut CudaSession,
fmt: PixelFormat,
roi: Rect,
scale: Downscale,
backend: BackendRequest,
) -> Result<Surface, Error> {
validate_surface_request(backend)?;
if matches!(backend, BackendRequest::Cuda) {
return self
.decode_region_scaled_to_cuda_resident_surface_impl(session, fmt, roi, scale);
}
let plan = DeviceDecodePlan::for_image(
self.inner.info().dimensions,
DeviceDecodeRequest::RegionScaled { roi, scale },
)?;
let dims = plan.output_dims();
let stride = dims.0 as usize * fmt.bytes_per_pixel();
let mut out = vec![0u8; stride * dims.1 as usize];
self.inner.decode_region_scaled_into(
&mut self.pool,
&mut out,
stride,
fmt,
plan.source_rect(),
scale,
)?;
wrap_surface(out, dims, fmt, backend, session)
}
pub fn decode_to_device_with_session(
&mut self,
fmt: PixelFormat,
session: &mut CudaSession,
) -> Result<Surface, Error> {
self.decode_to_surface_impl(session, fmt, BackendRequest::Cuda)
}
pub fn decode_to_device_with_session_and_profile(
&mut self,
fmt: PixelFormat,
session: &mut CudaSession,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
decode_to_cuda_resident_surface_with_profile_impl(self, session, fmt)
}
pub fn decode_batch_to_device_with_session(
inputs: &[&[u8]],
fmt: PixelFormat,
session: &mut CudaSession,
) -> Result<Vec<Surface>, Error> {
decode_batch_to_cuda_resident_surface_with_profile_control(inputs, session, fmt, false)
.map(|(surfaces, _report)| surfaces)
}
pub fn decode_batch_to_device_with_session_and_profile(
inputs: &[&[u8]],
fmt: PixelFormat,
session: &mut CudaSession,
) -> Result<(Vec<Surface>, CudaHtj2kProfileReport), Error> {
decode_batch_to_cuda_resident_surface_with_profile_control(inputs, session, fmt, true)
}
pub(crate) fn decode_region_to_device_with_session(
&mut self,
fmt: PixelFormat,
roi: Rect,
session: &mut CudaSession,
) -> Result<Surface, Error> {
self.decode_region_to_surface_impl(session, fmt, roi, BackendRequest::Cuda)
}
pub(crate) fn decode_scaled_to_device_with_session(
&mut self,
fmt: PixelFormat,
scale: Downscale,
session: &mut CudaSession,
) -> Result<Surface, Error> {
self.decode_scaled_to_surface_impl(session, fmt, scale, BackendRequest::Cuda)
}
pub(crate) fn decode_region_scaled_to_device_with_session(
&mut self,
fmt: PixelFormat,
roi: Rect,
scale: Downscale,
session: &mut CudaSession,
) -> Result<Surface, Error> {
self.decode_region_scaled_to_surface_impl(session, fmt, roi, scale, BackendRequest::Cuda)
}
pub fn decode_to_host_surface(&mut self, fmt: PixelFormat) -> Result<Surface, Error> {
let mut session = CudaSession::default();
self.decode_to_surface_impl(&mut session, fmt, BackendRequest::Cpu)
}
pub fn build_cuda_htj2k_grayscale_plan_with_profile(
&mut self,
fmt: PixelFormat,
) -> Result<(CudaHtj2kDecodePlan, CudaHtj2kProfileReport), Error> {
let total_start = profile::profile_now(true);
let parse_start = profile::profile_now(true);
let image = NativeImage::new(self.inner.bytes(), &DecodeSettings::default())
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let parse_us = profile::elapsed_us(parse_start);
let plan_start = profile::profile_now(true);
let mut native_context = NativeDecoderContext::default();
let native_plan = image
.build_direct_grayscale_plan_with_context(&mut native_context)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let plan_us = profile::elapsed_us(plan_start);
let flatten_start = profile::profile_now(true);
let cuda_plan = CudaHtj2kDecodePlan::from_grayscale_direct_plan(&native_plan, fmt, (0, 0))?;
let flatten_us = profile::elapsed_us(flatten_start);
let report = CudaHtj2kProfileReport {
parse_us,
plan_us,
flatten_us,
total_us: profile::elapsed_us(total_start),
block_count: cuda_plan.code_blocks().len(),
payload_bytes: cuda_plan.payload().len(),
dispatch_count: 0,
residency: crate::SurfaceResidency::CudaResidentDecode,
detail: CudaHtj2kDecodeProfileDetail::default(),
..CudaHtj2kProfileReport::default()
};
report.emit("plan");
Ok((cuda_plan, report))
}
pub fn build_cuda_htj2k_grayscale_region_plan_with_profile(
&mut self,
fmt: PixelFormat,
roi: Rect,
) -> Result<(CudaHtj2kDecodePlan, CudaHtj2kProfileReport), Error> {
let total_start = profile::profile_now(true);
let parse_start = profile::profile_now(true);
let image = NativeImage::new(self.inner.bytes(), &DecodeSettings::default())
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let parse_us = profile::elapsed_us(parse_start);
let plan_start = profile::profile_now(true);
let mut native_context = NativeDecoderContext::default();
let native_plan = image
.build_direct_grayscale_plan_region_with_context(
&mut native_context,
(roi.x, roi.y, roi.w, roi.h),
)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let plan_us = profile::elapsed_us(plan_start);
let flatten_start = profile::profile_now(true);
let cuda_plan = CudaHtj2kDecodePlan::from_grayscale_direct_plan_region(
&native_plan,
fmt,
(roi.x, roi.y),
(roi.w, roi.h),
)?;
let flatten_us = profile::elapsed_us(flatten_start);
let report = CudaHtj2kProfileReport {
parse_us,
plan_us,
flatten_us,
total_us: profile::elapsed_us(total_start),
block_count: cuda_plan.code_blocks().len(),
payload_bytes: cuda_plan.payload().len(),
dispatch_count: 0,
residency: crate::SurfaceResidency::CudaResidentDecode,
detail: CudaHtj2kDecodeProfileDetail::default(),
..CudaHtj2kProfileReport::default()
};
report.emit("plan");
Ok((cuda_plan, report))
}
pub fn build_cuda_htj2k_grayscale_scaled_plan_with_profile(
&mut self,
fmt: PixelFormat,
output_dimensions: (u32, u32),
) -> Result<(CudaHtj2kDecodePlan, CudaHtj2kProfileReport), Error> {
let total_start = profile::profile_now(true);
let parse_start = profile::profile_now(true);
let image = NativeImage::new(
self.inner.bytes(),
&DecodeSettings {
target_resolution: Some(output_dimensions),
..DecodeSettings::default()
},
)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let parse_us = profile::elapsed_us(parse_start);
let plan_start = profile::profile_now(true);
let mut native_context = NativeDecoderContext::default();
let native_plan = image
.build_direct_grayscale_plan_with_context(&mut native_context)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let plan_us = profile::elapsed_us(plan_start);
let flatten_start = profile::profile_now(true);
let cuda_plan = CudaHtj2kDecodePlan::from_grayscale_direct_plan(&native_plan, fmt, (0, 0))?;
let flatten_us = profile::elapsed_us(flatten_start);
let report = CudaHtj2kProfileReport {
parse_us,
plan_us,
flatten_us,
total_us: profile::elapsed_us(total_start),
block_count: cuda_plan.code_blocks().len(),
payload_bytes: cuda_plan.payload().len(),
dispatch_count: 0,
residency: crate::SurfaceResidency::CudaResidentDecode,
detail: CudaHtj2kDecodeProfileDetail::default(),
..CudaHtj2kProfileReport::default()
};
report.emit("plan");
Ok((cuda_plan, report))
}
pub fn build_cuda_htj2k_grayscale_region_scaled_plan_with_profile(
&mut self,
fmt: PixelFormat,
scaled_roi: Rect,
output_dimensions: (u32, u32),
) -> Result<(CudaHtj2kDecodePlan, CudaHtj2kProfileReport), Error> {
let total_start = profile::profile_now(true);
let parse_start = profile::profile_now(true);
let image = NativeImage::new(
self.inner.bytes(),
&DecodeSettings {
target_resolution: Some(output_dimensions),
..DecodeSettings::default()
},
)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let parse_us = profile::elapsed_us(parse_start);
let plan_start = profile::profile_now(true);
let mut native_context = NativeDecoderContext::default();
let native_plan = image
.build_direct_grayscale_plan_region_with_context(
&mut native_context,
(scaled_roi.x, scaled_roi.y, scaled_roi.w, scaled_roi.h),
)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let plan_us = profile::elapsed_us(plan_start);
let flatten_start = profile::profile_now(true);
let cuda_plan = CudaHtj2kDecodePlan::from_grayscale_direct_plan_region(
&native_plan,
fmt,
(scaled_roi.x, scaled_roi.y),
(scaled_roi.w, scaled_roi.h),
)?;
let flatten_us = profile::elapsed_us(flatten_start);
let report = CudaHtj2kProfileReport {
parse_us,
plan_us,
flatten_us,
total_us: profile::elapsed_us(total_start),
block_count: cuda_plan.code_blocks().len(),
payload_bytes: cuda_plan.payload().len(),
dispatch_count: 0,
residency: crate::SurfaceResidency::CudaResidentDecode,
detail: CudaHtj2kDecodeProfileDetail::default(),
..CudaHtj2kProfileReport::default()
};
report.emit("plan");
Ok((cuda_plan, report))
}
#[cfg(feature = "cuda-runtime")]
fn build_cuda_htj2k_color_plans_with_profile(
&mut self,
fmt: PixelFormat,
) -> Result<CudaHtj2kColorDecodePlans, Error> {
let mut native_context = NativeDecoderContext::default();
build_cuda_htj2k_color_plans_from_bytes_with_profile(
self.inner.bytes(),
fmt,
&mut native_context,
)
}
#[cfg(feature = "cuda-runtime")]
fn build_cuda_htj2k_color_scaled_plans_with_profile(
&mut self,
fmt: PixelFormat,
output_dimensions: (u32, u32),
) -> Result<CudaHtj2kColorDecodePlans, Error> {
let total_start = profile::profile_now(true);
let parse_start = profile::profile_now(true);
let image = NativeImage::new(
self.inner.bytes(),
&DecodeSettings {
target_resolution: Some(output_dimensions),
..DecodeSettings::default()
},
)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let parse_us = profile::elapsed_us(parse_start);
let plan_start = profile::profile_now(true);
let mut native_context = NativeDecoderContext::default();
let native_plan = image
.build_direct_color_plan_with_context(&mut native_context)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let plan_us = profile::elapsed_us(plan_start);
let flatten_start = profile::profile_now(true);
let mut payload = Vec::new();
let mut components = Vec::with_capacity(native_plan.component_plans.len());
for component_plan in &native_plan.component_plans {
let mut component =
CudaHtj2kDecodePlan::from_grayscale_direct_plan(component_plan, fmt, (0, 0))?;
component.append_payload_to_shared(&mut payload)?;
components.push(component);
}
let flatten_us = profile::elapsed_us(flatten_start);
let block_count = components
.iter()
.map(|plan| plan.code_blocks().len())
.sum::<usize>();
let payload_bytes = payload.len();
let report = CudaHtj2kProfileReport {
parse_us,
plan_us,
flatten_us,
total_us: profile::elapsed_us(total_start),
block_count,
payload_bytes,
dispatch_count: 0,
residency: crate::SurfaceResidency::CudaResidentDecode,
detail: CudaHtj2kDecodeProfileDetail::default(),
..CudaHtj2kProfileReport::default()
};
report.emit("plan");
Ok(CudaHtj2kColorDecodePlans {
dimensions: native_plan.dimensions,
mct_dimensions: native_plan.dimensions,
bit_depths: native_plan.bit_depths,
mct: native_plan.mct,
transform: CudaHtj2kTransform::from_native(native_plan.transform),
payload,
components,
report,
})
}
#[cfg(feature = "cuda-runtime")]
fn build_cuda_htj2k_color_region_plans_with_profile(
&mut self,
fmt: PixelFormat,
roi: Rect,
) -> Result<CudaHtj2kColorDecodePlans, Error> {
let total_start = profile::profile_now(true);
let parse_start = profile::profile_now(true);
let image = NativeImage::new(self.inner.bytes(), &DecodeSettings::default())
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let parse_us = profile::elapsed_us(parse_start);
let plan_start = profile::profile_now(true);
let mut native_context = NativeDecoderContext::default();
let native_plan = image
.build_direct_color_plan_with_context(&mut native_context)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let plan_us = profile::elapsed_us(plan_start);
let flatten_start = profile::profile_now(true);
let mut payload = Vec::new();
let mut components = Vec::with_capacity(native_plan.component_plans.len());
for component_plan in &native_plan.component_plans {
let mut component = CudaHtj2kDecodePlan::from_grayscale_direct_plan_region(
component_plan,
fmt,
(roi.x, roi.y),
(roi.w, roi.h),
)?;
component.append_payload_to_shared(&mut payload)?;
components.push(component);
}
let flatten_us = profile::elapsed_us(flatten_start);
let block_count = components
.iter()
.map(|plan| plan.code_blocks().len())
.sum::<usize>();
let payload_bytes = payload.len();
let report = CudaHtj2kProfileReport {
parse_us,
plan_us,
flatten_us,
total_us: profile::elapsed_us(total_start),
block_count,
payload_bytes,
dispatch_count: 0,
residency: crate::SurfaceResidency::CudaResidentDecode,
detail: CudaHtj2kDecodeProfileDetail::default(),
..CudaHtj2kProfileReport::default()
};
report.emit("plan");
Ok(CudaHtj2kColorDecodePlans {
dimensions: (roi.w, roi.h),
mct_dimensions: native_plan.dimensions,
bit_depths: native_plan.bit_depths,
mct: native_plan.mct,
transform: CudaHtj2kTransform::from_native(native_plan.transform),
payload,
components,
report,
})
}
#[cfg(feature = "cuda-runtime")]
fn build_cuda_htj2k_color_region_scaled_plans_with_profile(
&mut self,
fmt: PixelFormat,
scaled_roi: Rect,
output_dimensions: (u32, u32),
) -> Result<CudaHtj2kColorDecodePlans, Error> {
let total_start = profile::profile_now(true);
let parse_start = profile::profile_now(true);
let image = NativeImage::new(
self.inner.bytes(),
&DecodeSettings {
target_resolution: Some(output_dimensions),
..DecodeSettings::default()
},
)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let parse_us = profile::elapsed_us(parse_start);
let plan_start = profile::profile_now(true);
let mut native_context = NativeDecoderContext::default();
let native_plan = image
.build_direct_color_plan_with_context(&mut native_context)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let plan_us = profile::elapsed_us(plan_start);
let flatten_start = profile::profile_now(true);
let mut payload = Vec::new();
let mut components = Vec::with_capacity(native_plan.component_plans.len());
for component_plan in &native_plan.component_plans {
let mut component = CudaHtj2kDecodePlan::from_grayscale_direct_plan_region(
component_plan,
fmt,
(scaled_roi.x, scaled_roi.y),
(scaled_roi.w, scaled_roi.h),
)?;
component.append_payload_to_shared(&mut payload)?;
components.push(component);
}
let flatten_us = profile::elapsed_us(flatten_start);
let block_count = components
.iter()
.map(|plan| plan.code_blocks().len())
.sum::<usize>();
let payload_bytes = payload.len();
let report = CudaHtj2kProfileReport {
parse_us,
plan_us,
flatten_us,
total_us: profile::elapsed_us(total_start),
block_count,
payload_bytes,
dispatch_count: 0,
residency: crate::SurfaceResidency::CudaResidentDecode,
detail: CudaHtj2kDecodeProfileDetail::default(),
..CudaHtj2kProfileReport::default()
};
report.emit("plan");
Ok(CudaHtj2kColorDecodePlans {
dimensions: (scaled_roi.w, scaled_roi.h),
mct_dimensions: native_plan.dimensions,
bit_depths: native_plan.bit_depths,
mct: native_plan.mct,
transform: CudaHtj2kTransform::from_native(native_plan.transform),
payload,
components,
report,
})
}
pub fn decode_to_cpu_staged_cuda_surface_with_session(
&mut self,
fmt: PixelFormat,
session: &mut CudaSession,
) -> Result<Surface, Error> {
let dims = self.inner.info().dimensions;
let stride = dims.0 as usize * fmt.bytes_per_pixel();
let mut out = vec![0u8; stride * dims.1 as usize];
self.inner
.decode_into_with_scratch(&mut self.pool, &mut out, stride, fmt)?;
wrap_cpu_staged_cuda_surface(&out, dims, fmt, session)
}
pub fn decode_region_to_cpu_staged_cuda_surface_with_session(
&mut self,
fmt: PixelFormat,
roi: Rect,
session: &mut CudaSession,
) -> Result<Surface, Error> {
let plan = DeviceDecodePlan::for_image(
self.inner.info().dimensions,
DeviceDecodeRequest::Region { roi },
)?;
let dims = plan.output_dims();
let stride = dims.0 as usize * fmt.bytes_per_pixel();
let mut out = vec![0u8; stride * dims.1 as usize];
self.inner
.decode_region_into(&mut self.pool, &mut out, stride, fmt, plan.source_rect())?;
wrap_cpu_staged_cuda_surface(&out, dims, fmt, session)
}
pub fn decode_scaled_to_cpu_staged_cuda_surface_with_session(
&mut self,
fmt: PixelFormat,
scale: Downscale,
session: &mut CudaSession,
) -> Result<Surface, Error> {
let dims = DeviceDecodePlan::for_image(
self.inner.info().dimensions,
DeviceDecodeRequest::Scaled { scale },
)?
.output_dims();
let stride = dims.0 as usize * fmt.bytes_per_pixel();
let mut out = vec![0u8; stride * dims.1 as usize];
self.inner
.decode_scaled_into(&mut self.pool, &mut out, stride, fmt, scale)?;
wrap_cpu_staged_cuda_surface(&out, dims, fmt, session)
}
pub fn decode_region_scaled_to_cpu_staged_cuda_surface_with_session(
&mut self,
fmt: PixelFormat,
roi: Rect,
scale: Downscale,
session: &mut CudaSession,
) -> Result<Surface, Error> {
let plan = DeviceDecodePlan::for_image(
self.inner.info().dimensions,
DeviceDecodeRequest::RegionScaled { roi, scale },
)?;
let dims = plan.output_dims();
let stride = dims.0 as usize * fmt.bytes_per_pixel();
let mut out = vec![0u8; stride * dims.1 as usize];
self.inner.decode_region_scaled_into(
&mut self.pool,
&mut out,
stride,
fmt,
plan.source_rect(),
scale,
)?;
wrap_cpu_staged_cuda_surface(&out, dims, fmt, session)
}
}
#[cfg(feature = "cuda-runtime")]
struct CudaCoefficientBand {
band_id: CudaHtj2kBandId,
buffer: CudaPooledDeviceBuffer,
}
#[cfg(feature = "cuda-runtime")]
struct CudaPendingDequantBand {
band_index: usize,
jobs: Vec<CudaHtj2kCodeBlockJob>,
output_words: usize,
}
#[cfg(feature = "cuda-runtime")]
struct CudaComponentDecodeWork {
bands: Vec<CudaCoefficientBand>,
pending_dequant_bands: Vec<CudaPendingDequantBand>,
store: CudaHtj2kStoreStep,
dispatches: usize,
decode_dispatches: usize,
timings: CudaDecodeStageTimings,
}
#[cfg(feature = "cuda-runtime")]
struct CudaQueuedIdwtBatch {
queued: Vec<CudaQueuedExecution>,
kernel_dispatches: usize,
decode_dispatches: usize,
}
#[cfg(feature = "cuda-runtime")]
struct CudaDecodedComponent {
buffer: CudaPooledDeviceBuffer,
store: CudaHtj2kStoreStep,
dispatches: usize,
decode_dispatches: usize,
timings: CudaDecodeStageTimings,
}
#[cfg(feature = "cuda-runtime")]
struct CudaPreparedRgb8MctBatchStore {
color: CudaHtj2kColorDecodePlans,
decoded_components: Vec<CudaDecodedComponent>,
dispatches: usize,
decode_dispatches: usize,
job: CudaJ2kStoreRgb8MctJob,
}
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy, Debug, Default)]
struct CudaDecodeStageTimings {
h2d: u128,
payload_upload: u128,
status_d2h: u128,
ht_cleanup: u128,
ht_refine: u128,
dequant: u128,
ht_dispatch_count: usize,
idwt: u128,
dequant_dispatch_count: usize,
idwt_dispatch_count: usize,
}
#[cfg(feature = "cuda-runtime")]
impl CudaDecodeStageTimings {
fn add_to_report(self, report: &mut CudaHtj2kProfileReport) {
report.h2d_us = report.h2d_us.saturating_add(self.h2d);
report.detail.payload_upload_us = report
.detail
.payload_upload_us
.saturating_add(self.payload_upload);
report.detail.status_d2h_us = report.detail.status_d2h_us.saturating_add(self.status_d2h);
report.ht_cleanup_us = report.ht_cleanup_us.saturating_add(self.ht_cleanup);
report.ht_refine_us = report.ht_refine_us.saturating_add(self.ht_refine);
report.dequant_us = report.dequant_us.saturating_add(self.dequant);
report.idwt_us = report.idwt_us.saturating_add(self.idwt);
report.detail.ht_dispatch_count = report
.detail
.ht_dispatch_count
.saturating_add(self.ht_dispatch_count);
report.detail.dequant_dispatch_count = report
.detail
.dequant_dispatch_count
.saturating_add(self.dequant_dispatch_count);
report.detail.idwt_dispatch_count = report
.detail
.idwt_dispatch_count
.saturating_add(self.idwt_dispatch_count);
}
}
#[cfg(feature = "cuda-runtime")]
struct CudaHtj2kColorDecodePlans {
dimensions: (u32, u32),
mct_dimensions: (u32, u32),
bit_depths: [u8; 3],
mct: bool,
transform: CudaHtj2kTransform,
payload: Vec<u8>,
components: Vec<CudaHtj2kDecodePlan>,
report: CudaHtj2kProfileReport,
}
#[cfg(feature = "cuda-runtime")]
fn decode_to_cuda_resident_surface_impl(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
) -> Result<Surface, Error> {
decode_to_cuda_resident_surface_with_profile_control(decoder, session, fmt, false)
.map(|(surface, _report)| surface)
}
#[cfg(feature = "cuda-runtime")]
fn decode_to_cuda_resident_surface_with_profile_impl(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
decode_to_cuda_resident_surface_with_profile_control(decoder, session, fmt, true)
}
#[cfg(feature = "cuda-runtime")]
fn decode_to_cuda_resident_surface_with_profile_control(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
collect_stage_timings: bool,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
let collect_stage_timings = collect_stage_timings || profile::profile_stages_enabled();
let wall_started = profile::profile_now(collect_stage_timings);
match fmt {
PixelFormat::Gray8 | PixelFormat::Gray16 => {
decode_grayscale_cuda_resident_surface_with_profile(
decoder,
session,
fmt,
wall_started,
collect_stage_timings,
)
}
PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
decode_color_cuda_resident_surface_with_profile(
decoder,
session,
fmt,
wall_started,
collect_stage_timings,
)
}
_ => Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
}),
}
}
#[cfg(feature = "cuda-runtime")]
fn decode_batch_to_cuda_resident_surface_with_profile_control(
inputs: &[&[u8]],
session: &mut CudaSession,
fmt: PixelFormat,
collect_stage_timings: bool,
) -> Result<(Vec<Surface>, CudaHtj2kProfileReport), Error> {
#[cfg(all(test, feature = "cuda-runtime"))]
CUDA_HTJ2K_BATCH_DECODE_CALLS.with(|calls| calls.set(calls.get().saturating_add(1)));
let collect_stage_timings = collect_stage_timings || profile::profile_stages_enabled();
if inputs.is_empty() {
return Ok((
Vec::new(),
CudaHtj2kProfileReport {
residency: SurfaceResidency::CudaResidentDecode,
..CudaHtj2kProfileReport::default()
},
));
}
match fmt {
PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
decode_color_cuda_resident_batch_surfaces_with_profile(
inputs,
session,
fmt,
collect_stage_timings,
)
}
_ => Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
}),
}
}
#[cfg(feature = "cuda-runtime")]
fn decode_region_to_cuda_resident_surface_impl(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
roi: Rect,
) -> Result<Surface, Error> {
let plan = DeviceDecodePlan::for_image(
decoder.inner.info().dimensions,
DeviceDecodeRequest::Region { roi },
)?;
if plan.is_full_frame() {
return decode_to_cuda_resident_surface_impl(decoder, session, fmt);
}
match fmt {
PixelFormat::Gray8 | PixelFormat::Gray16 => {
decode_grayscale_cuda_resident_region_surface(decoder, session, fmt, plan.source_rect())
}
PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
decode_color_cuda_resident_region_surface(decoder, session, fmt, plan.source_rect())
}
_ => Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
}),
}
}
#[cfg(feature = "cuda-runtime")]
fn decode_scaled_to_cuda_resident_surface_impl(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
scale: Downscale,
) -> Result<Surface, Error> {
if scale == Downscale::None {
return decode_to_cuda_resident_surface_impl(decoder, session, fmt);
}
let output_dimensions = DeviceDecodePlan::for_image(
decoder.inner.info().dimensions,
DeviceDecodeRequest::Scaled { scale },
)?
.output_dims();
match fmt {
PixelFormat::Gray8 | PixelFormat::Gray16 => {
decode_grayscale_cuda_resident_scaled_surface(decoder, session, fmt, output_dimensions)
}
PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
decode_color_cuda_resident_scaled_surface(decoder, session, fmt, output_dimensions)
}
_ => Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
}),
}
}
#[cfg(feature = "cuda-runtime")]
fn decode_region_scaled_to_cuda_resident_surface_impl(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
roi: Rect,
scale: Downscale,
) -> Result<Surface, Error> {
if scale == Downscale::None {
return decode_region_to_cuda_resident_surface_impl(decoder, session, fmt, roi);
}
let source_dimensions = decoder.inner.info().dimensions;
let scaled_dimensions =
DeviceDecodePlan::for_image(source_dimensions, DeviceDecodeRequest::Scaled { scale })?
.output_dims();
let plan = DeviceDecodePlan::for_image(
source_dimensions,
DeviceDecodeRequest::RegionScaled { roi, scale },
)?;
let scaled_roi = plan.output_rect();
match fmt {
PixelFormat::Gray8 | PixelFormat::Gray16 => {
decode_grayscale_cuda_resident_region_scaled_surface(
decoder,
session,
fmt,
scaled_roi,
scaled_dimensions,
)
}
PixelFormat::Rgb8 | PixelFormat::Rgba8 | PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
decode_color_cuda_resident_region_scaled_surface(
decoder,
session,
fmt,
scaled_roi,
scaled_dimensions,
)
}
_ => Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED,
}),
}
}
#[cfg(feature = "cuda-runtime")]
fn decode_grayscale_cuda_resident_surface_with_profile(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
wall_started: Option<profile::ProfileInstant>,
collect_stage_timings: bool,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
let (plan, mut report) = decoder.build_cuda_htj2k_grayscale_plan_with_profile(fmt)?;
decode_grayscale_cuda_resident_surface_with_plan_profile(
session,
fmt,
&plan,
&mut report,
wall_started,
collect_stage_timings,
)
}
#[cfg(feature = "cuda-runtime")]
fn decode_grayscale_cuda_resident_region_surface(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
roi: Rect,
) -> Result<Surface, Error> {
let collect_stage_timings = profile::profile_stages_enabled();
let wall_started = profile::profile_now(collect_stage_timings);
let (plan, mut report) =
decoder.build_cuda_htj2k_grayscale_region_plan_with_profile(fmt, roi)?;
decode_grayscale_cuda_resident_surface_with_plan_profile(
session,
fmt,
&plan,
&mut report,
wall_started,
collect_stage_timings,
)
.map(|(surface, _report)| surface)
}
#[cfg(feature = "cuda-runtime")]
fn decode_grayscale_cuda_resident_scaled_surface(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
output_dimensions: (u32, u32),
) -> Result<Surface, Error> {
let collect_stage_timings = profile::profile_stages_enabled();
let wall_started = profile::profile_now(collect_stage_timings);
let (plan, mut report) =
decoder.build_cuda_htj2k_grayscale_scaled_plan_with_profile(fmt, output_dimensions)?;
decode_grayscale_cuda_resident_surface_with_plan_profile(
session,
fmt,
&plan,
&mut report,
wall_started,
collect_stage_timings,
)
.map(|(surface, _report)| surface)
}
#[cfg(feature = "cuda-runtime")]
fn decode_grayscale_cuda_resident_region_scaled_surface(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
scaled_roi: Rect,
scaled_dimensions: (u32, u32),
) -> Result<Surface, Error> {
let collect_stage_timings = profile::profile_stages_enabled();
let wall_started = profile::profile_now(collect_stage_timings);
let (plan, mut report) = decoder.build_cuda_htj2k_grayscale_region_scaled_plan_with_profile(
fmt,
scaled_roi,
scaled_dimensions,
)?;
decode_grayscale_cuda_resident_surface_with_plan_profile(
session,
fmt,
&plan,
&mut report,
wall_started,
collect_stage_timings,
)
.map(|(surface, _report)| surface)
}
#[cfg(feature = "cuda-runtime")]
fn decode_grayscale_cuda_resident_surface_with_plan_profile(
session: &mut CudaSession,
fmt: PixelFormat,
plan: &CudaHtj2kDecodePlan,
report: &mut CudaHtj2kProfileReport,
wall_started: Option<profile::ProfileInstant>,
collect_stage_timings: bool,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
let context = session.cuda_context()?;
let table_upload_start = profile::profile_now(collect_stage_timings);
let table_resources = session.htj2k_decode_table_resources()?;
let table_upload_us = profile::elapsed_us(table_upload_start);
report.h2d_us = report.h2d_us.saturating_add(table_upload_us);
report.detail.table_upload_us = report
.detail
.table_upload_us
.saturating_add(table_upload_us);
let pool = session.decode_buffer_pool()?;
let component = decode_cuda_component_plan(
&context,
plan,
&table_resources,
&pool,
collect_stage_timings,
)?;
let input_width = component
.store
.input_rect
.x1
.saturating_sub(component.store.input_rect.x0);
let component_buffer = pooled_cuda_buffer(&component.buffer)?;
let (store_output, store_us) = context
.time_default_stream_named_us_if(
collect_stage_timings,
"j2k.htj2k.decode.store.gray",
|| match fmt {
PixelFormat::Gray8 => context.j2k_store_gray8_device(
component_buffer,
CudaJ2kStoreGray8Job {
input_width,
source_x: component.store.source_x,
source_y: component.store.source_y,
copy_width: component.store.copy_width,
copy_height: component.store.copy_height,
output_width: component.store.output_width,
output_height: component.store.output_height,
output_x: component.store.output_x,
output_y: component.store.output_y,
addend: component.store.addend,
bit_depth: u32::from(plan.bit_depth()),
},
),
PixelFormat::Gray16 => context.j2k_store_gray16_device(
component_buffer,
CudaJ2kStoreGray16Job {
input_width,
source_x: component.store.source_x,
source_y: component.store.source_y,
copy_width: component.store.copy_width,
copy_height: component.store.copy_height,
output_width: component.store.output_width,
output_height: component.store.output_height,
output_x: component.store.output_x,
output_y: component.store.output_y,
addend: component.store.addend,
bit_depth: u32::from(plan.bit_depth()),
},
),
_ => Err(CudaError::InvalidArgument {
message: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED.to_string(),
}),
},
)
.map_err(cuda_error)?;
let (surface_buffer, store_stats) = store_output.into_parts();
let dispatches = component
.dispatches
.saturating_add(store_stats.kernel_dispatches());
let decode_dispatches = component
.decode_dispatches
.saturating_add(store_stats.decode_kernel_dispatches());
report.dispatch_count = dispatches;
component.timings.add_to_report(report);
report.store_us = report.store_us.saturating_add(store_us);
report.detail.store_dispatch_count = report
.detail
.store_dispatch_count
.saturating_add(store_stats.kernel_dispatches());
report.detail.wall_total_us = profile::elapsed_us(wall_started);
profile::finalize_decode_total_us(report);
report.emit("decode");
let dimensions = (component.store.output_width, component.store.output_height);
let surface = Surface {
backend: BackendKind::Cuda,
residency: SurfaceResidency::CudaResidentDecode,
dimensions,
fmt,
pitch_bytes: dimensions.0 as usize * fmt.bytes_per_pixel(),
stats: CudaSurfaceStats {
total: dispatches,
copy: 0,
decode: decode_dispatches,
},
storage: Storage::Cuda(surface_buffer),
};
Ok((surface, report.clone()))
}
#[cfg(feature = "cuda-runtime")]
fn decode_color_cuda_resident_surface_with_profile(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
wall_started: Option<profile::ProfileInstant>,
collect_stage_timings: bool,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
let color = decoder.build_cuda_htj2k_color_plans_with_profile(fmt)?;
decode_color_cuda_resident_surface_with_plans_profile(
session,
fmt,
color,
wall_started,
collect_stage_timings,
)
}
#[cfg(feature = "cuda-runtime")]
fn decode_color_cuda_resident_scaled_surface(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
output_dimensions: (u32, u32),
) -> Result<Surface, Error> {
let collect_stage_timings = profile::profile_stages_enabled();
let wall_started = profile::profile_now(collect_stage_timings);
let color = decoder.build_cuda_htj2k_color_scaled_plans_with_profile(fmt, output_dimensions)?;
decode_color_cuda_resident_surface_with_plans_profile(
session,
fmt,
color,
wall_started,
collect_stage_timings,
)
.map(|(surface, _report)| surface)
}
#[cfg(feature = "cuda-runtime")]
fn decode_color_cuda_resident_region_surface(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
roi: Rect,
) -> Result<Surface, Error> {
let collect_stage_timings = profile::profile_stages_enabled();
let wall_started = profile::profile_now(collect_stage_timings);
let color = decoder.build_cuda_htj2k_color_region_plans_with_profile(fmt, roi)?;
decode_color_cuda_resident_surface_with_plans_profile(
session,
fmt,
color,
wall_started,
collect_stage_timings,
)
.map(|(surface, _report)| surface)
}
#[cfg(feature = "cuda-runtime")]
fn decode_color_cuda_resident_region_scaled_surface(
decoder: &mut J2kDecoder<'_>,
session: &mut CudaSession,
fmt: PixelFormat,
scaled_roi: Rect,
scaled_dimensions: (u32, u32),
) -> Result<Surface, Error> {
let collect_stage_timings = profile::profile_stages_enabled();
let wall_started = profile::profile_now(collect_stage_timings);
let color = decoder.build_cuda_htj2k_color_region_scaled_plans_with_profile(
fmt,
scaled_roi,
scaled_dimensions,
)?;
decode_color_cuda_resident_surface_with_plans_profile(
session,
fmt,
color,
wall_started,
collect_stage_timings,
)
.map(|(surface, _report)| surface)
}
#[cfg(feature = "cuda-runtime")]
fn decode_color_cuda_resident_surface_with_plans_profile(
session: &mut CudaSession,
fmt: PixelFormat,
mut color: CudaHtj2kColorDecodePlans,
wall_started: Option<profile::ProfileInstant>,
collect_stage_timings: bool,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
if color.components.len() != 3 {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
let context = session.cuda_context()?;
let pool = session.decode_buffer_pool()?;
let table_upload_start = profile::profile_now(collect_stage_timings);
let table_resources = session.htj2k_decode_table_resources()?;
let table_upload_us = profile::elapsed_us(table_upload_start);
color.report.h2d_us = color.report.h2d_us.saturating_add(table_upload_us);
color.report.detail.table_upload_us = color
.report
.detail
.table_upload_us
.saturating_add(table_upload_us);
let payload_upload_start = profile::profile_now(collect_stage_timings);
let decode_resources = context
.upload_htj2k_decode_resources_with_tables(&color.payload, &table_resources)
.map_err(cuda_error)?;
let payload_upload_us = profile::elapsed_us(payload_upload_start);
profile::add_payload_resource_upload_us(&mut color.report, payload_upload_us);
let mut component_work = Vec::with_capacity(3);
for plan in &color.components {
component_work.push(decode_cuda_component_subbands_with_resources(
&context,
plan,
&pool,
collect_stage_timings,
)?);
}
run_component_cleanup_dequant_batches(
&context,
&decode_resources,
&mut component_work,
&pool,
collect_stage_timings,
)?;
finish_color_cuda_resident_surface_with_component_work(
&context,
&pool,
fmt,
color,
component_work,
wall_started,
collect_stage_timings,
true,
true,
)
}
#[cfg(feature = "cuda-runtime")]
fn decode_color_cuda_resident_batch_surfaces_with_profile(
inputs: &[&[u8]],
session: &mut CudaSession,
fmt: PixelFormat,
collect_stage_timings: bool,
) -> Result<(Vec<Surface>, CudaHtj2kProfileReport), Error> {
let batch_wall_started = profile::profile_now(collect_stage_timings);
let mut colors = Vec::with_capacity(inputs.len());
let mut shared_payload = Vec::new();
let mut native_context = NativeDecoderContext::default();
for input in inputs {
let mut color =
build_cuda_htj2k_color_plans_from_bytes_with_profile(input, fmt, &mut native_context)?;
if color.components.len() != 3 {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
append_color_payload_to_shared(&mut color, &mut shared_payload)?;
colors.push(color);
}
let context = session.cuda_context()?;
let pool = session.decode_batch_buffer_pool()?;
let table_upload_start = profile::profile_now(collect_stage_timings);
let table_resources = session.htj2k_decode_table_resources()?;
let table_upload_us = profile::elapsed_us(table_upload_start);
let payload_upload_start = profile::profile_now(collect_stage_timings);
let decode_resources = context
.upload_htj2k_decode_resources_with_tables_and_pool(
&shared_payload,
&table_resources,
&pool,
)
.map_err(cuda_error)?;
let payload_upload_us = profile::elapsed_us(payload_upload_start);
let component_count = colors
.iter()
.map(|color| color.components.len())
.sum::<usize>();
let mut all_component_work = Vec::with_capacity(component_count);
for color in &colors {
for plan in &color.components {
all_component_work.push(decode_cuda_component_subbands_with_resources(
&context,
plan,
&pool,
collect_stage_timings,
)?);
}
}
run_component_cleanup_dequant_batches(
&context,
&decode_resources,
&mut all_component_work,
&pool,
collect_stage_timings,
)?;
let batch_components = colors
.iter()
.flat_map(|color| color.components.iter())
.collect::<Vec<_>>();
let idwt_batched = can_batch_color_idwt(&batch_components);
let pending_idwt_batch = if idwt_batched {
run_color_component_idwt_batches(
&context,
&batch_components,
&mut all_component_work,
&pool,
collect_stage_timings,
)?
} else {
None
};
drop(batch_components);
let can_use_batch_store =
idwt_batched && can_batch_rgb8_mct_color_store(fmt, &colors, &all_component_work)?;
let (surfaces, reports) = if can_use_batch_store {
finish_color_cuda_resident_batch_surfaces_with_rgb8_mct_store(
&context,
fmt,
colors,
all_component_work,
collect_stage_timings,
)?
} else {
let mut surfaces = Vec::with_capacity(colors.len());
let mut reports = Vec::with_capacity(colors.len());
let mut work_iter = all_component_work.into_iter();
for color in colors {
let component_count = color.components.len();
let component_work = work_iter.by_ref().take(component_count).collect::<Vec<_>>();
if component_work.len() != component_count {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
let (surface, report) = finish_color_cuda_resident_surface_with_component_work(
&context,
&pool,
fmt,
color,
component_work,
None,
collect_stage_timings,
!idwt_batched,
false,
)?;
surfaces.push(surface);
reports.push(report);
}
(surfaces, reports)
};
drop(pending_idwt_batch);
let aggregate = finalize_color_batch_decode_report(
&reports,
table_upload_us,
payload_upload_us,
batch_wall_started,
);
aggregate.emit("decode_batch");
Ok((surfaces, aggregate))
}
#[cfg(feature = "cuda-runtime")]
fn build_cuda_htj2k_color_plans_from_bytes_with_profile<'a>(
input: &'a [u8],
fmt: PixelFormat,
native_context: &mut NativeDecoderContext<'a>,
) -> Result<CudaHtj2kColorDecodePlans, Error> {
let total_start = profile::profile_now(true);
let parse_start = profile::profile_now(true);
let image = NativeImage::new(input, &DecodeSettings::default())
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let parse_us = profile::elapsed_us(parse_start);
let plan_start = profile::profile_now(true);
let native_plan = image
.build_direct_color_plan_with_context(native_context)
.map_err(|error| Error::Decode(J2kError::Backend(error.to_string())))?;
let plan_us = profile::elapsed_us(plan_start);
let flatten_start = profile::profile_now(true);
let mut payload = Vec::new();
let mut components = Vec::with_capacity(native_plan.component_plans.len());
for component_plan in &native_plan.component_plans {
let mut component =
CudaHtj2kDecodePlan::from_grayscale_direct_plan(component_plan, fmt, (0, 0))?;
component.append_payload_to_shared(&mut payload)?;
components.push(component);
}
let flatten_us = profile::elapsed_us(flatten_start);
let block_count = components
.iter()
.map(|plan| plan.code_blocks().len())
.sum::<usize>();
let payload_bytes = payload.len();
let report = CudaHtj2kProfileReport {
parse_us,
plan_us,
flatten_us,
total_us: profile::elapsed_us(total_start),
block_count,
payload_bytes,
dispatch_count: 0,
residency: crate::SurfaceResidency::CudaResidentDecode,
detail: CudaHtj2kDecodeProfileDetail::default(),
..CudaHtj2kProfileReport::default()
};
report.emit("plan");
Ok(CudaHtj2kColorDecodePlans {
dimensions: native_plan.dimensions,
mct_dimensions: native_plan.dimensions,
bit_depths: native_plan.bit_depths,
mct: native_plan.mct,
transform: CudaHtj2kTransform::from_native(native_plan.transform),
payload,
components,
report,
})
}
#[cfg(feature = "cuda-runtime")]
fn finalize_color_batch_decode_report(
reports: &[CudaHtj2kProfileReport],
table_upload_us: u128,
payload_upload_us: u128,
batch_wall_started: Option<profile::ProfileInstant>,
) -> CudaHtj2kProfileReport {
let mut aggregate = aggregate_decode_reports(reports);
aggregate.h2d_us = aggregate
.h2d_us
.saturating_add(table_upload_us)
.saturating_add(payload_upload_us);
aggregate.detail.table_upload_us = aggregate
.detail
.table_upload_us
.saturating_add(table_upload_us);
aggregate.detail.payload_upload_us = aggregate
.detail
.payload_upload_us
.saturating_add(payload_upload_us);
aggregate.detail.wall_total_us = profile::elapsed_us(batch_wall_started);
profile::finalize_decode_total_us(&mut aggregate);
aggregate
}
#[cfg(feature = "cuda-runtime")]
fn can_batch_rgb8_mct_color_store(
fmt: PixelFormat,
colors: &[CudaHtj2kColorDecodePlans],
all_component_work: &[CudaComponentDecodeWork],
) -> Result<bool, Error> {
if !matches!(fmt, PixelFormat::Rgb8 | PixelFormat::Rgba8) {
return Ok(false);
}
let mut offset = 0usize;
for color in colors {
let component_count = color.components.len();
if component_count != 3 || offset.saturating_add(component_count) > all_component_work.len()
{
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
if !color.mct {
return Ok(false);
}
let component_work = &all_component_work[offset..offset + component_count];
let stores = [
&component_work[0].store,
&component_work[1].store,
&component_work[2].store,
];
validate_color_stores(stores, color.dimensions)?;
if !can_fuse_mct_store_for_stores(stores) {
return Ok(false);
}
offset = offset.saturating_add(component_count);
}
if offset != all_component_work.len() {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
Ok(!colors.is_empty())
}
#[cfg(feature = "cuda-runtime")]
fn finish_color_cuda_resident_batch_surfaces_with_rgb8_mct_store(
context: &j2k_cuda_runtime::CudaContext,
fmt: PixelFormat,
colors: Vec<CudaHtj2kColorDecodePlans>,
all_component_work: Vec<CudaComponentDecodeWork>,
collect_stage_timings: bool,
) -> Result<(Vec<Surface>, Vec<CudaHtj2kProfileReport>), Error> {
let mut prepared = Vec::with_capacity(colors.len());
let mut work_iter = all_component_work.into_iter();
for color in colors {
let component_count = color.components.len();
let component_work = work_iter.by_ref().take(component_count).collect::<Vec<_>>();
if component_work.len() != component_count {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
prepared.push(prepare_rgb8_mct_batch_store(fmt, color, component_work)?);
}
if work_iter.next().is_some() {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
let targets = prepared
.iter()
.map(rgb8_mct_batch_store_target)
.collect::<Result<Vec<_>, Error>>()?;
let (store_output, store_us) = context
.time_default_stream_named_us_if(
collect_stage_timings,
"j2k.htj2k.decode.store.color.batch",
|| context.j2k_store_rgb8_mct_batch_contiguous_device(&targets),
)
.map_err(cuda_error)?;
drop(targets);
let (surface_buffer, surface_ranges, store_stats) = store_output.into_parts();
if surface_ranges.len() != prepared.len() {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
let shared_surface_buffer = Arc::new(surface_buffer);
let mut surfaces = Vec::with_capacity(prepared.len());
let mut reports = Vec::with_capacity(prepared.len());
let store_dispatches = store_stats.kernel_dispatches();
let store_decode_dispatches = store_stats.decode_kernel_dispatches();
for (index, (mut prepared, surface_range)) in
prepared.into_iter().zip(surface_ranges).enumerate()
{
let report_store_dispatches = if index == 0 { store_dispatches } else { 0 };
let report_store_decode_dispatches = if index == 0 {
store_decode_dispatches
} else {
0
};
let report_store_us = if index == 0 { store_us } else { 0 };
let dispatches = prepared.dispatches.saturating_add(report_store_dispatches);
let decode_dispatches = prepared
.decode_dispatches
.saturating_add(report_store_decode_dispatches);
prepared.color.report.dispatch_count = dispatches;
prepared.color.report.store_us = prepared
.color
.report
.store_us
.saturating_add(report_store_us);
prepared.color.report.detail.store_dispatch_count = prepared
.color
.report
.detail
.store_dispatch_count
.saturating_add(report_store_dispatches);
profile::finalize_decode_total_us(&mut prepared.color.report);
let dimensions = prepared.color.dimensions;
surfaces.push(Surface {
backend: BackendKind::Cuda,
residency: SurfaceResidency::CudaResidentDecode,
dimensions,
fmt,
pitch_bytes: dimensions.0 as usize * fmt.bytes_per_pixel(),
stats: CudaSurfaceStats {
total: dispatches,
copy: 0,
decode: decode_dispatches,
},
storage: cuda_range_storage(
shared_surface_buffer.clone(),
surface_range.offset,
surface_range.len,
),
});
reports.push(prepared.color.report);
}
Ok((surfaces, reports))
}
#[cfg(feature = "cuda-runtime")]
fn prepare_rgb8_mct_batch_store(
fmt: PixelFormat,
mut color: CudaHtj2kColorDecodePlans,
component_work: Vec<CudaComponentDecodeWork>,
) -> Result<CudaPreparedRgb8MctBatchStore, Error> {
let decoded_components = component_work
.into_iter()
.map(finish_cuda_component_decode)
.collect::<Result<Vec<_>, Error>>()?;
let [component0, component1, component2] = decoded_components.as_slice() else {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
};
let stores = [&component0.store, &component1.store, &component2.store];
validate_color_stores(stores, color.dimensions)?;
if !color.mct || !can_fuse_mct_store_for_stores(stores) {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
let dispatches = decoded_components
.iter()
.map(|component| component.dispatches)
.sum::<usize>();
let decode_dispatches = decoded_components
.iter()
.map(|component| component.decode_dispatches)
.sum::<usize>();
for component in &decoded_components {
component.timings.add_to_report(&mut color.report);
}
let addends = [
bit_depth_addend(color.bit_depths[0]),
bit_depth_addend(color.bit_depths[1]),
bit_depth_addend(color.bit_depths[2]),
];
let job = CudaJ2kStoreRgb8MctJob {
store: CudaJ2kStoreRgb8Job {
input_width0: color_store_input_width(&component0.store),
input_width1: color_store_input_width(&component1.store),
input_width2: color_store_input_width(&component2.store),
source_x0: component0.store.source_x,
source_y0: component0.store.source_y,
source_x1: component1.store.source_x,
source_y1: component1.store.source_y,
source_x2: component2.store.source_x,
source_y2: component2.store.source_y,
copy_width: component0.store.copy_width,
copy_height: component0.store.copy_height,
output_width: component0.store.output_width,
output_height: component0.store.output_height,
output_x: component0.store.output_x,
output_y: component0.store.output_y,
addend0: addends[0],
addend1: addends[1],
addend2: addends[2],
bit_depth0: u32::from(color.bit_depths[0]),
bit_depth1: u32::from(color.bit_depths[1]),
bit_depth2: u32::from(color.bit_depths[2]),
rgba: u32::from(fmt == PixelFormat::Rgba8),
},
irreversible97: u32::from(color.transform == CudaHtj2kTransform::Irreversible97),
};
Ok(CudaPreparedRgb8MctBatchStore {
color,
decoded_components,
dispatches,
decode_dispatches,
job,
})
}
#[cfg(feature = "cuda-runtime")]
fn rgb8_mct_batch_store_target(
prepared: &CudaPreparedRgb8MctBatchStore,
) -> Result<CudaJ2kStoreRgb8MctTarget<'_>, Error> {
let [component0, component1, component2] = prepared.decoded_components.as_slice() else {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
};
Ok(CudaJ2kStoreRgb8MctTarget {
plane0: pooled_cuda_buffer(&component0.buffer)?,
plane1: pooled_cuda_buffer(&component1.buffer)?,
plane2: pooled_cuda_buffer(&component2.buffer)?,
job: prepared.job,
})
}
#[cfg(feature = "cuda-runtime")]
fn can_fuse_mct_store_for_stores(stores: [&CudaHtj2kStoreStep; 3]) -> bool {
let input_width0 = color_store_input_width(stores[0]);
let input_width1 = color_store_input_width(stores[1]);
let input_width2 = color_store_input_width(stores[2]);
input_width0 == input_width1
&& input_width0 == input_width2
&& stores[0].source_x == stores[1].source_x
&& stores[0].source_x == stores[2].source_x
&& stores[0].source_y == stores[1].source_y
&& stores[0].source_y == stores[2].source_y
}
#[cfg(feature = "cuda-runtime")]
fn color_store_input_width(store: &CudaHtj2kStoreStep) -> u32 {
store.input_rect.x1.saturating_sub(store.input_rect.x0)
}
#[cfg(feature = "cuda-runtime")]
#[allow(clippy::too_many_arguments)]
fn finish_color_cuda_resident_surface_with_component_work(
context: &j2k_cuda_runtime::CudaContext,
pool: &CudaBufferPool,
fmt: PixelFormat,
mut color: CudaHtj2kColorDecodePlans,
mut component_work: Vec<CudaComponentDecodeWork>,
wall_started: Option<profile::ProfileInstant>,
collect_stage_timings: bool,
run_idwt: bool,
emit_report: bool,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
let pending_idwt_batch = if run_idwt {
let batch_components = color.components.iter().collect::<Vec<_>>();
if can_batch_color_idwt(&batch_components) {
run_color_component_idwt_batches(
context,
&batch_components,
&mut component_work,
pool,
collect_stage_timings,
)?
} else {
for (plan, work) in color.components.iter().zip(component_work.iter_mut()) {
run_cuda_component_idwt_steps(
context,
plan.idwt_steps(),
work,
pool,
collect_stage_timings,
)?;
}
None
}
} else {
None
};
let decoded_components = component_work
.into_iter()
.map(finish_cuda_component_decode)
.collect::<Result<Vec<_>, Error>>()?;
let [component0, component1, component2] = decoded_components.as_slice() else {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
};
validate_color_stores(
[&component0.store, &component1.store, &component2.store],
color.dimensions,
)?;
let mut dispatches = decoded_components
.iter()
.map(|component| component.dispatches)
.sum::<usize>();
let mut decode_dispatches = decoded_components
.iter()
.map(|component| component.decode_dispatches)
.sum::<usize>();
for component in &decoded_components {
component.timings.add_to_report(&mut color.report);
}
let component0_buffer = pooled_cuda_buffer(&component0.buffer)?;
let component1_buffer = pooled_cuda_buffer(&component1.buffer)?;
let component2_buffer = pooled_cuda_buffer(&component2.buffer)?;
let input_width0 = component0
.store
.input_rect
.x1
.saturating_sub(component0.store.input_rect.x0);
let input_width1 = component1
.store
.input_rect
.x1
.saturating_sub(component1.store.input_rect.x0);
let input_width2 = component2
.store
.input_rect
.x1
.saturating_sub(component2.store.input_rect.x0);
let irreversible97 = u32::from(color.transform == CudaHtj2kTransform::Irreversible97);
let mct_store_addends = [
bit_depth_addend(color.bit_depths[0]),
bit_depth_addend(color.bit_depths[1]),
bit_depth_addend(color.bit_depths[2]),
];
let can_fuse_mct_store = color.mct
&& input_width0 == input_width1
&& input_width0 == input_width2
&& component0.store.source_x == component1.store.source_x
&& component0.store.source_x == component2.store.source_x
&& component0.store.source_y == component1.store.source_y
&& component0.store.source_y == component2.store.source_y;
let addends = if color.mct && can_fuse_mct_store {
mct_store_addends
} else if color.mct {
let mct_len = u32::try_from(checked_area(
color.mct_dimensions.0,
color.mct_dimensions.1,
)?)
.map_err(|_| Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})?;
let stats = context
.time_default_stream_named_us_if(collect_stage_timings, "j2k.htj2k.decode.mct", || {
context.j2k_inverse_mct_device(
component0_buffer,
component1_buffer,
component2_buffer,
CudaJ2kInverseMctJob {
len: mct_len,
irreversible97,
addend0: mct_store_addends[0],
addend1: mct_store_addends[1],
addend2: mct_store_addends[2],
},
)
})
.map_err(cuda_error)?;
let (stats, mct_us) = stats;
dispatches = dispatches.saturating_add(stats.kernel_dispatches());
decode_dispatches = decode_dispatches.saturating_add(stats.decode_kernel_dispatches());
color.report.mct_us = color.report.mct_us.saturating_add(mct_us);
color.report.detail.mct_dispatch_count = color
.report
.detail
.mct_dispatch_count
.saturating_add(stats.kernel_dispatches());
[0.0, 0.0, 0.0]
} else {
[
component0.store.addend,
component1.store.addend,
component2.store.addend,
]
};
let (store_output, store_us) = context
.time_default_stream_named_us_if(
collect_stage_timings,
"j2k.htj2k.decode.store.color",
|| match fmt {
PixelFormat::Rgb8 | PixelFormat::Rgba8 => {
let store_job = CudaJ2kStoreRgb8Job {
input_width0,
input_width1,
input_width2,
source_x0: component0.store.source_x,
source_y0: component0.store.source_y,
source_x1: component1.store.source_x,
source_y1: component1.store.source_y,
source_x2: component2.store.source_x,
source_y2: component2.store.source_y,
copy_width: component0.store.copy_width,
copy_height: component0.store.copy_height,
output_width: component0.store.output_width,
output_height: component0.store.output_height,
output_x: component0.store.output_x,
output_y: component0.store.output_y,
addend0: addends[0],
addend1: addends[1],
addend2: addends[2],
bit_depth0: u32::from(color.bit_depths[0]),
bit_depth1: u32::from(color.bit_depths[1]),
bit_depth2: u32::from(color.bit_depths[2]),
rgba: u32::from(fmt == PixelFormat::Rgba8),
};
if can_fuse_mct_store {
context.j2k_store_rgb8_mct_device(
component0_buffer,
component1_buffer,
component2_buffer,
CudaJ2kStoreRgb8MctJob {
store: store_job,
irreversible97,
},
)
} else {
context.j2k_store_rgb8_device(
component0_buffer,
component1_buffer,
component2_buffer,
store_job,
)
}
}
PixelFormat::Rgb16 | PixelFormat::Rgba16 => {
let store_job = CudaJ2kStoreRgb16Job {
input_width0,
input_width1,
input_width2,
source_x0: component0.store.source_x,
source_y0: component0.store.source_y,
source_x1: component1.store.source_x,
source_y1: component1.store.source_y,
source_x2: component2.store.source_x,
source_y2: component2.store.source_y,
copy_width: component0.store.copy_width,
copy_height: component0.store.copy_height,
output_width: component0.store.output_width,
output_height: component0.store.output_height,
output_x: component0.store.output_x,
output_y: component0.store.output_y,
addend0: addends[0],
addend1: addends[1],
addend2: addends[2],
bit_depth0: u32::from(color.bit_depths[0]),
bit_depth1: u32::from(color.bit_depths[1]),
bit_depth2: u32::from(color.bit_depths[2]),
rgba: u32::from(fmt == PixelFormat::Rgba16),
};
if can_fuse_mct_store {
context.j2k_store_rgb16_mct_device(
component0_buffer,
component1_buffer,
component2_buffer,
CudaJ2kStoreRgb16MctJob {
store: store_job,
irreversible97,
},
)
} else {
context.j2k_store_rgb16_device(
component0_buffer,
component1_buffer,
component2_buffer,
store_job,
)
}
}
_ => Err(CudaError::InvalidArgument {
message: CUDA_HTJ2K_OUTPUT_FORMAT_UNSUPPORTED.to_string(),
}),
},
)
.map_err(cuda_error)?;
drop(pending_idwt_batch);
let (surface_buffer, store_stats) = store_output.into_parts();
dispatches = dispatches.saturating_add(store_stats.kernel_dispatches());
decode_dispatches = decode_dispatches.saturating_add(store_stats.decode_kernel_dispatches());
color.report.dispatch_count = dispatches;
color.report.store_us = color.report.store_us.saturating_add(store_us);
color.report.detail.store_dispatch_count = color
.report
.detail
.store_dispatch_count
.saturating_add(store_stats.kernel_dispatches());
color.report.detail.wall_total_us = profile::elapsed_us(wall_started);
profile::finalize_decode_total_us(&mut color.report);
if emit_report {
color.report.emit("decode");
}
let surface = Surface {
backend: BackendKind::Cuda,
residency: SurfaceResidency::CudaResidentDecode,
dimensions: color.dimensions,
fmt,
pitch_bytes: color.dimensions.0 as usize * fmt.bytes_per_pixel(),
stats: CudaSurfaceStats {
total: dispatches,
copy: 0,
decode: decode_dispatches,
},
storage: Storage::Cuda(surface_buffer),
};
Ok((surface, color.report))
}
#[cfg(feature = "cuda-runtime")]
fn append_color_payload_to_shared(
color: &mut CudaHtj2kColorDecodePlans,
shared_payload: &mut Vec<u8>,
) -> Result<(), Error> {
let base = u64::try_from(shared_payload.len()).map_err(|_| Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_BATCH_PAYLOAD_TOO_LARGE,
})?;
shared_payload
.try_reserve(color.payload.len())
.map_err(|_| Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_BATCH_PAYLOAD_TOO_LARGE,
})?;
for component in &mut color.components {
component.rebase_payload_offsets(base)?;
}
shared_payload.append(&mut color.payload);
Ok(())
}
#[cfg(feature = "cuda-runtime")]
fn aggregate_decode_reports(reports: &[CudaHtj2kProfileReport]) -> CudaHtj2kProfileReport {
let mut aggregate = CudaHtj2kProfileReport {
residency: SurfaceResidency::CudaResidentDecode,
..CudaHtj2kProfileReport::default()
};
for report in reports {
add_decode_report(&mut aggregate, report);
}
aggregate
}
#[cfg(feature = "cuda-runtime")]
fn add_decode_report(aggregate: &mut CudaHtj2kProfileReport, report: &CudaHtj2kProfileReport) {
aggregate.parse_us = aggregate.parse_us.saturating_add(report.parse_us);
aggregate.plan_us = aggregate.plan_us.saturating_add(report.plan_us);
aggregate.flatten_us = aggregate.flatten_us.saturating_add(report.flatten_us);
aggregate.h2d_us = aggregate.h2d_us.saturating_add(report.h2d_us);
aggregate.ht_cleanup_us = aggregate.ht_cleanup_us.saturating_add(report.ht_cleanup_us);
aggregate.ht_refine_us = aggregate.ht_refine_us.saturating_add(report.ht_refine_us);
aggregate.dequant_us = aggregate.dequant_us.saturating_add(report.dequant_us);
aggregate.idwt_us = aggregate.idwt_us.saturating_add(report.idwt_us);
aggregate.mct_us = aggregate.mct_us.saturating_add(report.mct_us);
aggregate.store_us = aggregate.store_us.saturating_add(report.store_us);
aggregate.block_count = aggregate.block_count.saturating_add(report.block_count);
aggregate.payload_bytes = aggregate.payload_bytes.saturating_add(report.payload_bytes);
aggregate.dispatch_count = aggregate
.dispatch_count
.saturating_add(report.dispatch_count);
aggregate.detail.table_upload_us = aggregate
.detail
.table_upload_us
.saturating_add(report.detail.table_upload_us);
aggregate.detail.payload_upload_us = aggregate
.detail
.payload_upload_us
.saturating_add(report.detail.payload_upload_us);
aggregate.detail.job_upload_us = aggregate
.detail
.job_upload_us
.saturating_add(report.detail.job_upload_us);
aggregate.detail.status_d2h_us = aggregate
.detail
.status_d2h_us
.saturating_add(report.detail.status_d2h_us);
aggregate.detail.output_d2h_us = aggregate
.detail
.output_d2h_us
.saturating_add(report.detail.output_d2h_us);
aggregate.detail.ht_dispatch_count = aggregate
.detail
.ht_dispatch_count
.saturating_add(report.detail.ht_dispatch_count);
aggregate.detail.dequant_dispatch_count = aggregate
.detail
.dequant_dispatch_count
.saturating_add(report.detail.dequant_dispatch_count);
aggregate.detail.idwt_dispatch_count = aggregate
.detail
.idwt_dispatch_count
.saturating_add(report.detail.idwt_dispatch_count);
aggregate.detail.mct_dispatch_count = aggregate
.detail
.mct_dispatch_count
.saturating_add(report.detail.mct_dispatch_count);
aggregate.detail.store_dispatch_count = aggregate
.detail
.store_dispatch_count
.saturating_add(report.detail.store_dispatch_count);
}
#[cfg(not(feature = "cuda-runtime"))]
fn decode_to_cuda_resident_surface_impl(
_decoder: &mut J2kDecoder<'_>,
_session: &mut CudaSession,
_fmt: PixelFormat,
) -> Result<Surface, Error> {
Err(Error::CudaUnavailable)
}
#[cfg(not(feature = "cuda-runtime"))]
fn decode_to_cuda_resident_surface_with_profile_impl(
_decoder: &mut J2kDecoder<'_>,
_session: &mut CudaSession,
_fmt: PixelFormat,
) -> Result<(Surface, CudaHtj2kProfileReport), Error> {
Err(Error::CudaUnavailable)
}
#[cfg(not(feature = "cuda-runtime"))]
fn decode_region_to_cuda_resident_surface_impl(
_decoder: &mut J2kDecoder<'_>,
_session: &mut CudaSession,
_fmt: PixelFormat,
_roi: Rect,
) -> Result<Surface, Error> {
Err(Error::CudaUnavailable)
}
#[cfg(not(feature = "cuda-runtime"))]
fn decode_scaled_to_cuda_resident_surface_impl(
_decoder: &mut J2kDecoder<'_>,
_session: &mut CudaSession,
_fmt: PixelFormat,
_scale: Downscale,
) -> Result<Surface, Error> {
Err(Error::CudaUnavailable)
}
#[cfg(not(feature = "cuda-runtime"))]
fn decode_region_scaled_to_cuda_resident_surface_impl(
_decoder: &mut J2kDecoder<'_>,
_session: &mut CudaSession,
_fmt: PixelFormat,
_roi: Rect,
_scale: Downscale,
) -> Result<Surface, Error> {
Err(Error::CudaUnavailable)
}
#[cfg(not(feature = "cuda-runtime"))]
fn decode_batch_to_cuda_resident_surface_with_profile_control(
_inputs: &[&[u8]],
_session: &mut CudaSession,
_fmt: PixelFormat,
_collect_stage_timings: bool,
) -> Result<(Vec<Surface>, CudaHtj2kProfileReport), Error> {
Err(Error::CudaUnavailable)
}
#[cfg(feature = "cuda-runtime")]
fn decode_cuda_component_plan(
context: &j2k_cuda_runtime::CudaContext,
plan: &CudaHtj2kDecodePlan,
tables: &CudaHtj2kDecodeTableResources,
pool: &CudaBufferPool,
collect_stage_timings: bool,
) -> Result<CudaDecodedComponent, Error> {
let resource_upload_start = profile::profile_now(collect_stage_timings);
let decode_resources = context
.upload_htj2k_decode_resources_with_tables(plan.payload(), tables)
.map_err(cuda_error)?;
let resource_upload_us = profile::elapsed_us(resource_upload_start);
let mut component = decode_cuda_component_plan_with_resources(
context,
plan,
&decode_resources,
pool,
collect_stage_timings,
)?;
component.timings.h2d = component.timings.h2d.saturating_add(resource_upload_us);
component.timings.payload_upload = component
.timings
.payload_upload
.saturating_add(resource_upload_us);
Ok(component)
}
#[cfg(test)]
fn split_htj2k_subband_decode_dispatches(kernel_dispatches: usize) -> (usize, usize) {
if kernel_dispatches == 0 {
return (0, 0);
}
let dequant_dispatches = usize::from(kernel_dispatches > 1);
(
kernel_dispatches.saturating_sub(dequant_dispatches),
dequant_dispatches,
)
}
#[cfg(feature = "cuda-runtime")]
fn htj2k_batched_cleanup_dispatches(target_count: usize) -> usize {
usize::from(target_count > 0)
}
#[cfg(any(feature = "cuda-runtime", test))]
fn htj2k_batched_dequant_dispatches(target_count: usize) -> usize {
usize::from(target_count > 0)
}
#[cfg(feature = "cuda-runtime")]
fn htj2k_batched_cleanup_dequant_dispatches(
target_count: usize,
fused_cleanup_dequant: bool,
) -> (usize, usize) {
if target_count == 0 {
return (0, 0);
}
if fused_cleanup_dequant {
(1, 0)
} else {
(1, 1)
}
}
#[cfg(feature = "cuda-runtime")]
fn decode_cuda_component_plan_with_resources(
context: &j2k_cuda_runtime::CudaContext,
plan: &CudaHtj2kDecodePlan,
decode_resources: &CudaHtj2kDecodeResources,
pool: &CudaBufferPool,
collect_stage_timings: bool,
) -> Result<CudaDecodedComponent, Error> {
let mut work =
decode_cuda_component_subbands_with_resources(context, plan, pool, collect_stage_timings)?;
run_component_cleanup_dequant_batches(
context,
decode_resources,
std::slice::from_mut(&mut work),
pool,
collect_stage_timings,
)?;
run_cuda_component_idwt_steps(
context,
plan.idwt_steps(),
&mut work,
pool,
collect_stage_timings,
)?;
finish_cuda_component_decode(work)
}
#[cfg(feature = "cuda-runtime")]
fn decode_cuda_component_subbands_with_resources(
context: &j2k_cuda_runtime::CudaContext,
plan: &CudaHtj2kDecodePlan,
pool: &CudaBufferPool,
collect_stage_timings: bool,
) -> Result<CudaComponentDecodeWork, Error> {
let mut bands = Vec::with_capacity(plan.subbands().len() + plan.idwt_steps().len());
let mut pending_dequant_bands = Vec::with_capacity(plan.subbands().len());
let dispatches = 0usize;
let decode_dispatches = 0usize;
let mut timings = CudaDecodeStageTimings::default();
for subband in plan.subbands() {
let start = subband.code_block_start as usize;
let end = start.checked_add(subband.code_block_count as usize).ok_or(
Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_PLAN_INVARIANT_FAILED,
},
)?;
let code_blocks =
plan.code_blocks()
.get(start..end)
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_PLAN_INVARIANT_FAILED,
})?;
let jobs = code_blocks
.iter()
.map(|block| cuda_code_block_job_from_plan_block(block, subband.width))
.collect::<Result<Vec<_>, Error>>()?;
let output_words = checked_area(subband.width, subband.height)?;
let allocate_start = profile::profile_now(collect_stage_timings);
let output = context
.allocate_htj2k_codeblock_coefficients_with_pool(&jobs, output_words, pool)
.map_err(cuda_error)?;
let allocate_wall_us = profile::elapsed_us(allocate_start);
timings.h2d = timings.h2d.saturating_add(allocate_wall_us);
let (buffer, _, _) = output.into_parts();
let band_index = bands.len();
bands.push(CudaCoefficientBand {
band_id: subband.band_id,
buffer,
});
if !jobs.is_empty() {
pending_dequant_bands.push(CudaPendingDequantBand {
band_index,
jobs,
output_words,
});
}
}
let [store] = plan.store_steps() else {
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_STORE_UNSUPPORTED,
});
};
Ok(CudaComponentDecodeWork {
bands,
pending_dequant_bands,
store: *store,
dispatches,
decode_dispatches,
timings,
})
}
#[cfg(feature = "cuda-runtime")]
fn run_component_cleanup_dequant_batches(
context: &j2k_cuda_runtime::CudaContext,
decode_resources: &CudaHtj2kDecodeResources,
component_work: &mut [CudaComponentDecodeWork],
pool: &CudaBufferPool,
collect_stage_timings: bool,
) -> Result<(), Error> {
let pending_count = component_work
.iter()
.map(|work| work.pending_dequant_bands.len())
.sum::<usize>();
if pending_count == 0 {
return Ok(());
}
let accounting_index = component_work
.iter()
.position(|work| !work.pending_dequant_bands.is_empty())
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})?;
let has_refinement = component_work.iter().any(|work| {
work.pending_dequant_bands.iter().any(|pending| {
pending
.jobs
.iter()
.any(|job| job.refinement_length > 0 || u32::from(job.number_of_coding_passes) > 1)
})
});
let cleanup_targets = component_work
.iter()
.flat_map(|work| {
work.pending_dequant_bands
.iter()
.map(move |pending| (work, pending))
})
.map(|(work, pending)| {
let coefficients = pooled_cuda_buffer(&work.bands[pending.band_index].buffer)?;
Ok(CudaHtj2kCleanupTarget {
coefficients,
jobs: &pending.jobs,
output_words: pending.output_words,
})
})
.collect::<Result<Vec<_>, Error>>()?;
if !has_refinement {
let stage_start = profile::profile_now(collect_stage_timings);
let ((stats, runtime_timings), fused_us) = context
.time_default_stream_named_us_if(
collect_stage_timings,
"j2k.htj2k.decode.cleanup_dequantize.batch",
|| {
context
.decode_htj2k_codeblocks_cleanup_dequantize_multi_with_resources_and_pool_timed(
decode_resources,
&cleanup_targets,
pool,
collect_stage_timings,
)
},
)
.map_err(cuda_error)?;
let stage_wall_us = profile::elapsed_us(stage_start);
let (cleanup_dispatches, dequant_dispatches) =
htj2k_batched_cleanup_dequant_dispatches(pending_count, true);
{
let accounting = &mut component_work[accounting_index];
accounting.timings.h2d = accounting
.timings
.h2d
.saturating_add(stage_wall_us.saturating_sub(fused_us));
accounting.timings.ht_cleanup = accounting.timings.ht_cleanup.saturating_add(fused_us);
accounting.timings.status_d2h = accounting
.timings
.status_d2h
.saturating_add(runtime_timings.status_d2h_us);
accounting.timings.ht_dispatch_count = accounting
.timings
.ht_dispatch_count
.saturating_add(cleanup_dispatches);
accounting.timings.dequant_dispatch_count = accounting
.timings
.dequant_dispatch_count
.saturating_add(dequant_dispatches);
accounting.dispatches = accounting
.dispatches
.saturating_add(stats.kernel_dispatches());
accounting.decode_dispatches = accounting
.decode_dispatches
.saturating_add(stats.decode_kernel_dispatches());
}
for work in component_work {
work.pending_dequant_bands.clear();
}
return Ok(());
}
let mut queued_cleanup: Option<CudaQueuedHtj2kCleanup> = None;
let stage_start = profile::profile_now(collect_stage_timings);
let (stats, cleanup_us, status_d2h_us) = if collect_stage_timings {
let ((stats, runtime_timings), cleanup_us) = context
.time_default_stream_named_us_if(
collect_stage_timings,
"j2k.htj2k.decode.cleanup.batch",
|| {
context.decode_htj2k_codeblocks_cleanup_multi_with_resources_and_pool_timed(
decode_resources,
&cleanup_targets,
pool,
collect_stage_timings,
)
},
)
.map_err(cuda_error)?;
(stats, cleanup_us, runtime_timings.status_d2h_us)
} else {
let (queued, cleanup_us) = context
.time_default_stream_named_us_if(false, "j2k.htj2k.decode.cleanup.batch", || {
context.decode_htj2k_codeblocks_cleanup_multi_enqueue_with_resources_and_pool(
decode_resources,
&cleanup_targets,
pool,
)
})
.map_err(cuda_error)?;
let stats = queued.execution();
queued_cleanup = Some(queued);
(stats, cleanup_us, 0)
};
drop(cleanup_targets);
let stage_wall_us = profile::elapsed_us(stage_start);
{
let accounting = &mut component_work[accounting_index];
accounting.timings.h2d = accounting
.timings
.h2d
.saturating_add(stage_wall_us.saturating_sub(cleanup_us));
accounting.timings.ht_cleanup = accounting.timings.ht_cleanup.saturating_add(cleanup_us);
accounting.timings.status_d2h = accounting.timings.status_d2h.saturating_add(status_d2h_us);
if has_refinement {
accounting.timings.ht_refine = accounting.timings.ht_refine.saturating_add(cleanup_us);
}
accounting.timings.ht_dispatch_count = accounting
.timings
.ht_dispatch_count
.saturating_add(htj2k_batched_cleanup_dispatches(pending_count));
accounting.dispatches = accounting
.dispatches
.saturating_add(stats.kernel_dispatches());
accounting.decode_dispatches = accounting
.decode_dispatches
.saturating_add(stats.decode_kernel_dispatches());
}
let stage_start = profile::profile_now(collect_stage_timings);
let (stats, dequant_us, dequant_target_count) = {
let dequant_target_count = pending_count;
let dequant_result = if let Some(queued) = queued_cleanup.as_ref() {
context.time_default_stream_named_us_if(
collect_stage_timings,
"j2k.htj2k.decode.dequantize.batch",
|| context.j2k_dequantize_queued_htj2k_cleanup_with_pool(queued),
)
} else {
let dequant_targets = component_work
.iter()
.flat_map(|work| {
work.pending_dequant_bands
.iter()
.map(move |pending| (work, pending))
})
.map(|(work, pending)| {
let coefficients = pooled_cuda_buffer(&work.bands[pending.band_index].buffer)?;
Ok(CudaHtj2kDequantizeTarget {
coefficients,
jobs: &pending.jobs,
output_words: pending.output_words,
})
})
.collect::<Result<Vec<_>, Error>>()?;
context.time_default_stream_named_us_if(
collect_stage_timings,
"j2k.htj2k.decode.dequantize.batch",
|| {
context.j2k_dequantize_htj2k_codeblocks_multi_device_with_pool(
&dequant_targets,
pool,
)
},
)
};
let (stats, dequant_us) = match dequant_result {
Ok(result) => result,
Err(error) => {
if let Some(queued) = queued_cleanup.take() {
queued.finish().map_err(cuda_error)?;
}
return Err(cuda_error(error));
}
};
(stats, dequant_us, dequant_target_count)
};
let stage_wall_us = profile::elapsed_us(stage_start);
{
let accounting = &mut component_work[accounting_index];
accounting.timings.h2d = accounting
.timings
.h2d
.saturating_add(stage_wall_us.saturating_sub(dequant_us));
accounting.timings.dequant = accounting.timings.dequant.saturating_add(dequant_us);
accounting.timings.dequant_dispatch_count = accounting
.timings
.dequant_dispatch_count
.saturating_add(htj2k_batched_dequant_dispatches(dequant_target_count));
accounting.dispatches = accounting
.dispatches
.saturating_add(stats.kernel_dispatches());
accounting.decode_dispatches = accounting
.decode_dispatches
.saturating_add(stats.decode_kernel_dispatches());
}
if let Some(queued) = queued_cleanup.take() {
queued.finish().map_err(cuda_error)?;
}
for work in component_work {
work.pending_dequant_bands.clear();
}
Ok(())
}
#[cfg(feature = "cuda-runtime")]
fn run_cuda_component_idwt_steps(
context: &j2k_cuda_runtime::CudaContext,
steps: &[CudaHtj2kIdwtStep],
work: &mut CudaComponentDecodeWork,
pool: &CudaBufferPool,
collect_stage_timings: bool,
) -> Result<(), Error> {
for step in steps {
let ll = find_cuda_band(&work.bands, step.ll_band_id)?;
let hl = find_cuda_band(&work.bands, step.hl_band_id)?;
let lh = find_cuda_band(&work.bands, step.lh_band_id)?;
let hh = find_cuda_band(&work.bands, step.hh_band_id)?;
let low_low_device = pooled_cuda_buffer(&ll.buffer)?;
let high_low_device = pooled_cuda_buffer(&hl.buffer)?;
let low_high_device = pooled_cuda_buffer(&lh.buffer)?;
let high_high_device = pooled_cuda_buffer(&hh.buffer)?;
let job = cuda_idwt_job_from_step(step);
let (output, idwt_us) = context
.time_default_stream_named_us_if(collect_stage_timings, "j2k.htj2k.decode.idwt", || {
if collect_stage_timings {
return context.j2k_inverse_dwt_single_device_with_pool(
low_low_device,
high_low_device,
low_high_device,
high_high_device,
job,
pool,
);
}
context.j2k_inverse_dwt_single_device_untimed_with_pool(
low_low_device,
high_low_device,
low_high_device,
high_high_device,
job,
pool,
)
})
.map_err(cuda_error)?;
work.timings.idwt = work.timings.idwt.saturating_add(idwt_us);
let (buffer, stats) = output.into_parts();
work.dispatches = work.dispatches.saturating_add(stats.kernel_dispatches());
work.decode_dispatches = work
.decode_dispatches
.saturating_add(stats.decode_kernel_dispatches());
work.timings.idwt_dispatch_count = work
.timings
.idwt_dispatch_count
.saturating_add(stats.kernel_dispatches());
work.bands.push(CudaCoefficientBand {
band_id: step.output_band_id,
buffer,
});
}
Ok(())
}
#[cfg(feature = "cuda-runtime")]
fn finish_cuda_component_decode(
mut work: CudaComponentDecodeWork,
) -> Result<CudaDecodedComponent, Error> {
let input_index = work
.bands
.iter()
.position(|band| band.band_id == work.store.input_band_id)
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})?;
let input = work.bands.swap_remove(input_index);
Ok(CudaDecodedComponent {
buffer: input.buffer,
store: work.store,
dispatches: work.dispatches,
decode_dispatches: work.decode_dispatches,
timings: work.timings,
})
}
#[cfg(feature = "cuda-runtime")]
fn can_batch_color_idwt(components: &[&CudaHtj2kDecodePlan]) -> bool {
let Some(first) = components.first() else {
return false;
};
components
.iter()
.all(|component| component.idwt_steps().len() == first.idwt_steps().len())
}
#[cfg(feature = "cuda-runtime")]
fn run_color_component_idwt_batches(
context: &j2k_cuda_runtime::CudaContext,
components: &[&CudaHtj2kDecodePlan],
component_work: &mut [CudaComponentDecodeWork],
pool: &CudaBufferPool,
collect_stage_timings: bool,
) -> Result<Option<CudaQueuedIdwtBatch>, Error> {
let (queued_batch, idwt_us) = context
.time_default_stream_named_us_if(
collect_stage_timings,
"j2k.htj2k.decode.idwt.batch",
|| enqueue_color_component_idwt_batches(context, components, component_work, pool),
)
.map_err(cuda_error)?;
if let Some(accounting) = component_work.first_mut() {
accounting.timings.idwt = accounting.timings.idwt.saturating_add(idwt_us);
accounting.dispatches = accounting
.dispatches
.saturating_add(queued_batch.kernel_dispatches);
accounting.decode_dispatches = accounting
.decode_dispatches
.saturating_add(queued_batch.decode_dispatches);
accounting.timings.idwt_dispatch_count = accounting
.timings
.idwt_dispatch_count
.saturating_add(queued_batch.kernel_dispatches);
}
let _queued_resource_count = queued_batch
.queued
.iter()
.map(CudaQueuedExecution::resource_count)
.sum::<usize>();
if collect_stage_timings {
drop(queued_batch);
Ok(None)
} else {
Ok(Some(queued_batch))
}
}
#[cfg(feature = "cuda-runtime")]
fn enqueue_color_component_idwt_batches(
context: &j2k_cuda_runtime::CudaContext,
components: &[&CudaHtj2kDecodePlan],
component_work: &mut [CudaComponentDecodeWork],
pool: &CudaBufferPool,
) -> Result<CudaQueuedIdwtBatch, CudaError> {
if components.len() != component_work.len() {
return Err(CudaError::InvalidArgument {
message: CUDA_HTJ2K_KERNELS_NOT_READY.to_string(),
});
}
let Some(first) = components.first() else {
return Ok(CudaQueuedIdwtBatch {
queued: Vec::new(),
kernel_dispatches: 0,
decode_dispatches: 0,
});
};
let mut queued = Vec::with_capacity(first.idwt_steps().len());
let mut kernel_dispatches = 0usize;
let mut decode_dispatches = 0usize;
let step_count = first.idwt_steps().len();
let trace_enabled = cuda_idwt_trace_enabled();
let enqueue_result = (|| -> Result<(), CudaError> {
let mut output_pool_trace = CudaIdwtOutputPoolTraceTotals::default();
let output_alloc_start = trace_enabled.then(std::time::Instant::now);
for step_index in 0..step_count {
for (component_index, component) in components.iter().enumerate() {
let step = component.idwt_steps().get(step_index).ok_or_else(|| {
CudaError::InvalidArgument {
message: CUDA_HTJ2K_KERNELS_NOT_READY.to_string(),
}
})?;
let width = step.rect.x1.saturating_sub(step.rect.x0);
let height = step.rect.y1.saturating_sub(step.rect.y0);
let output_words = checked_area(width, height).map_err(cuda_invalid_decode_plan)?;
let output_bytes = output_words
.checked_mul(std::mem::size_of::<f32>())
.ok_or_else(|| CudaError::InvalidArgument {
message: CUDA_HTJ2K_KERNELS_NOT_READY.to_string(),
})?;
let buffer = if trace_enabled {
let (buffer, trace) = pool.take_with_trace(output_bytes)?;
output_pool_trace.add_take(trace);
buffer
} else {
pool.take(output_bytes)?
};
component_work[component_index]
.bands
.push(CudaCoefficientBand {
band_id: step.output_band_id,
buffer,
});
}
}
let output_alloc_us = elapsed_host_us(output_alloc_start);
let target_build_start = trace_enabled.then(std::time::Instant::now);
let mut target_batches = Vec::with_capacity(step_count);
for step_index in 0..step_count {
let targets = components
.iter()
.enumerate()
.map(|(component_index, component)| {
let step = component.idwt_steps().get(step_index).ok_or_else(|| {
CudaError::InvalidArgument {
message: CUDA_HTJ2K_KERNELS_NOT_READY.to_string(),
}
})?;
let work = &component_work[component_index];
let ll = find_cuda_band(&work.bands, step.ll_band_id)
.map_err(cuda_invalid_decode_plan)?;
let hl = find_cuda_band(&work.bands, step.hl_band_id)
.map_err(cuda_invalid_decode_plan)?;
let lh = find_cuda_band(&work.bands, step.lh_band_id)
.map_err(cuda_invalid_decode_plan)?;
let hh = find_cuda_band(&work.bands, step.hh_band_id)
.map_err(cuda_invalid_decode_plan)?;
let output = find_cuda_band(&work.bands, step.output_band_id)
.map_err(cuda_invalid_decode_plan)?;
Ok(CudaJ2kIdwtTarget {
ll: pooled_cuda_buffer(&ll.buffer).map_err(cuda_invalid_decode_plan)?,
hl: pooled_cuda_buffer(&hl.buffer).map_err(cuda_invalid_decode_plan)?,
lh: pooled_cuda_buffer(&lh.buffer).map_err(cuda_invalid_decode_plan)?,
hh: pooled_cuda_buffer(&hh.buffer).map_err(cuda_invalid_decode_plan)?,
output: pooled_cuda_buffer(&output.buffer)
.map_err(cuda_invalid_decode_plan)?,
job: cuda_idwt_job_from_step(step),
})
})
.collect::<Result<Vec<_>, CudaError>>()?;
target_batches.push(targets);
}
let target_build_us = elapsed_host_us(target_build_start);
let target_slices = target_batches.iter().map(Vec::as_slice).collect::<Vec<_>>();
let enqueue_start = trace_enabled.then(std::time::Instant::now);
let queued_execution =
context.j2k_inverse_dwt_batch_sequence_enqueue_with_pool(&target_slices, pool)?;
let enqueue_us = elapsed_host_us(enqueue_start);
let execution = queued_execution.execution();
kernel_dispatches = kernel_dispatches.saturating_add(execution.kernel_dispatches());
decode_dispatches = decode_dispatches.saturating_add(execution.decode_kernel_dispatches());
queued.push(queued_execution);
if trace_enabled {
let row = CudaIdwtBatchHostTraceRow {
component_count: components.len(),
step_count,
output_alloc_us,
target_build_us,
enqueue_us,
output_take_count: output_pool_trace.take_count,
output_pool_reuse_count: output_pool_trace.reuse_count,
output_pool_alloc_count: output_pool_trace.alloc_count,
output_pool_scanned_count: output_pool_trace.scanned_count,
output_pool_max_free_count: output_pool_trace.max_free_count,
output_requested_bytes: output_pool_trace.requested_bytes,
};
eprintln!("{}", format_cuda_idwt_batch_host_trace_row(row));
}
Ok(())
})();
if let Err(error) = enqueue_result {
if !queued.is_empty() {
let _ = context.synchronize();
}
return Err(error);
}
Ok(CudaQueuedIdwtBatch {
queued,
kernel_dispatches,
decode_dispatches,
})
}
#[cfg(feature = "cuda-runtime")]
fn cuda_code_block_job_from_plan_block(
block: &crate::CudaHtj2kCodeBlock,
subband_width: u32,
) -> Result<CudaHtj2kCodeBlockJob, Error> {
let output_offset = block
.output_y
.checked_mul(subband_width)
.and_then(|base| base.checked_add(block.output_x))
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})?;
Ok(CudaHtj2kCodeBlockJob {
payload_offset: block.payload_offset,
width: block.width,
height: block.height,
payload_len: block.payload_len,
cleanup_length: block.cleanup_length,
refinement_length: block.refinement_length,
missing_bit_planes: block.missing_bit_planes,
num_bitplanes: block.num_bitplanes,
number_of_coding_passes: block.number_of_coding_passes,
output_stride: block.output_stride,
output_offset,
dequantization_step: block.dequantization_step,
stripe_causal: block.stripe_causal != 0,
})
}
#[cfg(feature = "cuda-runtime")]
fn validate_color_stores(
stores: [&CudaHtj2kStoreStep; 3],
dimensions: (u32, u32),
) -> Result<(), Error> {
let first = stores[0];
for store in stores {
let input_width = store.input_rect.x1.saturating_sub(store.input_rect.x0);
let input_height = store.input_rect.y1.saturating_sub(store.input_rect.y0);
let source_end_x =
store
.source_x
.checked_add(store.copy_width)
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})?;
let source_end_y =
store
.source_y
.checked_add(store.copy_height)
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})?;
if store.output_x != 0
|| store.output_y != 0
|| store.copy_width != dimensions.0
|| store.copy_height != dimensions.1
|| store.output_width != dimensions.0
|| store.output_height != dimensions.1
|| source_end_x > input_width
|| source_end_y > input_height
|| store.source_x != first.source_x
|| store.source_y != first.source_y
{
return Err(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
});
}
}
Ok(())
}
#[cfg(feature = "cuda-runtime")]
fn bit_depth_addend(bit_depth: u8) -> f32 {
let shift = bit_depth.saturating_sub(1).min(15);
f32::from(1_u16 << shift)
}
#[cfg(feature = "cuda-runtime")]
fn checked_area(width: u32, height: u32) -> Result<usize, Error> {
width
.try_into()
.ok()
.and_then(|width: usize| width.checked_mul(height as usize))
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})
}
#[cfg(feature = "cuda-runtime")]
fn find_cuda_band(
bands: &[CudaCoefficientBand],
band_id: CudaHtj2kBandId,
) -> Result<&CudaCoefficientBand, Error> {
bands
.iter()
.find(|band| band.band_id == band_id)
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})
}
#[cfg(feature = "cuda-runtime")]
fn pooled_cuda_buffer(buffer: &CudaPooledDeviceBuffer) -> Result<&CudaDeviceBuffer, Error> {
buffer
.as_device_buffer()
.ok_or(Error::UnsupportedCudaRequest {
reason: CUDA_HTJ2K_KERNELS_NOT_READY,
})
}
#[cfg(feature = "cuda-runtime")]
#[allow(clippy::needless_pass_by_value)]
fn cuda_invalid_decode_plan(error: Error) -> CudaError {
CudaError::InvalidArgument {
message: error.to_string(),
}
}
#[cfg(feature = "cuda-runtime")]
fn cuda_runtime_rect(rect: crate::CudaHtj2kRect) -> CudaJ2kRect {
CudaJ2kRect {
x0: rect.x0,
y0: rect.y0,
x1: rect.x1,
y1: rect.y1,
}
}
#[cfg(feature = "cuda-runtime")]
fn cuda_idwt_job_from_step(step: &CudaHtj2kIdwtStep) -> CudaJ2kIdwtJob {
CudaJ2kIdwtJob {
rect: cuda_runtime_rect(step.rect),
ll_rect: cuda_runtime_rect(step.ll_rect),
hl_rect: cuda_runtime_rect(step.hl_rect),
lh_rect: cuda_runtime_rect(step.lh_rect),
hh_rect: cuda_runtime_rect(step.hh_rect),
irreversible97: u32::from(step.transform == CudaHtj2kTransform::Irreversible97),
}
}
#[cfg(all(test, feature = "cuda-runtime"))]
mod tests {
use super::{
build_cuda_htj2k_color_plans_from_bytes_with_profile, can_batch_color_idwt,
cuda_code_block_job_from_plan_block, htj2k_batched_cleanup_dequant_dispatches,
htj2k_batched_cleanup_dispatches, htj2k_batched_dequant_dispatches, CudaDecodeStageTimings,
};
use j2k_core::PixelFormat;
use j2k_native::{encode_htj2k, DecoderContext as NativeDecoderContext, EncodeOptions};
use crate::CudaHtj2kCodeBlock;
#[test]
fn cuda_runtime_code_block_job_preserves_plan_output_stride() {
let block = CudaHtj2kCodeBlock {
subband_index: 0,
payload_offset: 13,
payload_len: 5,
cleanup_length: 5,
refinement_length: 0,
output_x: 3,
output_y: 2,
width: 4,
height: 5,
output_stride: 99,
missing_bit_planes: 1,
number_of_coding_passes: 1,
num_bitplanes: 8,
stripe_causal: 0,
dequantization_step: 1.0,
};
let job = cuda_code_block_job_from_plan_block(&block, 64)
.expect("valid CUDA code-block runtime job");
assert_eq!(job.output_offset, 131);
assert_eq!(job.output_stride, 99);
}
#[test]
fn batched_cleanup_and_dequant_dispatch_helpers_count_one_shared_dispatch() {
assert_eq!(htj2k_batched_cleanup_dispatches(0), 0);
assert_eq!(htj2k_batched_cleanup_dispatches(1), 1);
assert_eq!(htj2k_batched_cleanup_dispatches(3), 1);
assert_eq!(htj2k_batched_dequant_dispatches(0), 0);
assert_eq!(htj2k_batched_dequant_dispatches(1), 1);
assert_eq!(htj2k_batched_dequant_dispatches(3), 1);
assert_eq!(htj2k_batched_cleanup_dequant_dispatches(0, true), (0, 0));
assert_eq!(htj2k_batched_cleanup_dequant_dispatches(1, true), (1, 0));
assert_eq!(htj2k_batched_cleanup_dequant_dispatches(3, true), (1, 0));
assert_eq!(htj2k_batched_cleanup_dequant_dispatches(1, false), (1, 1));
assert_eq!(htj2k_batched_cleanup_dequant_dispatches(3, false), (1, 1));
}
#[test]
fn profiled_cuda_batch_decode_api_accepts_empty_batch() {
let mut session = crate::CudaSession::default();
let inputs: [&[u8]; 0] = [];
let (surfaces, report) =
crate::J2kDecoder::decode_batch_to_device_with_session_and_profile(
&inputs,
PixelFormat::Rgb8,
&mut session,
)
.expect("empty CUDA batch decode");
assert!(surfaces.is_empty());
assert_eq!(report.block_count, 0);
assert_eq!(report.payload_bytes, 0);
}
#[test]
fn cuda_batch_decode_two_color_images_matches_single_when_runtime_required() {
let pixels_a: Vec<u8> = (0u16..16 * 16 * 3)
.map(|idx| u8::try_from((idx * 7 + idx / 5) & 0xff).expect("masked byte"))
.collect();
let pixels_b: Vec<u8> = (0u16..16 * 16 * 3)
.map(|idx| u8::try_from((idx * 11 + 23) & 0xff).expect("masked byte"))
.collect();
let options = EncodeOptions {
reversible: true,
num_decomposition_levels: 1,
..EncodeOptions::default()
};
let codestream_a =
encode_htj2k(&pixels_a, 16, 16, 3, 8, false, &options).expect("encode fixture A");
let codestream_b =
encode_htj2k(&pixels_b, 16, 16, 3, 8, false, &options).expect("encode fixture B");
let inputs = [codestream_a.as_slice(), codestream_b.as_slice()];
let mut batch_session = crate::CudaSession::default();
let batch = crate::J2kDecoder::decode_batch_to_device_with_session_and_profile(
&inputs,
PixelFormat::Rgb8,
&mut batch_session,
);
let (surfaces, report) = match batch {
Ok(result) => result,
Err(crate::Error::CudaUnavailable | crate::Error::CudaRuntime { .. })
if !cuda_runtime_required() =>
{
return;
}
Err(error) => panic!("batch CUDA decode failed: {error}"),
};
assert_eq!(surfaces.len(), 2);
assert_eq!(report.detail.ht_dispatch_count, 1);
assert_eq!(report.detail.dequant_dispatch_count, 0);
assert_eq!(report.detail.store_dispatch_count, 1);
let batch_pixels_tight =
crate::Surface::download_batch_tight(&surfaces).expect("download tight CUDA batch");
assert_eq!(batch_pixels_tight.len(), surfaces.len() * 16 * 16 * 3);
for (index, codestream) in inputs.iter().enumerate() {
let mut single_session = crate::CudaSession::default();
let mut decoder = crate::J2kDecoder::new(codestream).expect("single decoder");
let single = decoder
.decode_to_device_with_session(PixelFormat::Rgb8, &mut single_session)
.expect("single CUDA decode");
let mut single_pixels = vec![0u8; 16 * 16 * 3];
let mut batch_pixels = vec![0u8; 16 * 16 * 3];
single
.download_into(&mut single_pixels, 16 * 3)
.expect("download single decode");
surfaces[index]
.download_into(&mut batch_pixels, 16 * 3)
.expect("download batch decode");
assert_eq!(batch_pixels, single_pixels);
assert_eq!(
&batch_pixels_tight[index * 16 * 16 * 3..(index + 1) * 16 * 16 * 3],
single_pixels.as_slice()
);
}
}
#[test]
fn cuda_batch_decode_mixed_idwt_shapes_avoids_fused_batch_store_without_idwt_batch() {
let codestream_a = rgb8_htj2k_fixture(32, 32, 1, 7);
let codestream_b = rgb8_htj2k_fixture(32, 32, 2, 19);
let inputs = [codestream_a.as_slice(), codestream_b.as_slice()];
let mut batch_session = crate::CudaSession::default();
let result = crate::J2kDecoder::decode_batch_to_device_with_session(
&inputs,
PixelFormat::Rgb8,
&mut batch_session,
);
let surfaces = match result {
Ok(surfaces) => surfaces,
Err(crate::Error::CudaUnavailable | crate::Error::CudaRuntime { .. })
if !cuda_runtime_required() =>
{
return;
}
Err(crate::Error::UnsupportedCudaRequest { .. }) => return,
Err(error) => panic!("mixed-shape batch CUDA decode failed: {error}"),
};
assert_eq!(surfaces.len(), inputs.len());
for (index, codestream) in inputs.iter().enumerate() {
let mut single_session = crate::CudaSession::default();
let mut decoder = crate::J2kDecoder::new(codestream).expect("single decoder");
let single = decoder
.decode_to_device_with_session(PixelFormat::Rgb8, &mut single_session)
.expect("single CUDA decode");
let mut single_pixels = vec![0u8; 32 * 32 * 3];
let mut batch_pixels = vec![0u8; 32 * 32 * 3];
single
.download_into(&mut single_pixels, 32 * 3)
.expect("download single decode");
surfaces[index]
.download_into(&mut batch_pixels, 32 * 3)
.expect("download mixed-shape batch decode");
assert_eq!(batch_pixels, single_pixels);
}
}
#[test]
fn decode_stage_timings_report_status_download_detail() {
let mut report = crate::CudaHtj2kProfileReport::default();
let timings = CudaDecodeStageTimings {
h2d: 17,
status_d2h: 5,
..CudaDecodeStageTimings::default()
};
timings.add_to_report(&mut report);
assert_eq!(report.h2d_us, 17);
assert_eq!(report.detail.status_d2h_us, 5);
}
fn cuda_runtime_required() -> bool {
std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some()
}
fn rgb8_htj2k_fixture(width: u32, height: u32, levels: u8, seed: u16) -> Vec<u8> {
let mut pixels = Vec::with_capacity(width as usize * height as usize * 3);
for idx in 0..width * height {
let seed = u32::from(seed);
pixels.push(u8::try_from((idx * seed + idx / 3) & 0xff).expect("red"));
pixels.push(u8::try_from((idx * (seed + 11) + 7) & 0xff).expect("green"));
pixels.push(u8::try_from((idx * (seed + 23) + 19) & 0xff).expect("blue"));
}
let options = EncodeOptions {
reversible: true,
num_decomposition_levels: levels,
..EncodeOptions::default()
};
encode_htj2k(&pixels, width, height, 3, 8, false, &options)
.expect("encode RGB HTJ2K fixture")
}
#[test]
fn color_plan_flattens_one_shared_payload_for_component_decode() {
let pixels: Vec<u8> = (0u16..4 * 4 * 3)
.map(|idx| u8::try_from((idx * 13 + idx / 3) & 0xff).expect("masked byte"))
.collect();
let options = EncodeOptions {
reversible: true,
num_decomposition_levels: 1,
..EncodeOptions::default()
};
let codestream =
encode_htj2k(&pixels, 4, 4, 3, 8, false, &options).expect("encode HTJ2K RGB fixture");
let mut decoder = crate::J2kDecoder::new(&codestream).expect("decoder");
let color = decoder
.build_cuda_htj2k_color_plans_with_profile(PixelFormat::Rgb8)
.expect("CUDA color plans");
assert_eq!(color.components.len(), 3);
assert!(!color.payload.is_empty());
assert_eq!(color.report.payload_bytes, color.payload.len());
for component in &color.components {
assert!(component.payload().is_empty());
for block in component.code_blocks() {
let start = usize::try_from(block.payload_offset).expect("payload offset");
let end = start + block.payload_len as usize;
assert!(end <= color.payload.len());
}
}
}
#[test]
fn byte_color_plan_builder_matches_decoder_color_plan() {
let pixels: Vec<u8> = (0u16..8 * 8 * 3)
.map(|idx| u8::try_from((idx * 19 + idx / 5) & 0xff).expect("masked byte"))
.collect();
let options = EncodeOptions {
reversible: true,
num_decomposition_levels: 1,
..EncodeOptions::default()
};
let codestream =
encode_htj2k(&pixels, 8, 8, 3, 8, false, &options).expect("encode HTJ2K RGB fixture");
let mut decoder = crate::J2kDecoder::new(&codestream).expect("decoder");
let decoder_plan = decoder
.build_cuda_htj2k_color_plans_with_profile(PixelFormat::Rgb8)
.expect("decoder CUDA color plans");
let mut native_context = NativeDecoderContext::default();
let byte_plan = build_cuda_htj2k_color_plans_from_bytes_with_profile(
&codestream,
PixelFormat::Rgb8,
&mut native_context,
)
.expect("byte CUDA color plans");
assert_eq!(byte_plan.dimensions, decoder_plan.dimensions);
assert_eq!(byte_plan.mct_dimensions, decoder_plan.mct_dimensions);
assert_eq!(byte_plan.bit_depths, decoder_plan.bit_depths);
assert_eq!(byte_plan.mct, decoder_plan.mct);
assert_eq!(byte_plan.components.len(), decoder_plan.components.len());
assert_eq!(byte_plan.payload.len(), decoder_plan.payload.len());
assert_eq!(
byte_plan
.components
.iter()
.map(|component| component.code_blocks().len())
.collect::<Vec<_>>(),
decoder_plan
.components
.iter()
.map(|component| component.code_blocks().len())
.collect::<Vec<_>>()
);
}
#[test]
fn multi_image_color_components_can_share_one_idwt_batch() {
let pixels: Vec<u8> = (0u16..16 * 16 * 3)
.map(|idx| u8::try_from((idx * 17 + idx / 7) & 0xff).expect("masked byte"))
.collect();
let options = EncodeOptions {
reversible: true,
num_decomposition_levels: 1,
..EncodeOptions::default()
};
let codestream =
encode_htj2k(&pixels, 16, 16, 3, 8, false, &options).expect("encode HTJ2K RGB fixture");
let mut first = crate::J2kDecoder::new(&codestream).expect("first decoder");
let mut second = crate::J2kDecoder::new(&codestream).expect("second decoder");
let first = first
.build_cuda_htj2k_color_plans_with_profile(PixelFormat::Rgb8)
.expect("first CUDA color plans");
let second = second
.build_cuda_htj2k_color_plans_with_profile(PixelFormat::Rgb8)
.expect("second CUDA color plans");
let components = first
.components
.iter()
.chain(second.components.iter())
.collect::<Vec<_>>();
assert_eq!(components.len(), 6);
assert!(can_batch_color_idwt(&components));
}
#[test]
fn batched_color_idwt_defers_completion_to_store_sync() {
let source = include_str!("decoder.rs");
assert!(
!source.contains(
"if !collect_stage_timings {\n context.synchronize().map_err(cuda_error)?;\n }"
),
"batched color IDWT should keep queued resources live and let the following store synchronize"
);
}
}
impl ImageCodec for J2kDecoder<'_> {
type Error = Error;
type Warning = Infallible;
type Pool = CpuJ2kScratchPool;
}
impl<'a> ImageDecode<'a> for J2kDecoder<'a> {
type View = J2kView<'a>;
fn inspect(input: &'a [u8]) -> Result<j2k_core::Info, Self::Error> {
Ok(CpuDecoder::inspect(input)?)
}
fn parse(input: &'a [u8]) -> Result<Self::View, Self::Error> {
Ok(J2kView::parse(input)?)
}
fn from_view(view: Self::View) -> Result<Self, Self::Error> {
Ok(Self {
inner: CpuDecoder::from_view(view)?,
pool: CpuJ2kScratchPool::new(),
})
}
fn decode_into(
&mut self,
out: &mut [u8],
stride: usize,
fmt: PixelFormat,
) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
Ok(self.inner.decode_into(out, stride, fmt)?)
}
fn decode_into_with_scratch(
&mut self,
pool: &mut Self::Pool,
out: &mut [u8],
stride: usize,
fmt: PixelFormat,
) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
Ok(self
.inner
.decode_into_with_scratch(pool, out, stride, fmt)?)
}
fn decode_region_into(
&mut self,
pool: &mut Self::Pool,
out: &mut [u8],
stride: usize,
fmt: PixelFormat,
roi: Rect,
) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
Ok(self.inner.decode_region_into(pool, out, stride, fmt, roi)?)
}
fn decode_scaled_into(
&mut self,
pool: &mut Self::Pool,
out: &mut [u8],
stride: usize,
fmt: PixelFormat,
scale: Downscale,
) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
Ok(self
.inner
.decode_scaled_into(pool, out, stride, fmt, scale)?)
}
fn decode_region_scaled_into(
&mut self,
pool: &mut Self::Pool,
out: &mut [u8],
stride: usize,
fmt: PixelFormat,
roi: Rect,
scale: Downscale,
) -> Result<DecodeOutcome<Self::Warning>, Self::Error> {
Ok(self
.inner
.decode_region_scaled_into(pool, out, stride, fmt, roi, scale)?)
}
}
impl<'a> ImageDecodeDevice<'a> for J2kDecoder<'a> {
type DeviceSurface = Surface;
}
impl<'a> ImageDecodeSubmit<'a> for J2kDecoder<'a> {
type Session = CudaSession;
type DeviceSurface = Surface;
type SubmittedSurface = ReadySubmission<Surface, Error>;
fn submit_to_device(
&mut self,
session: &mut Self::Session,
fmt: PixelFormat,
backend: BackendRequest,
) -> Result<Self::SubmittedSurface, Self::Error> {
validate_surface_request(backend)?;
Ok(submit_ready_device(session, |session| {
self.decode_to_surface_impl(session, fmt, backend)
}))
}
fn submit_region_to_device(
&mut self,
session: &mut Self::Session,
fmt: PixelFormat,
roi: Rect,
backend: BackendRequest,
) -> Result<Self::SubmittedSurface, Self::Error> {
validate_surface_request(backend)?;
Ok(submit_ready_device(session, |session| {
self.decode_region_to_surface_impl(session, fmt, roi, backend)
}))
}
fn submit_scaled_to_device(
&mut self,
session: &mut Self::Session,
fmt: PixelFormat,
scale: Downscale,
backend: BackendRequest,
) -> Result<Self::SubmittedSurface, Self::Error> {
validate_surface_request(backend)?;
Ok(submit_ready_device(session, |session| {
self.decode_scaled_to_surface_impl(session, fmt, scale, backend)
}))
}
fn submit_region_scaled_to_device(
&mut self,
session: &mut Self::Session,
fmt: PixelFormat,
roi: Rect,
scale: Downscale,
backend: BackendRequest,
) -> Result<Self::SubmittedSurface, Self::Error> {
validate_surface_request(backend)?;
Ok(submit_ready_device(session, |session| {
self.decode_region_scaled_to_surface_impl(session, fmt, roi, scale, backend)
}))
}
}
#[cfg(test)]
mod dispatch_tests {
use super::{
format_cuda_idwt_batch_host_trace_row, htj2k_batched_dequant_dispatches,
split_htj2k_subband_decode_dispatches, CudaIdwtBatchHostTraceRow,
};
#[test]
fn htj2k_decode_dispatch_split_separates_ht_and_dequant_counts() {
assert_eq!(split_htj2k_subband_decode_dispatches(0), (0, 0));
assert_eq!(split_htj2k_subband_decode_dispatches(1), (1, 0));
assert_eq!(split_htj2k_subband_decode_dispatches(2), (1, 1));
assert_eq!(split_htj2k_subband_decode_dispatches(3), (2, 1));
}
#[test]
fn htj2k_batched_dequant_dispatch_count_is_one_for_any_non_empty_batch() {
assert_eq!(htj2k_batched_dequant_dispatches(0), 0);
assert_eq!(htj2k_batched_dequant_dispatches(1), 1);
assert_eq!(htj2k_batched_dequant_dispatches(48), 1);
}
#[test]
fn cuda_idwt_batch_host_trace_row_reports_host_split() {
let row = CudaIdwtBatchHostTraceRow {
component_count: 327,
step_count: 5,
output_alloc_us: 11,
target_build_us: 22,
enqueue_us: 33,
output_take_count: 1635,
output_pool_reuse_count: 1600,
output_pool_alloc_count: 35,
output_pool_scanned_count: 2400,
output_pool_max_free_count: 1700,
output_requested_bytes: 28,
};
assert_eq!(
format_cuda_idwt_batch_host_trace_row(row),
"j2k_profile codec=j2k op=cuda_idwt_batch_host path=decode component_count=327 step_count=5 output_alloc_us=11 target_build_us=22 enqueue_us=33 output_take_count=1635 output_pool_reuse_count=1600 output_pool_alloc_count=35 output_pool_scanned_count=2400 output_pool_max_free_count=1700 output_requested_bytes=28"
);
}
}