#[cfg(feature = "cuda-runtime")]
mod cuda;
use core::fmt;
use signinum_transcode::accelerator::{
DctGridI16ToHtj2k97CodeBlockBatch, DctGridI16ToHtj2k97CodeBlockJob, DctGridToDwt53Job,
DctGridToDwt97Job, DctGridToHtj2k97CodeBlockJob, DctGridToReversibleDwt53Job,
DctToWaveletStageAccelerator, Dwt97BatchStageTimings, Htj2k97CodeBlockOptions,
PreencodedHtj2k97CompactBatch, PreencodedHtj2k97CompactBatchGroups, PreencodedHtj2k97Component,
PrequantizedHtj2k97Component, ReversibleDwt53FirstLevel, TranscodeStageError,
};
use signinum_transcode::dct53_2d::Dwt53TwoDimensional;
use signinum_transcode::dct97_2d::Dwt97TwoDimensional;
/// Message written by the `Display` impl of `CudaTranscodeError::CudaUnavailable`.
pub const CUDA_UNAVAILABLE: &str = "CUDA is unavailable on this host";
/// Auto-mode floor on `width * height` for single-job dispatch (used by `for_auto`).
const DEFAULT_AUTO_MIN_SAMPLES: usize = 224 * 224;
/// Default auto-mode minimum job count for reversible 5/3 batches.
const DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_JOBS: usize = 32;
/// Default auto-mode minimum summed sample count for reversible 5/3 batches.
const DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_SAMPLES: usize = 224 * 224 * 32;
/// Default auto-mode minimum job count for the `dwt97` / `htj2k97` batch entry points.
const DEFAULT_AUTO_DWT97_BATCH_MIN_JOBS: usize = 32;
/// Default auto-mode minimum summed sample count for the `dwt97` / `htj2k97` batch entry points.
const DEFAULT_AUTO_DWT97_BATCH_MIN_SAMPLES: usize = 224 * 224 * 32;
/// Environment variable whose mere presence (any value) makes
/// `supports_htj2k97_compact_preencoded_batch` report `false`.
const DISABLE_COMPACT_PREENCODED_ENV: &str = "SIGNINUM_CUDA_DISABLE_COMPACT_PREENCODED";
/// Errors produced by the CUDA dispatch layer.
///
/// Each variant converts into a `TranscodeStageError` through the `From` impl in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaTranscodeError {
/// Displayed as [`CUDA_UNAVAILABLE`]; converts to `TranscodeStageError::DeviceUnavailable`.
CudaUnavailable,
/// Carries a static reason string; converts to `TranscodeStageError::Unsupported`.
UnsupportedJob(&'static str),
/// Carries a static reason string; converts to `TranscodeStageError::Backend`.
Kernel(&'static str),
}
impl CudaTranscodeError {
    /// Tells whether the error may be turned into a declined job (`Ok(None)`) by auto mode.
    ///
    /// `CudaUnavailable` and `UnsupportedJob` are recoverable; `Kernel` is not.
    #[cfg(feature = "cuda-runtime")]
    const fn is_recoverable(self) -> bool {
        match self {
            Self::CudaUnavailable | Self::UnsupportedJob(_) => true,
            Self::Kernel(_) => false,
        }
    }
}
impl fmt::Display for CudaTranscodeError {
    /// Writes [`CUDA_UNAVAILABLE`] for `CudaUnavailable`, otherwise the carried reason verbatim.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message: &str = match *self {
            Self::CudaUnavailable => CUDA_UNAVAILABLE,
            Self::UnsupportedJob(reason) | Self::Kernel(reason) => reason,
        };
        f.write_str(message)
    }
}
impl From<CudaTranscodeError> for TranscodeStageError {
fn from(error: CudaTranscodeError) -> Self {
match error {
CudaTranscodeError::CudaUnavailable => Self::DeviceUnavailable,
CudaTranscodeError::UnsupportedJob(reason) => Self::Unsupported(reason),
CudaTranscodeError::Kernel(reason) => Self::Backend(reason.to_string()),
}
}
}
/// Empty impl: every `std::error::Error` method keeps its default implementation.
impl std::error::Error for CudaTranscodeError {}
/// Selects how the accelerator reacts to small jobs and to an unavailable device.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CudaDispatchMode {
/// Auto thresholds are not consulted; failures are always returned as errors.
Explicit,
/// Jobs below the configured thresholds, and recoverable failures, yield `Ok(None)`.
Auto,
}
/// CUDA-backed implementation of `DctToWaveletStageAccelerator`.
///
/// Holds the dispatch mode, the auto-mode thresholds, and per-entry-point counters that
/// separate calls ("attempts") from successful CUDA dispatches ("dispatches").
#[derive(Debug, Clone)]
pub struct CudaDctToWaveletStageAccelerator {
// Dispatch policy; see `CudaDispatchMode`.
mode: CudaDispatchMode,
// Auto mode: minimum `width * height` for the single-job entry points.
min_auto_samples: usize,
// Auto mode: minimum job count / summed samples for reversible 5/3 batches.
min_auto_reversible_batch_jobs: usize,
min_auto_reversible_batch_samples: usize,
// Auto mode: minimum job count / summed samples for the dwt97 and htj2k97 batch entry points.
min_auto_dwt97_batch_jobs: usize,
min_auto_dwt97_batch_samples: usize,
// Counters: `*_attempts` grow when an entry point is called, `*_dispatches` only after a
// successful CUDA dispatch. All updates use saturating arithmetic.
reversible_dwt53_attempts: usize,
reversible_dwt53_dispatches: usize,
reversible_dwt53_batch_attempts: usize,
reversible_dwt53_batch_dispatches: usize,
dwt53_attempts: usize,
dwt53_dispatches: usize,
dwt97_attempts: usize,
dwt97_dispatches: usize,
// Also bumped by the htj2k97 batch entry points, not only by `dct_grid_to_dwt97_batch`.
dwt97_batch_attempts: usize,
dwt97_batch_dispatches: usize,
htj2k97_codeblock_batch_attempts: usize,
htj2k97_codeblock_batch_dispatches: usize,
// Timings from the most recent successful batch dispatch; reset to `None` when a batch
// entry point starts counting a new attempt.
last_dwt97_batch_stage_timings: Option<Dwt97BatchStageTimings>,
// When false, the pre-encoded htj2k97 entry points return `Ok(None)` immediately.
resident_ht_encode: bool,
// Created on first use by `cuda_session`.
#[cfg(feature = "cuda-runtime")]
session: Option<cuda::CudaTranscodeSession>,
}
impl CudaDctToWaveletStageAccelerator {
    /// Builds an explicit-mode accelerator: every auto threshold is zero and resident HT
    /// encoding is off.
    #[must_use]
    pub const fn new_explicit() -> Self {
        Self::with_mode(CudaDispatchMode::Explicit, 0)
    }

    /// Same as [`Self::new_explicit`], but with resident HT encoding switched on.
    #[must_use]
    pub const fn new_explicit_resident_ht_encode() -> Self {
        let mut this = Self::new_explicit();
        this.resident_ht_encode = true;
        this
    }

    /// Builds an auto-mode accelerator preloaded with the `DEFAULT_AUTO_*` thresholds.
    #[must_use]
    pub const fn for_auto() -> Self {
        let mut this = Self::with_mode(CudaDispatchMode::Auto, DEFAULT_AUTO_MIN_SAMPLES);
        this.min_auto_dwt97_batch_jobs = DEFAULT_AUTO_DWT97_BATCH_MIN_JOBS;
        this.min_auto_dwt97_batch_samples = DEFAULT_AUTO_DWT97_BATCH_MIN_SAMPLES;
        this.min_auto_reversible_batch_jobs = DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_JOBS;
        this.min_auto_reversible_batch_samples = DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_SAMPLES;
        this
    }

    /// Shared constructor: stores `mode` and `min_auto_samples`, zeroes every batch threshold
    /// and counter, and leaves timings, resident HT encoding and the session unset.
    const fn with_mode(mode: CudaDispatchMode, min_auto_samples: usize) -> Self {
        Self {
            // Policy.
            mode,
            resident_ht_encode: false,
            // Thresholds.
            min_auto_samples,
            min_auto_reversible_batch_jobs: 0,
            min_auto_reversible_batch_samples: 0,
            min_auto_dwt97_batch_jobs: 0,
            min_auto_dwt97_batch_samples: 0,
            // Attempt counters.
            reversible_dwt53_attempts: 0,
            reversible_dwt53_batch_attempts: 0,
            dwt53_attempts: 0,
            dwt97_attempts: 0,
            dwt97_batch_attempts: 0,
            htj2k97_codeblock_batch_attempts: 0,
            // Dispatch counters.
            reversible_dwt53_dispatches: 0,
            reversible_dwt53_batch_dispatches: 0,
            dwt53_dispatches: 0,
            dwt97_dispatches: 0,
            dwt97_batch_dispatches: 0,
            htj2k97_codeblock_batch_dispatches: 0,
            // Runtime state.
            last_dwt97_batch_stage_timings: None,
            #[cfg(feature = "cuda-runtime")]
            session: None,
        }
    }

    /// Returns the CUDA session, creating it on first use.
    #[cfg(feature = "cuda-runtime")]
    fn cuda_session(&mut self) -> &mut cuda::CudaTranscodeSession {
        let slot = &mut self.session;
        slot.get_or_insert_with(cuda::CudaTranscodeSession::default)
    }

    /// Overrides the auto-mode thresholds applied to reversible 5/3 batches.
    #[must_use]
    pub const fn with_auto_reversible_batch_thresholds(
        mut self,
        min_jobs: usize,
        min_samples: usize,
    ) -> Self {
        self.min_auto_reversible_batch_samples = min_samples;
        self.min_auto_reversible_batch_jobs = min_jobs;
        self
    }

    /// Overrides the auto-mode thresholds applied to the dwt97 / htj2k97 batch entry points.
    #[must_use]
    pub const fn with_auto_dwt97_batch_thresholds(
        mut self,
        min_jobs: usize,
        min_samples: usize,
    ) -> Self {
        self.min_auto_dwt97_batch_samples = min_samples;
        self.min_auto_dwt97_batch_jobs = min_jobs;
        self
    }

    /// Calls made to `dct_grid_to_reversible_dwt53`, declined ones included.
    #[must_use]
    pub const fn reversible_dwt53_attempts(&self) -> usize {
        self.reversible_dwt53_attempts
    }

    /// Successful dispatches made by `dct_grid_to_reversible_dwt53`.
    #[must_use]
    pub const fn reversible_dwt53_dispatches(&self) -> usize {
        self.reversible_dwt53_dispatches
    }

    /// Calls made to `dct_grid_to_reversible_dwt53_batch`, declined ones included.
    #[must_use]
    pub const fn reversible_dwt53_batch_attempts(&self) -> usize {
        self.reversible_dwt53_batch_attempts
    }

    /// Successful dispatches made by `dct_grid_to_reversible_dwt53_batch`.
    #[must_use]
    pub const fn reversible_dwt53_batch_dispatches(&self) -> usize {
        self.reversible_dwt53_batch_dispatches
    }

    /// Calls made to `dct_grid_to_dwt53`, declined ones included.
    #[must_use]
    pub const fn dwt53_attempts(&self) -> usize {
        self.dwt53_attempts
    }

    /// Successful dispatches made by `dct_grid_to_dwt53`.
    #[must_use]
    pub const fn dwt53_dispatches(&self) -> usize {
        self.dwt53_dispatches
    }

    /// Calls made to `dct_grid_to_dwt97`, declined ones included.
    #[must_use]
    pub const fn dwt97_attempts(&self) -> usize {
        self.dwt97_attempts
    }

    /// Successful dispatches made by `dct_grid_to_dwt97`.
    #[must_use]
    pub const fn dwt97_dispatches(&self) -> usize {
        self.dwt97_dispatches
    }

    /// Batch attempts recorded by `dct_grid_to_dwt97_batch` and by the htj2k97 batch entry
    /// points (the grouped variants add one per group).
    #[must_use]
    pub const fn dwt97_batch_attempts(&self) -> usize {
        self.dwt97_batch_attempts
    }

    /// Batch dispatches recorded by `dct_grid_to_dwt97_batch` and by the htj2k97 batch entry
    /// points (the grouped variants add one per group).
    #[must_use]
    pub const fn dwt97_batch_dispatches(&self) -> usize {
        self.dwt97_batch_dispatches
    }

    /// Attempts recorded by the htj2k97 code-block and pre-encoded batch entry points.
    #[must_use]
    pub const fn htj2k97_codeblock_batch_attempts(&self) -> usize {
        self.htj2k97_codeblock_batch_attempts
    }

    /// Dispatches recorded by the htj2k97 code-block and pre-encoded batch entry points.
    #[must_use]
    pub const fn htj2k97_codeblock_batch_dispatches(&self) -> usize {
        self.htj2k97_codeblock_batch_dispatches
    }

    /// Result used when the crate is built without the `cuda-runtime` feature: auto mode
    /// declines with `Ok(None)`, explicit mode fails with `DeviceUnavailable`.
    #[cfg(not(feature = "cuda-runtime"))]
    fn unavailable<T>(&self) -> Result<Option<T>, TranscodeStageError> {
        if self.mode == CudaDispatchMode::Auto {
            Ok(None)
        } else {
            Err(TranscodeStageError::DeviceUnavailable)
        }
    }

    /// Converts a dispatch failure into the caller-visible result: auto mode turns a
    /// recoverable error into `Ok(None)`; every other case becomes a `TranscodeStageError`.
    #[cfg(feature = "cuda-runtime")]
    fn recover<T>(&self, error: CudaTranscodeError) -> Result<Option<T>, TranscodeStageError> {
        match self.mode {
            CudaDispatchMode::Auto if error.is_recoverable() => Ok(None),
            CudaDispatchMode::Auto | CudaDispatchMode::Explicit => Err(error.into()),
        }
    }
}
/// Sums `width * height` over all jobs, saturating at `usize::MAX` instead of overflowing.
fn reversible_batch_total_samples(jobs: &[DctGridToReversibleDwt53Job<'_>]) -> usize {
    jobs.iter()
        .map(|job| job.width.saturating_mul(job.height))
        .fold(0usize, usize::saturating_add)
}
/// Sums `width * height` over all jobs, saturating at `usize::MAX` instead of overflowing.
fn dwt97_batch_total_samples(jobs: &[DctGridToDwt97Job<'_>]) -> usize {
    jobs.iter()
        .map(|job| job.width.saturating_mul(job.height))
        .fold(0usize, usize::saturating_add)
}
/// Sums `width * height` over all jobs, saturating at `usize::MAX` instead of overflowing.
fn htj2k97_codeblock_batch_total_samples(jobs: &[DctGridToHtj2k97CodeBlockJob<'_>]) -> usize {
    jobs.iter()
        .map(|job| job.width.saturating_mul(job.height))
        .fold(0usize, usize::saturating_add)
}
/// Sums `width * height` over all jobs, saturating at `usize::MAX` instead of overflowing.
fn htj2k97_i16_codeblock_batch_total_samples(
    jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
) -> usize {
    jobs.iter()
        .map(|job| job.width.saturating_mul(job.height))
        .fold(0usize, usize::saturating_add)
}
/// Sums the per-group sample totals across all groups, saturating at `usize::MAX`.
fn htj2k97_i16_codeblock_batch_group_total_samples(
    groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
) -> usize {
    groups
        .iter()
        .map(|group| htj2k97_i16_codeblock_batch_total_samples(group.jobs))
        .fold(0usize, usize::saturating_add)
}
impl Default for CudaDctToWaveletStageAccelerator {
fn default() -> Self {
Self::for_auto()
}
}
/// CUDA implementation of the stage-accelerator trait.
///
/// Every dispatching method follows the same outline: record the attempt, short-circuit empty
/// input, let auto mode decline work below its thresholds with `Ok(None)`, then either report
/// unavailability (crate built without `cuda-runtime`) or forward to the `cuda` module. A
/// successful dispatch bumps the dispatch counters; a failure goes through `recover`.
impl DctToWaveletStageAccelerator for CudaDctToWaveletStageAccelerator {
    /// Always `true`.
    fn supports_dwt97_batch(&self) -> bool {
        true
    }

    /// Always `true`.
    fn supports_htj2k97_codeblock_batch(&self) -> bool {
        true
    }

    /// `true` only when resident HT encoding was enabled at construction.
    fn supports_htj2k97_i16_preencoded_batch(&self) -> bool {
        self.resident_ht_encode
    }

    /// `true` when resident HT encoding is enabled and the
    /// `SIGNINUM_CUDA_DISABLE_COMPACT_PREENCODED` environment variable is not set.
    /// The variable is re-read on every call and any value, even an empty one, disables support.
    fn supports_htj2k97_compact_preencoded_batch(&self) -> bool {
        self.resident_ht_encode && std::env::var_os(DISABLE_COMPACT_PREENCODED_ENV).is_none()
    }

    /// Forwards one job to `cuda::dispatch_reversible_dwt53`.
    ///
    /// Returns `Ok(None)` when auto mode declines: the job has fewer than `min_auto_samples`
    /// samples, the runtime is not compiled in, or the dispatch error is recoverable.
    ///
    /// # Errors
    /// Explicit mode without `cuda-runtime` yields `DeviceUnavailable`; dispatch errors that
    /// `recover` does not absorb are converted into `TranscodeStageError`.
    fn dct_grid_to_reversible_dwt53(
        &mut self,
        job: DctGridToReversibleDwt53Job<'_>,
    ) -> Result<Option<ReversibleDwt53FirstLevel>, TranscodeStageError> {
        // The attempt is counted even when the job is declined below.
        self.reversible_dwt53_attempts = self.reversible_dwt53_attempts.saturating_add(1);
        if self.mode == CudaDispatchMode::Auto
            && job.width.saturating_mul(job.height) < self.min_auto_samples
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = job;
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_reversible_dwt53(self.cuda_session(), job) {
                Ok(output) => {
                    self.reversible_dwt53_dispatches =
                        self.reversible_dwt53_dispatches.saturating_add(1);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Forwards a batch to `cuda::dispatch_reversible_dwt53_batch`.
    ///
    /// An empty batch returns `Ok(Some(vec![]))` without dispatching. Auto mode declines with
    /// `Ok(None)` when the batch has too few jobs or too few total samples.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_to_reversible_dwt53_batch(
        &mut self,
        jobs: &[DctGridToReversibleDwt53Job<'_>],
    ) -> Result<Option<Vec<ReversibleDwt53FirstLevel>>, TranscodeStageError> {
        self.reversible_dwt53_batch_attempts =
            self.reversible_dwt53_batch_attempts.saturating_add(1);
        if jobs.is_empty() {
            return Ok(Some(Vec::new()));
        }
        if self.mode == CudaDispatchMode::Auto
            && (jobs.len() < self.min_auto_reversible_batch_jobs
                || reversible_batch_total_samples(jobs) < self.min_auto_reversible_batch_samples)
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = jobs;
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_reversible_dwt53_batch(self.cuda_session(), jobs) {
                Ok(output) => {
                    self.reversible_dwt53_batch_dispatches =
                        self.reversible_dwt53_batch_dispatches.saturating_add(1);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Forwards one job to `cuda::dispatch_dwt53` (this path does not use the session).
    ///
    /// Auto mode declines with `Ok(None)` below `min_auto_samples`.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_to_dwt53(
        &mut self,
        job: DctGridToDwt53Job<'_>,
    ) -> Result<Option<Dwt53TwoDimensional<f64>>, TranscodeStageError> {
        self.dwt53_attempts = self.dwt53_attempts.saturating_add(1);
        if self.mode == CudaDispatchMode::Auto
            && job.width.saturating_mul(job.height) < self.min_auto_samples
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = job;
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_dwt53(job) {
                Ok(output) => {
                    self.dwt53_dispatches = self.dwt53_dispatches.saturating_add(1);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Forwards one job to `cuda::dispatch_dwt97`.
    ///
    /// Auto mode declines with `Ok(None)` below `min_auto_samples`.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_to_dwt97(
        &mut self,
        job: DctGridToDwt97Job<'_>,
    ) -> Result<Option<Dwt97TwoDimensional<f64>>, TranscodeStageError> {
        self.dwt97_attempts = self.dwt97_attempts.saturating_add(1);
        if self.mode == CudaDispatchMode::Auto
            && job.width.saturating_mul(job.height) < self.min_auto_samples
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = job;
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_dwt97(self.cuda_session(), job) {
                Ok(output) => {
                    self.dwt97_dispatches = self.dwt97_dispatches.saturating_add(1);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Forwards a batch to `cuda::dispatch_dwt97_batch` and stores the returned stage timings.
    ///
    /// The stored timings are cleared at entry. An empty batch returns `Ok(Some(vec![]))`;
    /// auto mode declines with `Ok(None)` below the dwt97 batch thresholds.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_to_dwt97_batch(
        &mut self,
        jobs: &[DctGridToDwt97Job<'_>],
    ) -> Result<Option<Vec<Dwt97TwoDimensional<f64>>>, TranscodeStageError> {
        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
        self.last_dwt97_batch_stage_timings = None;
        if jobs.is_empty() {
            return Ok(Some(Vec::new()));
        }
        if self.mode == CudaDispatchMode::Auto
            && (jobs.len() < self.min_auto_dwt97_batch_jobs
                || dwt97_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = jobs;
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_dwt97_batch(self.cuda_session(), jobs) {
                Ok((output, timings)) => {
                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
                    self.last_dwt97_batch_stage_timings = Some(timings);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Forwards a batch to `cuda::dispatch_htj2k97_codeblock_batch`.
    ///
    /// Counts against both the dwt97 batch and the htj2k97 code-block batch counters and
    /// reuses the dwt97 batch thresholds for the auto-mode decision.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_to_htj2k97_codeblock_batch(
        &mut self,
        jobs: &[DctGridToHtj2k97CodeBlockJob<'_>],
        options: Htj2k97CodeBlockOptions,
    ) -> Result<Option<Vec<PrequantizedHtj2k97Component>>, TranscodeStageError> {
        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
        self.htj2k97_codeblock_batch_attempts =
            self.htj2k97_codeblock_batch_attempts.saturating_add(1);
        self.last_dwt97_batch_stage_timings = None;
        if jobs.is_empty() {
            return Ok(Some(Vec::new()));
        }
        if self.mode == CudaDispatchMode::Auto
            && (jobs.len() < self.min_auto_dwt97_batch_jobs
                || htj2k97_codeblock_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = (jobs, options);
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_htj2k97_codeblock_batch(self.cuda_session(), jobs, options) {
                Ok((output, timings)) => {
                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
                    self.htj2k97_codeblock_batch_dispatches =
                        self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
                    self.last_dwt97_batch_stage_timings = Some(timings);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Forwards a batch to `cuda::dispatch_htj2k97_preencoded_batch`.
    ///
    /// Returns `Ok(None)` immediately, without touching counters or timings, unless resident
    /// HT encoding is enabled. Otherwise behaves like `dct_grid_to_htj2k97_codeblock_batch`.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_to_htj2k97_preencoded_batch(
        &mut self,
        jobs: &[DctGridToHtj2k97CodeBlockJob<'_>],
        options: Htj2k97CodeBlockOptions,
    ) -> Result<Option<Vec<PreencodedHtj2k97Component>>, TranscodeStageError> {
        if !self.resident_ht_encode {
            return Ok(None);
        }
        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
        self.htj2k97_codeblock_batch_attempts =
            self.htj2k97_codeblock_batch_attempts.saturating_add(1);
        self.last_dwt97_batch_stage_timings = None;
        if jobs.is_empty() {
            return Ok(Some(Vec::new()));
        }
        if self.mode == CudaDispatchMode::Auto
            && (jobs.len() < self.min_auto_dwt97_batch_jobs
                || htj2k97_codeblock_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = (jobs, options);
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_htj2k97_preencoded_batch(self.cuda_session(), jobs, options) {
                Ok((output, timings)) => {
                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
                    self.htj2k97_codeblock_batch_dispatches =
                        self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
                    self.last_dwt97_batch_stage_timings = Some(timings);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// `i16`-input counterpart of `dct_grid_to_htj2k97_preencoded_batch`; forwards to
    /// `cuda::dispatch_htj2k97_preencoded_i16_batch`.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_i16_to_htj2k97_preencoded_batch(
        &mut self,
        jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
        options: Htj2k97CodeBlockOptions,
    ) -> Result<Option<Vec<PreencodedHtj2k97Component>>, TranscodeStageError> {
        if !self.resident_ht_encode {
            return Ok(None);
        }
        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
        self.htj2k97_codeblock_batch_attempts =
            self.htj2k97_codeblock_batch_attempts.saturating_add(1);
        self.last_dwt97_batch_stage_timings = None;
        if jobs.is_empty() {
            return Ok(Some(Vec::new()));
        }
        if self.mode == CudaDispatchMode::Auto
            && (jobs.len() < self.min_auto_dwt97_batch_jobs
                || htj2k97_i16_codeblock_batch_total_samples(jobs)
                    < self.min_auto_dwt97_batch_samples)
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = (jobs, options);
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_htj2k97_preencoded_i16_batch(self.cuda_session(), jobs, options) {
                Ok((output, timings)) => {
                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
                    self.htj2k97_codeblock_batch_dispatches =
                        self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
                    self.last_dwt97_batch_stage_timings = Some(timings);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Compact-output variant of `dct_grid_i16_to_htj2k97_preencoded_batch`; forwards to
    /// `cuda::dispatch_htj2k97_compact_preencoded_i16_batch`.
    ///
    /// An empty batch yields a `PreencodedHtj2k97CompactBatch` with empty payload and components.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_i16_to_htj2k97_compact_preencoded_batch(
        &mut self,
        jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
        options: Htj2k97CodeBlockOptions,
    ) -> Result<Option<PreencodedHtj2k97CompactBatch>, TranscodeStageError> {
        if !self.resident_ht_encode {
            return Ok(None);
        }
        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
        self.htj2k97_codeblock_batch_attempts =
            self.htj2k97_codeblock_batch_attempts.saturating_add(1);
        self.last_dwt97_batch_stage_timings = None;
        if jobs.is_empty() {
            return Ok(Some(PreencodedHtj2k97CompactBatch {
                payload: Vec::new(),
                components: Vec::new(),
            }));
        }
        if self.mode == CudaDispatchMode::Auto
            && (jobs.len() < self.min_auto_dwt97_batch_jobs
                || htj2k97_i16_codeblock_batch_total_samples(jobs)
                    < self.min_auto_dwt97_batch_samples)
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = (jobs, options);
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_htj2k97_compact_preencoded_i16_batch(
                self.cuda_session(),
                jobs,
                options,
            ) {
                Ok((output, timings)) => {
                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
                    self.htj2k97_codeblock_batch_dispatches =
                        self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
                    self.last_dwt97_batch_stage_timings = Some(timings);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Grouped variant: forwards all groups in one call to
    /// `cuda::dispatch_htj2k97_preencoded_i16_batch_groups`.
    ///
    /// Attempt counters grow by one per group. The auto-mode decision uses the job count and
    /// sample total summed over every group. On success the dwt97 batch dispatch counter grows
    /// by one per group and the code-block dispatch counter by `timings.ht_codeblock_dispatches`.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_i16_to_htj2k97_preencoded_batch_groups(
        &mut self,
        groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
        options: Htj2k97CodeBlockOptions,
    ) -> Result<Option<Vec<Vec<PreencodedHtj2k97Component>>>, TranscodeStageError> {
        if !self.resident_ht_encode {
            return Ok(None);
        }
        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(groups.len());
        self.htj2k97_codeblock_batch_attempts = self
            .htj2k97_codeblock_batch_attempts
            .saturating_add(groups.len());
        self.last_dwt97_batch_stage_timings = None;
        if groups.is_empty() {
            return Ok(Some(Vec::new()));
        }
        // Saturating, like the sample totals: an unchecked `sum` would panic or wrap on overflow.
        let total_jobs = groups
            .iter()
            .fold(0usize, |total, group| total.saturating_add(group.jobs.len()));
        if self.mode == CudaDispatchMode::Auto
            && (total_jobs < self.min_auto_dwt97_batch_jobs
                || htj2k97_i16_codeblock_batch_group_total_samples(groups)
                    < self.min_auto_dwt97_batch_samples)
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = (groups, options);
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_htj2k97_preencoded_i16_batch_groups(
                self.cuda_session(),
                groups,
                options,
            ) {
                Ok((output, timings)) => {
                    self.dwt97_batch_dispatches =
                        self.dwt97_batch_dispatches.saturating_add(groups.len());
                    self.htj2k97_codeblock_batch_dispatches = self
                        .htj2k97_codeblock_batch_dispatches
                        .saturating_add(timings.ht_codeblock_dispatches);
                    self.last_dwt97_batch_stage_timings = Some(timings);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Compact-output grouped variant: forwards to
    /// `cuda::dispatch_htj2k97_compact_preencoded_i16_batch_groups`.
    ///
    /// Counter and threshold handling match `dct_grid_i16_to_htj2k97_preencoded_batch_groups`.
    /// Empty input yields a `PreencodedHtj2k97CompactBatchGroups` with empty payload and groups.
    ///
    /// # Errors
    /// Same policy as `dct_grid_to_reversible_dwt53`.
    fn dct_grid_i16_to_htj2k97_compact_preencoded_batch_groups(
        &mut self,
        groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
        options: Htj2k97CodeBlockOptions,
    ) -> Result<Option<PreencodedHtj2k97CompactBatchGroups>, TranscodeStageError> {
        if !self.resident_ht_encode {
            return Ok(None);
        }
        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(groups.len());
        self.htj2k97_codeblock_batch_attempts = self
            .htj2k97_codeblock_batch_attempts
            .saturating_add(groups.len());
        self.last_dwt97_batch_stage_timings = None;
        if groups.is_empty() {
            return Ok(Some(PreencodedHtj2k97CompactBatchGroups {
                payload: Vec::new(),
                groups: Vec::new(),
            }));
        }
        // Saturating, like the sample totals: an unchecked `sum` would panic or wrap on overflow.
        let total_jobs = groups
            .iter()
            .fold(0usize, |total, group| total.saturating_add(group.jobs.len()));
        if self.mode == CudaDispatchMode::Auto
            && (total_jobs < self.min_auto_dwt97_batch_jobs
                || htj2k97_i16_codeblock_batch_group_total_samples(groups)
                    < self.min_auto_dwt97_batch_samples)
        {
            return Ok(None);
        }
        #[cfg(not(feature = "cuda-runtime"))]
        {
            let _ = (groups, options);
            self.unavailable()
        }
        #[cfg(feature = "cuda-runtime")]
        {
            match cuda::dispatch_htj2k97_compact_preencoded_i16_batch_groups(
                self.cuda_session(),
                groups,
                options,
            ) {
                Ok((output, timings)) => {
                    self.dwt97_batch_dispatches =
                        self.dwt97_batch_dispatches.saturating_add(groups.len());
                    self.htj2k97_codeblock_batch_dispatches = self
                        .htj2k97_codeblock_batch_dispatches
                        .saturating_add(timings.ht_codeblock_dispatches);
                    self.last_dwt97_batch_stage_timings = Some(timings);
                    Ok(Some(output))
                }
                Err(error) => self.recover(error),
            }
        }
    }

    /// Timings stored by the most recent successful batch dispatch, or `None` if the latest
    /// counted batch attempt was declined, empty, or failed.
    fn last_dwt97_batch_stage_timings(&self) -> Option<Dwt97BatchStageTimings> {
        self.last_dwt97_batch_stage_timings
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::OsString;
    use std::sync::{Mutex, MutexGuard, PoisonError};

    /// Serializes tests that mutate the process environment.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Holds `ENV_LOCK` and puts one environment variable back to its captured state on drop,
    /// so a failing assertion cannot leak a modified environment into later tests.
    struct ScopedEnvVar {
        name: &'static str,
        previous: Option<OsString>,
        // Declared last so the lock is released only after `Drop::drop` restored the variable.
        _lock: MutexGuard<'static, ()>,
    }

    impl ScopedEnvVar {
        /// Takes the lock and records the variable's current value.
        fn capture(name: &'static str) -> Self {
            // A poisoned lock only means an earlier test panicked; the guarded `()` carries no
            // state that could be inconsistent, so keep going instead of cascading the failure.
            let lock = ENV_LOCK.lock().unwrap_or_else(PoisonError::into_inner);
            Self {
                name,
                previous: std::env::var_os(name),
                _lock: lock,
            }
        }
    }

    impl Drop for ScopedEnvVar {
        fn drop(&mut self) {
            match self.previous.take() {
                Some(value) => std::env::set_var(self.name, value),
                None => std::env::remove_var(self.name),
            }
        }
    }

    /// Options shared by the htj2k97 tests.
    fn test_htj2k97_codeblock_options() -> Htj2k97CodeBlockOptions {
        Htj2k97CodeBlockOptions {
            bit_depth: 8,
            guard_bits: 2,
            code_block_width_exp: 4,
            code_block_height_exp: 4,
            irreversible_quantization_scale: 1.0,
            irreversible_quantization_subband_scales:
                signinum_transcode::accelerator::IrreversibleQuantizationSubbandScales::default(),
        }
    }

    #[test]
    fn explicit_mode_without_cuda_runtime_errors_on_reversible_job() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::new_explicit();
        let blocks: Vec<[i16; 64]> = vec![[0i16; 64]];
        let job = DctGridToReversibleDwt53Job {
            dequantized_blocks: &blocks,
            block_cols: 1,
            block_rows: 1,
            width: 8,
            height: 8,
        };
        let result = accelerator.dct_grid_to_reversible_dwt53(job);
        #[cfg(not(feature = "cuda-runtime"))]
        assert_eq!(result, Err(TranscodeStageError::DeviceUnavailable));
        // With the runtime compiled in, the outcome depends on the host; only the counter is checked.
        let _ = result;
        assert_eq!(accelerator.reversible_dwt53_attempts(), 1);
    }

    #[test]
    fn auto_mode_falls_back_to_scalar_for_small_jobs() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto();
        let blocks: Vec<[i16; 64]> = vec![[0i16; 64]];
        let job = DctGridToReversibleDwt53Job {
            dequantized_blocks: &blocks,
            block_cols: 1,
            block_rows: 1,
            width: 8,
            height: 8,
        };
        assert_eq!(accelerator.dct_grid_to_reversible_dwt53(job), Ok(None));
    }

    #[test]
    fn empty_batches_return_empty_without_dispatch() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::new_explicit();
        assert_eq!(
            accelerator.dct_grid_to_reversible_dwt53_batch(&[]),
            Ok(Some(Vec::new()))
        );
        assert_eq!(
            accelerator.dct_grid_to_dwt97_batch(&[]),
            Ok(Some(Vec::new()))
        );
        // The "without dispatch" half of the contract: attempts are counted, dispatches are not.
        assert_eq!(accelerator.reversible_dwt53_batch_attempts(), 1);
        assert_eq!(accelerator.reversible_dwt53_batch_dispatches(), 0);
        assert_eq!(accelerator.dwt97_batch_attempts(), 1);
        assert_eq!(accelerator.dwt97_batch_dispatches(), 0);
    }

    #[test]
    fn compact_preencoded_support_obeys_cuda_env_gate() {
        // Restores the variable (and releases the lock) even if an assertion below panics.
        let _env = ScopedEnvVar::capture(DISABLE_COMPACT_PREENCODED_ENV);

        std::env::remove_var(DISABLE_COMPACT_PREENCODED_ENV);
        let accelerator = CudaDctToWaveletStageAccelerator::new_explicit_resident_ht_encode();
        assert!(accelerator.supports_htj2k97_i16_preencoded_batch());
        assert!(accelerator.supports_htj2k97_compact_preencoded_batch());

        std::env::set_var(DISABLE_COMPACT_PREENCODED_ENV, "1");
        let accelerator = CudaDctToWaveletStageAccelerator::new_explicit_resident_ht_encode();
        assert!(accelerator.supports_htj2k97_i16_preencoded_batch());
        assert!(!accelerator.supports_htj2k97_compact_preencoded_batch());
    }

    #[test]
    fn auto_mode_declines_under_amortized_reversible_batches() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
            .with_auto_reversible_batch_thresholds(2, 224 * 224 * 2);
        let blocks = vec![[0i16; 64]; 256 * 256 / 64];
        let job = DctGridToReversibleDwt53Job {
            dequantized_blocks: &blocks,
            block_cols: 32,
            block_rows: 32,
            width: 256,
            height: 256,
        };
        assert_eq!(
            accelerator.dct_grid_to_reversible_dwt53_batch(&[job]),
            Ok(None)
        );
        assert_eq!(accelerator.reversible_dwt53_batch_attempts(), 1);
        assert_eq!(accelerator.reversible_dwt53_batch_dispatches(), 0);
    }

    #[test]
    fn auto_mode_declines_under_amortized_dwt97_batches() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
            .with_auto_dwt97_batch_thresholds(2, 224 * 224 * 2);
        let blocks = vec![[[0.0f64; 8]; 8]; 256 * 256 / 64];
        let job = DctGridToDwt97Job {
            blocks: &blocks,
            block_cols: 32,
            block_rows: 32,
            width: 256,
            height: 256,
        };
        assert_eq!(accelerator.dct_grid_to_dwt97_batch(&[job]), Ok(None));
        assert_eq!(accelerator.dwt97_batch_attempts(), 1);
        assert_eq!(accelerator.dwt97_batch_dispatches(), 0);
    }

    #[test]
    fn auto_mode_declines_under_amortized_htj2k97_codeblock_batches() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
            .with_auto_dwt97_batch_thresholds(2, 224 * 224 * 2);
        let blocks = vec![[[0.0f64; 8]; 8]; 256 * 256 / 64];
        let job = DctGridToHtj2k97CodeBlockJob {
            blocks: &blocks,
            block_cols: 32,
            block_rows: 32,
            width: 256,
            height: 256,
            x_rsiz: 1,
            y_rsiz: 1,
        };
        let result = accelerator
            .dct_grid_to_htj2k97_codeblock_batch(&[job], test_htj2k97_codeblock_options());
        assert!(matches!(result, Ok(None)));
        assert_eq!(accelerator.dwt97_batch_attempts(), 1);
        assert_eq!(accelerator.dwt97_batch_dispatches(), 0);
        assert_eq!(accelerator.htj2k97_codeblock_batch_attempts(), 1);
        assert_eq!(accelerator.htj2k97_codeblock_batch_dispatches(), 0);
    }
}