Skip to main content

signinum_transcode_cuda/
lib.rs

// SPDX-License-Identifier: Apache-2.0

//! CUDA acceleration for coefficient-domain JPEG to HTJ2K transcode stages.
//!
//! Mirrors `signinum-transcode-metal`: it implements
//! [`DctToWaveletStageAccelerator`] for direct DCT-grid to one-level 5/3 and 9/7
//! wavelet projections (and the fused 9/7 HTJ2K code-block path), so JPEG can be
//! transcoded to HTJ2K without an IDCT->pixels->DWT spatial round-trip. The CPU
//! scalar code in `signinum-transcode` remains the oracle and fallback; this
//! crate never reimplements it.
//!
//! The actual GPU kernels live in `signinum-cuda-runtime` (the repo keeps all
//! `.cu` + `build.rs` PTX there). The GPU path is gated behind the
//! `cuda-runtime` feature; without it this accelerator behaves like Metal's
//! non-macOS path (Explicit -> typed `Err`, Auto -> `Ok(None)` scalar fallback).
16
17#[cfg(feature = "cuda-runtime")]
18mod cuda;
19
20use core::fmt;
21
22use signinum_transcode::accelerator::{
23    DctGridI16ToHtj2k97CodeBlockBatch, DctGridI16ToHtj2k97CodeBlockJob, DctGridToDwt53Job,
24    DctGridToDwt97Job, DctGridToHtj2k97CodeBlockJob, DctGridToReversibleDwt53Job,
25    DctToWaveletStageAccelerator, Dwt97BatchStageTimings, Htj2k97CodeBlockOptions,
26    PreencodedHtj2k97CompactBatch, PreencodedHtj2k97CompactBatchGroups, PreencodedHtj2k97Component,
27    PrequantizedHtj2k97Component, ReversibleDwt53FirstLevel, TranscodeStageError,
28};
29use signinum_transcode::dct53_2d::Dwt53TwoDimensional;
30use signinum_transcode::dct97_2d::Dwt97TwoDimensional;
31
/// Stable, human-readable message reported when CUDA cannot be used at all:
/// the feature is not compiled in, no device is present, or the transcode
/// kernels were not built.
pub const CUDA_UNAVAILABLE: &str = "CUDA is unavailable on this host";

/// Smallest per-component sample count (`width * height`) that Auto mode offers
/// to CUDA as a single job; smaller jobs are left to the scalar path.
const DEFAULT_AUTO_MIN_SAMPLES: usize = 224 * 224;
/// Fewest jobs a reversible 5/3 batch must hold before Auto mode dispatches it.
const DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_JOBS: usize = 32;
/// Fewest total samples a reversible 5/3 batch must hold before Auto mode
/// dispatches it (`224 * 224 * 32`).
const DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_SAMPLES: usize =
    DEFAULT_AUTO_MIN_SAMPLES * DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_JOBS;
/// Fewest jobs a 9/7 (or 9/7 code-block) batch must hold before Auto mode
/// dispatches it.
const DEFAULT_AUTO_DWT97_BATCH_MIN_JOBS: usize = 32;
/// Fewest total samples a 9/7 (or 9/7 code-block) batch must hold before Auto
/// mode dispatches it (`224 * 224 * 32`).
const DEFAULT_AUTO_DWT97_BATCH_MIN_SAMPLES: usize =
    DEFAULT_AUTO_MIN_SAMPLES * DEFAULT_AUTO_DWT97_BATCH_MIN_JOBS;
/// Environment variable that, when set to any value, makes the accelerator
/// stop advertising the compact preencoded batch path.
const DISABLE_COMPACT_PREENCODED_ENV: &str = "SIGNINUM_CUDA_DISABLE_COMPACT_PREENCODED";
43
/// Failure modes of the CUDA transcode accelerator.
///
/// The payload of [`UnsupportedJob`](Self::UnsupportedJob) and
/// [`Kernel`](Self::Kernel) is a static, human-readable reason.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CudaTranscodeError {
    /// No usable CUDA on this host, or the transcode kernels were not built.
    CudaUnavailable,
    /// The job falls outside what the current CUDA implementation handles.
    UnsupportedJob(&'static str),
    /// The CUDA runtime or a kernel launch failed while executing the job.
    Kernel(&'static str),
}
54
55impl CudaTranscodeError {
56    /// Whether Auto mode may recover from this error by using the scalar
57    /// fallback (`Ok(None)`). Hard kernel failures propagate as `Err`.
58    #[cfg(feature = "cuda-runtime")]
59    const fn is_recoverable(self) -> bool {
60        matches!(self, Self::CudaUnavailable | Self::UnsupportedJob(_))
61    }
62}
63
64impl fmt::Display for CudaTranscodeError {
65    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
66        match self {
67            Self::CudaUnavailable => f.write_str(CUDA_UNAVAILABLE),
68            Self::UnsupportedJob(reason) | Self::Kernel(reason) => f.write_str(reason),
69        }
70    }
71}
72
73impl From<CudaTranscodeError> for TranscodeStageError {
74    fn from(error: CudaTranscodeError) -> Self {
75        match error {
76            CudaTranscodeError::CudaUnavailable => Self::DeviceUnavailable,
77            CudaTranscodeError::UnsupportedJob(reason) => Self::Unsupported(reason),
78            CudaTranscodeError::Kernel(reason) => Self::Backend(reason.to_string()),
79        }
80    }
81}
82
83impl std::error::Error for CudaTranscodeError {}
84
/// Policy for what happens when a job cannot (or should not) run on CUDA.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum CudaDispatchMode {
    /// An unavailable or unsupported CUDA dispatch is reported as an error.
    Explicit,
    /// Small or unsupported jobs are handed back to the scalar oracle by
    /// returning `Ok(None)`.
    Auto,
}
93
94/// Optional CUDA accelerator for `signinum-transcode` transform stages.
95#[derive(Debug, Clone)]
96pub struct CudaDctToWaveletStageAccelerator {
97    mode: CudaDispatchMode,
98    min_auto_samples: usize,
99    min_auto_reversible_batch_jobs: usize,
100    min_auto_reversible_batch_samples: usize,
101    min_auto_dwt97_batch_jobs: usize,
102    min_auto_dwt97_batch_samples: usize,
103    reversible_dwt53_attempts: usize,
104    reversible_dwt53_dispatches: usize,
105    reversible_dwt53_batch_attempts: usize,
106    reversible_dwt53_batch_dispatches: usize,
107    dwt53_attempts: usize,
108    dwt53_dispatches: usize,
109    dwt97_attempts: usize,
110    dwt97_dispatches: usize,
111    dwt97_batch_attempts: usize,
112    dwt97_batch_dispatches: usize,
113    htj2k97_codeblock_batch_attempts: usize,
114    htj2k97_codeblock_batch_dispatches: usize,
115    last_dwt97_batch_stage_timings: Option<Dwt97BatchStageTimings>,
116    resident_ht_encode: bool,
117    #[cfg(feature = "cuda-runtime")]
118    session: Option<cuda::CudaTranscodeSession>,
119}
120
121impl CudaDctToWaveletStageAccelerator {
122    /// Create an accelerator that treats unavailable/unsupported CUDA dispatch
123    /// as an error (no silent scalar fallback).
124    #[must_use]
125    pub const fn new_explicit() -> Self {
126        Self::with_mode(CudaDispatchMode::Explicit, 0)
127    }
128
129    /// Create an explicit accelerator that keeps 9/7 code-block coefficients
130    /// resident and HT-encodes them on the same CUDA context before CPU
131    /// packetization.
132    #[must_use]
133    pub const fn new_explicit_resident_ht_encode() -> Self {
134        let mut accelerator = Self::with_mode(CudaDispatchMode::Explicit, 0);
135        accelerator.resident_ht_encode = true;
136        accelerator
137    }
138
139    /// Create an accelerator that falls back to the scalar oracle for small or
140    /// unsupported jobs.
141    #[must_use]
142    pub const fn for_auto() -> Self {
143        let mut accelerator = Self::with_mode(CudaDispatchMode::Auto, DEFAULT_AUTO_MIN_SAMPLES);
144        accelerator.min_auto_reversible_batch_jobs = DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_JOBS;
145        accelerator.min_auto_reversible_batch_samples = DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_SAMPLES;
146        accelerator.min_auto_dwt97_batch_jobs = DEFAULT_AUTO_DWT97_BATCH_MIN_JOBS;
147        accelerator.min_auto_dwt97_batch_samples = DEFAULT_AUTO_DWT97_BATCH_MIN_SAMPLES;
148        accelerator
149    }
150
151    const fn with_mode(mode: CudaDispatchMode, min_auto_samples: usize) -> Self {
152        Self {
153            mode,
154            min_auto_samples,
155            min_auto_reversible_batch_jobs: 0,
156            min_auto_reversible_batch_samples: 0,
157            min_auto_dwt97_batch_jobs: 0,
158            min_auto_dwt97_batch_samples: 0,
159            reversible_dwt53_attempts: 0,
160            reversible_dwt53_dispatches: 0,
161            reversible_dwt53_batch_attempts: 0,
162            reversible_dwt53_batch_dispatches: 0,
163            dwt53_attempts: 0,
164            dwt53_dispatches: 0,
165            dwt97_attempts: 0,
166            dwt97_dispatches: 0,
167            dwt97_batch_attempts: 0,
168            dwt97_batch_dispatches: 0,
169            htj2k97_codeblock_batch_attempts: 0,
170            htj2k97_codeblock_batch_dispatches: 0,
171            last_dwt97_batch_stage_timings: None,
172            resident_ht_encode: false,
173            #[cfg(feature = "cuda-runtime")]
174            session: None,
175        }
176    }
177
178    #[cfg(feature = "cuda-runtime")]
179    fn cuda_session(&mut self) -> &mut cuda::CudaTranscodeSession {
180        self.session
181            .get_or_insert_with(cuda::CudaTranscodeSession::default)
182    }
183
184    /// Override the reversible 5/3 batch thresholds used before Auto mode
185    /// dispatches a batch to CUDA.
186    #[must_use]
187    pub const fn with_auto_reversible_batch_thresholds(
188        mut self,
189        min_jobs: usize,
190        min_samples: usize,
191    ) -> Self {
192        self.min_auto_reversible_batch_jobs = min_jobs;
193        self.min_auto_reversible_batch_samples = min_samples;
194        self
195    }
196
197    /// Override the 9/7 batch thresholds used before Auto mode dispatches a
198    /// same-geometry batch to CUDA.
199    #[must_use]
200    pub const fn with_auto_dwt97_batch_thresholds(
201        mut self,
202        min_jobs: usize,
203        min_samples: usize,
204    ) -> Self {
205        self.min_auto_dwt97_batch_jobs = min_jobs;
206        self.min_auto_dwt97_batch_samples = min_samples;
207        self
208    }
209
210    /// Number of reversible 5/3 jobs offered to this accelerator.
211    #[must_use]
212    pub const fn reversible_dwt53_attempts(&self) -> usize {
213        self.reversible_dwt53_attempts
214    }
215
216    /// Number of reversible 5/3 jobs handled on the GPU.
217    #[must_use]
218    pub const fn reversible_dwt53_dispatches(&self) -> usize {
219        self.reversible_dwt53_dispatches
220    }
221
222    /// Number of reversible 5/3 batches offered to this accelerator.
223    #[must_use]
224    pub const fn reversible_dwt53_batch_attempts(&self) -> usize {
225        self.reversible_dwt53_batch_attempts
226    }
227
228    /// Number of reversible 5/3 batches handled on the GPU.
229    #[must_use]
230    pub const fn reversible_dwt53_batch_dispatches(&self) -> usize {
231        self.reversible_dwt53_batch_dispatches
232    }
233
234    /// Number of float 5/3 jobs offered to this accelerator.
235    #[must_use]
236    pub const fn dwt53_attempts(&self) -> usize {
237        self.dwt53_attempts
238    }
239
240    /// Number of float 5/3 jobs handled on the GPU.
241    #[must_use]
242    pub const fn dwt53_dispatches(&self) -> usize {
243        self.dwt53_dispatches
244    }
245
246    /// Number of 9/7 jobs offered to this accelerator.
247    #[must_use]
248    pub const fn dwt97_attempts(&self) -> usize {
249        self.dwt97_attempts
250    }
251
252    /// Number of 9/7 jobs handled on the GPU.
253    #[must_use]
254    pub const fn dwt97_dispatches(&self) -> usize {
255        self.dwt97_dispatches
256    }
257
258    /// Number of 9/7 batches offered to this accelerator.
259    #[must_use]
260    pub const fn dwt97_batch_attempts(&self) -> usize {
261        self.dwt97_batch_attempts
262    }
263
264    /// Number of 9/7 batches handled on the GPU.
265    #[must_use]
266    pub const fn dwt97_batch_dispatches(&self) -> usize {
267        self.dwt97_batch_dispatches
268    }
269
270    /// Number of prequantized 9/7 HTJ2K code-block batches offered.
271    #[must_use]
272    pub const fn htj2k97_codeblock_batch_attempts(&self) -> usize {
273        self.htj2k97_codeblock_batch_attempts
274    }
275
276    /// Number of prequantized 9/7 HTJ2K code-block batches handled on the GPU.
277    #[must_use]
278    pub const fn htj2k97_codeblock_batch_dispatches(&self) -> usize {
279        self.htj2k97_codeblock_batch_dispatches
280    }
281
282    /// Outcome for a job that CUDA cannot serve, resolved by dispatch mode.
283    #[cfg(not(feature = "cuda-runtime"))]
284    fn unavailable<T>(&self) -> Result<Option<T>, TranscodeStageError> {
285        match self.mode {
286            CudaDispatchMode::Explicit => Err(TranscodeStageError::DeviceUnavailable),
287            CudaDispatchMode::Auto => Ok(None),
288        }
289    }
290
291    /// Map a CUDA dispatch error to the trait outcome for the current mode:
292    /// Auto recovers from recoverable errors with `Ok(None)`; Explicit and hard
293    /// kernel failures propagate as `Err`.
294    #[cfg(feature = "cuda-runtime")]
295    fn recover<T>(&self, error: CudaTranscodeError) -> Result<Option<T>, TranscodeStageError> {
296        if self.mode == CudaDispatchMode::Auto && error.is_recoverable() {
297            Ok(None)
298        } else {
299            Err(error.into())
300        }
301    }
302}
303
304fn reversible_batch_total_samples(jobs: &[DctGridToReversibleDwt53Job<'_>]) -> usize {
305    jobs.iter().fold(0usize, |total, job| {
306        total.saturating_add(job.width.saturating_mul(job.height))
307    })
308}
309
310fn dwt97_batch_total_samples(jobs: &[DctGridToDwt97Job<'_>]) -> usize {
311    jobs.iter().fold(0usize, |total, job| {
312        total.saturating_add(job.width.saturating_mul(job.height))
313    })
314}
315
316fn htj2k97_codeblock_batch_total_samples(jobs: &[DctGridToHtj2k97CodeBlockJob<'_>]) -> usize {
317    jobs.iter().fold(0usize, |total, job| {
318        total.saturating_add(job.width.saturating_mul(job.height))
319    })
320}
321
322fn htj2k97_i16_codeblock_batch_total_samples(
323    jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
324) -> usize {
325    jobs.iter().fold(0usize, |total, job| {
326        total.saturating_add(job.width.saturating_mul(job.height))
327    })
328}
329
330fn htj2k97_i16_codeblock_batch_group_total_samples(
331    groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
332) -> usize {
333    groups.iter().fold(0usize, |total, group| {
334        total.saturating_add(htj2k97_i16_codeblock_batch_total_samples(group.jobs))
335    })
336}
337
338impl Default for CudaDctToWaveletStageAccelerator {
339    fn default() -> Self {
340        Self::for_auto()
341    }
342}
343
344impl DctToWaveletStageAccelerator for CudaDctToWaveletStageAccelerator {
345    fn supports_dwt97_batch(&self) -> bool {
346        true
347    }
348
349    // The fused DCT->9/7->prequantized-codeblock path runs the staged 9/7
350    // kernels followed by per-subband deadzone quantization into code-block-major
351    // layout, mirroring the local Metal backend.
352    fn supports_htj2k97_codeblock_batch(&self) -> bool {
353        true
354    }
355
356    fn supports_htj2k97_i16_preencoded_batch(&self) -> bool {
357        self.resident_ht_encode
358    }
359
360    fn supports_htj2k97_compact_preencoded_batch(&self) -> bool {
361        self.resident_ht_encode && std::env::var_os(DISABLE_COMPACT_PREENCODED_ENV).is_none()
362    }
363
364    fn dct_grid_to_reversible_dwt53(
365        &mut self,
366        job: DctGridToReversibleDwt53Job<'_>,
367    ) -> Result<Option<ReversibleDwt53FirstLevel>, TranscodeStageError> {
368        self.reversible_dwt53_attempts = self.reversible_dwt53_attempts.saturating_add(1);
369
370        if self.mode == CudaDispatchMode::Auto
371            && job.width.saturating_mul(job.height) < self.min_auto_samples
372        {
373            return Ok(None);
374        }
375
376        #[cfg(not(feature = "cuda-runtime"))]
377        {
378            let _ = job;
379            self.unavailable()
380        }
381
382        #[cfg(feature = "cuda-runtime")]
383        {
384            match cuda::dispatch_reversible_dwt53(self.cuda_session(), job) {
385                Ok(output) => {
386                    self.reversible_dwt53_dispatches =
387                        self.reversible_dwt53_dispatches.saturating_add(1);
388                    Ok(Some(output))
389                }
390                Err(error) => self.recover(error),
391            }
392        }
393    }
394
395    fn dct_grid_to_reversible_dwt53_batch(
396        &mut self,
397        jobs: &[DctGridToReversibleDwt53Job<'_>],
398    ) -> Result<Option<Vec<ReversibleDwt53FirstLevel>>, TranscodeStageError> {
399        self.reversible_dwt53_batch_attempts =
400            self.reversible_dwt53_batch_attempts.saturating_add(1);
401
402        if jobs.is_empty() {
403            return Ok(Some(Vec::new()));
404        }
405        if self.mode == CudaDispatchMode::Auto
406            && (jobs.len() < self.min_auto_reversible_batch_jobs
407                || reversible_batch_total_samples(jobs) < self.min_auto_reversible_batch_samples)
408        {
409            return Ok(None);
410        }
411
412        #[cfg(not(feature = "cuda-runtime"))]
413        {
414            let _ = jobs;
415            self.unavailable()
416        }
417
418        #[cfg(feature = "cuda-runtime")]
419        {
420            match cuda::dispatch_reversible_dwt53_batch(self.cuda_session(), jobs) {
421                Ok(output) => {
422                    self.reversible_dwt53_batch_dispatches =
423                        self.reversible_dwt53_batch_dispatches.saturating_add(1);
424                    Ok(Some(output))
425                }
426                Err(error) => self.recover(error),
427            }
428        }
429    }
430
431    fn dct_grid_to_dwt53(
432        &mut self,
433        job: DctGridToDwt53Job<'_>,
434    ) -> Result<Option<Dwt53TwoDimensional<f64>>, TranscodeStageError> {
435        self.dwt53_attempts = self.dwt53_attempts.saturating_add(1);
436
437        if self.mode == CudaDispatchMode::Auto
438            && job.width.saturating_mul(job.height) < self.min_auto_samples
439        {
440            return Ok(None);
441        }
442
443        #[cfg(not(feature = "cuda-runtime"))]
444        {
445            let _ = job;
446            self.unavailable()
447        }
448
449        #[cfg(feature = "cuda-runtime")]
450        {
451            match cuda::dispatch_dwt53(job) {
452                Ok(output) => {
453                    self.dwt53_dispatches = self.dwt53_dispatches.saturating_add(1);
454                    Ok(Some(output))
455                }
456                Err(error) => self.recover(error),
457            }
458        }
459    }
460
461    fn dct_grid_to_dwt97(
462        &mut self,
463        job: DctGridToDwt97Job<'_>,
464    ) -> Result<Option<Dwt97TwoDimensional<f64>>, TranscodeStageError> {
465        self.dwt97_attempts = self.dwt97_attempts.saturating_add(1);
466
467        if self.mode == CudaDispatchMode::Auto
468            && job.width.saturating_mul(job.height) < self.min_auto_samples
469        {
470            return Ok(None);
471        }
472
473        #[cfg(not(feature = "cuda-runtime"))]
474        {
475            let _ = job;
476            self.unavailable()
477        }
478
479        #[cfg(feature = "cuda-runtime")]
480        {
481            match cuda::dispatch_dwt97(self.cuda_session(), job) {
482                Ok(output) => {
483                    self.dwt97_dispatches = self.dwt97_dispatches.saturating_add(1);
484                    Ok(Some(output))
485                }
486                Err(error) => self.recover(error),
487            }
488        }
489    }
490
491    fn dct_grid_to_dwt97_batch(
492        &mut self,
493        jobs: &[DctGridToDwt97Job<'_>],
494    ) -> Result<Option<Vec<Dwt97TwoDimensional<f64>>>, TranscodeStageError> {
495        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
496        self.last_dwt97_batch_stage_timings = None;
497
498        if jobs.is_empty() {
499            return Ok(Some(Vec::new()));
500        }
501        if self.mode == CudaDispatchMode::Auto
502            && (jobs.len() < self.min_auto_dwt97_batch_jobs
503                || dwt97_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
504        {
505            return Ok(None);
506        }
507
508        #[cfg(not(feature = "cuda-runtime"))]
509        {
510            let _ = jobs;
511            self.unavailable()
512        }
513
514        #[cfg(feature = "cuda-runtime")]
515        {
516            match cuda::dispatch_dwt97_batch(self.cuda_session(), jobs) {
517                Ok((output, timings)) => {
518                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
519                    self.last_dwt97_batch_stage_timings = Some(timings);
520                    Ok(Some(output))
521                }
522                Err(error) => self.recover(error),
523            }
524        }
525    }
526
527    fn dct_grid_to_htj2k97_codeblock_batch(
528        &mut self,
529        jobs: &[DctGridToHtj2k97CodeBlockJob<'_>],
530        options: Htj2k97CodeBlockOptions,
531    ) -> Result<Option<Vec<PrequantizedHtj2k97Component>>, TranscodeStageError> {
532        // The code-block path is a staged 9/7 batch plus quantization, so it
533        // counts as both a 9/7 batch and a code-block batch (matching Metal).
534        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
535        self.htj2k97_codeblock_batch_attempts =
536            self.htj2k97_codeblock_batch_attempts.saturating_add(1);
537        self.last_dwt97_batch_stage_timings = None;
538
539        if jobs.is_empty() {
540            return Ok(Some(Vec::new()));
541        }
542        if self.mode == CudaDispatchMode::Auto
543            && (jobs.len() < self.min_auto_dwt97_batch_jobs
544                || htj2k97_codeblock_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
545        {
546            return Ok(None);
547        }
548
549        #[cfg(not(feature = "cuda-runtime"))]
550        {
551            let _ = (jobs, options);
552            self.unavailable()
553        }
554
555        #[cfg(feature = "cuda-runtime")]
556        {
557            match cuda::dispatch_htj2k97_codeblock_batch(self.cuda_session(), jobs, options) {
558                Ok((output, timings)) => {
559                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
560                    self.htj2k97_codeblock_batch_dispatches =
561                        self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
562                    self.last_dwt97_batch_stage_timings = Some(timings);
563                    Ok(Some(output))
564                }
565                Err(error) => self.recover(error),
566            }
567        }
568    }
569
570    fn dct_grid_to_htj2k97_preencoded_batch(
571        &mut self,
572        jobs: &[DctGridToHtj2k97CodeBlockJob<'_>],
573        options: Htj2k97CodeBlockOptions,
574    ) -> Result<Option<Vec<PreencodedHtj2k97Component>>, TranscodeStageError> {
575        if !self.resident_ht_encode {
576            return Ok(None);
577        }
578
579        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
580        self.htj2k97_codeblock_batch_attempts =
581            self.htj2k97_codeblock_batch_attempts.saturating_add(1);
582        self.last_dwt97_batch_stage_timings = None;
583
584        if jobs.is_empty() {
585            return Ok(Some(Vec::new()));
586        }
587        if self.mode == CudaDispatchMode::Auto
588            && (jobs.len() < self.min_auto_dwt97_batch_jobs
589                || htj2k97_codeblock_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
590        {
591            return Ok(None);
592        }
593
594        #[cfg(not(feature = "cuda-runtime"))]
595        {
596            let _ = (jobs, options);
597            self.unavailable()
598        }
599
600        #[cfg(feature = "cuda-runtime")]
601        {
602            match cuda::dispatch_htj2k97_preencoded_batch(self.cuda_session(), jobs, options) {
603                Ok((output, timings)) => {
604                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
605                    self.htj2k97_codeblock_batch_dispatches =
606                        self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
607                    self.last_dwt97_batch_stage_timings = Some(timings);
608                    Ok(Some(output))
609                }
610                Err(error) => self.recover(error),
611            }
612        }
613    }
614
615    fn dct_grid_i16_to_htj2k97_preencoded_batch(
616        &mut self,
617        jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
618        options: Htj2k97CodeBlockOptions,
619    ) -> Result<Option<Vec<PreencodedHtj2k97Component>>, TranscodeStageError> {
620        if !self.resident_ht_encode {
621            return Ok(None);
622        }
623
624        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
625        self.htj2k97_codeblock_batch_attempts =
626            self.htj2k97_codeblock_batch_attempts.saturating_add(1);
627        self.last_dwt97_batch_stage_timings = None;
628
629        if jobs.is_empty() {
630            return Ok(Some(Vec::new()));
631        }
632        if self.mode == CudaDispatchMode::Auto
633            && (jobs.len() < self.min_auto_dwt97_batch_jobs
634                || htj2k97_i16_codeblock_batch_total_samples(jobs)
635                    < self.min_auto_dwt97_batch_samples)
636        {
637            return Ok(None);
638        }
639
640        #[cfg(not(feature = "cuda-runtime"))]
641        {
642            let _ = (jobs, options);
643            self.unavailable()
644        }
645
646        #[cfg(feature = "cuda-runtime")]
647        {
648            match cuda::dispatch_htj2k97_preencoded_i16_batch(self.cuda_session(), jobs, options) {
649                Ok((output, timings)) => {
650                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
651                    self.htj2k97_codeblock_batch_dispatches =
652                        self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
653                    self.last_dwt97_batch_stage_timings = Some(timings);
654                    Ok(Some(output))
655                }
656                Err(error) => self.recover(error),
657            }
658        }
659    }
660
661    fn dct_grid_i16_to_htj2k97_compact_preencoded_batch(
662        &mut self,
663        jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
664        options: Htj2k97CodeBlockOptions,
665    ) -> Result<Option<PreencodedHtj2k97CompactBatch>, TranscodeStageError> {
666        if !self.resident_ht_encode {
667            return Ok(None);
668        }
669
670        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
671        self.htj2k97_codeblock_batch_attempts =
672            self.htj2k97_codeblock_batch_attempts.saturating_add(1);
673        self.last_dwt97_batch_stage_timings = None;
674
675        if jobs.is_empty() {
676            return Ok(Some(PreencodedHtj2k97CompactBatch {
677                payload: Vec::new(),
678                components: Vec::new(),
679            }));
680        }
681        if self.mode == CudaDispatchMode::Auto
682            && (jobs.len() < self.min_auto_dwt97_batch_jobs
683                || htj2k97_i16_codeblock_batch_total_samples(jobs)
684                    < self.min_auto_dwt97_batch_samples)
685        {
686            return Ok(None);
687        }
688
689        #[cfg(not(feature = "cuda-runtime"))]
690        {
691            let _ = (jobs, options);
692            self.unavailable()
693        }
694
695        #[cfg(feature = "cuda-runtime")]
696        {
697            match cuda::dispatch_htj2k97_compact_preencoded_i16_batch(
698                self.cuda_session(),
699                jobs,
700                options,
701            ) {
702                Ok((output, timings)) => {
703                    self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
704                    self.htj2k97_codeblock_batch_dispatches =
705                        self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
706                    self.last_dwt97_batch_stage_timings = Some(timings);
707                    Ok(Some(output))
708                }
709                Err(error) => self.recover(error),
710            }
711        }
712    }
713
714    fn dct_grid_i16_to_htj2k97_preencoded_batch_groups(
715        &mut self,
716        groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
717        options: Htj2k97CodeBlockOptions,
718    ) -> Result<Option<Vec<Vec<PreencodedHtj2k97Component>>>, TranscodeStageError> {
719        if !self.resident_ht_encode {
720            return Ok(None);
721        }
722
723        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(groups.len());
724        self.htj2k97_codeblock_batch_attempts = self
725            .htj2k97_codeblock_batch_attempts
726            .saturating_add(groups.len());
727        self.last_dwt97_batch_stage_timings = None;
728
729        if groups.is_empty() {
730            return Ok(Some(Vec::new()));
731        }
732        let total_jobs = groups.iter().map(|group| group.jobs.len()).sum::<usize>();
733        if self.mode == CudaDispatchMode::Auto
734            && (total_jobs < self.min_auto_dwt97_batch_jobs
735                || htj2k97_i16_codeblock_batch_group_total_samples(groups)
736                    < self.min_auto_dwt97_batch_samples)
737        {
738            return Ok(None);
739        }
740
741        #[cfg(not(feature = "cuda-runtime"))]
742        {
743            let _ = (groups, options);
744            self.unavailable()
745        }
746
747        #[cfg(feature = "cuda-runtime")]
748        {
749            match cuda::dispatch_htj2k97_preencoded_i16_batch_groups(
750                self.cuda_session(),
751                groups,
752                options,
753            ) {
754                Ok((output, timings)) => {
755                    self.dwt97_batch_dispatches =
756                        self.dwt97_batch_dispatches.saturating_add(groups.len());
757                    self.htj2k97_codeblock_batch_dispatches = self
758                        .htj2k97_codeblock_batch_dispatches
759                        .saturating_add(timings.ht_codeblock_dispatches);
760                    self.last_dwt97_batch_stage_timings = Some(timings);
761                    Ok(Some(output))
762                }
763                Err(error) => self.recover(error),
764            }
765        }
766    }
767
768    fn dct_grid_i16_to_htj2k97_compact_preencoded_batch_groups(
769        &mut self,
770        groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
771        options: Htj2k97CodeBlockOptions,
772    ) -> Result<Option<PreencodedHtj2k97CompactBatchGroups>, TranscodeStageError> {
773        if !self.resident_ht_encode {
774            return Ok(None);
775        }
776
777        self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(groups.len());
778        self.htj2k97_codeblock_batch_attempts = self
779            .htj2k97_codeblock_batch_attempts
780            .saturating_add(groups.len());
781        self.last_dwt97_batch_stage_timings = None;
782
783        if groups.is_empty() {
784            return Ok(Some(PreencodedHtj2k97CompactBatchGroups {
785                payload: Vec::new(),
786                groups: Vec::new(),
787            }));
788        }
789        let total_jobs = groups.iter().map(|group| group.jobs.len()).sum::<usize>();
790        if self.mode == CudaDispatchMode::Auto
791            && (total_jobs < self.min_auto_dwt97_batch_jobs
792                || htj2k97_i16_codeblock_batch_group_total_samples(groups)
793                    < self.min_auto_dwt97_batch_samples)
794        {
795            return Ok(None);
796        }
797
798        #[cfg(not(feature = "cuda-runtime"))]
799        {
800            let _ = (groups, options);
801            self.unavailable()
802        }
803
804        #[cfg(feature = "cuda-runtime")]
805        {
806            match cuda::dispatch_htj2k97_compact_preencoded_i16_batch_groups(
807                self.cuda_session(),
808                groups,
809                options,
810            ) {
811                Ok((output, timings)) => {
812                    self.dwt97_batch_dispatches =
813                        self.dwt97_batch_dispatches.saturating_add(groups.len());
814                    self.htj2k97_codeblock_batch_dispatches = self
815                        .htj2k97_codeblock_batch_dispatches
816                        .saturating_add(timings.ht_codeblock_dispatches);
817                    self.last_dwt97_batch_stage_timings = Some(timings);
818                    Ok(Some(output))
819                }
820                Err(error) => self.recover(error),
821            }
822        }
823    }
824
825    fn last_dwt97_batch_stage_timings(&self) -> Option<Dwt97BatchStageTimings> {
826        self.last_dwt97_batch_stage_timings
827    }
828}
829
830#[cfg(test)]
831mod tests {
832    use super::*;
833    use std::sync::Mutex;
834
    /// Serializes tests that read or mutate process environment variables;
    /// such tests take this lock before touching the environment.
    static ENV_LOCK: Mutex<()> = Mutex::new(());
836
837    fn test_htj2k97_codeblock_options() -> Htj2k97CodeBlockOptions {
838        Htj2k97CodeBlockOptions {
839            bit_depth: 8,
840            guard_bits: 2,
841            code_block_width_exp: 4,
842            code_block_height_exp: 4,
843            irreversible_quantization_scale: 1.0,
844            irreversible_quantization_subband_scales:
845                signinum_transcode::accelerator::IrreversibleQuantizationSubbandScales::default(),
846        }
847    }
848
849    #[test]
850    fn explicit_mode_without_cuda_runtime_errors_on_reversible_job() {
851        // Without the cuda-runtime feature, Explicit mode must surface a typed
852        // error rather than silently using the scalar fallback.
853        let mut accelerator = CudaDctToWaveletStageAccelerator::new_explicit();
854        let blocks: Vec<[i16; 64]> = vec![[0i16; 64]];
855        let job = DctGridToReversibleDwt53Job {
856            dequantized_blocks: &blocks,
857            block_cols: 1,
858            block_rows: 1,
859            width: 8,
860            height: 8,
861        };
862        let result = accelerator.dct_grid_to_reversible_dwt53(job);
863        #[cfg(not(feature = "cuda-runtime"))]
864        assert_eq!(result, Err(TranscodeStageError::DeviceUnavailable));
865        let _ = result;
866        assert_eq!(accelerator.reversible_dwt53_attempts(), 1);
867    }
868
869    #[test]
870    fn auto_mode_falls_back_to_scalar_for_small_jobs() {
871        // Auto mode returns Ok(None) for sub-threshold jobs so the transcode
872        // pipeline uses its scalar oracle.
873        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto();
874        let blocks: Vec<[i16; 64]> = vec![[0i16; 64]];
875        let job = DctGridToReversibleDwt53Job {
876            dequantized_blocks: &blocks,
877            block_cols: 1,
878            block_rows: 1,
879            width: 8,
880            height: 8,
881        };
882        assert_eq!(accelerator.dct_grid_to_reversible_dwt53(job), Ok(None));
883    }
884
885    #[test]
886    fn empty_batches_return_empty_without_dispatch() {
887        let mut accelerator = CudaDctToWaveletStageAccelerator::new_explicit();
888        assert_eq!(
889            accelerator.dct_grid_to_reversible_dwt53_batch(&[]),
890            Ok(Some(Vec::new()))
891        );
892        assert_eq!(
893            accelerator.dct_grid_to_dwt97_batch(&[]),
894            Ok(Some(Vec::new()))
895        );
896    }
897
898    #[test]
899    fn compact_preencoded_support_obeys_cuda_env_gate() {
900        let _guard = ENV_LOCK.lock().expect("env lock");
901        let previous = std::env::var_os(DISABLE_COMPACT_PREENCODED_ENV);
902        std::env::remove_var(DISABLE_COMPACT_PREENCODED_ENV);
903        let accelerator = CudaDctToWaveletStageAccelerator::new_explicit_resident_ht_encode();
904        assert!(accelerator.supports_htj2k97_i16_preencoded_batch());
905        assert!(accelerator.supports_htj2k97_compact_preencoded_batch());
906
907        std::env::set_var(DISABLE_COMPACT_PREENCODED_ENV, "1");
908        let accelerator = CudaDctToWaveletStageAccelerator::new_explicit_resident_ht_encode();
909        assert!(accelerator.supports_htj2k97_i16_preencoded_batch());
910        assert!(!accelerator.supports_htj2k97_compact_preencoded_batch());
911
912        if let Some(previous) = previous {
913            std::env::set_var(DISABLE_COMPACT_PREENCODED_ENV, previous);
914        } else {
915            std::env::remove_var(DISABLE_COMPACT_PREENCODED_ENV);
916        }
917    }
918
919    #[test]
920    fn auto_mode_declines_under_amortized_reversible_batches() {
921        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
922            .with_auto_reversible_batch_thresholds(2, 224 * 224 * 2);
923        let blocks = vec![[0i16; 64]; 256 * 256 / 64];
924        let job = DctGridToReversibleDwt53Job {
925            dequantized_blocks: &blocks,
926            block_cols: 32,
927            block_rows: 32,
928            width: 256,
929            height: 256,
930        };
931
932        assert_eq!(
933            accelerator.dct_grid_to_reversible_dwt53_batch(&[job]),
934            Ok(None)
935        );
936        assert_eq!(accelerator.reversible_dwt53_batch_attempts(), 1);
937        assert_eq!(accelerator.reversible_dwt53_batch_dispatches(), 0);
938    }
939
940    #[test]
941    fn auto_mode_declines_under_amortized_dwt97_batches() {
942        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
943            .with_auto_dwt97_batch_thresholds(2, 224 * 224 * 2);
944        let blocks = vec![[[0.0f64; 8]; 8]; 256 * 256 / 64];
945        let job = DctGridToDwt97Job {
946            blocks: &blocks,
947            block_cols: 32,
948            block_rows: 32,
949            width: 256,
950            height: 256,
951        };
952
953        assert_eq!(accelerator.dct_grid_to_dwt97_batch(&[job]), Ok(None));
954        assert_eq!(accelerator.dwt97_batch_attempts(), 1);
955        assert_eq!(accelerator.dwt97_batch_dispatches(), 0);
956    }
957
958    #[test]
959    fn auto_mode_declines_under_amortized_htj2k97_codeblock_batches() {
960        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
961            .with_auto_dwt97_batch_thresholds(2, 224 * 224 * 2);
962        let blocks = vec![[[0.0f64; 8]; 8]; 256 * 256 / 64];
963        let job = DctGridToHtj2k97CodeBlockJob {
964            blocks: &blocks,
965            block_cols: 32,
966            block_rows: 32,
967            width: 256,
968            height: 256,
969            x_rsiz: 1,
970            y_rsiz: 1,
971        };
972
973        let result = accelerator
974            .dct_grid_to_htj2k97_codeblock_batch(&[job], test_htj2k97_codeblock_options());
975        assert!(matches!(result, Ok(None)));
976        assert_eq!(accelerator.dwt97_batch_attempts(), 1);
977        assert_eq!(accelerator.dwt97_batch_dispatches(), 0);
978        assert_eq!(accelerator.htj2k97_codeblock_batch_attempts(), 1);
979        assert_eq!(accelerator.htj2k97_codeblock_batch_dispatches(), 0);
980    }
981}