1#[cfg(feature = "cuda-runtime")]
18mod cuda;
19
20use core::fmt;
21
22use signinum_transcode::accelerator::{
23 DctGridI16ToHtj2k97CodeBlockBatch, DctGridI16ToHtj2k97CodeBlockJob, DctGridToDwt53Job,
24 DctGridToDwt97Job, DctGridToHtj2k97CodeBlockJob, DctGridToReversibleDwt53Job,
25 DctToWaveletStageAccelerator, Dwt97BatchStageTimings, Htj2k97CodeBlockOptions,
26 PreencodedHtj2k97CompactBatch, PreencodedHtj2k97CompactBatchGroups, PreencodedHtj2k97Component,
27 PrequantizedHtj2k97Component, ReversibleDwt53FirstLevel, TranscodeStageError,
28};
29use signinum_transcode::dct53_2d::Dwt53TwoDimensional;
30use signinum_transcode::dct97_2d::Dwt97TwoDimensional;
31
/// Human-readable message shown for [`CudaTranscodeError::CudaUnavailable`].
pub const CUDA_UNAVAILABLE: &str = "CUDA is unavailable on this host";

/// Default `width * height` sample floor for single jobs in auto mode; smaller
/// jobs are declined with `Ok(None)` instead of being sent to CUDA.
const DEFAULT_AUTO_MIN_SAMPLES: usize = 224 * 224;
/// Default minimum number of jobs a reversible 5/3 batch needs in auto mode.
const DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_JOBS: usize = 32;
/// Default minimum total sample count a reversible 5/3 batch needs in auto mode.
const DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_SAMPLES: usize = 224 * 224 * 32;
/// Default minimum number of jobs a 9/7 (and HTJ2K 9/7) batch needs in auto mode.
const DEFAULT_AUTO_DWT97_BATCH_MIN_JOBS: usize = 32;
/// Default minimum total sample count a 9/7 (and HTJ2K 9/7) batch needs in auto mode.
const DEFAULT_AUTO_DWT97_BATCH_MIN_SAMPLES: usize = 224 * 224 * 32;
/// Environment variable that, when set to any value, makes
/// `supports_htj2k97_compact_preencoded_batch` report `false`.
const DISABLE_COMPACT_PREENCODED_ENV: &str = "SIGNINUM_CUDA_DISABLE_COMPACT_PREENCODED";
43
/// Errors reported by the CUDA transcode backend.
///
/// Converts into [`TranscodeStageError`] via `From`, and displays either
/// [`CUDA_UNAVAILABLE`] or the carried static reason string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaTranscodeError {
    /// CUDA cannot be used on this host; maps to `TranscodeStageError::DeviceUnavailable`.
    CudaUnavailable,
    /// The job is not supported by the CUDA path; the payload is the reason and
    /// maps to `TranscodeStageError::Unsupported`.
    UnsupportedJob(&'static str),
    /// A backend failure described by the payload; maps to
    /// `TranscodeStageError::Backend` and is never treated as recoverable.
    Kernel(&'static str),
}
54
55impl CudaTranscodeError {
56 #[cfg(feature = "cuda-runtime")]
59 const fn is_recoverable(self) -> bool {
60 matches!(self, Self::CudaUnavailable | Self::UnsupportedJob(_))
61 }
62}
63
64impl fmt::Display for CudaTranscodeError {
65 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
66 match self {
67 Self::CudaUnavailable => f.write_str(CUDA_UNAVAILABLE),
68 Self::UnsupportedJob(reason) | Self::Kernel(reason) => f.write_str(reason),
69 }
70 }
71}
72
73impl From<CudaTranscodeError> for TranscodeStageError {
74 fn from(error: CudaTranscodeError) -> Self {
75 match error {
76 CudaTranscodeError::CudaUnavailable => Self::DeviceUnavailable,
77 CudaTranscodeError::UnsupportedJob(reason) => Self::Unsupported(reason),
78 CudaTranscodeError::Kernel(reason) => Self::Backend(reason.to_string()),
79 }
80 }
81}
82
// Marker impl: `Debug` and `Display` above supply everything `Error` requires.
impl std::error::Error for CudaTranscodeError {}
84
/// How the accelerator reacts when CUDA is not used for a job.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CudaDispatchMode {
    /// Every job is sent to CUDA regardless of size; a missing runtime or any
    /// backend error is returned to the caller as an error.
    Explicit,
    /// Jobs below the configured thresholds, a missing runtime, and recoverable
    /// backend errors all yield `Ok(None)` instead of an error.
    Auto,
}
93
/// CUDA-backed implementation of [`DctToWaveletStageAccelerator`] that also
/// counts how often each entry point was attempted and actually dispatched.
#[derive(Debug, Clone)]
pub struct CudaDctToWaveletStageAccelerator {
    /// Dispatch policy (explicit vs. auto).
    mode: CudaDispatchMode,
    /// Auto mode: minimum `width * height` for single-job entry points.
    min_auto_samples: usize,
    /// Auto mode: minimum job count for reversible 5/3 batches.
    min_auto_reversible_batch_jobs: usize,
    /// Auto mode: minimum total sample count for reversible 5/3 batches.
    min_auto_reversible_batch_samples: usize,
    /// Auto mode: minimum job count for 9/7 and HTJ2K 9/7 batches.
    min_auto_dwt97_batch_jobs: usize,
    /// Auto mode: minimum total sample count for 9/7 and HTJ2K 9/7 batches.
    min_auto_dwt97_batch_samples: usize,
    /// Calls to `dct_grid_to_reversible_dwt53`, counted before any threshold check.
    reversible_dwt53_attempts: usize,
    /// Successful CUDA dispatches of `dct_grid_to_reversible_dwt53`.
    reversible_dwt53_dispatches: usize,
    /// Calls to `dct_grid_to_reversible_dwt53_batch`.
    reversible_dwt53_batch_attempts: usize,
    /// Successful CUDA dispatches of `dct_grid_to_reversible_dwt53_batch`.
    reversible_dwt53_batch_dispatches: usize,
    /// Calls to `dct_grid_to_dwt53`.
    dwt53_attempts: usize,
    /// Successful CUDA dispatches of `dct_grid_to_dwt53`.
    dwt53_dispatches: usize,
    /// Calls to `dct_grid_to_dwt97`.
    dwt97_attempts: usize,
    /// Successful CUDA dispatches of `dct_grid_to_dwt97`.
    dwt97_dispatches: usize,
    /// 9/7 batch attempts; the HTJ2K 9/7 batch entry points also bump this counter.
    dwt97_batch_attempts: usize,
    /// 9/7 batch dispatches; the HTJ2K 9/7 batch entry points also bump this counter.
    dwt97_batch_dispatches: usize,
    /// Attempts across the HTJ2K 9/7 code-block / pre-encoded batch entry points.
    htj2k97_codeblock_batch_attempts: usize,
    /// Successful dispatches across the HTJ2K 9/7 batch entry points.
    htj2k97_codeblock_batch_dispatches: usize,
    /// Timings from the most recent successful 9/7-family batch dispatch; reset
    /// to `None` at the start of every such batch call.
    last_dwt97_batch_stage_timings: Option<Dwt97BatchStageTimings>,
    /// Enables the pre-encoded HTJ2K entry points; when `false` they return `Ok(None)`.
    resident_ht_encode: bool,
    /// CUDA session, created on first use by `cuda_session`.
    #[cfg(feature = "cuda-runtime")]
    session: Option<cuda::CudaTranscodeSession>,
}
120
121impl CudaDctToWaveletStageAccelerator {
122 #[must_use]
125 pub const fn new_explicit() -> Self {
126 Self::with_mode(CudaDispatchMode::Explicit, 0)
127 }
128
129 #[must_use]
133 pub const fn new_explicit_resident_ht_encode() -> Self {
134 let mut accelerator = Self::with_mode(CudaDispatchMode::Explicit, 0);
135 accelerator.resident_ht_encode = true;
136 accelerator
137 }
138
139 #[must_use]
142 pub const fn for_auto() -> Self {
143 let mut accelerator = Self::with_mode(CudaDispatchMode::Auto, DEFAULT_AUTO_MIN_SAMPLES);
144 accelerator.min_auto_reversible_batch_jobs = DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_JOBS;
145 accelerator.min_auto_reversible_batch_samples = DEFAULT_AUTO_REVERSIBLE_BATCH_MIN_SAMPLES;
146 accelerator.min_auto_dwt97_batch_jobs = DEFAULT_AUTO_DWT97_BATCH_MIN_JOBS;
147 accelerator.min_auto_dwt97_batch_samples = DEFAULT_AUTO_DWT97_BATCH_MIN_SAMPLES;
148 accelerator
149 }
150
151 const fn with_mode(mode: CudaDispatchMode, min_auto_samples: usize) -> Self {
152 Self {
153 mode,
154 min_auto_samples,
155 min_auto_reversible_batch_jobs: 0,
156 min_auto_reversible_batch_samples: 0,
157 min_auto_dwt97_batch_jobs: 0,
158 min_auto_dwt97_batch_samples: 0,
159 reversible_dwt53_attempts: 0,
160 reversible_dwt53_dispatches: 0,
161 reversible_dwt53_batch_attempts: 0,
162 reversible_dwt53_batch_dispatches: 0,
163 dwt53_attempts: 0,
164 dwt53_dispatches: 0,
165 dwt97_attempts: 0,
166 dwt97_dispatches: 0,
167 dwt97_batch_attempts: 0,
168 dwt97_batch_dispatches: 0,
169 htj2k97_codeblock_batch_attempts: 0,
170 htj2k97_codeblock_batch_dispatches: 0,
171 last_dwt97_batch_stage_timings: None,
172 resident_ht_encode: false,
173 #[cfg(feature = "cuda-runtime")]
174 session: None,
175 }
176 }
177
178 #[cfg(feature = "cuda-runtime")]
179 fn cuda_session(&mut self) -> &mut cuda::CudaTranscodeSession {
180 self.session
181 .get_or_insert_with(cuda::CudaTranscodeSession::default)
182 }
183
184 #[must_use]
187 pub const fn with_auto_reversible_batch_thresholds(
188 mut self,
189 min_jobs: usize,
190 min_samples: usize,
191 ) -> Self {
192 self.min_auto_reversible_batch_jobs = min_jobs;
193 self.min_auto_reversible_batch_samples = min_samples;
194 self
195 }
196
197 #[must_use]
200 pub const fn with_auto_dwt97_batch_thresholds(
201 mut self,
202 min_jobs: usize,
203 min_samples: usize,
204 ) -> Self {
205 self.min_auto_dwt97_batch_jobs = min_jobs;
206 self.min_auto_dwt97_batch_samples = min_samples;
207 self
208 }
209
210 #[must_use]
212 pub const fn reversible_dwt53_attempts(&self) -> usize {
213 self.reversible_dwt53_attempts
214 }
215
216 #[must_use]
218 pub const fn reversible_dwt53_dispatches(&self) -> usize {
219 self.reversible_dwt53_dispatches
220 }
221
222 #[must_use]
224 pub const fn reversible_dwt53_batch_attempts(&self) -> usize {
225 self.reversible_dwt53_batch_attempts
226 }
227
228 #[must_use]
230 pub const fn reversible_dwt53_batch_dispatches(&self) -> usize {
231 self.reversible_dwt53_batch_dispatches
232 }
233
234 #[must_use]
236 pub const fn dwt53_attempts(&self) -> usize {
237 self.dwt53_attempts
238 }
239
240 #[must_use]
242 pub const fn dwt53_dispatches(&self) -> usize {
243 self.dwt53_dispatches
244 }
245
246 #[must_use]
248 pub const fn dwt97_attempts(&self) -> usize {
249 self.dwt97_attempts
250 }
251
252 #[must_use]
254 pub const fn dwt97_dispatches(&self) -> usize {
255 self.dwt97_dispatches
256 }
257
258 #[must_use]
260 pub const fn dwt97_batch_attempts(&self) -> usize {
261 self.dwt97_batch_attempts
262 }
263
264 #[must_use]
266 pub const fn dwt97_batch_dispatches(&self) -> usize {
267 self.dwt97_batch_dispatches
268 }
269
270 #[must_use]
272 pub const fn htj2k97_codeblock_batch_attempts(&self) -> usize {
273 self.htj2k97_codeblock_batch_attempts
274 }
275
276 #[must_use]
278 pub const fn htj2k97_codeblock_batch_dispatches(&self) -> usize {
279 self.htj2k97_codeblock_batch_dispatches
280 }
281
282 #[cfg(not(feature = "cuda-runtime"))]
284 fn unavailable<T>(&self) -> Result<Option<T>, TranscodeStageError> {
285 match self.mode {
286 CudaDispatchMode::Explicit => Err(TranscodeStageError::DeviceUnavailable),
287 CudaDispatchMode::Auto => Ok(None),
288 }
289 }
290
291 #[cfg(feature = "cuda-runtime")]
295 fn recover<T>(&self, error: CudaTranscodeError) -> Result<Option<T>, TranscodeStageError> {
296 if self.mode == CudaDispatchMode::Auto && error.is_recoverable() {
297 Ok(None)
298 } else {
299 Err(error.into())
300 }
301 }
302}
303
304fn reversible_batch_total_samples(jobs: &[DctGridToReversibleDwt53Job<'_>]) -> usize {
305 jobs.iter().fold(0usize, |total, job| {
306 total.saturating_add(job.width.saturating_mul(job.height))
307 })
308}
309
310fn dwt97_batch_total_samples(jobs: &[DctGridToDwt97Job<'_>]) -> usize {
311 jobs.iter().fold(0usize, |total, job| {
312 total.saturating_add(job.width.saturating_mul(job.height))
313 })
314}
315
316fn htj2k97_codeblock_batch_total_samples(jobs: &[DctGridToHtj2k97CodeBlockJob<'_>]) -> usize {
317 jobs.iter().fold(0usize, |total, job| {
318 total.saturating_add(job.width.saturating_mul(job.height))
319 })
320}
321
322fn htj2k97_i16_codeblock_batch_total_samples(
323 jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
324) -> usize {
325 jobs.iter().fold(0usize, |total, job| {
326 total.saturating_add(job.width.saturating_mul(job.height))
327 })
328}
329
330fn htj2k97_i16_codeblock_batch_group_total_samples(
331 groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
332) -> usize {
333 groups.iter().fold(0usize, |total, group| {
334 total.saturating_add(htj2k97_i16_codeblock_batch_total_samples(group.jobs))
335 })
336}
337
impl Default for CudaDctToWaveletStageAccelerator {
    /// The default accelerator is the auto-mode one built by `for_auto`.
    fn default() -> Self {
        Self::for_auto()
    }
}
343
344impl DctToWaveletStageAccelerator for CudaDctToWaveletStageAccelerator {
345 fn supports_dwt97_batch(&self) -> bool {
346 true
347 }
348
349 fn supports_htj2k97_codeblock_batch(&self) -> bool {
353 true
354 }
355
356 fn supports_htj2k97_i16_preencoded_batch(&self) -> bool {
357 self.resident_ht_encode
358 }
359
360 fn supports_htj2k97_compact_preencoded_batch(&self) -> bool {
361 self.resident_ht_encode && std::env::var_os(DISABLE_COMPACT_PREENCODED_ENV).is_none()
362 }
363
364 fn dct_grid_to_reversible_dwt53(
365 &mut self,
366 job: DctGridToReversibleDwt53Job<'_>,
367 ) -> Result<Option<ReversibleDwt53FirstLevel>, TranscodeStageError> {
368 self.reversible_dwt53_attempts = self.reversible_dwt53_attempts.saturating_add(1);
369
370 if self.mode == CudaDispatchMode::Auto
371 && job.width.saturating_mul(job.height) < self.min_auto_samples
372 {
373 return Ok(None);
374 }
375
376 #[cfg(not(feature = "cuda-runtime"))]
377 {
378 let _ = job;
379 self.unavailable()
380 }
381
382 #[cfg(feature = "cuda-runtime")]
383 {
384 match cuda::dispatch_reversible_dwt53(self.cuda_session(), job) {
385 Ok(output) => {
386 self.reversible_dwt53_dispatches =
387 self.reversible_dwt53_dispatches.saturating_add(1);
388 Ok(Some(output))
389 }
390 Err(error) => self.recover(error),
391 }
392 }
393 }
394
395 fn dct_grid_to_reversible_dwt53_batch(
396 &mut self,
397 jobs: &[DctGridToReversibleDwt53Job<'_>],
398 ) -> Result<Option<Vec<ReversibleDwt53FirstLevel>>, TranscodeStageError> {
399 self.reversible_dwt53_batch_attempts =
400 self.reversible_dwt53_batch_attempts.saturating_add(1);
401
402 if jobs.is_empty() {
403 return Ok(Some(Vec::new()));
404 }
405 if self.mode == CudaDispatchMode::Auto
406 && (jobs.len() < self.min_auto_reversible_batch_jobs
407 || reversible_batch_total_samples(jobs) < self.min_auto_reversible_batch_samples)
408 {
409 return Ok(None);
410 }
411
412 #[cfg(not(feature = "cuda-runtime"))]
413 {
414 let _ = jobs;
415 self.unavailable()
416 }
417
418 #[cfg(feature = "cuda-runtime")]
419 {
420 match cuda::dispatch_reversible_dwt53_batch(self.cuda_session(), jobs) {
421 Ok(output) => {
422 self.reversible_dwt53_batch_dispatches =
423 self.reversible_dwt53_batch_dispatches.saturating_add(1);
424 Ok(Some(output))
425 }
426 Err(error) => self.recover(error),
427 }
428 }
429 }
430
431 fn dct_grid_to_dwt53(
432 &mut self,
433 job: DctGridToDwt53Job<'_>,
434 ) -> Result<Option<Dwt53TwoDimensional<f64>>, TranscodeStageError> {
435 self.dwt53_attempts = self.dwt53_attempts.saturating_add(1);
436
437 if self.mode == CudaDispatchMode::Auto
438 && job.width.saturating_mul(job.height) < self.min_auto_samples
439 {
440 return Ok(None);
441 }
442
443 #[cfg(not(feature = "cuda-runtime"))]
444 {
445 let _ = job;
446 self.unavailable()
447 }
448
449 #[cfg(feature = "cuda-runtime")]
450 {
451 match cuda::dispatch_dwt53(job) {
452 Ok(output) => {
453 self.dwt53_dispatches = self.dwt53_dispatches.saturating_add(1);
454 Ok(Some(output))
455 }
456 Err(error) => self.recover(error),
457 }
458 }
459 }
460
461 fn dct_grid_to_dwt97(
462 &mut self,
463 job: DctGridToDwt97Job<'_>,
464 ) -> Result<Option<Dwt97TwoDimensional<f64>>, TranscodeStageError> {
465 self.dwt97_attempts = self.dwt97_attempts.saturating_add(1);
466
467 if self.mode == CudaDispatchMode::Auto
468 && job.width.saturating_mul(job.height) < self.min_auto_samples
469 {
470 return Ok(None);
471 }
472
473 #[cfg(not(feature = "cuda-runtime"))]
474 {
475 let _ = job;
476 self.unavailable()
477 }
478
479 #[cfg(feature = "cuda-runtime")]
480 {
481 match cuda::dispatch_dwt97(self.cuda_session(), job) {
482 Ok(output) => {
483 self.dwt97_dispatches = self.dwt97_dispatches.saturating_add(1);
484 Ok(Some(output))
485 }
486 Err(error) => self.recover(error),
487 }
488 }
489 }
490
491 fn dct_grid_to_dwt97_batch(
492 &mut self,
493 jobs: &[DctGridToDwt97Job<'_>],
494 ) -> Result<Option<Vec<Dwt97TwoDimensional<f64>>>, TranscodeStageError> {
495 self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
496 self.last_dwt97_batch_stage_timings = None;
497
498 if jobs.is_empty() {
499 return Ok(Some(Vec::new()));
500 }
501 if self.mode == CudaDispatchMode::Auto
502 && (jobs.len() < self.min_auto_dwt97_batch_jobs
503 || dwt97_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
504 {
505 return Ok(None);
506 }
507
508 #[cfg(not(feature = "cuda-runtime"))]
509 {
510 let _ = jobs;
511 self.unavailable()
512 }
513
514 #[cfg(feature = "cuda-runtime")]
515 {
516 match cuda::dispatch_dwt97_batch(self.cuda_session(), jobs) {
517 Ok((output, timings)) => {
518 self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
519 self.last_dwt97_batch_stage_timings = Some(timings);
520 Ok(Some(output))
521 }
522 Err(error) => self.recover(error),
523 }
524 }
525 }
526
527 fn dct_grid_to_htj2k97_codeblock_batch(
528 &mut self,
529 jobs: &[DctGridToHtj2k97CodeBlockJob<'_>],
530 options: Htj2k97CodeBlockOptions,
531 ) -> Result<Option<Vec<PrequantizedHtj2k97Component>>, TranscodeStageError> {
532 self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
535 self.htj2k97_codeblock_batch_attempts =
536 self.htj2k97_codeblock_batch_attempts.saturating_add(1);
537 self.last_dwt97_batch_stage_timings = None;
538
539 if jobs.is_empty() {
540 return Ok(Some(Vec::new()));
541 }
542 if self.mode == CudaDispatchMode::Auto
543 && (jobs.len() < self.min_auto_dwt97_batch_jobs
544 || htj2k97_codeblock_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
545 {
546 return Ok(None);
547 }
548
549 #[cfg(not(feature = "cuda-runtime"))]
550 {
551 let _ = (jobs, options);
552 self.unavailable()
553 }
554
555 #[cfg(feature = "cuda-runtime")]
556 {
557 match cuda::dispatch_htj2k97_codeblock_batch(self.cuda_session(), jobs, options) {
558 Ok((output, timings)) => {
559 self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
560 self.htj2k97_codeblock_batch_dispatches =
561 self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
562 self.last_dwt97_batch_stage_timings = Some(timings);
563 Ok(Some(output))
564 }
565 Err(error) => self.recover(error),
566 }
567 }
568 }
569
570 fn dct_grid_to_htj2k97_preencoded_batch(
571 &mut self,
572 jobs: &[DctGridToHtj2k97CodeBlockJob<'_>],
573 options: Htj2k97CodeBlockOptions,
574 ) -> Result<Option<Vec<PreencodedHtj2k97Component>>, TranscodeStageError> {
575 if !self.resident_ht_encode {
576 return Ok(None);
577 }
578
579 self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
580 self.htj2k97_codeblock_batch_attempts =
581 self.htj2k97_codeblock_batch_attempts.saturating_add(1);
582 self.last_dwt97_batch_stage_timings = None;
583
584 if jobs.is_empty() {
585 return Ok(Some(Vec::new()));
586 }
587 if self.mode == CudaDispatchMode::Auto
588 && (jobs.len() < self.min_auto_dwt97_batch_jobs
589 || htj2k97_codeblock_batch_total_samples(jobs) < self.min_auto_dwt97_batch_samples)
590 {
591 return Ok(None);
592 }
593
594 #[cfg(not(feature = "cuda-runtime"))]
595 {
596 let _ = (jobs, options);
597 self.unavailable()
598 }
599
600 #[cfg(feature = "cuda-runtime")]
601 {
602 match cuda::dispatch_htj2k97_preencoded_batch(self.cuda_session(), jobs, options) {
603 Ok((output, timings)) => {
604 self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
605 self.htj2k97_codeblock_batch_dispatches =
606 self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
607 self.last_dwt97_batch_stage_timings = Some(timings);
608 Ok(Some(output))
609 }
610 Err(error) => self.recover(error),
611 }
612 }
613 }
614
615 fn dct_grid_i16_to_htj2k97_preencoded_batch(
616 &mut self,
617 jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
618 options: Htj2k97CodeBlockOptions,
619 ) -> Result<Option<Vec<PreencodedHtj2k97Component>>, TranscodeStageError> {
620 if !self.resident_ht_encode {
621 return Ok(None);
622 }
623
624 self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
625 self.htj2k97_codeblock_batch_attempts =
626 self.htj2k97_codeblock_batch_attempts.saturating_add(1);
627 self.last_dwt97_batch_stage_timings = None;
628
629 if jobs.is_empty() {
630 return Ok(Some(Vec::new()));
631 }
632 if self.mode == CudaDispatchMode::Auto
633 && (jobs.len() < self.min_auto_dwt97_batch_jobs
634 || htj2k97_i16_codeblock_batch_total_samples(jobs)
635 < self.min_auto_dwt97_batch_samples)
636 {
637 return Ok(None);
638 }
639
640 #[cfg(not(feature = "cuda-runtime"))]
641 {
642 let _ = (jobs, options);
643 self.unavailable()
644 }
645
646 #[cfg(feature = "cuda-runtime")]
647 {
648 match cuda::dispatch_htj2k97_preencoded_i16_batch(self.cuda_session(), jobs, options) {
649 Ok((output, timings)) => {
650 self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
651 self.htj2k97_codeblock_batch_dispatches =
652 self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
653 self.last_dwt97_batch_stage_timings = Some(timings);
654 Ok(Some(output))
655 }
656 Err(error) => self.recover(error),
657 }
658 }
659 }
660
661 fn dct_grid_i16_to_htj2k97_compact_preencoded_batch(
662 &mut self,
663 jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
664 options: Htj2k97CodeBlockOptions,
665 ) -> Result<Option<PreencodedHtj2k97CompactBatch>, TranscodeStageError> {
666 if !self.resident_ht_encode {
667 return Ok(None);
668 }
669
670 self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(1);
671 self.htj2k97_codeblock_batch_attempts =
672 self.htj2k97_codeblock_batch_attempts.saturating_add(1);
673 self.last_dwt97_batch_stage_timings = None;
674
675 if jobs.is_empty() {
676 return Ok(Some(PreencodedHtj2k97CompactBatch {
677 payload: Vec::new(),
678 components: Vec::new(),
679 }));
680 }
681 if self.mode == CudaDispatchMode::Auto
682 && (jobs.len() < self.min_auto_dwt97_batch_jobs
683 || htj2k97_i16_codeblock_batch_total_samples(jobs)
684 < self.min_auto_dwt97_batch_samples)
685 {
686 return Ok(None);
687 }
688
689 #[cfg(not(feature = "cuda-runtime"))]
690 {
691 let _ = (jobs, options);
692 self.unavailable()
693 }
694
695 #[cfg(feature = "cuda-runtime")]
696 {
697 match cuda::dispatch_htj2k97_compact_preencoded_i16_batch(
698 self.cuda_session(),
699 jobs,
700 options,
701 ) {
702 Ok((output, timings)) => {
703 self.dwt97_batch_dispatches = self.dwt97_batch_dispatches.saturating_add(1);
704 self.htj2k97_codeblock_batch_dispatches =
705 self.htj2k97_codeblock_batch_dispatches.saturating_add(1);
706 self.last_dwt97_batch_stage_timings = Some(timings);
707 Ok(Some(output))
708 }
709 Err(error) => self.recover(error),
710 }
711 }
712 }
713
714 fn dct_grid_i16_to_htj2k97_preencoded_batch_groups(
715 &mut self,
716 groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
717 options: Htj2k97CodeBlockOptions,
718 ) -> Result<Option<Vec<Vec<PreencodedHtj2k97Component>>>, TranscodeStageError> {
719 if !self.resident_ht_encode {
720 return Ok(None);
721 }
722
723 self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(groups.len());
724 self.htj2k97_codeblock_batch_attempts = self
725 .htj2k97_codeblock_batch_attempts
726 .saturating_add(groups.len());
727 self.last_dwt97_batch_stage_timings = None;
728
729 if groups.is_empty() {
730 return Ok(Some(Vec::new()));
731 }
732 let total_jobs = groups.iter().map(|group| group.jobs.len()).sum::<usize>();
733 if self.mode == CudaDispatchMode::Auto
734 && (total_jobs < self.min_auto_dwt97_batch_jobs
735 || htj2k97_i16_codeblock_batch_group_total_samples(groups)
736 < self.min_auto_dwt97_batch_samples)
737 {
738 return Ok(None);
739 }
740
741 #[cfg(not(feature = "cuda-runtime"))]
742 {
743 let _ = (groups, options);
744 self.unavailable()
745 }
746
747 #[cfg(feature = "cuda-runtime")]
748 {
749 match cuda::dispatch_htj2k97_preencoded_i16_batch_groups(
750 self.cuda_session(),
751 groups,
752 options,
753 ) {
754 Ok((output, timings)) => {
755 self.dwt97_batch_dispatches =
756 self.dwt97_batch_dispatches.saturating_add(groups.len());
757 self.htj2k97_codeblock_batch_dispatches = self
758 .htj2k97_codeblock_batch_dispatches
759 .saturating_add(timings.ht_codeblock_dispatches);
760 self.last_dwt97_batch_stage_timings = Some(timings);
761 Ok(Some(output))
762 }
763 Err(error) => self.recover(error),
764 }
765 }
766 }
767
768 fn dct_grid_i16_to_htj2k97_compact_preencoded_batch_groups(
769 &mut self,
770 groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
771 options: Htj2k97CodeBlockOptions,
772 ) -> Result<Option<PreencodedHtj2k97CompactBatchGroups>, TranscodeStageError> {
773 if !self.resident_ht_encode {
774 return Ok(None);
775 }
776
777 self.dwt97_batch_attempts = self.dwt97_batch_attempts.saturating_add(groups.len());
778 self.htj2k97_codeblock_batch_attempts = self
779 .htj2k97_codeblock_batch_attempts
780 .saturating_add(groups.len());
781 self.last_dwt97_batch_stage_timings = None;
782
783 if groups.is_empty() {
784 return Ok(Some(PreencodedHtj2k97CompactBatchGroups {
785 payload: Vec::new(),
786 groups: Vec::new(),
787 }));
788 }
789 let total_jobs = groups.iter().map(|group| group.jobs.len()).sum::<usize>();
790 if self.mode == CudaDispatchMode::Auto
791 && (total_jobs < self.min_auto_dwt97_batch_jobs
792 || htj2k97_i16_codeblock_batch_group_total_samples(groups)
793 < self.min_auto_dwt97_batch_samples)
794 {
795 return Ok(None);
796 }
797
798 #[cfg(not(feature = "cuda-runtime"))]
799 {
800 let _ = (groups, options);
801 self.unavailable()
802 }
803
804 #[cfg(feature = "cuda-runtime")]
805 {
806 match cuda::dispatch_htj2k97_compact_preencoded_i16_batch_groups(
807 self.cuda_session(),
808 groups,
809 options,
810 ) {
811 Ok((output, timings)) => {
812 self.dwt97_batch_dispatches =
813 self.dwt97_batch_dispatches.saturating_add(groups.len());
814 self.htj2k97_codeblock_batch_dispatches = self
815 .htj2k97_codeblock_batch_dispatches
816 .saturating_add(timings.ht_codeblock_dispatches);
817 self.last_dwt97_batch_stage_timings = Some(timings);
818 Ok(Some(output))
819 }
820 Err(error) => self.recover(error),
821 }
822 }
823 }
824
825 fn last_dwt97_batch_stage_timings(&self) -> Option<Dwt97BatchStageTimings> {
826 self.last_dwt97_batch_stage_timings
827 }
828}
829
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::OsString;
    use std::sync::{Mutex, PoisonError};

    /// Serialises tests that mutate process-wide environment variables.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Restores an environment variable to its captured state when dropped, so
    /// a failing assertion cannot leak a modified value into later tests.
    struct EnvVarRestore {
        key: &'static str,
        previous: Option<OsString>,
    }

    impl EnvVarRestore {
        /// Remembers the current value (or absence) of `key`.
        fn capture(key: &'static str) -> Self {
            Self {
                key,
                previous: std::env::var_os(key),
            }
        }
    }

    impl Drop for EnvVarRestore {
        fn drop(&mut self) {
            match self.previous.take() {
                Some(value) => std::env::set_var(self.key, value),
                None => std::env::remove_var(self.key),
            }
        }
    }

    /// Code-block options shared by the HTJ2K 9/7 tests.
    fn test_htj2k97_codeblock_options() -> Htj2k97CodeBlockOptions {
        Htj2k97CodeBlockOptions {
            bit_depth: 8,
            guard_bits: 2,
            code_block_width_exp: 4,
            code_block_height_exp: 4,
            irreversible_quantization_scale: 1.0,
            irreversible_quantization_subband_scales:
                signinum_transcode::accelerator::IrreversibleQuantizationSubbandScales::default(),
        }
    }

    #[test]
    fn explicit_mode_without_cuda_runtime_errors_on_reversible_job() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::new_explicit();
        let blocks: Vec<[i16; 64]> = vec![[0i16; 64]];
        let job = DctGridToReversibleDwt53Job {
            dequantized_blocks: &blocks,
            block_cols: 1,
            block_rows: 1,
            width: 8,
            height: 8,
        };
        let result = accelerator.dct_grid_to_reversible_dwt53(job);
        // The outcome is only pinned down when the CUDA runtime is compiled out.
        #[cfg(not(feature = "cuda-runtime"))]
        assert_eq!(result, Err(TranscodeStageError::DeviceUnavailable));
        let _ = result;
        assert_eq!(accelerator.reversible_dwt53_attempts(), 1);
    }

    #[test]
    fn auto_mode_falls_back_to_scalar_for_small_jobs() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto();
        // An 8x8 job is far below the default auto-mode sample floor.
        let blocks: Vec<[i16; 64]> = vec![[0i16; 64]];
        let job = DctGridToReversibleDwt53Job {
            dequantized_blocks: &blocks,
            block_cols: 1,
            block_rows: 1,
            width: 8,
            height: 8,
        };
        assert_eq!(accelerator.dct_grid_to_reversible_dwt53(job), Ok(None));
    }

    #[test]
    fn empty_batches_return_empty_without_dispatch() {
        let mut accelerator = CudaDctToWaveletStageAccelerator::new_explicit();
        assert_eq!(
            accelerator.dct_grid_to_reversible_dwt53_batch(&[]),
            Ok(Some(Vec::new()))
        );
        assert_eq!(
            accelerator.dct_grid_to_dwt97_batch(&[]),
            Ok(Some(Vec::new()))
        );
    }

    #[test]
    fn compact_preencoded_support_obeys_cuda_env_gate() {
        // Tolerate a poisoned lock: an earlier failure must not cascade into this test.
        let _guard = ENV_LOCK.lock().unwrap_or_else(PoisonError::into_inner);
        // Declared after the lock guard so it is dropped first: the variable is
        // restored while the lock is still held, even if an assertion panics.
        let _restore = EnvVarRestore::capture(DISABLE_COMPACT_PREENCODED_ENV);

        std::env::remove_var(DISABLE_COMPACT_PREENCODED_ENV);
        let accelerator = CudaDctToWaveletStageAccelerator::new_explicit_resident_ht_encode();
        assert!(accelerator.supports_htj2k97_i16_preencoded_batch());
        assert!(accelerator.supports_htj2k97_compact_preencoded_batch());

        std::env::set_var(DISABLE_COMPACT_PREENCODED_ENV, "1");
        let accelerator = CudaDctToWaveletStageAccelerator::new_explicit_resident_ht_encode();
        assert!(accelerator.supports_htj2k97_i16_preencoded_batch());
        assert!(!accelerator.supports_htj2k97_compact_preencoded_batch());
    }

    #[test]
    fn auto_mode_declines_under_amortized_reversible_batches() {
        // One 256x256 job: enough samples, but below the two-job minimum.
        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
            .with_auto_reversible_batch_thresholds(2, 224 * 224 * 2);
        let blocks = vec![[0i16; 64]; 256 * 256 / 64];
        let job = DctGridToReversibleDwt53Job {
            dequantized_blocks: &blocks,
            block_cols: 32,
            block_rows: 32,
            width: 256,
            height: 256,
        };

        assert_eq!(
            accelerator.dct_grid_to_reversible_dwt53_batch(&[job]),
            Ok(None)
        );
        assert_eq!(accelerator.reversible_dwt53_batch_attempts(), 1);
        assert_eq!(accelerator.reversible_dwt53_batch_dispatches(), 0);
    }

    #[test]
    fn auto_mode_declines_under_amortized_dwt97_batches() {
        // One 256x256 job: enough samples, but below the two-job minimum.
        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
            .with_auto_dwt97_batch_thresholds(2, 224 * 224 * 2);
        let blocks = vec![[[0.0f64; 8]; 8]; 256 * 256 / 64];
        let job = DctGridToDwt97Job {
            blocks: &blocks,
            block_cols: 32,
            block_rows: 32,
            width: 256,
            height: 256,
        };

        assert_eq!(accelerator.dct_grid_to_dwt97_batch(&[job]), Ok(None));
        assert_eq!(accelerator.dwt97_batch_attempts(), 1);
        assert_eq!(accelerator.dwt97_batch_dispatches(), 0);
    }

    #[test]
    fn auto_mode_declines_under_amortized_htj2k97_codeblock_batches() {
        // One 256x256 job: enough samples, but below the two-job minimum.
        let mut accelerator = CudaDctToWaveletStageAccelerator::for_auto()
            .with_auto_dwt97_batch_thresholds(2, 224 * 224 * 2);
        let blocks = vec![[[0.0f64; 8]; 8]; 256 * 256 / 64];
        let job = DctGridToHtj2k97CodeBlockJob {
            blocks: &blocks,
            block_cols: 32,
            block_rows: 32,
            width: 256,
            height: 256,
            x_rsiz: 1,
            y_rsiz: 1,
        };

        let result = accelerator
            .dct_grid_to_htj2k97_codeblock_batch(&[job], test_htj2k97_codeblock_options());
        assert!(matches!(result, Ok(None)));
        // The HTJ2K entry point counts against both the 9/7 and code-block counters.
        assert_eq!(accelerator.dwt97_batch_attempts(), 1);
        assert_eq!(accelerator.dwt97_batch_dispatches(), 0);
        assert_eq!(accelerator.htj2k97_codeblock_batch_attempts(), 1);
        assert_eq!(accelerator.htj2k97_codeblock_batch_dispatches(), 0);
    }
}