1use j2k::adapter::encode_stage::{
4 EncodedHtJ2kCodeBlock, EncodedJ2kCodeBlock, J2kDeinterleaveToF32Job, J2kEncodeDispatchReport,
5 J2kEncodeStageAccelerator, J2kForwardDwt53Job, J2kForwardDwt53Output, J2kForwardDwt97Job,
6 J2kForwardDwt97Output, J2kForwardIctJob, J2kForwardRctJob, J2kHtCodeBlockEncodeJob,
7 J2kHtSubbandEncodeJob, J2kHtj2kTileEncodeJob, J2kPacketizationBlockCodingMode,
8 J2kPacketizationCodeBlock, J2kPacketizationEncodeJob, J2kPacketizationResolution,
9 J2kQuantizeSubbandJob, J2kTier1CodeBlockEncodeJob,
10};
11#[cfg(feature = "cuda-runtime")]
12use j2k::adapter::encode_stage::{
13 J2kForwardDwt53Level, J2kForwardDwt97Level, J2kPacketizationPacketDescriptor,
14 J2kPacketizationSubband,
15};
16use j2k_core::BackendKind;
17#[cfg(feature = "cuda-runtime")]
18use j2k_core::{DeviceSubmission, DeviceSubmitSession, PixelFormat, ReadySubmission};
19#[cfg(feature = "cuda-runtime")]
20use j2k_cuda_runtime::{
21 CudaContext, CudaDeviceBuffer, CudaDwt53LevelShape, CudaDwt53Output, CudaDwt97Output,
22 CudaError, CudaHtj2kEncodeCodeBlockJob, CudaHtj2kEncodeCodeBlockRegionJob,
23 CudaHtj2kEncodeResources, CudaHtj2kEncodeTables, CudaHtj2kPacketizationBlock,
24 CudaHtj2kPacketizationPacket, CudaHtj2kPacketizationSubband,
25 CudaHtj2kPacketizationSubbandTagState, CudaHtj2kPacketizationTagNodeState, CudaJ2kQuantizeJob,
26 CudaJ2kQuantizeSubbandRegionJob, CudaJ2kResidentComponents, CudaJ2kStridedInterleavedPixels,
27};
28#[cfg(feature = "cuda-runtime")]
29use std::{
30 sync::Arc,
31 time::{Duration, Instant},
32};
33
34use j2k_native::packet_math;
35
36use crate::profile;
37#[cfg(feature = "cuda-runtime")]
38use crate::{runtime::cuda_error, session::CudaSession};
39
40pub fn encode_j2k_lossless_with_cuda(
46 samples: j2k::J2kLosslessSamples<'_>,
47 options: &j2k::J2kLosslessEncodeOptions,
48) -> Result<j2k::EncodedJ2k, crate::Error> {
49 let strict_options = strict_cuda_encode_options(*options);
50 let profile_enabled = profile::profile_stages_enabled();
51 let mut accelerator = CudaEncodeStageAccelerator::with_profile_collection(profile_enabled);
52 let total_start = profile::profile_now(profile_enabled);
53 let encoded = j2k::encode_j2k_lossless_with_accelerator(
54 samples,
55 &strict_options,
56 BackendKind::Cuda,
57 &mut accelerator,
58 )?;
59 reject_non_cuda_encode_backend(&encoded)?;
60 if profile_enabled {
61 accelerator
62 .encode_profile_report(
63 &encoded,
64 samples.data.len(),
65 profile::elapsed_us(total_start),
66 )
67 .emit("encode");
68 }
69 Ok(encoded)
70}
71
72pub fn encode_j2k_lossless_with_cuda_and_profile(
74 samples: j2k::J2kLosslessSamples<'_>,
75 options: &j2k::J2kLosslessEncodeOptions,
76) -> Result<(j2k::EncodedJ2k, profile::CudaHtj2kEncodeProfileReport), crate::Error> {
77 let input_bytes = samples.data.len();
78 let strict_options = strict_cuda_encode_options(*options);
79 let mut accelerator = CudaEncodeStageAccelerator::with_profile_collection(true);
80 let total_start = profile::profile_now(true);
81 let encoded = j2k::encode_j2k_lossless_with_accelerator(
82 samples,
83 &strict_options,
84 BackendKind::Cuda,
85 &mut accelerator,
86 )?;
87 reject_non_cuda_encode_backend(&encoded)?;
88 let report =
89 accelerator.encode_profile_report(&encoded, input_bytes, profile::elapsed_us(total_start));
90 report.emit("encode");
91 Ok((encoded, report))
92}
93
94fn strict_cuda_encode_options(
95 options: j2k::J2kLosslessEncodeOptions,
96) -> j2k::J2kLosslessEncodeOptions {
97 options.with_backend(j2k::EncodeBackendPreference::RequireDevice)
98}
99
100fn reject_non_cuda_encode_backend(encoded: &j2k::EncodedJ2k) -> Result<(), crate::Error> {
101 if encoded.backend == BackendKind::Cuda {
102 Ok(())
103 } else {
104 Err(crate::Error::UnsupportedCudaRequest {
105 reason: "strict CUDA HTJ2K encode did not dispatch all required stages",
106 })
107 }
108}
109
/// Describes one tile of pixels, already resident in a CUDA device buffer,
/// that should be encoded losslessly.
///
/// `validate_cuda_encode_tile` enforces the constraints noted on the fields.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy, Debug)]
pub struct CudaLosslessEncodeTile<'a> {
    /// Device buffer containing the tile; the addressed byte range must fit
    /// within `buffer.byte_len()`.
    pub buffer: &'a CudaDeviceBuffer,
    /// Byte offset of the tile's first row inside `buffer`.
    pub byte_offset: usize,
    /// Input width; must be nonzero and currently equal to `output_width`.
    pub width: u32,
    /// Input height; must be nonzero and currently equal to `output_height`.
    pub height: u32,
    /// Distance in bytes between the starts of consecutive rows; must be at
    /// least `width * bytes_per_pixel` for `format`.
    pub pitch_bytes: usize,
    /// Width of the encoded image; must be nonzero.
    pub output_width: u32,
    /// Height of the encoded image; must be nonzero.
    pub output_height: u32,
    /// Pixel format of the tile; only 1- or 2-byte samples are accepted by
    /// `cuda_encode_format`.
    pub format: PixelFormat,
}
131
/// Flags describing which encode phases were reported as handled on the
/// device for a device-buffer encode.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CudaLosslessEncodeResidency {
    /// Set by `encode_lossless_cuda_tile_with_report` when at least one
    /// deinterleave dispatch was counted.
    pub coefficient_prep_used: bool,
    /// Set by `encode_lossless_cuda_tile_with_report` when at least one
    /// packetization dispatch was counted.
    pub packetization_used: bool,
    /// Always `false` in `encode_lossless_cuda_tile_with_report`.
    pub codestream_assembly_used: bool,
}
143
/// Result of a device-buffer lossless encode together with bookkeeping about
/// how the encode ran.
///
/// The notes on individual fields describe how
/// `encode_lossless_cuda_tile_with_report` populates them.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CudaLosslessEncodeOutcome {
    /// The encoded result returned by the J2K encoder.
    pub encoded: j2k::EncodedJ2k,
    /// Whether an input copy was used; reported as `false` by the
    /// device-buffer path.
    pub input_copy_used: bool,
    /// Which phases were reported as device-resident.
    pub resident: CudaLosslessEncodeResidency,
    /// Time attributed to input copies; `Duration::ZERO` on the device-buffer
    /// path.
    pub input_copy_duration: Duration,
    /// Wall-clock time measured from the start of the per-tile encode until
    /// the outcome is assembled.
    pub encode_duration: Duration,
    /// GPU-side duration when known; `None` on the device-buffer path.
    pub gpu_duration: Option<Duration>,
    /// Time attributed to validation; `Duration::ZERO` on the device-buffer
    /// path.
    pub validation_duration: Duration,
    /// Time attributed to host readback; `Duration::ZERO` on the
    /// device-buffer path.
    pub host_readback_duration: Duration,
    /// Accumulated per-stage timings reported by the accelerator.
    pub stage_timings: CudaEncodeStageTimings,
}
167
/// Handle for a submitted single-tile CUDA lossless encode.
///
/// `submit_lossless_from_cuda_buffer` builds it from an already-computed
/// result via `ReadySubmission::from_result`.
#[cfg(feature = "cuda-runtime")]
#[derive(Debug)]
pub struct SubmittedJ2kLosslessCudaEncode {
    /// The wrapped encode result (or error) handed back by `wait`.
    inner: ReadySubmission<j2k::EncodedJ2k, crate::Error>,
}
174
/// Handle for a submitted batch of CUDA lossless encodes.
///
/// `submit_lossless_from_cuda_buffers` builds it from an already-computed
/// result via `ReadySubmission::from_result`.
#[cfg(feature = "cuda-runtime")]
#[derive(Debug)]
pub struct SubmittedJ2kLosslessCudaEncodeBatch {
    /// The wrapped per-tile encode results (or the first error) handed back
    /// by `wait`.
    inner: ReadySubmission<Vec<j2k::EncodedJ2k>, crate::Error>,
}
181
#[cfg(feature = "cuda-runtime")]
impl DeviceSubmission for SubmittedJ2kLosslessCudaEncode {
    type Output = j2k::EncodedJ2k;
    type Error = crate::Error;

    /// Consumes the handle and returns the wrapped encode result.
    fn wait(self) -> Result<Self::Output, Self::Error> {
        let Self { inner } = self;
        inner.wait()
    }
}
191
#[cfg(feature = "cuda-runtime")]
impl DeviceSubmission for SubmittedJ2kLosslessCudaEncodeBatch {
    type Output = Vec<j2k::EncodedJ2k>;
    type Error = crate::Error;

    /// Consumes the handle and returns the wrapped batch result.
    fn wait(self) -> Result<Self::Output, Self::Error> {
        let Self { inner } = self;
        inner.wait()
    }
}
201
/// Encodes one device-resident tile and returns the encoded result.
///
/// This is `submit_lossless_from_cuda_buffer` followed immediately by `wait`.
///
/// # Errors
///
/// Returns whatever error the submission or its `wait` produces.
#[cfg(feature = "cuda-runtime")]
pub fn encode_lossless_from_cuda_buffer(
    tile: CudaLosslessEncodeTile<'_>,
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<j2k::EncodedJ2k, crate::Error> {
    let submission = submit_lossless_from_cuda_buffer(tile, options, session)?;
    submission.wait()
}
211
/// Runs a single-tile device-buffer encode and wraps its result in a
/// submission handle.
///
/// The encode is performed before this function returns; the handle carries
/// the finished result. An encode failure is stored inside the handle and
/// surfaces from `wait`, so this function itself always returns `Ok`.
#[cfg(feature = "cuda-runtime")]
pub fn submit_lossless_from_cuda_buffer(
    tile: CudaLosslessEncodeTile<'_>,
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<SubmittedJ2kLosslessCudaEncode, crate::Error> {
    let outcome = encode_lossless_from_cuda_buffer_with_report(tile, options, session);
    // Callers of the submit API only receive the encoded result, not the report.
    let encoded = outcome.map(|report| report.encoded);
    let inner = ReadySubmission::from_result(encoded);
    Ok(SubmittedJ2kLosslessCudaEncode { inner })
}
225
/// Validates and encodes one device-resident tile, returning the full outcome
/// report.
///
/// Options and tile are validated first; `session.record_submit()` is called
/// only once both checks pass.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` for rejected options or tile
/// geometry, and propagates errors from the encode itself.
#[cfg(feature = "cuda-runtime")]
pub fn encode_lossless_from_cuda_buffer_with_report(
    tile: CudaLosslessEncodeTile<'_>,
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<CudaLosslessEncodeOutcome, crate::Error> {
    let options = *options;
    validate_cuda_encode_options(options)?;
    validate_cuda_encode_tile(tile)?;
    session.record_submit();
    encode_lossless_cuda_tile_with_report(tile, options)
}
238
/// Encodes a batch of device-resident tiles and returns the encoded results
/// in input order.
///
/// This is `submit_lossless_from_cuda_buffers` followed immediately by `wait`.
///
/// # Errors
///
/// Returns whatever error the submission or its `wait` produces.
#[cfg(feature = "cuda-runtime")]
pub fn encode_lossless_from_cuda_buffers(
    tiles: &[CudaLosslessEncodeTile<'_>],
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<Vec<j2k::EncodedJ2k>, crate::Error> {
    let submission = submit_lossless_from_cuda_buffers(tiles, options, session)?;
    submission.wait()
}
248
/// Runs a batch device-buffer encode and wraps its result in a submission
/// handle.
///
/// The encodes are performed before this function returns. A failure is
/// stored inside the handle and surfaces from `wait`, so this function itself
/// always returns `Ok`.
#[cfg(feature = "cuda-runtime")]
pub fn submit_lossless_from_cuda_buffers(
    tiles: &[CudaLosslessEncodeTile<'_>],
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<SubmittedJ2kLosslessCudaEncodeBatch, crate::Error> {
    let encoded: Result<Vec<j2k::EncodedJ2k>, crate::Error> =
        match encode_lossless_from_cuda_buffers_with_report(tiles, options, session) {
            // Keep only the encoded result of each outcome, in order.
            Ok(outcomes) => Ok(outcomes
                .into_iter()
                .map(|outcome| outcome.encoded)
                .collect()),
            Err(error) => Err(error),
        };
    let inner = ReadySubmission::from_result(encoded);
    Ok(SubmittedJ2kLosslessCudaEncodeBatch { inner })
}
267
/// Validates and encodes each tile of a batch in order, returning one outcome
/// report per tile.
///
/// Tiles are validated and encoded one at a time, so an invalid tile stops the
/// batch only after every earlier tile has been encoded.
/// `session.record_submit()` is called once per tile that passes validation.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` for an empty batch, rejected
/// options, or rejected tile geometry, and propagates the first encode error.
#[cfg(feature = "cuda-runtime")]
pub fn encode_lossless_from_cuda_buffers_with_report(
    tiles: &[CudaLosslessEncodeTile<'_>],
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<Vec<CudaLosslessEncodeOutcome>, crate::Error> {
    if tiles.is_empty() {
        return Err(crate::Error::UnsupportedCudaRequest {
            reason: "J2K CUDA encode received an empty tile batch",
        });
    }
    let options = *options;
    validate_cuda_encode_options(options)?;

    let mut outcomes = Vec::with_capacity(tiles.len());
    for &tile in tiles {
        validate_cuda_encode_tile(tile)?;
        session.record_submit();
        outcomes.push(encode_lossless_cuda_tile_with_report(tile, options)?);
    }
    Ok(outcomes)
}
291
/// Checks that `options` are usable for a device-buffer encode.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` when the block coding mode
/// is not `HighThroughput` or when validation is not `External`. The block
/// coding mode is checked first.
#[cfg(feature = "cuda-runtime")]
fn validate_cuda_encode_options(
    options: j2k::J2kLosslessEncodeOptions,
) -> Result<(), crate::Error> {
    let rejection = if options.block_coding_mode != j2k::J2kBlockCodingMode::HighThroughput {
        Some("J2K CUDA device-buffer encode currently requires HTJ2K block coding")
    } else if options.validation != j2k::J2kEncodeValidation::External {
        Some("J2K CUDA device-buffer encode requires external validation to avoid host input readback")
    } else {
        None
    };
    match rejection {
        Some(reason) => Err(crate::Error::UnsupportedCudaRequest { reason }),
        None => Ok(()),
    }
}
308
/// Checks that `tile` describes a supported, in-bounds region of its device
/// buffer.
///
/// The checks run in this order: nonzero dimensions, input size equal to
/// output size, supported pixel format, pitch at least one row, and finally
/// that `byte_offset + pitch_bytes * (height - 1) + row_bytes` neither
/// overflows nor exceeds the buffer length.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` naming the first failed
/// check.
#[cfg(feature = "cuda-runtime")]
fn validate_cuda_encode_tile(tile: CudaLosslessEncodeTile<'_>) -> Result<(), crate::Error> {
    /// Builds the error used for every rejection in this function.
    fn unsupported(reason: &'static str) -> crate::Error {
        crate::Error::UnsupportedCudaRequest { reason }
    }

    let dimensions = [tile.width, tile.height, tile.output_width, tile.output_height];
    if dimensions.contains(&0) {
        return Err(unsupported("J2K CUDA encode tile dimensions must be nonzero"));
    }
    if (tile.width, tile.height) != (tile.output_width, tile.output_height) {
        return Err(unsupported(
            "J2K CUDA device-buffer encode does not yet support input padding",
        ));
    }

    let format = cuda_encode_format(tile.format)?;
    let row_bytes = (tile.width as usize)
        .checked_mul(format.bytes_per_pixel)
        .ok_or_else(|| unsupported("J2K CUDA encode row byte count overflow"))?;
    if tile.pitch_bytes < row_bytes {
        return Err(unsupported(
            "J2K CUDA encode tile pitch is shorter than one row",
        ));
    }

    // The last row only needs `row_bytes`, not a full pitch, so the span is
    // (height - 1) full pitches plus one row.
    let rows_before_last = tile.height.saturating_sub(1) as usize;
    let required_end = tile
        .pitch_bytes
        .checked_mul(rows_before_last)
        .and_then(|prefix| prefix.checked_add(row_bytes))
        .and_then(|span| tile.byte_offset.checked_add(span))
        .ok_or_else(|| unsupported("J2K CUDA encode input byte range overflow"))?;
    if required_end > tile.buffer.byte_len() {
        return Err(unsupported(
            "J2K CUDA encode input byte range exceeds buffer length",
        ));
    }
    Ok(())
}
352
/// Encoder-facing summary of a `PixelFormat`, produced by
/// `cuda_encode_format`.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy, Debug)]
struct CudaEncodeFormat {
    /// Channel count of the pixel format.
    components: u8,
    /// Sample bit depth: 8 for 1-byte samples, 16 for 2-byte samples.
    bit_depth: u8,
    /// Bytes occupied by one pixel, as reported by the pixel format.
    bytes_per_pixel: usize,
}
360
/// Derives the component count, bit depth and pixel size the encoder needs
/// from `format`.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` when the channel count does
/// not fit in a `u8`, or when samples are neither 1 nor 2 bytes wide.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_format(format: PixelFormat) -> Result<CudaEncodeFormat, crate::Error> {
    let Ok(components) = u8::try_from(format.channels()) else {
        return Err(crate::Error::UnsupportedCudaRequest {
            reason: "J2K CUDA encode received a pixel format with too many components",
        });
    };
    let bit_depth: u8 = match format.bytes_per_sample() {
        1 => 8,
        2 => 16,
        _ => {
            return Err(crate::Error::UnsupportedCudaRequest {
                reason: "J2K CUDA encode received an unsupported sample width",
            });
        }
    };
    let bytes_per_pixel = format.bytes_per_pixel();
    Ok(CudaEncodeFormat {
        components,
        bit_depth,
        bytes_per_pixel,
    })
}
382
/// Encodes one already-validated device-resident tile and assembles the
/// outcome report.
///
/// A zero-filled host buffer of the output size is allocated only to build the
/// `j2k::J2kLosslessSamples` descriptor; the pixels themselves are taken from
/// `tile` by `CudaDeviceBufferEncodeAccelerator`.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` for an unsupported format, a
/// descriptor length overflow, or a non-CUDA result backend; maps CUDA
/// resource-upload failures through `cuda_error`; and propagates encoder
/// errors.
#[cfg(feature = "cuda-runtime")]
fn encode_lossless_cuda_tile_with_report(
    tile: CudaLosslessEncodeTile<'_>,
    options: j2k::J2kLosslessEncodeOptions,
) -> Result<CudaLosslessEncodeOutcome, crate::Error> {
    let started = Instant::now();
    let format = cuda_encode_format(tile.format)?;

    let placeholder_len = (tile.output_width as usize)
        .checked_mul(tile.output_height as usize)
        .and_then(|pixel_count| pixel_count.checked_mul(format.bytes_per_pixel));
    let Some(placeholder_len) = placeholder_len else {
        return Err(crate::Error::UnsupportedCudaRequest {
            reason: "J2K CUDA encode sample descriptor length overflow",
        });
    };
    let placeholder = vec![0u8; placeholder_len];
    // NOTE(review): the meaning of the trailing `false` argument is not
    // visible here — confirm against `J2kLosslessSamples::new`.
    let samples = j2k::J2kLosslessSamples::new(
        &placeholder,
        tile.output_width,
        tile.output_height,
        format.components,
        format.bit_depth,
        false,
    )?;

    let context = tile.buffer.context();
    let resources = context
        .upload_htj2k_encode_resources(cuda_htj2k_encode_tables())
        .map_err(cuda_error)?;
    let mut accelerator = CudaDeviceBufferEncodeAccelerator {
        tile,
        context,
        resources,
        dispatch: J2kEncodeDispatchReport::default(),
        stage_timings: CudaEncodeStageTimings::default(),
    };

    let device_only_options = strict_cuda_encode_options(options);
    let encoded = j2k::encode_j2k_lossless_with_accelerator(
        samples,
        &device_only_options,
        BackendKind::Cuda,
        &mut accelerator,
    )?;
    reject_non_cuda_encode_backend(&encoded)?;

    let resident = CudaLosslessEncodeResidency {
        coefficient_prep_used: accelerator.dispatch.deinterleave > 0,
        packetization_used: accelerator.dispatch.packetization > 0,
        codestream_assembly_used: false,
    };
    Ok(CudaLosslessEncodeOutcome {
        encoded,
        input_copy_used: false,
        resident,
        input_copy_duration: Duration::ZERO,
        encode_duration: started.elapsed(),
        gpu_duration: None,
        validation_duration: Duration::ZERO,
        host_readback_duration: Duration::ZERO,
        stage_timings: accelerator.stage_timings,
    })
}
439
/// Stage accelerator that encodes an HTJ2K tile directly from a
/// device-resident input tile.
#[cfg(feature = "cuda-runtime")]
struct CudaDeviceBufferEncodeAccelerator<'a> {
    /// The device-resident input tile handed to the tile-body encoder.
    tile: CudaLosslessEncodeTile<'a>,
    /// CUDA context obtained from the tile's buffer.
    context: CudaContext,
    /// Encode resources uploaded to `context`.
    resources: CudaHtj2kEncodeResources,
    /// Running per-stage dispatch counters.
    dispatch: J2kEncodeDispatchReport,
    /// Running per-stage timings.
    stage_timings: CudaEncodeStageTimings,
}
448
#[cfg(feature = "cuda-runtime")]
impl J2kEncodeStageAccelerator for CudaDeviceBufferEncodeAccelerator<'_> {
    /// Returns a copy of the dispatch counters accumulated so far.
    fn dispatch_report(&self) -> J2kEncodeDispatchReport {
        self.dispatch
    }

    /// Encodes a whole HTJ2K tile from the device-resident input.
    ///
    /// Returns `Ok(None)` when `cuda_encode_htj2k_device_tile_body` declines
    /// the job. Otherwise the dispatch counts and timings it reports are
    /// folded into this accelerator's totals (saturating) and the tile bytes
    /// are returned.
    fn encode_htj2k_tile(
        &mut self,
        job: J2kHtj2kTileEncodeJob<'_>,
    ) -> core::result::Result<Option<Vec<u8>>, &'static str> {
        // NOTE(review): the meaning of the trailing `true` flag is defined by
        // `cuda_encode_htj2k_device_tile_body`, which is not visible here.
        let body = cuda_encode_htj2k_device_tile_body(
            &self.context,
            &self.resources,
            self.tile,
            job,
            true,
        )?;
        let encoded = match body {
            Some(encoded) => encoded,
            None => return Ok(None),
        };

        let counters = &mut self.dispatch;
        counters.deinterleave = counters
            .deinterleave
            .saturating_add(encoded.deinterleave_dispatches);
        counters.forward_rct = counters
            .forward_rct
            .saturating_add(encoded.forward_rct_dispatches);
        counters.forward_ict = counters
            .forward_ict
            .saturating_add(encoded.forward_ict_dispatches);
        counters.forward_dwt53 = counters
            .forward_dwt53
            .saturating_add(encoded.forward_dwt53_dispatches);
        counters.forward_dwt97 = counters
            .forward_dwt97
            .saturating_add(encoded.forward_dwt97_dispatches);
        counters.quantize_subband = counters
            .quantize_subband
            .saturating_add(encoded.quantize_dispatches);
        counters.ht_code_block = counters
            .ht_code_block
            .saturating_add(encoded.ht_code_block_dispatches);
        counters.packetization = counters
            .packetization
            .saturating_add(encoded.packetization_dispatches);

        self.stage_timings = self.stage_timings.saturating_add(encoded.timings);
        Ok(Some(encoded.tile_data))
    }
}
505
/// Stage accelerator state for CUDA encodes: a lazily created CUDA context
/// and encode resources, CPU-preference switches, and per-stage attempt,
/// dispatch and timing counters.
///
/// All counters start at zero and all switches start `false` via `Default`.
#[derive(Clone, Debug, Default)]
#[allow(clippy::struct_excessive_bools)]
pub struct CudaEncodeStageAccelerator {
    /// Cached CUDA context, created on first use by `cuda_context`.
    #[cfg(feature = "cuda-runtime")]
    context: Option<CudaContext>,
    /// Cached encode resources, uploaded on first use by
    /// `cuda_encode_resources`.
    #[cfg(feature = "cuda-runtime")]
    encode_resources: Option<Arc<CudaHtj2kEncodeResources>>,
    /// Whether per-stage timings should be collected.
    #[cfg_attr(not(feature = "cuda-runtime"), allow(dead_code))]
    collect_profile: bool,

    // Per-stage attempt counters.
    deinterleave_attempts: usize,
    forward_rct_attempts: usize,
    forward_ict_attempts: usize,
    forward_dwt53_attempts: usize,
    forward_dwt97_attempts: usize,
    htj2k_tile_attempts: usize,
    quantize_subband_attempts: usize,
    ht_subband_attempts: usize,
    tier1_code_block_attempts: usize,
    ht_code_block_attempts: usize,
    packetization_attempts: usize,

    // Switches set through the `prefer_cpu_*` builder methods.
    prefer_cpu_forward_rct: bool,
    prefer_cpu_ht_subband: bool,
    prefer_cpu_quantize_subband: bool,
    prefer_cpu_packetization: bool,

    // Per-stage dispatch counters.
    deinterleave_dispatches: usize,
    forward_rct_dispatches: usize,
    forward_ict_dispatches: usize,
    forward_dwt53_dispatches: usize,
    forward_dwt97_dispatches: usize,
    #[cfg_attr(not(feature = "cuda-runtime"), allow(dead_code))]
    htj2k_tile_dispatches: usize,
    quantize_subband_dispatches: usize,
    #[cfg_attr(not(feature = "cuda-runtime"), allow(dead_code))]
    ht_subband_dispatches: usize,
    tier1_code_block_dispatches: usize,
    ht_code_block_dispatches: usize,
    packetization_dispatches: usize,

    // Accumulated per-stage timings, in microseconds.
    deinterleave_us: u128,
    mct_us: u128,
    dwt_us: u128,
    quantize_us: u128,
    ht_encode_us: u128,
    packetize_us: u128,
}
551
552impl CudaEncodeStageAccelerator {
553 #[must_use]
555 pub fn with_profile_collection(collect_profile: bool) -> Self {
556 Self {
557 collect_profile,
558 ..Self::default()
559 }
560 }
561
562 #[must_use]
567 pub fn for_auto_host_output() -> Self {
568 Self::default()
569 .prefer_cpu_forward_rct(true)
570 .prefer_cpu_packetization(true)
571 }
572
573 #[must_use]
575 pub fn prefer_cpu_forward_rct(mut self, prefer_cpu_forward_rct: bool) -> Self {
576 self.prefer_cpu_forward_rct = prefer_cpu_forward_rct;
577 self
578 }
579
580 #[must_use]
586 pub fn prefer_cpu_packetization(mut self, prefer_cpu_packetization: bool) -> Self {
587 self.prefer_cpu_packetization = prefer_cpu_packetization;
588 self
589 }
590
591 #[must_use]
597 pub fn prefer_cpu_ht_subband(mut self, prefer_cpu_ht_subband: bool) -> Self {
598 self.prefer_cpu_ht_subband = prefer_cpu_ht_subband;
599 self
600 }
601
602 #[must_use]
609 pub fn prefer_cpu_quantize_subband(mut self, prefer_cpu_quantize_subband: bool) -> Self {
610 self.prefer_cpu_quantize_subband = prefer_cpu_quantize_subband;
611 self
612 }
613
614 #[must_use]
616 pub const fn collected_stage_timings(&self) -> CudaEncodeStageTimings {
617 CudaEncodeStageTimings {
618 deinterleave_us: self.deinterleave_us,
619 mct_us: self.mct_us,
620 dwt_us: self.dwt_us,
621 quantize_us: self.quantize_us,
622 ht_encode_us: self.ht_encode_us,
623 packetize_us: self.packetize_us,
624 }
625 }
626
627 pub fn reset_collected_stage_timings(&mut self) {
629 self.deinterleave_us = 0;
630 self.mct_us = 0;
631 self.dwt_us = 0;
632 self.quantize_us = 0;
633 self.ht_encode_us = 0;
634 self.packetize_us = 0;
635 }
636
637 #[cfg(feature = "cuda-runtime")]
638 fn cuda_context(&mut self) -> core::result::Result<Option<CudaContext>, &'static str> {
639 if self.context.is_none() {
640 match CudaContext::system_default() {
641 Ok(context) => self.context = Some(context),
642 Err(_) if cuda_runtime_required() => return Err("CUDA encode stage unavailable"),
643 Err(_) => return Ok(None),
644 }
645 }
646 Ok(self.context.clone())
647 }
648
649 #[cfg(feature = "cuda-runtime")]
650 fn cuda_encode_resources(
651 &mut self,
652 context: &CudaContext,
653 ) -> core::result::Result<Arc<CudaHtj2kEncodeResources>, &'static str> {
654 if self.encode_resources.is_none() {
655 let resources = context
656 .upload_htj2k_encode_resources(cuda_htj2k_encode_tables())
657 .map_err(|_| "CUDA HTJ2K encode resource upload failed")?;
658 self.encode_resources = Some(Arc::new(resources));
659 }
660 self.encode_resources
661 .clone()
662 .ok_or("CUDA HTJ2K encode resources unavailable")
663 }
664
665 fn encode_profile_report(
666 &self,
667 encoded: &j2k::EncodedJ2k,
668 input_bytes: usize,
669 total_us: u128,
670 ) -> profile::CudaHtj2kEncodeProfileReport {
671 profile::CudaHtj2kEncodeProfileReport {
672 deinterleave_us: self.deinterleave_us,
673 mct_us: self.mct_us,
674 dwt_us: self.dwt_us,
675 quantize_us: self.quantize_us,
676 ht_encode_us: self.ht_encode_us,
677 packetize_us: self.packetize_us,
678 total_us,
679 input_bytes,
680 codestream_bytes: encoded.codestream.len(),
681 block_count: self.ht_code_block_attempts,
682 dispatch_count: self.dispatch_report().total(),
683 backend: encoded.backend,
684 }
685 }
686
687 pub fn deinterleave_attempts(&self) -> usize {
689 self.deinterleave_attempts
690 }
691
692 pub fn forward_rct_attempts(&self) -> usize {
694 self.forward_rct_attempts
695 }
696
697 pub fn forward_ict_attempts(&self) -> usize {
699 self.forward_ict_attempts
700 }
701
702 pub fn forward_dwt53_attempts(&self) -> usize {
704 self.forward_dwt53_attempts
705 }
706
707 pub fn forward_dwt97_attempts(&self) -> usize {
709 self.forward_dwt97_attempts
710 }
711
712 pub fn quantize_subband_attempts(&self) -> usize {
714 self.quantize_subband_attempts
715 }
716
717 pub fn tier1_code_block_attempts(&self) -> usize {
719 self.tier1_code_block_attempts
720 }
721
722 pub fn ht_code_block_attempts(&self) -> usize {
724 self.ht_code_block_attempts
725 }
726
727 pub fn packetization_attempts(&self) -> usize {
729 self.packetization_attempts
730 }
731
732 pub fn deinterleave_dispatches(&self) -> usize {
734 self.deinterleave_dispatches
735 }
736
737 pub fn forward_rct_dispatches(&self) -> usize {
739 self.forward_rct_dispatches
740 }
741
742 pub fn forward_ict_dispatches(&self) -> usize {
744 self.forward_ict_dispatches
745 }
746
747 pub fn forward_dwt53_dispatches(&self) -> usize {
749 self.forward_dwt53_dispatches
750 }
751
752 pub fn forward_dwt97_dispatches(&self) -> usize {
754 self.forward_dwt97_dispatches
755 }
756
757 pub fn quantize_subband_dispatches(&self) -> usize {
759 self.quantize_subband_dispatches
760 }
761
762 pub fn tier1_code_block_dispatches(&self) -> usize {
764 self.tier1_code_block_dispatches
765 }
766
767 pub fn ht_code_block_dispatches(&self) -> usize {
769 self.ht_code_block_dispatches
770 }
771
772 pub fn packetization_dispatches(&self) -> usize {
774 self.packetization_dispatches
775 }
776}
777
/// Reports whether the `J2K_REQUIRE_CUDA_RUNTIME` environment variable is set
/// (to any value, including an empty one).
#[cfg(feature = "cuda-runtime")]
fn cuda_runtime_required() -> bool {
    matches!(std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME"), Some(_))
}
782
/// Runs `work` as the named stage on `context`, optionally timing it.
///
/// With `collect_profile` set, the work is run through
/// `time_default_stream_named_us` and its measurement is returned. Otherwise
/// it is run through `with_nvtx_range` and the reported time is `0`.
///
/// # Errors
///
/// Propagates the `CudaError` produced by the context call.
#[cfg(feature = "cuda-runtime")]
fn time_cuda_stage<T>(
    name: &'static str,
    context: &CudaContext,
    collect_profile: bool,
    work: impl FnOnce() -> core::result::Result<T, CudaError>,
) -> core::result::Result<(T, u128), CudaError> {
    if !collect_profile {
        let output = context.with_nvtx_range(name, work)?;
        return Ok((output, 0));
    }
    context.time_default_stream_named_us(name, work)
}
798
/// Per-stage encode timings, each in microseconds.
#[allow(clippy::struct_field_names)]
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct CudaEncodeStageTimings {
    /// Time spent in the deinterleave stage.
    pub deinterleave_us: u128,
    /// Time spent in the multi-component transform stage.
    pub mct_us: u128,
    /// Time spent in the wavelet transform stage.
    pub dwt_us: u128,
    /// Time spent in the quantization stage.
    pub quantize_us: u128,
    /// Time spent in HT block encoding.
    pub ht_encode_us: u128,
    /// Time spent in packetization.
    pub packetize_us: u128,
}

impl CudaEncodeStageTimings {
    /// Adds two timing sets field by field, saturating each field at
    /// `u128::MAX` instead of overflowing.
    #[must_use]
    pub const fn saturating_add(self, other: Self) -> Self {
        let deinterleave_us = self.deinterleave_us.saturating_add(other.deinterleave_us);
        let mct_us = self.mct_us.saturating_add(other.mct_us);
        let dwt_us = self.dwt_us.saturating_add(other.dwt_us);
        let quantize_us = self.quantize_us.saturating_add(other.quantize_us);
        let ht_encode_us = self.ht_encode_us.saturating_add(other.ht_encode_us);
        let packetize_us = self.packetize_us.saturating_add(other.packetize_us);
        Self {
            deinterleave_us,
            mct_us,
            dwt_us,
            quantize_us,
            ht_encode_us,
            packetize_us,
        }
    }

    /// Returns the saturating sum of all six stage timings.
    #[must_use]
    pub const fn total_us(self) -> u128 {
        let through_mct = self.deinterleave_us.saturating_add(self.mct_us);
        let through_dwt = through_mct.saturating_add(self.dwt_us);
        let through_quantize = through_dwt.saturating_add(self.quantize_us);
        let through_ht = through_quantize.saturating_add(self.ht_encode_us);
        through_ht.saturating_add(self.packetize_us)
    }
}
842
843#[derive(Debug, Clone, PartialEq, Eq)]
844struct CudaHtj2kPacketizationPlan {
845 payload: Vec<u8>,
846 packets: Vec<CudaHtj2kPacketizationPlanPacket>,
847 subbands: Vec<CudaHtj2kPacketizationPlanSubband>,
848 blocks: Vec<CudaHtj2kPacketizationPlanBlock>,
849 tag_states: Vec<CudaHtj2kPacketizationPlanSubbandTagState>,
850 tag_nodes: Vec<CudaHtj2kPacketizationPlanTagNodeState>,
851}
852
853struct CudaHtj2kPacketizationPlanSink<'a> {
854 payload: &'a mut Vec<u8>,
855 packets: &'a mut Vec<CudaHtj2kPacketizationPlanPacket>,
856 subbands: &'a mut Vec<CudaHtj2kPacketizationPlanSubband>,
857 blocks: &'a mut Vec<CudaHtj2kPacketizationPlanBlock>,
858 tag_states: &'a mut Vec<CudaHtj2kPacketizationPlanSubbandTagState>,
859 tag_nodes: &'a mut Vec<CudaHtj2kPacketizationPlanTagNodeState>,
860}
861
/// Flattened description of one packet in a packetization plan.
// NOTE(review): the `*_start`/`*_count` pairs presumably index into the
// plan's `blocks` and `subbands` vectors — confirm against the flatten helpers.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanPacket {
    /// First block record belonging to this packet.
    block_start: u32,
    /// Number of block records belonging to this packet.
    block_count: u32,
    /// First subband record belonging to this packet.
    subband_start: u32,
    /// Number of subband records belonging to this packet.
    subband_count: u32,
    /// Output capacity reserved for this packet.
    output_capacity: u32,
    /// Quality layer of this packet.
    layer: u32,
}
871
/// Flattened description of one subband in a packetization plan.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanSubband {
    /// First block record of this subband.
    // NOTE(review): presumably an index into the plan's `blocks` — confirm.
    block_start: u32,
    /// Number of block records in this subband.
    block_count: u32,
    /// Code-block grid width.
    num_cbs_x: u32,
    /// Code-block grid height.
    num_cbs_y: u32,
}
879
/// Flattened description of one code block in a packetization plan.
///
/// Every field is a `u32`, including the flag-like `previously_included`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanBlock {
    /// Start of this block's data.
    // NOTE(review): presumably a byte offset into the plan's `payload` — confirm.
    data_offset: u32,
    /// Length of this block's data.
    data_len: u32,
    /// Length of the cleanup segment.
    cleanup_length: u32,
    /// Length of the refinement segment.
    refinement_length: u32,
    /// Number of coding passes.
    num_coding_passes: u32,
    /// Number of zero bitplanes.
    num_zero_bitplanes: u32,
    /// The block's `l_block` value.
    l_block: u32,
    /// Whether the block was included in an earlier packet, stored as `u32`.
    previously_included: u32,
    /// Layer at which the block is first included.
    inclusion_layer: u32,
}
892
/// Locates one subband's two tag trees among the flattened tag nodes.
// NOTE(review): the starts presumably index into the plan's `tag_nodes` —
// confirm against the flatten helpers.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanSubbandTagState {
    /// First node of the inclusion tag tree.
    inclusion_node_start: u32,
    /// First node of the zero-bitplane tag tree.
    zero_bitplane_node_start: u32,
    /// Number of nodes per tree.
    node_count: u32,
}
899
/// Flattened coding state of a single tag-tree node.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanTagNodeState {
    /// The node's `current` value.
    current: u32,
    /// The node's `known` value, stored as `u32`.
    known: u32,
}
905
/// Host-side tag tree stored as parallel per-node and per-level vectors.
// NOTE(review): `values`/`current`/`known` look like per-node arrays and
// `widths`/`heights`/`offsets` like per-level arrays — confirm against the
// `impl` block.
#[derive(Clone, Debug, PartialEq, Eq)]
struct CudaHtj2kPacketizationTagTreeState {
    /// Node values.
    values: Vec<u32>,
    /// Node `current` coding state.
    current: Vec<u32>,
    /// Node `known` coding state.
    known: Vec<u32>,
    /// Width of each tree level.
    widths: Vec<u32>,
    /// Height of each tree level.
    heights: Vec<u32>,
    /// Offset of each tree level.
    offsets: Vec<usize>,
}
915
/// Per-code-block state tracked across the packets that share a
/// packetization state.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct CudaHtj2kPacketizationBlockState {
    /// Whether the block has already been included; seeded from the job's
    /// code block.
    previously_included: bool,
    /// The block's `l_block` value; seeded from the job's code block.
    l_block: u32,
    /// Layer at which the block is first included; seeded with
    /// `CUDA_HTJ2K_PACKET_TAG_INF`.
    inclusion_layer: u32,
    /// Zero-bitplane count recorded at first inclusion; seeded with `0`.
    first_inclusion_zero_bitplanes: u32,
}
923
924#[derive(Debug, Clone, PartialEq, Eq)]
925struct CudaHtj2kPacketizationSubbandState {
926 num_cbs_x: u32,
927 num_cbs_y: u32,
928 inclusion_tree: CudaHtj2kPacketizationTagTreeState,
929 zero_bitplane_tree: CudaHtj2kPacketizationTagTreeState,
930 blocks: Vec<CudaHtj2kPacketizationBlockState>,
931}
932
933#[derive(Debug, Clone, PartialEq, Eq)]
934struct CudaHtj2kPacketizationState {
935 subbands: Vec<CudaHtj2kPacketizationSubbandState>,
936}
937
938fn flatten_cuda_htj2k_packetization_job(
939 job: J2kPacketizationEncodeJob<'_>,
940) -> core::result::Result<CudaHtj2kPacketizationPlan, &'static str> {
941 if job.resolution_count as usize != job.resolutions.len() {
942 return Err("CUDA HTJ2K packetization resolution count mismatch");
943 }
944
945 let mut payload = Vec::new();
946 let mut packets = Vec::new();
947 let mut subbands = Vec::new();
948 let mut blocks = Vec::new();
949 let mut tag_states = Vec::new();
950 let mut tag_nodes = Vec::new();
951
952 {
953 let mut sink = CudaHtj2kPacketizationPlanSink {
954 payload: &mut payload,
955 packets: &mut packets,
956 subbands: &mut subbands,
957 blocks: &mut blocks,
958 tag_states: &mut tag_states,
959 tag_nodes: &mut tag_nodes,
960 };
961 if job.packet_descriptors.is_empty() {
962 if job.num_layers != 1 {
963 return Err(
964 "CUDA HTJ2K packetization requires explicit descriptors for multiple layers",
965 );
966 }
967 for packet_index in 0..job.resolutions.len() {
968 flatten_cuda_htj2k_packet(
969 job.resolutions
970 .get(packet_index)
971 .ok_or("CUDA HTJ2K packet descriptor index out of range")?,
972 &mut sink,
973 )?;
974 }
975 } else {
976 let state_count = job
977 .packet_descriptors
978 .iter()
979 .map(|descriptor| descriptor.state_index as usize)
980 .max()
981 .map_or(0usize, |max_state| max_state + 1);
982 let mut states: Vec<Option<CudaHtj2kPacketizationState>> =
983 core::iter::repeat_with(|| None).take(state_count).collect();
984 for descriptor in job.packet_descriptors {
985 if descriptor.layer >= job.num_layers {
986 return Err("CUDA HTJ2K packetization descriptor layer exceeds layer count");
987 }
988 let resolution = job
989 .resolutions
990 .get(descriptor.packet_index as usize)
991 .ok_or("CUDA HTJ2K packet descriptor index out of range")?;
992 let state = states
993 .get_mut(descriptor.state_index as usize)
994 .ok_or("CUDA HTJ2K packet descriptor state index out of range")?;
995 if let Some(existing) = state {
996 validate_cuda_htj2k_packetization_state_layout(existing, resolution)?;
997 } else {
998 *state = Some(seed_cuda_htj2k_packetization_state(resolution)?);
999 }
1000 let state = state
1001 .as_mut()
1002 .ok_or("CUDA HTJ2K packetization state initialization failed")?;
1003 record_cuda_htj2k_packetization_first_inclusion_layers(
1004 state,
1005 resolution,
1006 descriptor.layer,
1007 )?;
1008 }
1009 for state in states.iter_mut().flatten() {
1010 finalize_cuda_htj2k_packetization_tag_trees(state);
1011 }
1012 for descriptor in job.packet_descriptors {
1013 if descriptor.layer >= job.num_layers {
1014 return Err("CUDA HTJ2K packetization descriptor layer exceeds layer count");
1015 }
1016 let resolution = job
1017 .resolutions
1018 .get(descriptor.packet_index as usize)
1019 .ok_or("CUDA HTJ2K packet descriptor index out of range")?;
1020 let state = states
1021 .get_mut(descriptor.state_index as usize)
1022 .ok_or("CUDA HTJ2K packet descriptor state index out of range")?;
1023 if let Some(existing) = state {
1024 validate_cuda_htj2k_packetization_state_layout(existing, resolution)?;
1025 } else {
1026 *state = Some(seed_cuda_htj2k_packetization_state(resolution)?);
1027 }
1028 let state = state
1029 .as_mut()
1030 .ok_or("CUDA HTJ2K packetization state initialization failed")?;
1031 flatten_cuda_htj2k_packet_with_state(
1032 resolution,
1033 descriptor.layer,
1034 state,
1035 &mut sink,
1036 )?;
1037 }
1038 }
1039 }
1040
1041 if job.code_block_count as usize != blocks.len() {
1042 return Err("CUDA HTJ2K packetization code-block count mismatch");
1043 }
1044
1045 Ok(CudaHtj2kPacketizationPlan {
1046 payload,
1047 packets,
1048 subbands,
1049 blocks,
1050 tag_states,
1051 tag_nodes,
1052 })
1053}
1054
1055fn seed_cuda_htj2k_packetization_state(
1056 resolution: &J2kPacketizationResolution<'_>,
1057) -> core::result::Result<CudaHtj2kPacketizationState, &'static str> {
1058 let mut subbands = Vec::with_capacity(resolution.subbands.len());
1059 for subband in &resolution.subbands {
1060 let block_count = u32::try_from(subband.code_blocks.len())
1061 .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
1062 if subband.num_cbs_x == 0
1063 || subband.num_cbs_y == 0
1064 || subband.num_cbs_x.saturating_mul(subband.num_cbs_y) != block_count
1065 {
1066 return Err("CUDA HTJ2K packetization subband code-block layout mismatch");
1067 }
1068 let mut inclusion_tree =
1069 CudaHtj2kPacketizationTagTreeState::new(subband.num_cbs_x, subband.num_cbs_y)?;
1070 let zero_bitplane_tree =
1071 CudaHtj2kPacketizationTagTreeState::new(subband.num_cbs_x, subband.num_cbs_y)?;
1072 for idx in 0..subband.code_blocks.len() {
1073 let (x, y) = cuda_htj2k_packetization_block_xy(idx, subband.num_cbs_x)?;
1074 inclusion_tree.set_leaf_value(x, y, CUDA_HTJ2K_PACKET_TAG_INF);
1075 }
1076 subbands.push(CudaHtj2kPacketizationSubbandState {
1077 num_cbs_x: subband.num_cbs_x,
1078 num_cbs_y: subband.num_cbs_y,
1079 inclusion_tree,
1080 zero_bitplane_tree,
1081 blocks: subband
1082 .code_blocks
1083 .iter()
1084 .map(|block| CudaHtj2kPacketizationBlockState {
1085 previously_included: block.previously_included,
1086 l_block: block.l_block,
1087 inclusion_layer: CUDA_HTJ2K_PACKET_TAG_INF,
1088 first_inclusion_zero_bitplanes: 0,
1089 })
1090 .collect(),
1091 });
1092 }
1093 Ok(CudaHtj2kPacketizationState { subbands })
1094}
1095
1096fn validate_cuda_htj2k_packetization_state_layout(
1097 state: &CudaHtj2kPacketizationState,
1098 resolution: &J2kPacketizationResolution<'_>,
1099) -> core::result::Result<(), &'static str> {
1100 if state.subbands.len() != resolution.subbands.len() {
1101 return Err("CUDA HTJ2K packetization state layout mismatch");
1102 }
1103 for (state_subband, packet_subband) in state.subbands.iter().zip(&resolution.subbands) {
1104 if state_subband.num_cbs_x != packet_subband.num_cbs_x
1105 || state_subband.num_cbs_y != packet_subband.num_cbs_y
1106 || state_subband.blocks.len() != packet_subband.code_blocks.len()
1107 {
1108 return Err("CUDA HTJ2K packetization state layout mismatch");
1109 }
1110 }
1111 Ok(())
1112}
1113
/// Sentinel used as an inclusion layer / inclusion tag-tree leaf value for a
/// code block that has no recorded first-inclusion layer.
const CUDA_HTJ2K_PACKET_TAG_INF: u32 = 0x7FFF_FFFF;
/// Largest total node count (summed over all levels) accepted for one tag
/// tree; larger trees are rejected as exceeding kernel bounds.
const CUDA_HTJ2K_PACKET_MAX_TAG_NODES: usize = 2048;
/// Largest number of levels accepted for one tag tree; deeper trees are
/// rejected as exceeding kernel bounds.
const CUDA_HTJ2K_PACKET_MAX_TAG_LEVELS: usize = 16;
1117
/// Converts a row-major code-block index into `(x, y)` grid coordinates.
///
/// `num_cbs_x` is the number of code blocks per row, so the result is
/// `(index % num_cbs_x, index / num_cbs_x)`.
///
/// # Errors
///
/// Returns an error when `index` does not fit in `u32`, or when `num_cbs_x`
/// is zero (an empty grid has no valid coordinates; previously this case
/// panicked with a division by zero).
fn cuda_htj2k_packetization_block_xy(
    index: usize,
    num_cbs_x: u32,
) -> core::result::Result<(u32, u32), &'static str> {
    let index =
        u32::try_from(index).map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
    if num_cbs_x == 0 {
        return Err("CUDA HTJ2K packetization subband code-block layout mismatch");
    }
    Ok((index % num_cbs_x, index / num_cbs_x))
}
1126
1127impl CudaHtj2kPacketizationTagTreeState {
1128 fn new(width: u32, height: u32) -> core::result::Result<Self, &'static str> {
1129 if width == 0 || height == 0 {
1130 return Err("CUDA HTJ2K packetization subband code-block layout mismatch");
1131 }
1132
1133 let mut widths = Vec::new();
1134 let mut heights = Vec::new();
1135 let mut offsets = Vec::new();
1136 let mut total_nodes = 0usize;
1137 let mut w = width;
1138 let mut h = height;
1139 loop {
1140 if widths.len() >= CUDA_HTJ2K_PACKET_MAX_TAG_LEVELS {
1141 return Err("CUDA HTJ2K packetization tag-tree exceeds kernel bounds");
1142 }
1143 let nodes = (w as usize)
1144 .checked_mul(h as usize)
1145 .ok_or("CUDA HTJ2K packetization tag-tree exceeds kernel bounds")?;
1146 let next_total = total_nodes
1147 .checked_add(nodes)
1148 .ok_or("CUDA HTJ2K packetization tag-tree exceeds kernel bounds")?;
1149 if next_total > CUDA_HTJ2K_PACKET_MAX_TAG_NODES {
1150 return Err("CUDA HTJ2K packetization tag-tree exceeds kernel bounds");
1151 }
1152 offsets.push(total_nodes);
1153 widths.push(w);
1154 heights.push(h);
1155 total_nodes = next_total;
1156 if w <= 1 && h <= 1 {
1157 break;
1158 }
1159 w = w.div_ceil(2);
1160 h = h.div_ceil(2);
1161 }
1162
1163 Ok(Self {
1164 values: vec![0; total_nodes],
1165 current: vec![0; total_nodes],
1166 known: vec![0; total_nodes],
1167 widths,
1168 heights,
1169 offsets,
1170 })
1171 }
1172
1173 fn set_leaf_value(&mut self, x: u32, y: u32, value: u32) {
1174 let idx = self.offsets[0] + (y * self.widths[0] + x) as usize;
1175 self.values[idx] = value;
1176 }
1177
1178 #[allow(clippy::similar_names)]
1179 fn propagate(&mut self) {
1180 for level in 1..self.widths.len() {
1181 let prev_w = self.widths[level - 1];
1182 let prev_h = self.heights[level - 1];
1183 let curr_w = self.widths[level];
1184 let curr_h = self.heights[level];
1185 for cy in 0..curr_h {
1186 for cx in 0..curr_w {
1187 let child_x_start = cx * 2;
1188 let child_y_start = cy * 2;
1189 let child_x_end = ((cx + 1) * 2).min(prev_w);
1190 let child_y_end = ((cy + 1) * 2).min(prev_h);
1191 let mut min_value = u32::MAX;
1192 for child_y in child_y_start..child_y_end {
1193 for child_x in child_x_start..child_x_end {
1194 let child_idx =
1195 self.offsets[level - 1] + (child_y * prev_w + child_x) as usize;
1196 min_value = min_value.min(self.values[child_idx]);
1197 }
1198 }
1199 let parent_idx = self.offsets[level] + (cy * curr_w + cx) as usize;
1200 self.values[parent_idx] = min_value;
1201 }
1202 }
1203 }
1204 }
1205
1206 fn encode_state_only(&mut self, x: u32, y: u32, max_value: u32) {
1207 let mut path = Vec::with_capacity(self.widths.len());
1208 let mut cx = x;
1209 let mut cy = y;
1210 for level in 0..self.widths.len() {
1211 path.push(self.offsets[level] + (cy * self.widths[level] + cx) as usize);
1212 cx /= 2;
1213 cy /= 2;
1214 }
1215
1216 for node_idx in path.into_iter().rev() {
1217 if self.known[node_idx] == 0 {
1218 let target = self.values[node_idx].min(max_value);
1219 if self.values[node_idx] < max_value {
1220 self.known[node_idx] = 1;
1221 }
1222 self.current[node_idx] = target;
1223 }
1224 }
1225 }
1226
1227 fn append_snapshot(
1228 &self,
1229 out: &mut Vec<CudaHtj2kPacketizationPlanTagNodeState>,
1230 ) -> core::result::Result<u32, &'static str> {
1231 let start = u32::try_from(out.len())
1232 .map_err(|_| "CUDA HTJ2K packetization tag-state exceeds u32")?;
1233 out.extend(
1234 self.current
1235 .iter()
1236 .copied()
1237 .zip(self.known.iter().copied())
1238 .map(|(current, known)| CudaHtj2kPacketizationPlanTagNodeState { current, known }),
1239 );
1240 Ok(start)
1241 }
1242
1243 fn node_count(&self) -> u32 {
1244 u32::try_from(self.current.len()).expect("tag tree node count was bounded at construction")
1245 }
1246}
1247
1248fn record_cuda_htj2k_packetization_first_inclusion_layers(
1249 state: &mut CudaHtj2kPacketizationState,
1250 resolution: &J2kPacketizationResolution<'_>,
1251 layer: u8,
1252) -> core::result::Result<(), &'static str> {
1253 validate_cuda_htj2k_packetization_state_layout(state, resolution)?;
1254 for (state_subband, packet_subband) in state.subbands.iter_mut().zip(&resolution.subbands) {
1255 for (idx, (state_block, packet_block)) in state_subband
1256 .blocks
1257 .iter_mut()
1258 .zip(&packet_subband.code_blocks)
1259 .enumerate()
1260 {
1261 if packet_block.num_coding_passes == 0 {
1262 continue;
1263 }
1264 let layer = u32::from(layer);
1265 if layer < state_block.inclusion_layer {
1266 state_block.inclusion_layer = layer;
1267 state_block.first_inclusion_zero_bitplanes =
1268 u32::from(packet_block.num_zero_bitplanes);
1269 let (x, y) = cuda_htj2k_packetization_block_xy(idx, state_subband.num_cbs_x)?;
1270 state_subband.inclusion_tree.set_leaf_value(x, y, layer);
1271 state_subband.zero_bitplane_tree.set_leaf_value(
1272 x,
1273 y,
1274 state_block.first_inclusion_zero_bitplanes,
1275 );
1276 }
1277 }
1278 }
1279 Ok(())
1280}
1281
1282fn finalize_cuda_htj2k_packetization_tag_trees(state: &mut CudaHtj2kPacketizationState) {
1283 for subband in &mut state.subbands {
1284 subband.inclusion_tree.propagate();
1285 subband.zero_bitplane_tree.propagate();
1286 }
1287}
1288
1289fn append_cuda_htj2k_packetization_tag_state(
1290 state_subband: Option<&CudaHtj2kPacketizationSubbandState>,
1291 num_cbs_x: u32,
1292 num_cbs_y: u32,
1293 tag_states: &mut Vec<CudaHtj2kPacketizationPlanSubbandTagState>,
1294 tag_nodes: &mut Vec<CudaHtj2kPacketizationPlanTagNodeState>,
1295) -> core::result::Result<(), &'static str> {
1296 let (inclusion_node_start, zero_bitplane_node_start, node_count) =
1297 if let Some(state_subband) = state_subband {
1298 let inclusion_start = state_subband.inclusion_tree.append_snapshot(tag_nodes)?;
1299 let zero_bitplane_start = state_subband
1300 .zero_bitplane_tree
1301 .append_snapshot(tag_nodes)?;
1302 (
1303 inclusion_start,
1304 zero_bitplane_start,
1305 state_subband.inclusion_tree.node_count(),
1306 )
1307 } else {
1308 let zero_tree = CudaHtj2kPacketizationTagTreeState::new(num_cbs_x, num_cbs_y)?;
1309 let inclusion_start = zero_tree.append_snapshot(tag_nodes)?;
1310 let zero_bitplane_start = zero_tree.append_snapshot(tag_nodes)?;
1311 (inclusion_start, zero_bitplane_start, zero_tree.node_count())
1312 };
1313 tag_states.push(CudaHtj2kPacketizationPlanSubbandTagState {
1314 inclusion_node_start,
1315 zero_bitplane_node_start,
1316 node_count,
1317 });
1318 Ok(())
1319}
1320
1321fn update_cuda_htj2k_packetization_state_after_block(
1322 state: &mut CudaHtj2kPacketizationState,
1323 subband_index: usize,
1324 block_index: usize,
1325 layer: u8,
1326 code_block: &J2kPacketizationCodeBlock<'_>,
1327 l_block: u32,
1328) -> core::result::Result<(), &'static str> {
1329 let state_subband = state
1330 .subbands
1331 .get_mut(subband_index)
1332 .ok_or("CUDA HTJ2K packetization state layout mismatch")?;
1333 let (x, y) = cuda_htj2k_packetization_block_xy(block_index, state_subband.num_cbs_x)?;
1334 let previously_included = state_subband
1335 .blocks
1336 .get(block_index)
1337 .ok_or("CUDA HTJ2K packetization state layout mismatch")?
1338 .previously_included;
1339
1340 if !previously_included {
1341 state_subband
1342 .inclusion_tree
1343 .encode_state_only(x, y, u32::from(layer) + 1);
1344 if code_block.num_coding_passes == 0 {
1345 return Ok(());
1346 }
1347 state_subband.zero_bitplane_tree.encode_state_only(
1348 x,
1349 y,
1350 u32::from(code_block.num_zero_bitplanes) + 1,
1351 );
1352 }
1353
1354 if code_block.num_coding_passes > 0 {
1355 let state_block = state_subband
1356 .blocks
1357 .get_mut(block_index)
1358 .ok_or("CUDA HTJ2K packetization state layout mismatch")?;
1359 let (cleanup_length, refinement_length) = cuda_ht_segment_lengths(code_block)?;
1360 state_block.l_block = updated_ht_l_block(
1361 l_block,
1362 code_block.num_coding_passes,
1363 cleanup_length,
1364 refinement_length,
1365 )?;
1366 state_block.previously_included = true;
1367 }
1368 Ok(())
1369}
1370
1371fn flatten_cuda_htj2k_packet(
1372 resolution: &J2kPacketizationResolution<'_>,
1373 sink: &mut CudaHtj2kPacketizationPlanSink<'_>,
1374) -> core::result::Result<(), &'static str> {
1375 flatten_cuda_htj2k_packet_inner(resolution, 0, None, sink)
1376}
1377
1378fn flatten_cuda_htj2k_packet_with_state(
1379 resolution: &J2kPacketizationResolution<'_>,
1380 layer: u8,
1381 state: &mut CudaHtj2kPacketizationState,
1382 sink: &mut CudaHtj2kPacketizationPlanSink<'_>,
1383) -> core::result::Result<(), &'static str> {
1384 flatten_cuda_htj2k_packet_inner(resolution, layer, Some(state), sink)
1385}
1386
/// Flattens one packet (all subbands of `resolution`) into the plan buffers in `sink`.
///
/// For every subband this appends a tag-tree snapshot, then one plan block
/// per code block (copying the code block's payload into `sink.payload` when
/// it has coding passes), then one plan subband entry. A single plan packet
/// entry covering all of them is pushed at the end.
///
/// When `state` is `Some`, each block's `previously_included`, `l_block`,
/// inclusion layer and zero-bitplane count are read from the tracked state,
/// and the state is advanced after each block provided the packet carries any
/// data. When `state` is `None`, those fields come from the code block itself
/// and blocks flagged as previously included are rejected.
///
/// # Errors
///
/// Returns an error when a count or length does not fit in `u32`, when a
/// subband's code-block grid is empty or inconsistent, when a code block is
/// not HT-coded, has more than 164 coding passes, carries payload without
/// coding passes, or has header fields above 31, when the first-inclusion
/// layer disagrees with `layer`, or when the tracked state does not cover a
/// block.
fn flatten_cuda_htj2k_packet_inner(
    resolution: &J2kPacketizationResolution<'_>,
    layer: u8,
    mut state: Option<&mut CudaHtj2kPacketizationState>,
    sink: &mut CudaHtj2kPacketizationPlanSink<'_>,
) -> core::result::Result<(), &'static str> {
    // Remember where this packet's blocks and subbands begin in the shared buffers.
    let block_start = u32::try_from(sink.blocks.len())
        .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
    let subband_start = u32::try_from(sink.subbands.len())
        .map_err(|_| "CUDA HTJ2K packetization subband count exceeds u32")?;
    let mut body_len = 0usize;
    let mut block_count = 0usize;
    // True when at least one code block in the packet has coding passes.
    let packet_has_data = resolution.subbands.iter().any(|subband| {
        subband
            .code_blocks
            .iter()
            .any(|block| block.num_coding_passes > 0)
    });

    for (subband_index, subband) in resolution.subbands.iter().enumerate() {
        let subband_code_blocks = u32::try_from(subband.code_blocks.len())
            .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
        if subband.num_cbs_x == 0
            || subband.num_cbs_y == 0
            || subband.num_cbs_x.saturating_mul(subband.num_cbs_y) != subband_code_blocks
        {
            return Err("CUDA HTJ2K packetization subband code-block layout mismatch");
        }

        let subband_block_start = u32::try_from(sink.blocks.len())
            .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
        let state_subband = state
            .as_deref()
            .and_then(|state| state.subbands.get(subband_index));
        // The snapshot is taken before any block of this subband updates the state.
        append_cuda_htj2k_packetization_tag_state(
            state_subband,
            subband.num_cbs_x,
            subband.num_cbs_y,
            sink.tag_states,
            sink.tag_nodes,
        )?;
        for (block_index, code_block) in subband.code_blocks.iter().enumerate() {
            if code_block.block_coding_mode != J2kPacketizationBlockCodingMode::HighThroughput {
                return Err("CUDA packetization only supports HTJ2K block-coded packets");
            }
            if code_block.num_coding_passes > 164 {
                return Err("CUDA HTJ2K packetization coding pass count exceeds JPEG 2000 bounds");
            }
            // Header fields come from the tracked state when present, otherwise
            // from the code block itself.
            let (previously_included, l_block, inclusion_layer, zero_bitplanes) =
                if let Some(state) = state.as_deref() {
                    let state_block = state
                        .subbands
                        .get(subband_index)
                        .and_then(|state_subband| state_subband.blocks.get(block_index))
                        .ok_or("CUDA HTJ2K packetization state layout mismatch")?;
                    (
                        state_block.previously_included,
                        state_block.l_block,
                        state_block.inclusion_layer,
                        state_block.first_inclusion_zero_bitplanes,
                    )
                } else {
                    (
                        code_block.previously_included,
                        code_block.l_block,
                        if code_block.num_coding_passes > 0 {
                            0
                        } else {
                            CUDA_HTJ2K_PACKET_TAG_INF
                        },
                        u32::from(code_block.num_zero_bitplanes),
                    )
                };
            // A block contributing for the first time must do so in exactly the
            // layer recorded as its first-inclusion layer.
            if code_block.num_coding_passes > 0
                && !previously_included
                && inclusion_layer != u32::from(layer)
            {
                return Err(
                    "CUDA HTJ2K packetization descriptor order does not match first inclusion layer",
                );
            }
            if state.is_none() && previously_included {
                return Err("CUDA HTJ2K packetization requires first-inclusion packets");
            }
            if code_block.num_coding_passes == 0 && !code_block.data.is_empty() {
                return Err("CUDA HTJ2K packetization empty contributions must not carry payload");
            }
            if zero_bitplanes > 31 || l_block > 31 {
                return Err("CUDA HTJ2K packetization header fields exceed kernel bounds");
            }

            // The offset is captured before this block's bytes are appended.
            let data_offset = u32::try_from(sink.payload.len())
                .map_err(|_| "CUDA HTJ2K packetization payload exceeds u32")?;
            let data_len = if code_block.num_coding_passes == 0 {
                0
            } else {
                u32::try_from(code_block.data.len())
                    .map_err(|_| "CUDA HTJ2K packetization code-block payload exceeds u32")?
            };
            let (cleanup_length, refinement_length) = cuda_ht_segment_lengths(code_block)?;
            if code_block.num_coding_passes > 0 {
                sink.payload.extend_from_slice(code_block.data);
                body_len = body_len
                    .checked_add(code_block.data.len())
                    .ok_or("CUDA HTJ2K packetization body length overflow")?;
            }
            sink.blocks.push(CudaHtj2kPacketizationPlanBlock {
                data_offset,
                data_len,
                cleanup_length,
                refinement_length,
                num_coding_passes: u32::from(code_block.num_coding_passes),
                num_zero_bitplanes: zero_bitplanes,
                l_block,
                previously_included: u32::from(previously_included),
                inclusion_layer,
            });
            // Tracked state only advances when the packet carries data.
            if packet_has_data {
                if let Some(state) = state.as_deref_mut() {
                    update_cuda_htj2k_packetization_state_after_block(
                        state,
                        subband_index,
                        block_index,
                        layer,
                        code_block,
                        l_block,
                    )?;
                }
            }
            block_count = block_count
                .checked_add(1)
                .ok_or("CUDA HTJ2K packetization block count overflow")?;
        }
        sink.subbands.push(CudaHtj2kPacketizationPlanSubband {
            block_start: subband_block_start,
            block_count: subband_code_blocks,
            num_cbs_x: subband.num_cbs_x,
            num_cbs_y: subband.num_cbs_y,
        });
    }

    // Output capacity is the body bytes plus a header allowance of 256 bytes
    // and 64 bytes per block.
    let header_capacity = 256usize
        .checked_add(
            block_count
                .checked_mul(64)
                .ok_or("CUDA HTJ2K packetization capacity overflow")?,
        )
        .ok_or("CUDA HTJ2K packetization capacity overflow")?;
    let output_capacity = body_len
        .checked_add(header_capacity)
        .ok_or("CUDA HTJ2K packetization capacity overflow")?;
    sink.packets.push(CudaHtj2kPacketizationPlanPacket {
        block_start,
        block_count: u32::try_from(block_count)
            .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?,
        subband_start,
        subband_count: u32::try_from(resolution.subbands.len())
            .map_err(|_| "CUDA HTJ2K packetization subband count exceeds u32")?,
        output_capacity: u32::try_from(output_capacity)
            .map_err(|_| "CUDA HTJ2K packetization packet capacity exceeds u32")?,
        layer: u32::from(layer),
    });
    Ok(())
}
1551
1552fn updated_ht_l_block(
1553 mut l_block: u32,
1554 num_coding_passes: u8,
1555 cleanup_length: u32,
1556 refinement_length: u32,
1557) -> core::result::Result<u32, &'static str> {
1558 let mut num_bits = packet_math::bits_for_ht_cleanup_length(l_block, num_coding_passes);
1559 let refinement_extra_bits = u32::from(num_coding_passes > 2);
1560 while !packet_math::value_fits_in_bits(cleanup_length, num_bits)
1561 || (num_coding_passes > 1
1562 && !packet_math::value_fits_in_bits(refinement_length, l_block + refinement_extra_bits))
1563 {
1564 l_block = l_block
1565 .checked_add(1)
1566 .ok_or("CUDA HTJ2K packetization L-block overflow")?;
1567 num_bits = num_bits
1568 .checked_add(1)
1569 .ok_or("CUDA HTJ2K packetization L-block overflow")?;
1570 }
1571 Ok(l_block)
1572}
1573
1574fn cuda_ht_segment_lengths(
1575 code_block: &J2kPacketizationCodeBlock<'_>,
1576) -> core::result::Result<(u32, u32), &'static str> {
1577 packet_math::ht_segment_lengths(
1578 code_block.num_coding_passes,
1579 code_block.data.len(),
1580 code_block.ht_cleanup_length,
1581 code_block.ht_refinement_length,
1582 )
1583}
1584
1585impl J2kEncodeStageAccelerator for CudaEncodeStageAccelerator {
1586 fn dispatch_report(&self) -> J2kEncodeDispatchReport {
1587 J2kEncodeDispatchReport {
1588 deinterleave: self.deinterleave_dispatches,
1589 forward_rct: self.forward_rct_dispatches,
1590 forward_ict: self.forward_ict_dispatches,
1591 forward_dwt53: self.forward_dwt53_dispatches,
1592 forward_dwt97: self.forward_dwt97_dispatches,
1593 quantize_subband: self.quantize_subband_dispatches,
1594 tier1_code_block: self.tier1_code_block_dispatches,
1595 ht_code_block: self.ht_code_block_dispatches,
1596 packetization: self.packetization_dispatches,
1597 }
1598 }
1599
1600 fn encode_deinterleave(
1601 &mut self,
1602 job: J2kDeinterleaveToF32Job<'_>,
1603 ) -> core::result::Result<Option<Vec<Vec<f32>>>, &'static str> {
1604 self.deinterleave_attempts = self.deinterleave_attempts.saturating_add(1);
1605 #[cfg(feature = "cuda-runtime")]
1606 if let Some(context) = self.cuda_context()? {
1607 let (output, elapsed_us) = time_cuda_stage(
1608 "j2k.j2k.cuda.encode.deinterleave",
1609 &context,
1610 self.collect_profile,
1611 || {
1612 context.j2k_deinterleave_to_f32(
1613 job.pixels,
1614 job.num_pixels,
1615 job.num_components,
1616 job.bit_depth,
1617 job.signed,
1618 )
1619 },
1620 )
1621 .map_err(|_| "CUDA deinterleave encode kernel failed")?;
1622 let dispatches = output.execution().kernel_dispatches();
1623 self.deinterleave_dispatches = self.deinterleave_dispatches.saturating_add(dispatches);
1624 self.deinterleave_us = self.deinterleave_us.saturating_add(elapsed_us);
1625 if j2k_profile::gpu_route_profile_enabled() {
1626 let pixels_s = job.num_pixels.to_string();
1627 let components_s = job.num_components.to_string();
1628 let dispatches_s = dispatches.to_string();
1629 j2k_profile::emit_gpu_route_profile(
1630 "j2k",
1631 "cuda",
1632 &[
1633 ("op", "encode_deinterleave"),
1634 ("decision", "cuda_dispatch"),
1635 ("pixels", pixels_s.as_str()),
1636 ("components", components_s.as_str()),
1637 ("dispatches", dispatches_s.as_str()),
1638 ],
1639 );
1640 }
1641 return Ok(Some(output.into_components()));
1642 }
1643 #[cfg(not(feature = "cuda-runtime"))]
1644 let _ = job;
1645 if j2k_profile::gpu_route_profile_enabled() {
1646 j2k_profile::emit_gpu_route_profile(
1647 "j2k",
1648 "cuda",
1649 &[
1650 ("op", "encode_deinterleave"),
1651 ("decision", "cpu_fallback"),
1652 ("reason", "cuda_unavailable"),
1653 ],
1654 );
1655 }
1656 Ok(None)
1657 }
1658
1659 fn encode_forward_rct(
1660 &mut self,
1661 job: J2kForwardRctJob<'_>,
1662 ) -> core::result::Result<bool, &'static str> {
1663 self.forward_rct_attempts = self.forward_rct_attempts.saturating_add(1);
1664 if self.prefer_cpu_forward_rct {
1665 if j2k_profile::gpu_route_profile_enabled() {
1666 j2k_profile::emit_gpu_route_profile(
1667 "j2k",
1668 "cuda",
1669 &[
1670 ("op", "encode_forward_rct"),
1671 ("decision", "cpu_fallback"),
1672 ("reason", "prefer_cpu_forward_rct"),
1673 ],
1674 );
1675 }
1676 let _ = job;
1677 return Ok(false);
1678 }
1679 #[cfg(feature = "cuda-runtime")]
1680 if let Some(context) = self.cuda_context()? {
1681 let (execution, elapsed_us) = time_cuda_stage(
1682 "j2k.j2k.cuda.encode.rct",
1683 &context,
1684 self.collect_profile,
1685 || context.j2k_forward_rct(job.plane0, job.plane1, job.plane2),
1686 )
1687 .map_err(|_| "CUDA forward RCT encode kernel failed")?;
1688 self.forward_rct_dispatches = self
1689 .forward_rct_dispatches
1690 .saturating_add(execution.kernel_dispatches());
1691 self.mct_us = self.mct_us.saturating_add(elapsed_us);
1692 if j2k_profile::gpu_route_profile_enabled() {
1693 j2k_profile::emit_gpu_route_profile(
1694 "j2k",
1695 "cuda",
1696 &[
1697 ("op", "encode_forward_rct"),
1698 ("decision", "cuda_dispatch"),
1699 ("dispatches", "1"),
1700 ],
1701 );
1702 }
1703 return Ok(true);
1704 }
1705 #[cfg(not(feature = "cuda-runtime"))]
1706 let _ = job;
1707 if j2k_profile::gpu_route_profile_enabled() {
1708 j2k_profile::emit_gpu_route_profile(
1709 "j2k",
1710 "cuda",
1711 &[
1712 ("op", "encode_forward_rct"),
1713 ("decision", "cpu_fallback"),
1714 ("reason", "cuda_unavailable"),
1715 ],
1716 );
1717 }
1718 Ok(false)
1719 }
1720
1721 fn encode_forward_ict(
1722 &mut self,
1723 job: J2kForwardIctJob<'_>,
1724 ) -> core::result::Result<bool, &'static str> {
1725 self.forward_ict_attempts = self.forward_ict_attempts.saturating_add(1);
1726 #[cfg(feature = "cuda-runtime")]
1727 if let Some(context) = self.cuda_context()? {
1728 let (execution, elapsed_us) = time_cuda_stage(
1729 "j2k.j2k.cuda.encode.ict",
1730 &context,
1731 self.collect_profile,
1732 || context.j2k_forward_ict(job.plane0, job.plane1, job.plane2),
1733 )
1734 .map_err(|_| "CUDA forward ICT encode kernel failed")?;
1735 self.forward_ict_dispatches = self
1736 .forward_ict_dispatches
1737 .saturating_add(execution.kernel_dispatches());
1738 self.mct_us = self.mct_us.saturating_add(elapsed_us);
1739 if j2k_profile::gpu_route_profile_enabled() {
1740 j2k_profile::emit_gpu_route_profile(
1741 "j2k",
1742 "cuda",
1743 &[
1744 ("op", "encode_forward_ict"),
1745 ("decision", "cuda_dispatch"),
1746 ("dispatches", "1"),
1747 ],
1748 );
1749 }
1750 return Ok(true);
1751 }
1752 #[cfg(not(feature = "cuda-runtime"))]
1753 let _ = job;
1754 if j2k_profile::gpu_route_profile_enabled() {
1755 j2k_profile::emit_gpu_route_profile(
1756 "j2k",
1757 "cuda",
1758 &[
1759 ("op", "encode_forward_ict"),
1760 ("decision", "cpu_fallback"),
1761 ("reason", "cuda_unavailable"),
1762 ],
1763 );
1764 }
1765 Ok(false)
1766 }
1767
1768 fn encode_forward_dwt53(
1769 &mut self,
1770 job: J2kForwardDwt53Job<'_>,
1771 ) -> core::result::Result<Option<J2kForwardDwt53Output>, &'static str> {
1772 self.forward_dwt53_attempts = self.forward_dwt53_attempts.saturating_add(1);
1773 if job.num_levels == 0 {
1774 if j2k_profile::gpu_route_profile_enabled() {
1775 j2k_profile::emit_gpu_route_profile(
1776 "j2k",
1777 "cuda",
1778 &[
1779 ("op", "encode_forward_dwt53"),
1780 ("decision", "cpu_fallback"),
1781 ("reason", "zero_levels"),
1782 ],
1783 );
1784 }
1785 return Ok(None);
1786 }
1787 #[cfg(feature = "cuda-runtime")]
1788 if let Some(context) = self.cuda_context()? {
1789 let (output, elapsed_us) = time_cuda_stage(
1790 "j2k.j2k.cuda.encode.dwt53",
1791 &context,
1792 self.collect_profile,
1793 || context.j2k_forward_dwt53(job.samples, job.width, job.height, job.num_levels),
1794 )
1795 .map_err(|_| "CUDA forward 5/3 DWT encode kernel failed")?;
1796 let dispatches = output.execution().kernel_dispatches();
1797 self.forward_dwt53_dispatches =
1798 self.forward_dwt53_dispatches.saturating_add(dispatches);
1799 self.dwt_us = self.dwt_us.saturating_add(elapsed_us);
1800 if j2k_profile::gpu_route_profile_enabled() {
1801 let width_s = job.width.to_string();
1802 let height_s = job.height.to_string();
1803 let levels_s = job.num_levels.to_string();
1804 let dispatches_s = dispatches.to_string();
1805 j2k_profile::emit_gpu_route_profile(
1806 "j2k",
1807 "cuda",
1808 &[
1809 ("op", "encode_forward_dwt53"),
1810 ("decision", "cuda_dispatch"),
1811 ("width", width_s.as_str()),
1812 ("height", height_s.as_str()),
1813 ("levels", levels_s.as_str()),
1814 ("dispatches", dispatches_s.as_str()),
1815 ],
1816 );
1817 }
1818 return Ok(Some(cuda_dwt53_output_to_j2k(&output)?));
1819 }
1820 #[cfg(not(feature = "cuda-runtime"))]
1821 let _ = job;
1822 if j2k_profile::gpu_route_profile_enabled() {
1823 j2k_profile::emit_gpu_route_profile(
1824 "j2k",
1825 "cuda",
1826 &[
1827 ("op", "encode_forward_dwt53"),
1828 ("decision", "cpu_fallback"),
1829 ("reason", "cuda_unavailable"),
1830 ],
1831 );
1832 }
1833 Ok(None)
1834 }
1835
1836 fn encode_forward_dwt97(
1837 &mut self,
1838 job: J2kForwardDwt97Job<'_>,
1839 ) -> core::result::Result<Option<J2kForwardDwt97Output>, &'static str> {
1840 self.forward_dwt97_attempts = self.forward_dwt97_attempts.saturating_add(1);
1841 if job.num_levels == 0 {
1842 if j2k_profile::gpu_route_profile_enabled() {
1843 j2k_profile::emit_gpu_route_profile(
1844 "j2k",
1845 "cuda",
1846 &[
1847 ("op", "encode_forward_dwt97"),
1848 ("decision", "cpu_fallback"),
1849 ("reason", "zero_levels"),
1850 ],
1851 );
1852 }
1853 return Ok(None);
1854 }
1855 #[cfg(feature = "cuda-runtime")]
1856 if let Some(context) = self.cuda_context()? {
1857 let (output, elapsed_us) = time_cuda_stage(
1858 "j2k.j2k.cuda.encode.dwt97",
1859 &context,
1860 self.collect_profile,
1861 || context.j2k_forward_dwt97(job.samples, job.width, job.height, job.num_levels),
1862 )
1863 .map_err(|_| "CUDA forward 9/7 DWT encode kernel failed")?;
1864 let dispatches = output.execution().kernel_dispatches();
1865 self.forward_dwt97_dispatches =
1866 self.forward_dwt97_dispatches.saturating_add(dispatches);
1867 self.dwt_us = self.dwt_us.saturating_add(elapsed_us);
1868 if j2k_profile::gpu_route_profile_enabled() {
1869 let width_s = job.width.to_string();
1870 let height_s = job.height.to_string();
1871 let levels_s = job.num_levels.to_string();
1872 let dispatches_s = dispatches.to_string();
1873 j2k_profile::emit_gpu_route_profile(
1874 "j2k",
1875 "cuda",
1876 &[
1877 ("op", "encode_forward_dwt97"),
1878 ("decision", "cuda_dispatch"),
1879 ("width", width_s.as_str()),
1880 ("height", height_s.as_str()),
1881 ("levels", levels_s.as_str()),
1882 ("dispatches", dispatches_s.as_str()),
1883 ],
1884 );
1885 }
1886 return Ok(Some(cuda_dwt97_output_to_j2k(&output)?));
1887 }
1888 #[cfg(not(feature = "cuda-runtime"))]
1889 let _ = job;
1890 if j2k_profile::gpu_route_profile_enabled() {
1891 j2k_profile::emit_gpu_route_profile(
1892 "j2k",
1893 "cuda",
1894 &[
1895 ("op", "encode_forward_dwt97"),
1896 ("decision", "cpu_fallback"),
1897 ("reason", "cuda_unavailable"),
1898 ],
1899 );
1900 }
1901 Ok(None)
1902 }
1903
1904 fn encode_quantize_subband(
1905 &mut self,
1906 job: J2kQuantizeSubbandJob<'_>,
1907 ) -> core::result::Result<Option<Vec<i32>>, &'static str> {
1908 self.quantize_subband_attempts = self.quantize_subband_attempts.saturating_add(1);
1909 if self.prefer_cpu_quantize_subband {
1910 if j2k_profile::gpu_route_profile_enabled() {
1911 j2k_profile::emit_gpu_route_profile(
1912 "j2k",
1913 "cuda",
1914 &[
1915 ("op", "encode_quantize_subband"),
1916 ("decision", "cpu_fallback"),
1917 ("reason", "prefer_cpu_quantize_subband"),
1918 ],
1919 );
1920 }
1921 let _ = job;
1922 return Ok(None);
1923 }
1924 #[cfg(feature = "cuda-runtime")]
1925 if let Some(context) = self.cuda_context()? {
1926 let (output, elapsed_us) = time_cuda_stage(
1927 "j2k.j2k.cuda.encode.quantize",
1928 &context,
1929 self.collect_profile,
1930 || {
1931 context.j2k_quantize_subband(
1932 job.coefficients,
1933 CudaJ2kQuantizeJob {
1934 step_exponent: job.step_exponent,
1935 step_mantissa: job.step_mantissa,
1936 range_bits: job.range_bits,
1937 reversible: job.reversible,
1938 },
1939 )
1940 },
1941 )
1942 .map_err(|_| "CUDA quantize subband encode kernel failed")?;
1943 let dispatches = output.execution().kernel_dispatches();
1944 self.quantize_subband_dispatches =
1945 self.quantize_subband_dispatches.saturating_add(dispatches);
1946 self.quantize_us = self.quantize_us.saturating_add(elapsed_us);
1947 if j2k_profile::gpu_route_profile_enabled() {
1948 let samples_s = job.coefficients.len().to_string();
1949 let dispatches_s = dispatches.to_string();
1950 j2k_profile::emit_gpu_route_profile(
1951 "j2k",
1952 "cuda",
1953 &[
1954 ("op", "encode_quantize_subband"),
1955 ("decision", "cuda_dispatch"),
1956 ("samples", samples_s.as_str()),
1957 ("dispatches", dispatches_s.as_str()),
1958 ],
1959 );
1960 }
1961 return Ok(Some(output.coefficients().to_vec()));
1962 }
1963 #[cfg(not(feature = "cuda-runtime"))]
1964 let _ = job;
1965 if j2k_profile::gpu_route_profile_enabled() {
1966 j2k_profile::emit_gpu_route_profile(
1967 "j2k",
1968 "cuda",
1969 &[
1970 ("op", "encode_quantize_subband"),
1971 ("decision", "cpu_fallback"),
1972 ("reason", "cuda_unavailable"),
1973 ],
1974 );
1975 }
1976 Ok(None)
1977 }
1978
1979 fn encode_tier1_code_block(
1980 &mut self,
1981 _job: J2kTier1CodeBlockEncodeJob<'_>,
1982 ) -> core::result::Result<Option<EncodedJ2kCodeBlock>, &'static str> {
1983 self.tier1_code_block_attempts = self.tier1_code_block_attempts.saturating_add(1);
1984 if j2k_profile::gpu_route_profile_enabled() {
1985 j2k_profile::emit_gpu_route_profile(
1986 "j2k",
1987 "cuda",
1988 &[
1989 ("op", "encode_tier1_code_block"),
1990 ("decision", "cpu_fallback"),
1991 ("reason", "unsupported_stage"),
1992 ],
1993 );
1994 }
1995 Ok(None)
1996 }
1997
1998 fn encode_ht_code_block(
1999 &mut self,
2000 job: J2kHtCodeBlockEncodeJob<'_>,
2001 ) -> core::result::Result<Option<EncodedHtJ2kCodeBlock>, &'static str> {
2002 self.ht_code_block_attempts = self.ht_code_block_attempts.saturating_add(1);
2003 #[cfg(feature = "cuda-runtime")]
2004 if let Some(context) = self.cuda_context()? {
2005 let resources = self.cuda_encode_resources(&context)?;
2006 let encoded = cuda_encode_ht_code_block(&context, resources.as_ref(), job)?;
2007 let dispatches = encoded.execution().kernel_dispatches();
2008 let ht_encode_us = encoded.stage_timings().ht_encode_us;
2009 let mut outputs = encoded_ht_code_blocks_from_cuda(&encoded);
2010 let output = outputs
2011 .pop()
2012 .ok_or("CUDA HTJ2K code-block encode returned no output")?;
2013 self.ht_code_block_dispatches =
2014 self.ht_code_block_dispatches.saturating_add(dispatches);
2015 if self.collect_profile {
2016 self.ht_encode_us = self.ht_encode_us.saturating_add(ht_encode_us);
2017 }
2018 if j2k_profile::gpu_route_profile_enabled() {
2019 let width_s = job.width.to_string();
2020 let height_s = job.height.to_string();
2021 let dispatches_s = dispatches.to_string();
2022 j2k_profile::emit_gpu_route_profile(
2023 "j2k",
2024 "cuda",
2025 &[
2026 ("op", "encode_ht_code_block"),
2027 ("decision", "cuda_dispatch"),
2028 ("width", width_s.as_str()),
2029 ("height", height_s.as_str()),
2030 ("dispatches", dispatches_s.as_str()),
2031 ],
2032 );
2033 }
2034 return Ok(Some(output));
2035 }
2036 #[cfg(not(feature = "cuda-runtime"))]
2037 let _ = job;
2038 if j2k_profile::gpu_route_profile_enabled() {
2039 j2k_profile::emit_gpu_route_profile(
2040 "j2k",
2041 "cuda",
2042 &[
2043 ("op", "encode_ht_code_block"),
2044 ("decision", "cpu_fallback"),
2045 ("reason", "unsupported_stage"),
2046 ],
2047 );
2048 }
2049 Ok(None)
2050 }
2051
2052 fn encode_ht_code_blocks(
2053 &mut self,
2054 jobs: &[J2kHtCodeBlockEncodeJob<'_>],
2055 ) -> core::result::Result<Option<Vec<EncodedHtJ2kCodeBlock>>, &'static str> {
2056 self.ht_code_block_attempts = self.ht_code_block_attempts.saturating_add(jobs.len());
2057 #[cfg(feature = "cuda-runtime")]
2058 if let Some(context) = self.cuda_context()? {
2059 let resources = self.cuda_encode_resources(&context)?;
2060 let encoded = cuda_encode_ht_code_blocks(&context, resources.as_ref(), jobs)?;
2061 let dispatches = encoded.execution().kernel_dispatches();
2062 let ht_encode_us = encoded.stage_timings().ht_encode_us;
2063 let outputs = encoded_ht_code_blocks_from_cuda(&encoded);
2064 self.ht_code_block_dispatches =
2065 self.ht_code_block_dispatches.saturating_add(dispatches);
2066 if self.collect_profile {
2067 self.ht_encode_us = self.ht_encode_us.saturating_add(ht_encode_us);
2068 }
2069 if j2k_profile::gpu_route_profile_enabled() {
2070 let jobs_s = jobs.len().to_string();
2071 let dispatches_s = dispatches.to_string();
2072 j2k_profile::emit_gpu_route_profile(
2073 "j2k",
2074 "cuda",
2075 &[
2076 ("op", "encode_ht_code_blocks"),
2077 ("decision", "cuda_dispatch"),
2078 ("jobs", jobs_s.as_str()),
2079 ("dispatches", dispatches_s.as_str()),
2080 ],
2081 );
2082 }
2083 return Ok(Some(outputs));
2084 }
2085 #[cfg(not(feature = "cuda-runtime"))]
2086 let _ = jobs;
2087 if j2k_profile::gpu_route_profile_enabled() {
2088 j2k_profile::emit_gpu_route_profile(
2089 "j2k",
2090 "cuda",
2091 &[
2092 ("op", "encode_ht_code_blocks"),
2093 ("decision", "cpu_fallback"),
2094 ("reason", "cuda_unavailable"),
2095 ],
2096 );
2097 }
2098 Ok(None)
2099 }
2100
2101 fn encode_htj2k_tile(
2102 &mut self,
2103 job: J2kHtj2kTileEncodeJob<'_>,
2104 ) -> core::result::Result<Option<Vec<u8>>, &'static str> {
2105 self.htj2k_tile_attempts = self.htj2k_tile_attempts.saturating_add(1);
2106 if self.prefer_cpu_forward_rct || self.prefer_cpu_packetization {
2107 if j2k_profile::gpu_route_profile_enabled() {
2108 j2k_profile::emit_gpu_route_profile(
2109 "j2k",
2110 "cuda",
2111 &[
2112 ("op", "encode_htj2k_tile"),
2113 ("decision", "cpu_fallback"),
2114 ("reason", "prefer_stage_hybrid"),
2115 ],
2116 );
2117 }
2118 let _ = job;
2119 return Ok(None);
2120 }
2121 #[cfg(feature = "cuda-runtime")]
2122 if let Some(context) = self.cuda_context()? {
2123 let resources = self.cuda_encode_resources(&context)?;
2124 let Some(encoded) = cuda_encode_htj2k_tile_body(
2125 &context,
2126 resources.as_ref(),
2127 job,
2128 self.collect_profile,
2129 )?
2130 else {
2131 return Ok(None);
2132 };
2133 self.htj2k_tile_dispatches = self.htj2k_tile_dispatches.saturating_add(1);
2134 self.deinterleave_attempts = self.deinterleave_attempts.saturating_add(1);
2135 self.deinterleave_dispatches = self
2136 .deinterleave_dispatches
2137 .saturating_add(encoded.deinterleave_dispatches);
2138 if job.use_mct {
2139 if job.reversible {
2140 self.forward_rct_attempts = self.forward_rct_attempts.saturating_add(1);
2141 } else {
2142 self.forward_ict_attempts = self.forward_ict_attempts.saturating_add(1);
2143 }
2144 }
2145 self.forward_rct_dispatches = self
2146 .forward_rct_dispatches
2147 .saturating_add(encoded.forward_rct_dispatches);
2148 self.forward_ict_dispatches = self
2149 .forward_ict_dispatches
2150 .saturating_add(encoded.forward_ict_dispatches);
2151 if job.num_decomposition_levels > 0 {
2152 if job.reversible {
2153 self.forward_dwt53_attempts = self
2154 .forward_dwt53_attempts
2155 .saturating_add(usize::from(job.num_components));
2156 } else {
2157 self.forward_dwt97_attempts = self
2158 .forward_dwt97_attempts
2159 .saturating_add(usize::from(job.num_components));
2160 }
2161 }
2162 self.forward_dwt53_dispatches = self
2163 .forward_dwt53_dispatches
2164 .saturating_add(encoded.forward_dwt53_dispatches);
2165 self.forward_dwt97_dispatches = self
2166 .forward_dwt97_dispatches
2167 .saturating_add(encoded.forward_dwt97_dispatches);
2168 self.quantize_subband_attempts = self
2169 .quantize_subband_attempts
2170 .saturating_add(encoded.quantize_jobs);
2171 self.quantize_subband_dispatches = self
2172 .quantize_subband_dispatches
2173 .saturating_add(encoded.quantize_dispatches);
2174 self.ht_code_block_attempts = self
2175 .ht_code_block_attempts
2176 .saturating_add(encoded.ht_code_block_jobs);
2177 self.ht_code_block_dispatches = self
2178 .ht_code_block_dispatches
2179 .saturating_add(encoded.ht_code_block_dispatches);
2180 self.packetization_attempts = self.packetization_attempts.saturating_add(1);
2181 self.packetization_dispatches = self
2182 .packetization_dispatches
2183 .saturating_add(encoded.packetization_dispatches);
2184 if self.collect_profile {
2185 self.deinterleave_us = self
2186 .deinterleave_us
2187 .saturating_add(encoded.timings.deinterleave_us);
2188 self.mct_us = self.mct_us.saturating_add(encoded.timings.mct_us);
2189 self.dwt_us = self.dwt_us.saturating_add(encoded.timings.dwt_us);
2190 self.quantize_us = self.quantize_us.saturating_add(encoded.timings.quantize_us);
2191 self.ht_encode_us = self
2192 .ht_encode_us
2193 .saturating_add(encoded.timings.ht_encode_us);
2194 self.packetize_us = self
2195 .packetize_us
2196 .saturating_add(encoded.timings.packetize_us);
2197 }
2198 if j2k_profile::gpu_route_profile_enabled() {
2199 let components_s = job.num_components.to_string();
2200 let blocks_s = encoded.ht_code_block_jobs.to_string();
2201 j2k_profile::emit_gpu_route_profile(
2202 "j2k",
2203 "cuda",
2204 &[
2205 ("op", "encode_htj2k_tile"),
2206 ("decision", "cuda_dispatch"),
2207 ("components", components_s.as_str()),
2208 ("blocks", blocks_s.as_str()),
2209 ],
2210 );
2211 }
2212 return Ok(Some(encoded.tile_data));
2213 }
2214 #[cfg(not(feature = "cuda-runtime"))]
2215 let _ = job;
2216 if j2k_profile::gpu_route_profile_enabled() {
2217 j2k_profile::emit_gpu_route_profile(
2218 "j2k",
2219 "cuda",
2220 &[
2221 ("op", "encode_htj2k_tile"),
2222 ("decision", "cpu_fallback"),
2223 ("reason", "cuda_unavailable"),
2224 ],
2225 );
2226 }
2227 Ok(None)
2228 }
2229
2230 fn encode_ht_subband(
2231 &mut self,
2232 job: J2kHtSubbandEncodeJob<'_>,
2233 ) -> core::result::Result<Option<Vec<EncodedHtJ2kCodeBlock>>, &'static str> {
2234 let code_block_count = ht_subband_code_block_count(job)?;
2235 self.ht_subband_attempts = self.ht_subband_attempts.saturating_add(1);
2236 self.quantize_subband_attempts = self.quantize_subband_attempts.saturating_add(1);
2237 self.ht_code_block_attempts = self.ht_code_block_attempts.saturating_add(code_block_count);
2238 if self.prefer_cpu_ht_subband {
2239 if j2k_profile::gpu_route_profile_enabled() {
2240 j2k_profile::emit_gpu_route_profile(
2241 "j2k",
2242 "cuda",
2243 &[
2244 ("op", "encode_ht_subband"),
2245 ("decision", "cpu_fallback"),
2246 ("reason", "prefer_cpu_ht_subband"),
2247 ],
2248 );
2249 }
2250 return Ok(None);
2251 }
2252 #[cfg(feature = "cuda-runtime")]
2253 if let Some(context) = self.cuda_context()? {
2254 let resources = self.cuda_encode_resources(&context)?;
2255 let encoded =
2256 cuda_encode_ht_subband(&context, resources.as_ref(), job, self.collect_profile)?;
2257 let quantize_dispatches = encoded.quantize_dispatches;
2258 let encode_dispatches = encoded.encode.execution().kernel_dispatches();
2259 let outputs = encoded_ht_code_blocks_from_cuda(&encoded.encode);
2260 self.ht_subband_dispatches = self.ht_subband_dispatches.saturating_add(1);
2261 self.quantize_subband_dispatches = self
2262 .quantize_subband_dispatches
2263 .saturating_add(quantize_dispatches);
2264 self.ht_code_block_dispatches = self
2265 .ht_code_block_dispatches
2266 .saturating_add(encode_dispatches);
2267 if self.collect_profile {
2268 self.quantize_us = self.quantize_us.saturating_add(encoded.timings.quantize_us);
2269 self.ht_encode_us = self
2270 .ht_encode_us
2271 .saturating_add(encoded.timings.ht_encode_us);
2272 }
2273 if j2k_profile::gpu_route_profile_enabled() {
2274 let width_s = job.width.to_string();
2275 let height_s = job.height.to_string();
2276 let blocks_s = code_block_count.to_string();
2277 let quantize_dispatches_s = quantize_dispatches.to_string();
2278 let encode_dispatches_s = encode_dispatches.to_string();
2279 j2k_profile::emit_gpu_route_profile(
2280 "j2k",
2281 "cuda",
2282 &[
2283 ("op", "encode_ht_subband"),
2284 ("decision", "cuda_dispatch"),
2285 ("width", width_s.as_str()),
2286 ("height", height_s.as_str()),
2287 ("blocks", blocks_s.as_str()),
2288 ("quantize_dispatches", quantize_dispatches_s.as_str()),
2289 ("encode_dispatches", encode_dispatches_s.as_str()),
2290 ],
2291 );
2292 }
2293 return Ok(Some(outputs));
2294 }
2295 #[cfg(not(feature = "cuda-runtime"))]
2296 let _ = job;
2297 if j2k_profile::gpu_route_profile_enabled() {
2298 j2k_profile::emit_gpu_route_profile(
2299 "j2k",
2300 "cuda",
2301 &[
2302 ("op", "encode_ht_subband"),
2303 ("decision", "cpu_fallback"),
2304 ("reason", "cuda_unavailable"),
2305 ],
2306 );
2307 }
2308 Ok(None)
2309 }
2310
2311 fn encode_packetization(
2312 &mut self,
2313 job: J2kPacketizationEncodeJob<'_>,
2314 ) -> core::result::Result<Option<Vec<u8>>, &'static str> {
2315 self.packetization_attempts = self.packetization_attempts.saturating_add(1);
2316 if self.prefer_cpu_packetization {
2317 if j2k_profile::gpu_route_profile_enabled() {
2318 j2k_profile::emit_gpu_route_profile(
2319 "j2k",
2320 "cuda",
2321 &[
2322 ("op", "encode_packetization"),
2323 ("decision", "cpu_fallback"),
2324 ("reason", "prefer_cpu_packetization"),
2325 ],
2326 );
2327 }
2328 let _ = job;
2329 return Ok(None);
2330 }
2331 let plan = match flatten_cuda_htj2k_packetization_job(job) {
2332 Ok(plan) => plan,
2333 Err(reason) => {
2334 if j2k_profile::gpu_route_profile_enabled() {
2335 j2k_profile::emit_gpu_route_profile(
2336 "j2k",
2337 "cuda",
2338 &[
2339 ("op", "encode_packetization"),
2340 ("decision", "cpu_fallback"),
2341 ("reason", reason),
2342 ],
2343 );
2344 }
2345 return Ok(None);
2346 }
2347 };
2348 #[cfg(feature = "cuda-runtime")]
2349 if let Some(context) = self.cuda_context()? {
2350 let packets = cuda_packetization_packets(&plan);
2351 let subbands = cuda_packetization_subbands(&plan);
2352 let blocks = cuda_packetization_blocks(&plan);
2353 let tag_states = cuda_packetization_tag_states(&plan);
2354 let tag_nodes = cuda_packetization_tag_nodes(&plan);
2355 let packetized = context
2356 .packetize_htj2k_cleanup_packets_with_tag_state(
2357 &plan.payload,
2358 &packets,
2359 &subbands,
2360 &blocks,
2361 &tag_states,
2362 &tag_nodes,
2363 )
2364 .map_err(|_| "CUDA HTJ2K packetization kernel failed")?;
2365 let dispatches = packetized.execution().kernel_dispatches();
2366 let packetize_us = packetized.stage_timings().packetize_us;
2367 self.packetization_dispatches =
2368 self.packetization_dispatches.saturating_add(dispatches);
2369 if self.collect_profile {
2370 self.packetize_us = self.packetize_us.saturating_add(packetize_us);
2371 }
2372 if j2k_profile::gpu_route_profile_enabled() {
2373 let packets_s = packets.len().to_string();
2374 let dispatches_s = dispatches.to_string();
2375 j2k_profile::emit_gpu_route_profile(
2376 "j2k",
2377 "cuda",
2378 &[
2379 ("op", "encode_packetization"),
2380 ("decision", "cuda_dispatch"),
2381 ("packets", packets_s.as_str()),
2382 ("dispatches", dispatches_s.as_str()),
2383 ],
2384 );
2385 }
2386 return Ok(Some(packetized.data().to_vec()));
2387 }
2388 #[cfg(not(feature = "cuda-runtime"))]
2389 let _ = plan;
2390 if j2k_profile::gpu_route_profile_enabled() {
2391 j2k_profile::emit_gpu_route_profile(
2392 "j2k",
2393 "cuda",
2394 &[
2395 ("op", "encode_packetization"),
2396 ("decision", "cpu_fallback"),
2397 ("reason", "unsupported_stage"),
2398 ],
2399 );
2400 }
2401 Ok(None)
2402 }
2403}
2404
/// Converts the plan's packet descriptors into the CUDA runtime packet records.
///
/// Each field is copied across unchanged, one output record per plan packet, in order.
#[cfg(feature = "cuda-runtime")]
fn cuda_packetization_packets(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationPacket> {
    let mut converted = Vec::with_capacity(plan.packets.len());
    for source in plan.packets.iter() {
        converted.push(CudaHtj2kPacketizationPacket {
            block_start: source.block_start,
            block_count: source.block_count,
            subband_start: source.subband_start,
            subband_count: source.subband_count,
            output_capacity: source.output_capacity,
            layer: source.layer,
        });
    }
    converted
}
2421
/// Converts the plan's subband descriptors into the CUDA runtime subband records.
///
/// Each field is copied across unchanged, one output record per plan subband, in order.
#[cfg(feature = "cuda-runtime")]
fn cuda_packetization_subbands(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationSubband> {
    let mut converted = Vec::with_capacity(plan.subbands.len());
    for source in plan.subbands.iter() {
        converted.push(CudaHtj2kPacketizationSubband {
            block_start: source.block_start,
            block_count: source.block_count,
            num_cbs_x: source.num_cbs_x,
            num_cbs_y: source.num_cbs_y,
        });
    }
    converted
}
2436
/// Converts the plan's code-block descriptors into the CUDA runtime block records.
///
/// Each field is copied across unchanged, one output record per plan block, in order.
#[cfg(feature = "cuda-runtime")]
fn cuda_packetization_blocks(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationBlock> {
    let mut converted = Vec::with_capacity(plan.blocks.len());
    for source in plan.blocks.iter() {
        converted.push(CudaHtj2kPacketizationBlock {
            data_offset: source.data_offset,
            data_len: source.data_len,
            cleanup_length: source.cleanup_length,
            refinement_length: source.refinement_length,
            num_coding_passes: source.num_coding_passes,
            num_zero_bitplanes: source.num_zero_bitplanes,
            l_block: source.l_block,
            previously_included: source.previously_included,
            inclusion_layer: source.inclusion_layer,
        });
    }
    converted
}
2456
/// Converts the plan's per-subband tag-tree state into the CUDA runtime records.
///
/// The node offsets and count are copied across unchanged; `reserved0` is always
/// written as zero.
#[cfg(feature = "cuda-runtime")]
fn cuda_packetization_tag_states(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationSubbandTagState> {
    let mut converted = Vec::with_capacity(plan.tag_states.len());
    for source in plan.tag_states.iter() {
        converted.push(CudaHtj2kPacketizationSubbandTagState {
            inclusion_node_start: source.inclusion_node_start,
            zero_bitplane_node_start: source.zero_bitplane_node_start,
            node_count: source.node_count,
            reserved0: 0,
        });
    }
    converted
}
2471
/// Converts the plan's tag-tree node states into the CUDA runtime node records.
///
/// `current` and `known` are copied across unchanged, one record per plan node, in order.
#[cfg(feature = "cuda-runtime")]
fn cuda_packetization_tag_nodes(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationTagNodeState> {
    let mut converted = Vec::with_capacity(plan.tag_nodes.len());
    for source in plan.tag_nodes.iter() {
        converted.push(CudaHtj2kPacketizationTagNodeState {
            current: source.current,
            known: source.known,
        });
    }
    converted
}
2484
/// Encodes a single HT code-block through the CUDA code-block encoder.
///
/// The job is submitted as a one-element batch whose coefficients start at offset 0.
///
/// # Errors
///
/// Returns a static message when `width * height` overflows `usize`, when that
/// product does not equal the number of supplied coefficients, or when the CUDA
/// encode call fails.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_ht_code_block(
    context: &CudaContext,
    resources: &CudaHtj2kEncodeResources,
    job: J2kHtCodeBlockEncodeJob<'_>,
) -> core::result::Result<j2k_cuda_runtime::CudaHtj2kEncodedCodeBlocks, &'static str> {
    let Some(expected_len) = (job.width as usize).checked_mul(job.height as usize) else {
        return Err("CUDA HTJ2K code-block encode job is too large");
    };
    if job.coefficients.len() != expected_len {
        return Err("CUDA HTJ2K code-block encode job has invalid coefficient length");
    }

    let single_job = CudaHtj2kEncodeCodeBlockJob {
        coefficient_offset: 0,
        width: job.width,
        height: job.height,
        total_bitplanes: job.total_bitplanes,
        target_coding_passes: job.target_coding_passes,
    };
    let batch = [single_job];
    let outcome = context.encode_htj2k_codeblocks_with_resources(job.coefficients, &batch, resources);
    outcome.map_err(|_| "CUDA HTJ2K code-block encode kernel failed")
}
2508
/// Encodes a batch of HT code-blocks with one CUDA encoder call.
///
/// All jobs are validated first. Their coefficients are then concatenated into one
/// buffer, and each CUDA job records the `u32` offset at which its coefficients start.
///
/// # Errors
///
/// Returns a static message when a job's `width * height` overflows, when it does not
/// match the job's coefficient count, when the running total overflows, when an
/// offset does not fit in `u32`, or when the CUDA encode call fails.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_ht_code_blocks(
    context: &CudaContext,
    resources: &CudaHtj2kEncodeResources,
    jobs: &[J2kHtCodeBlockEncodeJob<'_>],
) -> core::result::Result<j2k_cuda_runtime::CudaHtj2kEncodedCodeBlocks, &'static str> {
    // Pass 1: validate every job and size the packed coefficient buffer.
    let mut total_coefficients = 0usize;
    for job in jobs {
        let Some(block_len) = (job.width as usize).checked_mul(job.height as usize) else {
            return Err("CUDA HTJ2K code-block batch is too large");
        };
        if block_len != job.coefficients.len() {
            return Err("CUDA HTJ2K code-block encode job has invalid coefficient length");
        }
        let Some(running_total) = total_coefficients.checked_add(block_len) else {
            return Err("CUDA HTJ2K code-block batch is too large");
        };
        total_coefficients = running_total;
    }

    // Pass 2: pack coefficients back to back and record where each block starts.
    let mut packed = Vec::with_capacity(total_coefficients);
    let mut device_jobs = Vec::with_capacity(jobs.len());
    for job in jobs {
        let Ok(coefficient_offset) = u32::try_from(packed.len()) else {
            return Err("CUDA HTJ2K code-block batch is too large");
        };
        packed.extend_from_slice(job.coefficients);
        device_jobs.push(CudaHtj2kEncodeCodeBlockJob {
            coefficient_offset,
            width: job.width,
            height: job.height,
            total_bitplanes: job.total_bitplanes,
            target_coding_passes: job.target_coding_passes,
        });
    }

    let outcome = context.encode_htj2k_codeblocks_with_resources(&packed, &device_jobs, resources);
    outcome.map_err(|_| "CUDA HTJ2K code-block batch encode kernel failed")
}
2544
/// Result of encoding one HTJ2K tile body on the CUDA device.
///
/// Carries the packetized bytes together with the per-stage job and kernel-dispatch
/// counts and the stage timings gathered while the tile was encoded.
#[cfg(feature = "cuda-runtime")]
struct CudaEncodedHtj2kTile {
    /// Packetized tile bytes handed back to the caller.
    tile_data: Vec<u8>,
    /// Kernel dispatches reported by the deinterleave stage.
    deinterleave_dispatches: usize,
    /// Kernel dispatches reported by the forward RCT (zero when it did not run).
    forward_rct_dispatches: usize,
    /// Kernel dispatches reported by the forward ICT (zero when it did not run).
    forward_ict_dispatches: usize,
    /// Kernel dispatches accumulated over all 5/3 DWT component transforms.
    forward_dwt53_dispatches: usize,
    /// Kernel dispatches accumulated over all 9/7 DWT component transforms.
    forward_dwt97_dispatches: usize,
    /// Number of subband quantize jobs that were issued.
    quantize_jobs: usize,
    /// Kernel dispatches accumulated over all quantize jobs.
    quantize_dispatches: usize,
    /// Kernel dispatches accumulated over all code-block encode calls.
    ht_code_block_dispatches: usize,
    /// Number of code-block region jobs that were encoded.
    ht_code_block_jobs: usize,
    /// Kernel dispatches reported by tile packetization.
    packetization_dispatches: usize,
    /// Accumulated per-stage timings.
    timings: CudaEncodeStageTimings,
}
2560
/// Running counters threaded through the tile encode helpers.
///
/// Helpers add to these fields as stages run; the totals are copied into
/// [`CudaEncodedHtj2kTile`] once the tile has been packetized.
#[cfg(feature = "cuda-runtime")]
#[derive(Default)]
struct CudaHtj2kTileEncodeStats {
    /// Passed to the stage timer for the quantize stage.
    collect_profile: bool,
    /// Kernel dispatches reported by the deinterleave stage.
    deinterleave_dispatches: usize,
    /// Kernel dispatches reported by the forward RCT.
    forward_rct_dispatches: usize,
    /// Kernel dispatches reported by the forward ICT.
    forward_ict_dispatches: usize,
    /// Kernel dispatches accumulated over the 5/3 DWT component transforms.
    forward_dwt53_dispatches: usize,
    /// Kernel dispatches accumulated over the 9/7 DWT component transforms.
    forward_dwt97_dispatches: usize,
    /// Number of subband quantize jobs issued so far.
    quantize_jobs: usize,
    /// Kernel dispatches accumulated over the quantize jobs.
    quantize_dispatches: usize,
    /// Kernel dispatches accumulated over the code-block encode calls.
    ht_code_block_dispatches: usize,
    /// Number of code-block region jobs encoded so far.
    ht_code_block_jobs: usize,
    /// Accumulated per-stage timings.
    timings: CudaEncodeStageTimings,
}
2576
/// Encoded subbands belonging to one resolution of one component.
///
/// The tile encoder builds this with a single LL subband for the lowest resolution and
/// with HL, LH, HH (in that order) for each higher resolution.
#[cfg(feature = "cuda-runtime")]
struct CudaEncodedHtj2kResolution {
    /// Encoded subbands, in the order they were produced.
    subbands: Vec<CudaEncodedHtj2kSubband>,
}
2581
/// Encoded code-blocks of one subband plus its code-block grid dimensions.
#[cfg(feature = "cuda-runtime")]
struct CudaEncodedHtj2kSubband {
    /// Encoded code-blocks converted from the CUDA encoder output.
    code_blocks: Vec<EncodedHtJ2kCodeBlock>,
    /// Number of code-block columns (zero for an empty subband).
    num_cbs_x: u32,
    /// Number of code-block rows (zero for an empty subband).
    num_cbs_y: u32,
}
2588
/// Rectangle inside a device buffer that holds one subband.
///
/// The fields are forwarded unchanged to the CUDA subband-region quantize job.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy)]
struct CudaTileSubbandRegion {
    /// Horizontal origin of the region.
    x0: u32,
    /// Vertical origin of the region.
    y0: u32,
    /// Region width; a zero width marks the subband as empty.
    width: u32,
    /// Region height; a zero height marks the subband as empty.
    height: u32,
    /// Row stride of the source buffer (presumably in samples — confirm against the
    /// CUDA runtime).
    stride: u32,
}
2598
/// Orientation of a wavelet subband, used to select the quantizer's range bits.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy)]
enum CudaTileSubbandKind {
    /// LL subband; adds no range bits.
    LowLow,
    /// HL subband; adds one range bit.
    HighLow,
    /// LH subband; adds one range bit.
    LowHigh,
    /// HH subband; adds two range bits.
    HighHigh,
}
2607
/// Borrowed CUDA handles shared by the tile encode helpers.
///
/// Bundles the context and the encode resources so they travel as one `Copy` value.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy)]
struct CudaHtj2kEncodeRuntime<'a> {
    /// CUDA context on which the quantize and encode calls are issued.
    context: &'a CudaContext,
    /// Encode resources handed to the code-block region encoder.
    resources: &'a CudaHtj2kEncodeResources,
}
2614
/// Encodes an HTJ2K tile whose interleaved pixels are supplied by the job.
///
/// Validates the job, runs the timed deinterleave stage that leaves the components
/// resident on the device, and then delegates to the shared resident-components
/// encoder.
///
/// # Errors
///
/// Returns a static message when validation fails, when `width * height` overflows
/// `usize`, when the deinterleave stage fails, or when the resident encoder fails.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_htj2k_tile_body(
    context: &CudaContext,
    encode_resources: &CudaHtj2kEncodeResources,
    job: J2kHtj2kTileEncodeJob<'_>,
    collect_profile: bool,
) -> core::result::Result<Option<CudaEncodedHtj2kTile>, &'static str> {
    validate_cuda_htj2k_tile_job(job)?;

    let Some(pixel_count) = (job.width as usize).checked_mul(job.height as usize) else {
        return Err("CUDA HTJ2K tile dimensions are too large");
    };

    let deinterleave = || {
        context.j2k_deinterleave_to_f32_resident(
            job.pixels,
            pixel_count,
            job.num_components,
            job.bit_depth,
            job.signed,
        )
    };
    let (resident, deinterleave_us) = time_cuda_stage(
        "j2k.htj2k.encode.tile.deinterleave",
        context,
        collect_profile,
        deinterleave,
    )
    .map_err(|_| "CUDA HTJ2K tile deinterleave failed")?;

    cuda_encode_htj2k_resident_components_body(
        context,
        encode_resources,
        job,
        resident,
        deinterleave_us,
        collect_profile,
    )
}
2650
/// Checks that a tile encode job is one the CUDA tile path can handle.
///
/// The checks run in this order: no component subsampling other than `(1, 1)`; a
/// component count of 1, 3 or 4; at least three components when MCT is requested;
/// non-zero code-block dimensions; and exactly `1 + 3 * num_decomposition_levels`
/// quantization steps.
///
/// # Errors
///
/// Returns the static message of the first check that fails.
#[cfg(feature = "cuda-runtime")]
fn validate_cuda_htj2k_tile_job(
    job: J2kHtj2kTileEncodeJob<'_>,
) -> core::result::Result<(), &'static str> {
    let all_full_resolution = job
        .component_sampling
        .iter()
        .all(|sampling| *sampling == (1, 1));
    if !all_full_resolution {
        return Err("CUDA HTJ2K tile encode does not support component subsampling != (1, 1)");
    }

    match job.num_components {
        1 | 3 | 4 => {}
        _ => return Err("CUDA HTJ2K tile encode supports 1, 3, or 4 components"),
    }

    if job.use_mct && job.num_components < 3 {
        return Err("CUDA HTJ2K tile encode requires at least three components for MCT");
    }

    let code_block_is_empty = job.code_block_width == 0 || job.code_block_height == 0;
    if code_block_is_empty {
        return Err("CUDA HTJ2K tile encode job has invalid code-block dimensions");
    }

    // One step for LL plus three (HL, LH, HH) per decomposition level.
    let detail_steps = usize::from(job.num_decomposition_levels).saturating_mul(3);
    let Some(expected_steps) = 1usize.checked_add(detail_steps) else {
        return Err("CUDA HTJ2K tile quantization step count overflow");
    };
    if job.quantization_steps.len() != expected_steps {
        return Err("CUDA HTJ2K tile quantization step count mismatch");
    }

    Ok(())
}
2689
/// Encodes an HTJ2K tile whose pixels already live in a CUDA device buffer.
///
/// After the common job validation it checks that the job matches the device tile:
/// same output dimensions, no input padding, and a matching unsigned sample format.
/// It then runs the timed strided deinterleave stage and delegates to the shared
/// resident-components encoder.
///
/// # Errors
///
/// Returns a static message when validation fails, when the tile format cannot be
/// mapped, when the job and tile disagree, when the deinterleave stage fails, or when
/// the resident encoder fails.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_htj2k_device_tile_body(
    context: &CudaContext,
    encode_resources: &CudaHtj2kEncodeResources,
    tile: CudaLosslessEncodeTile<'_>,
    job: J2kHtj2kTileEncodeJob<'_>,
    collect_profile: bool,
) -> core::result::Result<Option<CudaEncodedHtj2kTile>, &'static str> {
    validate_cuda_htj2k_tile_job(job)?;
    let format = cuda_encode_format(tile.format).map_err(|_| "CUDA HTJ2K tile format failed")?;

    let dimensions_match = job.width == tile.output_width && job.height == tile.output_height;
    if !dimensions_match {
        return Err("CUDA HTJ2K tile encode job dimensions do not match CUDA tile");
    }

    let unpadded = tile.width == tile.output_width && tile.height == tile.output_height;
    if !unpadded {
        return Err("CUDA HTJ2K tile encode does not support input padding");
    }

    // Signed jobs are rejected here along with component/bit-depth mismatches.
    let sample_format_matches = job.num_components == format.components
        && job.bit_depth == format.bit_depth
        && !job.signed;
    if !sample_format_matches {
        return Err("CUDA HTJ2K tile encode job sample format does not match CUDA tile");
    }

    let deinterleave = || {
        context.j2k_deinterleave_strided_to_f32_resident(CudaJ2kStridedInterleavedPixels {
            buffer: tile.buffer,
            byte_offset: tile.byte_offset,
            width: tile.width,
            height: tile.height,
            pitch_bytes: tile.pitch_bytes,
            num_components: job.num_components,
            bit_depth: job.bit_depth,
            signed: job.signed,
        })
    };
    let (resident, deinterleave_us) = time_cuda_stage(
        "j2k.htj2k.encode.tile.device_deinterleave",
        context,
        collect_profile,
        deinterleave,
    )
    .map_err(|_| "CUDA HTJ2K tile device deinterleave failed")?;

    cuda_encode_htj2k_resident_components_body(
        context,
        encode_resources,
        job,
        resident,
        deinterleave_us,
        collect_profile,
    )
}
2736
/// Encodes a tile from device-resident components through to packetized bytes.
///
/// Stages, in order:
/// 1. optional multi-component transform (RCT when `job.reversible`, otherwise ICT);
/// 2. per component, either a single LL subband covering the whole component (no
///    decomposition levels) or a forward DWT (5/3 when reversible, 9/7 otherwise)
///    followed by per-subband quantize and code-block encode;
/// 3. interleaving of the per-component resolutions into packet order;
/// 4. tile packetization.
///
/// `deinterleave_us` seeds the timing record, and the dispatch count of `components`
/// seeds the deinterleave dispatch counter.
///
/// # Errors
///
/// Returns the static message of the first failing stage or helper.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_htj2k_resident_components_body(
    context: &CudaContext,
    encode_resources: &CudaHtj2kEncodeResources,
    job: J2kHtj2kTileEncodeJob<'_>,
    mut components: CudaJ2kResidentComponents,
    deinterleave_us: u128,
    collect_profile: bool,
) -> core::result::Result<Option<CudaEncodedHtj2kTile>, &'static str> {
    let mut stats = CudaHtj2kTileEncodeStats {
        collect_profile,
        deinterleave_dispatches: components.execution().kernel_dispatches(),
        timings: CudaEncodeStageTimings {
            deinterleave_us,
            ..CudaEncodeStageTimings::default()
        },
        ..CudaHtj2kTileEncodeStats::default()
    };
    let runtime = CudaHtj2kEncodeRuntime {
        context,
        resources: encode_resources,
    };

    // Stage 1: multi-component transform, applied in place to the resident components.
    if job.use_mct {
        if job.reversible {
            let (execution, elapsed_us) = time_cuda_stage(
                "j2k.htj2k.encode.tile.rct",
                context,
                collect_profile,
                || context.j2k_forward_rct_resident(&mut components),
            )
            .map_err(|_| "CUDA HTJ2K tile RCT failed")?;
            stats.timings.mct_us = stats.timings.mct_us.saturating_add(elapsed_us);
            stats.forward_rct_dispatches = execution.kernel_dispatches();
        } else {
            let (execution, elapsed_us) = time_cuda_stage(
                "j2k.htj2k.encode.tile.ict",
                context,
                collect_profile,
                || context.j2k_forward_ict_resident(&mut components),
            )
            .map_err(|_| "CUDA HTJ2K tile ICT failed")?;
            stats.timings.mct_us = stats.timings.mct_us.saturating_add(elapsed_us);
            stats.forward_ict_dispatches = execution.kernel_dispatches();
        }
    }

    // Stage 2: per-component subband encoding.
    let mut per_component = Vec::with_capacity(usize::from(job.num_components));
    for component in 0..job.num_components {
        let resolutions = if job.num_decomposition_levels == 0 {
            // No wavelet transform: the component is one LL subband located
            // `component * height` rows into the resident buffer.
            let y0 = u32::from(component)
                .checked_mul(job.height)
                .ok_or("CUDA HTJ2K tile component offset overflow")?;
            let whole_component = cuda_encode_tile_subband_region(
                runtime,
                components.buffer(),
                CudaTileSubbandRegion {
                    x0: 0,
                    y0,
                    width: job.width,
                    height: job.height,
                    stride: job.width,
                },
                job.quantization_steps[0],
                job,
                CudaTileSubbandKind::LowLow,
                &mut stats,
            )?;
            vec![CudaEncodedHtj2kResolution {
                subbands: vec![whole_component],
            }]
        } else if job.reversible {
            let (transformed, elapsed_us) = time_cuda_stage(
                "j2k.htj2k.encode.tile.dwt53",
                context,
                collect_profile,
                || {
                    context.j2k_forward_dwt53_resident_component(
                        &components,
                        component,
                        job.width,
                        job.height,
                        job.num_decomposition_levels,
                    )
                },
            )
            .map_err(|_| "CUDA HTJ2K tile DWT 5/3 failed")?;
            stats.forward_dwt53_dispatches = stats
                .forward_dwt53_dispatches
                .saturating_add(transformed.execution().kernel_dispatches());
            stats.timings.dwt_us = stats.timings.dwt_us.saturating_add(elapsed_us);
            cuda_encode_dwt_component_packets(
                runtime,
                job,
                transformed.buffer(),
                transformed.levels(),
                transformed.ll_dimensions(),
                &mut stats,
            )?
        } else {
            let (transformed, elapsed_us) = time_cuda_stage(
                "j2k.htj2k.encode.tile.dwt97",
                context,
                collect_profile,
                || {
                    context.j2k_forward_dwt97_resident_component(
                        &components,
                        component,
                        job.width,
                        job.height,
                        job.num_decomposition_levels,
                    )
                },
            )
            .map_err(|_| "CUDA HTJ2K tile DWT 9/7 failed")?;
            stats.forward_dwt97_dispatches = stats
                .forward_dwt97_dispatches
                .saturating_add(transformed.execution().kernel_dispatches());
            stats.timings.dwt_us = stats.timings.dwt_us.saturating_add(elapsed_us);
            cuda_encode_dwt_component_packets(
                runtime,
                job,
                transformed.buffer(),
                transformed.levels(),
                transformed.ll_dimensions(),
                &mut stats,
            )?
        };
        per_component.push(resolutions);
    }

    // Stages 3 and 4: order the resolutions, then packetize.
    let ordered = cuda_order_component_resolution_packets(per_component, job.num_components)?;
    let (tile_data, packetization_dispatches, packetize_us) =
        cuda_packetize_tile_body(context, job, &ordered, stats.ht_code_block_jobs)?;
    stats.timings.packetize_us = stats.timings.packetize_us.saturating_add(packetize_us);

    let CudaHtj2kTileEncodeStats {
        collect_profile: _,
        deinterleave_dispatches,
        forward_rct_dispatches,
        forward_ict_dispatches,
        forward_dwt53_dispatches,
        forward_dwt97_dispatches,
        quantize_jobs,
        quantize_dispatches,
        ht_code_block_dispatches,
        ht_code_block_jobs,
        timings,
    } = stats;
    Ok(Some(CudaEncodedHtj2kTile {
        tile_data,
        deinterleave_dispatches,
        forward_rct_dispatches,
        forward_ict_dispatches,
        forward_dwt53_dispatches,
        forward_dwt97_dispatches,
        quantize_jobs,
        quantize_dispatches,
        ht_code_block_dispatches,
        ht_code_block_jobs,
        packetization_dispatches,
        timings,
    }))
}
2894
/// Quantizes and encodes every subband of one wavelet-transformed component.
///
/// The first resolution holds only the LL subband, taken from the top-left
/// `ll_dimensions` rectangle of `transformed`. Then, walking `levels` in reverse, one
/// resolution per level is produced with the HL, LH and HH subbands in that order.
/// Quantization step 0 belongs to LL; the level at reverse position `i` uses steps
/// `1 + 3 * i`, `+1` and `+2`. All regions share the width of the first level (or the
/// LL width when `levels` is empty) as their stride.
///
/// # Errors
///
/// Returns a static message when `levels.len()` differs from
/// `job.num_decomposition_levels`, when the step index overflows, or when a subband
/// fails to encode.
///
/// # Panics
///
/// Indexes `job.quantization_steps` directly, so it panics if the job carries fewer
/// than `1 + 3 * levels.len()` steps.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_dwt_component_packets(
    runtime: CudaHtj2kEncodeRuntime<'_>,
    job: J2kHtj2kTileEncodeJob<'_>,
    transformed: &CudaDeviceBuffer,
    levels: &[CudaDwt53LevelShape],
    ll_dimensions: (u32, u32),
    stats: &mut CudaHtj2kTileEncodeStats,
) -> core::result::Result<Vec<CudaEncodedHtj2kResolution>, &'static str> {
    if levels.len() != usize::from(job.num_decomposition_levels) {
        return Err("CUDA HTJ2K tile DWT level count mismatch");
    }

    let (ll_width, ll_height) = ll_dimensions;
    let stride = match levels.first() {
        Some(outermost) => outermost.width,
        None => ll_width,
    };

    // Every subband goes through the same call; only region, step and kind vary.
    let mut encode_subband = |region: CudaTileSubbandRegion,
                              step: (u16, u16),
                              kind: CudaTileSubbandKind| {
        cuda_encode_tile_subband_region(runtime, transformed, region, step, job, kind, &mut *stats)
    };

    let mut resolutions = Vec::with_capacity(levels.len().saturating_add(1));

    let low_low = encode_subband(
        CudaTileSubbandRegion {
            x0: 0,
            y0: 0,
            width: ll_width,
            height: ll_height,
            stride,
        },
        job.quantization_steps[0],
        CudaTileSubbandKind::LowLow,
    )?;
    resolutions.push(CudaEncodedHtj2kResolution {
        subbands: vec![low_low],
    });

    for (position, level) in levels.iter().rev().enumerate() {
        let first_step = 1usize
            .checked_add(position.saturating_mul(3))
            .ok_or("CUDA HTJ2K tile quantization step index overflow")?;

        let high_low = encode_subband(
            CudaTileSubbandRegion {
                x0: level.low_width,
                y0: 0,
                width: level.high_width,
                height: level.low_height,
                stride,
            },
            job.quantization_steps[first_step],
            CudaTileSubbandKind::HighLow,
        )?;
        let low_high = encode_subband(
            CudaTileSubbandRegion {
                x0: 0,
                y0: level.low_height,
                width: level.low_width,
                height: level.high_height,
                stride,
            },
            job.quantization_steps[first_step + 1],
            CudaTileSubbandKind::LowHigh,
        )?;
        let high_high = encode_subband(
            CudaTileSubbandRegion {
                x0: level.low_width,
                y0: level.low_height,
                width: level.high_width,
                height: level.high_height,
                stride,
            },
            job.quantization_steps[first_step + 2],
            CudaTileSubbandKind::HighHigh,
        )?;

        resolutions.push(CudaEncodedHtj2kResolution {
            subbands: vec![high_low, low_high, high_high],
        });
    }

    Ok(resolutions)
}
2986
/// Quantizes one subband region of a device buffer and HT-encodes its code-blocks.
///
/// An empty region (zero width or height) short-circuits to an empty subband with a
/// `0 x 0` code-block grid and leaves `stats` untouched. Otherwise the region is
/// quantized under the stage timer, split into code-block region jobs, and encoded.
/// `stats` receives the quantize job/dispatch counts, the code-block job/dispatch
/// counts, and both stage timings.
///
/// `quantization_step` is an `(exponent, mantissa)` pair. The code-block bit-plane
/// count is `guard_bits + exponent - 1`, computed with saturating arithmetic.
///
/// # Errors
///
/// Returns a static message when the exponent does not fit in `u8`, when the quantize
/// stage fails, when the region jobs cannot be built, or when the code-block encode
/// call fails.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_tile_subband_region(
    runtime: CudaHtj2kEncodeRuntime<'_>,
    source: &CudaDeviceBuffer,
    region: CudaTileSubbandRegion,
    quantization_step: (u16, u16),
    job: J2kHtj2kTileEncodeJob<'_>,
    subband_kind: CudaTileSubbandKind,
    stats: &mut CudaHtj2kTileEncodeStats,
) -> core::result::Result<CudaEncodedHtj2kSubband, &'static str> {
    if region.width == 0 || region.height == 0 {
        return Ok(CudaEncodedHtj2kSubband {
            code_blocks: Vec::new(),
            num_cbs_x: 0,
            num_cbs_y: 0,
        });
    }

    let (step_exponent, step_mantissa) = quantization_step;
    let step_exponent_u8 = u8::try_from(step_exponent)
        .map_err(|_| "CUDA HTJ2K tile quantization exponent exceeds u8")?;
    let total_bitplanes = job
        .guard_bits
        .saturating_add(step_exponent_u8)
        .saturating_sub(1);

    let (quantized, quantize_us) = time_cuda_stage(
        "j2k.htj2k.encode.tile.quantize",
        runtime.context,
        stats.collect_profile,
        || {
            runtime.context.j2k_quantize_subband_region_resident(
                source,
                CudaJ2kQuantizeSubbandRegionJob {
                    x0: region.x0,
                    y0: region.y0,
                    width: region.width,
                    height: region.height,
                    stride: region.stride,
                    quantization: CudaJ2kQuantizeJob {
                        step_exponent,
                        step_mantissa,
                        range_bits: cuda_tile_subband_range_bits(job.bit_depth, subband_kind),
                        reversible: job.reversible,
                    },
                },
            )
        },
    )
    .map_err(|_| "CUDA HTJ2K tile quantize failed")?;
    stats.quantize_jobs = stats.quantize_jobs.saturating_add(1);
    stats.quantize_dispatches = stats
        .quantize_dispatches
        .saturating_add(quantized.execution().kernel_dispatches());
    stats.timings.quantize_us = stats.timings.quantize_us.saturating_add(quantize_us);

    let region_jobs = cuda_ht_region_jobs(
        region.width,
        region.height,
        job.code_block_width,
        job.code_block_height,
        total_bitplanes,
    )?;
    stats.ht_code_block_jobs = stats.ht_code_block_jobs.saturating_add(region_jobs.len());

    // The job list is passed by reference; the source text had this borrow corrupted
    // into a `®` character.
    let encoded = runtime
        .context
        .encode_htj2k_codeblock_regions_resident_with_resources(
            quantized.buffer(),
            quantized.coefficient_count(),
            &region_jobs,
            runtime.resources,
        )
        .map_err(|_| "CUDA HTJ2K tile code-block encode failed")?;
    stats.ht_code_block_dispatches = stats
        .ht_code_block_dispatches
        .saturating_add(encoded.execution().kernel_dispatches());
    stats.timings.ht_encode_us = stats
        .timings
        .ht_encode_us
        .saturating_add(encoded.stage_timings().ht_encode_us);

    Ok(CudaEncodedHtj2kSubband {
        code_blocks: encoded_ht_code_blocks_from_cuda(&encoded),
        num_cbs_x: region.width.div_ceil(job.code_block_width),
        num_cbs_y: region.height.div_ceil(job.code_block_height),
    })
}
3073
/// Returns the quantizer range bits for a subband: `bit_depth` plus a per-kind gain.
///
/// The gain is 0 for LL, 1 for HL and LH, and 2 for HH; the sum saturates at `u8::MAX`.
#[cfg(feature = "cuda-runtime")]
fn cuda_tile_subband_range_bits(bit_depth: u8, subband_kind: CudaTileSubbandKind) -> u8 {
    let extra_bits: u8 = match subband_kind {
        CudaTileSubbandKind::HighHigh => 2,
        CudaTileSubbandKind::LowHigh | CudaTileSubbandKind::HighLow => 1,
        CudaTileSubbandKind::LowLow => 0,
    };
    bit_depth.saturating_add(extra_bits)
}
3083
/// Interleaves per-component resolution packets into resolution-major order.
///
/// The input holds one list of resolutions per component. The output visits
/// resolution 0 of every component (in component order), then resolution 1 of
/// every component, and so on.
///
/// # Errors
///
/// Fails when the number of component lists differs from `num_components`, or
/// when the component lists do not all have the same number of resolutions.
#[cfg(feature = "cuda-runtime")]
fn cuda_order_component_resolution_packets(
    component_resolution_packets: Vec<Vec<CudaEncodedHtj2kResolution>>,
    num_components: u8,
) -> core::result::Result<Vec<CudaEncodedHtj2kResolution>, &'static str> {
    let component_total = component_resolution_packets.len();
    if component_total != usize::from(num_components) {
        return Err("CUDA HTJ2K tile component packet count mismatch");
    }

    // Every component must contribute exactly as many resolutions as the first one.
    let per_component = component_resolution_packets
        .first()
        .map_or(0usize, Vec::len);
    let lengths_agree = component_resolution_packets
        .iter()
        .all(|packets| packets.len() == per_component);
    if !lengths_agree {
        return Err("CUDA HTJ2K tile component resolution count mismatch");
    }

    let mut sources: Vec<_> = component_resolution_packets
        .into_iter()
        .map(Vec::into_iter)
        .collect();
    let mut ordered = Vec::with_capacity(per_component.saturating_mul(component_total));
    for _ in 0..per_component {
        for source in &mut sources {
            let packet = source
                .next()
                .ok_or("CUDA HTJ2K tile component resolution count mismatch")?;
            ordered.push(packet);
        }
    }

    Ok(ordered)
}
3120
/// Builds one HTJ2K encode job per code block of a `width` x `height` coefficient region.
///
/// Code blocks are emitted in row-major order on a `code_block_width` x
/// `code_block_height` grid; blocks on the right and bottom edges are clipped to
/// the region. Each job addresses its first coefficient at `y0 * width + x0`, uses
/// `width` as the row stride, and requests a single coding pass.
///
/// An empty region (`width == 0` or `height == 0`) yields an empty job list.
///
/// # Errors
///
/// Fails when either code-block dimension is zero, when the block count or an
/// offset computation overflows, or when a coefficient offset does not fit in `u32`.
#[cfg(feature = "cuda-runtime")]
fn cuda_ht_region_jobs(
    width: u32,
    height: u32,
    code_block_width: u32,
    code_block_height: u32,
    total_bitplanes: u8,
) -> core::result::Result<Vec<CudaHtj2kEncodeCodeBlockRegionJob>, &'static str> {
    if code_block_width == 0 || code_block_height == 0 {
        return Err("CUDA HTJ2K encode job has invalid code-block dimensions");
    }
    if width == 0 || height == 0 {
        return Ok(Vec::new());
    }

    let num_cbs_x = width.div_ceil(code_block_width);
    let num_cbs_y = height.div_ceil(code_block_height);
    let count = (num_cbs_x as usize)
        .checked_mul(num_cbs_y as usize)
        .ok_or("CUDA HTJ2K code-block count overflow")?;
    let mut cuda_jobs = Vec::with_capacity(count);
    for cby in 0..num_cbs_y {
        // Everything that depends only on the block row is computed once per row.
        let y0 = cby
            .checked_mul(code_block_height)
            .ok_or("CUDA HTJ2K code-block y offset overflow")?;
        // `y0 < height` because `cby < ceil(height / code_block_height)`, so the
        // subtraction cannot underflow. Clipping this way avoids the `u32` overflow
        // that `y0 + code_block_height` could hit for regions near `u32::MAX`.
        let block_height = code_block_height.min(height - y0);
        let row_offset = (y0 as usize)
            .checked_mul(width as usize)
            .ok_or("CUDA HTJ2K code-block offset overflow")?;
        for cbx in 0..num_cbs_x {
            let x0 = cbx
                .checked_mul(code_block_width)
                .ok_or("CUDA HTJ2K code-block x offset overflow")?;
            // Same reasoning as for `block_height`: `x0 < width` always holds here.
            let block_width = code_block_width.min(width - x0);
            let offset = row_offset
                .checked_add(x0 as usize)
                .ok_or("CUDA HTJ2K code-block offset overflow")?;
            cuda_jobs.push(CudaHtj2kEncodeCodeBlockRegionJob {
                coefficient_offset: u32::try_from(offset)
                    .map_err(|_| "CUDA HTJ2K code-block offset exceeds u32")?,
                coefficient_stride: width,
                width: block_width,
                height: block_height,
                total_bitplanes,
                target_coding_passes: 1,
            });
        }
    }
    Ok(cuda_jobs)
}
3169
/// Packetizes the encoded resolutions of one tile on the CUDA device.
///
/// The encoded code blocks are wrapped as single-layer, high-throughput
/// packetization inputs (never previously included, `l_block` of 3), flattened into
/// a CUDA packetization plan, and handed to the context's packetization call.
///
/// Returns the packetized bytes, the kernel dispatch count reported by the
/// execution, and the reported `packetize_us` timing.
///
/// # Errors
///
/// Fails when descriptor construction fails, when the resolution or code-block
/// count does not fit in `u32`, when the plan cannot be flattened, or when the
/// CUDA packetization call reports an error.
#[cfg(feature = "cuda-runtime")]
fn cuda_packetize_tile_body(
    context: &CudaContext,
    job: J2kHtj2kTileEncodeJob<'_>,
    resolution_packets: &[CudaEncodedHtj2kResolution],
    code_block_count: usize,
) -> core::result::Result<(Vec<u8>, usize, u128), &'static str> {
    let packet_descriptors =
        cuda_tile_packet_descriptors(resolution_packets.len(), 1, job.num_components)?;

    // Borrow the encoded payloads into the packetization input tree.
    let mut resolutions: Vec<J2kPacketizationResolution<'_>> =
        Vec::with_capacity(resolution_packets.len());
    for resolution in resolution_packets {
        let mut subbands = Vec::new();
        for subband in resolution.subbands.iter() {
            let code_blocks = subband
                .code_blocks
                .iter()
                .map(|block| J2kPacketizationCodeBlock {
                    data: block.data.as_slice(),
                    ht_cleanup_length: block.cleanup_length,
                    ht_refinement_length: block.refinement_length,
                    num_coding_passes: block.num_coding_passes,
                    num_zero_bitplanes: block.num_zero_bitplanes,
                    previously_included: false,
                    l_block: 3,
                    block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
                })
                .collect();
            subbands.push(J2kPacketizationSubband {
                code_blocks,
                num_cbs_x: subband.num_cbs_x,
                num_cbs_y: subband.num_cbs_y,
            });
        }
        resolutions.push(J2kPacketizationResolution { subbands });
    }

    let resolution_count = u32::try_from(resolutions.len())
        .map_err(|_| "CUDA HTJ2K tile resolution count exceeds u32")?;
    let block_total = u32::try_from(code_block_count)
        .map_err(|_| "CUDA HTJ2K tile code-block count exceeds u32")?;
    let plan = flatten_cuda_htj2k_packetization_job(J2kPacketizationEncodeJob {
        resolution_count,
        num_layers: 1,
        num_components: job.num_components,
        code_block_count: block_total,
        progression_order: job.progression_order,
        packet_descriptors: &packet_descriptors,
        resolutions: &resolutions,
    })?;

    // Convert each section of the plan into the layout the runtime call expects.
    let packets = cuda_packetization_packets(&plan);
    let subbands = cuda_packetization_subbands(&plan);
    let blocks = cuda_packetization_blocks(&plan);
    let tag_states = cuda_packetization_tag_states(&plan);
    let tag_nodes = cuda_packetization_tag_nodes(&plan);

    let packetized = context
        .packetize_htj2k_cleanup_packets_with_tag_state(
            &plan.payload,
            &packets,
            &subbands,
            &blocks,
            &tag_states,
            &tag_nodes,
        )
        .map_err(|_| "CUDA HTJ2K tile packetization failed")?;

    let body = packetized.data().to_vec();
    let dispatches = packetized.execution().kernel_dispatches();
    let packetize_us = packetized.stage_timings().packetize_us;
    Ok((body, dispatches, packetize_us))
}
3243
/// Builds the packet descriptors for a single-layer tile.
///
/// Packet `i` is assigned `packet_index == state_index == i`, layer 0, precinct 0,
/// resolution `i / components` and component `i % components`, where `components`
/// is `num_components` clamped to at least 1.
///
/// # Errors
///
/// Fails when `num_layers` is not 1 or when an index does not fit its target
/// integer type.
#[cfg(feature = "cuda-runtime")]
fn cuda_tile_packet_descriptors(
    packet_count: usize,
    num_layers: u8,
    num_components: u8,
) -> core::result::Result<Vec<J2kPacketizationPacketDescriptor>, &'static str> {
    if num_layers != 1 {
        return Err("CUDA HTJ2K tile encode currently prepares one packet layer");
    }

    // Clamp to 1 so the division and remainder below are always defined.
    let component_count = usize::from(num_components).max(1);
    let mut descriptors = Vec::new();
    for packet_index in 0..packet_count {
        let index = u32::try_from(packet_index)
            .map_err(|_| "CUDA HTJ2K tile packet index exceeds u32")?;
        let state_index = u32::try_from(packet_index)
            .map_err(|_| "CUDA HTJ2K tile packet state index exceeds u32")?;
        let resolution = u32::try_from(packet_index / component_count)
            .map_err(|_| "CUDA HTJ2K tile packet resolution exceeds u32")?;
        let component = u8::try_from(packet_index % component_count)
            .map_err(|_| "CUDA HTJ2K tile packet component exceeds u8")?;
        descriptors.push(J2kPacketizationPacketDescriptor {
            packet_index: index,
            state_index,
            layer: 0,
            resolution,
            component,
            precinct: 0,
        });
    }
    Ok(descriptors)
}
3271
/// Result of quantizing and HT-encoding one subband on the CUDA device.
#[cfg(feature = "cuda-runtime")]
struct CudaEncodedHtSubband {
    /// Kernel dispatch count reported by the quantize execution.
    quantize_dispatches: usize,
    /// Encoded code blocks returned by the resident code-block encode call.
    encode: j2k_cuda_runtime::CudaHtj2kEncodedCodeBlocks,
    /// Stage timings; `cuda_encode_ht_subband` fills `quantize_us` and `ht_encode_us`
    /// and leaves the remaining fields at their defaults.
    timings: CudaEncodeStageTimings,
}
3278
/// Quantizes and HT-encodes one subband on the CUDA device.
///
/// The coefficients are uploaded, quantized in place on the device (timed through
/// `time_cuda_stage`), split into per-code-block region jobs, and encoded with the
/// supplied encode resources.
///
/// # Errors
///
/// Fails when `width * height` overflows or does not match the coefficient count,
/// when a code-block dimension is zero, when region jobs cannot be built, or when
/// the upload, quantize, or encode call reports an error.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_ht_subband(
    context: &CudaContext,
    encode_resources: &CudaHtj2kEncodeResources,
    job: J2kHtSubbandEncodeJob<'_>,
    collect_profile: bool,
) -> core::result::Result<CudaEncodedHtSubband, &'static str> {
    let coefficients = job.coefficients;

    // Validate the job shape before touching the device.
    let sample_count = (job.width as usize)
        .checked_mul(job.height as usize)
        .ok_or("CUDA HTJ2K subband encode dimensions are too large")?;
    if coefficients.len() != sample_count {
        return Err("CUDA HTJ2K subband encode job has invalid coefficient length");
    }
    if job.code_block_width == 0 || job.code_block_height == 0 {
        return Err("CUDA HTJ2K subband encode job has invalid code-block dimensions");
    }

    let device_samples = context
        .upload_f32_pinned(coefficients)
        .map_err(|_| "CUDA HTJ2K subband upload failed")?;

    let run_quantize = || {
        context.j2k_quantize_subband_resident(
            &device_samples,
            coefficients.len(),
            CudaJ2kQuantizeJob {
                step_exponent: job.step_exponent,
                step_mantissa: job.step_mantissa,
                range_bits: job.range_bits,
                reversible: job.reversible,
            },
        )
    };
    let (quantized, quantize_us) = time_cuda_stage(
        "j2k.htj2k.encode.subband.quantize",
        context,
        collect_profile,
        run_quantize,
    )
    .map_err(|_| "CUDA quantize subband encode kernel failed")?;

    let region_jobs = cuda_ht_subband_region_jobs(job)?;
    let encoded = context
        .encode_htj2k_codeblock_regions_resident_with_resources(
            quantized.buffer(),
            quantized.coefficient_count(),
            &region_jobs,
            encode_resources,
        )
        .map_err(|_| "CUDA HTJ2K resident subband encode kernel failed")?;

    let quantize_dispatches = quantized.execution().kernel_dispatches();
    let timings = CudaEncodeStageTimings {
        quantize_us,
        ht_encode_us: encoded.stage_timings().ht_encode_us,
        ..CudaEncodeStageTimings::default()
    };
    Ok(CudaEncodedHtSubband {
        quantize_dispatches,
        timings,
        encode: encoded,
    })
}
3337
/// Builds the per-code-block region jobs for a whole-subband encode job.
///
/// Forwards the subband dimensions, code-block dimensions and bitplane count of
/// `job` to `cuda_ht_region_jobs` and returns its result unchanged.
#[cfg(feature = "cuda-runtime")]
fn cuda_ht_subband_region_jobs(
    job: J2kHtSubbandEncodeJob<'_>,
) -> core::result::Result<Vec<CudaHtj2kEncodeCodeBlockRegionJob>, &'static str> {
    let J2kHtSubbandEncodeJob {
        width,
        height,
        code_block_width,
        code_block_height,
        total_bitplanes,
        ..
    } = job;
    cuda_ht_region_jobs(
        width,
        height,
        code_block_width,
        code_block_height,
        total_bitplanes,
    )
}
3350
3351fn ht_subband_code_block_count(
3352 job: J2kHtSubbandEncodeJob<'_>,
3353) -> core::result::Result<usize, &'static str> {
3354 if job.code_block_width == 0 || job.code_block_height == 0 {
3355 return Err("CUDA HTJ2K subband encode job has invalid code-block dimensions");
3356 }
3357 let num_cbs_x = job.width.div_ceil(job.code_block_width);
3358 let num_cbs_y = job.height.div_ceil(job.code_block_height);
3359 (num_cbs_x as usize)
3360 .checked_mul(num_cbs_y as usize)
3361 .ok_or("CUDA HTJ2K subband code-block count overflow")
3362}
3363
/// Copies one CUDA-encoded HT code block into the adapter's owned representation.
///
/// The payload bytes are cloned into a new `Vec`; the lengths, pass count and
/// zero-bitplane count are taken from the runtime block's accessors.
#[cfg(feature = "cuda-runtime")]
fn encoded_ht_code_block_from_cuda(
    encoded: &j2k_cuda_runtime::CudaHtj2kEncodedCodeBlock,
) -> EncodedHtJ2kCodeBlock {
    let data = encoded.data().to_vec();
    let cleanup_length = encoded.cleanup_length();
    let refinement_length = encoded.refinement_length();
    let num_coding_passes = encoded.num_coding_passes();
    let num_zero_bitplanes = encoded.num_zero_bitplanes();
    EncodedHtJ2kCodeBlock {
        data,
        cleanup_length,
        refinement_length,
        num_coding_passes,
        num_zero_bitplanes,
    }
}
3376
/// Converts every CUDA-encoded code block of `encoded`, preserving their order.
#[cfg(feature = "cuda-runtime")]
fn encoded_ht_code_blocks_from_cuda(
    encoded: &j2k_cuda_runtime::CudaHtj2kEncodedCodeBlocks,
) -> Vec<EncodedHtJ2kCodeBlock> {
    let source = encoded.code_blocks().iter();
    // Reserve from the iterator's lower bound, as `collect` would.
    let mut converted = Vec::with_capacity(source.size_hint().0);
    for block in source {
        converted.push(encoded_ht_code_block_from_cuda(block));
    }
    converted
}
3387
/// Bundles the HT VLC and UVLC encode tables provided by `j2k_native` for the CUDA
/// runtime.
#[cfg(feature = "cuda-runtime")]
fn cuda_htj2k_encode_tables() -> CudaHtj2kEncodeTables<'static> {
    let vlc_table0 = j2k_native::ht_vlc_encode_table0();
    let vlc_table1 = j2k_native::ht_vlc_encode_table1();
    let uvlc_table = j2k_native::ht_uvlc_encode_table_bytes();
    CudaHtj2kEncodeTables {
        vlc_table0,
        vlc_table1,
        uvlc_table,
    }
}
3396
/// Splits a CUDA 5/3 DWT result into the LL band and per-level HL/LH/HH subbands.
///
/// The transformed plane is read with a row stride equal to the width of the first
/// reported level, or the LL width when no level was reported. The LL band is the
/// top-left `ll_width` x `ll_height` rectangle. For each level, HL is taken to the
/// right of the low band, LH below it, and HH diagonally. The level list is
/// returned in the reverse of the order reported by the runtime.
///
/// # Errors
///
/// Propagates failures from `extract_cuda_subband`.
#[cfg(feature = "cuda-runtime")]
fn cuda_dwt53_output_to_j2k(
    output: &CudaDwt53Output,
) -> core::result::Result<J2kForwardDwt53Output, &'static str> {
    let (ll_width, ll_height) = output.ll_dimensions();
    let transformed = output.transformed();
    let shapes = output.levels();
    let full_width = shapes.first().map_or(ll_width, |level| level.width) as usize;

    // The LL band is just the subband anchored at the origin.
    let ll = extract_cuda_subband(transformed, full_width, 0, 0, ll_width, ll_height)
        .map_err(|_| "CUDA DWT LL row offset overflow")?;

    let mut levels = Vec::with_capacity(shapes.len());
    for shape in shapes {
        let hl = extract_cuda_subband(
            transformed,
            full_width,
            shape.low_width,
            0,
            shape.high_width,
            shape.low_height,
        )?;
        let lh = extract_cuda_subband(
            transformed,
            full_width,
            0,
            shape.low_height,
            shape.low_width,
            shape.high_height,
        )?;
        let hh = extract_cuda_subband(
            transformed,
            full_width,
            shape.low_width,
            shape.low_height,
            shape.high_width,
            shape.high_height,
        )?;
        levels.push(J2kForwardDwt53Level {
            hl,
            lh,
            hh,
            width: shape.width,
            height: shape.height,
            low_width: shape.low_width,
            low_height: shape.low_height,
            high_width: shape.high_width,
            high_height: shape.high_height,
        });
    }
    levels.reverse();

    Ok(J2kForwardDwt53Output {
        ll,
        ll_width,
        ll_height,
        levels,
    })
}
3459
/// Public, doc-hidden wrapper that forwards to the private
/// `cuda_dwt53_output_to_j2k` and returns its result unchanged.
#[cfg(feature = "cuda-runtime")]
#[doc(hidden)]
pub fn cuda_dwt53_output_to_j2k_for_test(
    output: &CudaDwt53Output,
) -> core::result::Result<J2kForwardDwt53Output, &'static str> {
    cuda_dwt53_output_to_j2k(output)
}
3475
/// Splits a CUDA 9/7 DWT result into the LL band and per-level HL/LH/HH subbands.
///
/// The transformed plane is read with a row stride equal to the width of the first
/// reported level, or the LL width when no level was reported. The LL band is the
/// top-left `ll_width` x `ll_height` rectangle. For each level, HL is taken to the
/// right of the low band, LH below it, and HH diagonally. The level list is
/// returned in the reverse of the order reported by the runtime.
///
/// # Errors
///
/// Propagates failures from `extract_cuda_subband`.
#[cfg(feature = "cuda-runtime")]
fn cuda_dwt97_output_to_j2k(
    output: &CudaDwt97Output,
) -> core::result::Result<J2kForwardDwt97Output, &'static str> {
    let (ll_width, ll_height) = output.ll_dimensions();
    let transformed = output.transformed();
    let shapes = output.levels();
    let full_width = shapes.first().map_or(ll_width, |level| level.width) as usize;

    // The LL band is just the subband anchored at the origin.
    let ll = extract_cuda_subband(transformed, full_width, 0, 0, ll_width, ll_height)
        .map_err(|_| "CUDA DWT LL row offset overflow")?;

    let mut levels = Vec::with_capacity(shapes.len());
    for shape in shapes {
        let hl = extract_cuda_subband(
            transformed,
            full_width,
            shape.low_width,
            0,
            shape.high_width,
            shape.low_height,
        )?;
        let lh = extract_cuda_subband(
            transformed,
            full_width,
            0,
            shape.low_height,
            shape.low_width,
            shape.high_height,
        )?;
        let hh = extract_cuda_subband(
            transformed,
            full_width,
            shape.low_width,
            shape.low_height,
            shape.high_width,
            shape.high_height,
        )?;
        levels.push(J2kForwardDwt97Level {
            hl,
            lh,
            hh,
            width: shape.width,
            height: shape.height,
            low_width: shape.low_width,
            low_height: shape.low_height,
            high_width: shape.high_width,
            high_height: shape.high_height,
        });
    }
    levels.reverse();

    Ok(J2kForwardDwt97Output {
        ll,
        ll_width,
        ll_height,
        levels,
    })
}
3538
3539#[cfg(feature = "cuda-runtime")]
/// Copies a `width` x `height` rectangle out of a row-major coefficient plane.
///
/// `transformed` is read with a row stride of `full_width`; the rectangle's
/// top-left corner sits at column `x0`, row `y0`. Rows are appended to the result
/// from top to bottom.
///
/// # Errors
///
/// Fails when the output size or a row offset overflows `usize`, or when a
/// requested row lies outside `transformed`. Out-of-range rows are reported as an
/// error rather than panicking.
fn extract_cuda_subband(
    transformed: &[f32],
    full_width: usize,
    x0: u32,
    y0: u32,
    width: u32,
    height: u32,
) -> core::result::Result<Vec<f32>, &'static str> {
    // `u32 -> usize` is lossless on the 32- and 64-bit targets this code builds for.
    let (x0, y0) = (x0 as usize, y0 as usize);
    let (width, height) = (width as usize, height as usize);

    let capacity = width
        .checked_mul(height)
        .ok_or("CUDA DWT subband size overflow")?;
    // Never reserve more than the source holds, so a bogus shape fails on the
    // bounds check below instead of attempting a huge allocation first.
    let mut out = Vec::with_capacity(capacity.min(transformed.len()));
    for y in 0..height {
        let row_start = y0
            .checked_add(y)
            .and_then(|row| row.checked_mul(full_width))
            .and_then(|row| row.checked_add(x0))
            .ok_or("CUDA DWT subband offset overflow")?;
        let row_end = row_start
            .checked_add(width)
            .ok_or("CUDA DWT subband offset overflow")?;
        let row = transformed
            .get(row_start..row_end)
            .ok_or("CUDA DWT subband exceeds transformed buffer")?;
        out.extend_from_slice(row);
    }
    Ok(out)
}
3559
3560#[cfg(test)]
3561mod tests {
3562 #[cfg(feature = "cuda-runtime")]
3563 use super::{cuda_htj2k_encode_tables, cuda_runtime_required};
3564 use super::{
3565 encode_j2k_lossless_with_cuda, encode_j2k_lossless_with_cuda_and_profile,
3566 flatten_cuda_htj2k_packetization_job, CudaEncodeStageAccelerator,
3567 CudaHtj2kPacketizationPlanTagNodeState,
3568 };
3569 use j2k::adapter::encode_stage::NativeEncodeStageAdapter;
3570 #[cfg(feature = "cuda-runtime")]
3571 use j2k::adapter::encode_stage::{J2kDeinterleaveToF32Job, J2kHtCodeBlockEncodeJob};
3572 use j2k::adapter::encode_stage::{
3573 J2kEncodeStageAccelerator, J2kHtSubbandEncodeJob, J2kPacketizationBlockCodingMode,
3574 J2kPacketizationCodeBlock, J2kPacketizationEncodeJob, J2kPacketizationPacketDescriptor,
3575 J2kPacketizationProgressionOrder, J2kPacketizationResolution, J2kPacketizationSubband,
3576 J2kQuantizeSubbandJob,
3577 };
3578 #[cfg(feature = "cuda-runtime")]
3579 use j2k::{encode_j2k_lossy_with_accelerator, J2kLossyEncodeOptions, J2kLossySamples};
3580 use j2k::{
3581 EncodeBackendPreference, J2kBlockCodingMode, J2kEncodeValidation, J2kLosslessEncodeOptions,
3582 J2kLosslessSamples,
3583 };
3584 #[cfg(feature = "cuda-runtime")]
3585 use j2k_core::BackendKind;
3586 use j2k_core::CodecError;
3587 #[cfg(feature = "cuda-runtime")]
3588 use j2k_cuda_runtime::{
3589 CudaContext, CudaHtj2kEncodeCodeBlockJob, CudaHtj2kEncodeCodeBlockRegionJob,
3590 CudaJ2kQuantizeJob,
3591 };
3592 use j2k_native::{
3593 encode_with_accelerator as encode_with_native_accelerator, DecodeSettings, EncodeOptions,
3594 Image,
3595 };
3596
3597 fn assert_strict_cuda_classic_tier1_error<E: CodecError + ?Sized>(err: &E, context: &str) {
3598 assert!(err.is_unsupported());
3599 let message = err.to_string();
3600 assert!(
3601 message.contains("tier1_code_block") || message.contains("deinterleave"),
3602 "expected {context} error to mention either the missing classic tier-1 stage or unavailable CUDA deinterleave, got {message}"
3603 );
3604 }
3605
3606 #[allow(clippy::too_many_arguments)]
3607 fn encode_with_cuda_test_accelerator(
3608 pixels: &[u8],
3609 width: u32,
3610 height: u32,
3611 components: u8,
3612 bit_depth: u8,
3613 signed: bool,
3614 options: &EncodeOptions,
3615 accelerator: &mut CudaEncodeStageAccelerator,
3616 ) -> core::result::Result<Vec<u8>, &'static str> {
3617 let mut bridge = NativeEncodeStageAdapter::new(accelerator);
3618 encode_with_native_accelerator(
3619 pixels,
3620 width,
3621 height,
3622 components,
3623 bit_depth,
3624 signed,
3625 options,
3626 &mut bridge,
3627 )
3628 }
3629
3630 #[test]
3631 fn cuda_lossless_encode_auto_errors_for_unsupported_classic_tier1() {
3632 let pixels: Vec<u8> = (0u32..128 * 128)
3633 .map(|value| u8::try_from((value * 17 + 5) & 0xFF).expect("masked value fits in u8"))
3634 .collect();
3635 let samples =
3636 J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
3637 let options = J2kLosslessEncodeOptions::default()
3638 .with_backend(EncodeBackendPreference::Auto)
3639 .with_block_coding_mode(J2kBlockCodingMode::Classic)
3640 .with_max_decomposition_levels(Some(0))
3641 .with_validation(J2kEncodeValidation::CpuRoundTrip);
3642
3643 let err = encode_j2k_lossless_with_cuda(samples, &options)
3644 .expect_err("CUDA-named encode must not silently return CPU fallback");
3645
3646 assert_strict_cuda_classic_tier1_error(&err, "strict CUDA encode");
3647 }
3648
3649 #[test]
3650 fn cuda_lossless_encode_profile_auto_errors_for_unsupported_classic_tier1() {
3651 let pixels: Vec<u8> = (0u32..128 * 128)
3652 .map(|value| u8::try_from((value * 19 + 7) & 0xFF).expect("masked value fits in u8"))
3653 .collect();
3654 let samples =
3655 J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
3656 let options = J2kLosslessEncodeOptions::default()
3657 .with_backend(EncodeBackendPreference::Auto)
3658 .with_block_coding_mode(J2kBlockCodingMode::Classic)
3659 .with_max_decomposition_levels(Some(0))
3660 .with_validation(J2kEncodeValidation::External);
3661
3662 let err = encode_j2k_lossless_with_cuda_and_profile(samples, &options)
3663 .expect_err("profiled CUDA encode must not silently return CPU fallback");
3664
3665 assert_strict_cuda_classic_tier1_error(&err, "profiled strict CUDA encode");
3666 }
3667
3668 #[test]
3669 fn cuda_lossless_encode_require_device_errors_for_unsupported_classic_tier1() {
3670 let pixels: Vec<u8> = (0u32..128 * 128)
3671 .map(|value| u8::try_from((value * 29 + 11) & 0xFF).expect("masked value fits in u8"))
3672 .collect();
3673 let samples =
3674 J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
3675 let options = J2kLosslessEncodeOptions::default()
3676 .with_backend(EncodeBackendPreference::RequireDevice)
3677 .with_block_coding_mode(J2kBlockCodingMode::Classic)
3678 .with_max_decomposition_levels(Some(0))
3679 .with_validation(J2kEncodeValidation::External);
3680
3681 let err = encode_j2k_lossless_with_cuda(samples, &options)
3682 .expect_err("strict CUDA encode must not silently fall back to CPU");
3683
3684 assert_strict_cuda_classic_tier1_error(&err, "strict CUDA encode");
3685 }
3686
3687 #[test]
3688 fn cuda_packetization_flatten_accepts_cleanup_only_single_block_packet() {
3689 let payload = [0x12, 0x34, 0x56, 0x78];
3690 let code_block = J2kPacketizationCodeBlock {
3691 data: &payload,
3692 ht_cleanup_length: 0,
3693 ht_refinement_length: 0,
3694 num_coding_passes: 1,
3695 num_zero_bitplanes: 2,
3696 previously_included: false,
3697 l_block: 3,
3698 block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3699 };
3700 let subband = J2kPacketizationSubband {
3701 code_blocks: vec![code_block],
3702 num_cbs_x: 1,
3703 num_cbs_y: 1,
3704 };
3705 let resolution = J2kPacketizationResolution {
3706 subbands: vec![subband],
3707 };
3708 let descriptor = J2kPacketizationPacketDescriptor {
3709 packet_index: 0,
3710 state_index: 0,
3711 layer: 0,
3712 resolution: 0,
3713 component: 0,
3714 precinct: 0,
3715 };
3716 let job = J2kPacketizationEncodeJob {
3717 resolution_count: 1,
3718 num_layers: 1,
3719 num_components: 1,
3720 code_block_count: 1,
3721 progression_order: J2kPacketizationProgressionOrder::Lrcp,
3722 packet_descriptors: &[descriptor],
3723 resolutions: &[resolution],
3724 };
3725
3726 let plan = flatten_cuda_htj2k_packetization_job(job).expect("supported CUDA packetization");
3727
3728 assert_eq!(plan.payload, payload);
3729 assert_eq!(plan.packets.len(), 1);
3730 assert_eq!(plan.subbands.len(), 1);
3731 assert_eq!(plan.blocks.len(), 1);
3732 assert_eq!(plan.packets[0].block_start, 0);
3733 assert_eq!(plan.packets[0].block_count, 1);
3734 assert_eq!(plan.packets[0].subband_start, 0);
3735 assert_eq!(plan.packets[0].subband_count, 1);
3736 assert_eq!(plan.subbands[0].block_start, 0);
3737 assert_eq!(plan.subbands[0].block_count, 1);
3738 let payload_len = u32::try_from(payload.len()).expect("test payload length fits in u32");
3739 assert!(plan.packets[0].output_capacity >= payload_len + 256);
3740 assert_eq!(plan.blocks[0].data_offset, 0);
3741 assert_eq!(plan.blocks[0].data_len, payload_len);
3742 assert_eq!(plan.blocks[0].num_coding_passes, 1);
3743 assert_eq!(plan.blocks[0].num_zero_bitplanes, 2);
3744 }
3745
3746 #[test]
3747 fn cuda_packetization_flatten_accepts_cleanup_only_multi_block_packet() {
3748 let payloads = vec![
3749 vec![0x10, 0x11, 0x12],
3750 vec![0x20, 0x21],
3751 vec![0x30, 0x31, 0x32, 0x33],
3752 vec![0x40],
3753 ];
3754 let code_blocks = payloads
3755 .iter()
3756 .enumerate()
3757 .map(|(idx, payload)| J2kPacketizationCodeBlock {
3758 data: payload.as_slice(),
3759 ht_cleanup_length: 0,
3760 ht_refinement_length: 0,
3761 num_coding_passes: 1,
3762 num_zero_bitplanes: u8::try_from(idx + 1).expect("test zbp fits in u8"),
3763 previously_included: false,
3764 l_block: 3,
3765 block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3766 })
3767 .collect();
3768 let subband = J2kPacketizationSubband {
3769 code_blocks,
3770 num_cbs_x: 2,
3771 num_cbs_y: 2,
3772 };
3773 let resolution = J2kPacketizationResolution {
3774 subbands: vec![subband],
3775 };
3776 let descriptor = J2kPacketizationPacketDescriptor {
3777 packet_index: 0,
3778 state_index: 0,
3779 layer: 0,
3780 resolution: 0,
3781 component: 0,
3782 precinct: 0,
3783 };
3784 let job = J2kPacketizationEncodeJob {
3785 resolution_count: 1,
3786 num_layers: 1,
3787 num_components: 1,
3788 code_block_count: 4,
3789 progression_order: J2kPacketizationProgressionOrder::Lrcp,
3790 packet_descriptors: &[descriptor],
3791 resolutions: &[resolution],
3792 };
3793
3794 let plan =
3795 flatten_cuda_htj2k_packetization_job(job).expect("multi-block CUDA packetization");
3796
3797 assert_eq!(plan.packets.len(), 1);
3798 assert_eq!(plan.subbands.len(), 1);
3799 assert_eq!(plan.blocks.len(), 4);
3800 assert_eq!(plan.packets[0].block_start, 0);
3801 assert_eq!(plan.packets[0].block_count, 4);
3802 assert_eq!(plan.packets[0].subband_start, 0);
3803 assert_eq!(plan.packets[0].subband_count, 1);
3804 assert_eq!(plan.subbands[0].block_start, 0);
3805 assert_eq!(plan.subbands[0].block_count, 4);
3806 assert_eq!(plan.subbands[0].num_cbs_x, 2);
3807 assert_eq!(plan.subbands[0].num_cbs_y, 2);
3808 assert_eq!(
3809 plan.payload,
3810 payloads.into_iter().flatten().collect::<Vec<_>>()
3811 );
3812 assert_eq!(plan.blocks[2].num_zero_bitplanes, 3);
3813 }
3814
3815 #[test]
3816 fn cuda_packetization_flatten_accepts_ht_refinement_pass_packet() {
3817 let payload = [0x12, 0x34, 0x56, 0x78, 0x9a];
3818 let code_block = J2kPacketizationCodeBlock {
3819 data: &payload,
3820 ht_cleanup_length: 3,
3821 ht_refinement_length: 2,
3822 num_coding_passes: 3,
3823 num_zero_bitplanes: 2,
3824 previously_included: false,
3825 l_block: 3,
3826 block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3827 };
3828 let subband = J2kPacketizationSubband {
3829 code_blocks: vec![code_block],
3830 num_cbs_x: 1,
3831 num_cbs_y: 1,
3832 };
3833 let resolution = J2kPacketizationResolution {
3834 subbands: vec![subband],
3835 };
3836 let descriptor = J2kPacketizationPacketDescriptor {
3837 packet_index: 0,
3838 state_index: 0,
3839 layer: 0,
3840 resolution: 0,
3841 component: 0,
3842 precinct: 0,
3843 };
3844 let job = J2kPacketizationEncodeJob {
3845 resolution_count: 1,
3846 num_layers: 1,
3847 num_components: 1,
3848 code_block_count: 1,
3849 progression_order: J2kPacketizationProgressionOrder::Lrcp,
3850 packet_descriptors: &[descriptor],
3851 resolutions: &[resolution],
3852 };
3853
3854 let plan = flatten_cuda_htj2k_packetization_job(job).expect("HT refinement packetization");
3855
3856 assert_eq!(plan.payload, payload);
3857 assert_eq!(plan.blocks.len(), 1);
3858 assert_eq!(plan.blocks[0].num_coding_passes, 3);
3859 assert_eq!(
3860 plan.blocks[0].data_len,
3861 u32::try_from(payload.len()).expect("test payload length fits in u32")
3862 );
3863 }
3864
3865 #[test]
3866 fn cuda_packetization_rejects_overflowing_ht_refinement_lengths() {
3867 let payload = [0x12];
3868 let code_block = J2kPacketizationCodeBlock {
3869 data: &payload,
3870 ht_cleanup_length: u32::MAX,
3871 ht_refinement_length: 1,
3872 num_coding_passes: 3,
3873 num_zero_bitplanes: 2,
3874 previously_included: false,
3875 l_block: 3,
3876 block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3877 };
3878
3879 let err = super::cuda_ht_segment_lengths(&code_block)
3880 .expect_err("overflowing CUDA HT segment lengths rejected");
3881
3882 assert_eq!(err, "multi-pass HTJ2K packet contribution length overflow");
3883 }
3884
3885 #[test]
3886 fn cuda_packetization_flatten_rejects_out_of_range_ht_pass_count() {
3887 let payload = [0u8; 1];
3888 let code_block = J2kPacketizationCodeBlock {
3889 data: &payload,
3890 ht_cleanup_length: 0,
3891 ht_refinement_length: 0,
3892 num_coding_passes: 165,
3893 num_zero_bitplanes: 2,
3894 previously_included: false,
3895 l_block: 3,
3896 block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3897 };
3898 let subband = J2kPacketizationSubband {
3899 code_blocks: vec![code_block],
3900 num_cbs_x: 1,
3901 num_cbs_y: 1,
3902 };
3903 let resolution = J2kPacketizationResolution {
3904 subbands: vec![subband],
3905 };
3906 let descriptor = J2kPacketizationPacketDescriptor {
3907 packet_index: 0,
3908 state_index: 0,
3909 layer: 0,
3910 resolution: 0,
3911 component: 0,
3912 precinct: 0,
3913 };
3914 let job = J2kPacketizationEncodeJob {
3915 resolution_count: 1,
3916 num_layers: 1,
3917 num_components: 1,
3918 code_block_count: 1,
3919 progression_order: J2kPacketizationProgressionOrder::Lrcp,
3920 packet_descriptors: &[descriptor],
3921 resolutions: &[resolution],
3922 };
3923
3924 let err = flatten_cuda_htj2k_packetization_job(job)
3925 .expect_err("invalid HT pass count must be rejected before CUDA launch");
3926
3927 assert_eq!(
3928 err,
3929 "CUDA HTJ2K packetization coding pass count exceeds JPEG 2000 bounds"
3930 );
3931 }
3932
3933 #[test]
3934 fn cuda_packetization_flatten_accepts_previously_included_second_layer_packet() {
3935 let first_payload = [0x11u8; 20];
3936 let second_payload = [0x22u8; 5];
3937 let first_block = J2kPacketizationCodeBlock {
3938 data: &first_payload,
3939 ht_cleanup_length: 0,
3940 ht_refinement_length: 0,
3941 num_coding_passes: 1,
3942 num_zero_bitplanes: 2,
3943 previously_included: false,
3944 l_block: 3,
3945 block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3946 };
3947 let second_block = J2kPacketizationCodeBlock {
3948 data: &second_payload,
3949 ht_cleanup_length: 0,
3950 ht_refinement_length: 0,
3951 num_coding_passes: 1,
3952 num_zero_bitplanes: 2,
3953 previously_included: false,
3954 l_block: 3,
3955 block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3956 };
3957 let first_resolution = J2kPacketizationResolution {
3958 subbands: vec![J2kPacketizationSubband {
3959 code_blocks: vec![first_block],
3960 num_cbs_x: 1,
3961 num_cbs_y: 1,
3962 }],
3963 };
3964 let second_resolution = J2kPacketizationResolution {
3965 subbands: vec![J2kPacketizationSubband {
3966 code_blocks: vec![second_block],
3967 num_cbs_x: 1,
3968 num_cbs_y: 1,
3969 }],
3970 };
3971 let descriptors = [
3972 J2kPacketizationPacketDescriptor {
3973 packet_index: 0,
3974 state_index: 0,
3975 layer: 0,
3976 resolution: 0,
3977 component: 0,
3978 precinct: 0,
3979 },
3980 J2kPacketizationPacketDescriptor {
3981 packet_index: 1,
3982 state_index: 0,
3983 layer: 1,
3984 resolution: 0,
3985 component: 0,
3986 precinct: 0,
3987 },
3988 ];
3989 let resolutions = [first_resolution, second_resolution];
3990 let job = J2kPacketizationEncodeJob {
3991 resolution_count: 2,
3992 num_layers: 2,
3993 num_components: 1,
3994 code_block_count: 2,
3995 progression_order: J2kPacketizationProgressionOrder::Lrcp,
3996 packet_descriptors: &descriptors,
3997 resolutions: &resolutions,
3998 };
3999
4000 let plan =
4001 flatten_cuda_htj2k_packetization_job(job).expect("stateful CUDA packetization plan");
4002
4003 assert_eq!(
4004 plan.payload,
4005 [first_payload.as_slice(), second_payload.as_slice()].concat()
4006 );
4007 assert_eq!(plan.packets.len(), 2);
4008 assert_eq!(plan.blocks.len(), 2);
4009 assert_eq!(plan.packets[0].layer, 0);
4010 assert_eq!(plan.packets[1].layer, 1);
4011 assert_eq!(plan.blocks[0].l_block, 3);
4012 assert_eq!(plan.blocks[0].previously_included, 0);
4013 assert_eq!(plan.blocks[1].previously_included, 1);
4014 assert_eq!(plan.blocks[0].inclusion_layer, 0);
4015 assert_eq!(plan.blocks[1].inclusion_layer, 0);
4016 assert_eq!(
4017 plan.blocks[1].l_block, 5,
4018 "first layer length must update L-block for later packet state"
4019 );
4020 }
4021
/// Flattening must accept a code block that carries no data in the first
/// packet entry and first contributes bytes in the second (layer 1) entry.
///
/// The resulting plan is expected to hold only the late payload, two packets,
/// and two blocks that both report `inclusion_layer == 1` and
/// `previously_included == 0`.
#[test]
fn cuda_packetization_flatten_accepts_deferred_first_inclusion_second_layer_packet() {
    let late_bytes = [0x44u8; 5];

    // First entry: the code block has zero coding passes and no bytes.
    let empty_entry = J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![J2kPacketizationCodeBlock {
                data: &[],
                ht_cleanup_length: 0,
                ht_refinement_length: 0,
                num_coding_passes: 0,
                num_zero_bitplanes: 2,
                previously_included: false,
                l_block: 3,
                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
            }],
            num_cbs_x: 1,
            num_cbs_y: 1,
        }],
    };

    // Second entry: the same code-block position now carries one coding pass.
    let populated_entry = J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![J2kPacketizationCodeBlock {
                data: &late_bytes,
                ht_cleanup_length: 0,
                ht_refinement_length: 0,
                num_coding_passes: 1,
                num_zero_bitplanes: 2,
                previously_included: false,
                l_block: 3,
                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
            }],
            num_cbs_x: 1,
            num_cbs_y: 1,
        }],
    };
    let resolutions = [empty_entry, populated_entry];

    // Both packets share `state_index` 0 and differ only in index and layer.
    let layer0_packet = J2kPacketizationPacketDescriptor {
        packet_index: 0,
        state_index: 0,
        layer: 0,
        resolution: 0,
        component: 0,
        precinct: 0,
    };
    let layer1_packet = J2kPacketizationPacketDescriptor {
        packet_index: 1,
        state_index: 0,
        layer: 1,
        resolution: 0,
        component: 0,
        precinct: 0,
    };
    let descriptors = [layer0_packet, layer1_packet];

    let plan = flatten_cuda_htj2k_packetization_job(J2kPacketizationEncodeJob {
        resolution_count: 2,
        num_layers: 2,
        num_components: 1,
        code_block_count: 2,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &descriptors,
        resolutions: &resolutions,
    })
    .expect("deferred first inclusion plan");

    assert_eq!(plan.payload, late_bytes);
    assert_eq!(plan.packets.len(), 2);
    assert_eq!(plan.blocks.len(), 2);
    assert_eq!(plan.packets[0].layer, 0);
    assert_eq!(plan.packets[1].layer, 1);
    assert_eq!(plan.blocks[0].previously_included, 0);
    assert_eq!(plan.blocks[1].previously_included, 0);
    assert_eq!(plan.blocks[0].inclusion_layer, 1);
    assert_eq!(plan.blocks[1].inclusion_layer, 1);
}
4101
/// Flattening must carry tag-tree state across packets when one code block is
/// included in the first packet and its neighbour is first included in the
/// second packet.
///
/// Besides payload, packet and block bookkeeping, the test pins the exact
/// tag-node snapshots recorded for the second packet (nodes 6..9 for
/// inclusion, 9..12 for zero bitplanes).
#[test]
fn cuda_packetization_flatten_accepts_deferred_first_inclusion_after_non_empty_packet() {
    let early_bytes = [0x11u8; 3];
    let late_bytes = [0x22u8; 5];

    // First entry: left block carries data, right block is still empty.
    let first_left = J2kPacketizationCodeBlock {
        data: &early_bytes,
        ht_cleanup_length: 0,
        ht_refinement_length: 0,
        num_coding_passes: 1,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };
    let first_right = J2kPacketizationCodeBlock {
        data: &[],
        ht_cleanup_length: 0,
        ht_refinement_length: 0,
        num_coding_passes: 0,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };
    // Second entry: left block adds nothing, right block carries data.
    let second_left = J2kPacketizationCodeBlock {
        data: &[],
        ht_cleanup_length: 0,
        ht_refinement_length: 0,
        num_coding_passes: 0,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };
    let second_right = J2kPacketizationCodeBlock {
        data: &late_bytes,
        ht_cleanup_length: 0,
        ht_refinement_length: 0,
        num_coding_passes: 1,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };

    let resolutions = [
        J2kPacketizationResolution {
            subbands: vec![J2kPacketizationSubband {
                code_blocks: vec![first_left, first_right],
                num_cbs_x: 2,
                num_cbs_y: 1,
            }],
        },
        J2kPacketizationResolution {
            subbands: vec![J2kPacketizationSubband {
                code_blocks: vec![second_left, second_right],
                num_cbs_x: 2,
                num_cbs_y: 1,
            }],
        },
    ];
    let descriptors = [
        J2kPacketizationPacketDescriptor {
            packet_index: 0,
            state_index: 0,
            layer: 0,
            resolution: 0,
            component: 0,
            precinct: 0,
        },
        J2kPacketizationPacketDescriptor {
            packet_index: 1,
            state_index: 0,
            layer: 1,
            resolution: 0,
            component: 0,
            precinct: 0,
        },
    ];

    let plan = flatten_cuda_htj2k_packetization_job(J2kPacketizationEncodeJob {
        resolution_count: 2,
        num_layers: 2,
        num_components: 1,
        code_block_count: 4,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &descriptors,
        resolutions: &resolutions,
    })
    .expect("persistent tag-tree state is flattened for CUDA packetization");

    let expected_payload = [early_bytes.as_slice(), late_bytes.as_slice()].concat();
    assert_eq!(plan.payload, expected_payload);
    assert_eq!(plan.packets.len(), 2);
    assert_eq!(plan.blocks.len(), 4);
    assert_eq!(plan.blocks[0].previously_included, 0);
    assert_eq!(plan.blocks[1].previously_included, 0);
    assert_eq!(plan.blocks[2].previously_included, 1);
    assert_eq!(plan.blocks[3].previously_included, 0);
    assert_eq!(plan.blocks[0].inclusion_layer, 0);
    assert_eq!(plan.blocks[1].inclusion_layer, 1);
    assert_eq!(plan.blocks[2].inclusion_layer, 0);
    assert_eq!(plan.blocks[3].inclusion_layer, 1);
    assert_eq!(plan.tag_states.len(), 2);
    assert_eq!(plan.tag_nodes.len(), 12);
    assert_eq!(plan.tag_states[1].inclusion_node_start, 6);
    assert_eq!(plan.tag_states[1].zero_bitplane_node_start, 9);

    // Inclusion-tree nodes recorded for the second packet.
    let expected_inclusion_nodes = [
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 0,
            known: 1,
        },
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 1,
            known: 0,
        },
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 0,
            known: 1,
        },
    ];
    assert_eq!(&plan.tag_nodes[6..9], &expected_inclusion_nodes);

    // Zero-bitplane-tree nodes recorded for the second packet.
    let expected_zero_bitplane_nodes = [
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 2,
            known: 1,
        },
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 0,
            known: 0,
        },
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 2,
            known: 1,
        },
    ];
    assert_eq!(&plan.tag_nodes[9..12], &expected_zero_bitplane_nodes);
}
4247
/// Round-trips a 64-byte gray8 image through the lossless CUDA encoder with
/// `RequireDevice`, HT block coding and zero decomposition levels.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_require_device_dispatches_cleanup_packetization_when_runtime_required()
{
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at =
        |value: u16| u8::try_from((value * 31 + 7) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..8 * 8).map(pixel_at).collect();

    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(0))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);
    let samples =
        J2kLosslessSamples::new(&pixels, 8, 8, 1, 8, false).expect("valid gray8 samples");

    let encoded = encode_j2k_lossless_with_cuda(samples, &options)
        .expect("strict CUDA single-pass HT encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);
}
4277
/// Runs the deinterleave hook on two 3-component unsigned 8-bit pixels and
/// checks the per-component float planes and the dispatch counter.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_deinterleave_stage_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let interleaved = [0u8, 128, 255, 64, 32, 16];
    let mut accelerator = CudaEncodeStageAccelerator::default();
    let job = J2kDeinterleaveToF32Job {
        pixels: &interleaved,
        num_pixels: 2,
        num_components: 3,
        bit_depth: 8,
        signed: false,
    };

    let planes = accelerator
        .encode_deinterleave(job)
        .expect("CUDA deinterleave hook")
        .expect("CUDA deinterleave dispatch");

    assert_eq!(accelerator.deinterleave_dispatches(), 1);
    // Expected planes equal each input sample minus 128.
    assert_eq!(
        planes,
        vec![vec![-128.0, -64.0], vec![0.0, -96.0], vec![127.0, -112.0]]
    );
}
4304
/// With both prefer-CPU switches enabled, the fused HT subband hook and the
/// quantize hook must return `None` while still bumping their attempt
/// counters; the dispatch report total must stay at zero throughout.
#[test]
fn prefer_cpu_ht_subband_declines_fused_subband_but_counts_attempts() {
    let mut accelerator = CudaEncodeStageAccelerator::default()
        .prefer_cpu_ht_subband(true)
        .prefer_cpu_quantize_subband(true);

    // Fused subband request: expected to be declined.
    let subband_job = J2kHtSubbandEncodeJob {
        coefficients: &[0.0; 16],
        width: 4,
        height: 4,
        step_exponent: 8,
        step_mantissa: 0,
        range_bits: 8,
        reversible: false,
        code_block_width: 4,
        code_block_height: 4,
        total_bitplanes: 9,
    };
    let subband_result = accelerator
        .encode_ht_subband(subband_job)
        .expect("subband hook can decline");

    assert!(subband_result.is_none());
    assert_eq!(accelerator.ht_subband_attempts, 1);
    assert_eq!(accelerator.quantize_subband_attempts, 1);
    assert_eq!(accelerator.ht_code_block_attempts, 1);
    assert_eq!(accelerator.dispatch_report().total(), 0);

    // Stand-alone quantize request: also declined, counter goes to two.
    let quantize_job = J2kQuantizeSubbandJob {
        coefficients: &[0.0; 16],
        step_exponent: 8,
        step_mantissa: 0,
        range_bits: 8,
        reversible: false,
    };
    let quantize_result = accelerator
        .encode_quantize_subband(quantize_job)
        .expect("quantize hook can decline");

    assert!(quantize_result.is_none());
    assert_eq!(accelerator.quantize_subband_attempts, 2);
    assert_eq!(accelerator.dispatch_report().total(), 0);
}
4344
/// Round-trips a 128x128 single-component 8-bit image through the lossless
/// CUDA encoder with `RequireDevice`, HT block coding and zero decomposition
/// levels.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_require_device_dispatches_multi_block_cleanup_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at =
        |value: u32| u8::try_from((value * 19 + 23) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u32..128 * 128).map(pixel_at).collect();

    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(0))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);
    let samples =
        J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");

    let encoded = encode_j2k_lossless_with_cuda(samples, &options)
        .expect("strict CUDA multi-block cleanup encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);
}
4373
/// Round-trips a 128x128 single-component 8-bit image through the lossless
/// CUDA encoder with `RequireDevice`, HT block coding and at most one
/// decomposition level.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_require_device_dispatches_dwt53_cleanup_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at =
        |value: u32| u8::try_from((value * 37 + 41) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u32..128 * 128).map(pixel_at).collect();

    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(1))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);
    let samples =
        J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");

    let encoded = encode_j2k_lossless_with_cuda(samples, &options)
        .expect("strict CUDA DWT cleanup encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);
}
4402
/// Encodes a 128x128 single-component image through the profiled lossless
/// CUDA entry point and checks both the round trip and the profile report.
///
/// The report must echo backend and byte counts, show non-zero dispatch and
/// block counts, non-zero timings for every stage except MCT, and an MCT
/// timing of exactly zero.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_profile_reports_resident_stage_timings_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at =
        |value: u32| u8::try_from((value * 43 + 29) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u32..128 * 128).map(pixel_at).collect();

    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(1))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);
    let samples =
        J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");

    let (encoded, stage_report) = encode_j2k_lossless_with_cuda_and_profile(samples, &options)
        .expect("strict CUDA profiled DWT cleanup encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Round trip.
    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);

    // Report bookkeeping.
    assert_eq!(stage_report.backend, BackendKind::Cuda);
    assert_eq!(stage_report.input_bytes, pixels.len());
    assert_eq!(stage_report.codestream_bytes, encoded.codestream.len());
    assert!(stage_report.dispatch_count > 0);
    assert!(stage_report.block_count > 0);

    // Per-stage timings.
    assert!(stage_report.deinterleave_us > 0);
    assert_eq!(stage_report.mct_us, 0);
    assert!(stage_report.dwt_us > 0);
    assert!(stage_report.quantize_us > 0);
    assert!(stage_report.ht_encode_us > 0);
    assert!(stage_report.packetize_us > 0);
    assert!(stage_report.total_us > 0);
}
4443
/// Round-trips a 128x128 three-component 8-bit image through the lossless
/// CUDA encoder with `RequireDevice`, HT block coding and at most one
/// decomposition level.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_require_device_dispatches_rgb_rct_cleanup_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at =
        |value: u32| u8::try_from((value * 13 + 71) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u32..128 * 128 * 3).map(pixel_at).collect();

    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(1))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);
    let samples =
        J2kLosslessSamples::new(&pixels, 128, 128, 3, 8, false).expect("valid rgb8 samples");

    let encoded = encode_j2k_lossless_with_cuda(samples, &options)
        .expect("strict CUDA RGB cleanup encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);
}
4472
/// Encodes a 64x64 single-component image through the lossy accelerator
/// facade with `RequireDevice` and HT block coding, then checks the decoded
/// geometry and the accelerator's per-stage dispatch counters.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossy_htj2k_facade_require_device_dispatches_supported_stages_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at =
        |value: u32| u8::try_from((value * 41 + 17) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u32..64 * 64).map(pixel_at).collect();

    let options = J2kLossyEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(1))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);
    let samples =
        J2kLossySamples::new(&pixels, 64, 64, 1, 8, false).expect("valid gray8 samples");
    let mut accelerator = CudaEncodeStageAccelerator::default();

    let encode_result =
        encode_j2k_lossy_with_accelerator(samples, &options, BackendKind::Cuda, &mut accelerator);
    let encoded = encode_result
        .expect("strict CUDA HTJ2K lossy facade encode should dispatch supported stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Lossy output: only the geometry is compared, not the sample values.
    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.width, 64);
    assert_eq!(decoded.height, 64);
    assert_eq!(decoded.num_components, 1);

    assert_eq!(accelerator.deinterleave_dispatches(), 1);
    assert!(accelerator.forward_dwt97_dispatches() > 0);
    assert_eq!(accelerator.quantize_subband_dispatches(), 4);
    assert_eq!(accelerator.ht_code_block_dispatches(), 4);
    assert_eq!(accelerator.packetization_dispatches(), 1);
}
4514
/// Encodes an 8x8, three-component, 8-bit reversible image with the CUDA
/// stage accelerator attached and verifies that the codestream still decodes
/// to the right geometry and that each hook was attempted.
#[test]
fn cuda_encode_stage_accelerator_preserves_cpu_codestream_validity() {
    let pixels: Vec<u8> = (0u8..192).collect();
    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: true,
        num_decomposition_levels: 1,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        8,
        8,
        3,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode with CUDA stage accelerator");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Decoded geometry.
    assert_eq!(decoded.width, 8);
    assert_eq!(decoded.height, 8);
    assert_eq!(decoded.num_components, 3);
    assert_eq!(decoded.bit_depth, 8);

    // Hook attempt counters.
    assert_eq!(accelerator.forward_rct_attempts(), 1);
    assert_eq!(accelerator.forward_dwt53_attempts(), 3);
    assert!(accelerator.tier1_code_block_attempts() > 0);
    assert_eq!(accelerator.packetization_attempts(), 1);
}
4550
/// An accelerator built with `for_auto_host_output` must answer a
/// packetization request with `Ok(None)`, counting an attempt but no
/// dispatch.
///
/// The job declares three components with no descriptors, resolutions or
/// code blocks; per the test name it is meant to be one that CUDA flattening
/// would not accept, so a successful decline shows flattening was skipped.
#[test]
fn cuda_auto_host_output_declines_packetization_before_flattening() {
    let unflattenable_job = J2kPacketizationEncodeJob {
        resolution_count: 1,
        num_layers: 1,
        num_components: 3,
        code_block_count: 0,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &[],
        resolutions: &[],
    };
    let mut accelerator = CudaEncodeStageAccelerator::for_auto_host_output();

    let outcome =
        J2kEncodeStageAccelerator::encode_packetization(&mut accelerator, unflattenable_job)
            .expect("Auto host-output CUDA packetization should decline to CPU");

    assert!(outcome.is_none());
    assert_eq!(accelerator.packetization_attempts(), 1);
    assert_eq!(accelerator.packetization_dispatches(), 0);
}
4574
/// Encodes a 105-byte, three-component reversible image with zero
/// decomposition levels and checks that the forward RCT hook was attempted
/// and dispatched exactly once, with a bit-exact round trip.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_forward_rct_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 17) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..7 * 5 * 3).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: true,
        num_decomposition_levels: 0,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        7,
        5,
        3,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode with CUDA forward RCT");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data, pixels);
    assert_eq!(accelerator.forward_rct_attempts(), 1);
    assert_eq!(accelerator.forward_rct_dispatches(), 1);
}
4612
/// Encodes a 32x32 three-component image irreversibly with HT block coding
/// and checks that the forward ICT hook was attempted and dispatched once.
///
/// Only the decoded length is compared with the input, not the sample values.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_forward_ict_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u32| u8::try_from((i * 23 + 19) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u32..32 * 32 * 3).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: false,
        use_ht_block_coding: true,
        num_decomposition_levels: 1,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        3,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode irreversible RGB with CUDA forward ICT");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data.len(), pixels.len());
    assert_eq!(accelerator.forward_ict_attempts(), 1);
    assert_eq!(accelerator.forward_ict_dispatches(), 1);
}
4651
/// Encodes an 8x8 single-component reversible image with one decomposition
/// level and checks the forward DWT 5/3 counters: one attempt, two
/// dispatches, and a bit-exact round trip.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_forward_dwt53_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 5) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..8 * 8).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: true,
        num_decomposition_levels: 1,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        8,
        8,
        1,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode with CUDA forward DWT 5/3");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data, pixels);
    assert_eq!(accelerator.forward_dwt53_attempts(), 1);
    assert_eq!(accelerator.forward_dwt53_dispatches(), 2);
}
4689
/// Encodes a 32x32 single-component image irreversibly with HT block coding
/// and one decomposition level, then checks the forward DWT 9/7 counters:
/// one attempt and three dispatches.
///
/// Only the decoded length is compared with the input, not the sample values.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_forward_dwt97_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 7 + 13) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..32 * 32).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: false,
        use_ht_block_coding: true,
        num_decomposition_levels: 1,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        1,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode with CUDA forward DWT 9/7");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data.len(), pixels.len());
    assert_eq!(accelerator.forward_dwt97_attempts(), 1);
    assert_eq!(accelerator.forward_dwt97_dispatches(), 3);
}
4728
/// Encodes a 32x32 single-component image irreversibly with HT block coding
/// and one decomposition level, then checks that the quantize hook was
/// attempted and dispatched four times.
///
/// Only the decoded length is compared with the input, not the sample values.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_quantize_subband_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 19 + 5) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..32 * 32).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: false,
        use_ht_block_coding: true,
        num_decomposition_levels: 1,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        1,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode with CUDA quantization");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data.len(), pixels.len());
    assert_eq!(accelerator.quantize_subband_attempts(), 4);
    assert_eq!(accelerator.quantize_subband_dispatches(), 4);
}
4767
/// Encodes a 32x32 single-component reversible HT image with zero
/// decomposition levels and checks that the tile hook handled the encode:
/// one tile attempt and dispatch, and no fused-subband attempts.
///
/// The remaining per-stage counters are pinned to the values asserted below.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_encode_uses_resident_tile_body_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 23 + 11) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..32 * 32).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: true,
        use_ht_block_coding: true,
        num_decomposition_levels: 0,
        code_block_width_exp: 2,
        code_block_height_exp: 2,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        1,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode HTJ2K through CUDA tile-body hook");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data, pixels);

    // Tile hook used; fused-subband hook untouched.
    assert_eq!(accelerator.htj2k_tile_attempts, 1);
    assert_eq!(accelerator.htj2k_tile_dispatches, 1);
    assert_eq!(accelerator.ht_subband_attempts, 0);
    assert_eq!(accelerator.ht_subband_dispatches, 0);

    // Per-stage counters.
    assert_eq!(accelerator.deinterleave_dispatches(), 1);
    assert_eq!(accelerator.quantize_subband_attempts(), 1);
    assert_eq!(accelerator.quantize_subband_dispatches(), 1);
    assert_eq!(accelerator.ht_code_block_attempts(), 4);
    assert_eq!(accelerator.ht_code_block_dispatches(), 1);
    assert_eq!(accelerator.packetization_attempts(), 1);
    assert_eq!(accelerator.packetization_dispatches(), 1);
}
4817
/// Encodes a 32x32 single-component reversible HT image with one
/// decomposition level and checks that the tile hook handled the encode:
/// one tile attempt and dispatch, and no fused-subband attempts.
///
/// The remaining per-stage counters are pinned to the values asserted below.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_encode_uses_resident_dwt_tile_body_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 29 + 5) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..32 * 32).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: true,
        use_ht_block_coding: true,
        num_decomposition_levels: 1,
        code_block_width_exp: 2,
        code_block_height_exp: 2,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        1,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode HTJ2K DWT through CUDA tile-body hook");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data, pixels);

    // Tile hook used; fused-subband hook untouched.
    assert_eq!(accelerator.htj2k_tile_attempts, 1);
    assert_eq!(accelerator.htj2k_tile_dispatches, 1);
    assert_eq!(accelerator.ht_subband_attempts, 0);
    assert_eq!(accelerator.ht_subband_dispatches, 0);

    // Per-stage counters.
    assert_eq!(accelerator.forward_dwt53_attempts(), 1);
    assert!(accelerator.forward_dwt53_dispatches() > 0);
    assert_eq!(accelerator.quantize_subband_attempts(), 4);
    assert_eq!(accelerator.quantize_subband_dispatches(), 4);
    assert_eq!(accelerator.ht_code_block_attempts(), 4);
    assert_eq!(accelerator.ht_code_block_dispatches(), 4);
    assert_eq!(accelerator.packetization_attempts(), 1);
    assert_eq!(accelerator.packetization_dispatches(), 1);
}
4868
/// Encodes a 32x32 three-component reversible HT image with MCT enabled and
/// one decomposition level, then checks that the tile hook handled the
/// encode with one tile attempt and dispatch and no fused-subband attempts.
///
/// The remaining per-stage counters are pinned to the values asserted below.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_encode_uses_resident_mct_dwt_tile_body_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 19 + 17) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..32 * 32 * 3).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: true,
        use_mct: true,
        use_ht_block_coding: true,
        num_decomposition_levels: 1,
        code_block_width_exp: 2,
        code_block_height_exp: 2,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        3,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode HTJ2K RGB DWT through CUDA tile-body hook");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data, pixels);

    // Tile hook used; fused-subband hook untouched.
    assert_eq!(accelerator.htj2k_tile_attempts, 1);
    assert_eq!(accelerator.htj2k_tile_dispatches, 1);
    assert_eq!(accelerator.ht_subband_attempts, 0);

    // Per-stage counters.
    assert_eq!(accelerator.forward_rct_attempts(), 1);
    assert_eq!(accelerator.forward_rct_dispatches(), 1);
    assert_eq!(accelerator.forward_dwt53_attempts(), 3);
    assert!(accelerator.forward_dwt53_dispatches() > 0);
    assert_eq!(accelerator.quantize_subband_attempts(), 12);
    assert_eq!(accelerator.quantize_subband_dispatches(), 12);
    assert_eq!(accelerator.ht_code_block_attempts(), 12);
    assert_eq!(accelerator.ht_code_block_dispatches(), 12);
    assert_eq!(accelerator.packetization_attempts(), 1);
    assert_eq!(accelerator.packetization_dispatches(), 1);
}
4921
/// Encodes a 32x32 single-component irreversible HT image with one
/// decomposition level and checks that the tile hook handled the encode:
/// one tile attempt and dispatch, and no fused-subband attempts.
///
/// Only the decoded geometry is checked, not the sample values. The remaining
/// per-stage counters are pinned to the values asserted below.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_encode_uses_resident_dwt97_tile_body_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 31 + 7) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..32 * 32).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: false,
        use_ht_block_coding: true,
        num_decomposition_levels: 1,
        code_block_width_exp: 2,
        code_block_height_exp: 2,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        1,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream =
        encode_result.expect("encode irreversible HTJ2K DWT through CUDA tile-body hook");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Decoded geometry.
    assert_eq!(decoded.width, 32);
    assert_eq!(decoded.height, 32);
    assert_eq!(decoded.num_components, 1);

    // Tile hook used; fused-subband hook untouched.
    assert_eq!(accelerator.htj2k_tile_attempts, 1);
    assert_eq!(accelerator.htj2k_tile_dispatches, 1);
    assert_eq!(accelerator.ht_subband_attempts, 0);

    // Per-stage counters.
    assert_eq!(accelerator.forward_dwt97_attempts(), 1);
    assert!(accelerator.forward_dwt97_dispatches() > 0);
    assert_eq!(accelerator.quantize_subband_attempts(), 4);
    assert_eq!(accelerator.quantize_subband_dispatches(), 4);
    assert_eq!(accelerator.ht_code_block_attempts(), 4);
    assert_eq!(accelerator.ht_code_block_dispatches(), 4);
    assert_eq!(accelerator.packetization_attempts(), 1);
    assert_eq!(accelerator.packetization_dispatches(), 1);
}
4973
/// Encodes an 8x8 single-component reversible HT image with zero
/// decomposition levels and checks that HT code-block encoding was attempted
/// and dispatched, that dispatches never exceed attempts, and that the
/// dispatch report agrees with the accessor.
///
/// Returns immediately unless `J2K_REQUIRE_CUDA_RUNTIME` is set.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_htj2k_codeblock_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    let pixel_at = |i: u16| u8::try_from((i * 11 + 3) & 0xFF).expect("masked value fits in u8");
    let pixels: Vec<u8> = (0u16..8 * 8).map(pixel_at).collect();

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let options = EncodeOptions {
        reversible: true,
        use_ht_block_coding: true,
        num_decomposition_levels: 0,
        code_block_width_exp: 2,
        code_block_height_exp: 2,
        ..EncodeOptions::default()
    };

    let encode_result = encode_with_cuda_test_accelerator(
        &pixels,
        8,
        8,
        1,
        8,
        false,
        &options,
        &mut accelerator,
    );
    let codestream = encode_result.expect("encode HTJ2K with CUDA HT codeblock kernel");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data, pixels);
    assert!(accelerator.ht_code_block_attempts() > 0);
    assert!(accelerator.ht_code_block_dispatches() > 0);
    assert!(accelerator.ht_code_block_dispatches() <= accelerator.ht_code_block_attempts());
    assert_eq!(
        accelerator.dispatch_report().ht_code_block,
        accelerator.ht_code_block_dispatches()
    );
}
5019
/// Runtime-gated check of a direct `encode_ht_code_block` call on the CUDA
/// accelerator.
///
/// A 4x4 block of coefficients is submitted with four total bitplanes and two
/// target coding passes. The encoded block must report two coding passes, two
/// zero bitplanes and a one-byte refinement segment. Its cleanup and
/// refinement lengths must add up to the payload length, and exactly one HT
/// code-block dispatch must have been recorded.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_htj2k_codeblock_preserves_requested_refinement_passes_when_runtime_required() {
    if !cuda_runtime_required() {
        return;
    }

    let block_coefficients = [0, 3, -5, 3, 5, 0, -3, 3, 7, -3, 0, 3, 0, 0, 5, -5];
    let mut cuda_stage = CudaEncodeStageAccelerator::default();

    let job = J2kHtCodeBlockEncodeJob {
        coefficients: &block_coefficients,
        width: 4,
        height: 4,
        total_bitplanes: 4,
        target_coding_passes: 2,
    };
    // The outer `expect` covers the hook call itself, the inner one its output.
    let hook_result = cuda_stage
        .encode_ht_code_block(job)
        .expect("CUDA HTJ2K code-block encode hook");
    let block = hook_result.expect("CUDA HTJ2K code-block encode output");

    assert_eq!(block.num_coding_passes, 2);
    assert_eq!(block.num_zero_bitplanes, 2);
    assert_eq!(block.refinement_length, 1);

    // The two segment lengths together must account for the whole payload.
    let segment_total = block.cleanup_length + block.refinement_length;
    let payload_len = u32::try_from(block.data.len()).expect("test payload length fits u32");
    assert_eq!(segment_total, payload_len);

    assert_eq!(cuda_stage.ht_code_block_dispatches(), 1);
}
5050
/// Runtime-gated check that HTJ2K code blocks are encoded as one batch.
///
/// The test returns early unless `cuda_runtime_required()` is true, which is
/// the same gate the sibling runtime-required tests in this module use.
/// When it runs, a deterministic 1024-sample pattern is encoded reversibly
/// with HT block coding and zero decomposition levels, then decoded and
/// compared sample-for-sample. The accelerator must record more than one HT
/// code-block attempt but exactly one dispatch, and the dispatch report must
/// agree with the dispatch counter.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_htj2k_codeblock_batch_uses_single_dispatch_when_runtime_required() {
    // NOTE(review): presumably keyed off J2K_REQUIRE_CUDA_RUNTIME like the
    // inline check this replaces; confirm against the helper definition.
    if !cuda_runtime_required() {
        return;
    }

    // Deterministic pattern; the 0xFF mask keeps every value inside u8 range.
    let pixels: Vec<u8> = (0u16..32 * 32)
        .map(|i| u8::try_from((i * 17 + 9) & 0xFF).expect("masked value fits in u8"))
        .collect();
    let options = EncodeOptions {
        reversible: true,
        use_ht_block_coding: true,
        num_decomposition_levels: 0,
        code_block_width_exp: 2,
        code_block_height_exp: 2,
        ..EncodeOptions::default()
    };
    let mut accelerator = CudaEncodeStageAccelerator::default();

    // NOTE(review): the positional arguments presumably describe a 32x32,
    // single-component, 8-bit unsigned image; confirm against the helper.
    let codestream = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        1,
        8,
        false,
        &options,
        &mut accelerator,
    )
    .expect("encode HTJ2K with CUDA HT batch codeblock kernel");
    let decoded = Image::new(&codestream, &DecodeSettings::default())
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Reversible coding must reproduce the input exactly.
    assert_eq!(decoded.data, pixels);
    // Several code blocks were attempted, yet only one dispatch was recorded.
    assert!(accelerator.ht_code_block_attempts() > 1);
    assert_eq!(accelerator.ht_code_block_dispatches(), 1);
    assert!(
        accelerator.ht_code_block_dispatches() < accelerator.ht_code_block_attempts(),
        "batch encode must not launch one kernel per codeblock"
    );
    assert_eq!(
        accelerator.dispatch_report().ht_code_block,
        accelerator.ht_code_block_dispatches()
    );
}
5099
/// Runtime-gated comparison of the resident and host-staged HTJ2K encode paths.
///
/// The same eight samples are quantized twice with identical parameters: once
/// from an uploaded buffer via `j2k_quantize_subband_resident`, and once from
/// the host slice via `j2k_quantize_subband`. Each result is then encoded as a
/// single 4x2 code-block job. The resident encode must report one kernel
/// dispatch, and both paths must yield the same number of code blocks with
/// identical payloads and metadata.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_resident_quantized_subband_feeds_resident_ht_batch_when_runtime_required() {
    if !cuda_runtime_required() {
        return;
    }

    let input = [-3.6f32, -2.5, -0.4, 0.0, 0.49, 1.5, 3.2, 9.9];
    let cuda = CudaContext::system_default().expect("CUDA context");
    let device_input = cuda.upload_f32(&input).expect("resident samples");
    let quantize_job = CudaJ2kQuantizeJob {
        step_exponent: 8,
        step_mantissa: 0,
        range_bits: 8,
        reversible: true,
    };

    // Quantize the same samples through both paths with the same parameters.
    let device_quantized = cuda
        .j2k_quantize_subband_resident(&device_input, input.len(), quantize_job)
        .expect("resident quantization");
    let host_quantized = cuda
        .j2k_quantize_subband(&input, quantize_job)
        .expect("host-staged quantization");

    let block_jobs = [CudaHtj2kEncodeCodeBlockJob {
        coefficient_offset: 0,
        width: 4,
        height: 2,
        total_bitplanes: 5,
        target_coding_passes: 1,
    }];

    let from_device = cuda
        .encode_htj2k_codeblocks_resident(
            device_quantized.buffer(),
            device_quantized.coefficient_count(),
            &block_jobs,
            cuda_htj2k_encode_tables(),
        )
        .expect("resident HTJ2K encode");
    let from_host = cuda
        .encode_htj2k_codeblocks(
            host_quantized.coefficients(),
            &block_jobs,
            cuda_htj2k_encode_tables(),
        )
        .expect("host-staged HTJ2K encode");

    assert_eq!(device_quantized.coefficient_count(), input.len());
    assert_eq!(from_device.execution().kernel_dispatches(), 1);

    let device_blocks = from_device.code_blocks();
    let host_blocks = from_host.code_blocks();
    assert_eq!(device_blocks.len(), host_blocks.len());

    // Pairwise comparison of every encoded block produced by the two paths.
    device_blocks
        .iter()
        .zip(host_blocks)
        .for_each(|(resident, staged)| {
            assert_eq!(resident.data(), staged.data());
            assert_eq!(resident.cleanup_length(), staged.cleanup_length());
            assert_eq!(resident.refinement_length(), staged.refinement_length());
            assert_eq!(resident.num_coding_passes(), staged.num_coding_passes());
            assert_eq!(resident.num_zero_bitplanes(), staged.num_zero_bitplanes());
        });
}
5164
/// Runtime-gated check that a strided resident code-block region encodes the
/// same way as the equivalent coefficients gathered on the host.
///
/// Sixteen samples (`-8.0..=7.0`) are uploaded and quantized on the device.
/// The region job addresses a 2x2 window at offset 5 with stride 4. The host
/// reference downloads the quantized coefficients, gathers indices 5, 6, 9
/// and 10 into a contiguous block, and encodes that block at offset 0. The
/// resident encode must report one kernel dispatch and one code block, and
/// that block's payload and zero-bitplane count must equal the host-gathered
/// result.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_resident_strided_codeblock_region_matches_host_gather_when_runtime_required() {
    if !cuda_runtime_required() {
        return;
    }

    let samples: Vec<f32> = (0u16..16).map(|value| f32::from(value) - 8.0).collect();
    let context = CudaContext::system_default().expect("CUDA context");
    let sample_buffer = context.upload_f32(&samples).expect("resident samples");
    let quantization = CudaJ2kQuantizeJob {
        step_exponent: 8,
        step_mantissa: 0,
        range_bits: 8,
        reversible: true,
    };
    let resident_quantized = context
        .j2k_quantize_subband_resident(&sample_buffer, samples.len(), quantization)
        .expect("resident quantization");
    let quantized = resident_quantized
        .download_coefficients()
        .expect("download quantized coefficients");
    // Host-side gather of the coefficients the strided region job is expected
    // to read: offset 5 and 6 on the first row, 5 + 4 and 6 + 4 on the second.
    let gathered_codeblock = vec![quantized[5], quantized[6], quantized[9], quantized[10]];
    let region_jobs = [CudaHtj2kEncodeCodeBlockRegionJob {
        coefficient_offset: 5,
        coefficient_stride: 4,
        width: 2,
        height: 2,
        total_bitplanes: 5,
        target_coding_passes: 1,
    }];
    let contiguous_jobs = [CudaHtj2kEncodeCodeBlockJob {
        coefficient_offset: 0,
        width: 2,
        height: 2,
        total_bitplanes: 5,
        target_coding_passes: 1,
    }];

    let resident_encoded = context
        .encode_htj2k_codeblock_regions_resident(
            resident_quantized.buffer(),
            resident_quantized.coefficient_count(),
            &region_jobs,
            cuda_htj2k_encode_tables(),
        )
        .expect("resident strided HTJ2K encode");
    let staged_encoded = context
        .encode_htj2k_codeblocks(
            &gathered_codeblock,
            &contiguous_jobs,
            cuda_htj2k_encode_tables(),
        )
        .expect("host-gathered HTJ2K encode");

    assert_eq!(resident_encoded.execution().kernel_dispatches(), 1);
    assert_eq!(resident_encoded.code_blocks().len(), 1);
    assert_eq!(
        resident_encoded.code_blocks()[0].data(),
        staged_encoded.code_blocks()[0].data()
    );
    assert_eq!(
        resident_encoded.code_blocks()[0].num_zero_bitplanes(),
        staged_encoded.code_blocks()[0].num_zero_bitplanes()
    );
}
5231}