Skip to main content

j2k_cuda/
encode.rs

1// SPDX-License-Identifier: Apache-2.0
2
3use j2k::adapter::encode_stage::{
4    EncodedHtJ2kCodeBlock, EncodedJ2kCodeBlock, J2kDeinterleaveToF32Job, J2kEncodeDispatchReport,
5    J2kEncodeStageAccelerator, J2kForwardDwt53Job, J2kForwardDwt53Output, J2kForwardDwt97Job,
6    J2kForwardDwt97Output, J2kForwardIctJob, J2kForwardRctJob, J2kHtCodeBlockEncodeJob,
7    J2kHtSubbandEncodeJob, J2kHtj2kTileEncodeJob, J2kPacketizationBlockCodingMode,
8    J2kPacketizationCodeBlock, J2kPacketizationEncodeJob, J2kPacketizationResolution,
9    J2kQuantizeSubbandJob, J2kTier1CodeBlockEncodeJob,
10};
11#[cfg(feature = "cuda-runtime")]
12use j2k::adapter::encode_stage::{
13    J2kForwardDwt53Level, J2kForwardDwt97Level, J2kPacketizationPacketDescriptor,
14    J2kPacketizationSubband,
15};
16use j2k_core::BackendKind;
17#[cfg(feature = "cuda-runtime")]
18use j2k_core::{DeviceSubmission, DeviceSubmitSession, PixelFormat, ReadySubmission};
19#[cfg(feature = "cuda-runtime")]
20use j2k_cuda_runtime::{
21    CudaContext, CudaDeviceBuffer, CudaDwt53LevelShape, CudaDwt53Output, CudaDwt97Output,
22    CudaError, CudaHtj2kEncodeCodeBlockJob, CudaHtj2kEncodeCodeBlockRegionJob,
23    CudaHtj2kEncodeResources, CudaHtj2kEncodeTables, CudaHtj2kPacketizationBlock,
24    CudaHtj2kPacketizationPacket, CudaHtj2kPacketizationSubband,
25    CudaHtj2kPacketizationSubbandTagState, CudaHtj2kPacketizationTagNodeState, CudaJ2kQuantizeJob,
26    CudaJ2kQuantizeSubbandRegionJob, CudaJ2kResidentComponents, CudaJ2kStridedInterleavedPixels,
27};
28#[cfg(feature = "cuda-runtime")]
29use std::{
30    sync::Arc,
31    time::{Duration, Instant},
32};
33
34use j2k_native::packet_math;
35
36use crate::profile;
37#[cfg(feature = "cuda-runtime")]
38use crate::{runtime::cuda_error, session::CudaSession};
39
40/// Encode lossless JPEG 2000/HTJ2K samples through the CUDA encode-stage adapter.
41///
42/// This CUDA-named API is strict: every caller-provided backend preference is
43/// treated as `EncodeBackendPreference::RequireDevice`, so unsupported stage
44/// coverage returns an error instead of a CPU fallback codestream.
45pub fn encode_j2k_lossless_with_cuda(
46    samples: j2k::J2kLosslessSamples<'_>,
47    options: &j2k::J2kLosslessEncodeOptions,
48) -> Result<j2k::EncodedJ2k, crate::Error> {
49    let strict_options = strict_cuda_encode_options(*options);
50    let profile_enabled = profile::profile_stages_enabled();
51    let mut accelerator = CudaEncodeStageAccelerator::with_profile_collection(profile_enabled);
52    let total_start = profile::profile_now(profile_enabled);
53    let encoded = j2k::encode_j2k_lossless_with_accelerator(
54        samples,
55        &strict_options,
56        BackendKind::Cuda,
57        &mut accelerator,
58    )?;
59    reject_non_cuda_encode_backend(&encoded)?;
60    if profile_enabled {
61        accelerator
62            .encode_profile_report(
63                &encoded,
64                samples.data.len(),
65                profile::elapsed_us(total_start),
66            )
67            .emit("encode");
68    }
69    Ok(encoded)
70}
71
72/// Encode lossless JPEG 2000/HTJ2K samples through CUDA and return stage timings.
73pub fn encode_j2k_lossless_with_cuda_and_profile(
74    samples: j2k::J2kLosslessSamples<'_>,
75    options: &j2k::J2kLosslessEncodeOptions,
76) -> Result<(j2k::EncodedJ2k, profile::CudaHtj2kEncodeProfileReport), crate::Error> {
77    let input_bytes = samples.data.len();
78    let strict_options = strict_cuda_encode_options(*options);
79    let mut accelerator = CudaEncodeStageAccelerator::with_profile_collection(true);
80    let total_start = profile::profile_now(true);
81    let encoded = j2k::encode_j2k_lossless_with_accelerator(
82        samples,
83        &strict_options,
84        BackendKind::Cuda,
85        &mut accelerator,
86    )?;
87    reject_non_cuda_encode_backend(&encoded)?;
88    let report =
89        accelerator.encode_profile_report(&encoded, input_bytes, profile::elapsed_us(total_start));
90    report.emit("encode");
91    Ok((encoded, report))
92}
93
94fn strict_cuda_encode_options(
95    options: j2k::J2kLosslessEncodeOptions,
96) -> j2k::J2kLosslessEncodeOptions {
97    options.with_backend(j2k::EncodeBackendPreference::RequireDevice)
98}
99
100fn reject_non_cuda_encode_backend(encoded: &j2k::EncodedJ2k) -> Result<(), crate::Error> {
101    if encoded.backend == BackendKind::Cuda {
102        Ok(())
103    } else {
104        Err(crate::Error::UnsupportedCudaRequest {
105            reason: "strict CUDA HTJ2K encode did not dispatch all required stages",
106        })
107    }
108}
109
/// One CUDA-resident input tile for a lossless J2K/HTJ2K encode.
///
/// The tile borrows its device buffer; the byte offset, pitch and dimensions
/// together describe which bytes of that buffer hold the pixels to encode.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy, Debug)]
pub struct CudaLosslessEncodeTile<'a> {
    /// CUDA buffer holding the interleaved Gray/RGB/RGBA source pixels.
    pub buffer: &'a CudaDeviceBuffer,
    /// Offset, in bytes, of the first source pixel inside `buffer`.
    pub byte_offset: usize,
    /// Width, in pixels, of the valid input region.
    pub width: u32,
    /// Height, in pixels, of the valid input region.
    pub height: u32,
    /// Distance, in bytes, from the start of one input row to the next.
    pub pitch_bytes: usize,
    /// Width, in pixels, of the encoded image.
    pub output_width: u32,
    /// Height, in pixels, of the encoded image.
    pub output_height: u32,
    /// Pixel format of the data in `buffer`.
    pub format: PixelFormat,
}
131
/// Which stages of a lossless CUDA device-buffer encode stayed on the device.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CudaLosslessEncodeResidency {
    /// True when coefficient preparation ran on CUDA.
    pub coefficient_prep_used: bool,
    /// True when packetization ran on CUDA.
    pub packetization_used: bool,
    /// True when final codestream assembly stayed resident on CUDA.
    pub codestream_assembly_used: bool,
}
143
/// Result of a lossless CUDA device-buffer encode: the host codestream plus
/// the residency decisions and timings recorded while producing it.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CudaLosslessEncodeOutcome {
    /// The encoded J2K codestream.
    pub encoded: j2k::EncodedJ2k,
    /// True when the input buffer had to be copied or padded.
    pub input_copy_used: bool,
    /// Per-stage residency decisions for this encode.
    pub resident: CudaLosslessEncodeResidency,
    /// Time spent copying or padding the input.
    pub input_copy_duration: Duration,
    /// End-to-end encode time for this tile.
    pub encode_duration: Duration,
    /// GPU-only time, present only when timestamp data is available.
    pub gpu_duration: Option<Duration>,
    /// Time spent validating the encoded output.
    pub validation_duration: Duration,
    /// Time spent turning CUDA output into host codestream bytes.
    pub host_readback_duration: Duration,
    /// CUDA encode-stage timing buckets gathered for this tile.
    pub stage_timings: CudaEncodeStageTimings,
}
167
/// Handle for a submitted single-tile CUDA lossless encode.
///
/// Wraps a `ReadySubmission` carrying the encode result; call
/// `DeviceSubmission::wait` to retrieve it.
#[cfg(feature = "cuda-runtime")]
#[derive(Debug)]
pub struct SubmittedJ2kLosslessCudaEncode {
    // Stored result (codestream or error) of the encode.
    inner: ReadySubmission<j2k::EncodedJ2k, crate::Error>,
}
174
/// Handle for a submitted multi-tile CUDA lossless encode.
///
/// Wraps a `ReadySubmission` carrying one codestream per tile (or the batch
/// error); call `DeviceSubmission::wait` to retrieve it.
#[cfg(feature = "cuda-runtime")]
#[derive(Debug)]
pub struct SubmittedJ2kLosslessCudaEncodeBatch {
    // Stored result (codestreams or error) of the batch encode.
    inner: ReadySubmission<Vec<j2k::EncodedJ2k>, crate::Error>,
}
181
#[cfg(feature = "cuda-runtime")]
impl DeviceSubmission for SubmittedJ2kLosslessCudaEncode {
    type Output = j2k::EncodedJ2k;
    type Error = crate::Error;

    /// Consume the submission and return the stored encode result.
    fn wait(self) -> Result<Self::Output, Self::Error> {
        let Self { inner } = self;
        inner.wait()
    }
}
191
#[cfg(feature = "cuda-runtime")]
impl DeviceSubmission for SubmittedJ2kLosslessCudaEncodeBatch {
    type Output = Vec<j2k::EncodedJ2k>;
    type Error = crate::Error;

    /// Consume the submission and return the stored batch result.
    fn wait(self) -> Result<Self::Output, Self::Error> {
        let Self { inner } = self;
        inner.wait()
    }
}
201
/// Encode one CUDA-resident tile into host codestream bytes.
///
/// This is `submit_lossless_from_cuda_buffer` followed immediately by `wait`.
///
/// # Errors
///
/// Returns whatever error the submission or its `wait` reports.
#[cfg(feature = "cuda-runtime")]
pub fn encode_lossless_from_cuda_buffer(
    tile: CudaLosslessEncodeTile<'_>,
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<j2k::EncodedJ2k, crate::Error> {
    let submission = submit_lossless_from_cuda_buffer(tile, options, session)?;
    submission.wait()
}
211
/// Submit one CUDA-resident tile encode for later host-byte collection.
///
/// The encode is performed before this function returns; its outcome
/// (codestream or error) is stored in the returned submission and surfaces
/// from `wait`. This function itself always returns `Ok`.
#[cfg(feature = "cuda-runtime")]
pub fn submit_lossless_from_cuda_buffer(
    tile: CudaLosslessEncodeTile<'_>,
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<SubmittedJ2kLosslessCudaEncode, crate::Error> {
    let report = encode_lossless_from_cuda_buffer_with_report(tile, options, session);
    // Only the codestream is kept; timings and residency are dropped here.
    let codestream = report.map(|outcome| outcome.encoded);
    let inner = ReadySubmission::from_result(codestream);
    Ok(SubmittedJ2kLosslessCudaEncode { inner })
}
225
/// Encode one CUDA-resident tile and return a host-byte timing report.
///
/// Options and tile geometry are validated first; the submit is recorded on
/// `session` only once both checks pass.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` for rejected options or
/// tile geometry, and propagates any failure from the tile encode.
#[cfg(feature = "cuda-runtime")]
pub fn encode_lossless_from_cuda_buffer_with_report(
    tile: CudaLosslessEncodeTile<'_>,
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<CudaLosslessEncodeOutcome, crate::Error> {
    let options = *options;
    validate_cuda_encode_options(options)?;
    validate_cuda_encode_tile(tile)?;

    session.record_submit();
    encode_lossless_cuda_tile_with_report(tile, options)
}
238
/// Encode multiple CUDA-resident tiles into host codestream bytes.
///
/// This is `submit_lossless_from_cuda_buffers` followed immediately by `wait`.
///
/// # Errors
///
/// Returns whatever error the batch submission or its `wait` reports.
#[cfg(feature = "cuda-runtime")]
pub fn encode_lossless_from_cuda_buffers(
    tiles: &[CudaLosslessEncodeTile<'_>],
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<Vec<j2k::EncodedJ2k>, crate::Error> {
    let submission = submit_lossless_from_cuda_buffers(tiles, options, session)?;
    submission.wait()
}
248
/// Submit multiple CUDA-resident tile encodes for later host-byte collection.
///
/// The batch is encoded before this function returns; its outcome (one
/// codestream per tile, or the first error) is stored in the returned
/// submission and surfaces from `wait`. This function itself always returns
/// `Ok`.
#[cfg(feature = "cuda-runtime")]
pub fn submit_lossless_from_cuda_buffers(
    tiles: &[CudaLosslessEncodeTile<'_>],
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<SubmittedJ2kLosslessCudaEncodeBatch, crate::Error> {
    let reports = encode_lossless_from_cuda_buffers_with_report(tiles, options, session);
    // Only the codestreams are kept; timings and residency are dropped here.
    let codestreams = reports.map(|outcomes| {
        outcomes
            .into_iter()
            .map(|outcome| outcome.encoded)
            .collect::<Vec<_>>()
    });
    let inner = ReadySubmission::from_result(codestreams);
    Ok(SubmittedJ2kLosslessCudaEncodeBatch { inner })
}
267
/// Encode multiple CUDA-resident tiles and return host-byte timing reports.
///
/// Tiles are processed in slice order. Each tile is validated, recorded as a
/// submit on `session`, and encoded before the next tile is looked at, so a
/// failure on one tile stops the batch and discards earlier outcomes.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` for an empty batch, rejected
/// options, or rejected tile geometry, and propagates the first tile encode
/// failure.
#[cfg(feature = "cuda-runtime")]
pub fn encode_lossless_from_cuda_buffers_with_report(
    tiles: &[CudaLosslessEncodeTile<'_>],
    options: &j2k::J2kLosslessEncodeOptions,
    session: &mut CudaSession,
) -> Result<Vec<CudaLosslessEncodeOutcome>, crate::Error> {
    if tiles.is_empty() {
        return Err(crate::Error::UnsupportedCudaRequest {
            reason: "J2K CUDA encode received an empty tile batch",
        });
    }
    let options = *options;
    validate_cuda_encode_options(options)?;

    let mut outcomes = Vec::with_capacity(tiles.len());
    for &tile in tiles {
        validate_cuda_encode_tile(tile)?;
        session.record_submit();
        outcomes.push(encode_lossless_cuda_tile_with_report(tile, options)?);
    }
    Ok(outcomes)
}
291
/// Check that `options` are usable for a CUDA device-buffer encode.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` when the block coding mode
/// is not `HighThroughput`, or when validation is not `External`. The block
/// coding mode is checked first.
#[cfg(feature = "cuda-runtime")]
fn validate_cuda_encode_options(
    options: j2k::J2kLosslessEncodeOptions,
) -> Result<(), crate::Error> {
    let reason = if options.block_coding_mode != j2k::J2kBlockCodingMode::HighThroughput {
        "J2K CUDA device-buffer encode currently requires HTJ2K block coding"
    } else if options.validation != j2k::J2kEncodeValidation::External {
        "J2K CUDA device-buffer encode requires external validation to avoid host input readback"
    } else {
        return Ok(());
    };
    Err(crate::Error::UnsupportedCudaRequest { reason })
}
308
/// Check that `tile` describes a byte range that fits inside its CUDA buffer.
///
/// The checks run in this order: nonzero dimensions, input size equal to
/// output size (padding is not supported), a supported pixel format, a pitch
/// of at least one row, and finally that
/// `byte_offset + pitch_bytes * (height - 1) + row_bytes` neither overflows
/// nor exceeds the buffer length.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` naming the first check that
/// failed, or the error from `cuda_encode_format`.
#[cfg(feature = "cuda-runtime")]
fn validate_cuda_encode_tile(tile: CudaLosslessEncodeTile<'_>) -> Result<(), crate::Error> {
    let unsupported = |reason: &'static str| crate::Error::UnsupportedCudaRequest { reason };

    let dimensions = [tile.width, tile.height, tile.output_width, tile.output_height];
    if dimensions.contains(&0) {
        return Err(unsupported(
            "J2K CUDA encode tile dimensions must be nonzero",
        ));
    }
    if (tile.width, tile.height) != (tile.output_width, tile.output_height) {
        return Err(unsupported(
            "J2K CUDA device-buffer encode does not yet support input padding",
        ));
    }

    let format = cuda_encode_format(tile.format)?;
    let row_bytes = (tile.width as usize)
        .checked_mul(format.bytes_per_pixel)
        .ok_or_else(|| unsupported("J2K CUDA encode row byte count overflow"))?;
    if tile.pitch_bytes < row_bytes {
        return Err(unsupported(
            "J2K CUDA encode tile pitch is shorter than one row",
        ));
    }

    // Only the rows before the last one consume a full pitch; the last row
    // needs just its own pixel bytes.
    let rows_before_last = tile.height.saturating_sub(1) as usize;
    let span_bytes = tile
        .pitch_bytes
        .checked_mul(rows_before_last)
        .and_then(|prefix| prefix.checked_add(row_bytes))
        .ok_or_else(|| unsupported("J2K CUDA encode input byte range overflow"))?;
    let required_end = tile
        .byte_offset
        .checked_add(span_bytes)
        .ok_or_else(|| unsupported("J2K CUDA encode input byte range overflow"))?;

    if required_end > tile.buffer.byte_len() {
        return Err(unsupported(
            "J2K CUDA encode input byte range exceeds buffer length",
        ));
    }
    Ok(())
}
352
/// Encoder-facing description of a source pixel format, as produced by
/// `cuda_encode_format`.
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy, Debug)]
struct CudaEncodeFormat {
    /// Channel count of the pixel format.
    components: u8,
    /// Bits per sample: 8 for one-byte samples, 16 for two-byte samples.
    bit_depth: u8,
    /// Bytes occupied by one interleaved pixel.
    bytes_per_pixel: usize,
}
360
/// Translate a `PixelFormat` into the component count, bit depth and pixel
/// size the CUDA encode path works with.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` when the channel count does
/// not fit in a `u8`, or when samples are neither one nor two bytes wide.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_format(format: PixelFormat) -> Result<CudaEncodeFormat, crate::Error> {
    let Ok(components) = u8::try_from(format.channels()) else {
        return Err(crate::Error::UnsupportedCudaRequest {
            reason: "J2K CUDA encode received a pixel format with too many components",
        });
    };

    let bit_depth: u8 = match format.bytes_per_sample() {
        1 => 8,
        2 => 16,
        _ => {
            return Err(crate::Error::UnsupportedCudaRequest {
                reason: "J2K CUDA encode received an unsupported sample width",
            });
        }
    };

    let bytes_per_pixel = format.bytes_per_pixel();
    Ok(CudaEncodeFormat {
        components,
        bit_depth,
        bytes_per_pixel,
    })
}
382
/// Encode one already-validated CUDA-resident tile and assemble its outcome.
///
/// A zero-filled host buffer sized to the output image is allocated only so a
/// `J2kLosslessSamples` descriptor can be built; the accelerator is handed the
/// device tile itself. HTJ2K encode resources are uploaded to the tile
/// buffer's context on every call.
///
/// In the returned outcome, `input_copy_used` and `codestream_assembly_used`
/// are always `false`, `gpu_duration` is always `None`, and the input-copy,
/// validation and host-readback durations are always zero.
///
/// # Errors
///
/// Returns `crate::Error::UnsupportedCudaRequest` on descriptor-length
/// overflow or when the result does not report the CUDA backend, and
/// propagates format, sample-descriptor, resource-upload and encoder errors.
#[cfg(feature = "cuda-runtime")]
fn encode_lossless_cuda_tile_with_report(
    tile: CudaLosslessEncodeTile<'_>,
    options: j2k::J2kLosslessEncodeOptions,
) -> Result<CudaLosslessEncodeOutcome, crate::Error> {
    let started = Instant::now();
    let format = cuda_encode_format(tile.format)?;

    let (out_width, out_height) = (tile.output_width, tile.output_height);
    let Some(placeholder_len) = (out_width as usize)
        .checked_mul(out_height as usize)
        .and_then(|pixels| pixels.checked_mul(format.bytes_per_pixel))
    else {
        return Err(crate::Error::UnsupportedCudaRequest {
            reason: "J2K CUDA encode sample descriptor length overflow",
        });
    };
    let placeholder = vec![0u8; placeholder_len];
    let samples = j2k::J2kLosslessSamples::new(
        &placeholder,
        out_width,
        out_height,
        format.components,
        format.bit_depth,
        false,
    )?;

    let context = tile.buffer.context();
    let resources = context
        .upload_htj2k_encode_resources(cuda_htj2k_encode_tables())
        .map_err(cuda_error)?;
    let mut device_stages = CudaDeviceBufferEncodeAccelerator {
        tile,
        context,
        resources,
        dispatch: J2kEncodeDispatchReport::default(),
        stage_timings: CudaEncodeStageTimings::default(),
    };

    let device_only = strict_cuda_encode_options(options);
    let encoded = j2k::encode_j2k_lossless_with_accelerator(
        samples,
        &device_only,
        BackendKind::Cuda,
        &mut device_stages,
    )?;
    reject_non_cuda_encode_backend(&encoded)?;

    // Residency is derived from the dispatch counters the accelerator filled in.
    let resident = CudaLosslessEncodeResidency {
        coefficient_prep_used: device_stages.dispatch.deinterleave > 0,
        packetization_used: device_stages.dispatch.packetization > 0,
        codestream_assembly_used: false,
    };
    let encode_duration = started.elapsed();

    Ok(CudaLosslessEncodeOutcome {
        encoded,
        input_copy_used: false,
        resident,
        input_copy_duration: Duration::ZERO,
        encode_duration,
        gpu_duration: None,
        validation_duration: Duration::ZERO,
        host_readback_duration: Duration::ZERO,
        stage_timings: device_stages.stage_timings,
    })
}
439
/// Encode-stage accelerator used for a single CUDA-resident tile.
///
/// It carries the device tile, the CUDA context and uploaded HTJ2K encode
/// resources, and accumulates dispatch counts and stage timings as tiles are
/// encoded through it.
#[cfg(feature = "cuda-runtime")]
struct CudaDeviceBufferEncodeAccelerator<'a> {
    /// Device tile passed to the tile-body encoder.
    tile: CudaLosslessEncodeTile<'a>,
    /// CUDA context passed to the tile-body encoder.
    context: CudaContext,
    /// Uploaded HTJ2K encode resources passed to the tile-body encoder.
    resources: CudaHtj2kEncodeResources,
    /// Running per-stage dispatch counters.
    dispatch: J2kEncodeDispatchReport,
    /// Running per-stage timing totals.
    stage_timings: CudaEncodeStageTimings,
}
448
#[cfg(feature = "cuda-runtime")]
impl J2kEncodeStageAccelerator for CudaDeviceBufferEncodeAccelerator<'_> {
    /// Return a copy of the dispatch counters accumulated so far.
    fn dispatch_report(&self) -> J2kEncodeDispatchReport {
        self.dispatch
    }

    /// Encode one HTJ2K tile body from the device tile.
    ///
    /// Returns `Ok(None)` when the device tile encoder yields nothing. On
    /// success, the per-stage dispatch counts and timings it reports are
    /// added (saturating) to this accelerator's running totals and the tile
    /// bytes are returned.
    fn encode_htj2k_tile(
        &mut self,
        job: J2kHtj2kTileEncodeJob<'_>,
    ) -> core::result::Result<Option<Vec<u8>>, &'static str> {
        let tile_body = cuda_encode_htj2k_device_tile_body(
            &self.context,
            &self.resources,
            self.tile,
            job,
            true,
        )?;
        let Some(encoded) = tile_body else {
            return Ok(None);
        };

        let counters = &mut self.dispatch;
        counters.deinterleave = counters
            .deinterleave
            .saturating_add(encoded.deinterleave_dispatches);
        counters.forward_rct = counters
            .forward_rct
            .saturating_add(encoded.forward_rct_dispatches);
        counters.forward_ict = counters
            .forward_ict
            .saturating_add(encoded.forward_ict_dispatches);
        counters.forward_dwt53 = counters
            .forward_dwt53
            .saturating_add(encoded.forward_dwt53_dispatches);
        counters.forward_dwt97 = counters
            .forward_dwt97
            .saturating_add(encoded.forward_dwt97_dispatches);
        counters.quantize_subband = counters
            .quantize_subband
            .saturating_add(encoded.quantize_dispatches);
        counters.ht_code_block = counters
            .ht_code_block
            .saturating_add(encoded.ht_code_block_dispatches);
        counters.packetization = counters
            .packetization
            .saturating_add(encoded.packetization_dispatches);

        let accumulated = self.stage_timings.saturating_add(encoded.timings);
        self.stage_timings = accumulated;
        Ok(Some(encoded.tile_data))
    }
}
505
/// CUDA implementation of selected JPEG 2000 encode stages.
///
/// Besides the CUDA handles (present only with the `cuda-runtime` feature),
/// the accelerator keeps per-stage attempt counters, per-stage dispatch
/// counters, CPU-preference flags, and cumulative stage timings in
/// microseconds. `Default` yields zeroed counters and cleared flags.
#[derive(Clone, Debug, Default)]
#[allow(clippy::struct_excessive_bools)]
pub struct CudaEncodeStageAccelerator {
    // CUDA handles; both start as `None`.
    #[cfg(feature = "cuda-runtime")]
    context: Option<CudaContext>,
    #[cfg(feature = "cuda-runtime")]
    encode_resources: Option<Arc<CudaHtj2kEncodeResources>>,

    // Whether stage timings are collected.
    #[cfg_attr(not(feature = "cuda-runtime"), allow(dead_code))]
    collect_profile: bool,

    // Per-stage attempt counters.
    deinterleave_attempts: usize,
    forward_rct_attempts: usize,
    forward_ict_attempts: usize,
    forward_dwt53_attempts: usize,
    forward_dwt97_attempts: usize,
    htj2k_tile_attempts: usize,
    quantize_subband_attempts: usize,
    ht_subband_attempts: usize,
    tier1_code_block_attempts: usize,
    ht_code_block_attempts: usize,
    packetization_attempts: usize,

    // CPU-preference flags set through the `prefer_cpu_*` builders.
    prefer_cpu_forward_rct: bool,
    prefer_cpu_ht_subband: bool,
    prefer_cpu_quantize_subband: bool,
    prefer_cpu_packetization: bool,

    // Per-stage CUDA dispatch counters.
    deinterleave_dispatches: usize,
    forward_rct_dispatches: usize,
    forward_ict_dispatches: usize,
    forward_dwt53_dispatches: usize,
    forward_dwt97_dispatches: usize,
    #[cfg_attr(not(feature = "cuda-runtime"), allow(dead_code))]
    htj2k_tile_dispatches: usize,
    quantize_subband_dispatches: usize,
    #[cfg_attr(not(feature = "cuda-runtime"), allow(dead_code))]
    ht_subband_dispatches: usize,
    tier1_code_block_dispatches: usize,
    ht_code_block_dispatches: usize,
    packetization_dispatches: usize,

    // Cumulative stage timings, in microseconds.
    deinterleave_us: u128,
    mct_us: u128,
    dwt_us: u128,
    quantize_us: u128,
    ht_encode_us: u128,
    packetize_us: u128,
}
551
552impl CudaEncodeStageAccelerator {
553    /// Create an encode-stage accelerator with optional CUDA stage timing collection.
554    #[must_use]
555    pub fn with_profile_collection(collect_profile: bool) -> Self {
556        Self {
557            collect_profile,
558            ..Self::default()
559        }
560    }
561
562    /// Create the measured Auto route for host-output HTJ2K encode.
563    ///
564    /// CUDA keeps the DWT and HT code-block stages, while forward RCT and
565    /// Tier-2 packetization stay on the CPU for the current host-pixel path.
566    #[must_use]
567    pub fn for_auto_host_output() -> Self {
568        Self::default()
569            .prefer_cpu_forward_rct(true)
570            .prefer_cpu_packetization(true)
571    }
572
573    /// Prefer scalar CPU forward RCT while keeping later CUDA stages enabled.
574    #[must_use]
575    pub fn prefer_cpu_forward_rct(mut self, prefer_cpu_forward_rct: bool) -> Self {
576        self.prefer_cpu_forward_rct = prefer_cpu_forward_rct;
577        self
578    }
579
580    /// Prefer scalar CPU Tier-2 packetization while keeping CUDA Tier-1/HT block coding enabled.
581    ///
582    /// This is useful for batches of many small tiles where launching a CUDA
583    /// packetization kernel and copying several tiny descriptor buffers per tile
584    /// costs more than forming the packet body on the host.
585    #[must_use]
586    pub fn prefer_cpu_packetization(mut self, prefer_cpu_packetization: bool) -> Self {
587        self.prefer_cpu_packetization = prefer_cpu_packetization;
588        self
589    }
590
591    /// Prefer host sub-band quantization while keeping batched CUDA HT code-block encode enabled.
592    ///
593    /// This avoids launching one CUDA quantize/subband path for every prepared
594    /// subband in multi-resolution precomputed transcode outputs, where the
595    /// many tiny launches cost more than CPU quantization.
596    #[must_use]
597    pub fn prefer_cpu_ht_subband(mut self, prefer_cpu_ht_subband: bool) -> Self {
598        self.prefer_cpu_ht_subband = prefer_cpu_ht_subband;
599        self
600    }
601
602    /// Prefer host sub-band quantization while keeping CUDA HT code-block encode enabled.
603    ///
604    /// Multi-resolution transcode workloads can contain thousands of small
605    /// subbands; for those, CPU quantization plus one batched HT code-block
606    /// encode per tile is currently faster than launching CUDA quantization for
607    /// every subband.
608    #[must_use]
609    pub fn prefer_cpu_quantize_subband(mut self, prefer_cpu_quantize_subband: bool) -> Self {
610        self.prefer_cpu_quantize_subband = prefer_cpu_quantize_subband;
611        self
612    }
613
614    /// Return cumulative CUDA encode stage timings collected by this accelerator.
615    #[must_use]
616    pub const fn collected_stage_timings(&self) -> CudaEncodeStageTimings {
617        CudaEncodeStageTimings {
618            deinterleave_us: self.deinterleave_us,
619            mct_us: self.mct_us,
620            dwt_us: self.dwt_us,
621            quantize_us: self.quantize_us,
622            ht_encode_us: self.ht_encode_us,
623            packetize_us: self.packetize_us,
624        }
625    }
626
627    /// Clear cumulative CUDA encode stage timings without changing dispatch counters.
628    pub fn reset_collected_stage_timings(&mut self) {
629        self.deinterleave_us = 0;
630        self.mct_us = 0;
631        self.dwt_us = 0;
632        self.quantize_us = 0;
633        self.ht_encode_us = 0;
634        self.packetize_us = 0;
635    }
636
637    #[cfg(feature = "cuda-runtime")]
638    fn cuda_context(&mut self) -> core::result::Result<Option<CudaContext>, &'static str> {
639        if self.context.is_none() {
640            match CudaContext::system_default() {
641                Ok(context) => self.context = Some(context),
642                Err(_) if cuda_runtime_required() => return Err("CUDA encode stage unavailable"),
643                Err(_) => return Ok(None),
644            }
645        }
646        Ok(self.context.clone())
647    }
648
649    #[cfg(feature = "cuda-runtime")]
650    fn cuda_encode_resources(
651        &mut self,
652        context: &CudaContext,
653    ) -> core::result::Result<Arc<CudaHtj2kEncodeResources>, &'static str> {
654        if self.encode_resources.is_none() {
655            let resources = context
656                .upload_htj2k_encode_resources(cuda_htj2k_encode_tables())
657                .map_err(|_| "CUDA HTJ2K encode resource upload failed")?;
658            self.encode_resources = Some(Arc::new(resources));
659        }
660        self.encode_resources
661            .clone()
662            .ok_or("CUDA HTJ2K encode resources unavailable")
663    }
664
665    fn encode_profile_report(
666        &self,
667        encoded: &j2k::EncodedJ2k,
668        input_bytes: usize,
669        total_us: u128,
670    ) -> profile::CudaHtj2kEncodeProfileReport {
671        profile::CudaHtj2kEncodeProfileReport {
672            deinterleave_us: self.deinterleave_us,
673            mct_us: self.mct_us,
674            dwt_us: self.dwt_us,
675            quantize_us: self.quantize_us,
676            ht_encode_us: self.ht_encode_us,
677            packetize_us: self.packetize_us,
678            total_us,
679            input_bytes,
680            codestream_bytes: encoded.codestream.len(),
681            block_count: self.ht_code_block_attempts,
682            dispatch_count: self.dispatch_report().total(),
683            backend: encoded.backend,
684        }
685    }
686
687    /// Number of deinterleave attempts observed.
688    pub fn deinterleave_attempts(&self) -> usize {
689        self.deinterleave_attempts
690    }
691
692    /// Number of forward RCT attempts observed.
693    pub fn forward_rct_attempts(&self) -> usize {
694        self.forward_rct_attempts
695    }
696
697    /// Number of forward ICT attempts observed.
698    pub fn forward_ict_attempts(&self) -> usize {
699        self.forward_ict_attempts
700    }
701
702    /// Number of forward 5/3 DWT attempts observed.
703    pub fn forward_dwt53_attempts(&self) -> usize {
704        self.forward_dwt53_attempts
705    }
706
707    /// Number of forward 9/7 DWT attempts observed.
708    pub fn forward_dwt97_attempts(&self) -> usize {
709        self.forward_dwt97_attempts
710    }
711
712    /// Number of sub-band quantization attempts observed.
713    pub fn quantize_subband_attempts(&self) -> usize {
714        self.quantize_subband_attempts
715    }
716
717    /// Number of classic Tier-1 code-block attempts observed.
718    pub fn tier1_code_block_attempts(&self) -> usize {
719        self.tier1_code_block_attempts
720    }
721
722    /// Number of HT code-block attempts observed.
723    pub fn ht_code_block_attempts(&self) -> usize {
724        self.ht_code_block_attempts
725    }
726
727    /// Number of packetization attempts observed.
728    pub fn packetization_attempts(&self) -> usize {
729        self.packetization_attempts
730    }
731
732    /// Number of deinterleave CUDA dispatches.
733    pub fn deinterleave_dispatches(&self) -> usize {
734        self.deinterleave_dispatches
735    }
736
737    /// Number of forward RCT CUDA dispatches.
738    pub fn forward_rct_dispatches(&self) -> usize {
739        self.forward_rct_dispatches
740    }
741
742    /// Number of forward ICT CUDA dispatches.
743    pub fn forward_ict_dispatches(&self) -> usize {
744        self.forward_ict_dispatches
745    }
746
747    /// Number of forward 5/3 DWT CUDA dispatches.
748    pub fn forward_dwt53_dispatches(&self) -> usize {
749        self.forward_dwt53_dispatches
750    }
751
752    /// Number of forward 9/7 DWT CUDA dispatches.
753    pub fn forward_dwt97_dispatches(&self) -> usize {
754        self.forward_dwt97_dispatches
755    }
756
757    /// Number of sub-band quantization CUDA dispatches.
758    pub fn quantize_subband_dispatches(&self) -> usize {
759        self.quantize_subband_dispatches
760    }
761
762    /// Number of classic Tier-1 CUDA dispatches.
763    pub fn tier1_code_block_dispatches(&self) -> usize {
764        self.tier1_code_block_dispatches
765    }
766
767    /// Number of HT code-block CUDA dispatches.
768    pub fn ht_code_block_dispatches(&self) -> usize {
769        self.ht_code_block_dispatches
770    }
771
772    /// Number of packetization CUDA dispatches.
773    pub fn packetization_dispatches(&self) -> usize {
774        self.packetization_dispatches
775    }
776}
777
/// Report whether the `J2K_REQUIRE_CUDA_RUNTIME` environment variable is set.
///
/// Only presence is checked; the variable's value (even an empty one) is
/// ignored.
#[cfg(feature = "cuda-runtime")]
fn cuda_runtime_required() -> bool {
    let requirement = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME");
    requirement.is_some()
}
782
/// Run `work` as a named CUDA stage, optionally measuring it.
///
/// With `collect_profile` set, the work is handed to
/// `CudaContext::time_default_stream_named_us` and its result is returned
/// as-is. Otherwise the work runs through `CudaContext::with_nvtx_range` and
/// the reported time is `0`.
#[cfg(feature = "cuda-runtime")]
fn time_cuda_stage<T>(
    name: &'static str,
    context: &CudaContext,
    collect_profile: bool,
    work: impl FnOnce() -> core::result::Result<T, CudaError>,
) -> core::result::Result<(T, u128), CudaError> {
    if !collect_profile {
        let output = context.with_nvtx_range(name, work)?;
        return Ok((output, 0));
    }
    context.time_default_stream_named_us(name, work)
}
798
/// Cumulative CUDA encode-stage timings collected by `CudaEncodeStageAccelerator`.
///
/// Each bucket is a microsecond total for one stage; `Default` is all zeros.
#[allow(clippy::struct_field_names)]
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct CudaEncodeStageTimings {
    /// Time spent in the pixel deinterleave and level-shift CUDA stage.
    pub deinterleave_us: u128,
    /// Time spent in the forward MCT CUDA stage.
    pub mct_us: u128,
    /// Time spent in the forward DWT CUDA stage.
    pub dwt_us: u128,
    /// Time spent in the quantization CUDA stage.
    pub quantize_us: u128,
    /// Time spent in the HT code-block encode CUDA stage.
    pub ht_encode_us: u128,
    /// Time spent in the HTJ2K packetization CUDA stage.
    pub packetize_us: u128,
}

impl CudaEncodeStageTimings {
    /// Add `other` to `self` bucket by bucket, clamping each bucket at `u128::MAX`.
    #[must_use]
    pub const fn saturating_add(self, other: Self) -> Self {
        let Self {
            deinterleave_us,
            mct_us,
            dwt_us,
            quantize_us,
            ht_encode_us,
            packetize_us,
        } = other;
        Self {
            deinterleave_us: self.deinterleave_us.saturating_add(deinterleave_us),
            mct_us: self.mct_us.saturating_add(mct_us),
            dwt_us: self.dwt_us.saturating_add(dwt_us),
            quantize_us: self.quantize_us.saturating_add(quantize_us),
            ht_encode_us: self.ht_encode_us.saturating_add(ht_encode_us),
            packetize_us: self.packetize_us.saturating_add(packetize_us),
        }
    }

    /// Sum of all six buckets, clamped at `u128::MAX`.
    #[must_use]
    pub const fn total_us(self) -> u128 {
        let mut total = self.deinterleave_us;
        total = total.saturating_add(self.mct_us);
        total = total.saturating_add(self.dwt_us);
        total = total.saturating_add(self.quantize_us);
        total = total.saturating_add(self.ht_encode_us);
        total = total.saturating_add(self.packetize_us);
        total
    }
}
842
/// Flattened, index-linked description of an HTJ2K packetization job, as produced by
/// `flatten_cuda_htj2k_packetization_job`.
///
/// Cross-references between the tables are `u32` start/count pairs rather than pointers.
#[derive(Debug, Clone, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlan {
    /// Concatenated code-block bytes; blocks address it via `data_offset`/`data_len`.
    payload: Vec<u8>,
    /// One entry per flattened packet, in emission order.
    packets: Vec<CudaHtj2kPacketizationPlanPacket>,
    /// One entry per subband of every flattened packet.
    subbands: Vec<CudaHtj2kPacketizationPlanSubband>,
    /// One entry per code block of every flattened packet.
    blocks: Vec<CudaHtj2kPacketizationPlanBlock>,
    /// One entry per flattened subband, locating its tag-tree snapshots in `tag_nodes`.
    tag_states: Vec<CudaHtj2kPacketizationPlanSubbandTagState>,
    /// Tag-tree node snapshots (`current`/`known`) referenced by `tag_states`.
    tag_nodes: Vec<CudaHtj2kPacketizationPlanTagNodeState>,
}
852
/// Mutable borrows of the six plan tables while they are being filled.
///
/// Lets the per-packet flatten helpers append to all tables through a single argument.
struct CudaHtj2kPacketizationPlanSink<'a> {
    /// Destination for concatenated code-block bytes.
    payload: &'a mut Vec<u8>,
    /// Destination for per-packet records.
    packets: &'a mut Vec<CudaHtj2kPacketizationPlanPacket>,
    /// Destination for per-subband records.
    subbands: &'a mut Vec<CudaHtj2kPacketizationPlanSubband>,
    /// Destination for per-code-block records.
    blocks: &'a mut Vec<CudaHtj2kPacketizationPlanBlock>,
    /// Destination for per-subband tag-tree snapshot locators.
    tag_states: &'a mut Vec<CudaHtj2kPacketizationPlanSubbandTagState>,
    /// Destination for tag-tree node snapshots.
    tag_nodes: &'a mut Vec<CudaHtj2kPacketizationPlanTagNodeState>,
}
861
/// One flattened packet: index ranges into the plan tables plus its output budget.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanPacket {
    /// Index of the packet's first entry in the plan's `blocks` table.
    block_start: u32,
    /// Number of code blocks belonging to the packet.
    block_count: u32,
    /// Index of the packet's first entry in the plan's `subbands` table.
    subband_start: u32,
    /// Number of subbands belonging to the packet.
    subband_count: u32,
    /// Byte budget computed as body length + 256 + 64 per code block.
    output_capacity: u32,
    /// Quality layer the packet is flattened for (`0` on the stateless path).
    layer: u32,
}
871
/// One flattened subband: its code-block range and code-block grid dimensions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanSubband {
    /// Index of the subband's first entry in the plan's `blocks` table.
    block_start: u32,
    /// Number of code blocks in the subband (equals `num_cbs_x * num_cbs_y`).
    block_count: u32,
    /// Code-block grid width.
    num_cbs_x: u32,
    /// Code-block grid height.
    num_cbs_y: u32,
}
879
/// One flattened code-block contribution, with every field widened to `u32`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanBlock {
    /// Offset of the block's bytes in the plan payload (payload length when it was added).
    data_offset: u32,
    /// Number of payload bytes; `0` when the block has no coding passes.
    data_len: u32,
    /// Cleanup segment length as returned by `cuda_ht_segment_lengths`.
    cleanup_length: u32,
    /// Refinement segment length as returned by `cuda_ht_segment_lengths`.
    refinement_length: u32,
    /// Coding passes contributed in this packet.
    num_coding_passes: u32,
    /// Zero bit-plane count; taken from the tracked first-inclusion value when state is used.
    num_zero_bitplanes: u32,
    /// L-block value in effect before this packet is written.
    l_block: u32,
    /// `1` if the block was already included by an earlier packet, else `0`.
    previously_included: u32,
    /// Layer of first inclusion, or `CUDA_HTJ2K_PACKET_TAG_INF` if never included.
    inclusion_layer: u32,
}
892
/// Locates one subband's two tag-tree snapshots inside the plan's `tag_nodes` table.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanSubbandTagState {
    /// Index of the first inclusion-tree node snapshot.
    inclusion_node_start: u32,
    /// Index of the first zero-bit-plane-tree node snapshot.
    zero_bitplane_node_start: u32,
    /// Number of nodes in each of the two snapshots.
    node_count: u32,
}
899
/// Snapshot of one tag-tree node's encoder progress, copied from
/// `CudaHtj2kPacketizationTagTreeState::{current, known}`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CudaHtj2kPacketizationPlanTagNodeState {
    /// Value the node has been advanced to so far.
    current: u32,
    /// `1` once the node's final value has been signalled, else `0`.
    known: u32,
}
905
/// Host-side tag tree stored as flat per-node arrays, level 0 (leaves) first.
///
/// Node `(x, y)` of level `l` lives at index `offsets[l] + y * widths[l] + x`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct CudaHtj2kPacketizationTagTreeState {
    /// Target value of every node; parents hold the minimum of their children after `propagate`.
    values: Vec<u32>,
    /// Value each node has been advanced to by `encode_state_only`.
    current: Vec<u32>,
    /// Per-node flag (`0`/`1`) set once the node's value was below the encode threshold.
    known: Vec<u32>,
    /// Width of each level.
    widths: Vec<u32>,
    /// Height of each level.
    heights: Vec<u32>,
    /// Index of each level's first node in the flat arrays.
    offsets: Vec<usize>,
}
915
/// Per-code-block state carried across the packets that share one state index.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CudaHtj2kPacketizationBlockState {
    /// Whether an earlier packet already included this block.
    previously_included: bool,
    /// Current L-block value; grown by `updated_ht_l_block` after each contribution.
    l_block: u32,
    /// Lowest layer seen with coding passes, or `CUDA_HTJ2K_PACKET_TAG_INF` if none yet.
    inclusion_layer: u32,
    /// Zero bit-plane count recorded together with `inclusion_layer`.
    first_inclusion_zero_bitplanes: u32,
}
923
/// Per-subband state carried across packets: grid layout, both tag trees, and blocks.
#[derive(Debug, Clone, PartialEq, Eq)]
struct CudaHtj2kPacketizationSubbandState {
    /// Code-block grid width.
    num_cbs_x: u32,
    /// Code-block grid height.
    num_cbs_y: u32,
    /// Tag tree over first-inclusion layers.
    inclusion_tree: CudaHtj2kPacketizationTagTreeState,
    /// Tag tree over first-inclusion zero bit-plane counts.
    zero_bitplane_tree: CudaHtj2kPacketizationTagTreeState,
    /// One state entry per code block, in the subband's code-block order.
    blocks: Vec<CudaHtj2kPacketizationBlockState>,
}
932
/// Packetization state for one descriptor `state_index`, shared by all packets using it.
#[derive(Debug, Clone, PartialEq, Eq)]
struct CudaHtj2kPacketizationState {
    /// One entry per subband of the resolution the state was seeded from.
    subbands: Vec<CudaHtj2kPacketizationSubbandState>,
}
937
938fn flatten_cuda_htj2k_packetization_job(
939    job: J2kPacketizationEncodeJob<'_>,
940) -> core::result::Result<CudaHtj2kPacketizationPlan, &'static str> {
941    if job.resolution_count as usize != job.resolutions.len() {
942        return Err("CUDA HTJ2K packetization resolution count mismatch");
943    }
944
945    let mut payload = Vec::new();
946    let mut packets = Vec::new();
947    let mut subbands = Vec::new();
948    let mut blocks = Vec::new();
949    let mut tag_states = Vec::new();
950    let mut tag_nodes = Vec::new();
951
952    {
953        let mut sink = CudaHtj2kPacketizationPlanSink {
954            payload: &mut payload,
955            packets: &mut packets,
956            subbands: &mut subbands,
957            blocks: &mut blocks,
958            tag_states: &mut tag_states,
959            tag_nodes: &mut tag_nodes,
960        };
961        if job.packet_descriptors.is_empty() {
962            if job.num_layers != 1 {
963                return Err(
964                    "CUDA HTJ2K packetization requires explicit descriptors for multiple layers",
965                );
966            }
967            for packet_index in 0..job.resolutions.len() {
968                flatten_cuda_htj2k_packet(
969                    job.resolutions
970                        .get(packet_index)
971                        .ok_or("CUDA HTJ2K packet descriptor index out of range")?,
972                    &mut sink,
973                )?;
974            }
975        } else {
976            let state_count = job
977                .packet_descriptors
978                .iter()
979                .map(|descriptor| descriptor.state_index as usize)
980                .max()
981                .map_or(0usize, |max_state| max_state + 1);
982            let mut states: Vec<Option<CudaHtj2kPacketizationState>> =
983                core::iter::repeat_with(|| None).take(state_count).collect();
984            for descriptor in job.packet_descriptors {
985                if descriptor.layer >= job.num_layers {
986                    return Err("CUDA HTJ2K packetization descriptor layer exceeds layer count");
987                }
988                let resolution = job
989                    .resolutions
990                    .get(descriptor.packet_index as usize)
991                    .ok_or("CUDA HTJ2K packet descriptor index out of range")?;
992                let state = states
993                    .get_mut(descriptor.state_index as usize)
994                    .ok_or("CUDA HTJ2K packet descriptor state index out of range")?;
995                if let Some(existing) = state {
996                    validate_cuda_htj2k_packetization_state_layout(existing, resolution)?;
997                } else {
998                    *state = Some(seed_cuda_htj2k_packetization_state(resolution)?);
999                }
1000                let state = state
1001                    .as_mut()
1002                    .ok_or("CUDA HTJ2K packetization state initialization failed")?;
1003                record_cuda_htj2k_packetization_first_inclusion_layers(
1004                    state,
1005                    resolution,
1006                    descriptor.layer,
1007                )?;
1008            }
1009            for state in states.iter_mut().flatten() {
1010                finalize_cuda_htj2k_packetization_tag_trees(state);
1011            }
1012            for descriptor in job.packet_descriptors {
1013                if descriptor.layer >= job.num_layers {
1014                    return Err("CUDA HTJ2K packetization descriptor layer exceeds layer count");
1015                }
1016                let resolution = job
1017                    .resolutions
1018                    .get(descriptor.packet_index as usize)
1019                    .ok_or("CUDA HTJ2K packet descriptor index out of range")?;
1020                let state = states
1021                    .get_mut(descriptor.state_index as usize)
1022                    .ok_or("CUDA HTJ2K packet descriptor state index out of range")?;
1023                if let Some(existing) = state {
1024                    validate_cuda_htj2k_packetization_state_layout(existing, resolution)?;
1025                } else {
1026                    *state = Some(seed_cuda_htj2k_packetization_state(resolution)?);
1027                }
1028                let state = state
1029                    .as_mut()
1030                    .ok_or("CUDA HTJ2K packetization state initialization failed")?;
1031                flatten_cuda_htj2k_packet_with_state(
1032                    resolution,
1033                    descriptor.layer,
1034                    state,
1035                    &mut sink,
1036                )?;
1037            }
1038        }
1039    }
1040
1041    if job.code_block_count as usize != blocks.len() {
1042        return Err("CUDA HTJ2K packetization code-block count mismatch");
1043    }
1044
1045    Ok(CudaHtj2kPacketizationPlan {
1046        payload,
1047        packets,
1048        subbands,
1049        blocks,
1050        tag_states,
1051        tag_nodes,
1052    })
1053}
1054
1055fn seed_cuda_htj2k_packetization_state(
1056    resolution: &J2kPacketizationResolution<'_>,
1057) -> core::result::Result<CudaHtj2kPacketizationState, &'static str> {
1058    let mut subbands = Vec::with_capacity(resolution.subbands.len());
1059    for subband in &resolution.subbands {
1060        let block_count = u32::try_from(subband.code_blocks.len())
1061            .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
1062        if subband.num_cbs_x == 0
1063            || subband.num_cbs_y == 0
1064            || subband.num_cbs_x.saturating_mul(subband.num_cbs_y) != block_count
1065        {
1066            return Err("CUDA HTJ2K packetization subband code-block layout mismatch");
1067        }
1068        let mut inclusion_tree =
1069            CudaHtj2kPacketizationTagTreeState::new(subband.num_cbs_x, subband.num_cbs_y)?;
1070        let zero_bitplane_tree =
1071            CudaHtj2kPacketizationTagTreeState::new(subband.num_cbs_x, subband.num_cbs_y)?;
1072        for idx in 0..subband.code_blocks.len() {
1073            let (x, y) = cuda_htj2k_packetization_block_xy(idx, subband.num_cbs_x)?;
1074            inclusion_tree.set_leaf_value(x, y, CUDA_HTJ2K_PACKET_TAG_INF);
1075        }
1076        subbands.push(CudaHtj2kPacketizationSubbandState {
1077            num_cbs_x: subband.num_cbs_x,
1078            num_cbs_y: subband.num_cbs_y,
1079            inclusion_tree,
1080            zero_bitplane_tree,
1081            blocks: subband
1082                .code_blocks
1083                .iter()
1084                .map(|block| CudaHtj2kPacketizationBlockState {
1085                    previously_included: block.previously_included,
1086                    l_block: block.l_block,
1087                    inclusion_layer: CUDA_HTJ2K_PACKET_TAG_INF,
1088                    first_inclusion_zero_bitplanes: 0,
1089                })
1090                .collect(),
1091        });
1092    }
1093    Ok(CudaHtj2kPacketizationState { subbands })
1094}
1095
1096fn validate_cuda_htj2k_packetization_state_layout(
1097    state: &CudaHtj2kPacketizationState,
1098    resolution: &J2kPacketizationResolution<'_>,
1099) -> core::result::Result<(), &'static str> {
1100    if state.subbands.len() != resolution.subbands.len() {
1101        return Err("CUDA HTJ2K packetization state layout mismatch");
1102    }
1103    for (state_subband, packet_subband) in state.subbands.iter().zip(&resolution.subbands) {
1104        if state_subband.num_cbs_x != packet_subband.num_cbs_x
1105            || state_subband.num_cbs_y != packet_subband.num_cbs_y
1106            || state_subband.blocks.len() != packet_subband.code_blocks.len()
1107        {
1108            return Err("CUDA HTJ2K packetization state layout mismatch");
1109        }
1110    }
1111    Ok(())
1112}
1113
/// Sentinel inclusion layer / inclusion-tree leaf value for a code block that has not
/// been recorded as included in any layer.
const CUDA_HTJ2K_PACKET_TAG_INF: u32 = 0x7FFF_FFFF;
/// Maximum total node count (all levels) of one tag tree; larger trees are rejected by
/// `CudaHtj2kPacketizationTagTreeState::new`.
// NOTE(review): presumably mirrors a fixed-size buffer in the CUDA kernel — confirm.
const CUDA_HTJ2K_PACKET_MAX_TAG_NODES: usize = 2048;
/// Maximum number of levels of one tag tree; deeper trees are rejected by
/// `CudaHtj2kPacketizationTagTreeState::new`.
// NOTE(review): presumably mirrors a fixed-size path array in the CUDA kernel — confirm.
const CUDA_HTJ2K_PACKET_MAX_TAG_LEVELS: usize = 16;
1117
/// Converts a row-major code-block index into `(x, y)` grid coordinates.
///
/// `x` is `index % num_cbs_x` and `y` is `index / num_cbs_x`.
///
/// # Errors
///
/// * `"CUDA HTJ2K packetization block count exceeds u32"` when `index` does not fit in
///   `u32`.
/// * `"CUDA HTJ2K packetization subband code-block layout mismatch"` when `num_cbs_x`
///   is zero. A zero-width grid has no valid coordinates, and reporting it as an error
///   keeps this fallible helper from panicking on a division by zero.
fn cuda_htj2k_packetization_block_xy(
    index: usize,
    num_cbs_x: u32,
) -> core::result::Result<(u32, u32), &'static str> {
    let index =
        u32::try_from(index).map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
    // `checked_rem`/`checked_div` yield `None` only for a zero divisor here.
    match (index.checked_rem(num_cbs_x), index.checked_div(num_cbs_x)) {
        (Some(x), Some(y)) => Ok((x, y)),
        _ => Err("CUDA HTJ2K packetization subband code-block layout mismatch"),
    }
}
1126
1127impl CudaHtj2kPacketizationTagTreeState {
1128    fn new(width: u32, height: u32) -> core::result::Result<Self, &'static str> {
1129        if width == 0 || height == 0 {
1130            return Err("CUDA HTJ2K packetization subband code-block layout mismatch");
1131        }
1132
1133        let mut widths = Vec::new();
1134        let mut heights = Vec::new();
1135        let mut offsets = Vec::new();
1136        let mut total_nodes = 0usize;
1137        let mut w = width;
1138        let mut h = height;
1139        loop {
1140            if widths.len() >= CUDA_HTJ2K_PACKET_MAX_TAG_LEVELS {
1141                return Err("CUDA HTJ2K packetization tag-tree exceeds kernel bounds");
1142            }
1143            let nodes = (w as usize)
1144                .checked_mul(h as usize)
1145                .ok_or("CUDA HTJ2K packetization tag-tree exceeds kernel bounds")?;
1146            let next_total = total_nodes
1147                .checked_add(nodes)
1148                .ok_or("CUDA HTJ2K packetization tag-tree exceeds kernel bounds")?;
1149            if next_total > CUDA_HTJ2K_PACKET_MAX_TAG_NODES {
1150                return Err("CUDA HTJ2K packetization tag-tree exceeds kernel bounds");
1151            }
1152            offsets.push(total_nodes);
1153            widths.push(w);
1154            heights.push(h);
1155            total_nodes = next_total;
1156            if w <= 1 && h <= 1 {
1157                break;
1158            }
1159            w = w.div_ceil(2);
1160            h = h.div_ceil(2);
1161        }
1162
1163        Ok(Self {
1164            values: vec![0; total_nodes],
1165            current: vec![0; total_nodes],
1166            known: vec![0; total_nodes],
1167            widths,
1168            heights,
1169            offsets,
1170        })
1171    }
1172
1173    fn set_leaf_value(&mut self, x: u32, y: u32, value: u32) {
1174        let idx = self.offsets[0] + (y * self.widths[0] + x) as usize;
1175        self.values[idx] = value;
1176    }
1177
1178    #[allow(clippy::similar_names)]
1179    fn propagate(&mut self) {
1180        for level in 1..self.widths.len() {
1181            let prev_w = self.widths[level - 1];
1182            let prev_h = self.heights[level - 1];
1183            let curr_w = self.widths[level];
1184            let curr_h = self.heights[level];
1185            for cy in 0..curr_h {
1186                for cx in 0..curr_w {
1187                    let child_x_start = cx * 2;
1188                    let child_y_start = cy * 2;
1189                    let child_x_end = ((cx + 1) * 2).min(prev_w);
1190                    let child_y_end = ((cy + 1) * 2).min(prev_h);
1191                    let mut min_value = u32::MAX;
1192                    for child_y in child_y_start..child_y_end {
1193                        for child_x in child_x_start..child_x_end {
1194                            let child_idx =
1195                                self.offsets[level - 1] + (child_y * prev_w + child_x) as usize;
1196                            min_value = min_value.min(self.values[child_idx]);
1197                        }
1198                    }
1199                    let parent_idx = self.offsets[level] + (cy * curr_w + cx) as usize;
1200                    self.values[parent_idx] = min_value;
1201                }
1202            }
1203        }
1204    }
1205
1206    fn encode_state_only(&mut self, x: u32, y: u32, max_value: u32) {
1207        let mut path = Vec::with_capacity(self.widths.len());
1208        let mut cx = x;
1209        let mut cy = y;
1210        for level in 0..self.widths.len() {
1211            path.push(self.offsets[level] + (cy * self.widths[level] + cx) as usize);
1212            cx /= 2;
1213            cy /= 2;
1214        }
1215
1216        for node_idx in path.into_iter().rev() {
1217            if self.known[node_idx] == 0 {
1218                let target = self.values[node_idx].min(max_value);
1219                if self.values[node_idx] < max_value {
1220                    self.known[node_idx] = 1;
1221                }
1222                self.current[node_idx] = target;
1223            }
1224        }
1225    }
1226
1227    fn append_snapshot(
1228        &self,
1229        out: &mut Vec<CudaHtj2kPacketizationPlanTagNodeState>,
1230    ) -> core::result::Result<u32, &'static str> {
1231        let start = u32::try_from(out.len())
1232            .map_err(|_| "CUDA HTJ2K packetization tag-state exceeds u32")?;
1233        out.extend(
1234            self.current
1235                .iter()
1236                .copied()
1237                .zip(self.known.iter().copied())
1238                .map(|(current, known)| CudaHtj2kPacketizationPlanTagNodeState { current, known }),
1239        );
1240        Ok(start)
1241    }
1242
1243    fn node_count(&self) -> u32 {
1244        u32::try_from(self.current.len()).expect("tag tree node count was bounded at construction")
1245    }
1246}
1247
1248fn record_cuda_htj2k_packetization_first_inclusion_layers(
1249    state: &mut CudaHtj2kPacketizationState,
1250    resolution: &J2kPacketizationResolution<'_>,
1251    layer: u8,
1252) -> core::result::Result<(), &'static str> {
1253    validate_cuda_htj2k_packetization_state_layout(state, resolution)?;
1254    for (state_subband, packet_subband) in state.subbands.iter_mut().zip(&resolution.subbands) {
1255        for (idx, (state_block, packet_block)) in state_subband
1256            .blocks
1257            .iter_mut()
1258            .zip(&packet_subband.code_blocks)
1259            .enumerate()
1260        {
1261            if packet_block.num_coding_passes == 0 {
1262                continue;
1263            }
1264            let layer = u32::from(layer);
1265            if layer < state_block.inclusion_layer {
1266                state_block.inclusion_layer = layer;
1267                state_block.first_inclusion_zero_bitplanes =
1268                    u32::from(packet_block.num_zero_bitplanes);
1269                let (x, y) = cuda_htj2k_packetization_block_xy(idx, state_subband.num_cbs_x)?;
1270                state_subband.inclusion_tree.set_leaf_value(x, y, layer);
1271                state_subband.zero_bitplane_tree.set_leaf_value(
1272                    x,
1273                    y,
1274                    state_block.first_inclusion_zero_bitplanes,
1275                );
1276            }
1277        }
1278    }
1279    Ok(())
1280}
1281
1282fn finalize_cuda_htj2k_packetization_tag_trees(state: &mut CudaHtj2kPacketizationState) {
1283    for subband in &mut state.subbands {
1284        subband.inclusion_tree.propagate();
1285        subband.zero_bitplane_tree.propagate();
1286    }
1287}
1288
1289fn append_cuda_htj2k_packetization_tag_state(
1290    state_subband: Option<&CudaHtj2kPacketizationSubbandState>,
1291    num_cbs_x: u32,
1292    num_cbs_y: u32,
1293    tag_states: &mut Vec<CudaHtj2kPacketizationPlanSubbandTagState>,
1294    tag_nodes: &mut Vec<CudaHtj2kPacketizationPlanTagNodeState>,
1295) -> core::result::Result<(), &'static str> {
1296    let (inclusion_node_start, zero_bitplane_node_start, node_count) =
1297        if let Some(state_subband) = state_subband {
1298            let inclusion_start = state_subband.inclusion_tree.append_snapshot(tag_nodes)?;
1299            let zero_bitplane_start = state_subband
1300                .zero_bitplane_tree
1301                .append_snapshot(tag_nodes)?;
1302            (
1303                inclusion_start,
1304                zero_bitplane_start,
1305                state_subband.inclusion_tree.node_count(),
1306            )
1307        } else {
1308            let zero_tree = CudaHtj2kPacketizationTagTreeState::new(num_cbs_x, num_cbs_y)?;
1309            let inclusion_start = zero_tree.append_snapshot(tag_nodes)?;
1310            let zero_bitplane_start = zero_tree.append_snapshot(tag_nodes)?;
1311            (inclusion_start, zero_bitplane_start, zero_tree.node_count())
1312        };
1313    tag_states.push(CudaHtj2kPacketizationPlanSubbandTagState {
1314        inclusion_node_start,
1315        zero_bitplane_node_start,
1316        node_count,
1317    });
1318    Ok(())
1319}
1320
1321fn update_cuda_htj2k_packetization_state_after_block(
1322    state: &mut CudaHtj2kPacketizationState,
1323    subband_index: usize,
1324    block_index: usize,
1325    layer: u8,
1326    code_block: &J2kPacketizationCodeBlock<'_>,
1327    l_block: u32,
1328) -> core::result::Result<(), &'static str> {
1329    let state_subband = state
1330        .subbands
1331        .get_mut(subband_index)
1332        .ok_or("CUDA HTJ2K packetization state layout mismatch")?;
1333    let (x, y) = cuda_htj2k_packetization_block_xy(block_index, state_subband.num_cbs_x)?;
1334    let previously_included = state_subband
1335        .blocks
1336        .get(block_index)
1337        .ok_or("CUDA HTJ2K packetization state layout mismatch")?
1338        .previously_included;
1339
1340    if !previously_included {
1341        state_subband
1342            .inclusion_tree
1343            .encode_state_only(x, y, u32::from(layer) + 1);
1344        if code_block.num_coding_passes == 0 {
1345            return Ok(());
1346        }
1347        state_subband.zero_bitplane_tree.encode_state_only(
1348            x,
1349            y,
1350            u32::from(code_block.num_zero_bitplanes) + 1,
1351        );
1352    }
1353
1354    if code_block.num_coding_passes > 0 {
1355        let state_block = state_subband
1356            .blocks
1357            .get_mut(block_index)
1358            .ok_or("CUDA HTJ2K packetization state layout mismatch")?;
1359        let (cleanup_length, refinement_length) = cuda_ht_segment_lengths(code_block)?;
1360        state_block.l_block = updated_ht_l_block(
1361            l_block,
1362            code_block.num_coding_passes,
1363            cleanup_length,
1364            refinement_length,
1365        )?;
1366        state_block.previously_included = true;
1367    }
1368    Ok(())
1369}
1370
/// Flattens one resolution as a stateless packet.
///
/// Delegates to `flatten_cuda_htj2k_packet_inner` with layer `0` and no tracked state,
/// so per-block header fields are taken from the code blocks themselves.
fn flatten_cuda_htj2k_packet(
    resolution: &J2kPacketizationResolution<'_>,
    sink: &mut CudaHtj2kPacketizationPlanSink<'_>,
) -> core::result::Result<(), &'static str> {
    flatten_cuda_htj2k_packet_inner(resolution, 0, None, sink)
}
1377
/// Flattens one resolution as the packet for `layer`, using and advancing `state`.
///
/// Delegates to `flatten_cuda_htj2k_packet_inner` with `Some(state)`.
fn flatten_cuda_htj2k_packet_with_state(
    resolution: &J2kPacketizationResolution<'_>,
    layer: u8,
    state: &mut CudaHtj2kPacketizationState,
    sink: &mut CudaHtj2kPacketizationPlanSink<'_>,
) -> core::result::Result<(), &'static str> {
    flatten_cuda_htj2k_packet_inner(resolution, layer, Some(state), sink)
}
1386
/// Appends one packet (all subbands and code blocks of `resolution`) to the plan tables.
///
/// For each subband this validates the code-block grid, snapshots the tag trees, and
/// pushes one plan block per code block (copying its bytes into the payload when it has
/// coding passes). When `state` is supplied, per-block header fields come from the
/// tracked state, and the state is advanced after each block, provided the packet
/// carries any data at all. Finally one packet record is pushed.
///
/// # Errors
///
/// Returns a static message when a count or offset exceeds `u32`, the grid layout is
/// inconsistent, a block is not HT-coded, a block has more than 164 passes, the
/// descriptor order contradicts the recorded first-inclusion layer, a stateless block is
/// marked as previously included, an empty contribution carries payload, or
/// `num_zero_bitplanes`/`l_block` exceed 31.
fn flatten_cuda_htj2k_packet_inner(
    resolution: &J2kPacketizationResolution<'_>,
    layer: u8,
    mut state: Option<&mut CudaHtj2kPacketizationState>,
    sink: &mut CudaHtj2kPacketizationPlanSink<'_>,
) -> core::result::Result<(), &'static str> {
    // Remember where this packet's entries begin in the shared tables.
    let block_start = u32::try_from(sink.blocks.len())
        .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
    let subband_start = u32::try_from(sink.subbands.len())
        .map_err(|_| "CUDA HTJ2K packetization subband count exceeds u32")?;
    let mut body_len = 0usize;
    let mut block_count = 0usize;
    // True when at least one block of the packet contributes coding passes.
    let packet_has_data = resolution.subbands.iter().any(|subband| {
        subband
            .code_blocks
            .iter()
            .any(|block| block.num_coding_passes > 0)
    });

    for (subband_index, subband) in resolution.subbands.iter().enumerate() {
        let subband_code_blocks = u32::try_from(subband.code_blocks.len())
            .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
        if subband.num_cbs_x == 0
            || subband.num_cbs_y == 0
            || subband.num_cbs_x.saturating_mul(subband.num_cbs_y) != subband_code_blocks
        {
            return Err("CUDA HTJ2K packetization subband code-block layout mismatch");
        }

        let subband_block_start = u32::try_from(sink.blocks.len())
            .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?;
        let state_subband = state
            .as_deref()
            .and_then(|state| state.subbands.get(subband_index));
        // Snapshot the tag trees as they stand BEFORE this packet's blocks advance them.
        append_cuda_htj2k_packetization_tag_state(
            state_subband,
            subband.num_cbs_x,
            subband.num_cbs_y,
            sink.tag_states,
            sink.tag_nodes,
        )?;
        for (block_index, code_block) in subband.code_blocks.iter().enumerate() {
            if code_block.block_coding_mode != J2kPacketizationBlockCodingMode::HighThroughput {
                return Err("CUDA packetization only supports HTJ2K block-coded packets");
            }
            if code_block.num_coding_passes > 164 {
                return Err("CUDA HTJ2K packetization coding pass count exceeds JPEG 2000 bounds");
            }
            // Header inputs: tracked state when available, otherwise the block's own fields.
            let (previously_included, l_block, inclusion_layer, zero_bitplanes) =
                if let Some(state) = state.as_deref() {
                    let state_block = state
                        .subbands
                        .get(subband_index)
                        .and_then(|state_subband| state_subband.blocks.get(block_index))
                        .ok_or("CUDA HTJ2K packetization state layout mismatch")?;
                    (
                        state_block.previously_included,
                        state_block.l_block,
                        state_block.inclusion_layer,
                        state_block.first_inclusion_zero_bitplanes,
                    )
                } else {
                    (
                        code_block.previously_included,
                        code_block.l_block,
                        // Stateless packets are layer 0: included there or never.
                        if code_block.num_coding_passes > 0 {
                            0
                        } else {
                            CUDA_HTJ2K_PACKET_TAG_INF
                        },
                        u32::from(code_block.num_zero_bitplanes),
                    )
                };
            // A first contribution must arrive in exactly the layer recorded for it.
            if code_block.num_coding_passes > 0
                && !previously_included
                && inclusion_layer != u32::from(layer)
            {
                return Err(
                    "CUDA HTJ2K packetization descriptor order does not match first inclusion layer",
                );
            }
            if state.is_none() && previously_included {
                return Err("CUDA HTJ2K packetization requires first-inclusion packets");
            }
            if code_block.num_coding_passes == 0 && !code_block.data.is_empty() {
                return Err("CUDA HTJ2K packetization empty contributions must not carry payload");
            }
            if zero_bitplanes > 31 || l_block > 31 {
                return Err("CUDA HTJ2K packetization header fields exceed kernel bounds");
            }

            // The block's bytes (if any) start at the current end of the payload.
            let data_offset = u32::try_from(sink.payload.len())
                .map_err(|_| "CUDA HTJ2K packetization payload exceeds u32")?;
            let data_len = if code_block.num_coding_passes == 0 {
                0
            } else {
                u32::try_from(code_block.data.len())
                    .map_err(|_| "CUDA HTJ2K packetization code-block payload exceeds u32")?
            };
            let (cleanup_length, refinement_length) = cuda_ht_segment_lengths(code_block)?;
            if code_block.num_coding_passes > 0 {
                sink.payload.extend_from_slice(code_block.data);
                body_len = body_len
                    .checked_add(code_block.data.len())
                    .ok_or("CUDA HTJ2K packetization body length overflow")?;
            }
            sink.blocks.push(CudaHtj2kPacketizationPlanBlock {
                data_offset,
                data_len,
                cleanup_length,
                refinement_length,
                num_coding_passes: u32::from(code_block.num_coding_passes),
                num_zero_bitplanes: zero_bitplanes,
                l_block,
                previously_included: u32::from(previously_included),
                inclusion_layer,
            });
            // State only advances for packets that carry data.
            // NOTE(review): presumably because an all-empty packet writes no tag-tree
            // bits — confirm against the kernel.
            if packet_has_data {
                if let Some(state) = state.as_deref_mut() {
                    update_cuda_htj2k_packetization_state_after_block(
                        state,
                        subband_index,
                        block_index,
                        layer,
                        code_block,
                        l_block,
                    )?;
                }
            }
            block_count = block_count
                .checked_add(1)
                .ok_or("CUDA HTJ2K packetization block count overflow")?;
        }
        sink.subbands.push(CudaHtj2kPacketizationPlanSubband {
            block_start: subband_block_start,
            block_count: subband_code_blocks,
            num_cbs_x: subband.num_cbs_x,
            num_cbs_y: subband.num_cbs_y,
        });
    }

    // Output budget: body bytes plus a fixed 256 bytes and 64 bytes per code block.
    let header_capacity = 256usize
        .checked_add(
            block_count
                .checked_mul(64)
                .ok_or("CUDA HTJ2K packetization capacity overflow")?,
        )
        .ok_or("CUDA HTJ2K packetization capacity overflow")?;
    let output_capacity = body_len
        .checked_add(header_capacity)
        .ok_or("CUDA HTJ2K packetization capacity overflow")?;
    sink.packets.push(CudaHtj2kPacketizationPlanPacket {
        block_start,
        block_count: u32::try_from(block_count)
            .map_err(|_| "CUDA HTJ2K packetization block count exceeds u32")?,
        subband_start,
        subband_count: u32::try_from(resolution.subbands.len())
            .map_err(|_| "CUDA HTJ2K packetization subband count exceeds u32")?,
        output_capacity: u32::try_from(output_capacity)
            .map_err(|_| "CUDA HTJ2K packetization packet capacity exceeds u32")?,
        layer: u32::from(layer),
    });
    Ok(())
}
1551
1552fn updated_ht_l_block(
1553    mut l_block: u32,
1554    num_coding_passes: u8,
1555    cleanup_length: u32,
1556    refinement_length: u32,
1557) -> core::result::Result<u32, &'static str> {
1558    let mut num_bits = packet_math::bits_for_ht_cleanup_length(l_block, num_coding_passes);
1559    let refinement_extra_bits = u32::from(num_coding_passes > 2);
1560    while !packet_math::value_fits_in_bits(cleanup_length, num_bits)
1561        || (num_coding_passes > 1
1562            && !packet_math::value_fits_in_bits(refinement_length, l_block + refinement_extra_bits))
1563    {
1564        l_block = l_block
1565            .checked_add(1)
1566            .ok_or("CUDA HTJ2K packetization L-block overflow")?;
1567        num_bits = num_bits
1568            .checked_add(1)
1569            .ok_or("CUDA HTJ2K packetization L-block overflow")?;
1570    }
1571    Ok(l_block)
1572}
1573
/// Returns the `(cleanup_length, refinement_length)` pair for an HT code block.
///
/// Thin adapter that forwards the block's pass count, payload length, and its
/// `ht_cleanup_length` / `ht_refinement_length` fields to
/// `packet_math::ht_segment_lengths`; any error from that helper is returned unchanged.
fn cuda_ht_segment_lengths(
    code_block: &J2kPacketizationCodeBlock<'_>,
) -> core::result::Result<(u32, u32), &'static str> {
    packet_math::ht_segment_lengths(
        code_block.num_coding_passes,
        code_block.data.len(),
        code_block.ht_cleanup_length,
        code_block.ht_refinement_length,
    )
}
1584
1585impl J2kEncodeStageAccelerator for CudaEncodeStageAccelerator {
1586    fn dispatch_report(&self) -> J2kEncodeDispatchReport {
1587        J2kEncodeDispatchReport {
1588            deinterleave: self.deinterleave_dispatches,
1589            forward_rct: self.forward_rct_dispatches,
1590            forward_ict: self.forward_ict_dispatches,
1591            forward_dwt53: self.forward_dwt53_dispatches,
1592            forward_dwt97: self.forward_dwt97_dispatches,
1593            quantize_subband: self.quantize_subband_dispatches,
1594            tier1_code_block: self.tier1_code_block_dispatches,
1595            ht_code_block: self.ht_code_block_dispatches,
1596            packetization: self.packetization_dispatches,
1597        }
1598    }
1599
1600    fn encode_deinterleave(
1601        &mut self,
1602        job: J2kDeinterleaveToF32Job<'_>,
1603    ) -> core::result::Result<Option<Vec<Vec<f32>>>, &'static str> {
1604        self.deinterleave_attempts = self.deinterleave_attempts.saturating_add(1);
1605        #[cfg(feature = "cuda-runtime")]
1606        if let Some(context) = self.cuda_context()? {
1607            let (output, elapsed_us) = time_cuda_stage(
1608                "j2k.j2k.cuda.encode.deinterleave",
1609                &context,
1610                self.collect_profile,
1611                || {
1612                    context.j2k_deinterleave_to_f32(
1613                        job.pixels,
1614                        job.num_pixels,
1615                        job.num_components,
1616                        job.bit_depth,
1617                        job.signed,
1618                    )
1619                },
1620            )
1621            .map_err(|_| "CUDA deinterleave encode kernel failed")?;
1622            let dispatches = output.execution().kernel_dispatches();
1623            self.deinterleave_dispatches = self.deinterleave_dispatches.saturating_add(dispatches);
1624            self.deinterleave_us = self.deinterleave_us.saturating_add(elapsed_us);
1625            if j2k_profile::gpu_route_profile_enabled() {
1626                let pixels_s = job.num_pixels.to_string();
1627                let components_s = job.num_components.to_string();
1628                let dispatches_s = dispatches.to_string();
1629                j2k_profile::emit_gpu_route_profile(
1630                    "j2k",
1631                    "cuda",
1632                    &[
1633                        ("op", "encode_deinterleave"),
1634                        ("decision", "cuda_dispatch"),
1635                        ("pixels", pixels_s.as_str()),
1636                        ("components", components_s.as_str()),
1637                        ("dispatches", dispatches_s.as_str()),
1638                    ],
1639                );
1640            }
1641            return Ok(Some(output.into_components()));
1642        }
1643        #[cfg(not(feature = "cuda-runtime"))]
1644        let _ = job;
1645        if j2k_profile::gpu_route_profile_enabled() {
1646            j2k_profile::emit_gpu_route_profile(
1647                "j2k",
1648                "cuda",
1649                &[
1650                    ("op", "encode_deinterleave"),
1651                    ("decision", "cpu_fallback"),
1652                    ("reason", "cuda_unavailable"),
1653                ],
1654            );
1655        }
1656        Ok(None)
1657    }
1658
1659    fn encode_forward_rct(
1660        &mut self,
1661        job: J2kForwardRctJob<'_>,
1662    ) -> core::result::Result<bool, &'static str> {
1663        self.forward_rct_attempts = self.forward_rct_attempts.saturating_add(1);
1664        if self.prefer_cpu_forward_rct {
1665            if j2k_profile::gpu_route_profile_enabled() {
1666                j2k_profile::emit_gpu_route_profile(
1667                    "j2k",
1668                    "cuda",
1669                    &[
1670                        ("op", "encode_forward_rct"),
1671                        ("decision", "cpu_fallback"),
1672                        ("reason", "prefer_cpu_forward_rct"),
1673                    ],
1674                );
1675            }
1676            let _ = job;
1677            return Ok(false);
1678        }
1679        #[cfg(feature = "cuda-runtime")]
1680        if let Some(context) = self.cuda_context()? {
1681            let (execution, elapsed_us) = time_cuda_stage(
1682                "j2k.j2k.cuda.encode.rct",
1683                &context,
1684                self.collect_profile,
1685                || context.j2k_forward_rct(job.plane0, job.plane1, job.plane2),
1686            )
1687            .map_err(|_| "CUDA forward RCT encode kernel failed")?;
1688            self.forward_rct_dispatches = self
1689                .forward_rct_dispatches
1690                .saturating_add(execution.kernel_dispatches());
1691            self.mct_us = self.mct_us.saturating_add(elapsed_us);
1692            if j2k_profile::gpu_route_profile_enabled() {
1693                j2k_profile::emit_gpu_route_profile(
1694                    "j2k",
1695                    "cuda",
1696                    &[
1697                        ("op", "encode_forward_rct"),
1698                        ("decision", "cuda_dispatch"),
1699                        ("dispatches", "1"),
1700                    ],
1701                );
1702            }
1703            return Ok(true);
1704        }
1705        #[cfg(not(feature = "cuda-runtime"))]
1706        let _ = job;
1707        if j2k_profile::gpu_route_profile_enabled() {
1708            j2k_profile::emit_gpu_route_profile(
1709                "j2k",
1710                "cuda",
1711                &[
1712                    ("op", "encode_forward_rct"),
1713                    ("decision", "cpu_fallback"),
1714                    ("reason", "cuda_unavailable"),
1715                ],
1716            );
1717        }
1718        Ok(false)
1719    }
1720
1721    fn encode_forward_ict(
1722        &mut self,
1723        job: J2kForwardIctJob<'_>,
1724    ) -> core::result::Result<bool, &'static str> {
1725        self.forward_ict_attempts = self.forward_ict_attempts.saturating_add(1);
1726        #[cfg(feature = "cuda-runtime")]
1727        if let Some(context) = self.cuda_context()? {
1728            let (execution, elapsed_us) = time_cuda_stage(
1729                "j2k.j2k.cuda.encode.ict",
1730                &context,
1731                self.collect_profile,
1732                || context.j2k_forward_ict(job.plane0, job.plane1, job.plane2),
1733            )
1734            .map_err(|_| "CUDA forward ICT encode kernel failed")?;
1735            self.forward_ict_dispatches = self
1736                .forward_ict_dispatches
1737                .saturating_add(execution.kernel_dispatches());
1738            self.mct_us = self.mct_us.saturating_add(elapsed_us);
1739            if j2k_profile::gpu_route_profile_enabled() {
1740                j2k_profile::emit_gpu_route_profile(
1741                    "j2k",
1742                    "cuda",
1743                    &[
1744                        ("op", "encode_forward_ict"),
1745                        ("decision", "cuda_dispatch"),
1746                        ("dispatches", "1"),
1747                    ],
1748                );
1749            }
1750            return Ok(true);
1751        }
1752        #[cfg(not(feature = "cuda-runtime"))]
1753        let _ = job;
1754        if j2k_profile::gpu_route_profile_enabled() {
1755            j2k_profile::emit_gpu_route_profile(
1756                "j2k",
1757                "cuda",
1758                &[
1759                    ("op", "encode_forward_ict"),
1760                    ("decision", "cpu_fallback"),
1761                    ("reason", "cuda_unavailable"),
1762                ],
1763            );
1764        }
1765        Ok(false)
1766    }
1767
1768    fn encode_forward_dwt53(
1769        &mut self,
1770        job: J2kForwardDwt53Job<'_>,
1771    ) -> core::result::Result<Option<J2kForwardDwt53Output>, &'static str> {
1772        self.forward_dwt53_attempts = self.forward_dwt53_attempts.saturating_add(1);
1773        if job.num_levels == 0 {
1774            if j2k_profile::gpu_route_profile_enabled() {
1775                j2k_profile::emit_gpu_route_profile(
1776                    "j2k",
1777                    "cuda",
1778                    &[
1779                        ("op", "encode_forward_dwt53"),
1780                        ("decision", "cpu_fallback"),
1781                        ("reason", "zero_levels"),
1782                    ],
1783                );
1784            }
1785            return Ok(None);
1786        }
1787        #[cfg(feature = "cuda-runtime")]
1788        if let Some(context) = self.cuda_context()? {
1789            let (output, elapsed_us) = time_cuda_stage(
1790                "j2k.j2k.cuda.encode.dwt53",
1791                &context,
1792                self.collect_profile,
1793                || context.j2k_forward_dwt53(job.samples, job.width, job.height, job.num_levels),
1794            )
1795            .map_err(|_| "CUDA forward 5/3 DWT encode kernel failed")?;
1796            let dispatches = output.execution().kernel_dispatches();
1797            self.forward_dwt53_dispatches =
1798                self.forward_dwt53_dispatches.saturating_add(dispatches);
1799            self.dwt_us = self.dwt_us.saturating_add(elapsed_us);
1800            if j2k_profile::gpu_route_profile_enabled() {
1801                let width_s = job.width.to_string();
1802                let height_s = job.height.to_string();
1803                let levels_s = job.num_levels.to_string();
1804                let dispatches_s = dispatches.to_string();
1805                j2k_profile::emit_gpu_route_profile(
1806                    "j2k",
1807                    "cuda",
1808                    &[
1809                        ("op", "encode_forward_dwt53"),
1810                        ("decision", "cuda_dispatch"),
1811                        ("width", width_s.as_str()),
1812                        ("height", height_s.as_str()),
1813                        ("levels", levels_s.as_str()),
1814                        ("dispatches", dispatches_s.as_str()),
1815                    ],
1816                );
1817            }
1818            return Ok(Some(cuda_dwt53_output_to_j2k(&output)?));
1819        }
1820        #[cfg(not(feature = "cuda-runtime"))]
1821        let _ = job;
1822        if j2k_profile::gpu_route_profile_enabled() {
1823            j2k_profile::emit_gpu_route_profile(
1824                "j2k",
1825                "cuda",
1826                &[
1827                    ("op", "encode_forward_dwt53"),
1828                    ("decision", "cpu_fallback"),
1829                    ("reason", "cuda_unavailable"),
1830                ],
1831            );
1832        }
1833        Ok(None)
1834    }
1835
1836    fn encode_forward_dwt97(
1837        &mut self,
1838        job: J2kForwardDwt97Job<'_>,
1839    ) -> core::result::Result<Option<J2kForwardDwt97Output>, &'static str> {
1840        self.forward_dwt97_attempts = self.forward_dwt97_attempts.saturating_add(1);
1841        if job.num_levels == 0 {
1842            if j2k_profile::gpu_route_profile_enabled() {
1843                j2k_profile::emit_gpu_route_profile(
1844                    "j2k",
1845                    "cuda",
1846                    &[
1847                        ("op", "encode_forward_dwt97"),
1848                        ("decision", "cpu_fallback"),
1849                        ("reason", "zero_levels"),
1850                    ],
1851                );
1852            }
1853            return Ok(None);
1854        }
1855        #[cfg(feature = "cuda-runtime")]
1856        if let Some(context) = self.cuda_context()? {
1857            let (output, elapsed_us) = time_cuda_stage(
1858                "j2k.j2k.cuda.encode.dwt97",
1859                &context,
1860                self.collect_profile,
1861                || context.j2k_forward_dwt97(job.samples, job.width, job.height, job.num_levels),
1862            )
1863            .map_err(|_| "CUDA forward 9/7 DWT encode kernel failed")?;
1864            let dispatches = output.execution().kernel_dispatches();
1865            self.forward_dwt97_dispatches =
1866                self.forward_dwt97_dispatches.saturating_add(dispatches);
1867            self.dwt_us = self.dwt_us.saturating_add(elapsed_us);
1868            if j2k_profile::gpu_route_profile_enabled() {
1869                let width_s = job.width.to_string();
1870                let height_s = job.height.to_string();
1871                let levels_s = job.num_levels.to_string();
1872                let dispatches_s = dispatches.to_string();
1873                j2k_profile::emit_gpu_route_profile(
1874                    "j2k",
1875                    "cuda",
1876                    &[
1877                        ("op", "encode_forward_dwt97"),
1878                        ("decision", "cuda_dispatch"),
1879                        ("width", width_s.as_str()),
1880                        ("height", height_s.as_str()),
1881                        ("levels", levels_s.as_str()),
1882                        ("dispatches", dispatches_s.as_str()),
1883                    ],
1884                );
1885            }
1886            return Ok(Some(cuda_dwt97_output_to_j2k(&output)?));
1887        }
1888        #[cfg(not(feature = "cuda-runtime"))]
1889        let _ = job;
1890        if j2k_profile::gpu_route_profile_enabled() {
1891            j2k_profile::emit_gpu_route_profile(
1892                "j2k",
1893                "cuda",
1894                &[
1895                    ("op", "encode_forward_dwt97"),
1896                    ("decision", "cpu_fallback"),
1897                    ("reason", "cuda_unavailable"),
1898                ],
1899            );
1900        }
1901        Ok(None)
1902    }
1903
1904    fn encode_quantize_subband(
1905        &mut self,
1906        job: J2kQuantizeSubbandJob<'_>,
1907    ) -> core::result::Result<Option<Vec<i32>>, &'static str> {
1908        self.quantize_subband_attempts = self.quantize_subband_attempts.saturating_add(1);
1909        if self.prefer_cpu_quantize_subband {
1910            if j2k_profile::gpu_route_profile_enabled() {
1911                j2k_profile::emit_gpu_route_profile(
1912                    "j2k",
1913                    "cuda",
1914                    &[
1915                        ("op", "encode_quantize_subband"),
1916                        ("decision", "cpu_fallback"),
1917                        ("reason", "prefer_cpu_quantize_subband"),
1918                    ],
1919                );
1920            }
1921            let _ = job;
1922            return Ok(None);
1923        }
1924        #[cfg(feature = "cuda-runtime")]
1925        if let Some(context) = self.cuda_context()? {
1926            let (output, elapsed_us) = time_cuda_stage(
1927                "j2k.j2k.cuda.encode.quantize",
1928                &context,
1929                self.collect_profile,
1930                || {
1931                    context.j2k_quantize_subband(
1932                        job.coefficients,
1933                        CudaJ2kQuantizeJob {
1934                            step_exponent: job.step_exponent,
1935                            step_mantissa: job.step_mantissa,
1936                            range_bits: job.range_bits,
1937                            reversible: job.reversible,
1938                        },
1939                    )
1940                },
1941            )
1942            .map_err(|_| "CUDA quantize subband encode kernel failed")?;
1943            let dispatches = output.execution().kernel_dispatches();
1944            self.quantize_subband_dispatches =
1945                self.quantize_subband_dispatches.saturating_add(dispatches);
1946            self.quantize_us = self.quantize_us.saturating_add(elapsed_us);
1947            if j2k_profile::gpu_route_profile_enabled() {
1948                let samples_s = job.coefficients.len().to_string();
1949                let dispatches_s = dispatches.to_string();
1950                j2k_profile::emit_gpu_route_profile(
1951                    "j2k",
1952                    "cuda",
1953                    &[
1954                        ("op", "encode_quantize_subband"),
1955                        ("decision", "cuda_dispatch"),
1956                        ("samples", samples_s.as_str()),
1957                        ("dispatches", dispatches_s.as_str()),
1958                    ],
1959                );
1960            }
1961            return Ok(Some(output.coefficients().to_vec()));
1962        }
1963        #[cfg(not(feature = "cuda-runtime"))]
1964        let _ = job;
1965        if j2k_profile::gpu_route_profile_enabled() {
1966            j2k_profile::emit_gpu_route_profile(
1967                "j2k",
1968                "cuda",
1969                &[
1970                    ("op", "encode_quantize_subband"),
1971                    ("decision", "cpu_fallback"),
1972                    ("reason", "cuda_unavailable"),
1973                ],
1974            );
1975        }
1976        Ok(None)
1977    }
1978
1979    fn encode_tier1_code_block(
1980        &mut self,
1981        _job: J2kTier1CodeBlockEncodeJob<'_>,
1982    ) -> core::result::Result<Option<EncodedJ2kCodeBlock>, &'static str> {
1983        self.tier1_code_block_attempts = self.tier1_code_block_attempts.saturating_add(1);
1984        if j2k_profile::gpu_route_profile_enabled() {
1985            j2k_profile::emit_gpu_route_profile(
1986                "j2k",
1987                "cuda",
1988                &[
1989                    ("op", "encode_tier1_code_block"),
1990                    ("decision", "cpu_fallback"),
1991                    ("reason", "unsupported_stage"),
1992                ],
1993            );
1994        }
1995        Ok(None)
1996    }
1997
1998    fn encode_ht_code_block(
1999        &mut self,
2000        job: J2kHtCodeBlockEncodeJob<'_>,
2001    ) -> core::result::Result<Option<EncodedHtJ2kCodeBlock>, &'static str> {
2002        self.ht_code_block_attempts = self.ht_code_block_attempts.saturating_add(1);
2003        #[cfg(feature = "cuda-runtime")]
2004        if let Some(context) = self.cuda_context()? {
2005            let resources = self.cuda_encode_resources(&context)?;
2006            let encoded = cuda_encode_ht_code_block(&context, resources.as_ref(), job)?;
2007            let dispatches = encoded.execution().kernel_dispatches();
2008            let ht_encode_us = encoded.stage_timings().ht_encode_us;
2009            let mut outputs = encoded_ht_code_blocks_from_cuda(&encoded);
2010            let output = outputs
2011                .pop()
2012                .ok_or("CUDA HTJ2K code-block encode returned no output")?;
2013            self.ht_code_block_dispatches =
2014                self.ht_code_block_dispatches.saturating_add(dispatches);
2015            if self.collect_profile {
2016                self.ht_encode_us = self.ht_encode_us.saturating_add(ht_encode_us);
2017            }
2018            if j2k_profile::gpu_route_profile_enabled() {
2019                let width_s = job.width.to_string();
2020                let height_s = job.height.to_string();
2021                let dispatches_s = dispatches.to_string();
2022                j2k_profile::emit_gpu_route_profile(
2023                    "j2k",
2024                    "cuda",
2025                    &[
2026                        ("op", "encode_ht_code_block"),
2027                        ("decision", "cuda_dispatch"),
2028                        ("width", width_s.as_str()),
2029                        ("height", height_s.as_str()),
2030                        ("dispatches", dispatches_s.as_str()),
2031                    ],
2032                );
2033            }
2034            return Ok(Some(output));
2035        }
2036        #[cfg(not(feature = "cuda-runtime"))]
2037        let _ = job;
2038        if j2k_profile::gpu_route_profile_enabled() {
2039            j2k_profile::emit_gpu_route_profile(
2040                "j2k",
2041                "cuda",
2042                &[
2043                    ("op", "encode_ht_code_block"),
2044                    ("decision", "cpu_fallback"),
2045                    ("reason", "unsupported_stage"),
2046                ],
2047            );
2048        }
2049        Ok(None)
2050    }
2051
2052    fn encode_ht_code_blocks(
2053        &mut self,
2054        jobs: &[J2kHtCodeBlockEncodeJob<'_>],
2055    ) -> core::result::Result<Option<Vec<EncodedHtJ2kCodeBlock>>, &'static str> {
2056        self.ht_code_block_attempts = self.ht_code_block_attempts.saturating_add(jobs.len());
2057        #[cfg(feature = "cuda-runtime")]
2058        if let Some(context) = self.cuda_context()? {
2059            let resources = self.cuda_encode_resources(&context)?;
2060            let encoded = cuda_encode_ht_code_blocks(&context, resources.as_ref(), jobs)?;
2061            let dispatches = encoded.execution().kernel_dispatches();
2062            let ht_encode_us = encoded.stage_timings().ht_encode_us;
2063            let outputs = encoded_ht_code_blocks_from_cuda(&encoded);
2064            self.ht_code_block_dispatches =
2065                self.ht_code_block_dispatches.saturating_add(dispatches);
2066            if self.collect_profile {
2067                self.ht_encode_us = self.ht_encode_us.saturating_add(ht_encode_us);
2068            }
2069            if j2k_profile::gpu_route_profile_enabled() {
2070                let jobs_s = jobs.len().to_string();
2071                let dispatches_s = dispatches.to_string();
2072                j2k_profile::emit_gpu_route_profile(
2073                    "j2k",
2074                    "cuda",
2075                    &[
2076                        ("op", "encode_ht_code_blocks"),
2077                        ("decision", "cuda_dispatch"),
2078                        ("jobs", jobs_s.as_str()),
2079                        ("dispatches", dispatches_s.as_str()),
2080                    ],
2081                );
2082            }
2083            return Ok(Some(outputs));
2084        }
2085        #[cfg(not(feature = "cuda-runtime"))]
2086        let _ = jobs;
2087        if j2k_profile::gpu_route_profile_enabled() {
2088            j2k_profile::emit_gpu_route_profile(
2089                "j2k",
2090                "cuda",
2091                &[
2092                    ("op", "encode_ht_code_blocks"),
2093                    ("decision", "cpu_fallback"),
2094                    ("reason", "cuda_unavailable"),
2095                ],
2096            );
2097        }
2098        Ok(None)
2099    }
2100
2101    fn encode_htj2k_tile(
2102        &mut self,
2103        job: J2kHtj2kTileEncodeJob<'_>,
2104    ) -> core::result::Result<Option<Vec<u8>>, &'static str> {
2105        self.htj2k_tile_attempts = self.htj2k_tile_attempts.saturating_add(1);
2106        if self.prefer_cpu_forward_rct || self.prefer_cpu_packetization {
2107            if j2k_profile::gpu_route_profile_enabled() {
2108                j2k_profile::emit_gpu_route_profile(
2109                    "j2k",
2110                    "cuda",
2111                    &[
2112                        ("op", "encode_htj2k_tile"),
2113                        ("decision", "cpu_fallback"),
2114                        ("reason", "prefer_stage_hybrid"),
2115                    ],
2116                );
2117            }
2118            let _ = job;
2119            return Ok(None);
2120        }
2121        #[cfg(feature = "cuda-runtime")]
2122        if let Some(context) = self.cuda_context()? {
2123            let resources = self.cuda_encode_resources(&context)?;
2124            let Some(encoded) = cuda_encode_htj2k_tile_body(
2125                &context,
2126                resources.as_ref(),
2127                job,
2128                self.collect_profile,
2129            )?
2130            else {
2131                return Ok(None);
2132            };
2133            self.htj2k_tile_dispatches = self.htj2k_tile_dispatches.saturating_add(1);
2134            self.deinterleave_attempts = self.deinterleave_attempts.saturating_add(1);
2135            self.deinterleave_dispatches = self
2136                .deinterleave_dispatches
2137                .saturating_add(encoded.deinterleave_dispatches);
2138            if job.use_mct {
2139                if job.reversible {
2140                    self.forward_rct_attempts = self.forward_rct_attempts.saturating_add(1);
2141                } else {
2142                    self.forward_ict_attempts = self.forward_ict_attempts.saturating_add(1);
2143                }
2144            }
2145            self.forward_rct_dispatches = self
2146                .forward_rct_dispatches
2147                .saturating_add(encoded.forward_rct_dispatches);
2148            self.forward_ict_dispatches = self
2149                .forward_ict_dispatches
2150                .saturating_add(encoded.forward_ict_dispatches);
2151            if job.num_decomposition_levels > 0 {
2152                if job.reversible {
2153                    self.forward_dwt53_attempts = self
2154                        .forward_dwt53_attempts
2155                        .saturating_add(usize::from(job.num_components));
2156                } else {
2157                    self.forward_dwt97_attempts = self
2158                        .forward_dwt97_attempts
2159                        .saturating_add(usize::from(job.num_components));
2160                }
2161            }
2162            self.forward_dwt53_dispatches = self
2163                .forward_dwt53_dispatches
2164                .saturating_add(encoded.forward_dwt53_dispatches);
2165            self.forward_dwt97_dispatches = self
2166                .forward_dwt97_dispatches
2167                .saturating_add(encoded.forward_dwt97_dispatches);
2168            self.quantize_subband_attempts = self
2169                .quantize_subband_attempts
2170                .saturating_add(encoded.quantize_jobs);
2171            self.quantize_subband_dispatches = self
2172                .quantize_subband_dispatches
2173                .saturating_add(encoded.quantize_dispatches);
2174            self.ht_code_block_attempts = self
2175                .ht_code_block_attempts
2176                .saturating_add(encoded.ht_code_block_jobs);
2177            self.ht_code_block_dispatches = self
2178                .ht_code_block_dispatches
2179                .saturating_add(encoded.ht_code_block_dispatches);
2180            self.packetization_attempts = self.packetization_attempts.saturating_add(1);
2181            self.packetization_dispatches = self
2182                .packetization_dispatches
2183                .saturating_add(encoded.packetization_dispatches);
2184            if self.collect_profile {
2185                self.deinterleave_us = self
2186                    .deinterleave_us
2187                    .saturating_add(encoded.timings.deinterleave_us);
2188                self.mct_us = self.mct_us.saturating_add(encoded.timings.mct_us);
2189                self.dwt_us = self.dwt_us.saturating_add(encoded.timings.dwt_us);
2190                self.quantize_us = self.quantize_us.saturating_add(encoded.timings.quantize_us);
2191                self.ht_encode_us = self
2192                    .ht_encode_us
2193                    .saturating_add(encoded.timings.ht_encode_us);
2194                self.packetize_us = self
2195                    .packetize_us
2196                    .saturating_add(encoded.timings.packetize_us);
2197            }
2198            if j2k_profile::gpu_route_profile_enabled() {
2199                let components_s = job.num_components.to_string();
2200                let blocks_s = encoded.ht_code_block_jobs.to_string();
2201                j2k_profile::emit_gpu_route_profile(
2202                    "j2k",
2203                    "cuda",
2204                    &[
2205                        ("op", "encode_htj2k_tile"),
2206                        ("decision", "cuda_dispatch"),
2207                        ("components", components_s.as_str()),
2208                        ("blocks", blocks_s.as_str()),
2209                    ],
2210                );
2211            }
2212            return Ok(Some(encoded.tile_data));
2213        }
2214        #[cfg(not(feature = "cuda-runtime"))]
2215        let _ = job;
2216        if j2k_profile::gpu_route_profile_enabled() {
2217            j2k_profile::emit_gpu_route_profile(
2218                "j2k",
2219                "cuda",
2220                &[
2221                    ("op", "encode_htj2k_tile"),
2222                    ("decision", "cpu_fallback"),
2223                    ("reason", "cuda_unavailable"),
2224                ],
2225            );
2226        }
2227        Ok(None)
2228    }
2229
2230    fn encode_ht_subband(
2231        &mut self,
2232        job: J2kHtSubbandEncodeJob<'_>,
2233    ) -> core::result::Result<Option<Vec<EncodedHtJ2kCodeBlock>>, &'static str> {
2234        let code_block_count = ht_subband_code_block_count(job)?;
2235        self.ht_subband_attempts = self.ht_subband_attempts.saturating_add(1);
2236        self.quantize_subband_attempts = self.quantize_subband_attempts.saturating_add(1);
2237        self.ht_code_block_attempts = self.ht_code_block_attempts.saturating_add(code_block_count);
2238        if self.prefer_cpu_ht_subband {
2239            if j2k_profile::gpu_route_profile_enabled() {
2240                j2k_profile::emit_gpu_route_profile(
2241                    "j2k",
2242                    "cuda",
2243                    &[
2244                        ("op", "encode_ht_subband"),
2245                        ("decision", "cpu_fallback"),
2246                        ("reason", "prefer_cpu_ht_subband"),
2247                    ],
2248                );
2249            }
2250            return Ok(None);
2251        }
2252        #[cfg(feature = "cuda-runtime")]
2253        if let Some(context) = self.cuda_context()? {
2254            let resources = self.cuda_encode_resources(&context)?;
2255            let encoded =
2256                cuda_encode_ht_subband(&context, resources.as_ref(), job, self.collect_profile)?;
2257            let quantize_dispatches = encoded.quantize_dispatches;
2258            let encode_dispatches = encoded.encode.execution().kernel_dispatches();
2259            let outputs = encoded_ht_code_blocks_from_cuda(&encoded.encode);
2260            self.ht_subband_dispatches = self.ht_subband_dispatches.saturating_add(1);
2261            self.quantize_subband_dispatches = self
2262                .quantize_subband_dispatches
2263                .saturating_add(quantize_dispatches);
2264            self.ht_code_block_dispatches = self
2265                .ht_code_block_dispatches
2266                .saturating_add(encode_dispatches);
2267            if self.collect_profile {
2268                self.quantize_us = self.quantize_us.saturating_add(encoded.timings.quantize_us);
2269                self.ht_encode_us = self
2270                    .ht_encode_us
2271                    .saturating_add(encoded.timings.ht_encode_us);
2272            }
2273            if j2k_profile::gpu_route_profile_enabled() {
2274                let width_s = job.width.to_string();
2275                let height_s = job.height.to_string();
2276                let blocks_s = code_block_count.to_string();
2277                let quantize_dispatches_s = quantize_dispatches.to_string();
2278                let encode_dispatches_s = encode_dispatches.to_string();
2279                j2k_profile::emit_gpu_route_profile(
2280                    "j2k",
2281                    "cuda",
2282                    &[
2283                        ("op", "encode_ht_subband"),
2284                        ("decision", "cuda_dispatch"),
2285                        ("width", width_s.as_str()),
2286                        ("height", height_s.as_str()),
2287                        ("blocks", blocks_s.as_str()),
2288                        ("quantize_dispatches", quantize_dispatches_s.as_str()),
2289                        ("encode_dispatches", encode_dispatches_s.as_str()),
2290                    ],
2291                );
2292            }
2293            return Ok(Some(outputs));
2294        }
2295        #[cfg(not(feature = "cuda-runtime"))]
2296        let _ = job;
2297        if j2k_profile::gpu_route_profile_enabled() {
2298            j2k_profile::emit_gpu_route_profile(
2299                "j2k",
2300                "cuda",
2301                &[
2302                    ("op", "encode_ht_subband"),
2303                    ("decision", "cpu_fallback"),
2304                    ("reason", "cuda_unavailable"),
2305                ],
2306            );
2307        }
2308        Ok(None)
2309    }
2310
2311    fn encode_packetization(
2312        &mut self,
2313        job: J2kPacketizationEncodeJob<'_>,
2314    ) -> core::result::Result<Option<Vec<u8>>, &'static str> {
2315        self.packetization_attempts = self.packetization_attempts.saturating_add(1);
2316        if self.prefer_cpu_packetization {
2317            if j2k_profile::gpu_route_profile_enabled() {
2318                j2k_profile::emit_gpu_route_profile(
2319                    "j2k",
2320                    "cuda",
2321                    &[
2322                        ("op", "encode_packetization"),
2323                        ("decision", "cpu_fallback"),
2324                        ("reason", "prefer_cpu_packetization"),
2325                    ],
2326                );
2327            }
2328            let _ = job;
2329            return Ok(None);
2330        }
2331        let plan = match flatten_cuda_htj2k_packetization_job(job) {
2332            Ok(plan) => plan,
2333            Err(reason) => {
2334                if j2k_profile::gpu_route_profile_enabled() {
2335                    j2k_profile::emit_gpu_route_profile(
2336                        "j2k",
2337                        "cuda",
2338                        &[
2339                            ("op", "encode_packetization"),
2340                            ("decision", "cpu_fallback"),
2341                            ("reason", reason),
2342                        ],
2343                    );
2344                }
2345                return Ok(None);
2346            }
2347        };
2348        #[cfg(feature = "cuda-runtime")]
2349        if let Some(context) = self.cuda_context()? {
2350            let packets = cuda_packetization_packets(&plan);
2351            let subbands = cuda_packetization_subbands(&plan);
2352            let blocks = cuda_packetization_blocks(&plan);
2353            let tag_states = cuda_packetization_tag_states(&plan);
2354            let tag_nodes = cuda_packetization_tag_nodes(&plan);
2355            let packetized = context
2356                .packetize_htj2k_cleanup_packets_with_tag_state(
2357                    &plan.payload,
2358                    &packets,
2359                    &subbands,
2360                    &blocks,
2361                    &tag_states,
2362                    &tag_nodes,
2363                )
2364                .map_err(|_| "CUDA HTJ2K packetization kernel failed")?;
2365            let dispatches = packetized.execution().kernel_dispatches();
2366            let packetize_us = packetized.stage_timings().packetize_us;
2367            self.packetization_dispatches =
2368                self.packetization_dispatches.saturating_add(dispatches);
2369            if self.collect_profile {
2370                self.packetize_us = self.packetize_us.saturating_add(packetize_us);
2371            }
2372            if j2k_profile::gpu_route_profile_enabled() {
2373                let packets_s = packets.len().to_string();
2374                let dispatches_s = dispatches.to_string();
2375                j2k_profile::emit_gpu_route_profile(
2376                    "j2k",
2377                    "cuda",
2378                    &[
2379                        ("op", "encode_packetization"),
2380                        ("decision", "cuda_dispatch"),
2381                        ("packets", packets_s.as_str()),
2382                        ("dispatches", dispatches_s.as_str()),
2383                    ],
2384                );
2385            }
2386            return Ok(Some(packetized.data().to_vec()));
2387        }
2388        #[cfg(not(feature = "cuda-runtime"))]
2389        let _ = plan;
2390        if j2k_profile::gpu_route_profile_enabled() {
2391            j2k_profile::emit_gpu_route_profile(
2392                "j2k",
2393                "cuda",
2394                &[
2395                    ("op", "encode_packetization"),
2396                    ("decision", "cpu_fallback"),
2397                    ("reason", "unsupported_stage"),
2398                ],
2399            );
2400        }
2401        Ok(None)
2402    }
2403}
2404
#[cfg(feature = "cuda-runtime")]
/// Builds the CUDA runtime packet records for a flattened packetization plan,
/// copying every field of each plan packet unchanged and preserving order.
fn cuda_packetization_packets(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationPacket> {
    let mut converted = Vec::with_capacity(plan.packets.len());
    for source in plan.packets.iter() {
        converted.push(CudaHtj2kPacketizationPacket {
            block_start: source.block_start,
            block_count: source.block_count,
            subband_start: source.subband_start,
            subband_count: source.subband_count,
            output_capacity: source.output_capacity,
            layer: source.layer,
        });
    }
    converted
}
2421
#[cfg(feature = "cuda-runtime")]
/// Builds the CUDA runtime subband records for a flattened packetization plan,
/// copying the block range and code-block grid size of each plan subband.
fn cuda_packetization_subbands(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationSubband> {
    let mut converted = Vec::with_capacity(plan.subbands.len());
    for source in plan.subbands.iter() {
        converted.push(CudaHtj2kPacketizationSubband {
            block_start: source.block_start,
            block_count: source.block_count,
            num_cbs_x: source.num_cbs_x,
            num_cbs_y: source.num_cbs_y,
        });
    }
    converted
}
2436
#[cfg(feature = "cuda-runtime")]
/// Builds the CUDA runtime code-block records for a flattened packetization
/// plan, copying every field of each plan block unchanged and preserving order.
fn cuda_packetization_blocks(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationBlock> {
    let mut converted = Vec::with_capacity(plan.blocks.len());
    for source in plan.blocks.iter() {
        converted.push(CudaHtj2kPacketizationBlock {
            data_offset: source.data_offset,
            data_len: source.data_len,
            cleanup_length: source.cleanup_length,
            refinement_length: source.refinement_length,
            num_coding_passes: source.num_coding_passes,
            num_zero_bitplanes: source.num_zero_bitplanes,
            l_block: source.l_block,
            previously_included: source.previously_included,
            inclusion_layer: source.inclusion_layer,
        });
    }
    converted
}
2456
#[cfg(feature = "cuda-runtime")]
/// Builds the CUDA runtime per-subband tag-state records for a flattened
/// packetization plan. The node offsets and count are copied from the plan;
/// the `reserved0` field is always written as zero.
fn cuda_packetization_tag_states(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationSubbandTagState> {
    let mut converted = Vec::with_capacity(plan.tag_states.len());
    for source in plan.tag_states.iter() {
        converted.push(CudaHtj2kPacketizationSubbandTagState {
            inclusion_node_start: source.inclusion_node_start,
            zero_bitplane_node_start: source.zero_bitplane_node_start,
            node_count: source.node_count,
            reserved0: 0,
        });
    }
    converted
}
2471
#[cfg(feature = "cuda-runtime")]
/// Builds the CUDA runtime tag-node records for a flattened packetization
/// plan, copying the `current` and `known` values of each plan node.
fn cuda_packetization_tag_nodes(
    plan: &CudaHtj2kPacketizationPlan,
) -> Vec<CudaHtj2kPacketizationTagNodeState> {
    let mut converted = Vec::with_capacity(plan.tag_nodes.len());
    for source in plan.tag_nodes.iter() {
        converted.push(CudaHtj2kPacketizationTagNodeState {
            current: source.current,
            known: source.known,
        });
    }
    converted
}
2484
#[cfg(feature = "cuda-runtime")]
/// Encodes a single HTJ2K code-block on the CUDA device.
///
/// # Errors
///
/// Fails when `width * height` overflows `usize`, when the coefficient slice
/// length differs from `width * height`, or when the CUDA encode call fails.
fn cuda_encode_ht_code_block(
    context: &CudaContext,
    resources: &CudaHtj2kEncodeResources,
    job: J2kHtCodeBlockEncodeJob<'_>,
) -> core::result::Result<j2k_cuda_runtime::CudaHtj2kEncodedCodeBlocks, &'static str> {
    let expected_len = match (job.width as usize).checked_mul(job.height as usize) {
        Some(len) => len,
        None => return Err("CUDA HTJ2K code-block encode job is too large"),
    };
    if job.coefficients.len() != expected_len {
        return Err("CUDA HTJ2K code-block encode job has invalid coefficient length");
    }

    // The lone code-block starts at the beginning of the coefficient slice.
    let single_job = CudaHtj2kEncodeCodeBlockJob {
        coefficient_offset: 0,
        width: job.width,
        height: job.height,
        total_bitplanes: job.total_bitplanes,
        target_coding_passes: job.target_coding_passes,
    };
    let encoded =
        context.encode_htj2k_codeblocks_with_resources(job.coefficients, &[single_job], resources);
    encoded.map_err(|_| "CUDA HTJ2K code-block encode kernel failed")
}
2508
#[cfg(feature = "cuda-runtime")]
/// Encodes a batch of HTJ2K code-blocks with one CUDA encode call.
///
/// All jobs are validated first; their coefficients are then concatenated into
/// one contiguous buffer and each CUDA job records the offset of its
/// coefficients inside that buffer.
///
/// # Errors
///
/// Fails when a job's `width * height` or the batch total overflows `usize`,
/// when a job's coefficient slice length differs from `width * height`, when
/// an offset does not fit in `u32`, or when the CUDA encode call fails.
fn cuda_encode_ht_code_blocks(
    context: &CudaContext,
    resources: &CudaHtj2kEncodeResources,
    jobs: &[J2kHtCodeBlockEncodeJob<'_>],
) -> core::result::Result<j2k_cuda_runtime::CudaHtj2kEncodedCodeBlocks, &'static str> {
    // Pass 1: validate every job and size the packed coefficient buffer.
    let mut total_coefficients = 0usize;
    for job in jobs {
        let coefficient_len = match (job.width as usize).checked_mul(job.height as usize) {
            Some(len) => len,
            None => return Err("CUDA HTJ2K code-block batch is too large"),
        };
        if job.coefficients.len() != coefficient_len {
            return Err("CUDA HTJ2K code-block encode job has invalid coefficient length");
        }
        total_coefficients = match total_coefficients.checked_add(coefficient_len) {
            Some(sum) => sum,
            None => return Err("CUDA HTJ2K code-block batch is too large"),
        };
    }

    // Pass 2: pack the coefficients and describe each code-block.
    let mut packed = Vec::with_capacity(total_coefficients);
    let mut cuda_jobs = Vec::with_capacity(jobs.len());
    for job in jobs {
        let Ok(coefficient_offset) = u32::try_from(packed.len()) else {
            return Err("CUDA HTJ2K code-block batch is too large");
        };
        cuda_jobs.push(CudaHtj2kEncodeCodeBlockJob {
            coefficient_offset,
            width: job.width,
            height: job.height,
            total_bitplanes: job.total_bitplanes,
            target_coding_passes: job.target_coding_passes,
        });
        packed.extend_from_slice(job.coefficients);
    }

    let encoded = context.encode_htj2k_codeblocks_with_resources(&packed, &cuda_jobs, resources);
    encoded.map_err(|_| "CUDA HTJ2K code-block batch encode kernel failed")
}
2544
#[cfg(feature = "cuda-runtime")]
/// Result of encoding one HTJ2K tile body on the CUDA device: the packetized
/// bytes plus the per-stage counters and timings gathered along the way.
struct CudaEncodedHtj2kTile {
    /// Packetized tile bytes returned by the tile packetization step.
    tile_data: Vec<u8>,
    /// Kernel dispatches reported by the deinterleave stage.
    deinterleave_dispatches: usize,
    /// Kernel dispatches reported by the forward RCT (set only on the reversible MCT path).
    forward_rct_dispatches: usize,
    /// Kernel dispatches reported by the forward ICT (set only on the irreversible MCT path).
    forward_ict_dispatches: usize,
    /// Kernel dispatches of the forward 5/3 DWT, summed over components.
    forward_dwt53_dispatches: usize,
    /// Kernel dispatches of the forward 9/7 DWT, summed over components.
    forward_dwt97_dispatches: usize,
    /// Number of non-empty subband regions that were quantized.
    quantize_jobs: usize,
    /// Kernel dispatches reported by the quantize calls, summed.
    quantize_dispatches: usize,
    /// Kernel dispatches reported by the HT code-block encode calls, summed.
    ht_code_block_dispatches: usize,
    /// Number of code-block region jobs submitted to the HT encoder.
    ht_code_block_jobs: usize,
    /// Dispatch count returned by the tile packetization step.
    packetization_dispatches: usize,
    /// Accumulated per-stage timings.
    timings: CudaEncodeStageTimings,
}
2560
#[cfg(feature = "cuda-runtime")]
#[derive(Default)]
/// Mutable accumulator threaded through the tile encode helpers; its counters
/// are copied into `CudaEncodedHtj2kTile` once the tile is finished.
struct CudaHtj2kTileEncodeStats {
    /// Forwarded to `time_cuda_stage` for the stages timed by the helpers.
    collect_profile: bool,
    /// Kernel dispatches reported by the deinterleave stage.
    deinterleave_dispatches: usize,
    /// Kernel dispatches reported by the forward RCT (reversible MCT path).
    forward_rct_dispatches: usize,
    /// Kernel dispatches reported by the forward ICT (irreversible MCT path).
    forward_ict_dispatches: usize,
    /// Kernel dispatches of the forward 5/3 DWT, summed over components.
    forward_dwt53_dispatches: usize,
    /// Kernel dispatches of the forward 9/7 DWT, summed over components.
    forward_dwt97_dispatches: usize,
    /// Incremented once per non-empty subband region that is quantized.
    quantize_jobs: usize,
    /// Kernel dispatches reported by the quantize calls, summed.
    quantize_dispatches: usize,
    /// Kernel dispatches reported by the HT code-block encode calls, summed.
    ht_code_block_dispatches: usize,
    /// Number of code-block region jobs submitted to the HT encoder.
    ht_code_block_jobs: usize,
    /// Accumulated per-stage timings.
    timings: CudaEncodeStageTimings,
}
2576
#[cfg(feature = "cuda-runtime")]
/// Encoded subbands of one resolution of one component.
struct CudaEncodedHtj2kResolution {
    /// The tile encoder fills this with either a single LL subband or the
    /// HL, LH, HH subbands of one decomposition level, in that order.
    subbands: Vec<CudaEncodedHtj2kSubband>,
}
2581
#[cfg(feature = "cuda-runtime")]
/// Encoded code-blocks of one subband together with its code-block grid size.
struct CudaEncodedHtj2kSubband {
    /// Encoded HT code-blocks; empty for a zero-sized subband region.
    code_blocks: Vec<EncodedHtJ2kCodeBlock>,
    /// Code-block columns: region width divided by the code-block width,
    /// rounded up (0 for a zero-sized region).
    num_cbs_x: u32,
    /// Code-block rows: region height divided by the code-block height,
    /// rounded up (0 for a zero-sized region).
    num_cbs_y: u32,
}
2588
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy)]
/// Rectangle inside a device buffer that holds one subband. The fields are
/// forwarded unchanged into `CudaJ2kQuantizeSubbandRegionJob`.
// NOTE(review): units are presumably coefficients (not bytes) — confirm against
// the CUDA runtime's region job definition.
struct CudaTileSubbandRegion {
    /// Horizontal origin of the region.
    x0: u32,
    /// Vertical origin of the region.
    y0: u32,
    /// Region width; a zero width marks the region as empty.
    width: u32,
    /// Region height; a zero height marks the region as empty.
    height: u32,
    /// Row stride of the buffer the region lives in.
    stride: u32,
}
2598
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy)]
/// Subband orientation, used to pick the gain added to the bit depth when
/// computing quantization range bits.
enum CudaTileSubbandKind {
    /// LL subband: adds 0 to the bit depth.
    LowLow,
    /// HL subband: adds 1 to the bit depth.
    HighLow,
    /// LH subband: adds 1 to the bit depth.
    LowHigh,
    /// HH subband: adds 2 to the bit depth.
    HighHigh,
}
2607
#[cfg(feature = "cuda-runtime")]
#[derive(Clone, Copy)]
/// Borrowed handles that the tile encode helpers need, bundled so they can be
/// passed around as a single `Copy` value.
struct CudaHtj2kEncodeRuntime<'a> {
    /// CUDA context used for the quantize and HT code-block encode calls.
    context: &'a CudaContext,
    /// HTJ2K encode resources handed to the code-block encoder.
    resources: &'a CudaHtj2kEncodeResources,
}
2614
#[cfg(feature = "cuda-runtime")]
/// Encodes one HTJ2K tile body from the interleaved pixels carried by `job`.
///
/// The job is validated, the pixels are deinterleaved into resident `f32`
/// component planes (timed as `j2k.htj2k.encode.tile.deinterleave`), and the
/// rest of the pipeline is delegated to
/// `cuda_encode_htj2k_resident_components_body`.
///
/// # Errors
///
/// Fails when validation rejects the job, when `width * height` overflows
/// `usize`, when the deinterleave call fails, or when a later stage fails.
fn cuda_encode_htj2k_tile_body(
    context: &CudaContext,
    encode_resources: &CudaHtj2kEncodeResources,
    job: J2kHtj2kTileEncodeJob<'_>,
    collect_profile: bool,
) -> core::result::Result<Option<CudaEncodedHtj2kTile>, &'static str> {
    validate_cuda_htj2k_tile_job(job)?;

    let Some(num_pixels) = (job.width as usize).checked_mul(job.height as usize) else {
        return Err("CUDA HTJ2K tile dimensions are too large");
    };

    let deinterleave = || {
        context.j2k_deinterleave_to_f32_resident(
            job.pixels,
            num_pixels,
            job.num_components,
            job.bit_depth,
            job.signed,
        )
    };
    let (components, deinterleave_us) = time_cuda_stage(
        "j2k.htj2k.encode.tile.deinterleave",
        context,
        collect_profile,
        deinterleave,
    )
    .map_err(|_| "CUDA HTJ2K tile deinterleave failed")?;

    cuda_encode_htj2k_resident_components_body(
        context,
        encode_resources,
        job,
        components,
        deinterleave_us,
        collect_profile,
    )
}
2650
#[cfg(feature = "cuda-runtime")]
/// Checks that a tile encode job is one the CUDA HTJ2K tile path accepts.
///
/// # Errors
///
/// Returns a static message when any component is subsampled, the component
/// count is not 1, 3 or 4, MCT is requested with fewer than three components,
/// a code-block dimension is zero, or the number of quantization steps is not
/// `1 + 3 * num_decomposition_levels`.
fn validate_cuda_htj2k_tile_job(
    job: J2kHtj2kTileEncodeJob<'_>,
) -> core::result::Result<(), &'static str> {
    let unsubsampled = job
        .component_sampling
        .iter()
        .all(|&sampling| sampling == (1, 1));
    if !unsubsampled {
        return Err("CUDA HTJ2K tile encode does not support component subsampling != (1, 1)");
    }

    // The native encoder uses `use_mct = options.use_mct && num_components >= 3`:
    // the colour transform touches planes 0, 1 and 2 and a 4th plane passes
    // through untouched. The resident path mirrors that (see
    // `j2k_forward_rct_resident` / `j2k_forward_ict_resident`), and every
    // component, the passthrough 4th included, still goes through the
    // per-component DWT -> quantize -> HT code-block -> packetization pipeline.
    //
    // Only component counts of 1, 3 and 4 are in scope. Any other count is a
    // hard error, never `Ok(None)`: a silent CPU fallback is forbidden for
    // in-scope inputs.
    match job.num_components {
        1 | 3 | 4 => {}
        _ => return Err("CUDA HTJ2K tile encode supports 1, 3, or 4 components"),
    }
    if job.num_components < 3 && job.use_mct {
        return Err("CUDA HTJ2K tile encode requires at least three components for MCT");
    }

    if job.code_block_width == 0 || job.code_block_height == 0 {
        return Err("CUDA HTJ2K tile encode job has invalid code-block dimensions");
    }

    // One step for the LL band plus three (HL, LH, HH) per decomposition level.
    let detail_steps = usize::from(job.num_decomposition_levels).saturating_mul(3);
    let Some(expected_steps) = 1usize.checked_add(detail_steps) else {
        return Err("CUDA HTJ2K tile quantization step count overflow");
    };
    if job.quantization_steps.len() == expected_steps {
        Ok(())
    } else {
        Err("CUDA HTJ2K tile quantization step count mismatch")
    }
}
2689
#[cfg(feature = "cuda-runtime")]
/// Encodes one HTJ2K tile body from pixels that already live in a CUDA buffer.
///
/// After validating the job, the tile and job are cross-checked: the job
/// dimensions must equal the tile's output dimensions, the tile must not be
/// padded (input size equals output size), and the job's component count and
/// bit depth must match the tile format with unsigned samples. The strided
/// device pixels are then deinterleaved into resident component planes (timed
/// as `j2k.htj2k.encode.tile.device_deinterleave`) and the rest of the
/// pipeline is delegated to `cuda_encode_htj2k_resident_components_body`.
///
/// # Errors
///
/// Fails when any of the checks above is violated, when the tile format cannot
/// be mapped, when the deinterleave call fails, or when a later stage fails.
fn cuda_encode_htj2k_device_tile_body(
    context: &CudaContext,
    encode_resources: &CudaHtj2kEncodeResources,
    tile: CudaLosslessEncodeTile<'_>,
    job: J2kHtj2kTileEncodeJob<'_>,
    collect_profile: bool,
) -> core::result::Result<Option<CudaEncodedHtj2kTile>, &'static str> {
    validate_cuda_htj2k_tile_job(job)?;

    let format = match cuda_encode_format(tile.format) {
        Ok(format) => format,
        Err(_) => return Err("CUDA HTJ2K tile format failed"),
    };

    let job_matches_output = job.width == tile.output_width && job.height == tile.output_height;
    if !job_matches_output {
        return Err("CUDA HTJ2K tile encode job dimensions do not match CUDA tile");
    }

    let tile_is_unpadded = tile.width == tile.output_width && tile.height == tile.output_height;
    if !tile_is_unpadded {
        return Err("CUDA HTJ2K tile encode does not support input padding");
    }

    let sample_format_matches = job.num_components == format.components
        && job.bit_depth == format.bit_depth
        && !job.signed;
    if !sample_format_matches {
        return Err("CUDA HTJ2K tile encode job sample format does not match CUDA tile");
    }

    let deinterleave = || {
        context.j2k_deinterleave_strided_to_f32_resident(CudaJ2kStridedInterleavedPixels {
            buffer: tile.buffer,
            byte_offset: tile.byte_offset,
            width: tile.width,
            height: tile.height,
            pitch_bytes: tile.pitch_bytes,
            num_components: job.num_components,
            bit_depth: job.bit_depth,
            signed: job.signed,
        })
    };
    let (components, deinterleave_us) = time_cuda_stage(
        "j2k.htj2k.encode.tile.device_deinterleave",
        context,
        collect_profile,
        deinterleave,
    )
    .map_err(|_| "CUDA HTJ2K tile device deinterleave failed")?;

    cuda_encode_htj2k_resident_components_body(
        context,
        encode_resources,
        job,
        components,
        deinterleave_us,
        collect_profile,
    )
}
2736
#[cfg(feature = "cuda-runtime")]
/// Runs the HTJ2K tile encode pipeline on component planes that are already
/// resident on the device.
///
/// Stages, in order:
/// 1. optional forward colour transform (RCT when `job.reversible`, ICT
///    otherwise) when `job.use_mct` is set;
/// 2. per component: either the whole plane is encoded as a single LL subband
///    (zero decomposition levels) or a forward DWT (5/3 when reversible, 9/7
///    otherwise) is run and each resulting subband is quantized and HT-encoded;
/// 3. the per-component resolutions are interleaved and packetized.
///
/// `deinterleave_us` is the time the caller spent producing `components`; it
/// seeds the returned timings. Dispatch counters and stage timings are
/// accumulated and returned alongside the packetized tile bytes.
///
/// # Errors
///
/// Returns a static message when any CUDA stage fails or when the component
/// row offset overflows `u32`.
fn cuda_encode_htj2k_resident_components_body(
    context: &CudaContext,
    encode_resources: &CudaHtj2kEncodeResources,
    job: J2kHtj2kTileEncodeJob<'_>,
    mut components: CudaJ2kResidentComponents,
    deinterleave_us: u128,
    collect_profile: bool,
) -> core::result::Result<Option<CudaEncodedHtj2kTile>, &'static str> {
    let deinterleave_dispatches = components.execution().kernel_dispatches();
    let mut stats = CudaHtj2kTileEncodeStats {
        collect_profile,
        deinterleave_dispatches,
        timings: CudaEncodeStageTimings {
            deinterleave_us,
            ..CudaEncodeStageTimings::default()
        },
        ..CudaHtj2kTileEncodeStats::default()
    };
    let runtime = CudaHtj2kEncodeRuntime {
        context,
        resources: encode_resources,
    };

    // Stage 1: in-place colour transform on the resident planes.
    if job.use_mct {
        if job.reversible {
            let (execution, elapsed_us) = time_cuda_stage(
                "j2k.htj2k.encode.tile.rct",
                context,
                collect_profile,
                || context.j2k_forward_rct_resident(&mut components),
            )
            .map_err(|_| "CUDA HTJ2K tile RCT failed")?;
            stats.timings.mct_us = stats.timings.mct_us.saturating_add(elapsed_us);
            stats.forward_rct_dispatches = execution.kernel_dispatches();
        } else {
            let (execution, elapsed_us) = time_cuda_stage(
                "j2k.htj2k.encode.tile.ict",
                context,
                collect_profile,
                || context.j2k_forward_ict_resident(&mut components),
            )
            .map_err(|_| "CUDA HTJ2K tile ICT failed")?;
            stats.timings.mct_us = stats.timings.mct_us.saturating_add(elapsed_us);
            stats.forward_ict_dispatches = execution.kernel_dispatches();
        }
    }

    // Stage 2: encode every component into its list of resolutions.
    let skip_dwt = job.num_decomposition_levels == 0;
    let mut per_component_resolutions = Vec::with_capacity(usize::from(job.num_components));
    for component in 0..job.num_components {
        let resolutions = if skip_dwt {
            // Without a DWT the component is addressed as a `width x height`
            // region of the resident buffer starting at row `component * height`.
            let Some(y0) = u32::from(component).checked_mul(job.height) else {
                return Err("CUDA HTJ2K tile component offset overflow");
            };
            let plane = CudaTileSubbandRegion {
                x0: 0,
                y0,
                width: job.width,
                height: job.height,
                stride: job.width,
            };
            let subband = cuda_encode_tile_subband_region(
                runtime,
                components.buffer(),
                plane,
                job.quantization_steps[0],
                job,
                CudaTileSubbandKind::LowLow,
                &mut stats,
            )?;
            vec![CudaEncodedHtj2kResolution {
                subbands: vec![subband],
            }]
        } else if job.reversible {
            let (dwt, elapsed_us) = time_cuda_stage(
                "j2k.htj2k.encode.tile.dwt53",
                context,
                collect_profile,
                || {
                    context.j2k_forward_dwt53_resident_component(
                        &components,
                        component,
                        job.width,
                        job.height,
                        job.num_decomposition_levels,
                    )
                },
            )
            .map_err(|_| "CUDA HTJ2K tile DWT 5/3 failed")?;
            stats.forward_dwt53_dispatches = stats
                .forward_dwt53_dispatches
                .saturating_add(dwt.execution().kernel_dispatches());
            stats.timings.dwt_us = stats.timings.dwt_us.saturating_add(elapsed_us);
            cuda_encode_dwt_component_packets(
                runtime,
                job,
                dwt.buffer(),
                dwt.levels(),
                dwt.ll_dimensions(),
                &mut stats,
            )?
        } else {
            let (dwt, elapsed_us) = time_cuda_stage(
                "j2k.htj2k.encode.tile.dwt97",
                context,
                collect_profile,
                || {
                    context.j2k_forward_dwt97_resident_component(
                        &components,
                        component,
                        job.width,
                        job.height,
                        job.num_decomposition_levels,
                    )
                },
            )
            .map_err(|_| "CUDA HTJ2K tile DWT 9/7 failed")?;
            stats.forward_dwt97_dispatches = stats
                .forward_dwt97_dispatches
                .saturating_add(dwt.execution().kernel_dispatches());
            stats.timings.dwt_us = stats.timings.dwt_us.saturating_add(elapsed_us);
            cuda_encode_dwt_component_packets(
                runtime,
                job,
                dwt.buffer(),
                dwt.levels(),
                dwt.ll_dimensions(),
                &mut stats,
            )?
        };
        per_component_resolutions.push(resolutions);
    }

    // Stage 3: interleave the per-component resolutions and packetize.
    let ordered_resolutions =
        cuda_order_component_resolution_packets(per_component_resolutions, job.num_components)?;
    let (tile_data, packetization_dispatches, packetize_us) =
        cuda_packetize_tile_body(context, job, &ordered_resolutions, stats.ht_code_block_jobs)?;
    stats.timings.packetize_us = stats.timings.packetize_us.saturating_add(packetize_us);

    let CudaHtj2kTileEncodeStats {
        collect_profile: _,
        deinterleave_dispatches,
        forward_rct_dispatches,
        forward_ict_dispatches,
        forward_dwt53_dispatches,
        forward_dwt97_dispatches,
        quantize_jobs,
        quantize_dispatches,
        ht_code_block_dispatches,
        ht_code_block_jobs,
        timings,
    } = stats;
    Ok(Some(CudaEncodedHtj2kTile {
        tile_data,
        deinterleave_dispatches,
        forward_rct_dispatches,
        forward_ict_dispatches,
        forward_dwt53_dispatches,
        forward_dwt97_dispatches,
        quantize_jobs,
        quantize_dispatches,
        ht_code_block_dispatches,
        ht_code_block_jobs,
        packetization_dispatches,
        timings,
    }))
}
2894
#[cfg(feature = "cuda-runtime")]
/// Quantizes and HT-encodes every subband of one DWT-transformed component.
///
/// The result starts with a resolution holding only the LL subband, followed
/// by one resolution per entry of `levels` (visited in reverse order) holding
/// that level's HL, LH and HH subbands. Quantization steps are consumed in the
/// same order: index 0 for LL, then three consecutive entries per level.
///
/// Indexing `job.quantization_steps` relies on the job having been validated
/// to carry `1 + 3 * num_decomposition_levels` steps.
///
/// # Errors
///
/// Fails when `levels.len()` differs from `job.num_decomposition_levels`, or
/// when encoding any subband fails.
fn cuda_encode_dwt_component_packets(
    runtime: CudaHtj2kEncodeRuntime<'_>,
    job: J2kHtj2kTileEncodeJob<'_>,
    transformed: &CudaDeviceBuffer,
    levels: &[CudaDwt53LevelShape],
    ll_dimensions: (u32, u32),
    stats: &mut CudaHtj2kTileEncodeStats,
) -> core::result::Result<Vec<CudaEncodedHtj2kResolution>, &'static str> {
    if usize::from(job.num_decomposition_levels) != levels.len() {
        return Err("CUDA HTJ2K tile DWT level count mismatch");
    }

    let (ll_width, ll_height) = ll_dimensions;
    // Every subband is read with the row stride of the first level's width
    // (or the LL width when there are no levels).
    let stride = match levels.first() {
        Some(first_level) => first_level.width,
        None => ll_width,
    };

    // All subbands share the same runtime, source buffer, job and stats.
    let mut encode_region =
        |region: CudaTileSubbandRegion, step: (u16, u16), kind: CudaTileSubbandKind| {
            cuda_encode_tile_subband_region(runtime, transformed, region, step, job, kind, stats)
        };

    let mut resolutions = Vec::with_capacity(levels.len().saturating_add(1));

    let low_low = encode_region(
        CudaTileSubbandRegion {
            x0: 0,
            y0: 0,
            width: ll_width,
            height: ll_height,
            stride,
        },
        job.quantization_steps[0],
        CudaTileSubbandKind::LowLow,
    )?;
    resolutions.push(CudaEncodedHtj2kResolution {
        subbands: vec![low_low],
    });

    for (level_idx, level) in levels.iter().rev().enumerate() {
        let Some(step_base) = 1usize.checked_add(level_idx.saturating_mul(3)) else {
            return Err("CUDA HTJ2K tile quantization step index overflow");
        };

        // HL: right of the low-pass columns, within the low-pass rows.
        let high_low = encode_region(
            CudaTileSubbandRegion {
                x0: level.low_width,
                y0: 0,
                width: level.high_width,
                height: level.low_height,
                stride,
            },
            job.quantization_steps[step_base],
            CudaTileSubbandKind::HighLow,
        )?;
        // LH: below the low-pass rows, within the low-pass columns.
        let low_high = encode_region(
            CudaTileSubbandRegion {
                x0: 0,
                y0: level.low_height,
                width: level.low_width,
                height: level.high_height,
                stride,
            },
            job.quantization_steps[step_base + 1],
            CudaTileSubbandKind::LowHigh,
        )?;
        // HH: right of the low-pass columns and below the low-pass rows.
        let high_high = encode_region(
            CudaTileSubbandRegion {
                x0: level.low_width,
                y0: level.low_height,
                width: level.high_width,
                height: level.high_height,
                stride,
            },
            job.quantization_steps[step_base + 2],
            CudaTileSubbandKind::HighHigh,
        )?;

        resolutions.push(CudaEncodedHtj2kResolution {
            subbands: vec![high_low, low_high, high_high],
        });
    }

    Ok(resolutions)
}
2986
#[cfg(feature = "cuda-runtime")]
/// Quantizes one subband region of a device buffer and HT-encodes its
/// code-blocks.
///
/// A region with zero width or height yields an empty subband without any
/// device work and without touching `stats`. Otherwise the region is quantized
/// (timed as `j2k.htj2k.encode.tile.quantize`), split into code-block region
/// jobs, and encoded; dispatch counts, job counts and timings are added to
/// `stats`.
///
/// The code-block bit-plane count is `guard_bits + step_exponent - 1`,
/// computed with saturating `u8` arithmetic.
///
/// # Errors
///
/// Fails when the step exponent does not fit in `u8`, when building the
/// code-block region jobs fails, or when the quantize or encode call fails.
fn cuda_encode_tile_subband_region(
    runtime: CudaHtj2kEncodeRuntime<'_>,
    source: &CudaDeviceBuffer,
    region: CudaTileSubbandRegion,
    quantization_step: (u16, u16),
    job: J2kHtj2kTileEncodeJob<'_>,
    subband_kind: CudaTileSubbandKind,
    stats: &mut CudaHtj2kTileEncodeStats,
) -> core::result::Result<CudaEncodedHtj2kSubband, &'static str> {
    let CudaTileSubbandRegion {
        x0,
        y0,
        width,
        height,
        stride,
    } = region;

    if width == 0 || height == 0 {
        return Ok(CudaEncodedHtj2kSubband {
            code_blocks: Vec::new(),
            num_cbs_x: 0,
            num_cbs_y: 0,
        });
    }

    let (step_exponent, step_mantissa) = quantization_step;
    let Ok(exponent_bits) = u8::try_from(step_exponent) else {
        return Err("CUDA HTJ2K tile quantization exponent exceeds u8");
    };
    let total_bitplanes = job
        .guard_bits
        .saturating_add(exponent_bits)
        .saturating_sub(1);
    let range_bits = cuda_tile_subband_range_bits(job.bit_depth, subband_kind);

    // Quantize the region into its own device buffer.
    let (quantized, quantize_us) = time_cuda_stage(
        "j2k.htj2k.encode.tile.quantize",
        runtime.context,
        stats.collect_profile,
        || {
            runtime.context.j2k_quantize_subband_region_resident(
                source,
                CudaJ2kQuantizeSubbandRegionJob {
                    x0,
                    y0,
                    width,
                    height,
                    stride,
                    quantization: CudaJ2kQuantizeJob {
                        step_exponent,
                        step_mantissa,
                        range_bits,
                        reversible: job.reversible,
                    },
                },
            )
        },
    )
    .map_err(|_| "CUDA HTJ2K tile quantize failed")?;
    let quantize_kernel_dispatches = quantized.execution().kernel_dispatches();
    stats.quantize_jobs = stats.quantize_jobs.saturating_add(1);
    stats.quantize_dispatches = stats
        .quantize_dispatches
        .saturating_add(quantize_kernel_dispatches);
    stats.timings.quantize_us = stats.timings.quantize_us.saturating_add(quantize_us);

    // Split the region into code-blocks and encode them from the quantized buffer.
    let code_block_jobs = cuda_ht_region_jobs(
        width,
        height,
        job.code_block_width,
        job.code_block_height,
        total_bitplanes,
    )?;
    stats.ht_code_block_jobs = stats
        .ht_code_block_jobs
        .saturating_add(code_block_jobs.len());
    let encoded = runtime
        .context
        .encode_htj2k_codeblock_regions_resident_with_resources(
            quantized.buffer(),
            quantized.coefficient_count(),
            &code_block_jobs,
            runtime.resources,
        )
        .map_err(|_| "CUDA HTJ2K tile code-block encode failed")?;
    let encode_kernel_dispatches = encoded.execution().kernel_dispatches();
    let encode_us = encoded.stage_timings().ht_encode_us;
    stats.ht_code_block_dispatches = stats
        .ht_code_block_dispatches
        .saturating_add(encode_kernel_dispatches);
    stats.timings.ht_encode_us = stats.timings.ht_encode_us.saturating_add(encode_us);

    let code_blocks = encoded_ht_code_blocks_from_cuda(&encoded);
    Ok(CudaEncodedHtj2kSubband {
        code_blocks,
        num_cbs_x: width.div_ceil(job.code_block_width),
        num_cbs_y: height.div_ceil(job.code_block_height),
    })
}
3073
#[cfg(feature = "cuda-runtime")]
/// Returns the quantization range bits for a subband: the component bit depth
/// plus a per-orientation gain (0 for LL, 1 for HL and LH, 2 for HH),
/// saturating at `u8::MAX`.
fn cuda_tile_subband_range_bits(bit_depth: u8, subband_kind: CudaTileSubbandKind) -> u8 {
    match subband_kind {
        CudaTileSubbandKind::LowLow => bit_depth,
        CudaTileSubbandKind::HighLow | CudaTileSubbandKind::LowHigh => bit_depth.saturating_add(1),
        CudaTileSubbandKind::HighHigh => bit_depth.saturating_add(2),
    }
}
3083
#[cfg(feature = "cuda-runtime")]
/// Interleaves per-component resolution lists into resolution-major order:
/// resolution 0 of every component, then resolution 1 of every component,
/// and so on.
///
/// # Errors
///
/// Fails when the number of component lists differs from `num_components`, or
/// when the components do not all carry the same number of resolutions as the
/// first one.
fn cuda_order_component_resolution_packets(
    component_resolution_packets: Vec<Vec<CudaEncodedHtj2kResolution>>,
    num_components: u8,
) -> core::result::Result<Vec<CudaEncodedHtj2kResolution>, &'static str> {
    if usize::from(num_components) != component_resolution_packets.len() {
        return Err("CUDA HTJ2K tile component packet count mismatch");
    }

    // The first component decides how many resolutions every component must have.
    let resolution_count = match component_resolution_packets.first() {
        Some(first_component) => first_component.len(),
        None => 0,
    };
    let mut remaining: Vec<_> = component_resolution_packets
        .into_iter()
        .map(Vec::into_iter)
        .collect();
    let mut ordered = Vec::with_capacity(resolution_count.saturating_mul(remaining.len()));

    for _ in 0..resolution_count {
        for resolutions in remaining.iter_mut() {
            let Some(resolution) = resolutions.next() else {
                return Err("CUDA HTJ2K tile component resolution count mismatch");
            };
            ordered.push(resolution);
        }
    }

    // Any leftover resolution means a component had more than the first one.
    for resolutions in remaining.iter_mut() {
        if resolutions.next().is_some() {
            return Err("CUDA HTJ2K tile component resolution count mismatch");
        }
    }

    Ok(ordered)
}
3120
3121#[cfg(feature = "cuda-runtime")]
3122fn cuda_ht_region_jobs(
3123    width: u32,
3124    height: u32,
3125    code_block_width: u32,
3126    code_block_height: u32,
3127    total_bitplanes: u8,
3128) -> core::result::Result<Vec<CudaHtj2kEncodeCodeBlockRegionJob>, &'static str> {
3129    if code_block_width == 0 || code_block_height == 0 {
3130        return Err("CUDA HTJ2K encode job has invalid code-block dimensions");
3131    }
3132    if width == 0 || height == 0 {
3133        return Ok(Vec::new());
3134    }
3135
3136    let num_cbs_x = width.div_ceil(code_block_width);
3137    let num_cbs_y = height.div_ceil(code_block_height);
3138    let count = (num_cbs_x as usize)
3139        .checked_mul(num_cbs_y as usize)
3140        .ok_or("CUDA HTJ2K code-block count overflow")?;
3141    let mut cuda_jobs = Vec::with_capacity(count);
3142    for cby in 0..num_cbs_y {
3143        for cbx in 0..num_cbs_x {
3144            let x0 = cbx
3145                .checked_mul(code_block_width)
3146                .ok_or("CUDA HTJ2K code-block x offset overflow")?;
3147            let y0 = cby
3148                .checked_mul(code_block_height)
3149                .ok_or("CUDA HTJ2K code-block y offset overflow")?;
3150            let block_width = (x0 + code_block_width).min(width) - x0;
3151            let block_height = (y0 + code_block_height).min(height) - y0;
3152            let offset = (y0 as usize)
3153                .checked_mul(width as usize)
3154                .and_then(|row| row.checked_add(x0 as usize))
3155                .ok_or("CUDA HTJ2K code-block offset overflow")?;
3156            cuda_jobs.push(CudaHtj2kEncodeCodeBlockRegionJob {
3157                coefficient_offset: u32::try_from(offset)
3158                    .map_err(|_| "CUDA HTJ2K code-block offset exceeds u32")?,
3159                coefficient_stride: width,
3160                width: block_width,
3161                height: block_height,
3162                total_bitplanes,
3163                target_coding_passes: 1,
3164            });
3165        }
3166    }
3167    Ok(cuda_jobs)
3168}
3169
/// Packetizes the encoded HTJ2K code blocks of one tile through the CUDA context.
///
/// Builds a single-layer packetization job from `resolution_packets` (one
/// entry per packet), flattens it into the CUDA plan layout, and runs the
/// cleanup-packet packetization kernel with tag state.
///
/// Returns the packet bytes, the number of kernel dispatches reported by the
/// run, and the reported packetization time (`packetize_us`).
///
/// # Errors
///
/// Fails when descriptor construction or plan flattening fails, when a count
/// does not fit in `u32`, or when the CUDA packetization call reports an error.
#[cfg(feature = "cuda-runtime")]
fn cuda_packetize_tile_body(
    context: &CudaContext,
    job: J2kHtj2kTileEncodeJob<'_>,
    resolution_packets: &[CudaEncodedHtj2kResolution],
    code_block_count: usize,
) -> core::result::Result<(Vec<u8>, usize, u128), &'static str> {
    let packet_descriptors =
        cuda_tile_packet_descriptors(resolution_packets.len(), 1, job.num_components)?;

    // Borrow every encoded block into the packetization view; nothing is copied here.
    let mut resolutions: Vec<J2kPacketizationResolution<'_>> =
        Vec::with_capacity(resolution_packets.len());
    for encoded_resolution in resolution_packets {
        let subbands = encoded_resolution
            .subbands
            .iter()
            .map(|encoded_subband| J2kPacketizationSubband {
                code_blocks: encoded_subband
                    .code_blocks
                    .iter()
                    .map(|encoded_block| J2kPacketizationCodeBlock {
                        data: encoded_block.data.as_slice(),
                        ht_cleanup_length: encoded_block.cleanup_length,
                        ht_refinement_length: encoded_block.refinement_length,
                        num_coding_passes: encoded_block.num_coding_passes,
                        num_zero_bitplanes: encoded_block.num_zero_bitplanes,
                        previously_included: false,
                        l_block: 3,
                        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
                    })
                    .collect(),
                num_cbs_x: encoded_subband.num_cbs_x,
                num_cbs_y: encoded_subband.num_cbs_y,
            })
            .collect();
        resolutions.push(J2kPacketizationResolution { subbands });
    }

    let resolution_count = u32::try_from(resolutions.len())
        .map_err(|_| "CUDA HTJ2K tile resolution count exceeds u32")?;
    let code_block_count = u32::try_from(code_block_count)
        .map_err(|_| "CUDA HTJ2K tile code-block count exceeds u32")?;
    let plan = flatten_cuda_htj2k_packetization_job(J2kPacketizationEncodeJob {
        resolution_count,
        num_layers: 1,
        num_components: job.num_components,
        code_block_count,
        progression_order: job.progression_order,
        packet_descriptors: &packet_descriptors,
        resolutions: &resolutions,
    })?;

    let plan_packets = cuda_packetization_packets(&plan);
    let plan_subbands = cuda_packetization_subbands(&plan);
    let plan_blocks = cuda_packetization_blocks(&plan);
    let plan_tag_states = cuda_packetization_tag_states(&plan);
    let plan_tag_nodes = cuda_packetization_tag_nodes(&plan);
    let packetized = context
        .packetize_htj2k_cleanup_packets_with_tag_state(
            &plan.payload,
            &plan_packets,
            &plan_subbands,
            &plan_blocks,
            &plan_tag_states,
            &plan_tag_nodes,
        )
        .map_err(|_| "CUDA HTJ2K tile packetization failed")?;

    let body = packetized.data().to_vec();
    let dispatches = packetized.execution().kernel_dispatches();
    let packetize_us = packetized.stage_timings().packetize_us;
    Ok((body, dispatches, packetize_us))
}
3243
3244#[cfg(feature = "cuda-runtime")]
3245fn cuda_tile_packet_descriptors(
3246    packet_count: usize,
3247    num_layers: u8,
3248    num_components: u8,
3249) -> core::result::Result<Vec<J2kPacketizationPacketDescriptor>, &'static str> {
3250    if num_layers != 1 {
3251        return Err("CUDA HTJ2K tile encode currently prepares one packet layer");
3252    }
3253    let component_count = usize::from(num_components).max(1);
3254    (0..packet_count)
3255        .map(|packet_index| {
3256            Ok(J2kPacketizationPacketDescriptor {
3257                packet_index: u32::try_from(packet_index)
3258                    .map_err(|_| "CUDA HTJ2K tile packet index exceeds u32")?,
3259                state_index: u32::try_from(packet_index)
3260                    .map_err(|_| "CUDA HTJ2K tile packet state index exceeds u32")?,
3261                layer: 0,
3262                resolution: u32::try_from(packet_index / component_count)
3263                    .map_err(|_| "CUDA HTJ2K tile packet resolution exceeds u32")?,
3264                component: u8::try_from(packet_index % component_count)
3265                    .map_err(|_| "CUDA HTJ2K tile packet component exceeds u8")?,
3266                precinct: 0,
3267            })
3268        })
3269        .collect()
3270}
3271
/// Result of quantizing and HT-encoding one subband through CUDA.
#[cfg(feature = "cuda-runtime")]
struct CudaEncodedHtSubband {
    /// Kernel dispatches reported by the quantization step.
    quantize_dispatches: usize,
    /// Encoded code blocks returned by the CUDA HTJ2K region encoder.
    encode: j2k_cuda_runtime::CudaHtj2kEncodedCodeBlocks,
    /// Stage timings recorded for this subband.
    timings: CudaEncodeStageTimings,
}
3278
/// Quantizes and HT-encodes one subband supplied as host coefficients.
///
/// The coefficients are uploaded with `upload_f32_pinned`, quantized by the
/// resident quantize kernel (timed through `time_cuda_stage`), split into
/// per-code-block region jobs, and encoded with the shared encode resources.
///
/// # Errors
///
/// Fails when `width * height` overflows or does not match the coefficient
/// count, when a code-block dimension is zero, when region-job construction
/// fails, or when any CUDA call reports an error.
#[cfg(feature = "cuda-runtime")]
fn cuda_encode_ht_subband(
    context: &CudaContext,
    encode_resources: &CudaHtj2kEncodeResources,
    job: J2kHtSubbandEncodeJob<'_>,
    collect_profile: bool,
) -> core::result::Result<CudaEncodedHtSubband, &'static str> {
    let coefficient_count = job.coefficients.len();
    let Some(expected_len) = (job.width as usize).checked_mul(job.height as usize) else {
        return Err("CUDA HTJ2K subband encode dimensions are too large");
    };
    if coefficient_count != expected_len {
        return Err("CUDA HTJ2K subband encode job has invalid coefficient length");
    }
    if job.code_block_width == 0 || job.code_block_height == 0 {
        return Err("CUDA HTJ2K subband encode job has invalid code-block dimensions");
    }

    let uploaded = context
        .upload_f32_pinned(job.coefficients)
        .map_err(|_| "CUDA HTJ2K subband upload failed")?;

    let quantize = || {
        context.j2k_quantize_subband_resident(
            &uploaded,
            coefficient_count,
            CudaJ2kQuantizeJob {
                step_exponent: job.step_exponent,
                step_mantissa: job.step_mantissa,
                range_bits: job.range_bits,
                reversible: job.reversible,
            },
        )
    };
    let (quantized, quantize_us) = time_cuda_stage(
        "j2k.htj2k.encode.subband.quantize",
        context,
        collect_profile,
        quantize,
    )
    .map_err(|_| "CUDA quantize subband encode kernel failed")?;

    let region_jobs = cuda_ht_subband_region_jobs(job)?;
    let encoded = context
        .encode_htj2k_codeblock_regions_resident_with_resources(
            quantized.buffer(),
            quantized.coefficient_count(),
            &region_jobs,
            encode_resources,
        )
        .map_err(|_| "CUDA HTJ2K resident subband encode kernel failed")?;

    let quantize_dispatches = quantized.execution().kernel_dispatches();
    let timings = CudaEncodeStageTimings {
        quantize_us,
        ht_encode_us: encoded.stage_timings().ht_encode_us,
        ..CudaEncodeStageTimings::default()
    };
    Ok(CudaEncodedHtSubband {
        quantize_dispatches,
        encode: encoded,
        timings,
    })
}
3337
/// Expands a subband encode job into per-code-block region jobs.
///
/// Thin adapter that forwards the job's geometry and bit-plane count to
/// `cuda_ht_region_jobs`; errors are passed through unchanged.
#[cfg(feature = "cuda-runtime")]
fn cuda_ht_subband_region_jobs(
    job: J2kHtSubbandEncodeJob<'_>,
) -> core::result::Result<Vec<CudaHtj2kEncodeCodeBlockRegionJob>, &'static str> {
    let J2kHtSubbandEncodeJob {
        width,
        height,
        code_block_width,
        code_block_height,
        total_bitplanes,
        ..
    } = job;
    cuda_ht_region_jobs(
        width,
        height,
        code_block_width,
        code_block_height,
        total_bitplanes,
    )
}
3350
3351fn ht_subband_code_block_count(
3352    job: J2kHtSubbandEncodeJob<'_>,
3353) -> core::result::Result<usize, &'static str> {
3354    if job.code_block_width == 0 || job.code_block_height == 0 {
3355        return Err("CUDA HTJ2K subband encode job has invalid code-block dimensions");
3356    }
3357    let num_cbs_x = job.width.div_ceil(job.code_block_width);
3358    let num_cbs_y = job.height.div_ceil(job.code_block_height);
3359    (num_cbs_x as usize)
3360        .checked_mul(num_cbs_y as usize)
3361        .ok_or("CUDA HTJ2K subband code-block count overflow")
3362}
3363
/// Copies one CUDA-encoded HT code block into the adapter's owned representation.
///
/// The payload bytes are cloned into a new `Vec`; the remaining fields are
/// taken from the CUDA block's accessors.
#[cfg(feature = "cuda-runtime")]
fn encoded_ht_code_block_from_cuda(
    encoded: &j2k_cuda_runtime::CudaHtj2kEncodedCodeBlock,
) -> EncodedHtJ2kCodeBlock {
    let data = encoded.data().to_vec();
    let cleanup_length = encoded.cleanup_length();
    let refinement_length = encoded.refinement_length();
    let num_coding_passes = encoded.num_coding_passes();
    let num_zero_bitplanes = encoded.num_zero_bitplanes();
    EncodedHtJ2kCodeBlock {
        data,
        cleanup_length,
        refinement_length,
        num_coding_passes,
        num_zero_bitplanes,
    }
}
3376
/// Converts every CUDA-encoded HT code block of a batch, preserving order.
#[cfg(feature = "cuda-runtime")]
fn encoded_ht_code_blocks_from_cuda(
    encoded: &j2k_cuda_runtime::CudaHtj2kEncodedCodeBlocks,
) -> Vec<EncodedHtJ2kCodeBlock> {
    let cuda_blocks = encoded.code_blocks().iter();
    cuda_blocks
        .map(encoded_ht_code_block_from_cuda)
        .collect::<Vec<_>>()
}
3387
/// Bundles the native HT VLC/UVLC encode tables for the CUDA HTJ2K encoder.
#[cfg(feature = "cuda-runtime")]
fn cuda_htj2k_encode_tables() -> CudaHtj2kEncodeTables<'static> {
    let vlc_table0 = j2k_native::ht_vlc_encode_table0();
    let vlc_table1 = j2k_native::ht_vlc_encode_table1();
    let uvlc_table = j2k_native::ht_uvlc_encode_table_bytes();
    CudaHtj2kEncodeTables {
        vlc_table0,
        vlc_table1,
        uvlc_table,
    }
}
3396
/// Reshapes a packed CUDA forward 5/3 DWT result into per-subband vectors.
///
/// The transformed buffer is read as rows of `full_width` coefficients, where
/// `full_width` is the width of the first reported level (or the LL width when
/// there are no levels). The LL band is the top-left `ll_width` x `ll_height`
/// window. For each level shape the detail bands are read at:
///
/// - HL: origin `(low_width, 0)`, size `high_width` x `low_height`
/// - LH: origin `(0, low_height)`, size `low_width` x `high_height`
/// - HH: origin `(low_width, low_height)`, size `high_width` x `high_height`
///
/// The levels are returned in the reverse of the order reported by the CUDA
/// output.
///
/// # Errors
///
/// Fails when a row offset overflows or when a requested window extends past
/// the end of the transformed buffer.
#[cfg(feature = "cuda-runtime")]
fn cuda_dwt53_output_to_j2k(
    output: &CudaDwt53Output,
) -> core::result::Result<J2kForwardDwt53Output, &'static str> {
    let (ll_width, ll_height) = output.ll_dimensions();
    let transformed = output.transformed();
    let full_width = output
        .levels()
        .first()
        .map_or(ll_width, |level| level.width) as usize;

    let ll_row_len = ll_width as usize;
    // The capacity is only a hint; bounding it by the source length keeps a
    // malformed shape from requesting an oversized allocation.
    let ll_capacity = ll_row_len
        .saturating_mul(ll_height as usize)
        .min(transformed.len());
    let mut ll = Vec::with_capacity(ll_capacity);
    for y in 0..ll_height as usize {
        let row_start = y
            .checked_mul(full_width)
            .ok_or("CUDA DWT LL row offset overflow")?;
        // Checked slicing: report a short buffer as an error instead of panicking.
        let row = row_start
            .checked_add(ll_row_len)
            .and_then(|row_end| transformed.get(row_start..row_end))
            .ok_or("CUDA DWT LL row exceeds transformed buffer")?;
        ll.extend_from_slice(row);
    }

    let mut levels = Vec::with_capacity(output.levels().len());
    for shape in output.levels() {
        levels.push(J2kForwardDwt53Level {
            hl: extract_cuda_subband(
                transformed,
                full_width,
                shape.low_width,
                0,
                shape.high_width,
                shape.low_height,
            )?,
            lh: extract_cuda_subband(
                transformed,
                full_width,
                0,
                shape.low_height,
                shape.low_width,
                shape.high_height,
            )?,
            hh: extract_cuda_subband(
                transformed,
                full_width,
                shape.low_width,
                shape.low_height,
                shape.high_width,
                shape.high_height,
            )?,
            width: shape.width,
            height: shape.height,
            low_width: shape.low_width,
            low_height: shape.low_height,
            high_width: shape.high_width,
            high_height: shape.high_height,
        });
    }
    levels.reverse();

    Ok(J2kForwardDwt53Output {
        ll,
        ll_width,
        ll_height,
        levels,
    })
}
3459
/// Test-only entry point exposing the production 5/3 DWT reshape.
///
/// Delegates directly to `cuda_dwt53_output_to_j2k`, so the stage-parity test
/// crate can compare CUDA output against `forward_dwt53_reference` through the
/// same conversion the encoder uses (nested-band offsets plus the
/// finest→coarsest to coarsest→finest level reversal) rather than re-deriving
/// the geometry itself.
///
/// Hidden from the generated documentation (`#[doc(hidden)]`); it exists only
/// for that test crate.
#[cfg(feature = "cuda-runtime")]
#[doc(hidden)]
pub fn cuda_dwt53_output_to_j2k_for_test(
    output: &CudaDwt53Output,
) -> core::result::Result<J2kForwardDwt53Output, &'static str> {
    cuda_dwt53_output_to_j2k(output)
}
3475
/// Reshapes a packed CUDA forward 9/7 DWT result into per-subband vectors.
///
/// The transformed buffer is read as rows of `full_width` coefficients, where
/// `full_width` is the width of the first reported level (or the LL width when
/// there are no levels). The LL band is the top-left `ll_width` x `ll_height`
/// window. For each level shape the detail bands are read at:
///
/// - HL: origin `(low_width, 0)`, size `high_width` x `low_height`
/// - LH: origin `(0, low_height)`, size `low_width` x `high_height`
/// - HH: origin `(low_width, low_height)`, size `high_width` x `high_height`
///
/// The levels are returned in the reverse of the order reported by the CUDA
/// output.
///
/// # Errors
///
/// Fails when a row offset overflows or when a requested window extends past
/// the end of the transformed buffer.
#[cfg(feature = "cuda-runtime")]
fn cuda_dwt97_output_to_j2k(
    output: &CudaDwt97Output,
) -> core::result::Result<J2kForwardDwt97Output, &'static str> {
    let (ll_width, ll_height) = output.ll_dimensions();
    let transformed = output.transformed();
    let full_width = output
        .levels()
        .first()
        .map_or(ll_width, |level| level.width) as usize;

    let ll_row_len = ll_width as usize;
    // The capacity is only a hint; bounding it by the source length keeps a
    // malformed shape from requesting an oversized allocation.
    let ll_capacity = ll_row_len
        .saturating_mul(ll_height as usize)
        .min(transformed.len());
    let mut ll = Vec::with_capacity(ll_capacity);
    for y in 0..ll_height as usize {
        let row_start = y
            .checked_mul(full_width)
            .ok_or("CUDA DWT LL row offset overflow")?;
        // Checked slicing: report a short buffer as an error instead of panicking.
        let row = row_start
            .checked_add(ll_row_len)
            .and_then(|row_end| transformed.get(row_start..row_end))
            .ok_or("CUDA DWT LL row exceeds transformed buffer")?;
        ll.extend_from_slice(row);
    }

    let mut levels = Vec::with_capacity(output.levels().len());
    for shape in output.levels() {
        levels.push(J2kForwardDwt97Level {
            hl: extract_cuda_subband(
                transformed,
                full_width,
                shape.low_width,
                0,
                shape.high_width,
                shape.low_height,
            )?,
            lh: extract_cuda_subband(
                transformed,
                full_width,
                0,
                shape.low_height,
                shape.low_width,
                shape.high_height,
            )?,
            hh: extract_cuda_subband(
                transformed,
                full_width,
                shape.low_width,
                shape.low_height,
                shape.high_width,
                shape.high_height,
            )?,
            width: shape.width,
            height: shape.height,
            low_width: shape.low_width,
            low_height: shape.low_height,
            high_width: shape.high_width,
            high_height: shape.high_height,
        });
    }
    levels.reverse();

    Ok(J2kForwardDwt97Output {
        ll,
        ll_width,
        ll_height,
        levels,
    })
}
3538
3539#[cfg(feature = "cuda-runtime")]
/// Copies a `width` x `height` window out of a row-major coefficient buffer.
///
/// `transformed` is read as rows of `full_width` values; the window starts at
/// column `x0`, row `y0`. The rows are concatenated into the returned vector.
///
/// # Errors
///
/// Fails when a row offset overflows `usize` or when any requested row
/// extends past the end of `transformed`.
fn extract_cuda_subband(
    transformed: &[f32],
    full_width: usize,
    x0: u32,
    y0: u32,
    width: u32,
    height: u32,
) -> core::result::Result<Vec<f32>, &'static str> {
    let row_len = width as usize;
    // The capacity is only a hint; bounding it by the source length keeps a
    // malformed shape from requesting an oversized allocation.
    let capacity = row_len
        .saturating_mul(height as usize)
        .min(transformed.len());
    let mut out = Vec::with_capacity(capacity);
    for y in 0..height as usize {
        let row_start = (y0 as usize)
            .checked_add(y)
            .and_then(|row| row.checked_mul(full_width))
            .and_then(|row| row.checked_add(x0 as usize))
            .ok_or("CUDA DWT subband offset overflow")?;
        // Checked slicing: report a short buffer as an error instead of panicking.
        let row = row_start
            .checked_add(row_len)
            .and_then(|row_end| transformed.get(row_start..row_end))
            .ok_or("CUDA DWT subband exceeds transformed buffer")?;
        out.extend_from_slice(row);
    }
    Ok(out)
}
3559
3560#[cfg(test)]
3561mod tests {
3562    #[cfg(feature = "cuda-runtime")]
3563    use super::{cuda_htj2k_encode_tables, cuda_runtime_required};
3564    use super::{
3565        encode_j2k_lossless_with_cuda, encode_j2k_lossless_with_cuda_and_profile,
3566        flatten_cuda_htj2k_packetization_job, CudaEncodeStageAccelerator,
3567        CudaHtj2kPacketizationPlanTagNodeState,
3568    };
3569    use j2k::adapter::encode_stage::NativeEncodeStageAdapter;
3570    #[cfg(feature = "cuda-runtime")]
3571    use j2k::adapter::encode_stage::{J2kDeinterleaveToF32Job, J2kHtCodeBlockEncodeJob};
3572    use j2k::adapter::encode_stage::{
3573        J2kEncodeStageAccelerator, J2kHtSubbandEncodeJob, J2kPacketizationBlockCodingMode,
3574        J2kPacketizationCodeBlock, J2kPacketizationEncodeJob, J2kPacketizationPacketDescriptor,
3575        J2kPacketizationProgressionOrder, J2kPacketizationResolution, J2kPacketizationSubband,
3576        J2kQuantizeSubbandJob,
3577    };
3578    #[cfg(feature = "cuda-runtime")]
3579    use j2k::{encode_j2k_lossy_with_accelerator, J2kLossyEncodeOptions, J2kLossySamples};
3580    use j2k::{
3581        EncodeBackendPreference, J2kBlockCodingMode, J2kEncodeValidation, J2kLosslessEncodeOptions,
3582        J2kLosslessSamples,
3583    };
3584    #[cfg(feature = "cuda-runtime")]
3585    use j2k_core::BackendKind;
3586    use j2k_core::CodecError;
3587    #[cfg(feature = "cuda-runtime")]
3588    use j2k_cuda_runtime::{
3589        CudaContext, CudaHtj2kEncodeCodeBlockJob, CudaHtj2kEncodeCodeBlockRegionJob,
3590        CudaJ2kQuantizeJob,
3591    };
3592    use j2k_native::{
3593        encode_with_accelerator as encode_with_native_accelerator, DecodeSettings, EncodeOptions,
3594        Image,
3595    };
3596
3597    fn assert_strict_cuda_classic_tier1_error<E: CodecError + ?Sized>(err: &E, context: &str) {
3598        assert!(err.is_unsupported());
3599        let message = err.to_string();
3600        assert!(
3601            message.contains("tier1_code_block") || message.contains("deinterleave"),
3602            "expected {context} error to mention either the missing classic tier-1 stage or unavailable CUDA deinterleave, got {message}"
3603        );
3604    }
3605
3606    #[allow(clippy::too_many_arguments)]
3607    fn encode_with_cuda_test_accelerator(
3608        pixels: &[u8],
3609        width: u32,
3610        height: u32,
3611        components: u8,
3612        bit_depth: u8,
3613        signed: bool,
3614        options: &EncodeOptions,
3615        accelerator: &mut CudaEncodeStageAccelerator,
3616    ) -> core::result::Result<Vec<u8>, &'static str> {
3617        let mut bridge = NativeEncodeStageAdapter::new(accelerator);
3618        encode_with_native_accelerator(
3619            pixels,
3620            width,
3621            height,
3622            components,
3623            bit_depth,
3624            signed,
3625            options,
3626            &mut bridge,
3627        )
3628    }
3629
3630    #[test]
3631    fn cuda_lossless_encode_auto_errors_for_unsupported_classic_tier1() {
3632        let pixels: Vec<u8> = (0u32..128 * 128)
3633            .map(|value| u8::try_from((value * 17 + 5) & 0xFF).expect("masked value fits in u8"))
3634            .collect();
3635        let samples =
3636            J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
3637        let options = J2kLosslessEncodeOptions::default()
3638            .with_backend(EncodeBackendPreference::Auto)
3639            .with_block_coding_mode(J2kBlockCodingMode::Classic)
3640            .with_max_decomposition_levels(Some(0))
3641            .with_validation(J2kEncodeValidation::CpuRoundTrip);
3642
3643        let err = encode_j2k_lossless_with_cuda(samples, &options)
3644            .expect_err("CUDA-named encode must not silently return CPU fallback");
3645
3646        assert_strict_cuda_classic_tier1_error(&err, "strict CUDA encode");
3647    }
3648
3649    #[test]
3650    fn cuda_lossless_encode_profile_auto_errors_for_unsupported_classic_tier1() {
3651        let pixels: Vec<u8> = (0u32..128 * 128)
3652            .map(|value| u8::try_from((value * 19 + 7) & 0xFF).expect("masked value fits in u8"))
3653            .collect();
3654        let samples =
3655            J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
3656        let options = J2kLosslessEncodeOptions::default()
3657            .with_backend(EncodeBackendPreference::Auto)
3658            .with_block_coding_mode(J2kBlockCodingMode::Classic)
3659            .with_max_decomposition_levels(Some(0))
3660            .with_validation(J2kEncodeValidation::External);
3661
3662        let err = encode_j2k_lossless_with_cuda_and_profile(samples, &options)
3663            .expect_err("profiled CUDA encode must not silently return CPU fallback");
3664
3665        assert_strict_cuda_classic_tier1_error(&err, "profiled strict CUDA encode");
3666    }
3667
3668    #[test]
3669    fn cuda_lossless_encode_require_device_errors_for_unsupported_classic_tier1() {
3670        let pixels: Vec<u8> = (0u32..128 * 128)
3671            .map(|value| u8::try_from((value * 29 + 11) & 0xFF).expect("masked value fits in u8"))
3672            .collect();
3673        let samples =
3674            J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
3675        let options = J2kLosslessEncodeOptions::default()
3676            .with_backend(EncodeBackendPreference::RequireDevice)
3677            .with_block_coding_mode(J2kBlockCodingMode::Classic)
3678            .with_max_decomposition_levels(Some(0))
3679            .with_validation(J2kEncodeValidation::External);
3680
3681        let err = encode_j2k_lossless_with_cuda(samples, &options)
3682            .expect_err("strict CUDA encode must not silently fall back to CPU");
3683
3684        assert_strict_cuda_classic_tier1_error(&err, "strict CUDA encode");
3685    }
3686
3687    #[test]
3688    fn cuda_packetization_flatten_accepts_cleanup_only_single_block_packet() {
3689        let payload = [0x12, 0x34, 0x56, 0x78];
3690        let code_block = J2kPacketizationCodeBlock {
3691            data: &payload,
3692            ht_cleanup_length: 0,
3693            ht_refinement_length: 0,
3694            num_coding_passes: 1,
3695            num_zero_bitplanes: 2,
3696            previously_included: false,
3697            l_block: 3,
3698            block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3699        };
3700        let subband = J2kPacketizationSubband {
3701            code_blocks: vec![code_block],
3702            num_cbs_x: 1,
3703            num_cbs_y: 1,
3704        };
3705        let resolution = J2kPacketizationResolution {
3706            subbands: vec![subband],
3707        };
3708        let descriptor = J2kPacketizationPacketDescriptor {
3709            packet_index: 0,
3710            state_index: 0,
3711            layer: 0,
3712            resolution: 0,
3713            component: 0,
3714            precinct: 0,
3715        };
3716        let job = J2kPacketizationEncodeJob {
3717            resolution_count: 1,
3718            num_layers: 1,
3719            num_components: 1,
3720            code_block_count: 1,
3721            progression_order: J2kPacketizationProgressionOrder::Lrcp,
3722            packet_descriptors: &[descriptor],
3723            resolutions: &[resolution],
3724        };
3725
3726        let plan = flatten_cuda_htj2k_packetization_job(job).expect("supported CUDA packetization");
3727
3728        assert_eq!(plan.payload, payload);
3729        assert_eq!(plan.packets.len(), 1);
3730        assert_eq!(plan.subbands.len(), 1);
3731        assert_eq!(plan.blocks.len(), 1);
3732        assert_eq!(plan.packets[0].block_start, 0);
3733        assert_eq!(plan.packets[0].block_count, 1);
3734        assert_eq!(plan.packets[0].subband_start, 0);
3735        assert_eq!(plan.packets[0].subband_count, 1);
3736        assert_eq!(plan.subbands[0].block_start, 0);
3737        assert_eq!(plan.subbands[0].block_count, 1);
3738        let payload_len = u32::try_from(payload.len()).expect("test payload length fits in u32");
3739        assert!(plan.packets[0].output_capacity >= payload_len + 256);
3740        assert_eq!(plan.blocks[0].data_offset, 0);
3741        assert_eq!(plan.blocks[0].data_len, payload_len);
3742        assert_eq!(plan.blocks[0].num_coding_passes, 1);
3743        assert_eq!(plan.blocks[0].num_zero_bitplanes, 2);
3744    }
3745
3746    #[test]
3747    fn cuda_packetization_flatten_accepts_cleanup_only_multi_block_packet() {
3748        let payloads = vec![
3749            vec![0x10, 0x11, 0x12],
3750            vec![0x20, 0x21],
3751            vec![0x30, 0x31, 0x32, 0x33],
3752            vec![0x40],
3753        ];
3754        let code_blocks = payloads
3755            .iter()
3756            .enumerate()
3757            .map(|(idx, payload)| J2kPacketizationCodeBlock {
3758                data: payload.as_slice(),
3759                ht_cleanup_length: 0,
3760                ht_refinement_length: 0,
3761                num_coding_passes: 1,
3762                num_zero_bitplanes: u8::try_from(idx + 1).expect("test zbp fits in u8"),
3763                previously_included: false,
3764                l_block: 3,
3765                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
3766            })
3767            .collect();
3768        let subband = J2kPacketizationSubband {
3769            code_blocks,
3770            num_cbs_x: 2,
3771            num_cbs_y: 2,
3772        };
3773        let resolution = J2kPacketizationResolution {
3774            subbands: vec![subband],
3775        };
3776        let descriptor = J2kPacketizationPacketDescriptor {
3777            packet_index: 0,
3778            state_index: 0,
3779            layer: 0,
3780            resolution: 0,
3781            component: 0,
3782            precinct: 0,
3783        };
3784        let job = J2kPacketizationEncodeJob {
3785            resolution_count: 1,
3786            num_layers: 1,
3787            num_components: 1,
3788            code_block_count: 4,
3789            progression_order: J2kPacketizationProgressionOrder::Lrcp,
3790            packet_descriptors: &[descriptor],
3791            resolutions: &[resolution],
3792        };
3793
3794        let plan =
3795            flatten_cuda_htj2k_packetization_job(job).expect("multi-block CUDA packetization");
3796
3797        assert_eq!(plan.packets.len(), 1);
3798        assert_eq!(plan.subbands.len(), 1);
3799        assert_eq!(plan.blocks.len(), 4);
3800        assert_eq!(plan.packets[0].block_start, 0);
3801        assert_eq!(plan.packets[0].block_count, 4);
3802        assert_eq!(plan.packets[0].subband_start, 0);
3803        assert_eq!(plan.packets[0].subband_count, 1);
3804        assert_eq!(plan.subbands[0].block_start, 0);
3805        assert_eq!(plan.subbands[0].block_count, 4);
3806        assert_eq!(plan.subbands[0].num_cbs_x, 2);
3807        assert_eq!(plan.subbands[0].num_cbs_y, 2);
3808        assert_eq!(
3809            plan.payload,
3810            payloads.into_iter().flatten().collect::<Vec<_>>()
3811        );
3812        assert_eq!(plan.blocks[2].num_zero_bitplanes, 3);
3813    }
3814
/// A single HT code block that declares both a cleanup and a refinement
/// segment must flatten successfully, keeping the full payload and the
/// declared pass count in the resulting plan block.
#[test]
fn cuda_packetization_flatten_accepts_ht_refinement_pass_packet() {
    // Five bytes in total: the block below declares 3 cleanup + 2 refinement bytes.
    let block_bytes = [0x12, 0x34, 0x56, 0x78, 0x9a];
    let resolutions = [J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![J2kPacketizationCodeBlock {
                data: &block_bytes,
                ht_cleanup_length: 3,
                ht_refinement_length: 2,
                num_coding_passes: 3,
                num_zero_bitplanes: 2,
                previously_included: false,
                l_block: 3,
                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
            }],
            num_cbs_x: 1,
            num_cbs_y: 1,
        }],
    }];
    let descriptors = [J2kPacketizationPacketDescriptor {
        packet_index: 0,
        state_index: 0,
        layer: 0,
        resolution: 0,
        component: 0,
        precinct: 0,
    }];
    let job = J2kPacketizationEncodeJob {
        resolution_count: 1,
        num_layers: 1,
        num_components: 1,
        code_block_count: 1,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &descriptors,
        resolutions: &resolutions,
    };

    let flattened =
        flatten_cuda_htj2k_packetization_job(job).expect("HT refinement packetization");

    assert_eq!(flattened.payload, block_bytes);
    assert_eq!(flattened.blocks.len(), 1);

    let only_block = &flattened.blocks[0];
    assert_eq!(only_block.num_coding_passes, 3);
    assert_eq!(
        only_block.data_len,
        u32::try_from(block_bytes.len()).expect("test payload length fits in u32")
    );
}
3864
/// `cuda_ht_segment_lengths` must return the overflow error for a block that
/// declares a `u32::MAX` cleanup length together with a non-zero refinement
/// length.
#[test]
fn cuda_packetization_rejects_overflowing_ht_refinement_lengths() {
    let single_byte = [0x12];
    let overflowing_block = J2kPacketizationCodeBlock {
        data: &single_byte,
        // u32::MAX + 1 is not representable in u32.
        ht_cleanup_length: u32::MAX,
        ht_refinement_length: 1,
        num_coding_passes: 3,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };

    let message = super::cuda_ht_segment_lengths(&overflowing_block)
        .expect_err("overflowing CUDA HT segment lengths rejected");

    assert_eq!(
        message,
        "multi-pass HTJ2K packet contribution length overflow"
    );
}
3884
/// Flattening must fail with the "exceeds JPEG 2000 bounds" error when a code
/// block declares 165 coding passes.
#[test]
fn cuda_packetization_flatten_rejects_out_of_range_ht_pass_count() {
    let block_bytes = [0u8; 1];
    let resolutions = [J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![J2kPacketizationCodeBlock {
                data: &block_bytes,
                ht_cleanup_length: 0,
                ht_refinement_length: 0,
                // The pass count this test expects the flattener to reject.
                num_coding_passes: 165,
                num_zero_bitplanes: 2,
                previously_included: false,
                l_block: 3,
                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
            }],
            num_cbs_x: 1,
            num_cbs_y: 1,
        }],
    }];
    let descriptors = [J2kPacketizationPacketDescriptor {
        packet_index: 0,
        state_index: 0,
        layer: 0,
        resolution: 0,
        component: 0,
        precinct: 0,
    }];
    let job = J2kPacketizationEncodeJob {
        resolution_count: 1,
        num_layers: 1,
        num_components: 1,
        code_block_count: 1,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &descriptors,
        resolutions: &resolutions,
    };

    let message = flatten_cuda_htj2k_packetization_job(job)
        .expect_err("invalid HT pass count must be rejected before CUDA launch");

    assert_eq!(
        message,
        "CUDA HTJ2K packetization coding pass count exceeds JPEG 2000 bounds"
    );
}
3932
/// Two packets that share `state_index` 0 (layers 0 and 1) must flatten into a
/// plan where the second block is marked as previously included and sees the
/// L-block value updated by the first layer's contribution.
#[test]
fn cuda_packetization_flatten_accepts_previously_included_second_layer_packet() {
    let layer0_bytes = [0x11u8; 20];
    let layer1_bytes = [0x22u8; 5];

    // Both inputs claim `previously_included: false`; the assertions below
    // check the state recorded in the flattened plan instead.
    let layer0_resolution = J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![J2kPacketizationCodeBlock {
                data: &layer0_bytes,
                ht_cleanup_length: 0,
                ht_refinement_length: 0,
                num_coding_passes: 1,
                num_zero_bitplanes: 2,
                previously_included: false,
                l_block: 3,
                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
            }],
            num_cbs_x: 1,
            num_cbs_y: 1,
        }],
    };
    let layer1_resolution = J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![J2kPacketizationCodeBlock {
                data: &layer1_bytes,
                ht_cleanup_length: 0,
                ht_refinement_length: 0,
                num_coding_passes: 1,
                num_zero_bitplanes: 2,
                previously_included: false,
                l_block: 3,
                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
            }],
            num_cbs_x: 1,
            num_cbs_y: 1,
        }],
    };

    let packet_descriptors = [
        J2kPacketizationPacketDescriptor {
            packet_index: 0,
            state_index: 0,
            layer: 0,
            resolution: 0,
            component: 0,
            precinct: 0,
        },
        J2kPacketizationPacketDescriptor {
            packet_index: 1,
            state_index: 0,
            layer: 1,
            resolution: 0,
            component: 0,
            precinct: 0,
        },
    ];
    let packet_resolutions = [layer0_resolution, layer1_resolution];
    let job = J2kPacketizationEncodeJob {
        resolution_count: 2,
        num_layers: 2,
        num_components: 1,
        code_block_count: 2,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &packet_descriptors,
        resolutions: &packet_resolutions,
    };

    let flattened =
        flatten_cuda_htj2k_packetization_job(job).expect("stateful CUDA packetization plan");

    // Payloads are concatenated in packet order.
    assert_eq!(
        flattened.payload,
        [layer0_bytes.as_slice(), layer1_bytes.as_slice()].concat()
    );
    assert_eq!(flattened.packets.len(), 2);
    assert_eq!(flattened.blocks.len(), 2);
    assert_eq!(flattened.packets[0].layer, 0);
    assert_eq!(flattened.packets[1].layer, 1);
    assert_eq!(flattened.blocks[0].l_block, 3);
    assert_eq!(flattened.blocks[0].previously_included, 0);
    assert_eq!(flattened.blocks[1].previously_included, 1);
    assert_eq!(flattened.blocks[0].inclusion_layer, 0);
    assert_eq!(flattened.blocks[1].inclusion_layer, 0);
    assert_eq!(
        flattened.blocks[1].l_block, 5,
        "first layer length must update L-block for later packet state"
    );
}
4021
/// A code block that contributes nothing in layer 0 (zero passes, empty data)
/// and first contributes in layer 1 must flatten with `inclusion_layer == 1`
/// on both plan blocks and without being flagged as previously included.
#[test]
fn cuda_packetization_flatten_accepts_deferred_first_inclusion_second_layer_packet() {
    let layer1_bytes = [0x44u8; 5];

    // Layer 0: empty contribution.
    let layer0_resolution = J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![J2kPacketizationCodeBlock {
                data: &[],
                ht_cleanup_length: 0,
                ht_refinement_length: 0,
                num_coding_passes: 0,
                num_zero_bitplanes: 2,
                previously_included: false,
                l_block: 3,
                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
            }],
            num_cbs_x: 1,
            num_cbs_y: 1,
        }],
    };
    // Layer 1: the first (and only) bytes for this block.
    let layer1_resolution = J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![J2kPacketizationCodeBlock {
                data: &layer1_bytes,
                ht_cleanup_length: 0,
                ht_refinement_length: 0,
                num_coding_passes: 1,
                num_zero_bitplanes: 2,
                previously_included: false,
                l_block: 3,
                block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
            }],
            num_cbs_x: 1,
            num_cbs_y: 1,
        }],
    };

    let packet_descriptors = [
        J2kPacketizationPacketDescriptor {
            packet_index: 0,
            state_index: 0,
            layer: 0,
            resolution: 0,
            component: 0,
            precinct: 0,
        },
        J2kPacketizationPacketDescriptor {
            packet_index: 1,
            state_index: 0,
            layer: 1,
            resolution: 0,
            component: 0,
            precinct: 0,
        },
    ];
    let packet_resolutions = [layer0_resolution, layer1_resolution];
    let job = J2kPacketizationEncodeJob {
        resolution_count: 2,
        num_layers: 2,
        num_components: 1,
        code_block_count: 2,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &packet_descriptors,
        resolutions: &packet_resolutions,
    };

    let flattened =
        flatten_cuda_htj2k_packetization_job(job).expect("deferred first inclusion plan");

    assert_eq!(flattened.payload, layer1_bytes);
    assert_eq!(flattened.packets.len(), 2);
    assert_eq!(flattened.blocks.len(), 2);
    assert_eq!(flattened.packets[0].layer, 0);
    assert_eq!(flattened.packets[1].layer, 1);
    assert_eq!(flattened.blocks[0].previously_included, 0);
    assert_eq!(flattened.blocks[1].previously_included, 0);
    assert_eq!(flattened.blocks[0].inclusion_layer, 1);
    assert_eq!(flattened.blocks[1].inclusion_layer, 1);
}
4101
/// Two code blocks share one subband: the first contributes in layer 0 only,
/// the second contributes in layer 1 only. Flattening must succeed and expose
/// the per-packet inclusion state plus the tag-tree node snapshots asserted
/// below.
#[test]
fn cuda_packetization_flatten_accepts_deferred_first_inclusion_after_non_empty_packet() {
    let layer0_bytes = [0x11u8; 3];
    let layer1_bytes = [0x22u8; 5];

    // Layer 0: block 0 carries data, block 1 is empty.
    let layer0_contributing = J2kPacketizationCodeBlock {
        data: &layer0_bytes,
        ht_cleanup_length: 0,
        ht_refinement_length: 0,
        num_coding_passes: 1,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };
    let layer0_empty = J2kPacketizationCodeBlock {
        data: &[],
        ht_cleanup_length: 0,
        ht_refinement_length: 0,
        num_coding_passes: 0,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };
    // Layer 1: block 0 is empty, block 1 carries data.
    let layer1_empty = J2kPacketizationCodeBlock {
        data: &[],
        ht_cleanup_length: 0,
        ht_refinement_length: 0,
        num_coding_passes: 0,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };
    let layer1_contributing = J2kPacketizationCodeBlock {
        data: &layer1_bytes,
        ht_cleanup_length: 0,
        ht_refinement_length: 0,
        num_coding_passes: 1,
        num_zero_bitplanes: 2,
        previously_included: false,
        l_block: 3,
        block_coding_mode: J2kPacketizationBlockCodingMode::HighThroughput,
    };

    let layer0_resolution = J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![layer0_contributing, layer0_empty],
            num_cbs_x: 2,
            num_cbs_y: 1,
        }],
    };
    let layer1_resolution = J2kPacketizationResolution {
        subbands: vec![J2kPacketizationSubband {
            code_blocks: vec![layer1_empty, layer1_contributing],
            num_cbs_x: 2,
            num_cbs_y: 1,
        }],
    };

    let packet_descriptors = [
        J2kPacketizationPacketDescriptor {
            packet_index: 0,
            state_index: 0,
            layer: 0,
            resolution: 0,
            component: 0,
            precinct: 0,
        },
        J2kPacketizationPacketDescriptor {
            packet_index: 1,
            state_index: 0,
            layer: 1,
            resolution: 0,
            component: 0,
            precinct: 0,
        },
    ];
    let packet_resolutions = [layer0_resolution, layer1_resolution];
    let job = J2kPacketizationEncodeJob {
        resolution_count: 2,
        num_layers: 2,
        num_components: 1,
        code_block_count: 4,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &packet_descriptors,
        resolutions: &packet_resolutions,
    };

    let flattened = flatten_cuda_htj2k_packetization_job(job)
        .expect("persistent tag-tree state is flattened for CUDA packetization");

    assert_eq!(
        flattened.payload,
        [layer0_bytes.as_slice(), layer1_bytes.as_slice()].concat()
    );
    assert_eq!(flattened.packets.len(), 2);
    assert_eq!(flattened.blocks.len(), 4);

    // Plan blocks 0-1 belong to the first packet, 2-3 to the second.
    assert_eq!(flattened.blocks[0].previously_included, 0);
    assert_eq!(flattened.blocks[1].previously_included, 0);
    assert_eq!(flattened.blocks[2].previously_included, 1);
    assert_eq!(flattened.blocks[3].previously_included, 0);
    assert_eq!(flattened.blocks[0].inclusion_layer, 0);
    assert_eq!(flattened.blocks[1].inclusion_layer, 1);
    assert_eq!(flattened.blocks[2].inclusion_layer, 0);
    assert_eq!(flattened.blocks[3].inclusion_layer, 1);

    assert_eq!(flattened.tag_states.len(), 2);
    assert_eq!(flattened.tag_nodes.len(), 12);
    assert_eq!(flattened.tag_states[1].inclusion_node_start, 6);
    assert_eq!(flattened.tag_states[1].zero_bitplane_node_start, 9);

    // Nodes starting at the second packet's `inclusion_node_start`.
    let expected_inclusion_nodes = [
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 0,
            known: 1,
        },
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 1,
            known: 0,
        },
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 0,
            known: 1,
        },
    ];
    assert_eq!(&flattened.tag_nodes[6..9], &expected_inclusion_nodes);

    // Nodes starting at the second packet's `zero_bitplane_node_start`.
    let expected_zero_bitplane_nodes = [
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 2,
            known: 1,
        },
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 0,
            known: 0,
        },
        CudaHtj2kPacketizationPlanTagNodeState {
            current: 2,
            known: 1,
        },
    ];
    assert_eq!(&flattened.tag_nodes[9..12], &expected_zero_bitplane_nodes);
}
4247
/// Strict (`RequireDevice`) high-throughput lossless encode of a 64-sample
/// gray8 input with zero decomposition levels must report the CUDA backend and
/// decode back to the original samples.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_require_device_dispatches_cleanup_packetization_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Deterministic pseudo-pattern, masked into the u8 range.
    let pixels: Vec<u8> = (0u16..8 * 8)
        .map(|index| {
            let masked = (index * 31 + 7) & 0xFF;
            u8::try_from(masked).expect("masked value fits in u8")
        })
        .collect();
    let samples =
        J2kLosslessSamples::new(&pixels, 8, 8, 1, 8, false).expect("valid gray8 samples");
    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(0))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);

    let encoded = encode_j2k_lossless_with_cuda(samples, &options)
        .expect("strict CUDA single-pass HT encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);
}
4277
/// The deinterleave hook must dispatch once and return one plane per
/// component, with each unsigned 8-bit sample shifted down by 128.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_deinterleave_stage_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Two interleaved 3-component pixels: (0, 128, 255) and (64, 32, 16).
    let interleaved = [0u8, 128, 255, 64, 32, 16];
    let job = J2kDeinterleaveToF32Job {
        pixels: &interleaved,
        num_pixels: 2,
        num_components: 3,
        bit_depth: 8,
        signed: false,
    };

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let planes = accelerator
        .encode_deinterleave(job)
        .expect("CUDA deinterleave hook")
        .expect("CUDA deinterleave dispatch");

    assert_eq!(accelerator.deinterleave_dispatches(), 1);
    assert_eq!(
        planes,
        vec![vec![-128.0, -64.0], vec![0.0, -96.0], vec![127.0, -112.0]]
    );
}
4304
/// With both CPU preferences enabled, the HT-subband and quantize hooks must
/// decline (return `None`) while still incrementing the attempt counters and
/// leaving the dispatch report total at zero.
#[test]
fn prefer_cpu_ht_subband_declines_fused_subband_but_counts_attempts() {
    let mut accelerator = CudaEncodeStageAccelerator::default()
        .prefer_cpu_ht_subband(true)
        .prefer_cpu_quantize_subband(true);

    // First call: the fused HT-subband hook.
    let subband_coefficients = [0.0; 16];
    let subband_job = J2kHtSubbandEncodeJob {
        coefficients: &subband_coefficients,
        width: 4,
        height: 4,
        step_exponent: 8,
        step_mantissa: 0,
        range_bits: 8,
        reversible: false,
        code_block_width: 4,
        code_block_height: 4,
        total_bitplanes: 9,
    };
    let subband_output = accelerator
        .encode_ht_subband(subband_job)
        .expect("subband hook can decline");

    assert!(subband_output.is_none());
    assert_eq!(accelerator.ht_subband_attempts, 1);
    assert_eq!(accelerator.quantize_subband_attempts, 1);
    assert_eq!(accelerator.ht_code_block_attempts, 1);
    assert_eq!(accelerator.dispatch_report().total(), 0);

    // Second call: the standalone quantize hook bumps its counter again.
    let quantize_coefficients = [0.0; 16];
    let quantize_job = J2kQuantizeSubbandJob {
        coefficients: &quantize_coefficients,
        step_exponent: 8,
        step_mantissa: 0,
        range_bits: 8,
        reversible: false,
    };
    let quantize_output = accelerator
        .encode_quantize_subband(quantize_job)
        .expect("quantize hook can decline");

    assert!(quantize_output.is_none());
    assert_eq!(accelerator.quantize_subband_attempts, 2);
    assert_eq!(accelerator.dispatch_report().total(), 0);
}
4344
/// Strict (`RequireDevice`) high-throughput lossless encode of a 128*128-sample
/// gray8 input with zero decomposition levels must report the CUDA backend and
/// decode back to the original samples.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_require_device_dispatches_multi_block_cleanup_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Deterministic pseudo-pattern, masked into the u8 range.
    let pixels: Vec<u8> = (0u32..128 * 128)
        .map(|index| {
            let masked = (index * 19 + 23) & 0xFF;
            u8::try_from(masked).expect("masked value fits in u8")
        })
        .collect();
    let samples =
        J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(0))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);

    let encoded = encode_j2k_lossless_with_cuda(samples, &options)
        .expect("strict CUDA multi-block cleanup encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);
}
4373
/// Strict (`RequireDevice`) high-throughput lossless encode of a 128*128-sample
/// gray8 input with at most one decomposition level must report the CUDA
/// backend and decode back to the original samples.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_require_device_dispatches_dwt53_cleanup_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Deterministic pseudo-pattern, masked into the u8 range.
    let pixels: Vec<u8> = (0u32..128 * 128)
        .map(|index| {
            let masked = (index * 37 + 41) & 0xFF;
            u8::try_from(masked).expect("masked value fits in u8")
        })
        .collect();
    let samples =
        J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(1))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);

    let encoded = encode_j2k_lossless_with_cuda(samples, &options)
        .expect("strict CUDA DWT cleanup encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);
}
4402
/// The profiled strict CUDA lossless encode must round-trip the input and
/// return a report whose byte counts match the run, whose dispatch/block
/// counts are non-zero, and whose stage timings are non-zero except for
/// `mct_us`, which must be zero for this single-component input.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_profile_reports_resident_stage_timings_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Deterministic pseudo-pattern, masked into the u8 range.
    let pixels: Vec<u8> = (0u32..128 * 128)
        .map(|index| {
            let masked = (index * 43 + 29) & 0xFF;
            u8::try_from(masked).expect("masked value fits in u8")
        })
        .collect();
    let samples =
        J2kLosslessSamples::new(&pixels, 128, 128, 1, 8, false).expect("valid gray8 samples");
    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(1))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);

    let (encoded, profile_report) = encode_j2k_lossless_with_cuda_and_profile(samples, &options)
        .expect("strict CUDA profiled DWT cleanup encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Round trip.
    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);

    // Report bookkeeping.
    assert_eq!(profile_report.backend, BackendKind::Cuda);
    assert_eq!(profile_report.input_bytes, pixels.len());
    assert_eq!(profile_report.codestream_bytes, encoded.codestream.len());
    assert!(profile_report.dispatch_count > 0);
    assert!(profile_report.block_count > 0);

    // Per-stage timings.
    assert!(profile_report.deinterleave_us > 0);
    assert_eq!(profile_report.mct_us, 0);
    assert!(profile_report.dwt_us > 0);
    assert!(profile_report.quantize_us > 0);
    assert!(profile_report.ht_encode_us > 0);
    assert!(profile_report.packetize_us > 0);
    assert!(profile_report.total_us > 0);
}
4443
/// Strict (`RequireDevice`) high-throughput lossless encode of a
/// three-component rgb8 input (128*128*3 samples) with at most one
/// decomposition level must report the CUDA backend and decode back to the
/// original samples.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossless_encode_require_device_dispatches_rgb_rct_cleanup_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Deterministic pseudo-pattern, masked into the u8 range.
    let pixels: Vec<u8> = (0u32..128 * 128 * 3)
        .map(|index| {
            let masked = (index * 13 + 71) & 0xFF;
            u8::try_from(masked).expect("masked value fits in u8")
        })
        .collect();
    let samples =
        J2kLosslessSamples::new(&pixels, 128, 128, 3, 8, false).expect("valid rgb8 samples");
    let options = J2kLosslessEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(1))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);

    let encoded = encode_j2k_lossless_with_cuda(samples, &options)
        .expect("strict CUDA RGB cleanup encode should dispatch all required stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.data, pixels);
}
4472
/// Strict (`RequireDevice`) lossy HTJ2K encode through
/// `encode_j2k_lossy_with_accelerator` must produce a decodable 64x64
/// single-component codestream and record the per-stage dispatch counts
/// asserted below on the accelerator.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_lossy_htj2k_facade_require_device_dispatches_supported_stages_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Deterministic pseudo-pattern, masked into the u8 range.
    let pixels: Vec<u8> = (0u32..64 * 64)
        .map(|index| {
            let masked = (index * 41 + 17) & 0xFF;
            u8::try_from(masked).expect("masked value fits in u8")
        })
        .collect();
    let samples =
        J2kLossySamples::new(&pixels, 64, 64, 1, 8, false).expect("valid gray8 samples");
    let options = J2kLossyEncodeOptions::default()
        .with_backend(EncodeBackendPreference::RequireDevice)
        .with_block_coding_mode(J2kBlockCodingMode::HighThroughput)
        .with_max_decomposition_levels(Some(1))
        .with_validation(J2kEncodeValidation::CpuRoundTrip);

    let mut accelerator = CudaEncodeStageAccelerator::default();
    let encoded =
        encode_j2k_lossy_with_accelerator(samples, &options, BackendKind::Cuda, &mut accelerator)
            .expect("strict CUDA HTJ2K lossy facade encode should dispatch supported stages");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&encoded.codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Lossy path: only the geometry is checked, not the sample values.
    assert_eq!(encoded.backend, BackendKind::Cuda);
    assert_eq!(decoded.width, 64);
    assert_eq!(decoded.height, 64);
    assert_eq!(decoded.num_components, 1);

    // Stage dispatch bookkeeping on the accelerator.
    assert_eq!(accelerator.deinterleave_dispatches(), 1);
    assert!(accelerator.forward_dwt97_dispatches() > 0);
    assert_eq!(accelerator.quantize_subband_dispatches(), 4);
    assert_eq!(accelerator.ht_code_block_dispatches(), 4);
    assert_eq!(accelerator.packetization_dispatches(), 1);
}
4514
/// Encoding 192 interleaved bytes (8, 8, 3, 8 passed to the test encoder)
/// through the CUDA stage accelerator must yield a codestream that parses and
/// decodes with the expected geometry, and the accelerator must have recorded
/// the RCT, DWT 5/3, tier-1 and packetization attempts asserted below.
#[test]
fn cuda_encode_stage_accelerator_preserves_cpu_codestream_validity() {
    let pixels = (0u8..192).collect::<Vec<u8>>();
    let encode_options = EncodeOptions {
        reversible: true,
        num_decomposition_levels: 1,
        ..EncodeOptions::default()
    };
    let mut accelerator = CudaEncodeStageAccelerator::default();

    let codestream = encode_with_cuda_test_accelerator(
        &pixels,
        8,
        8,
        3,
        8,
        false,
        &encode_options,
        &mut accelerator,
    )
    .expect("encode with CUDA stage accelerator");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Decoded geometry.
    assert_eq!(decoded.width, 8);
    assert_eq!(decoded.height, 8);
    assert_eq!(decoded.num_components, 3);
    assert_eq!(decoded.bit_depth, 8);

    // Hook attempt counters.
    assert_eq!(accelerator.forward_rct_attempts(), 1);
    assert_eq!(accelerator.forward_dwt53_attempts(), 3);
    assert!(accelerator.tier1_code_block_attempts() > 0);
    assert_eq!(accelerator.packetization_attempts(), 1);
}
4550
/// An accelerator built with `for_auto_host_output` must decline the
/// packetization hook (returning `Ok(None)`) even for a job with no packet
/// descriptors and no resolutions, counting one attempt and zero dispatches.
#[test]
fn cuda_auto_host_output_declines_packetization_before_flattening() {
    let mut accelerator = CudaEncodeStageAccelerator::for_auto_host_output();

    // Three components declared, but no descriptors or resolutions supplied.
    let unflattenable_job = J2kPacketizationEncodeJob {
        resolution_count: 1,
        num_layers: 1,
        num_components: 3,
        code_block_count: 0,
        progression_order: J2kPacketizationProgressionOrder::Lrcp,
        packet_descriptors: &[],
        resolutions: &[],
    };

    let outcome =
        J2kEncodeStageAccelerator::encode_packetization(&mut accelerator, unflattenable_job)
            .expect("Auto host-output CUDA packetization should decline to CPU");

    assert!(outcome.is_none());
    assert_eq!(accelerator.packetization_attempts(), 1);
    assert_eq!(accelerator.packetization_dispatches(), 0);
}
4574
/// A reversible three-component encode with zero decomposition levels must
/// round-trip exactly and record exactly one forward RCT attempt and one
/// forward RCT dispatch on the accelerator.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_forward_rct_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Deterministic pseudo-pattern, masked into the u8 range.
    let pixels: Vec<u8> = (0u16..7 * 5 * 3)
        .map(|index| {
            let masked = (index * 17) & 0xFF;
            u8::try_from(masked).expect("masked value fits in u8")
        })
        .collect();
    let encode_options = EncodeOptions {
        reversible: true,
        num_decomposition_levels: 0,
        ..EncodeOptions::default()
    };
    let mut accelerator = CudaEncodeStageAccelerator::default();

    let codestream = encode_with_cuda_test_accelerator(
        &pixels,
        7,
        5,
        3,
        8,
        false,
        &encode_options,
        &mut accelerator,
    )
    .expect("encode with CUDA forward RCT");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    assert_eq!(decoded.data, pixels);
    assert_eq!(accelerator.forward_rct_attempts(), 1);
    assert_eq!(accelerator.forward_rct_dispatches(), 1);
}
4612
/// An irreversible three-component HT encode with one decomposition level must
/// decode to the same number of samples as the input and record exactly one
/// forward ICT attempt and one forward ICT dispatch on the accelerator.
///
/// Returns early unless `J2K_REQUIRE_CUDA_RUNTIME` is set in the environment.
#[cfg(feature = "cuda-runtime")]
#[test]
fn cuda_forward_ict_dispatches_when_runtime_required() {
    let runtime_required = std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_some();
    if !runtime_required {
        return;
    }

    // Deterministic pseudo-pattern, masked into the u8 range.
    let pixels: Vec<u8> = (0u32..32 * 32 * 3)
        .map(|index| {
            let masked = (index * 23 + 19) & 0xFF;
            u8::try_from(masked).expect("masked value fits in u8")
        })
        .collect();
    let encode_options = EncodeOptions {
        reversible: false,
        use_ht_block_coding: true,
        num_decomposition_levels: 1,
        ..EncodeOptions::default()
    };
    let mut accelerator = CudaEncodeStageAccelerator::default();

    let codestream = encode_with_cuda_test_accelerator(
        &pixels,
        32,
        32,
        3,
        8,
        false,
        &encode_options,
        &mut accelerator,
    )
    .expect("encode irreversible RGB with CUDA forward ICT");

    let decode_settings = DecodeSettings::default();
    let decoded = Image::new(&codestream, &decode_settings)
        .expect("codestream parses")
        .decode_native()
        .expect("codestream decodes");

    // Irreversible path: only the sample count is compared, not the values.
    assert_eq!(decoded.data.len(), pixels.len());
    assert_eq!(accelerator.forward_ict_attempts(), 1);
    assert_eq!(accelerator.forward_ict_dispatches(), 1);
}
4651
4652    #[cfg(feature = "cuda-runtime")]
4653    #[test]
4654    fn cuda_forward_dwt53_dispatches_when_runtime_required() {
4655        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
4656            return;
4657        }
4658
4659        let pixels: Vec<u8> = (0u16..8 * 8)
4660            .map(|i| u8::try_from((i * 5) & 0xFF).expect("masked value fits in u8"))
4661            .collect();
4662        let options = EncodeOptions {
4663            reversible: true,
4664            num_decomposition_levels: 1,
4665            ..EncodeOptions::default()
4666        };
4667        let mut accelerator = CudaEncodeStageAccelerator::default();
4668
4669        let codestream = encode_with_cuda_test_accelerator(
4670            &pixels,
4671            8,
4672            8,
4673            1,
4674            8,
4675            false,
4676            &options,
4677            &mut accelerator,
4678        )
4679        .expect("encode with CUDA forward DWT 5/3");
4680        let decoded = Image::new(&codestream, &DecodeSettings::default())
4681            .expect("codestream parses")
4682            .decode_native()
4683            .expect("codestream decodes");
4684
4685        assert_eq!(decoded.data, pixels);
4686        assert_eq!(accelerator.forward_dwt53_attempts(), 1);
4687        assert_eq!(accelerator.forward_dwt53_dispatches(), 2);
4688    }
4689
4690    #[cfg(feature = "cuda-runtime")]
4691    #[test]
4692    fn cuda_forward_dwt97_dispatches_when_runtime_required() {
4693        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
4694            return;
4695        }
4696
4697        let pixels: Vec<u8> = (0u16..32 * 32)
4698            .map(|i| u8::try_from((i * 7 + 13) & 0xFF).expect("masked value fits in u8"))
4699            .collect();
4700        let options = EncodeOptions {
4701            reversible: false,
4702            use_ht_block_coding: true,
4703            num_decomposition_levels: 1,
4704            ..EncodeOptions::default()
4705        };
4706        let mut accelerator = CudaEncodeStageAccelerator::default();
4707
4708        let codestream = encode_with_cuda_test_accelerator(
4709            &pixels,
4710            32,
4711            32,
4712            1,
4713            8,
4714            false,
4715            &options,
4716            &mut accelerator,
4717        )
4718        .expect("encode with CUDA forward DWT 9/7");
4719        let decoded = Image::new(&codestream, &DecodeSettings::default())
4720            .expect("codestream parses")
4721            .decode_native()
4722            .expect("codestream decodes");
4723
4724        assert_eq!(decoded.data.len(), pixels.len());
4725        assert_eq!(accelerator.forward_dwt97_attempts(), 1);
4726        assert_eq!(accelerator.forward_dwt97_dispatches(), 3);
4727    }
4728
4729    #[cfg(feature = "cuda-runtime")]
4730    #[test]
4731    fn cuda_quantize_subband_dispatches_when_runtime_required() {
4732        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
4733            return;
4734        }
4735
4736        let pixels: Vec<u8> = (0u16..32 * 32)
4737            .map(|i| u8::try_from((i * 19 + 5) & 0xFF).expect("masked value fits in u8"))
4738            .collect();
4739        let options = EncodeOptions {
4740            reversible: false,
4741            use_ht_block_coding: true,
4742            num_decomposition_levels: 1,
4743            ..EncodeOptions::default()
4744        };
4745        let mut accelerator = CudaEncodeStageAccelerator::default();
4746
4747        let codestream = encode_with_cuda_test_accelerator(
4748            &pixels,
4749            32,
4750            32,
4751            1,
4752            8,
4753            false,
4754            &options,
4755            &mut accelerator,
4756        )
4757        .expect("encode with CUDA quantization");
4758        let decoded = Image::new(&codestream, &DecodeSettings::default())
4759            .expect("codestream parses")
4760            .decode_native()
4761            .expect("codestream decodes");
4762
4763        assert_eq!(decoded.data.len(), pixels.len());
4764        assert_eq!(accelerator.quantize_subband_attempts(), 4);
4765        assert_eq!(accelerator.quantize_subband_dispatches(), 4);
4766    }
4767
4768    #[cfg(feature = "cuda-runtime")]
4769    #[test]
4770    fn cuda_encode_uses_resident_tile_body_when_runtime_required() {
4771        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
4772            return;
4773        }
4774
4775        let pixels: Vec<u8> = (0u16..32 * 32)
4776            .map(|i| u8::try_from((i * 23 + 11) & 0xFF).expect("masked value fits in u8"))
4777            .collect();
4778        let options = EncodeOptions {
4779            reversible: true,
4780            use_ht_block_coding: true,
4781            num_decomposition_levels: 0,
4782            code_block_width_exp: 2,
4783            code_block_height_exp: 2,
4784            ..EncodeOptions::default()
4785        };
4786        let mut accelerator = CudaEncodeStageAccelerator::default();
4787
4788        let codestream = encode_with_cuda_test_accelerator(
4789            &pixels,
4790            32,
4791            32,
4792            1,
4793            8,
4794            false,
4795            &options,
4796            &mut accelerator,
4797        )
4798        .expect("encode HTJ2K through CUDA tile-body hook");
4799        let decoded = Image::new(&codestream, &DecodeSettings::default())
4800            .expect("codestream parses")
4801            .decode_native()
4802            .expect("codestream decodes");
4803
4804        assert_eq!(decoded.data, pixels);
4805        assert_eq!(accelerator.htj2k_tile_attempts, 1);
4806        assert_eq!(accelerator.htj2k_tile_dispatches, 1);
4807        assert_eq!(accelerator.ht_subband_attempts, 0);
4808        assert_eq!(accelerator.ht_subband_dispatches, 0);
4809        assert_eq!(accelerator.deinterleave_dispatches(), 1);
4810        assert_eq!(accelerator.quantize_subband_attempts(), 1);
4811        assert_eq!(accelerator.quantize_subband_dispatches(), 1);
4812        assert_eq!(accelerator.ht_code_block_attempts(), 4);
4813        assert_eq!(accelerator.ht_code_block_dispatches(), 1);
4814        assert_eq!(accelerator.packetization_attempts(), 1);
4815        assert_eq!(accelerator.packetization_dispatches(), 1);
4816    }
4817
4818    #[cfg(feature = "cuda-runtime")]
4819    #[test]
4820    fn cuda_encode_uses_resident_dwt_tile_body_when_runtime_required() {
4821        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
4822            return;
4823        }
4824
4825        let pixels: Vec<u8> = (0u16..32 * 32)
4826            .map(|i| u8::try_from((i * 29 + 5) & 0xFF).expect("masked value fits in u8"))
4827            .collect();
4828        let options = EncodeOptions {
4829            reversible: true,
4830            use_ht_block_coding: true,
4831            num_decomposition_levels: 1,
4832            code_block_width_exp: 2,
4833            code_block_height_exp: 2,
4834            ..EncodeOptions::default()
4835        };
4836        let mut accelerator = CudaEncodeStageAccelerator::default();
4837
4838        let codestream = encode_with_cuda_test_accelerator(
4839            &pixels,
4840            32,
4841            32,
4842            1,
4843            8,
4844            false,
4845            &options,
4846            &mut accelerator,
4847        )
4848        .expect("encode HTJ2K DWT through CUDA tile-body hook");
4849        let decoded = Image::new(&codestream, &DecodeSettings::default())
4850            .expect("codestream parses")
4851            .decode_native()
4852            .expect("codestream decodes");
4853
4854        assert_eq!(decoded.data, pixels);
4855        assert_eq!(accelerator.htj2k_tile_attempts, 1);
4856        assert_eq!(accelerator.htj2k_tile_dispatches, 1);
4857        assert_eq!(accelerator.ht_subband_attempts, 0);
4858        assert_eq!(accelerator.ht_subband_dispatches, 0);
4859        assert_eq!(accelerator.forward_dwt53_attempts(), 1);
4860        assert!(accelerator.forward_dwt53_dispatches() > 0);
4861        assert_eq!(accelerator.quantize_subband_attempts(), 4);
4862        assert_eq!(accelerator.quantize_subband_dispatches(), 4);
4863        assert_eq!(accelerator.ht_code_block_attempts(), 4);
4864        assert_eq!(accelerator.ht_code_block_dispatches(), 4);
4865        assert_eq!(accelerator.packetization_attempts(), 1);
4866        assert_eq!(accelerator.packetization_dispatches(), 1);
4867    }
4868
4869    #[cfg(feature = "cuda-runtime")]
4870    #[test]
4871    fn cuda_encode_uses_resident_mct_dwt_tile_body_when_runtime_required() {
4872        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
4873            return;
4874        }
4875
4876        let pixels: Vec<u8> = (0u16..32 * 32 * 3)
4877            .map(|i| u8::try_from((i * 19 + 17) & 0xFF).expect("masked value fits in u8"))
4878            .collect();
4879        let options = EncodeOptions {
4880            reversible: true,
4881            use_mct: true,
4882            use_ht_block_coding: true,
4883            num_decomposition_levels: 1,
4884            code_block_width_exp: 2,
4885            code_block_height_exp: 2,
4886            ..EncodeOptions::default()
4887        };
4888        let mut accelerator = CudaEncodeStageAccelerator::default();
4889
4890        let codestream = encode_with_cuda_test_accelerator(
4891            &pixels,
4892            32,
4893            32,
4894            3,
4895            8,
4896            false,
4897            &options,
4898            &mut accelerator,
4899        )
4900        .expect("encode HTJ2K RGB DWT through CUDA tile-body hook");
4901        let decoded = Image::new(&codestream, &DecodeSettings::default())
4902            .expect("codestream parses")
4903            .decode_native()
4904            .expect("codestream decodes");
4905
4906        assert_eq!(decoded.data, pixels);
4907        assert_eq!(accelerator.htj2k_tile_attempts, 1);
4908        assert_eq!(accelerator.htj2k_tile_dispatches, 1);
4909        assert_eq!(accelerator.ht_subband_attempts, 0);
4910        assert_eq!(accelerator.forward_rct_attempts(), 1);
4911        assert_eq!(accelerator.forward_rct_dispatches(), 1);
4912        assert_eq!(accelerator.forward_dwt53_attempts(), 3);
4913        assert!(accelerator.forward_dwt53_dispatches() > 0);
4914        assert_eq!(accelerator.quantize_subband_attempts(), 12);
4915        assert_eq!(accelerator.quantize_subband_dispatches(), 12);
4916        assert_eq!(accelerator.ht_code_block_attempts(), 12);
4917        assert_eq!(accelerator.ht_code_block_dispatches(), 12);
4918        assert_eq!(accelerator.packetization_attempts(), 1);
4919        assert_eq!(accelerator.packetization_dispatches(), 1);
4920    }
4921
4922    #[cfg(feature = "cuda-runtime")]
4923    #[test]
4924    fn cuda_encode_uses_resident_dwt97_tile_body_when_runtime_required() {
4925        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
4926            return;
4927        }
4928
4929        let pixels: Vec<u8> = (0u16..32 * 32)
4930            .map(|i| u8::try_from((i * 31 + 7) & 0xFF).expect("masked value fits in u8"))
4931            .collect();
4932        let options = EncodeOptions {
4933            reversible: false,
4934            use_ht_block_coding: true,
4935            num_decomposition_levels: 1,
4936            code_block_width_exp: 2,
4937            code_block_height_exp: 2,
4938            ..EncodeOptions::default()
4939        };
4940        let mut accelerator = CudaEncodeStageAccelerator::default();
4941
4942        let codestream = encode_with_cuda_test_accelerator(
4943            &pixels,
4944            32,
4945            32,
4946            1,
4947            8,
4948            false,
4949            &options,
4950            &mut accelerator,
4951        )
4952        .expect("encode irreversible HTJ2K DWT through CUDA tile-body hook");
4953        let decoded = Image::new(&codestream, &DecodeSettings::default())
4954            .expect("codestream parses")
4955            .decode_native()
4956            .expect("codestream decodes");
4957
4958        assert_eq!(decoded.width, 32);
4959        assert_eq!(decoded.height, 32);
4960        assert_eq!(decoded.num_components, 1);
4961        assert_eq!(accelerator.htj2k_tile_attempts, 1);
4962        assert_eq!(accelerator.htj2k_tile_dispatches, 1);
4963        assert_eq!(accelerator.ht_subband_attempts, 0);
4964        assert_eq!(accelerator.forward_dwt97_attempts(), 1);
4965        assert!(accelerator.forward_dwt97_dispatches() > 0);
4966        assert_eq!(accelerator.quantize_subband_attempts(), 4);
4967        assert_eq!(accelerator.quantize_subband_dispatches(), 4);
4968        assert_eq!(accelerator.ht_code_block_attempts(), 4);
4969        assert_eq!(accelerator.ht_code_block_dispatches(), 4);
4970        assert_eq!(accelerator.packetization_attempts(), 1);
4971        assert_eq!(accelerator.packetization_dispatches(), 1);
4972    }
4973
4974    #[cfg(feature = "cuda-runtime")]
4975    #[test]
4976    fn cuda_htj2k_codeblock_dispatches_when_runtime_required() {
4977        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
4978            return;
4979        }
4980
4981        let pixels: Vec<u8> = (0u16..8 * 8)
4982            .map(|i| u8::try_from((i * 11 + 3) & 0xFF).expect("masked value fits in u8"))
4983            .collect();
4984        let options = EncodeOptions {
4985            reversible: true,
4986            use_ht_block_coding: true,
4987            num_decomposition_levels: 0,
4988            code_block_width_exp: 2,
4989            code_block_height_exp: 2,
4990            ..EncodeOptions::default()
4991        };
4992        let mut accelerator = CudaEncodeStageAccelerator::default();
4993
4994        let codestream = encode_with_cuda_test_accelerator(
4995            &pixels,
4996            8,
4997            8,
4998            1,
4999            8,
5000            false,
5001            &options,
5002            &mut accelerator,
5003        )
5004        .expect("encode HTJ2K with CUDA HT codeblock kernel");
5005        let decoded = Image::new(&codestream, &DecodeSettings::default())
5006            .expect("codestream parses")
5007            .decode_native()
5008            .expect("codestream decodes");
5009
5010        assert_eq!(decoded.data, pixels);
5011        assert!(accelerator.ht_code_block_attempts() > 0);
5012        assert!(accelerator.ht_code_block_dispatches() > 0);
5013        assert!(accelerator.ht_code_block_dispatches() <= accelerator.ht_code_block_attempts());
5014        assert_eq!(
5015            accelerator.dispatch_report().ht_code_block,
5016            accelerator.ht_code_block_dispatches()
5017        );
5018    }
5019
5020    #[cfg(feature = "cuda-runtime")]
5021    #[test]
5022    fn cuda_htj2k_codeblock_preserves_requested_refinement_passes_when_runtime_required() {
5023        if !cuda_runtime_required() {
5024            return;
5025        }
5026
5027        let coefficients = [0, 3, -5, 3, 5, 0, -3, 3, 7, -3, 0, 3, 0, 0, 5, -5];
5028        let mut accelerator = CudaEncodeStageAccelerator::default();
5029
5030        let encoded = accelerator
5031            .encode_ht_code_block(J2kHtCodeBlockEncodeJob {
5032                coefficients: &coefficients,
5033                width: 4,
5034                height: 4,
5035                total_bitplanes: 4,
5036                target_coding_passes: 2,
5037            })
5038            .expect("CUDA HTJ2K code-block encode hook")
5039            .expect("CUDA HTJ2K code-block encode output");
5040
5041        assert_eq!(encoded.num_coding_passes, 2);
5042        assert_eq!(encoded.num_zero_bitplanes, 2);
5043        assert_eq!(encoded.refinement_length, 1);
5044        assert_eq!(
5045            encoded.cleanup_length + encoded.refinement_length,
5046            u32::try_from(encoded.data.len()).expect("test payload length fits u32")
5047        );
5048        assert_eq!(accelerator.ht_code_block_dispatches(), 1);
5049    }
5050
5051    #[cfg(feature = "cuda-runtime")]
5052    #[test]
5053    fn cuda_htj2k_codeblock_batch_uses_single_dispatch_when_runtime_required() {
5054        if std::env::var_os("J2K_REQUIRE_CUDA_RUNTIME").is_none() {
5055            return;
5056        }
5057
5058        let pixels: Vec<u8> = (0u16..32 * 32)
5059            .map(|i| u8::try_from((i * 17 + 9) & 0xFF).expect("masked value fits in u8"))
5060            .collect();
5061        let options = EncodeOptions {
5062            reversible: true,
5063            use_ht_block_coding: true,
5064            num_decomposition_levels: 0,
5065            code_block_width_exp: 2,
5066            code_block_height_exp: 2,
5067            ..EncodeOptions::default()
5068        };
5069        let mut accelerator = CudaEncodeStageAccelerator::default();
5070
5071        let codestream = encode_with_cuda_test_accelerator(
5072            &pixels,
5073            32,
5074            32,
5075            1,
5076            8,
5077            false,
5078            &options,
5079            &mut accelerator,
5080        )
5081        .expect("encode HTJ2K with CUDA HT batch codeblock kernel");
5082        let decoded = Image::new(&codestream, &DecodeSettings::default())
5083            .expect("codestream parses")
5084            .decode_native()
5085            .expect("codestream decodes");
5086
5087        assert_eq!(decoded.data, pixels);
5088        assert!(accelerator.ht_code_block_attempts() > 1);
5089        assert_eq!(accelerator.ht_code_block_dispatches(), 1);
5090        assert!(
5091            accelerator.ht_code_block_dispatches() < accelerator.ht_code_block_attempts(),
5092            "batch encode must not launch one kernel per codeblock"
5093        );
5094        assert_eq!(
5095            accelerator.dispatch_report().ht_code_block,
5096            accelerator.ht_code_block_dispatches()
5097        );
5098    }
5099
5100    #[cfg(feature = "cuda-runtime")]
5101    #[test]
5102    fn cuda_resident_quantized_subband_feeds_resident_ht_batch_when_runtime_required() {
5103        if !cuda_runtime_required() {
5104            return;
5105        }
5106
5107        let samples = [-3.6f32, -2.5, -0.4, 0.0, 0.49, 1.5, 3.2, 9.9];
5108        let context = CudaContext::system_default().expect("CUDA context");
5109        let sample_buffer = context.upload_f32(&samples).expect("resident samples");
5110        let quantization = CudaJ2kQuantizeJob {
5111            step_exponent: 8,
5112            step_mantissa: 0,
5113            range_bits: 8,
5114            reversible: true,
5115        };
5116        let resident_quantized = context
5117            .j2k_quantize_subband_resident(&sample_buffer, samples.len(), quantization)
5118            .expect("resident quantization");
5119        let host_quantized = context
5120            .j2k_quantize_subband(&samples, quantization)
5121            .expect("host-staged quantization");
5122        let jobs = [CudaHtj2kEncodeCodeBlockJob {
5123            coefficient_offset: 0,
5124            width: 4,
5125            height: 2,
5126            total_bitplanes: 5,
5127            target_coding_passes: 1,
5128        }];
5129
5130        let resident_encoded = context
5131            .encode_htj2k_codeblocks_resident(
5132                resident_quantized.buffer(),
5133                resident_quantized.coefficient_count(),
5134                &jobs,
5135                cuda_htj2k_encode_tables(),
5136            )
5137            .expect("resident HTJ2K encode");
5138        let staged_encoded = context
5139            .encode_htj2k_codeblocks(
5140                host_quantized.coefficients(),
5141                &jobs,
5142                cuda_htj2k_encode_tables(),
5143            )
5144            .expect("host-staged HTJ2K encode");
5145
5146        assert_eq!(resident_quantized.coefficient_count(), samples.len());
5147        assert_eq!(resident_encoded.execution().kernel_dispatches(), 1);
5148        assert_eq!(
5149            resident_encoded.code_blocks().len(),
5150            staged_encoded.code_blocks().len()
5151        );
5152        for (resident, staged) in resident_encoded
5153            .code_blocks()
5154            .iter()
5155            .zip(staged_encoded.code_blocks())
5156        {
5157            assert_eq!(resident.data(), staged.data());
5158            assert_eq!(resident.cleanup_length(), staged.cleanup_length());
5159            assert_eq!(resident.refinement_length(), staged.refinement_length());
5160            assert_eq!(resident.num_coding_passes(), staged.num_coding_passes());
5161            assert_eq!(resident.num_zero_bitplanes(), staged.num_zero_bitplanes());
5162        }
5163    }
5164
5165    #[cfg(feature = "cuda-runtime")]
5166    #[test]
5167    fn cuda_resident_strided_codeblock_region_matches_host_gather_when_runtime_required() {
5168        if !cuda_runtime_required() {
5169            return;
5170        }
5171
5172        let samples: Vec<f32> = (0u16..16).map(|value| f32::from(value) - 8.0).collect();
5173        let context = CudaContext::system_default().expect("CUDA context");
5174        let sample_buffer = context.upload_f32(&samples).expect("resident samples");
5175        let quantization = CudaJ2kQuantizeJob {
5176            step_exponent: 8,
5177            step_mantissa: 0,
5178            range_bits: 8,
5179            reversible: true,
5180        };
5181        let resident_quantized = context
5182            .j2k_quantize_subband_resident(&sample_buffer, samples.len(), quantization)
5183            .expect("resident quantization");
5184        let quantized = resident_quantized
5185            .download_coefficients()
5186            .expect("download quantized coefficients");
5187        let gathered_codeblock = vec![quantized[5], quantized[6], quantized[9], quantized[10]];
5188        let region_jobs = [CudaHtj2kEncodeCodeBlockRegionJob {
5189            coefficient_offset: 5,
5190            coefficient_stride: 4,
5191            width: 2,
5192            height: 2,
5193            total_bitplanes: 5,
5194            target_coding_passes: 1,
5195        }];
5196        let contiguous_jobs = [CudaHtj2kEncodeCodeBlockJob {
5197            coefficient_offset: 0,
5198            width: 2,
5199            height: 2,
5200            total_bitplanes: 5,
5201            target_coding_passes: 1,
5202        }];
5203
5204        let resident_encoded = context
5205            .encode_htj2k_codeblock_regions_resident(
5206                resident_quantized.buffer(),
5207                resident_quantized.coefficient_count(),
5208                &region_jobs,
5209                cuda_htj2k_encode_tables(),
5210            )
5211            .expect("resident strided HTJ2K encode");
5212        let staged_encoded = context
5213            .encode_htj2k_codeblocks(
5214                &gathered_codeblock,
5215                &contiguous_jobs,
5216                cuda_htj2k_encode_tables(),
5217            )
5218            .expect("host-gathered HTJ2K encode");
5219
5220        assert_eq!(resident_encoded.execution().kernel_dispatches(), 1);
5221        assert_eq!(resident_encoded.code_blocks().len(), 1);
5222        assert_eq!(
5223            resident_encoded.code_blocks()[0].data(),
5224            staged_encoded.code_blocks()[0].data()
5225        );
5226        assert_eq!(
5227            resident_encoded.code_blocks()[0].num_zero_bitplanes(),
5228            staged_encoded.code_blocks()[0].num_zero_bitplanes()
5229        );
5230    }
5231}