Skip to main content

signinum_transcode/
accelerator.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Optional acceleration hooks for coefficient-domain transform stages.
4//!
5//! These hooks are intentionally narrow: accelerated backends may replace the
6//! direct DCT-grid to one-level wavelet projection, while the scalar path
7//! remains the default oracle and fallback.
8
9use core::fmt;
10
11use crate::dct53_2d::Dwt53TwoDimensional;
12use crate::dct97_2d::Dwt97TwoDimensional;
13use crate::dct_grid::validate_dct_block_grid;
14use crate::reversible53::{
15    reversible_lift_53_high_at, reversible_lift_53_i32, reversible_lift_53_low_at,
16};
17use rayon::prelude::*;
18pub use signinum_j2k::{
19    EncodedHtJ2kCodeBlock, IrreversibleQuantizationSubbandScales, J2kSubBandType,
20    PreencodedHtj2k97CodeBlock, PreencodedHtj2k97CompactCodeBlock,
21    PreencodedHtj2k97CompactComponent, PreencodedHtj2k97CompactImage,
22    PreencodedHtj2k97CompactResolution, PreencodedHtj2k97CompactSubband,
23    PreencodedHtj2k97Component, PreencodedHtj2k97Resolution, PreencodedHtj2k97Subband,
24    PrequantizedHtj2k97CodeBlock, PrequantizedHtj2k97Component, PrequantizedHtj2k97Image,
25    PrequantizedHtj2k97Resolution, PrequantizedHtj2k97Subband,
26};
27use signinum_jpeg::transcode::idct_islow_block;
28
/// Message reported when a reversible 5/3 job's DCT block grid is rejected by
/// grid validation.
const REVERSIBLE_DWT53_UNSUPPORTED_GRID: &str =
    "reversible DCT 5/3 job has unsupported grid geometry";
31
/// Job describing a direct DCT-grid to single-level reversible integer 5/3
/// projection.
#[derive(Clone, Copy, Debug)]
pub struct DctGridToReversibleDwt53Job<'a> {
    /// Dequantized 8x8 DCT blocks in natural order.
    pub dequantized_blocks: &'a [[i16; 64]],
    /// Count of DCT block columns covered by `dequantized_blocks`.
    pub block_cols: usize,
    /// Count of DCT block rows covered by `dequantized_blocks`.
    pub block_rows: usize,
    /// Logical width of the component, in samples.
    pub width: usize,
    /// Logical height of the component, in samples.
    pub height: usize,
}
46
/// Output of one separable, single-level reversible integer 5/3 transform.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ReversibleDwt53FirstLevel {
    /// Band that is low-pass horizontally and low-pass vertically.
    pub ll: Vec<i32>,
    /// Band that is high-pass horizontally and low-pass vertically.
    pub hl: Vec<i32>,
    /// Band that is low-pass horizontally and high-pass vertically.
    pub lh: Vec<i32>,
    /// Band that is high-pass horizontally and high-pass vertically.
    pub hh: Vec<i32>,
    /// Width shared by the horizontally low-pass bands.
    pub low_width: usize,
    /// Height shared by the vertically low-pass bands.
    pub low_height: usize,
    /// Width shared by the horizontally high-pass bands.
    pub high_width: usize,
    /// Height shared by the vertically high-pass bands.
    pub high_height: usize,
}
67
/// Job describing a direct DCT-grid to single-level 5/3 projection.
#[derive(Clone, Copy, Debug)]
pub struct DctGridToDwt53Job<'a> {
    /// Dequantized 8x8 DCT blocks in natural order.
    pub blocks: &'a [[[f64; 8]; 8]],
    /// Count of DCT block columns covered by `blocks`.
    pub block_cols: usize,
    /// Count of DCT block rows covered by `blocks`.
    pub block_rows: usize,
    /// Logical width of the component, in samples.
    pub width: usize,
    /// Logical height of the component, in samples.
    pub height: usize,
}
82
/// Job describing a direct DCT-grid to single-level 9/7 transform.
#[derive(Clone, Copy, Debug)]
pub struct DctGridToDwt97Job<'a> {
    /// Dequantized 8x8 DCT blocks in natural order.
    pub blocks: &'a [[[f64; 8]; 8]],
    /// Count of DCT block columns covered by `blocks`.
    pub block_cols: usize,
    /// Count of DCT block rows covered by `blocks`.
    pub block_rows: usize,
    /// Logical width of the component, in samples.
    pub width: usize,
    /// Logical height of the component, in samples.
    pub height: usize,
}
97
/// Job describing a direct DCT-grid to prequantized, single-level 9/7 HTJ2K
/// code-block conversion.
#[derive(Clone, Copy, Debug)]
pub struct DctGridToHtj2k97CodeBlockJob<'a> {
    /// Dequantized 8x8 DCT blocks in natural order.
    pub blocks: &'a [[[f64; 8]; 8]],
    /// Count of DCT block columns covered by `blocks`.
    pub block_cols: usize,
    /// Count of DCT block rows covered by `blocks`.
    pub block_rows: usize,
    /// Logical width of the component, in samples.
    pub width: usize,
    /// Logical height of the component, in samples.
    pub height: usize,
    /// SIZ horizontal sampling factor (`XRsiz`).
    pub x_rsiz: u8,
    /// SIZ vertical sampling factor (`YRsiz`).
    pub y_rsiz: u8,
}
116
/// Job describing a direct dequantized-i16 DCT-grid to single-level 9/7 HTJ2K
/// code-block conversion.
///
/// Intended for accelerators that take the JPEG coefficient extraction output
/// as-is and have no use for the generic f64 block representation.
#[derive(Clone, Copy, Debug)]
pub struct DctGridI16ToHtj2k97CodeBlockJob<'a> {
    /// Dequantized 8x8 DCT blocks in natural order.
    pub dequantized_blocks: &'a [[i16; 64]],
    /// Count of DCT block columns covered by `dequantized_blocks`.
    pub block_cols: usize,
    /// Count of DCT block rows covered by `dequantized_blocks`.
    pub block_rows: usize,
    /// Logical width of the component, in samples.
    pub width: usize,
    /// Logical height of the component, in samples.
    pub height: usize,
    /// SIZ horizontal sampling factor (`XRsiz`).
    pub x_rsiz: u8,
    /// SIZ vertical sampling factor (`YRsiz`).
    pub y_rsiz: u8,
}
138
139/// One same-geometry i16 DCT-grid HTJ2K preencode batch.
140#[derive(Debug, Clone, Copy)]
141pub struct DctGridI16ToHtj2k97CodeBlockBatch<'a, 'j> {
142    /// Jobs in this same-geometry batch.
143    pub jobs: &'j [DctGridI16ToHtj2k97CodeBlockJob<'a>],
144}
145
146/// Compact preencoded HTJ2K components backed by one payload buffer.
147#[derive(Debug, Clone)]
148pub struct PreencodedHtj2k97CompactBatch {
149    /// Contiguous encoded code-block payload bytes for every component.
150    pub payload: Vec<u8>,
151    /// Compact components in the same order as the submitted jobs.
152    pub components: Vec<PreencodedHtj2k97CompactComponent>,
153}
154
155/// Compact preencoded HTJ2K grouped-batch output backed by one payload buffer.
156#[derive(Debug, Clone)]
157pub struct PreencodedHtj2k97CompactBatchGroups {
158    /// Contiguous encoded code-block payload bytes for every returned group.
159    pub payload: Vec<u8>,
160    /// Compact components grouped in the same order as submitted batches.
161    pub groups: Vec<Vec<PreencodedHtj2k97CompactComponent>>,
162}
163
164/// Encode parameters needed to quantize 9/7 output directly into HTJ2K
165/// code-block coefficient layout.
166#[derive(Debug, Clone, Copy, PartialEq)]
167pub struct Htj2k97CodeBlockOptions {
168    /// Component precision in bits.
169    pub bit_depth: u8,
170    /// JPEG 2000 guard bits used for QCD and code-block bitplane counts.
171    pub guard_bits: u8,
172    /// Code-block width exponent minus two.
173    pub code_block_width_exp: u8,
174    /// Code-block height exponent minus two.
175    pub code_block_height_exp: u8,
176    /// Multiplier applied to irreversible 9/7 scalar quantization step sizes.
177    pub irreversible_quantization_scale: f32,
178    /// Per-subband multipliers applied on top of
179    /// [`irreversible_quantization_scale`](Self::irreversible_quantization_scale).
180    pub irreversible_quantization_subband_scales: IrreversibleQuantizationSubbandScales,
181}
182
/// Per-stage timing breakdown a backend reports for one same-geometry 9/7
/// batch.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct Dwt97BatchStageTimings {
    /// Microseconds spent on host packing, buffer allocation, and upload.
    pub pack_upload_us: u128,
    /// Time taken by the IDCT plus horizontal 9/7 row-lift stage.
    pub idct_row_lift_us: u128,
    /// Time taken by the vertical 9/7 column-lift stage.
    pub column_lift_us: u128,
    /// Time taken to quantize 9/7 bands into the HTJ2K code-block layout.
    pub quantize_codeblock_us: u128,
    /// Time taken to HT-encode resident code-block coefficients.
    pub ht_encode_us: u128,
    /// Microseconds spent in the resident HT cleanup-pass encode kernel.
    pub ht_kernel_us: u128,
    /// Microseconds spent reading the resident HT status buffer back from
    /// device to host.
    pub ht_status_readback_us: u128,
    /// Microseconds spent in the resident HT encoded-byte compaction kernel.
    pub ht_compact_us: u128,
    /// Microseconds spent reading the compacted resident HT encoded bytes back
    /// from device to host.
    pub ht_output_readback_us: u128,
    /// How many HT code-block encode kernel dispatches this batch issued.
    pub ht_codeblock_dispatches: usize,
    /// Time taken to read Metal band buffers and unpack them into host outputs.
    pub readback_us: u128,
}
209
/// Failure reported by an accelerated transcode stage backend.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum TranscodeStageError {
    /// This backend does not support the job shape, the options, or the
    /// environment it was asked to run in.
    Unsupported(&'static str),
    /// Executing the stage failed inside the backend.
    Backend(String),
    /// The device or runtime behind this accelerator cannot be used.
    DeviceUnavailable,
}

impl fmt::Display for TranscodeStageError {
    /// Writes the carried reason verbatim, or a fixed message when the device
    /// is unavailable.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message: &str = match self {
            Self::Unsupported(reason) => *reason,
            Self::Backend(reason) => reason.as_str(),
            Self::DeviceUnavailable => "accelerator device is unavailable",
        };
        f.write_str(message)
    }
}

impl std::error::Error for TranscodeStageError {}

impl From<&'static str> for TranscodeStageError {
    /// Treats a bare static message as an [`Unsupported`](Self::Unsupported)
    /// reason, so `?` works on `Result<_, &'static str>` values.
    fn from(reason: &'static str) -> Self {
        TranscodeStageError::Unsupported(reason)
    }
}
239
240/// Optional backend for SIMD, GPU, or other accelerated transform stages.
241pub trait DctToWaveletStageAccelerator {
242    /// Whether this accelerator wants same-geometry 9/7 batch jobs offered.
243    ///
244    /// The default is false so CPU-only fallback paths do not pay the memory
245    /// cost of materializing batch-owned float DCT blocks before immediately
246    /// falling back.
247    fn supports_dwt97_batch(&self) -> bool {
248        false
249    }
250
251    /// Whether this accelerator wants same-geometry 9/7 batches offered as
252    /// prequantized HTJ2K code-block jobs before the float-band hook.
253    fn supports_htj2k97_codeblock_batch(&self) -> bool {
254        false
255    }
256
257    /// Whether this accelerator wants same-geometry 9/7 preencoded HTJ2K
258    /// batches offered with dequantized i16 DCT blocks before materializing the
259    /// generic f64 block representation.
260    fn supports_htj2k97_i16_preencoded_batch(&self) -> bool {
261        false
262    }
263
264    /// Whether this accelerator wants the compact i16 preencoded HTJ2K batch
265    /// hook offered before the owned preencoded hook.
266    fn supports_htj2k97_compact_preencoded_batch(&self) -> bool {
267        self.supports_htj2k97_i16_preencoded_batch()
268    }
269
270    /// Optionally compute the direct DCT-grid to one-level reversible integer
271    /// 5/3 projection.
272    ///
273    /// Return `Ok(Some(output))` when the backend handled the job bit-exactly
274    /// relative to signinum's scalar integer oracle. Return `Ok(None)` to use
275    /// the scalar fallback.
276    fn dct_grid_to_reversible_dwt53(
277        &mut self,
278        _job: DctGridToReversibleDwt53Job<'_>,
279    ) -> Result<Option<ReversibleDwt53FirstLevel>, TranscodeStageError> {
280        Ok(None)
281    }
282
283    /// Optionally compute a same-geometry batch of direct DCT-grid to
284    /// one-level reversible integer 5/3 projections.
285    ///
286    /// Backends should return outputs in the same order as `jobs`. Return
287    /// `Ok(None)` to use the scalar per-component fallback.
288    fn dct_grid_to_reversible_dwt53_batch(
289        &mut self,
290        _jobs: &[DctGridToReversibleDwt53Job<'_>],
291    ) -> Result<Option<Vec<ReversibleDwt53FirstLevel>>, TranscodeStageError> {
292        Ok(None)
293    }
294
295    /// Optionally compute the direct DCT-grid to one-level 5/3 projection.
296    ///
297    /// Return `Ok(Some(output))` when the backend handled the job. Return
298    /// `Ok(None)` to use the scalar fallback.
299    fn dct_grid_to_dwt53(
300        &mut self,
301        _job: DctGridToDwt53Job<'_>,
302    ) -> Result<Option<Dwt53TwoDimensional<f64>>, TranscodeStageError> {
303        Ok(None)
304    }
305
306    /// Optionally compute the direct DCT-grid to one-level 9/7 transform.
307    ///
308    /// Return `Ok(Some(output))` when the backend handled the job. Return
309    /// `Ok(None)` to use the scalar fallback.
310    fn dct_grid_to_dwt97(
311        &mut self,
312        _job: DctGridToDwt97Job<'_>,
313    ) -> Result<Option<Dwt97TwoDimensional<f64>>, TranscodeStageError> {
314        Ok(None)
315    }
316
317    /// Optionally compute a same-geometry batch of direct DCT-grid to
318    /// one-level 9/7 transforms.
319    ///
320    /// Backends should return outputs in the same order as `jobs`. Return
321    /// `Ok(None)` to use the scalar per-component fallback.
322    fn dct_grid_to_dwt97_batch(
323        &mut self,
324        _jobs: &[DctGridToDwt97Job<'_>],
325    ) -> Result<Option<Vec<Dwt97TwoDimensional<f64>>>, TranscodeStageError> {
326        Ok(None)
327    }
328
329    /// Optionally compute same-geometry DCT-grid 9/7 jobs directly into
330    /// prequantized HTJ2K code-block components.
331    ///
332    /// Backends should return one component per input job in the same order as
333    /// `jobs`. Return `Ok(None)` to use the float-band path.
334    fn dct_grid_to_htj2k97_codeblock_batch(
335        &mut self,
336        _jobs: &[DctGridToHtj2k97CodeBlockJob<'_>],
337        _options: Htj2k97CodeBlockOptions,
338    ) -> Result<Option<Vec<PrequantizedHtj2k97Component>>, TranscodeStageError> {
339        Ok(None)
340    }
341
342    /// Optionally compute same-geometry DCT-grid 9/7 jobs directly into
343    /// preencoded HTJ2K code-block payloads.
344    ///
345    /// Backends should return one component per input job in the same order as
346    /// `jobs`. Return `Ok(None)` to use the prequantized or float-band path.
347    fn dct_grid_to_htj2k97_preencoded_batch(
348        &mut self,
349        _jobs: &[DctGridToHtj2k97CodeBlockJob<'_>],
350        _options: Htj2k97CodeBlockOptions,
351    ) -> Result<Option<Vec<PreencodedHtj2k97Component>>, TranscodeStageError> {
352        Ok(None)
353    }
354
355    /// Optionally compute same-geometry dequantized i16 DCT-grid 9/7 jobs
356    /// directly into preencoded HTJ2K code-block payloads.
357    ///
358    /// Backends should return one component per input job in the same order as
359    /// `jobs`. Return `Ok(None)` to use the generic f64 preencoded path.
360    fn dct_grid_i16_to_htj2k97_preencoded_batch(
361        &mut self,
362        _jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
363        _options: Htj2k97CodeBlockOptions,
364    ) -> Result<Option<Vec<PreencodedHtj2k97Component>>, TranscodeStageError> {
365        Ok(None)
366    }
367
368    /// Optionally compute same-geometry dequantized i16 DCT-grid 9/7 jobs into
369    /// compact preencoded HTJ2K code-block payloads.
370    ///
371    /// Backends should return one component per input job in the same order as
372    /// `jobs`, with all component ranges pointing into the returned payload.
373    /// Return `Ok(None)` to use the owned preencoded path.
374    fn dct_grid_i16_to_htj2k97_compact_preencoded_batch(
375        &mut self,
376        _jobs: &[DctGridI16ToHtj2k97CodeBlockJob<'_>],
377        _options: Htj2k97CodeBlockOptions,
378    ) -> Result<Option<PreencodedHtj2k97CompactBatch>, TranscodeStageError> {
379        Ok(None)
380    }
381
382    /// Optionally compute multiple same-geometry dequantized i16 DCT-grid
383    /// batches directly into preencoded HTJ2K code-block payloads.
384    ///
385    /// Each input batch is internally same-geometry, but different batches may
386    /// have different component dimensions. Backends should return one output
387    /// vector per input batch, in order. Return `Ok(None)` to use the per-group
388    /// fallback hooks.
389    fn dct_grid_i16_to_htj2k97_preencoded_batch_groups(
390        &mut self,
391        _groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
392        _options: Htj2k97CodeBlockOptions,
393    ) -> Result<Option<Vec<Vec<PreencodedHtj2k97Component>>>, TranscodeStageError> {
394        Ok(None)
395    }
396
397    /// Optionally compute multiple same-geometry dequantized i16 DCT-grid 9/7
398    /// batches into compact preencoded HTJ2K code-block payloads.
399    ///
400    /// Each returned item corresponds to one input batch and contains one
401    /// component per job in that batch. Return `Ok(None)` to use the owned
402    /// preencoded grouped hook.
403    fn dct_grid_i16_to_htj2k97_compact_preencoded_batch_groups(
404        &mut self,
405        _groups: &[DctGridI16ToHtj2k97CodeBlockBatch<'_, '_>],
406        _options: Htj2k97CodeBlockOptions,
407    ) -> Result<Option<PreencodedHtj2k97CompactBatchGroups>, TranscodeStageError> {
408        Ok(None)
409    }
410
411    /// Return backend stage timings for the most recent 9/7 batch dispatch.
412    fn last_dwt97_batch_stage_timings(&self) -> Option<Dwt97BatchStageTimings> {
413        None
414    }
415}
416
417/// Accelerator that always uses the scalar CPU fallback.
418#[derive(Debug, Default, Clone, Copy)]
419pub struct CpuOnlyDctToWaveletStageAccelerator;
420
421impl DctToWaveletStageAccelerator for CpuOnlyDctToWaveletStageAccelerator {}
422
/// CPU/Rayon backend for the exact reversible integer 5/3 first level.
///
/// signinum's scalar ISLOW IDCT semantics stay the oracle here: every 8x8
/// block is decoded through `signinum-jpeg`, level-shifted into signed
/// component samples, and then run through reversible integer 5/3 lifting.
#[derive(Clone, Debug, Default)]
pub struct RayonReversibleDwt53Accelerator {
    /// Single jobs offered so far.
    attempts: usize,
    /// Single jobs completed so far.
    dispatches: usize,
    /// Batches offered so far.
    batch_attempts: usize,
    /// Batches completed so far.
    batch_dispatches: usize,
}

impl RayonReversibleDwt53Accelerator {
    /// How many reversible 5/3 jobs have been offered to this accelerator.
    #[must_use]
    pub const fn reversible_dwt53_attempts(&self) -> usize {
        self.attempts
    }

    /// How many reversible 5/3 jobs this accelerator has handled.
    #[must_use]
    pub const fn reversible_dwt53_dispatches(&self) -> usize {
        self.dispatches
    }

    /// How many reversible 5/3 batches have been offered to this accelerator.
    #[must_use]
    pub const fn reversible_dwt53_batch_attempts(&self) -> usize {
        self.batch_attempts
    }

    /// How many reversible 5/3 batches this accelerator has handled.
    #[must_use]
    pub const fn reversible_dwt53_batch_dispatches(&self) -> usize {
        self.batch_dispatches
    }
}
461
462impl DctToWaveletStageAccelerator for RayonReversibleDwt53Accelerator {
463    fn dct_grid_to_reversible_dwt53(
464        &mut self,
465        job: DctGridToReversibleDwt53Job<'_>,
466    ) -> Result<Option<ReversibleDwt53FirstLevel>, TranscodeStageError> {
467        self.attempts = self.attempts.saturating_add(1);
468        let output = reversible_dwt53_first_level_rayon(job)?;
469        self.dispatches = self.dispatches.saturating_add(1);
470        Ok(Some(output))
471    }
472
473    fn dct_grid_to_reversible_dwt53_batch(
474        &mut self,
475        jobs: &[DctGridToReversibleDwt53Job<'_>],
476    ) -> Result<Option<Vec<ReversibleDwt53FirstLevel>>, TranscodeStageError> {
477        self.batch_attempts = self.batch_attempts.saturating_add(1);
478        let mut output = Vec::with_capacity(jobs.len());
479        for job in jobs {
480            output.push(reversible_dwt53_first_level_rayon(*job)?);
481        }
482        self.batch_dispatches = self.batch_dispatches.saturating_add(1);
483        Ok(Some(output))
484    }
485}
486
487/// Decode the job's dequantized DCT blocks into signinum's signed integer
488/// component sample blocks.
489///
490/// This is public so hybrid GPU backends can keep JPEG parsing and exact IDCT
491/// on CPU while offloading the reversible 5/3 projection.
492pub fn idct_blocks_to_signed_samples_rayon(blocks: &[[i16; 64]]) -> Vec<[i32; 64]> {
493    blocks
494        .par_iter()
495        .map(|block| {
496            let decoded = idct_islow_block(block);
497            decoded.map(|sample| i32::from(sample) - 128)
498        })
499        .collect()
500}
501
502/// Compute one exact reversible integer 5/3 level from already decoded
503/// block-local signed samples.
504pub fn reversible_dwt53_first_level_from_block_samples(
505    block_samples: &[[i32; 64]],
506    block_cols: usize,
507    block_rows: usize,
508    width: usize,
509    height: usize,
510) -> Result<ReversibleDwt53FirstLevel, &'static str> {
511    validate_reversible_grid(block_samples.len(), block_cols, block_rows, width, height)?;
512
513    let low_width = width.div_ceil(2);
514    let low_height = height.div_ceil(2);
515    let high_width = width / 2;
516    let high_height = height / 2;
517
518    let low_rows: Vec<(Vec<i32>, Vec<i32>)> = (0..low_height)
519        .into_par_iter()
520        .map(|output_y| {
521            let mut row = Vec::with_capacity(width);
522            for x in 0..width {
523                row.push(vertical_low_53_i32_at(
524                    block_samples,
525                    block_cols,
526                    width,
527                    height,
528                    x,
529                    output_y,
530                ));
531            }
532            reversible_lift_53_i32(&mut row);
533            (
534                row.iter().step_by(2).copied().collect(),
535                row.iter().skip(1).step_by(2).copied().collect(),
536            )
537        })
538        .collect();
539    let high_rows: Vec<(Vec<i32>, Vec<i32>)> = (0..high_height)
540        .into_par_iter()
541        .map(|output_y| {
542            let mut row = Vec::with_capacity(width);
543            for x in 0..width {
544                row.push(vertical_high_53_i32_at(
545                    block_samples,
546                    block_cols,
547                    width,
548                    height,
549                    x,
550                    output_y,
551                ));
552            }
553            reversible_lift_53_i32(&mut row);
554            (
555                row.iter().step_by(2).copied().collect(),
556                row.iter().skip(1).step_by(2).copied().collect(),
557            )
558        })
559        .collect();
560
561    let mut ll = Vec::with_capacity(low_width * low_height);
562    let mut hl = Vec::with_capacity(high_width * low_height);
563    for (low, high) in low_rows {
564        ll.extend(low);
565        hl.extend(high);
566    }
567
568    let mut lh = Vec::with_capacity(low_width * high_height);
569    let mut hh = Vec::with_capacity(high_width * high_height);
570    for (low, high) in high_rows {
571        lh.extend(low);
572        hh.extend(high);
573    }
574
575    Ok(ReversibleDwt53FirstLevel {
576        ll,
577        hl,
578        lh,
579        hh,
580        low_width,
581        low_height,
582        high_width,
583        high_height,
584    })
585}
586
587fn reversible_dwt53_first_level_rayon(
588    job: DctGridToReversibleDwt53Job<'_>,
589) -> Result<ReversibleDwt53FirstLevel, &'static str> {
590    validate_reversible_grid(
591        job.dequantized_blocks.len(),
592        job.block_cols,
593        job.block_rows,
594        job.width,
595        job.height,
596    )?;
597    let block_samples = idct_blocks_to_signed_samples_rayon(job.dequantized_blocks);
598    reversible_dwt53_first_level_from_block_samples(
599        &block_samples,
600        job.block_cols,
601        job.block_rows,
602        job.width,
603        job.height,
604    )
605}
606
607fn validate_reversible_grid(
608    block_count: usize,
609    block_cols: usize,
610    block_rows: usize,
611    width: usize,
612    height: usize,
613) -> Result<(), &'static str> {
614    validate_dct_block_grid(block_count, block_cols, block_rows, width, height)
615        .map_err(|_| REVERSIBLE_DWT53_UNSUPPORTED_GRID)
616}
617
618fn vertical_low_53_i32_at(
619    block_samples: &[[i32; 64]],
620    block_cols: usize,
621    width: usize,
622    height: usize,
623    x: usize,
624    low_idx: usize,
625) -> i32 {
626    reversible_lift_53_low_at(height, low_idx, |y| {
627        component_sample_i32(block_samples, block_cols, width, height, x, y)
628    })
629}
630
631fn vertical_high_53_i32_at(
632    block_samples: &[[i32; 64]],
633    block_cols: usize,
634    width: usize,
635    height: usize,
636    x: usize,
637    high_idx: usize,
638) -> i32 {
639    reversible_lift_53_high_at(height, high_idx, |y| {
640        component_sample_i32(block_samples, block_cols, width, height, x, y)
641    })
642}
643
/// Reads the sample at component coordinate (`x`, `y`) from block-major
/// storage.
///
/// Blocks are 8x8 and laid out row-major with `block_cols` blocks per block
/// row; inside a block the sample sits at `(y % 8) * 8 + (x % 8)`. `width`
/// and `height` only feed the debug-build bounds assertions.
///
/// # Panics
///
/// Panics if the computed block index lies outside `block_samples`.
fn component_sample_i32(
    block_samples: &[[i32; 64]],
    block_cols: usize,
    width: usize,
    height: usize,
    x: usize,
    y: usize,
) -> i32 {
    debug_assert!(x < width);
    debug_assert!(y < height);
    let (block_x, local_x) = (x / 8, x % 8);
    let (block_y, local_y) = (y / 8, y % 8);
    let block = &block_samples[block_y * block_cols + block_x];
    block[local_y * 8 + local_x]
}
660
661#[cfg(test)]
662mod ground_truth_tests {
663    //! Independent ground truth for the reversible integer 5/3.
664    //!
665    //! The CUDA 5/3 kernel is parity-tested against the lifting in this module,
666    //! so a boundary/indexing/band-split bug here would be faithfully copied by
667    //! the kernel and pass parity. Validate the lifting against the canonical
668    //! JPEG2000 reversible 5/3 (ISO/IEC 15444-1 Annex F.3.8.1) evaluated per
669    //! output index from a whole-sample-symmetrically extended signal — a
670    //! structurally different implementation than the in-place two-pass loops.
671
672    use super::{
673        reversible_dwt53_first_level_from_block_samples, reversible_lift_53_i32,
674        ReversibleDwt53FirstLevel,
675    };
676
677    fn floor2(a: i32, b: i32) -> i32 {
678        a.div_euclid(b)
679    }
680
681    /// Whole-sample symmetric reflection (mirror about 0 and `n - 1`, endpoints
682    /// not repeated) — the boundary extension the lifting realizes at the edges.
683    fn ws_reflect(i: isize, n: usize) -> usize {
684        if n == 1 {
685            return 0;
686        }
687        let n = isize::try_from(n).unwrap();
688        let period = 2 * (n - 1);
689        let mut k = i.rem_euclid(period);
690        if k >= n {
691            k = period - k;
692        }
693        usize::try_from(k).unwrap()
694    }
695
696    /// Canonical forward 5/3: `(low, high)` where `low[m]` is the even/approx
697    /// coefficient and `high[m]` the odd/detail coefficient. Every index is read
698    /// through whole-sample symmetric extension of the original signal, so the
699    /// detail-boundary behavior follows automatically (no special cases).
700    fn ref_53_forward(signal: &[i32]) -> (Vec<i32>, Vec<i32>) {
701        let n = signal.len();
702        if n < 2 {
703            return (signal.to_vec(), Vec::new());
704        }
705        let sig = |i: isize| signal[ws_reflect(i, n)];
706        let detail = |m: isize| {
707            let c = 2 * m + 1;
708            sig(c) - floor2(sig(c - 1) + sig(c + 1), 2)
709        };
710        let low: Vec<i32> = (0..n.div_ceil(2))
711            .map(|m| {
712                let mi = isize::try_from(m).unwrap();
713                sig(2 * mi) + floor2(detail(mi - 1) + detail(mi) + 2, 4)
714            })
715            .collect();
716        let high: Vec<i32> = (0..n / 2)
717            .map(|m| detail(isize::try_from(m).unwrap()))
718            .collect();
719        (low, high)
720    }
721
722    /// Separable 2D reference matching the oracle's vertical-then-horizontal
723    /// order (integer floor lifting is NOT order-independent, so order matters).
724    fn ref_53_2d(plane: &[i32], width: usize, height: usize) -> ReversibleDwt53FirstLevel {
725        let low_width = width.div_ceil(2);
726        let high_width = width / 2;
727        let low_height = height.div_ceil(2);
728        let high_height = height / 2;
729
730        let mut v_low = vec![0i32; width * low_height];
731        let mut v_high = vec![0i32; width * high_height];
732        for x in 0..width {
733            let column: Vec<i32> = (0..height).map(|y| plane[y * width + x]).collect();
734            let (lo, hi) = ref_53_forward(&column);
735            for (oy, &value) in lo.iter().enumerate() {
736                v_low[oy * width + x] = value;
737            }
738            for (oy, &value) in hi.iter().enumerate() {
739                v_high[oy * width + x] = value;
740            }
741        }
742
743        let horizontal = |source: &[i32], rows: usize| -> (Vec<i32>, Vec<i32>) {
744            let mut low = vec![0i32; low_width * rows];
745            let mut high = vec![0i32; high_width * rows];
746            for oy in 0..rows {
747                let (lo, hi) = ref_53_forward(&source[oy * width..oy * width + width]);
748                low[oy * low_width..oy * low_width + low_width].copy_from_slice(&lo);
749                high[oy * high_width..oy * high_width + high_width].copy_from_slice(&hi);
750            }
751            (low, high)
752        };
753
754        let (ll, hl) = horizontal(&v_low, low_height);
755        let (lh, hh) = horizontal(&v_high, high_height);
756
757        ReversibleDwt53FirstLevel {
758            ll,
759            hl,
760            lh,
761            hh,
762            low_width,
763            low_height,
764            high_width,
765            high_height,
766        }
767    }
768
769    /// Pack a flat `width x height` sample plane into the block-major
770    /// `[[i32; 64]]` layout `reversible_dwt53_first_level_from_block_samples`
771    /// consumes (local index `(y % 8) * 8 + (x % 8)`).
772    fn pack_plane(plane: &[i32], width: usize, height: usize) -> (Vec<[i32; 64]>, usize, usize) {
773        let block_cols = width.div_ceil(8);
774        let block_rows = height.div_ceil(8);
775        let mut blocks = vec![[0i32; 64]; block_cols * block_rows];
776        for y in 0..height {
777            for x in 0..width {
778                let block = (y / 8) * block_cols + (x / 8);
779                blocks[block][(y % 8) * 8 + (x % 8)] = plane[y * width + x];
780            }
781        }
782        (blocks, block_cols, block_rows)
783    }
784
785    fn next_sample(state: &mut u64) -> i32 {
786        *state = state
787            .wrapping_mul(6_364_136_223_846_793_005)
788            .wrapping_add(1_442_695_040_888_963_407);
789        ((*state >> 40) & 0x1ff) as i32 - 256
790    }
791
792    #[test]
793    fn reversible_lift_53_matches_canonical_formula_1d() {
794        let mut state = 0x0a11_ce5e_ed00_d001u64;
795        for n in [2usize, 3, 4, 5, 8, 9, 12, 15, 16, 23, 32, 33, 64, 65] {
796            let signal: Vec<i32> = (0..n).map(|_| next_sample(&mut state)).collect();
797            let mut lifted = signal.clone();
798            reversible_lift_53_i32(&mut lifted);
799            let lifted_low: Vec<i32> = lifted.iter().step_by(2).copied().collect();
800            let lifted_high: Vec<i32> = lifted.iter().skip(1).step_by(2).copied().collect();
801            let (low, high) = ref_53_forward(&signal);
802            assert_eq!(lifted_low, low, "low band mismatch for n={n}");
803            assert_eq!(lifted_high, high, "high band mismatch for n={n}");
804        }
805    }
806
807    #[test]
808    fn reversible_lift_53_shared_helper_matches_canonical_formula_1d() {
809        let mut state = 0x5a53_5a53_5a53_5a53u64;
810        for n in [2usize, 3, 4, 5, 8, 9, 16, 17, 31, 32, 65] {
811            let signal: Vec<i32> = (0..n).map(|_| next_sample(&mut state)).collect();
812            let mut lifted = signal.clone();
813            crate::reversible53::reversible_lift_53_i32(&mut lifted);
814            let lifted_low: Vec<i32> = lifted.iter().step_by(2).copied().collect();
815            let lifted_high: Vec<i32> = lifted.iter().skip(1).step_by(2).copied().collect();
816            let (low, high) = ref_53_forward(&signal);
817            assert_eq!(lifted_low, low, "low band mismatch for n={n}");
818            assert_eq!(lifted_high, high, "high band mismatch for n={n}");
819        }
820    }
821
822    #[test]
823    fn reversible_dwt53_2d_matches_canonical_separable() {
824        let mut state = 0xfeed_5eed_d00d_face_u64;
825        for (width, height) in [
826            (8usize, 8usize),
827            (16, 16),
828            (24, 16),
829            (15, 13),
830            (16, 23),
831            (9, 7),
832            (32, 32),
833        ] {
834            let plane: Vec<i32> = (0..width * height)
835                .map(|_| next_sample(&mut state))
836                .collect();
837            let (blocks, block_cols, block_rows) = pack_plane(&plane, width, height);
838            let got = reversible_dwt53_first_level_from_block_samples(
839                &blocks, block_cols, block_rows, width, height,
840            )
841            .expect("oracle accepts the packed grid");
842            let want = ref_53_2d(&plane, width, height);
843            assert_eq!(
844                (
845                    got.low_width,
846                    got.low_height,
847                    got.high_width,
848                    got.high_height
849                ),
850                (
851                    want.low_width,
852                    want.low_height,
853                    want.high_width,
854                    want.high_height
855                ),
856                "band dimensions for {width}x{height}"
857            );
858            assert_eq!(got.ll, want.ll, "LL mismatch for {width}x{height}");
859            assert_eq!(got.hl, want.hl, "HL mismatch for {width}x{height}");
860            assert_eq!(got.lh, want.lh, "LH mismatch for {width}x{height}");
861            assert_eq!(got.hh, want.hh, "HH mismatch for {width}x{height}");
862        }
863    }
864
865    #[test]
866    fn reversible_lift_53_kills_dc_and_linear_detail() {
867        // Constant -> low = constant, detail exactly zero.
868        let mut constant = vec![7i32; 32];
869        reversible_lift_53_i32(&mut constant);
870        assert!(
871            constant.iter().skip(1).step_by(2).all(|&v| v == 0),
872            "constant produced nonzero detail"
873        );
874        assert!(
875            constant.iter().step_by(2).all(|&v| v == 7),
876            "constant low band drifted from 7"
877        );
878
879        // Linear ramp -> interior detail exactly zero (two vanishing moments).
880        let ramp: Vec<i32> = (0..40_i32).map(|k| 3 * k - 5).collect();
881        let mut lifted = ramp;
882        reversible_lift_53_i32(&mut lifted);
883        let detail: Vec<i32> = lifted.iter().skip(1).step_by(2).copied().collect();
884        for &value in &detail[1..detail.len() - 1] {
885            assert_eq!(value, 0, "linear ramp produced interior detail {value}");
886        }
887    }
888
889    #[test]
890    fn reversible_dwt53_2d_separates_horizontal_and_vertical_detail() {
891        // Varies only along x -> no vertical detail (LH and HH vanish).
892        let (width, height) = (16usize, 16usize);
893        let varies_in_x: Vec<i32> = (0..width * height)
894            .map(|i| 3 * i32::try_from(i % width).unwrap() - 7)
895            .collect();
896        let (blocks, bc, br) = pack_plane(&varies_in_x, width, height);
897        let t = reversible_dwt53_first_level_from_block_samples(&blocks, bc, br, width, height)
898            .expect("oracle accepts grid");
899        assert!(
900            t.lh.iter().all(|&v| v == 0),
901            "x-only plane produced LH detail"
902        );
903        assert!(
904            t.hh.iter().all(|&v| v == 0),
905            "x-only plane produced HH detail"
906        );
907
908        // Varies only along y -> no horizontal detail (HL and HH vanish).
909        let varies_in_y: Vec<i32> = (0..width * height)
910            .map(|i| 3 * i32::try_from(i / width).unwrap() - 7)
911            .collect();
912        let (blocks, bc, br) = pack_plane(&varies_in_y, width, height);
913        let t = reversible_dwt53_first_level_from_block_samples(&blocks, bc, br, width, height)
914            .expect("oracle accepts grid");
915        assert!(
916            t.hl.iter().all(|&v| v == 0),
917            "y-only plane produced HL detail"
918        );
919        assert!(
920            t.hh.iter().all(|&v| v == 0),
921            "y-only plane produced HH detail"
922        );
923    }
924}