j2k-jpeg 0.6.1 - Docs.rs

// SPDX-License-Identifier: Apache-2.0

//! Integer "slow" IDCT (ISLOW) — Loeffler–Ligtenberg–Moschytz decomposition,
//! using 13-bit fixed-point constants evaluated in 32-bit integer arithmetic.
//! Bit-exact with libjpeg-turbo's `jidctint.c` algorithm on the same inputs.
//!
//! Input: 64 dequantized DCT coefficients in natural (row-major) order, each
//! already multiplied by its quantization entry.
//!
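//! Output: 64 samples in natural order. The 8-bit entry points produce `u8`
//! samples, level-shifted by +128 and clamped to `[0, 255]`; the 12-bit entry
//! points produce `u16` samples, level-shifted by +2048 and clamped to
//! `[0, 4095]`.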
//!
//! Arithmetic uses `core::num::Wrapping<i32>` so intermediate overflow on
//! adversarial (malformed) inputs wraps modulo 2^32 rather than panicking in
//! debug builds. libjpeg-turbo exhibits the same modular behavior in release.

use core::num::Wrapping;

/// Number of fractional bits carried by the `FIX_*` multipliers below.
const CONST_BITS: usize = 13;
/// Extra fractional bits the column pass leaves in the workspace; the row pass
/// removes them again together with `CONST_BITS`.
const PASS1_BITS: usize = 2;

// Each `FIX_x_yyyyyyyyy` is the real number x.yyyyyyyyy multiplied by
// 2^CONST_BITS (8192) and rounded to the nearest integer, e.g.
// 0.541196100 * 8192 ≈ 4433.
const FIX_0_298631336: Wrapping<i32> = Wrapping(2446);
const FIX_0_390180644: Wrapping<i32> = Wrapping(3196);
const FIX_0_541196100: Wrapping<i32> = Wrapping(4433);
const FIX_0_765366865: Wrapping<i32> = Wrapping(6270);
const FIX_0_899976223: Wrapping<i32> = Wrapping(7373);
const FIX_1_175875602: Wrapping<i32> = Wrapping(9633);
const FIX_1_501321110: Wrapping<i32> = Wrapping(12299);
const FIX_1_847759065: Wrapping<i32> = Wrapping(15137);
const FIX_1_961570560: Wrapping<i32> = Wrapping(16069);
const FIX_2_053119869: Wrapping<i32> = Wrapping(16819);
const FIX_2_562915447: Wrapping<i32> = Wrapping(20995);
const FIX_3_072711026: Wrapping<i32> = Wrapping(25172);

/// Run the 8-bit ISLOW inverse DCT on one 8×8 block.
///
/// `input` holds the 64 dequantized coefficients in natural order. `output`
/// receives the 64 reconstructed samples, level-shifted and clamped by the row
/// pass.
pub(crate) fn idct_islow(input: &[i16; 64], output: &mut [u8; 64]) {
    let mut scratch = [Wrapping(0i32); 64];

    // Natural-order rows 4..7 are the second half of the array. If none of them
    // carries a non-zero coefficient, the reduced column pass is sufficient.
    let lower_half_in_use = input[32..].iter().any(|&c| c != 0);
    if lower_half_in_use {
        (0..8).for_each(|col| idct_1d_column(input, &mut scratch, col));
    } else {
        (0..8).for_each(|col| idct_1d_column_bottom_half_zero(input, &mut scratch, col));
    }

    (0..8).for_each(|row| idct_1d_row::<u8>(&scratch, &mut *output, row));
}

/// 8-bit ISLOW inverse DCT for a block whose natural-order rows 4..7 are zero.
///
/// The caller is responsible for that precondition: coefficients in
/// `input[32..]` are never read on this path.
pub(crate) fn idct_islow_bottom_half_zero(input: &[i16; 64], output: &mut [u8; 64]) {
    let mut scratch = [Wrapping(0i32); 64];
    (0..8).for_each(|col| idct_1d_column_bottom_half_zero(input, &mut scratch, col));
    (0..8).for_each(|row| idct_1d_row::<u8>(&scratch, &mut *output, row));
}

/// DC-only ISLOW path: fill the whole block with the single sample that
/// `idct_islow_dc_only_pixel` derives from `dc_coeff`.
///
/// Intended for blocks in which every AC coefficient is zero.
pub(crate) fn idct_islow_dc_only(dc_coeff: i16, output: &mut [u8; 64]) {
    let sample = idct_islow_dc_only_pixel(dc_coeff);
    *output = [sample; 64];
}

/// Sample value shared by every pixel of a DC-only block on the 8-bit path.
pub(crate) fn idct_islow_dc_only_pixel(dc_coeff: i16) -> u8 {
    // The `u8` return type selects the 8-bit instantiation of the generic helper.
    idct_islow_dc_only_sample(dc_coeff)
}

/// Run the ISLOW inverse DCT on one 8×8 block of a 12-bit JPEG.
///
/// Same two-pass structure as the 8-bit entry point, but the row pass is
/// instantiated for `u16`, so `output` receives native 12-bit samples.
pub(crate) fn idct_islow_12bit(input: &[i16; 64], output: &mut [u16; 64]) {
    let mut scratch = [Wrapping(0i32); 64];

    // Natural-order rows 4..7 are the second half of the array. If none of them
    // carries a non-zero coefficient, the reduced column pass is sufficient.
    let lower_half_in_use = input[32..].iter().any(|&c| c != 0);
    if lower_half_in_use {
        (0..8).for_each(|col| idct_1d_column(input, &mut scratch, col));
    } else {
        (0..8).for_each(|col| idct_1d_column_bottom_half_zero(input, &mut scratch, col));
    }

    (0..8).for_each(|row| idct_1d_row::<u16>(&scratch, &mut *output, row));
}

/// Sample value shared by every pixel of a DC-only block on the 12-bit path.
pub(crate) fn idct_islow_12bit_dc_only_sample(dc_coeff: i16) -> u16 {
    // The `u16` return type selects the 12-bit instantiation of the generic helper.
    idct_islow_dc_only_sample(dc_coeff)
}

/// Output sample type of the IDCT: fixes the level shift added after descaling
/// and the inclusive upper bound used when clamping.
trait IdctSample: Copy {
    /// Offset added to move the signed IDCT result into the unsigned sample range.
    const LEVEL_SHIFT: i32;
    /// Largest valid sample value; results are clamped to `0..=MAX`.
    const MAX: i32;

    /// Narrow `value` to the sample type.
    ///
    /// The caller must already have clamped `value` to `0..=MAX`; the narrowing
    /// cast would otherwise truncate silently. Debug builds assert the range.
    fn from_clamped_i32(value: i32) -> Self;
}

impl IdctSample for u8 {
    const LEVEL_SHIFT: i32 = 128;
    const MAX: i32 = 255;

    fn from_clamped_i32(value: i32) -> Self {
        // `<Self as IdctSample>::MAX` is spelled out because a bare `Self::MAX`
        // would resolve to the inherent `u8::MAX`.
        debug_assert!(
            (0..=<Self as IdctSample>::MAX).contains(&value),
            "sample {value} was not clamped before narrowing"
        );
        // Lossless: the value lies within the sample range (see above).
        value as Self
    }
}

impl IdctSample for u16 {
    const LEVEL_SHIFT: i32 = 2048;
    const MAX: i32 = 4095;

    fn from_clamped_i32(value: i32) -> Self {
        // `<Self as IdctSample>::MAX` is spelled out because a bare `Self::MAX`
        // would resolve to the inherent `u16::MAX`.
        debug_assert!(
            (0..=<Self as IdctSample>::MAX).contains(&value),
            "sample {value} was not clamped before narrowing"
        );
        // Lossless: the value lies within the sample range (see above).
        value as Self
    }
}

/// Compute the single sample value of a block that has only a DC coefficient.
///
/// The coefficient is divided by 8 with rounding (`+ 4`, then `>> 3`), moved
/// into the unsigned range by `T::LEVEL_SHIFT`, and clamped to `0..=T::MAX`.
fn idct_islow_dc_only_sample<T: IdctSample>(dc_coeff: i16) -> T {
    let descaled = (i32::from(dc_coeff) + 4) >> 3;
    let shifted = descaled.wrapping_add(T::LEVEL_SHIFT);
    let bounded = shifted.max(0).min(T::MAX);
    T::from_clamped_i32(bounded)
}

/// Column pass: run the 1-D inverse transform on column `col` of `input`.
///
/// Reads the eight coefficients `input[col + 8 * k]` and writes the eight
/// results to `work[col + 8 * k]`. Results are descaled by
/// `CONST_BITS - PASS1_BITS`, so they keep `PASS1_BITS` extra fractional bits.
fn idct_1d_column(input: &[i16; 64], work: &mut [Wrapping<i32>; 64], col: usize) {
    let coeff = |k: usize| Wrapping(i32::from(input[col + 8 * k]));
    let [f0, f1, f2, f3, f4, f5, f6, f7] = [
        coeff(0),
        coeff(1),
        coeff(2),
        coeff(3),
        coeff(4),
        coeff(5),
        coeff(6),
        coeff(7),
    ];

    // Shortcut: with every AC term zero the column is constant, so the scaled
    // DC term is simply replicated.
    if [f1, f2, f3, f4, f5, f6, f7].iter().all(|term| term.0 == 0) {
        let flat = f0 << PASS1_BITS;
        for k in 0..8 {
            work[col + 8 * k] = flat;
        }
        return;
    }

    // Even part: terms 0, 2, 4 and 6.
    let rot = (f2 + f6) * FIX_0_541196100;
    let even_minor = rot + f6 * (-FIX_1_847759065);
    let even_major = rot + f2 * FIX_0_765366865;
    let sum04 = (f0 + f4) << CONST_BITS;
    let diff04 = (f0 - f4) << CONST_BITS;
    let even = [
        sum04 + even_major,
        diff04 + even_minor,
        diff04 - even_minor,
        sum04 - even_major,
    ];

    // Odd part: terms 1, 3, 5 and 7.
    let s71 = f7 + f1;
    let s53 = f5 + f3;
    let s73 = f7 + f3;
    let s51 = f5 + f1;
    let shared = (s73 + s51) * FIX_1_175875602;
    let r71 = s71 * (-FIX_0_899976223);
    let r53 = s53 * (-FIX_2_562915447);
    let r73 = s73 * (-FIX_1_961570560) + shared;
    let r51 = s51 * (-FIX_0_390180644) + shared;
    let odd = [
        f1 * FIX_1_501321110 + r71 + r51,
        f3 * FIX_3_072711026 + r53 + r73,
        f5 * FIX_2_053119869 + r53 + r51,
        f7 * FIX_0_298631336 + r71 + r73,
    ];

    // Final butterfly: output k is even[k] + odd[k], its mirror 7 - k is
    // even[k] - odd[k]; both are rounded before descaling.
    let shift = CONST_BITS - PASS1_BITS;
    let half = Wrapping(1i32 << (shift - 1));
    for (k, (&e, &o)) in even.iter().zip(odd.iter()).enumerate() {
        work[col + 8 * k] = descale(e + o + half, shift);
        work[col + 8 * (7 - k)] = descale(e - o + half, shift);
    }
}

/// Column pass specialised for blocks whose rows 4..7 are zero.
///
/// Only `input[col]`, `input[col + 8]`, `input[col + 16]` and `input[col + 24]`
/// are read; the terms that the general column pass derives from rows 4..7 are
/// dropped. All eight entries `work[col + 8 * k]` are still written.
fn idct_1d_column_bottom_half_zero(input: &[i16; 64], work: &mut [Wrapping<i32>; 64], col: usize) {
    let coeff = |k: usize| Wrapping(i32::from(input[col + 8 * k]));
    let (f0, f1, f2, f3) = (coeff(0), coeff(1), coeff(2), coeff(3));

    // Shortcut: no AC term present, so the scaled DC term is replicated.
    if (f1.0 | f2.0 | f3.0) == 0 {
        let flat = f0 << PASS1_BITS;
        for k in 0..8 {
            work[col + 8 * k] = flat;
        }
        return;
    }

    // Even part: only terms 0 and 2 contribute.
    let rot = f2 * FIX_0_541196100;
    let even_major = rot + f2 * FIX_0_765366865;
    let dc = f0 << CONST_BITS;
    let even = [dc + even_major, dc + rot, dc - rot, dc - even_major];

    // Odd part: only terms 1 and 3 contribute.
    let shared = (f1 + f3) * FIX_1_175875602;
    let r1 = f1 * (-FIX_0_899976223);
    let r3 = f3 * (-FIX_2_562915447);
    let r3_shared = f3 * (-FIX_1_961570560) + shared;
    let r1_shared = f1 * (-FIX_0_390180644) + shared;
    let odd = [
        f1 * FIX_1_501321110 + r1 + r1_shared,
        f3 * FIX_3_072711026 + r3 + r3_shared,
        r3 + r1_shared,
        r1 + r3_shared,
    ];

    // Final butterfly: output k is even[k] + odd[k], its mirror 7 - k is
    // even[k] - odd[k]; both are rounded before descaling.
    let shift = CONST_BITS - PASS1_BITS;
    let half = Wrapping(1i32 << (shift - 1));
    for (k, (&e, &o)) in even.iter().zip(odd.iter()).enumerate() {
        work[col + 8 * k] = descale(e + o + half, shift);
        work[col + 8 * (7 - k)] = descale(e - o + half, shift);
    }
}

/// Arithmetic right shift of the wrapped value by `shift` bits.
///
/// Callers add the rounding term themselves before calling.
fn descale(v: Wrapping<i32>, shift: usize) -> Wrapping<i32> {
    let Wrapping(raw) = v;
    Wrapping(raw >> shift)
}

/// Row pass: run the 1-D inverse transform on row `row` of the workspace.
///
/// Reads `work[row * 8 .. row * 8 + 8]` and writes the eight finished samples to
/// the same positions of `output`. Each result is rounded, descaled by
/// `CONST_BITS + PASS1_BITS + 3`, level-shifted and clamped by
/// `descale_and_clamp`.
fn idct_1d_row<T: IdctSample>(work: &[Wrapping<i32>; 64], output: &mut [T; 64], row: usize) {
    let base = row * 8;
    let term = |k: usize| work[base + k];
    let [f0, f1, f2, f3, f4, f5, f6, f7] = [
        term(0),
        term(1),
        term(2),
        term(3),
        term(4),
        term(5),
        term(6),
        term(7),
    ];

    // Shortcut: with every AC term zero the row is constant. The DC term was
    // never multiplied by a `FIX_*` constant, so only `PASS1_BITS + 3` bits are
    // removed.
    if [f1, f2, f3, f4, f5, f6, f7].iter().all(|t| t.0 == 0) {
        let dc_shift = PASS1_BITS + 3;
        let dc_half = Wrapping(1i32 << (dc_shift - 1));
        let flat = descale_and_clamp::<T>(f0 + dc_half, dc_shift);
        output[base..base + 8].fill(flat);
        return;
    }

    // Even part: terms 0, 2, 4 and 6.
    let rot = (f2 + f6) * FIX_0_541196100;
    let even_minor = rot + f6 * (-FIX_1_847759065);
    let even_major = rot + f2 * FIX_0_765366865;
    let sum04 = (f0 + f4) << CONST_BITS;
    let diff04 = (f0 - f4) << CONST_BITS;
    let even = [
        sum04 + even_major,
        diff04 + even_minor,
        diff04 - even_minor,
        sum04 - even_major,
    ];

    // Odd part: terms 1, 3, 5 and 7.
    let s71 = f7 + f1;
    let s53 = f5 + f3;
    let s73 = f7 + f3;
    let s51 = f5 + f1;
    let shared = (s73 + s51) * FIX_1_175875602;
    let r71 = s71 * (-FIX_0_899976223);
    let r53 = s53 * (-FIX_2_562915447);
    let r73 = s73 * (-FIX_1_961570560) + shared;
    let r51 = s51 * (-FIX_0_390180644) + shared;
    let odd = [
        f1 * FIX_1_501321110 + r71 + r51,
        f3 * FIX_3_072711026 + r53 + r73,
        f5 * FIX_2_053119869 + r53 + r51,
        f7 * FIX_0_298631336 + r71 + r73,
    ];

    // Final butterfly: sample k is even[k] + odd[k], its mirror 7 - k is
    // even[k] - odd[k].
    let shift = CONST_BITS + PASS1_BITS + 3;
    let half = Wrapping(1i32 << (shift - 1));
    for (k, (&e, &o)) in even.iter().zip(odd.iter()).enumerate() {
        output[base + k] = descale_and_clamp::<T>(e + o + half, shift);
        output[base + 7 - k] = descale_and_clamp::<T>(e - o + half, shift);
    }
}

/// Finish one output sample: arithmetic right shift by `shift`, add the level
/// shift of `T` (wrapping on overflow), then clamp to `0..=T::MAX`.
///
/// Callers add the rounding term to `value` before calling.
fn descale_and_clamp<T: IdctSample>(value: Wrapping<i32>, shift: usize) -> T {
    let Wrapping(raw) = value;
    let sample = (raw >> shift).wrapping_add(T::LEVEL_SHIFT);
    T::from_clamped_i32(sample.max(0).min(T::MAX))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn all_zero_input_produces_level_shifted_gray_block() {
        let input = [0i16; 64];
        let mut output = [0u8; 64];
        idct_islow(&input, &mut output);
        for (i, &px) in output.iter().enumerate() {
            assert_eq!(px, 128, "pixel {i} = {px}, expected 128");
        }
    }

    #[test]
    fn dc_only_input_produces_uniform_block() {
        // DC = 64 descales to exactly 64 / 8 = 8, so every pixel must be exactly
        // 128 + 8; no tolerance is needed on this path.
        let mut input = [0i16; 64];
        input[0] = 8 * 8;
        let mut output = [0u8; 64];
        idct_islow(&input, &mut output);
        for (i, &px) in output.iter().enumerate() {
            assert_eq!(px, 136, "pixel {i} = {px}, expected 136");
        }
    }

    // -------------------------------------------------------------------------
    // Ground truth: exact mathematical inverse DCT.
    //
    // ISLOW is a fixed-point approximation; nothing previously pinned it to the
    // *defining* inverse DCT, so a wrong constant or butterfly could pass the
    // sanity checks (and be faithfully copied by the CUDA port).
    // Compare every pixel to the exact double-precision IDCT. ISLOW meets the
    // IEEE-1180 peak-error bound, so a 1-LSB tolerance is the correct gate.

    /// Exact 8x8 inverse DCT (DCT-III, orthonormal) of natural-order signed
    /// coefficients, before the +128 level shift. `coeffs[v * 8 + u]` is the
    /// coefficient at horizontal frequency `u`, vertical frequency `v`.
    fn exact_idct_pixel(coeffs: &[i16; 64], x: usize, y: usize) -> f64 {
        use core::f64::consts::PI;
        let alpha = |k: usize| {
            if k == 0 {
                (1.0_f64 / 8.0).sqrt()
            } else {
                (2.0_f64 / 8.0).sqrt()
            }
        };
        let cos_term = |sample: usize, freq: usize| -> f64 {
            let s = f64::from(u8::try_from(sample).unwrap());
            let f = f64::from(u8::try_from(freq).unwrap());
            (((2.0 * s) + 1.0) * f * PI / 16.0).cos()
        };
        let mut acc = 0.0;
        for v in 0..8 {
            for u in 0..8 {
                acc += alpha(u)
                    * alpha(v)
                    * f64::from(coeffs[v * 8 + u])
                    * cos_term(x, u)
                    * cos_term(y, v);
            }
        }
        acc
    }

    /// The exact IDCT pixels with JPEG's +128 level shift and `[0, 255]` clamp.
    fn exact_islow_reference(coeffs: &[i16; 64]) -> [u8; 64] {
        let mut out = [0u8; 64];
        for y in 0..8 {
            for x in 0..8 {
                let value = exact_idct_pixel(coeffs, x, y) + 128.0;
                out[y * 8 + x] = value.round().clamp(0.0, 255.0) as u8;
            }
        }
        out
    }

    /// Deterministic pseudo-random coefficient in `-256..=255`.
    fn next_coeff(state: &mut u64) -> i16 {
        *state = state
            .wrapping_mul(6_364_136_223_846_793_005)
            .wrapping_add(1_442_695_040_888_963_407);
        // Modest dequantized magnitudes so most pixels stay inside 8-bit range.
        ((*state >> 40) & 0x1ff) as i32 as i16 - 256
    }

    #[test]
    fn islow_matches_exact_idct_within_one_lsb() {
        let mut state = 0x0bad_c0de_1234_5678u64;
        for _ in 0..256 {
            let mut coeffs = [0i16; 64];
            for c in &mut coeffs {
                *c = next_coeff(&mut state);
            }
            // Keep DC moderate so the field is centered in range.
            coeffs[0] = i16::try_from(i32::from(coeffs[0]) / 4).unwrap();

            let mut got = [0u8; 64];
            idct_islow(&coeffs, &mut got);
            let want = exact_islow_reference(&coeffs);
            for i in 0..64 {
                assert!(
                    (i32::from(got[i]) - i32::from(want[i])).abs() <= 1,
                    "pixel {i}: islow={} exact={} (coeffs {coeffs:?})",
                    got[i],
                    want[i]
                );
            }
        }
    }

    #[test]
    fn islow_dc_only_matches_closed_form() {
        // DC-only block: every pixel is F(0,0)/8 + 128.
        for dc in [-512i16, -200, -64, 8, 64, 200, 512] {
            let mut coeffs = [0i16; 64];
            coeffs[0] = dc;
            let mut got = [0u8; 64];
            idct_islow(&coeffs, &mut got);
            let expected = ((f64::from(dc) / 8.0) + 128.0).round().clamp(0.0, 255.0) as u8;
            for &px in &got {
                assert!(
                    (i32::from(px) - i32::from(expected)).abs() <= 1,
                    "dc={dc}: px={px} expected={expected}"
                );
            }
        }
    }

    #[test]
    fn dc_only_helper_matches_full_idct() {
        // The DC-only fast path claims bit-exact agreement with the full IDCT, so
        // check it across the coefficient range (both extremes, the sign change,
        // and the original spot value 73) rather than at a single point.
        let sweep = (i16::MIN..=i16::MAX)
            .step_by(61)
            .chain([i16::MAX, -1, 0, 73]);
        for dc in sweep {
            let mut input = [0i16; 64];
            input[0] = dc;
            let mut full = [0u8; 64];
            let mut fast = [0u8; 64];
            idct_islow(&input, &mut full);
            idct_islow_dc_only(dc, &mut fast);
            assert_eq!(fast, full, "dc={dc}");
            assert_eq!(idct_islow_dc_only_pixel(dc), full[0], "dc={dc}");
        }
    }

    #[test]
    fn clamps_extreme_coefficients_into_0_255() {
        let mut input = [0i16; 64];
        input[0] = i16::MAX;
        let mut output = [0u8; 64];
        idct_islow(&input, &mut output);
        assert!(output.iter().all(|&px| px == 255));

        let mut input = [0i16; 64];
        input[0] = i16::MIN;
        let mut output = [0u8; 64];
        idct_islow(&input, &mut output);
        assert!(output.iter().all(|&px| px == 0));
    }

    #[test]
    fn roundtrip_identity_basis_reconstructs_8x8_impulse() {
        // A single coefficient at horizontal frequency 1 (vertical frequency 0)
        // yields a half-cosine ramp along each row and no variation between rows.
        let mut input = [0i16; 64];
        input[1] = 400;
        let mut output = [0u8; 64];
        idct_islow(&input, &mut output);
        let left = output[0] as i32;
        let right = output[7] as i32;
        assert!(
            (left - right).abs() > 40,
            "AC[1] basis should produce horizontal variation, got L={left} R={right}"
        );
        // A positive coefficient makes the left edge the bright one.
        assert!(
            left > right,
            "positive AC[1] should be brighter on the left, got L={left} R={right}"
        );
        for row in 1..8 {
            assert_eq!(
                &output[row * 8..row * 8 + 8],
                &output[..8],
                "row {row} should equal row 0 for a purely horizontal basis"
            );
        }
    }

    #[test]
    fn does_not_panic_on_extreme_adversarial_coefficients() {
        // All maxed-out i16 — intermediate multiplies overflow i32. Wrapping<i32>
        // makes this produce garbage pixels instead of panicking.
        let input = [i16::MAX; 64];
        let mut output = [0u8; 64];
        idct_islow(&input, &mut output);
        // No panic = success. Output values are not asserted (they are modular
        // garbage by design on adversarial input).
        let _ = output;
    }
}