#[cfg(not(feature = "std"))]
use alloc::{vec, vec::Vec};
use crate::error::Iw44Error;
use crate::pixmap::Pixmap;
use crate::zp_impl::ZpDecoder;
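// Inclusive (first, last) bucket ranges for each of the ten IW44 bands:
// band 0 is the single lowpass bucket, and higher bands span progressively
// wider runs of 16-coefficient buckets.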
const BAND_BUCKETS: [(usize, usize); 10] = [
(0, 0),
(1, 1),
(2, 2),
(3, 3),
(4, 7),
(8, 11),
(12, 15),
(16, 31),
(32, 47),
(48, 63),
];
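// Initial quantization thresholds: one per coefficient for band 0
// (`QUANT_LO_INIT`) and one per band for bands 1..=9 (`QUANT_HI_INIT`).
// `finish_slice` halves a band's threshold each time it completes a slice;
// a band's coefficients only start decoding once its threshold drops below
// 0x8000 (see `is_null_slice`).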
const QUANT_LO_INIT: [u32; 16] = [
0x004000, 0x008000, 0x008000, 0x010000, 0x010000, 0x010000, 0x010000, 0x010000, 0x010000,
0x010000, 0x010000, 0x010000, 0x020000, 0x020000, 0x020000, 0x020000,
];
const QUANT_HI_INIT: [u32; 10] = [
0, 0x020000, 0x020000, 0x040000, 0x040000, 0x040000, 0x080000, 0x040000, 0x040000, 0x080000,
];
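// Per-coefficient state flags: ZERO marks a coefficient that can never
// activate, ACTIVE a coefficient with a known nonzero value, NEW one that
// activated in the current slice, and UNK a still-zero coefficient that
// may yet activate.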
const ZERO: u8 = 1;
const ACTIVE: u8 = 2;
const NEW: u8 = 4;
const UNK: u8 = 8;
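// Coefficients of a 32x32 block are stored in a bit-interleaved "zigzag"
// order: the row index is recovered from the odd bits of the linear index
// and the column index from the even bits. The ZIGZAG_INV* tables below
// invert this mapping, optionally subsampled by 2, 4, or 8.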
const fn zigzag_row(i: usize) -> u8 {
let b1 = ((i >> 1) & 1) as u8;
let b3 = ((i >> 3) & 1) as u8;
let b5 = ((i >> 5) & 1) as u8;
let b7 = ((i >> 7) & 1) as u8;
let b9 = ((i >> 9) & 1) as u8;
b1 * 16 + b3 * 8 + b5 * 4 + b7 * 2 + b9
}
const fn zigzag_col(i: usize) -> u8 {
let b0 = (i & 1) as u8;
let b2 = ((i >> 2) & 1) as u8;
let b4 = ((i >> 4) & 1) as u8;
let b6 = ((i >> 6) & 1) as u8;
let b8 = ((i >> 8) & 1) as u8;
b0 * 16 + b2 * 8 + b4 * 4 + b6 * 2 + b8
}
static ZIGZAG_INV: [u16; 1024] = {
let mut table = [0u16; 1024];
let mut i = 0usize;
while i < 1024 {
let r = zigzag_row(i) as usize;
let c = zigzag_col(i) as usize;
table[r * 32 + c] = i as u16;
i += 1;
}
table
};
static ZIGZAG_INV_SUB2: [u8; 256] = {
let mut table = [0u8; 256];
let mut i = 0usize;
while i < 256 {
let r = (zigzag_row(i) >> 1) as usize;
let c = (zigzag_col(i) >> 1) as usize;
table[r * 16 + c] = i as u8;
i += 1;
}
table
};
static ZIGZAG_INV_SUB4: [u8; 64] = {
let mut table = [0u8; 64];
let mut i = 0usize;
while i < 64 {
let r = (zigzag_row(i) >> 2) as usize;
let c = (zigzag_col(i) >> 2) as usize;
table[r * 8 + c] = i as u8;
i += 1;
}
table
};
static ZIGZAG_INV_SUB8: [u8; 16] = {
let mut table = [0u8; 16];
let mut i = 0usize;
while i < 16 {
let r = (zigzag_row(i) >> 3) as usize;
let c = (zigzag_col(i) >> 3) as usize;
table[r * 4 + c] = i as u8;
i += 1;
}
table
};
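// Coefficients carry six fractional bits: round to nearest and clamp to
// the signed 8-bit pixel range.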
#[inline]
fn normalize(val: i16) -> i32 {
let v = ((val as i32) + 32) >> 6;
v.clamp(-128, 127)
}
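// Convert one row of YCbCr samples to RGBA8 (alpha fixed at 255). With
// t2 = 1.5*Cr and t3 = Y + 128 - Cb/4, the transform below expands to
// R = Y + 128 + 1.5*Cr, G = Y + 128 - Cb/4 - 0.75*Cr, and
// B = Y + 128 + 1.75*Cb; the SIMD paths below implement the same
// fixed-point approximation.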
pub(crate) fn ycbcr_row_to_rgba(y_row: &[i32], cb_row: &[i32], cr_row: &[i32], out: &mut [u8]) {
debug_assert_eq!(y_row.len(), cb_row.len());
debug_assert_eq!(y_row.len(), cr_row.len());
debug_assert_eq!(out.len(), y_row.len() * 4);
let w = y_row.len();
#[cfg(target_arch = "aarch64")]
{
#[allow(unsafe_code)]
unsafe {
ycbcr_neon(
y_row.as_ptr(),
cb_row.as_ptr(),
cr_row.as_ptr(),
out.as_mut_ptr(),
w,
)
};
return;
}
#[allow(unreachable_code)]
ycbcr_portable(y_row, cb_row, cr_row, out, w);
}
#[inline]
fn ycbcr_row_from_i16(y: &[i16], cb: &[i16], cr: &[i16], out: &mut [u8]) {
let w = y.len();
debug_assert_eq!(cb.len(), w);
debug_assert_eq!(cr.len(), w);
debug_assert_eq!(out.len(), w * 4);
#[cfg(target_arch = "aarch64")]
{
#[allow(unsafe_code)]
unsafe {
ycbcr_neon_raw(y.as_ptr(), cb.as_ptr(), cr.as_ptr(), out.as_mut_ptr(), w);
}
return;
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
{
if std::is_x86_feature_detected!("avx2") {
#[allow(unsafe_code)]
unsafe {
ycbcr_avx2_raw(y.as_ptr(), cb.as_ptr(), cr.as_ptr(), out.as_mut_ptr(), w);
}
return;
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
#[allow(unsafe_code)]
unsafe {
ycbcr_simd128_raw(y.as_ptr(), cb.as_ptr(), cr.as_ptr(), out.as_mut_ptr(), w);
}
return;
}
#[allow(unreachable_code)]
{
let mut y_norm = vec![0i32; w];
let mut cb_norm = vec![0i32; w];
let mut cr_norm = vec![0i32; w];
for (col, v) in y_norm.iter_mut().enumerate() {
*v = normalize(y[col]);
}
for col in 0..w {
cb_norm[col] = normalize(cb[col]);
cr_norm[col] = normalize(cr[col]);
}
ycbcr_row_to_rgba(&y_norm, &cb_norm, &cr_norm, out);
}
}
#[inline]
fn ycbcr_row_from_i16_half(y: &[i16], cb_half: &[i16], cr_half: &[i16], out: &mut [u8], w: usize) {
debug_assert!(y.len() >= w);
debug_assert_eq!(out.len(), w * 4);
#[cfg(target_arch = "aarch64")]
{
#[allow(unsafe_code)]
unsafe {
ycbcr_neon_raw_half(
y.as_ptr(),
cb_half.as_ptr(),
cr_half.as_ptr(),
out.as_mut_ptr(),
w,
);
}
return;
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
{
if std::is_x86_feature_detected!("avx2") {
#[allow(unsafe_code)]
unsafe {
ycbcr_avx2_raw_half(
y.as_ptr(),
cb_half.as_ptr(),
cr_half.as_ptr(),
out.as_mut_ptr(),
w,
);
}
return;
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
#[allow(unsafe_code)]
unsafe {
ycbcr_simd128_raw_half(
y.as_ptr(),
cb_half.as_ptr(),
cr_half.as_ptr(),
out.as_mut_ptr(),
w,
);
}
return;
}
#[allow(unreachable_code)]
{
let mut y_norm = vec![0i32; w];
let mut cb_norm = vec![0i32; w];
let mut cr_norm = vec![0i32; w];
for (col, v) in y_norm.iter_mut().enumerate() {
*v = normalize(y[col]);
}
for col in 0..w {
cb_norm[col] = normalize(cb_half[col / 2]);
cr_norm[col] = normalize(cr_half[col / 2]);
}
ycbcr_row_to_rgba(&y_norm, &cb_norm, &cr_norm, out);
}
}
#[inline(always)]
fn ycbcr_portable(y_row: &[i32], cb_row: &[i32], cr_row: &[i32], out: &mut [u8], w: usize) {
use wide::i32x8;
let c128 = i32x8::splat(128);
let c0 = i32x8::splat(0);
let c255 = i32x8::splat(255);
let full8 = w / 8;
for (((yc, cbc), crc), outc) in y_row[..full8 * 8]
.chunks_exact(8)
.zip(cb_row[..full8 * 8].chunks_exact(8))
.zip(cr_row[..full8 * 8].chunks_exact(8))
.zip(out[..full8 * 32].chunks_exact_mut(32))
{
let ys = i32x8::from([yc[0], yc[1], yc[2], yc[3], yc[4], yc[5], yc[6], yc[7]]);
let bs = i32x8::from([
cbc[0], cbc[1], cbc[2], cbc[3], cbc[4], cbc[5], cbc[6], cbc[7],
]);
let rs = i32x8::from([
crc[0], crc[1], crc[2], crc[3], crc[4], crc[5], crc[6], crc[7],
]);
let t2 = rs + (rs >> 1_i32);
let t3 = ys + c128 - (bs >> 2_i32);
let red = (ys + c128 + t2).max(c0).min(c255).to_array();
let grn = (t3 - (t2 >> 1_i32)).max(c0).min(c255).to_array();
let blu = (t3 + (bs << 1_i32)).max(c0).min(c255).to_array();
for i in 0..8 {
outc[i * 4] = red[i] as u8;
outc[i * 4 + 1] = grn[i] as u8;
outc[i * 4 + 2] = blu[i] as u8;
outc[i * 4 + 3] = 255;
}
}
for col in (full8 * 8)..w {
let y = y_row[col];
let b = cb_row[col];
let r = cr_row[col];
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
out[col * 4] = (y + 128 + t2).clamp(0, 255) as u8;
out[col * 4 + 1] = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
out[col * 4 + 2] = (t3 + (b << 1)).clamp(0, 255) as u8;
out[col * 4 + 3] = 255;
}
}
#[cfg(target_arch = "aarch64")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "neon")]
unsafe fn ycbcr_neon_raw(
yp: *const i16,
cbp: *const i16,
crp: *const i16,
outp: *mut u8,
w: usize,
) {
use core::arch::aarch64::*;
let n_min = vdupq_n_s16(-128);
let n_max = vdupq_n_s16(127);
let c128 = vdupq_n_s16(128);
let alpha = vdup_n_u8(255);
let full8 = w / 8;
for i in 0..full8 {
let off = i * 8;
let yc = vmaxq_s16(
vminq_s16(vrshrq_n_s16::<6>(vld1q_s16(yp.add(off))), n_max),
n_min,
);
let cbc = vmaxq_s16(
vminq_s16(vrshrq_n_s16::<6>(vld1q_s16(cbp.add(off))), n_max),
n_min,
);
let crc = vmaxq_s16(
vminq_s16(vrshrq_n_s16::<6>(vld1q_s16(crp.add(off))), n_max),
n_min,
);
let y128 = vaddq_s16(yc, c128);
let t2 = vaddq_s16(crc, vshrq_n_s16::<1>(crc));
let t3 = vsubq_s16(y128, vshrq_n_s16::<2>(cbc));
let r16 = vaddq_s16(y128, t2);
let g16 = vsubq_s16(t3, vshrq_n_s16::<1>(t2));
let b16 = vaddq_s16(t3, vshlq_n_s16::<1>(cbc));
let r8 = vqmovun_s16(r16);
let g8 = vqmovun_s16(g16);
let b8 = vqmovun_s16(b16);
vst4_u8(outp.add(off * 4), uint8x8x4_t(r8, g8, b8, alpha));
}
for col in (full8 * 8)..w {
let y = normalize(*yp.add(col));
let b = normalize(*cbp.add(col));
let r = normalize(*crp.add(col));
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
*outp.add(col * 4) = (y + 128 + t2).clamp(0, 255) as u8;
*outp.add(col * 4 + 1) = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 2) = (t3 + (b << 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 3) = 255;
}
}
#[cfg(target_arch = "aarch64")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "neon")]
unsafe fn ycbcr_neon_raw_half(
yp: *const i16,
cbp: *const i16,
crp: *const i16,
outp: *mut u8,
w: usize,
) {
use core::arch::aarch64::*;
let n_min = vdupq_n_s16(-128);
let n_max = vdupq_n_s16(127);
let c128 = vdupq_n_s16(128);
let alpha = vdup_n_u8(255);
let full8 = w / 8;
for i in 0..full8 {
let off = i * 8;
let c_off = i * 4;
let yc = vmaxq_s16(
vminq_s16(vrshrq_n_s16::<6>(vld1q_s16(yp.add(off))), n_max),
n_min,
);
let cb4 = vmaxq_s16(
vminq_s16(
vrshrq_n_s16::<6>(vcombine_s16(vld1_s16(cbp.add(c_off)), vdup_n_s16(0))),
n_max,
),
n_min,
);
let cr4 = vmaxq_s16(
vminq_s16(
vrshrq_n_s16::<6>(vcombine_s16(vld1_s16(crp.add(c_off)), vdup_n_s16(0))),
n_max,
),
n_min,
);
let cbc = vzip1q_s16(cb4, cb4);
let crc = vzip1q_s16(cr4, cr4);
let y128 = vaddq_s16(yc, c128);
let t2 = vaddq_s16(crc, vshrq_n_s16::<1>(crc));
let t3 = vsubq_s16(y128, vshrq_n_s16::<2>(cbc));
let r16 = vaddq_s16(y128, t2);
let g16 = vsubq_s16(t3, vshrq_n_s16::<1>(t2));
let b16 = vaddq_s16(t3, vshlq_n_s16::<1>(cbc));
let r8 = vqmovun_s16(r16);
let g8 = vqmovun_s16(g16);
let b8 = vqmovun_s16(b16);
vst4_u8(outp.add(off * 4), uint8x8x4_t(r8, g8, b8, alpha));
}
for col in (full8 * 8)..w {
let y = normalize(*yp.add(col));
let b = normalize(*cbp.add(col / 2));
let r = normalize(*crp.add(col / 2));
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
*outp.add(col * 4) = (y + 128 + t2).clamp(0, 255) as u8;
*outp.add(col * 4 + 1) = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 2) = (t3 + (b << 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 3) = 255;
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "avx2")]
unsafe fn ycbcr_avx2_raw(
yp: *const i16,
cbp: *const i16,
crp: *const i16,
outp: *mut u8,
w: usize,
) {
use core::arch::x86_64::*;
let n_min = _mm256_set1_epi16(-128);
let n_max = _mm256_set1_epi16(127);
let c128 = _mm256_set1_epi16(128);
let one = _mm256_set1_epi16(1);
let full16 = w / 16;
for i in 0..full16 {
let off = i * 16;
let load_norm_clamp = |p: *const i16| -> __m256i {
let v = _mm256_loadu_si256(p as *const __m256i);
let high = _mm256_srai_epi16::<6>(v);
let bit5 = _mm256_and_si256(_mm256_srli_epi16::<5>(v), one);
let n = _mm256_add_epi16(high, bit5);
_mm256_max_epi16(_mm256_min_epi16(n, n_max), n_min)
};
let yc = load_norm_clamp(yp.add(off));
let cbc = load_norm_clamp(cbp.add(off));
let crc = load_norm_clamp(crp.add(off));
let y128 = _mm256_add_epi16(yc, c128);
let t2 = _mm256_add_epi16(crc, _mm256_srai_epi16::<1>(crc));
let t3 = _mm256_sub_epi16(y128, _mm256_srai_epi16::<2>(cbc));
let r16 = _mm256_add_epi16(y128, t2);
let g16 = _mm256_sub_epi16(t3, _mm256_srai_epi16::<1>(t2));
let b16 = _mm256_add_epi16(t3, _mm256_slli_epi16::<1>(cbc));
let r_pack = _mm_packus_epi16(
_mm256_castsi256_si128(r16),
_mm256_extracti128_si256::<1>(r16),
);
let g_pack = _mm_packus_epi16(
_mm256_castsi256_si128(g16),
_mm256_extracti128_si256::<1>(g16),
);
let b_pack = _mm_packus_epi16(
_mm256_castsi256_si128(b16),
_mm256_extracti128_si256::<1>(b16),
);
let a_pack = _mm_set1_epi8(-1i8);
let rg_lo = _mm_unpacklo_epi8(r_pack, g_pack);
let rg_hi = _mm_unpackhi_epi8(r_pack, g_pack);
let ba_lo = _mm_unpacklo_epi8(b_pack, a_pack);
let ba_hi = _mm_unpackhi_epi8(b_pack, a_pack);
let rgba0 = _mm_unpacklo_epi16(rg_lo, ba_lo);
let rgba1 = _mm_unpackhi_epi16(rg_lo, ba_lo);
let rgba2 = _mm_unpacklo_epi16(rg_hi, ba_hi);
let rgba3 = _mm_unpackhi_epi16(rg_hi, ba_hi);
let dst = outp.add(off * 4) as *mut __m128i;
_mm_storeu_si128(dst, rgba0);
_mm_storeu_si128(dst.add(1), rgba1);
_mm_storeu_si128(dst.add(2), rgba2);
_mm_storeu_si128(dst.add(3), rgba3);
}
for col in (full16 * 16)..w {
let y = normalize(*yp.add(col));
let b = normalize(*cbp.add(col));
let r = normalize(*crp.add(col));
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
*outp.add(col * 4) = (y + 128 + t2).clamp(0, 255) as u8;
*outp.add(col * 4 + 1) = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 2) = (t3 + (b << 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 3) = 255;
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "avx2")]
unsafe fn ycbcr_avx2_raw_half(
yp: *const i16,
cbp: *const i16,
crp: *const i16,
outp: *mut u8,
w: usize,
) {
use core::arch::x86_64::*;
let n_min = _mm256_set1_epi16(-128);
let n_max = _mm256_set1_epi16(127);
let c128 = _mm256_set1_epi16(128);
let one = _mm256_set1_epi16(1);
let norm_clamp = |v: __m256i| -> __m256i {
let high = _mm256_srai_epi16::<6>(v);
let bit5 = _mm256_and_si256(_mm256_srli_epi16::<5>(v), one);
let n = _mm256_add_epi16(high, bit5);
_mm256_max_epi16(_mm256_min_epi16(n, n_max), n_min)
};
let full16 = w / 16;
for i in 0..full16 {
let off = i * 16;
let c_off = i * 8;
let yv = _mm256_loadu_si256(yp.add(off) as *const __m256i);
let yc = norm_clamp(yv);
let upsample = |p: *const i16| -> __m256i {
let v8 = _mm_loadu_si128(p as *const __m128i);
let spread = _mm256_permute4x64_epi64::<0b00_01_00_00>(_mm256_castsi128_si256(v8));
_mm256_unpacklo_epi16(spread, spread)
};
let cbc = norm_clamp(upsample(cbp.add(c_off)));
let crc = norm_clamp(upsample(crp.add(c_off)));
let y128 = _mm256_add_epi16(yc, c128);
let t2 = _mm256_add_epi16(crc, _mm256_srai_epi16::<1>(crc));
let t3 = _mm256_sub_epi16(y128, _mm256_srai_epi16::<2>(cbc));
let r16 = _mm256_add_epi16(y128, t2);
let g16 = _mm256_sub_epi16(t3, _mm256_srai_epi16::<1>(t2));
let b16 = _mm256_add_epi16(t3, _mm256_slli_epi16::<1>(cbc));
let r_pack = _mm_packus_epi16(
_mm256_castsi256_si128(r16),
_mm256_extracti128_si256::<1>(r16),
);
let g_pack = _mm_packus_epi16(
_mm256_castsi256_si128(g16),
_mm256_extracti128_si256::<1>(g16),
);
let b_pack = _mm_packus_epi16(
_mm256_castsi256_si128(b16),
_mm256_extracti128_si256::<1>(b16),
);
let a_pack = _mm_set1_epi8(-1i8);
let rg_lo = _mm_unpacklo_epi8(r_pack, g_pack);
let rg_hi = _mm_unpackhi_epi8(r_pack, g_pack);
let ba_lo = _mm_unpacklo_epi8(b_pack, a_pack);
let ba_hi = _mm_unpackhi_epi8(b_pack, a_pack);
let rgba0 = _mm_unpacklo_epi16(rg_lo, ba_lo);
let rgba1 = _mm_unpackhi_epi16(rg_lo, ba_lo);
let rgba2 = _mm_unpacklo_epi16(rg_hi, ba_hi);
let rgba3 = _mm_unpackhi_epi16(rg_hi, ba_hi);
let dst = outp.add(off * 4) as *mut __m128i;
_mm_storeu_si128(dst, rgba0);
_mm_storeu_si128(dst.add(1), rgba1);
_mm_storeu_si128(dst.add(2), rgba2);
_mm_storeu_si128(dst.add(3), rgba3);
}
for col in (full16 * 16)..w {
let y = normalize(*yp.add(col));
let b = normalize(*cbp.add(col / 2));
let r = normalize(*crp.add(col / 2));
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
*outp.add(col * 4) = (y + 128 + t2).clamp(0, 255) as u8;
*outp.add(col * 4 + 1) = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 2) = (t3 + (b << 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 3) = 255;
}
}
#[cfg(target_arch = "wasm32")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn, dead_code)]
#[target_feature(enable = "simd128")]
unsafe fn ycbcr_simd128_raw(
yp: *const i16,
cbp: *const i16,
crp: *const i16,
outp: *mut u8,
w: usize,
) {
use core::arch::wasm32::*;
let n_min = i16x8_splat(-128);
let n_max = i16x8_splat(127);
let c128 = i16x8_splat(128);
let one = i16x8_splat(1);
let alpha_src = i16x8_splat(255);
let full8 = w / 8;
for i in 0..full8 {
let off = i * 8;
let load_norm_clamp = |p: *const i16| -> v128 {
let v = v128_load(p as *const v128);
let high = i16x8_shr(v, 6);
let bit5 = v128_and(u16x8_shr(v, 5), one);
let n = i16x8_add(high, bit5);
i16x8_max(i16x8_min(n, n_max), n_min)
};
let yc = load_norm_clamp(yp.add(off));
let cbc = load_norm_clamp(cbp.add(off));
let crc = load_norm_clamp(crp.add(off));
let y128 = i16x8_add(yc, c128);
let t2 = i16x8_add(crc, i16x8_shr(crc, 1));
let t3 = i16x8_sub(y128, i16x8_shr(cbc, 2));
let r16 = i16x8_add(y128, t2);
let g16 = i16x8_sub(t3, i16x8_shr(t2, 1));
let b16 = i16x8_add(t3, i16x8_shl(cbc, 1));
let v_rg = u8x16_narrow_i16x8(r16, g16);
let v_ba = u8x16_narrow_i16x8(b16, alpha_src);
let out0 =
i8x16_shuffle::<0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27>(v_rg, v_ba);
let out1 =
i8x16_shuffle::<4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31>(v_rg, v_ba);
v128_store(outp.add(off * 4) as *mut v128, out0);
v128_store(outp.add(off * 4 + 16) as *mut v128, out1);
}
for col in (full8 * 8)..w {
let y = normalize(*yp.add(col));
let b = normalize(*cbp.add(col));
let r = normalize(*crp.add(col));
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
*outp.add(col * 4) = (y + 128 + t2).clamp(0, 255) as u8;
*outp.add(col * 4 + 1) = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 2) = (t3 + (b << 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 3) = 255;
}
}
#[cfg(target_arch = "wasm32")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn, dead_code)]
#[target_feature(enable = "simd128")]
unsafe fn ycbcr_simd128_raw_half(
yp: *const i16,
cbp: *const i16,
crp: *const i16,
outp: *mut u8,
w: usize,
) {
use core::arch::wasm32::*;
let n_min = i16x8_splat(-128);
let n_max = i16x8_splat(127);
let c128 = i16x8_splat(128);
let one = i16x8_splat(1);
let alpha_src = i16x8_splat(255);
let full8 = w / 8;
for i in 0..full8 {
let off = i * 8;
let c_off = i * 4;
let load_norm_clamp = |p: *const i16| -> v128 {
let v = v128_load(p as *const v128);
let high = i16x8_shr(v, 6);
let bit5 = v128_and(u16x8_shr(v, 5), one);
let n = i16x8_add(high, bit5);
i16x8_max(i16x8_min(n, n_max), n_min)
};
let yc = load_norm_clamp(yp.add(off));
let load_norm_chroma_4 = |p: *const i16| -> v128 {
let v = v128_load64_zero(p as *const u64);
let high = i16x8_shr(v, 6);
let bit5 = v128_and(u16x8_shr(v, 5), one);
let n = i16x8_add(high, bit5);
i16x8_max(i16x8_min(n, n_max), n_min)
};
let cb4 = load_norm_chroma_4(cbp.add(c_off));
let cr4 = load_norm_chroma_4(crp.add(c_off));
let cbc = i8x16_shuffle::<0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7>(cb4, cb4);
let crc = i8x16_shuffle::<0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7>(cr4, cr4);
let y128 = i16x8_add(yc, c128);
let t2 = i16x8_add(crc, i16x8_shr(crc, 1));
let t3 = i16x8_sub(y128, i16x8_shr(cbc, 2));
let r16 = i16x8_add(y128, t2);
let g16 = i16x8_sub(t3, i16x8_shr(t2, 1));
let b16 = i16x8_add(t3, i16x8_shl(cbc, 1));
let v_rg = u8x16_narrow_i16x8(r16, g16);
let v_ba = u8x16_narrow_i16x8(b16, alpha_src);
let out0 =
i8x16_shuffle::<0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27>(v_rg, v_ba);
let out1 =
i8x16_shuffle::<4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31>(v_rg, v_ba);
v128_store(outp.add(off * 4) as *mut v128, out0);
v128_store(outp.add(off * 4 + 16) as *mut v128, out1);
}
for col in (full8 * 8)..w {
let y = normalize(*yp.add(col));
let b = normalize(*cbp.add(col / 2));
let r = normalize(*crp.add(col / 2));
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
*outp.add(col * 4) = (y + 128 + t2).clamp(0, 255) as u8;
*outp.add(col * 4 + 1) = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 2) = (t3 + (b << 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 3) = 255;
}
}
#[cfg(target_arch = "aarch64")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "neon")]
unsafe fn ycbcr_neon(yp: *const i32, cbp: *const i32, crp: *const i32, outp: *mut u8, w: usize) {
use core::arch::aarch64::*;
let c128 = vdupq_n_s32(128);
let c0 = vdupq_n_s32(0);
let c255 = vdupq_n_s32(255);
let alpha = vdup_n_u8(255);
let full8 = w / 8;
for i in 0..full8 {
let off = i * 8;
let y_lo = vld1q_s32(yp.add(off));
let y_hi = vld1q_s32(yp.add(off + 4));
let cb_lo = vld1q_s32(cbp.add(off));
let cb_hi = vld1q_s32(cbp.add(off + 4));
let cr_lo = vld1q_s32(crp.add(off));
let cr_hi = vld1q_s32(crp.add(off + 4));
let t2_lo = vaddq_s32(cr_lo, vshrq_n_s32::<1>(cr_lo));
let t2_hi = vaddq_s32(cr_hi, vshrq_n_s32::<1>(cr_hi));
let t3_lo = vsubq_s32(vaddq_s32(y_lo, c128), vshrq_n_s32::<2>(cb_lo));
let t3_hi = vsubq_s32(vaddq_s32(y_hi, c128), vshrq_n_s32::<2>(cb_hi));
let r_lo = vminq_s32(vmaxq_s32(vaddq_s32(vaddq_s32(y_lo, c128), t2_lo), c0), c255);
let r_hi = vminq_s32(vmaxq_s32(vaddq_s32(vaddq_s32(y_hi, c128), t2_hi), c0), c255);
let g_lo = vminq_s32(
vmaxq_s32(vsubq_s32(t3_lo, vshrq_n_s32::<1>(t2_lo)), c0),
c255,
);
let g_hi = vminq_s32(
vmaxq_s32(vsubq_s32(t3_hi, vshrq_n_s32::<1>(t2_hi)), c0),
c255,
);
let b_lo = vminq_s32(
vmaxq_s32(vaddq_s32(t3_lo, vshlq_n_s32::<1>(cb_lo)), c0),
c255,
);
let b_hi = vminq_s32(
vmaxq_s32(vaddq_s32(t3_hi, vshlq_n_s32::<1>(cb_hi)), c0),
c255,
);
let r8 = vqmovun_s16(vcombine_s16(vmovn_s32(r_lo), vmovn_s32(r_hi)));
let g8 = vqmovun_s16(vcombine_s16(vmovn_s32(g_lo), vmovn_s32(g_hi)));
let b8 = vqmovun_s16(vcombine_s16(vmovn_s32(b_lo), vmovn_s32(b_hi)));
vst4_u8(outp.add(off * 4), uint8x8x4_t(r8, g8, b8, alpha));
}
for col in (full8 * 8)..w {
let y = *yp.add(col);
let b = *cbp.add(col);
let r = *crp.add(col);
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
*outp.add(col * 4) = (y + 128 + t2).clamp(0, 255) as u8;
*outp.add(col * 4 + 1) = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 2) = (t3 + (b << 1)).clamp(0, 255) as u8;
*outp.add(col * 4 + 3) = 255;
}
}
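// Decoder state for a single image plane. Coefficients live in 32x32
// blocks kept in zigzag order; `curband` selects which of the ten bands
// the next slice refines, and the `ctx_*` arrays are adaptive ZP-coder
// contexts shared across all blocks of the plane.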
#[derive(Clone, Debug)]
struct PlaneDecoder {
width: usize,
height: usize,
block_cols: usize,
blocks: Vec<[i16; 1024]>,
quant_lo: [u32; 16],
quant_hi: [u32; 10],
curband: usize,
ctx_decode_bucket: [u8; 1],
ctx_decode_coef: [u8; 80],
ctx_activate_coef: [u8; 16],
ctx_increase_coef: [u8; 1],
coeffstate: [[u8; 16]; 16],
bucketstate: [u8; 16],
bbstate: u8,
}
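// Preliminary flags for one 16-coefficient bucket: ACTIVE where the
// coefficient is already nonzero, UNK where it is still zero. Returns the
// OR of all sixteen flags; dispatches to NEON or AVX2 when available.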
#[allow(unsafe_code)]
#[inline(always)]
fn prelim_flags_bucket(block: &[i16; 1024], base: usize, bucket: &mut [u8; 16]) -> u8 {
#[cfg(target_arch = "aarch64")]
return unsafe { prelim_flags_bucket_neon(block, base, bucket) };
#[cfg(all(target_arch = "x86_64", feature = "std"))]
{
if std::is_x86_feature_detected!("avx2") {
return unsafe { prelim_flags_bucket_avx2(block, base, bucket) };
}
}
#[cfg_attr(target_arch = "aarch64", allow(unreachable_code))]
{
let mut bstate = 0u8;
for k in 0..16 {
let f = if block[base + k] == 0 { UNK } else { ACTIVE };
bucket[k] = f;
bstate |= f;
}
bstate
}
}
#[cfg(target_arch = "aarch64")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "neon")]
unsafe fn prelim_flags_bucket_neon(block: &[i16; 1024], base: usize, bucket: &mut [u8; 16]) -> u8 {
use core::arch::aarch64::*;
let ptr = block.as_ptr().add(base);
let c0 = vreinterpretq_u16_s16(vld1q_s16(ptr));
let c1 = vreinterpretq_u16_s16(vld1q_s16(ptr.add(8)));
let zero = vdupq_n_u16(0);
let nz0 = vmvnq_u16(vceqq_u16(c0, zero));
let nz1 = vmvnq_u16(vceqq_u16(c1, zero));
let xv = vdupq_n_u16(10);
let uv = vdupq_n_u16(8);
let r0 = veorq_u16(uv, vandq_u16(xv, nz0));
let r1 = veorq_u16(uv, vandq_u16(xv, nz1));
let out = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
vst1q_u8(bucket.as_mut_ptr(), out);
let lo = vget_low_u8(out);
let hi = vget_high_u8(out);
let v4 = vorr_u8(lo, hi);
let v2 = vorr_u8(v4, vext_u8::<4>(v4, v4));
let v1 = vorr_u8(v2, vext_u8::<2>(v2, v2));
let v0 = vorr_u8(v1, vext_u8::<1>(v1, v1));
vget_lane_u8::<0>(v0)
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "avx2")]
unsafe fn prelim_flags_bucket_avx2(block: &[i16; 1024], base: usize, bucket: &mut [u8; 16]) -> u8 {
use core::arch::x86_64::*;
let coefs = _mm256_loadu_si256(block.as_ptr().add(base) as *const __m256i);
let zero = _mm256_setzero_si256();
let eq = _mm256_cmpeq_epi16(coefs, zero);
let all_ones = _mm256_cmpeq_epi16(zero, zero);
let nz = _mm256_xor_si256(eq, all_ones);
let xv = _mm256_set1_epi16(10);
let uv = _mm256_set1_epi16(8);
let r16 = _mm256_xor_si256(uv, _mm256_and_si256(xv, nz));
let r_lo = _mm256_castsi256_si128(r16);
let r_hi = _mm256_extracti128_si256::<1>(r16);
let packed = _mm_packus_epi16(r_lo, r_hi);
_mm_storeu_si128(bucket.as_mut_ptr() as *mut __m128i, packed);
let or64 = _mm_or_si128(packed, _mm_unpackhi_epi64(packed, packed));
let or32 = _mm_or_si128(or64, _mm_srli_si128::<4>(or64));
let or16_red = _mm_or_si128(or32, _mm_srli_si128::<2>(or32));
let or8 = _mm_or_si128(or16_red, _mm_srli_si128::<1>(or16_red));
_mm_extract_epi8::<0>(or8) as u8
}
#[cfg(target_arch = "aarch64")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "neon")]
unsafe fn prelim_flags_band0_neon(block: &[i16; 1024], old_flags: &mut [u8; 16]) -> u8 {
use core::arch::aarch64::*;
let old_u8 = vld1q_u8(old_flags.as_ptr());
let one_u8 = vdupq_n_u8(1);
let is_zero_state = vceqq_u8(old_u8, one_u8);
let should_update = vmvnq_u8(is_zero_state);
let ptr = block.as_ptr();
let c0 = vreinterpretq_u16_s16(vld1q_s16(ptr));
let c1 = vreinterpretq_u16_s16(vld1q_s16(ptr.add(8)));
let zero16 = vdupq_n_u16(0);
let nz0 = vmvnq_u16(vceqq_u16(c0, zero16));
let nz1 = vmvnq_u16(vceqq_u16(c1, zero16));
let xv = vdupq_n_u16(10);
let uv = vdupq_n_u16(8);
let r0 = veorq_u16(uv, vandq_u16(xv, nz0));
let r1 = veorq_u16(uv, vandq_u16(xv, nz1));
let new_flags = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
let result = vbslq_u8(should_update, new_flags, old_u8);
vst1q_u8(old_flags.as_mut_ptr(), result);
let lo = vget_low_u8(result);
let hi = vget_high_u8(result);
let v4 = vorr_u8(lo, hi);
let v2 = vorr_u8(v4, vext_u8::<4>(v4, v4));
let v1 = vorr_u8(v2, vext_u8::<2>(v2, v2));
let v0 = vorr_u8(v1, vext_u8::<1>(v1, v1));
vget_lane_u8::<0>(v0)
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "avx2")]
unsafe fn prelim_flags_band0_avx2(block: &[i16; 1024], old_flags: &mut [u8; 16]) -> u8 {
use core::arch::x86_64::*;
let old_u8 = _mm_loadu_si128(old_flags.as_ptr() as *const __m128i);
let one_u8 = _mm_set1_epi8(1);
let is_zero_state = _mm_cmpeq_epi8(old_u8, one_u8);
let all_ones_128 = _mm_cmpeq_epi8(old_u8, old_u8);
let should_update = _mm_xor_si128(is_zero_state, all_ones_128);
let coefs = _mm256_loadu_si256(block.as_ptr() as *const __m256i);
let zero = _mm256_setzero_si256();
let eq = _mm256_cmpeq_epi16(coefs, zero);
let all_ones_256 = _mm256_cmpeq_epi16(zero, zero);
let nz = _mm256_xor_si256(eq, all_ones_256);
let xv = _mm256_set1_epi16(10);
let uv = _mm256_set1_epi16(8);
let r16 = _mm256_xor_si256(uv, _mm256_and_si256(xv, nz));
let r_lo = _mm256_castsi256_si128(r16);
let r_hi = _mm256_extracti128_si256::<1>(r16);
let new_flags = _mm_packus_epi16(r_lo, r_hi);
let blended = _mm_or_si128(
_mm_and_si128(should_update, new_flags),
_mm_andnot_si128(should_update, old_u8),
);
_mm_storeu_si128(old_flags.as_mut_ptr() as *mut __m128i, blended);
let or64 = _mm_or_si128(blended, _mm_unpackhi_epi64(blended, blended));
let or32 = _mm_or_si128(or64, _mm_srli_si128::<4>(or64));
let or16_red = _mm_or_si128(or32, _mm_srli_si128::<2>(or32));
let or8 = _mm_or_si128(or16_red, _mm_srli_si128::<1>(or16_red));
_mm_extract_epi8::<0>(or8) as u8
}
#[allow(unsafe_code)]
#[inline(always)]
fn band0_dispatch(block: &[i16; 1024], old_flags: &mut [u8; 16]) -> u8 {
#[cfg(target_arch = "aarch64")]
return unsafe { prelim_flags_band0_neon(block, old_flags) };
#[cfg(all(target_arch = "x86_64", feature = "std"))]
{
if std::is_x86_feature_detected!("avx2") {
return unsafe { prelim_flags_band0_avx2(block, old_flags) };
}
}
#[cfg_attr(target_arch = "aarch64", allow(unreachable_code))]
{
let mut b = 0u8;
for k in 0..16 {
if old_flags[k] != ZERO {
old_flags[k] = if block[k] == 0 { UNK } else { ACTIVE };
}
b |= old_flags[k];
}
b
}
}
impl PlaneDecoder {
fn new(width: usize, height: usize) -> Self {
let block_cols = width.div_ceil(32);
let block_rows = height.div_ceil(32);
let block_count = block_cols * block_rows;
PlaneDecoder {
width,
height,
block_cols,
blocks: vec![[0i16; 1024]; block_count],
quant_lo: QUANT_LO_INIT,
quant_hi: QUANT_HI_INIT,
curband: 0,
ctx_decode_bucket: [0; 1],
ctx_decode_coef: [0; 80],
ctx_activate_coef: [0; 16],
ctx_increase_coef: [0; 1],
coeffstate: [[0; 16]; 16],
bucketstate: [0; 16],
bbstate: 0,
}
}
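// Decode one refinement slice across every block of the plane, running
// the IW44 passes in order: preliminary flag computation, block band
// decoding, bucket decoding, then the newly-active and previously-active
// coefficient passes. `finish_slice` halves the quantization thresholds
// and advances `curband` afterwards.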
fn decode_slice(&mut self, zp: &mut ZpDecoder<'_>) {
if !self.is_null_slice() {
for block_idx in 0..self.blocks.len() {
self.preliminary_flag_computation(block_idx);
if self.block_band_decoding_pass(zp) && self.bucket_decoding_pass(zp, block_idx) {
self.newly_active_coefficient_decoding_pass(zp, block_idx);
}
if (self.bbstate & ACTIVE) != 0 {
self.previously_active_coefficient_decoding_pass(zp, block_idx);
}
}
}
self.finish_slice();
}
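// A slice codes nothing when the current band's threshold is either
// exhausted (zero) or still at or above 0x8000. For band 0 this is
// checked per coefficient, seeding `coeffstate` as a side effect.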
fn is_null_slice(&mut self) -> bool {
if self.curband == 0 {
let mut is_null = true;
for i in 0..16 {
let threshold = self.quant_lo[i];
self.coeffstate[0][i] = ZERO;
if threshold > 0 && threshold < 0x8000 {
self.coeffstate[0][i] = UNK;
is_null = false;
}
}
is_null
} else {
let threshold = self.quant_hi[self.curband];
!(threshold > 0 && threshold < 0x8000)
}
}
fn preliminary_flag_computation(&mut self, block_idx: usize) {
self.bbstate = 0;
let (from, to) = BAND_BUCKETS[self.curband];
if self.curband != 0 {
for (boff, j) in (from..=to).enumerate() {
let bstatetmp = prelim_flags_bucket(
&self.blocks[block_idx],
j << 4,
&mut self.coeffstate[boff],
);
self.bucketstate[boff] = bstatetmp;
self.bbstate |= bstatetmp;
}
} else {
let bstatetmp = band0_dispatch(&self.blocks[block_idx], &mut self.coeffstate[0]);
self.bucketstate[0] = bstatetmp;
self.bbstate |= bstatetmp;
}
}
fn block_band_decoding_pass(&mut self, zp: &mut ZpDecoder<'_>) -> bool {
let (from, to) = BAND_BUCKETS[self.curband];
let bcount = to - from + 1;
let should_mark_new = bcount < 16
|| (self.bbstate & ACTIVE) != 0
|| ((self.bbstate & UNK) != 0 && zp.decode_bit(&mut self.ctx_decode_bucket[0]));
if should_mark_new {
self.bbstate |= NEW;
}
(self.bbstate & NEW) != 0
}
fn bucket_decoding_pass(&mut self, zp: &mut ZpDecoder<'_>, block_idx: usize) -> bool {
let (from, to) = BAND_BUCKETS[self.curband];
let mut any_new = false;
for (boff, i) in (from..=to).enumerate() {
if (self.bucketstate[boff] & UNK) == 0 {
continue;
}
let mut n: usize = 0;
if self.curband != 0 {
let t = 4 * i;
for j in t..t + 4 {
if self.blocks[block_idx][j] != 0 {
n += 1;
}
}
if n == 4 {
n = 3;
}
}
if (self.bbstate & ACTIVE) != 0 {
n |= 4;
}
if zp.decode_bit(&mut self.ctx_decode_coef[n + self.curband * 8]) {
self.bucketstate[boff] |= NEW;
any_new = true;
}
}
any_new
}
fn newly_active_coefficient_decoding_pass(&mut self, zp: &mut ZpDecoder<'_>, block_idx: usize) {
let (from, to) = BAND_BUCKETS[self.curband];
let mut step = self.quant_hi[self.curband];
for (boff, i) in (from..=to).enumerate() {
if (self.bucketstate[boff] & NEW) != 0 {
let shift: usize = if (self.bucketstate[boff] & ACTIVE) != 0 {
8
} else {
0
};
let mut np: usize = 0;
for j in 0..16 {
if (self.coeffstate[boff][j] & UNK) != 0 {
np += 1;
}
}
for j in 0..16 {
if (self.coeffstate[boff][j] & UNK) != 0 {
let ip = np.min(7);
if zp.decode_bit(&mut self.ctx_activate_coef[shift + ip]) {
let sign = if zp.decode_passthrough_iw44() {
-1i32
} else {
1i32
};
np = 0;
if self.curband == 0 {
step = self.quant_lo[j];
}
let s = step as i32;
let val = sign * (s + (s >> 1) - (s >> 3));
self.blocks[block_idx][(i << 4) | j] = val as i16;
}
np = np.saturating_sub(1);
}
}
}
}
}
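// Refine coefficients that were already active in earlier slices. The ZP
// decoder state is copied into locals and the bit decoding is inlined via
// macros mirroring `ZpDecoder::decode_bit` and `decode_passthrough_iw44`,
// avoiding per-bit call overhead in this hot loop.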
#[inline(never)]
fn previously_active_coefficient_decoding_pass(
&mut self,
zp: &mut ZpDecoder<'_>,
block_idx: usize,
) {
use crate::zp_impl::tables::{LPS_NEXT, MPS_NEXT, PROB, THRESHOLD};
let mut a = zp.a;
let mut c = zp.c;
let mut fence = zp.fence;
let mut bit_buf = zp.bit_buf;
let mut bit_count = zp.bit_count;
let data = zp.data;
let mut pos = zp.pos;
macro_rules! read_byte {
() => {{
let b = if pos < data.len() { data[pos] } else { 0xff };
pos = pos.wrapping_add(1);
b as u32
}};
}
macro_rules! refill {
() => {
while bit_count <= 24 {
bit_buf = (bit_buf << 8) | read_byte!();
bit_count += 8;
}
};
}
macro_rules! renorm {
() => {{
let shift = (a as u16).leading_ones();
bit_count -= shift as i32;
a = (a << shift) & 0xffff;
let mask = (1u32 << (shift & 31)).wrapping_sub(1);
c = ((c << shift) | (bit_buf >> (bit_count as u32 & 31)) & mask) & 0xffff;
if bit_count < 16 {
refill!();
}
fence = c.min(0x7fff);
}};
}
macro_rules! decode_bit_ctx {
($ctx:expr) => {{
let state = ($ctx) as usize;
let mps_bit = state & 1;
let z = a + PROB[state] as u32;
if z <= fence {
a = z;
mps_bit != 0
} else {
let boundary = 0x6000u32 + ((a + z) >> 2);
let z_clamped = z.min(boundary);
if z_clamped > c {
let complement = 0x10000u32 - z_clamped;
a = (a + complement) & 0xffff;
c = (c + complement) & 0xffff;
$ctx = LPS_NEXT[state];
renorm!();
(1 - mps_bit) != 0
} else {
if a >= THRESHOLD[state] as u32 {
$ctx = MPS_NEXT[state];
}
bit_count -= 1;
a = (z_clamped << 1) & 0xffff;
c = ((c << 1) | (bit_buf >> (bit_count as u32 & 31)) & 1) & 0xffff;
if bit_count < 16 {
refill!();
}
fence = c.min(0x7fff);
mps_bit != 0
}
}
}};
}
macro_rules! decode_passthrough_iw44 {
() => {{
let z = (0x8000u32 + (3u32 * a) / 8) as u16;
if z as u32 > c {
let complement = 0x10000u32 - z as u32;
a = (a + complement) & 0xffff;
c = (c + complement) & 0xffff;
renorm!();
true
} else {
bit_count -= 1;
a = (z as u32 * 2) & 0xffff;
c = (c << 1 | (bit_buf >> (bit_count as u32 & 31)) & 1) & 0xffff;
if bit_count < 16 {
refill!();
}
fence = c.min(0x7fff);
false
}
}};
}
let (from, to) = BAND_BUCKETS[self.curband];
let mut step = self.quant_hi[self.curband];
for (boff, i) in (from..=to).enumerate() {
for j in 0..16 {
if (self.coeffstate[boff][j] & ACTIVE) != 0 {
if self.curband == 0 {
step = self.quant_lo[j];
}
let coef = self.blocks[block_idx][(i << 4) | j];
let mut abs_coef = coef.unsigned_abs() as i32;
let s = step as i32;
let des = if abs_coef <= 3 * s {
let d = decode_bit_ctx!(self.ctx_increase_coef[0]);
abs_coef += s >> 2;
d
} else {
decode_passthrough_iw44!()
};
if des {
abs_coef += s >> 1;
} else {
abs_coef += -s + (s >> 1);
}
self.blocks[block_idx][(i << 4) | j] = if coef < 0 {
-abs_coef as i16
} else {
abs_coef as i16
};
}
}
}
zp.a = a;
zp.c = c;
zp.fence = fence;
zp.bit_buf = bit_buf;
zp.bit_count = bit_count;
zp.pos = pos;
}
fn finish_slice(&mut self) {
self.quant_hi[self.curband] >>= 1;
if self.curband == 0 {
for i in 0..16 {
self.quant_lo[i] >>= 1;
}
}
self.curband += 1;
if self.curband == 10 {
self.curband = 0;
}
}
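// Rebuild the spatial-domain plane: de-zigzag every block into a flat
// buffer, then run the inverse wavelet transform. For power-of-two
// subsampling factors (2, 4, 8) only every `sub`-th coefficient is
// materialized via the reduced inverse-zigzag tables, and the transform
// starts at a correspondingly smaller scale.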
fn reconstruct(&self, subsample: usize) -> FlatPlane {
if (2..=8).contains(&subsample) && subsample.is_power_of_two() {
let sub = subsample;
let block_rows = self.height.div_ceil(32);
let sub_block = 32 / sub;
let compact_stride = self.block_cols * sub_block;
let compact_rows = block_rows * sub_block;
let compact_w = self.width.div_ceil(sub);
let compact_h = self.height.div_ceil(sub);
#[allow(unsafe_code)]
let mut plane = FlatPlane {
data: unsafe { uninit_i16_vec(compact_stride * compact_rows) },
stride: compact_stride,
};
let compact_inv: &[u8] = match sub {
2 => &ZIGZAG_INV_SUB2,
4 => &ZIGZAG_INV_SUB4,
_ => &ZIGZAG_INV_SUB8,
};
#[allow(unsafe_code)]
for r in 0..block_rows {
for c in 0..self.block_cols {
let block = &self.blocks[r * self.block_cols + c];
let base_row = r * sub_block;
let base_col = c * sub_block;
for row in 0..sub_block {
let dst_base = (base_row + row) * compact_stride + base_col;
let inv_base = row * sub_block;
for col in 0..sub_block {
let i = unsafe { *compact_inv.get_unchecked(inv_base + col) } as usize;
unsafe {
*plane.data.get_unchecked_mut(dst_base + col) =
*block.get_unchecked(i);
}
}
}
}
}
let start_scale = 16 / sub;
inverse_wavelet_transform_from(&mut plane, compact_w, compact_h, 1, start_scale);
return plane;
}
let full_width = self.width.div_ceil(32) * 32;
let full_height = self.height.div_ceil(32) * 32;
let block_rows = self.height.div_ceil(32);
#[allow(unsafe_code)]
let mut plane = FlatPlane {
data: unsafe { uninit_i16_vec(full_width * full_height) },
stride: full_width,
};
for r in 0..block_rows {
for c in 0..self.block_cols {
let block = &self.blocks[r * self.block_cols + c];
let row_base = r << 5;
let col_base = c << 5;
for row in 0..32usize {
let dst_base = (row_base + row) * full_width + col_base;
let inv_base = row * 32;
for col in 0..32usize {
let i = ZIGZAG_INV[inv_base + col] as usize;
plane.data[dst_base + col] = block[i];
}
}
}
}
inverse_wavelet_transform(&mut plane, self.width, self.height, subsample);
plane
}
}
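// SAFETY: the returned Vec's contents are uninitialized; callers must
// write every element before reading. `reconstruct` fills the whole
// buffer immediately after allocation.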
#[allow(unsafe_code)]
unsafe fn uninit_i16_vec(n: usize) -> Vec<i16> {
use core::mem::MaybeUninit;
let mut v: Vec<MaybeUninit<i16>> = Vec::with_capacity(n);
unsafe { v.set_len(n) };
let mut md = core::mem::ManuallyDrop::new(v);
unsafe { Vec::from_raw_parts(md.as_mut_ptr().cast::<i16>(), md.len(), md.capacity()) }
}
struct FlatPlane {
data: Vec<i16>,
stride: usize,
}
use wide::i32x8;
#[inline(always)]
fn load8s(slice: &[i16], phys_off: usize, s: usize) -> i32x8 {
if s == 1 {
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
{
#[allow(unsafe_code)]
return unsafe { load8s_s1_avx2(slice, phys_off) };
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
#[allow(unsafe_code)]
return unsafe { load8s_s1_simd128(slice, phys_off) };
}
#[allow(unsafe_code, unreachable_code)]
return unsafe {
let arr: [i16; 8] = core::ptr::read(slice.as_ptr().add(phys_off) as *const [i16; 8]);
i32x8::from([
arr[0] as i32,
arr[1] as i32,
arr[2] as i32,
arr[3] as i32,
arr[4] as i32,
arr[5] as i32,
arr[6] as i32,
arr[7] as i32,
])
};
}
#[cfg(target_arch = "aarch64")]
if s == 2 || s == 4 {
#[allow(unsafe_code)]
return unsafe { load8s_neon(slice, phys_off, s) };
}
i32x8::from([
slice[phys_off] as i32,
slice[phys_off + s] as i32,
slice[phys_off + 2 * s] as i32,
slice[phys_off + 3 * s] as i32,
slice[phys_off + 4 * s] as i32,
slice[phys_off + 5 * s] as i32,
slice[phys_off + 6 * s] as i32,
slice[phys_off + 7 * s] as i32,
])
}
#[inline(always)]
fn store8s(slice: &mut [i16], phys_off: usize, s: usize, v: i32x8) {
if s == 1 {
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
{
#[allow(unsafe_code)]
return unsafe { store8s_s1_avx2(slice, phys_off, v) };
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
{
#[allow(unsafe_code)]
return unsafe { store8s_s1_simd128(slice, phys_off, v) };
}
#[allow(unsafe_code, unreachable_code)]
return unsafe {
let a = v.to_array();
let narrow: [i16; 8] = [
a[0] as i16,
a[1] as i16,
a[2] as i16,
a[3] as i16,
a[4] as i16,
a[5] as i16,
a[6] as i16,
a[7] as i16,
];
core::ptr::write(slice.as_mut_ptr().add(phys_off) as *mut [i16; 8], narrow);
};
}
#[cfg(target_arch = "aarch64")]
if s == 2 || s == 4 {
#[allow(unsafe_code)]
return unsafe { store8s_neon(slice, phys_off, s, v) };
}
let a = v.to_array();
for j in 0..8 {
slice[phys_off + j * s] = a[j] as i16;
}
}
#[cfg(target_arch = "aarch64")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "neon")]
unsafe fn load8s_neon(slice: &[i16], phys_off: usize, s: usize) -> i32x8 {
use core::arch::aarch64::*;
let ptr = slice.as_ptr().add(phys_off);
let target: int16x8_t = if s == 2 {
vld2q_s16(ptr).0
} else {
vld4q_s16(ptr).0
};
let lo = vmovl_s16(vget_low_s16(target));
let hi = vmovl_high_s16(target);
let arr = core::mem::transmute::<[int32x4_t; 2], [i32; 8]>([lo, hi]);
i32x8::from(arr)
}
// Compile-time AVX2 gate, matching the call site in `load8s`.
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn, dead_code)]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn load8s_s1_avx2(slice: &[i16], phys_off: usize) -> i32x8 {
use core::arch::x86_64::*;
let ptr = slice.as_ptr().add(phys_off) as *const __m128i;
let v16 = _mm_loadu_si128(ptr);
let v32 = _mm256_cvtepi16_epi32(v16);
let arr: [i32; 8] = core::mem::transmute(v32);
i32x8::from(arr)
}
// Compile-time AVX2 gate, matching the call site in `store8s`.
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn, dead_code)]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn store8s_s1_avx2(slice: &mut [i16], phys_off: usize, v: i32x8) {
use core::arch::x86_64::*;
let arr: [i32; 8] = v.to_array();
let v32: __m256i = core::mem::transmute(arr);
let shuf = _mm256_setr_epi8(
0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1,
-1, -1, -1, -1, -1, -1,
);
let shuffled = _mm256_shuffle_epi8(v32, shuf);
let permuted = _mm256_permute4x64_epi64::<0b00_00_10_00>(shuffled);
let result = _mm256_castsi256_si128(permuted);
let ptr = slice.as_mut_ptr().add(phys_off) as *mut __m128i;
_mm_storeu_si128(ptr, result);
}
#[cfg(target_arch = "aarch64")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "neon")]
unsafe fn store8s_neon(slice: &mut [i16], phys_off: usize, s: usize, v: i32x8) {
use core::arch::aarch64::*;
let ptr = slice.as_mut_ptr().add(phys_off);
let v_arr = core::mem::transmute::<[i32; 8], [int32x4_t; 2]>(v.to_array());
let new_vals = vcombine_s16(vmovn_s32(v_arr[0]), vmovn_s32(v_arr[1]));
let a: [i16; 8] = core::mem::transmute(new_vals);
for (j, &val) in a.iter().enumerate() {
*ptr.add(j * s) = val;
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn, dead_code)]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn load8s_s1_simd128(slice: &[i16], phys_off: usize) -> i32x8 {
use core::arch::wasm32::*;
let v16 = v128_load(slice.as_ptr().add(phys_off) as *const v128);
let lo = i32x4_extend_low_i16x8(v16);
let hi = i32x4_extend_high_i16x8(v16);
core::mem::transmute::<[v128; 2], i32x8>([lo, hi])
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn, dead_code)]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn store8s_s1_simd128(slice: &mut [i16], phys_off: usize, v: i32x8) {
use core::arch::wasm32::*;
let [lo, hi]: [v128; 2] = core::mem::transmute(v);
let out = i8x16_shuffle::<0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29>(lo, hi);
v128_store(slice.as_mut_ptr().add(phys_off) as *mut v128, out);
}
#[inline(always)]
#[allow(unsafe_code)]
fn load8_i32(slice: &[i32], off: usize) -> i32x8 {
unsafe {
i32x8::from([
*slice.get_unchecked(off),
*slice.get_unchecked(off + 1),
*slice.get_unchecked(off + 2),
*slice.get_unchecked(off + 3),
*slice.get_unchecked(off + 4),
*slice.get_unchecked(off + 5),
*slice.get_unchecked(off + 6),
*slice.get_unchecked(off + 7),
])
}
}
#[inline(always)]
#[allow(unsafe_code)]
fn store8_i32(slice: &mut [i32], off: usize, v: i32x8) {
let a = v.to_array();
unsafe {
*slice.get_unchecked_mut(off) = a[0];
*slice.get_unchecked_mut(off + 1) = a[1];
*slice.get_unchecked_mut(off + 2) = a[2];
*slice.get_unchecked_mut(off + 3) = a[3];
*slice.get_unchecked_mut(off + 4) = a[4];
*slice.get_unchecked_mut(off + 5) = a[5];
*slice.get_unchecked_mut(off + 6) = a[6];
*slice.get_unchecked_mut(off + 7) = a[7];
}
}
#[inline(always)]
#[allow(unsafe_code)]
fn load_rows8(data: &[i16], offs: &[usize; 8], k: usize) -> i32x8 {
unsafe {
i32x8::from([
*data.get_unchecked(offs[0] + k) as i32,
*data.get_unchecked(offs[1] + k) as i32,
*data.get_unchecked(offs[2] + k) as i32,
*data.get_unchecked(offs[3] + k) as i32,
*data.get_unchecked(offs[4] + k) as i32,
*data.get_unchecked(offs[5] + k) as i32,
*data.get_unchecked(offs[6] + k) as i32,
*data.get_unchecked(offs[7] + k) as i32,
])
}
}
#[inline(always)]
#[allow(unsafe_code)]
fn store_rows8(data: &mut [i16], offs: &[usize; 8], k: usize, v: i32x8) {
let a = v.to_array();
unsafe {
*data.get_unchecked_mut(offs[0] + k) = a[0] as i16;
*data.get_unchecked_mut(offs[1] + k) = a[1] as i16;
*data.get_unchecked_mut(offs[2] + k) = a[2] as i16;
*data.get_unchecked_mut(offs[3] + k) = a[3] as i16;
*data.get_unchecked_mut(offs[4] + k) = a[4] as i16;
*data.get_unchecked_mut(offs[5] + k) = a[5] as i16;
*data.get_unchecked_mut(offs[6] + k) = a[6] as i16;
*data.get_unchecked_mut(offs[7] + k) = a[7] as i16;
}
}
#[allow(unsafe_code)]
const C16: i32x8 = unsafe { core::mem::transmute([16i32; 8]) };
#[allow(unsafe_code)]
const C8: i32x8 = unsafe { core::mem::transmute([8i32; 8]) };
#[allow(unsafe_code)]
const C1: i32x8 = unsafe { core::mem::transmute([1i32; 8]) };
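// Lifting steps of the inverse IW44 wavelet, eight lanes at a time:
// `lifting_even` undoes the update step on even samples
// (cur - (9*(p1 + n1) - (p3 + n3) + 16) >> 5), `predict_inner` restores
// interior odd samples from the four nearest even neighbors, and
// `predict_avg` handles odd samples near a border with a rounded average.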
#[inline(always)]
fn lifting_even(cur: i32x8, p1: i32x8, n1: i32x8, p3: i32x8, n3: i32x8) -> i32x8 {
let a = p1 + n1;
let c = p3 + n3;
cur - (((a << 3) + a - c + C16) >> 5)
}
#[inline(always)]
fn predict_inner(cur: i32x8, p1: i32x8, n1: i32x8, p3: i32x8, n3: i32x8) -> i32x8 {
let a = p1 + n1;
cur + (((a << 3) + a - (p3 + n3) + C8) >> 4)
}
#[inline(always)]
fn predict_avg(cur: i32x8, p: i32x8, n: i32x8) -> i32x8 {
cur + ((p + n + C1) >> 1)
}
#[cfg(target_arch = "aarch64")]
#[allow(unsafe_code, unsafe_op_in_unsafe_fn)]
#[target_feature(enable = "neon")]
unsafe fn row_pass_neon_s1_row(data: &mut [i16], row_off: usize, width: usize) {
use core::arch::aarch64::*;
let kmax = width - 1;
let border = kmax.saturating_sub(3);
let ptr = data.as_mut_ptr().add(row_off);
let even_chunks = if width >= 32 { (width - 31) / 16 } else { 0 };
let mut prev_odd = vdupq_n_s16(0i16);
for chunk in 0..even_chunks {
let curr_pair = vld2q_s16(ptr.add(chunk * 16) as *const i16);
let next_pair = vld2q_s16(ptr.add((chunk + 1) * 16) as *const i16);
let curr_even = curr_pair.0;
let curr_odd = curr_pair.1;
let next_odd = next_pair.1;
let p1 = vextq_s16::<7>(prev_odd, curr_odd);
let n1 = curr_odd;
let p3 = vextq_s16::<6>(prev_odd, curr_odd);
let n3 = vextq_s16::<1>(curr_odd, next_odd);
macro_rules! lift {
($ce:expr, $p1:expr, $n1:expr, $p3:expr, $n3:expr) => {{
let a = vaddq_s32($p1, $n1);
let c = vaddq_s32($p3, $n3);
let nine_a = vaddq_s32(vshlq_n_s32::<3>(a), a);
let delta = vshrq_n_s32::<5>(vsubq_s32(vaddq_s32(nine_a, vdupq_n_s32(16i32)), c));
vsubq_s32($ce, delta)
}};
}
let new_lo = lift!(
vmovl_s16(vget_low_s16(curr_even)),
vmovl_s16(vget_low_s16(p1)),
vmovl_s16(vget_low_s16(n1)),
vmovl_s16(vget_low_s16(p3)),
vmovl_s16(vget_low_s16(n3))
);
let new_hi = lift!(
vmovl_high_s16(curr_even),
vmovl_high_s16(p1),
vmovl_high_s16(n1),
vmovl_high_s16(p3),
vmovl_high_s16(n3)
);
let new_evens = vcombine_s16(vmovn_s32(new_lo), vmovn_s32(new_hi));
vst2q_s16(ptr.add(chunk * 16), int16x8x2_t(new_evens, curr_odd));
prev_odd = curr_odd;
}
{
let k_start = even_chunks * 16;
let mut prev1 = if even_chunks > 0 {
vgetq_lane_s16::<6>(prev_odd) as i32
} else {
0
};
let mut next1 = if even_chunks > 0 {
vgetq_lane_s16::<7>(prev_odd) as i32
} else {
0
};
let mut next3 = if k_start < kmax {
*data.get_unchecked(row_off + k_start + 1) as i32
} else {
0
};
let mut k = k_start;
while k <= kmax {
let prev3 = prev1;
prev1 = next1;
next1 = next3;
next3 = if k + 3 <= kmax {
*data.get_unchecked(row_off + k + 3) as i32
} else {
0
};
let a = prev1 + next1;
let c = prev3 + next3;
let idx = row_off + k;
*data.get_unchecked_mut(idx) =
(*data.get_unchecked(idx) as i32 - (((a << 3) + a - c + 16) >> 5)) as i16;
k += 2;
}
}
if kmax < 1 {
return;
}
{
let p1 = *data.get_unchecked(row_off) as i32;
let idx1 = row_off + 1;
if 1 < kmax {
let n1 = *data.get_unchecked(row_off + 2) as i32;
*data.get_unchecked_mut(idx1) =
(*data.get_unchecked(idx1) as i32 + ((p1 + n1 + 1) >> 1)) as i16;
} else {
*data.get_unchecked_mut(idx1) = (*data.get_unchecked(idx1) as i32 + p1) as i16;
}
}
let odd_chunks = if kmax >= 20 {
even_chunks.min((kmax - 20) / 16 + 1)
} else {
0
};
for chunk in 0..odd_chunks {
let pair1 = vld2q_s16(ptr.add(chunk * 16) as *const i16);
let pair2 = vld2q_s16(ptr.add((chunk + 1) * 16) as *const i16);
let curr_odds = vextq_s16::<1>(pair1.1, pair2.1);
let p3_e = pair1.0;
let p1_e = vextq_s16::<1>(pair1.0, pair2.0);
let n1_e = vextq_s16::<2>(pair1.0, pair2.0);
let n3_e = vextq_s16::<3>(pair1.0, pair2.0);
macro_rules! predict {
($co:expr, $p1:expr, $n1:expr, $p3:expr, $n3:expr) => {{
let a = vaddq_s32($p1, $n1);
let c = vaddq_s32($p3, $n3);
let nine_a = vaddq_s32(vshlq_n_s32::<3>(a), a);
let delta = vshrq_n_s32::<4>(vsubq_s32(vaddq_s32(nine_a, vdupq_n_s32(8i32)), c));
vaddq_s32($co, delta)
}};
}
let new_lo = predict!(
vmovl_s16(vget_low_s16(curr_odds)),
vmovl_s16(vget_low_s16(p1_e)),
vmovl_s16(vget_low_s16(n1_e)),
vmovl_s16(vget_low_s16(p3_e)),
vmovl_s16(vget_low_s16(n3_e))
);
let new_hi = predict!(
vmovl_high_s16(curr_odds),
vmovl_high_s16(p1_e),
vmovl_high_s16(n1_e),
vmovl_high_s16(p3_e),
vmovl_high_s16(n3_e)
);
let new_odds = vcombine_s16(vmovn_s32(new_lo), vmovn_s32(new_hi));
vst2q_s16(ptr.add(chunk * 16 + 2), int16x8x2_t(p1_e, new_odds));
}
if kmax >= 3 {
let k_scalar = 3 + odd_chunks * 16;
let mut prev1 = *data.get_unchecked(row_off + k_scalar - 3) as i32;
let mut next1 = *data.get_unchecked(row_off + k_scalar - 1) as i32;
let mut next3 = if k_scalar < kmax {
*data.get_unchecked(row_off + k_scalar + 1) as i32
} else {
0
};
let mut k = k_scalar;
while k <= kmax {
let prev3 = prev1;
prev1 = next1;
next1 = next3;
next3 = if k + 3 <= kmax {
*data.get_unchecked(row_off + k + 3) as i32
} else {
0
};
let idx = row_off + k;
if k <= border {
let a = prev1 + next1;
let c = prev3 + next3;
*data.get_unchecked_mut(idx) =
(*data.get_unchecked(idx) as i32 + (((a << 3) + a - c + 8) >> 4)) as i16;
} else if k < kmax {
*data.get_unchecked_mut(idx) =
(*data.get_unchecked(idx) as i32 + ((prev1 + next1 + 1) >> 1)) as i16;
} else {
*data.get_unchecked_mut(idx) = (*data.get_unchecked(idx) as i32 + prev1) as i16;
}
k += 2;
}
}
}
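// Horizontal (row) pass of the inverse transform at scale `s`
// (`sd = log2(s)`): the even-sample lifting step runs first, then the
// odd-sample prediction step, falling back to `predict_avg` or a plain
// copy at the row borders. When `use_simd` is set, eight rows are
// processed together via `wide::i32x8`, with a NEON fast path for s == 1.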
pub(crate) fn row_pass_inner(
data: &mut [i16],
width: usize,
height: usize,
stride: usize,
s: usize,
sd: usize,
use_simd: bool,
) {
#[cfg(target_arch = "aarch64")]
if use_simd && s == 1 {
for row in (0..height).step_by(s) {
#[allow(unsafe_code)]
unsafe {
row_pass_neon_s1_row(data, row * stride, width);
}
}
return;
}
let kmax = (width - 1) >> sd;
let border = kmax.saturating_sub(3);
let simd_active = if use_simd { height / s / 8 * 8 } else { 0 };
let simd_rows = simd_active * s;
for group in 0..simd_active / 8 {
let row_base = group * 8 * s;
let o: [usize; 8] = core::array::from_fn(|i| (row_base + i * s) * stride);
let mut prev1v = i32x8::splat(0);
let mut next1v = i32x8::splat(0);
let mut next3v = if kmax >= 1 {
load_rows8(data, &o, 1 << sd)
} else {
i32x8::splat(0)
};
let mut prev3v: i32x8;
let mut k = 0usize;
while k <= kmax {
prev3v = prev1v;
prev1v = next1v;
next1v = next3v;
next3v = if k + 3 <= kmax {
load_rows8(data, &o, (k + 3) << sd)
} else {
i32x8::splat(0)
};
let cur = load_rows8(data, &o, k << sd);
store_rows8(
data,
&o,
k << sd,
lifting_even(cur, prev1v, next1v, prev3v, next3v),
);
k += 2;
}
if kmax >= 1 {
let mut k = 1usize;
prev1v = load_rows8(data, &o, (k - 1) << sd);
if k < kmax {
next1v = load_rows8(data, &o, (k + 1) << sd);
let cur = load_rows8(data, &o, k << sd);
store_rows8(data, &o, k << sd, predict_avg(cur, prev1v, next1v));
} else {
let cur = load_rows8(data, &o, k << sd);
store_rows8(data, &o, k << sd, cur + prev1v);
next1v = i32x8::splat(0);
}
next3v = if border >= 3 {
load_rows8(data, &o, (k + 3) << sd)
} else {
i32x8::splat(0)
};
k = 3;
while k <= border {
prev3v = prev1v;
prev1v = next1v;
next1v = next3v;
next3v = load_rows8(data, &o, (k + 3) << sd);
let cur = load_rows8(data, &o, k << sd);
store_rows8(
data,
&o,
k << sd,
predict_inner(cur, prev1v, next1v, prev3v, next3v),
);
k += 2;
}
while k <= kmax {
prev1v = next1v;
next1v = next3v;
next3v = i32x8::splat(0);
let cur = load_rows8(data, &o, k << sd);
if k < kmax {
store_rows8(data, &o, k << sd, predict_avg(cur, prev1v, next1v));
} else {
store_rows8(data, &o, k << sd, cur + prev1v);
}
k += 2;
}
}
}
let scalar_start = simd_rows;
for row in (scalar_start..height).step_by(s) {
let off = row * stride;
let mut prev1: i32 = 0;
let mut next1: i32 = 0;
let mut next3: i32 = if kmax >= 1 {
data[off + (1 << sd)] as i32
} else {
0
};
let mut prev3: i32;
let mut k = 0usize;
while k <= kmax {
prev3 = prev1;
prev1 = next1;
next1 = next3;
next3 = if k + 3 <= kmax {
data[off + ((k + 3) << sd)] as i32
} else {
0
};
let a = prev1 + next1;
let c = prev3 + next3;
let idx = off + (k << sd);
data[idx] = (data[idx] as i32 - (((a << 3) + a - c + 16) >> 5)) as i16;
k += 2;
}
if kmax >= 1 {
let mut k = 1usize;
prev1 = data[off + ((k - 1) << sd)] as i32;
if k < kmax {
next1 = data[off + ((k + 1) << sd)] as i32;
let idx = off + (k << sd);
data[idx] = (data[idx] as i32 + ((prev1 + next1 + 1) >> 1)) as i16;
} else {
let idx = off + (k << sd);
data[idx] = (data[idx] as i32 + prev1) as i16;
}
next3 = if border >= 3 {
data[off + ((k + 3) << sd)] as i32
} else {
0
};
k = 3;
while k <= border {
prev3 = prev1;
prev1 = next1;
next1 = next3;
next3 = data[off + ((k + 3) << sd)] as i32;
let a = prev1 + next1;
let idx = off + (k << sd);
data[idx] = (data[idx] as i32 + (((a << 3) + a - (prev3 + next3) + 8) >> 4)) as i16;
k += 2;
}
while k <= kmax {
prev1 = next1;
next1 = next3;
next3 = 0;
let idx = off + (k << sd);
if k < kmax {
data[idx] = (data[idx] as i32 + ((prev1 + next1 + 1) >> 1)) as i16;
} else {
data[idx] = (data[idx] as i32 + prev1) as i16;
}
k += 2;
}
}
}
}
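// Full inverse wavelet transform: alternate a vertical (column) pass with
// the horizontal row pass at scales 16, 8, 4, 2, 1, stopping at
// `subsample`. The column pass streams three reusable row buffers
// (st0 = p3, st1 = p1, st2 = n1) so each coefficient row is loaded only
// once per lifting step.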
fn inverse_wavelet_transform(plane: &mut FlatPlane, width: usize, height: usize, subsample: usize) {
inverse_wavelet_transform_from(plane, width, height, subsample, 16);
}
fn inverse_wavelet_transform_from(
plane: &mut FlatPlane,
width: usize,
height: usize,
subsample: usize,
start_scale: usize,
) {
let stride = plane.stride;
let data = plane.data.as_mut_slice();
let mut s = start_scale;
let mut s_degree: u32 = start_scale.trailing_zeros();
let mut st0 = vec![0i32; width];
let mut st1 = vec![0i32; width];
let mut st2 = vec![0i32; width];
while s >= subsample {
let sd = s_degree as usize;
let use_simd = s <= 4;
{
let kmax = (height - 1) >> sd;
let border = kmax.saturating_sub(3);
let num_cols = width.div_ceil(s);
let simd_cols = if use_simd { num_cols / 8 * 8 } else { 0 };
for v in &mut st0[..num_cols] {
*v = 0;
}
for v in &mut st1[..num_cols] {
*v = 0;
}
if kmax >= 1 {
let off = (1 << sd) * stride;
if use_simd {
for ci in (0..simd_cols).step_by(8) {
store8_i32(&mut st2, ci, load8s(data, off + ci * s, s));
}
for ci in simd_cols..num_cols {
st2[ci] = data[off + ci * s] as i32;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
st2[ci] = data[off + col] as i32;
}
}
} else {
for v in &mut st2[..num_cols] {
*v = 0;
}
}
let mut k = 0usize;
while k + 3 <= kmax {
let k_off = (k << sd) * stride;
let n3_off = ((k + 3) << sd) * stride;
if use_simd {
let mut ci = 0usize;
while ci < simd_cols {
let vp3 = load8_i32(&st0, ci);
let vp1 = load8_i32(&st1, ci);
let vn1 = load8_i32(&st2, ci);
let vn3 = load8s(data, n3_off + ci * s, s);
let cur = load8s(data, k_off + ci * s, s);
store8s(
data,
k_off + ci * s,
s,
lifting_even(cur, vp1, vn1, vp3, vn3),
);
store8_i32(&mut st0, ci, vp1);
store8_i32(&mut st1, ci, vn1);
store8_i32(&mut st2, ci, vn3);
ci += 8;
}
while ci < num_cols {
let p3 = st0[ci];
let p1 = st1[ci];
let n1 = st2[ci];
let n3 = data[n3_off + ci * s] as i32;
let a = p1 + n1;
let idx = k_off + ci * s;
data[idx] =
(data[idx] as i32 - (((a << 3) + a - (p3 + n3) + 16) >> 5)) as i16;
st0[ci] = p1;
st1[ci] = n1;
st2[ci] = n3;
ci += 1;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
let p3 = st0[ci];
let p1 = st1[ci];
let n1 = st2[ci];
let n3 = data[n3_off + col] as i32;
let a = p1 + n1;
let c = p3 + n3;
let idx = k_off + col;
data[idx] = (data[idx] as i32 - (((a << 3) + a - c + 16) >> 5)) as i16;
st0[ci] = p1;
st1[ci] = n1;
st2[ci] = n3;
}
}
k += 2;
}
while k <= kmax {
let k_off = (k << sd) * stride;
if use_simd {
let zero8 = i32x8::splat(0);
let mut ci = 0usize;
while ci < simd_cols {
let vp3 = load8_i32(&st0, ci);
let vp1 = load8_i32(&st1, ci);
let vn1 = load8_i32(&st2, ci);
let cur = load8s(data, k_off + ci * s, s);
store8s(
data,
k_off + ci * s,
s,
lifting_even(cur, vp1, vn1, vp3, zero8),
);
store8_i32(&mut st0, ci, vp1);
store8_i32(&mut st1, ci, vn1);
store8_i32(&mut st2, ci, zero8);
ci += 8;
}
while ci < num_cols {
let p3 = st0[ci];
let p1 = st1[ci];
let n1 = st2[ci];
let a = p1 + n1;
let idx = k_off + ci * s;
data[idx] = (data[idx] as i32 - (((a << 3) + a - p3 + 16) >> 5)) as i16;
st0[ci] = p1;
st1[ci] = n1;
st2[ci] = 0;
ci += 1;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
let p3 = st0[ci];
let p1 = st1[ci];
let n1 = st2[ci];
let a = p1 + n1;
let idx = k_off + col;
data[idx] = (data[idx] as i32 - (((a << 3) + a - p3 + 16) >> 5)) as i16;
st0[ci] = p1;
st1[ci] = n1;
st2[ci] = 0;
}
}
k += 2;
}
if kmax >= 1 {
let km1_off = 0;
let k_off = (1 << sd) * stride;
if 2 <= kmax {
let kp1_off = (2 << sd) * stride;
if use_simd {
let mut ci = 0usize;
while ci < simd_cols {
let vp = load8s(data, km1_off + ci * s, s);
let vn = load8s(data, kp1_off + ci * s, s);
let cur = load8s(data, k_off + ci * s, s);
store8s(data, k_off + ci * s, s, predict_avg(cur, vp, vn));
store8_i32(&mut st0, ci, vp);
store8_i32(&mut st1, ci, vn);
ci += 8;
}
while ci < num_cols {
let p = data[km1_off + ci * s] as i32;
let n = data[kp1_off + ci * s] as i32;
let idx = k_off + ci * s;
data[idx] = (data[idx] as i32 + ((p + n + 1) >> 1)) as i16;
st0[ci] = p;
st1[ci] = n;
ci += 1;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
let p = data[km1_off + col] as i32;
let n = data[kp1_off + col] as i32;
let idx = k_off + col;
data[idx] = (data[idx] as i32 + ((p + n + 1) >> 1)) as i16;
st0[ci] = p;
st1[ci] = n;
}
}
} else if use_simd {
let mut ci = 0usize;
while ci < simd_cols {
let vp = load8s(data, km1_off + ci * s, s);
let cur = load8s(data, k_off + ci * s, s);
store8s(data, k_off + ci * s, s, cur + vp);
store8_i32(&mut st0, ci, vp);
ci += 8;
}
for v in &mut st1[..num_cols] {
*v = 0;
}
while ci < num_cols {
let p = data[km1_off + ci * s] as i32;
let idx = k_off + ci * s;
data[idx] = (data[idx] as i32 + p) as i16;
st0[ci] = p;
st1[ci] = 0;
ci += 1;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
let p = data[km1_off + col] as i32;
let idx = k_off + col;
data[idx] = (data[idx] as i32 + p) as i16;
st0[ci] = p;
st1[ci] = 0;
}
}
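// Prime st2 with row 4 so the k = 3 iteration below has its k+1
// neighbour already cached.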
if border >= 3 {
let off = (4 << sd) * stride;
if use_simd {
let mut ci = 0usize;
while ci < simd_cols {
store8_i32(&mut st2, ci, load8s(data, off + ci * s, s));
ci += 8;
}
while ci < num_cols {
st2[ci] = data[off + ci * s] as i32;
ci += 1;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
st2[ci] = data[off + col] as i32;
}
}
}
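// Interior odd rows use the four-tap prediction
// x[k] += (9*(x[k-1] + x[k+1]) - (x[k-3] + x[k+3]) + 8) >> 4.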
let mut k = 3usize;
while k <= border {
let k_off = (k << sd) * stride;
let n3_off = ((k + 3) << sd) * stride;
if use_simd {
let mut ci = 0usize;
while ci < simd_cols {
let vp3 = load8_i32(&st0, ci);
let vp1 = load8_i32(&st1, ci);
let vn1 = load8_i32(&st2, ci);
let vn3 = load8s(data, n3_off + ci * s, s);
let cur = load8s(data, k_off + ci * s, s);
store8s(
data,
k_off + ci * s,
s,
predict_inner(cur, vp1, vn1, vp3, vn3),
);
store8_i32(&mut st0, ci, vp1);
store8_i32(&mut st1, ci, vn1);
store8_i32(&mut st2, ci, vn3);
ci += 8;
}
while ci < num_cols {
let p3 = st0[ci];
let p1 = st1[ci];
let n1 = st2[ci];
let n3 = data[n3_off + ci * s] as i32;
let a = p1 + n1;
let idx = k_off + ci * s;
data[idx] =
(data[idx] as i32 + (((a << 3) + a - (p3 + n3) + 8) >> 4)) as i16;
st0[ci] = p1;
st1[ci] = n1;
st2[ci] = n3;
ci += 1;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
let p3 = st0[ci];
let p1 = st1[ci];
let n1 = st2[ci];
let n3 = data[n3_off + col] as i32;
let a = p1 + n1;
let idx = k_off + col;
data[idx] =
(data[idx] as i32 + (((a << 3) + a - (p3 + n3) + 8) >> 4)) as i16;
st0[ci] = p1;
st1[ci] = n1;
st2[ci] = n3;
}
}
k += 2;
}
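// Near the bottom edge the four-tap filter no longer fits: average the
// two neighbouring even rows, or predict from x[k-1] alone at the
// final row.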
while k <= kmax {
let k_off = (k << sd) * stride;
if k < kmax {
if use_simd {
let mut ci = 0usize;
while ci < simd_cols {
let vp = load8_i32(&st1, ci);
let vn = load8_i32(&st2, ci);
let cur = load8s(data, k_off + ci * s, s);
store8s(data, k_off + ci * s, s, predict_avg(cur, vp, vn));
store8_i32(&mut st1, ci, vn);
store8_i32(&mut st2, ci, i32x8::splat(0));
ci += 8;
}
while ci < num_cols {
let p = st1[ci];
let n = st2[ci];
let idx = k_off + ci * s;
data[idx] = (data[idx] as i32 + ((p + n + 1) >> 1)) as i16;
st1[ci] = n;
st2[ci] = 0;
ci += 1;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
let p = st1[ci];
let n = st2[ci];
let idx = k_off + col;
data[idx] = (data[idx] as i32 + ((p + n + 1) >> 1)) as i16;
st1[ci] = n;
st2[ci] = 0;
}
}
} else if use_simd {
let mut ci = 0usize;
while ci < simd_cols {
let vp = load8_i32(&st1, ci);
let cur = load8s(data, k_off + ci * s, s);
store8s(data, k_off + ci * s, s, cur + vp);
store8_i32(&mut st1, ci, load8_i32(&st2, ci));
store8_i32(&mut st2, ci, i32x8::splat(0));
ci += 8;
}
while ci < num_cols {
let p = st1[ci];
let idx = k_off + ci * s;
data[idx] = (data[idx] as i32 + p) as i16;
st1[ci] = st2[ci];
st2[ci] = 0;
ci += 1;
}
} else {
for (ci, col) in (0..width).step_by(s).enumerate() {
let p = st1[ci];
let idx = k_off + col;
data[idx] = (data[idx] as i32 + p) as i16;
st1[ci] = st2[ci];
st2[ci] = 0;
}
}
k += 2;
}
}
}
row_pass_inner(data, width, height, stride, s, sd, true);
s >>= 1;
s_degree = s_degree.saturating_sub(1);
}
}
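/// Progressive decoder for IW44 wavelet images (the BG44/PM44 chunks of
/// a DjVu file). Chunks are fed in file order with [`Self::decode_chunk`];
/// the first chunk carries the header and each chunk contributes slices
/// that refine the planes. A minimal sketch, assuming `bg44_chunks` is a
/// hypothetical iterator over raw chunk payloads:
///
/// ```ignore
/// let mut img = Iw44Image::new();
/// for chunk in bg44_chunks {
///     img.decode_chunk(chunk)?; // `chunk` is a hypothetical &[u8] payload
/// }
/// let pixmap = img.to_rgb()?;
/// ```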
#[derive(Clone, Debug)]
pub struct Iw44Image {
pub width: u32,
pub height: u32,
/// False when the stream carries only a luma plane (grayscale).
is_color: bool,
/// Number of luma slices decoded before chroma slices begin.
delay: u8,
/// Chroma planes are stored at half the luma resolution.
chroma_half: bool,
y: Option<PlaneDecoder>,
cb: Option<PlaneDecoder>,
cr: Option<PlaneDecoder>,
/// Slices decoded so far, accumulated across chunks.
cslice: usize,
}
impl Default for Iw44Image {
fn default() -> Self {
Self::new()
}
}
impl Iw44Image {
pub fn new() -> Self {
Iw44Image {
width: 0,
height: 0,
is_color: false,
delay: 0,
chroma_half: false,
y: None,
cb: None,
cr: None,
cslice: 0,
}
}
#[cfg(test)]
pub fn chroma_plane_dims(&self) -> Option<(usize, usize)> {
self.cb.as_ref().map(|p| (p.width, p.height))
}
#[cfg(test)]
pub fn is_color(&self) -> bool {
self.is_color
}
#[cfg(test)]
pub fn chroma_half(&self) -> bool {
self.chroma_half
}
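/// Decodes one IW44 chunk: `data[0]` is the chunk serial number,
/// `data[1]` the slice count. Serial 0 must come first and carries a
/// 9-byte header (version, dimensions, chroma delay); later serials
/// only refine the existing planes.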
pub fn decode_chunk(&mut self, data: &[u8]) -> Result<(), Iw44Error> {
if data.len() < 2 {
return Err(Iw44Error::ChunkTooShort);
}
let serial = data[0];
let slices = data[1];
let payload_start;
if serial == 0 {
if data.len() < 9 {
return Err(Iw44Error::HeaderTooShort);
}
let majver = data[2];
let minor = data[3];
// Bit 7 of the major version marks a grayscale (luma-only) stream.
let is_grayscale = (majver >> 7) != 0;
let w = u16::from_be_bytes([data[4], data[5]]);
let h = u16::from_be_bytes([data[6], data[7]]);
// Byte 8 is meaningful from minor version 2 on: the low seven bits
// give the chroma slice delay, and a cleared top bit selects
// half-resolution chroma.
let delay_byte = data[8];
let delay = if minor >= 2 { delay_byte & 127 } else { 0 };
let chroma_half = minor >= 2 && (delay_byte & 0x80) == 0;
if w == 0 || h == 0 {
return Err(Iw44Error::ZeroDimension);
}
let pixels = w as u64 * h as u64;
if pixels > 64 * 1024 * 1024 {
return Err(Iw44Error::ImageTooLarge);
}
self.width = w as u32;
self.height = h as u32;
self.is_color = !is_grayscale;
self.delay = delay;
self.chroma_half = self.is_color && chroma_half;
self.cslice = 0;
self.y = Some(PlaneDecoder::new(w as usize, h as usize));
if self.is_color {
let (cw, ch) = if self.chroma_half {
((w as usize).div_ceil(2), (h as usize).div_ceil(2))
} else {
(w as usize, h as usize)
};
self.cb = Some(PlaneDecoder::new(cw, ch));
self.cr = Some(PlaneDecoder::new(cw, ch));
}
payload_start = 9;
} else {
if self.y.is_none() {
return Err(Iw44Error::MissingFirstChunk);
}
payload_start = 2;
}
let zp_data = &data[payload_start..];
let mut zp = ZpDecoder::new(zp_data).map_err(|_| Iw44Error::ZpTooShort)?;
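// Each slice refines the Y plane; chroma slices start only after
// `delay` luma slices have been decoded (the IW44 chroma delay).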
for _ in 0..slices {
self.cslice += 1;
if let Some(ref mut y) = self.y {
y.decode_slice(&mut zp);
}
if self.is_color && self.cslice > self.delay as usize {
if let Some(ref mut cb) = self.cb {
cb.decode_slice(&mut zp);
}
if let Some(ref mut cr) = self.cr {
cr.decode_slice(&mut zp);
}
}
if zp.is_exhausted() {
break;
}
}
Ok(())
}
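/// Renders the decoded planes to an RGB pixmap at full resolution.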
pub fn to_rgb(&self) -> Result<Pixmap, Iw44Error> {
self.to_rgb_subsample(1)
}
pub fn to_rgb_subsample(&self, subsample: u32) -> Result<Pixmap, Iw44Error> {
if subsample == 0 {
return Err(Iw44Error::InvalidSubsample);
}
let y_dec = self.y.as_ref().ok_or(Iw44Error::MissingCodec)?;
let sub = subsample as usize;
let w = (self.width as usize).div_ceil(sub) as u32;
let h = (self.height as usize).div_ceil(sub) as u32;
if self.is_color {
let chroma_sub = if self.chroma_half {
sub.div_ceil(2)
} else {
sub
};
let cb_dec = self.cb.as_ref().ok_or(Iw44Error::MissingCodec)?;
let cr_dec = self.cr.as_ref().ok_or(Iw44Error::MissingCodec)?;
#[cfg(feature = "parallel")]
let (y_plane, cb_plane, cr_plane) = {
let (y, (cb, cr)) = rayon::join(
|| y_dec.reconstruct(sub),
|| {
rayon::join(
|| cb_dec.reconstruct(chroma_sub),
|| cr_dec.reconstruct(chroma_sub),
)
},
);
(y, cb, cr)
};
#[cfg(not(feature = "parallel"))]
let (y_plane, cb_plane, cr_plane) = (
y_dec.reconstruct(sub),
cb_dec.reconstruct(chroma_sub),
cr_dec.reconstruct(chroma_sub),
);
let pw = w as usize;
let ph = h as usize;
let mut pm = Pixmap::new(w, h, 0, 0, 0, 255);
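// Three render paths: sub == 1 streams whole rows through the SIMD
// kernels; power-of-two subsampling in 2..=8 reuses the row kernels on
// the already-subsampled planes; anything else falls back to the
// per-pixel scalar loop at the bottom.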
if sub == 1 {
#[cfg(feature = "parallel")]
{
use rayon::prelude::*;
let chroma_half = self.chroma_half;
pm.data
.par_chunks_mut(pw * 4)
.enumerate()
.for_each(|(out_row, row_data)| {
let row = ph - 1 - out_row;
let y_off = row * y_plane.stride;
if chroma_half {
let c_row = row / 2;
let cb_off = c_row * cb_plane.stride;
let cr_off = c_row * cr_plane.stride;
ycbcr_row_from_i16_half(
&y_plane.data[y_off..y_off + pw],
&cb_plane.data[cb_off..],
&cr_plane.data[cr_off..],
row_data,
pw,
);
} else {
let c_off = row * cb_plane.stride;
ycbcr_row_from_i16(
&y_plane.data[y_off..y_off + pw],
&cb_plane.data[c_off..c_off + pw],
&cr_plane.data[c_off..c_off + pw],
row_data,
);
}
});
}
#[cfg(not(feature = "parallel"))]
{
for row in 0..ph {
let out_row = ph - 1 - row;
let y_off = row * y_plane.stride;
let row_start = out_row * pw * 4;
if self.chroma_half {
let c_row = row / 2;
let cb_off = c_row * cb_plane.stride;
let cr_off = c_row * cr_plane.stride;
ycbcr_row_from_i16_half(
&y_plane.data[y_off..y_off + pw],
&cb_plane.data[cb_off..],
&cr_plane.data[cr_off..],
&mut pm.data[row_start..row_start + pw * 4],
pw,
);
} else {
let c_off = row * cb_plane.stride;
ycbcr_row_from_i16(
&y_plane.data[y_off..y_off + pw],
&cb_plane.data[c_off..c_off + pw],
&cr_plane.data[c_off..c_off + pw],
&mut pm.data[row_start..row_start + pw * 4],
);
}
}
}
return Ok(pm);
}
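// For power-of-two subsampling in 2..=8 the planes were reconstructed
// at the target size, so rows can be streamed directly.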
if (2..=8).contains(&sub) && sub.is_power_of_two() {
for row in 0..ph {
let out_row = ph - 1 - row;
let y_off = row * y_plane.stride;
let c_off = row * cb_plane.stride;
let row_start = out_row * pw * 4;
ycbcr_row_from_i16(
&y_plane.data[y_off..y_off + pw],
&cb_plane.data[c_off..c_off + pw],
&cr_plane.data[c_off..c_off + pw],
&mut pm.data[row_start..row_start + pw * 4],
);
}
return Ok(pm);
}
for row in 0..h {
let out_row = h - 1 - row;
for col in 0..w {
let src_row = row as usize * sub;
let src_col = col as usize * sub;
let y_idx = src_row * y_plane.stride + src_col;
let chroma_row = if self.chroma_half {
src_row / 2
} else {
src_row
};
let chroma_col = if self.chroma_half {
src_col / 2
} else {
src_col
};
let c_idx = chroma_row * cb_plane.stride + chroma_col;
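// DjVu integer YCbCr -> RGB: t2 is 1.5*Cr, and t3 folds the +128 bias
// with the -Cb/4 term shared by the green and blue channels.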
let y = normalize(y_plane.data[y_idx]);
let b = normalize(cb_plane.data[c_idx]);
let r = normalize(cr_plane.data[c_idx]);
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
let red = (y + 128 + t2).clamp(0, 255) as u8;
let green = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
let blue = (t3 + (b << 1)).clamp(0, 255) as u8;
pm.set_rgb(col, out_row, red, green, blue);
}
}
Ok(pm)
} else {
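// Grayscale: a single luma plane, mapped to gray = 127 - normalize(v).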
let y_plane = y_dec.reconstruct(sub);
let is_compact = (2..=8).contains(&sub) && sub.is_power_of_two();
let mut pm = Pixmap::new(w, h, 0, 0, 0, 255);
for row in 0..h {
let out_row = h - 1 - row;
for col in 0..w {
let (src_row, src_col) = if is_compact {
(row as usize, col as usize)
} else {
(row as usize * sub, col as usize * sub)
};
let idx = src_row * y_plane.stride + src_col;
let val = normalize(y_plane.data[idx]);
let gray = (127 - val) as u8;
pm.set_rgb(col, out_row, gray, gray, gray);
}
}
Ok(pm)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn assets_path() -> std::path::PathBuf {
std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("references/djvujs/library/assets")
}
fn golden_path() -> std::path::PathBuf {
std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/golden/iw44")
}
fn extract_bg44_chunks(file: &crate::iff::DjvuFile) -> Vec<&[u8]> {
fn collect(chunk: &crate::iff::Chunk) -> Option<Vec<&[u8]>> {
match chunk {
crate::iff::Chunk::Form {
secondary_id,
children,
..
} => {
if secondary_id == b"DJVU" {
let v = children
.iter()
.filter_map(|c| match c {
crate::iff::Chunk::Leaf {
id: [b'B', b'G', b'4', b'4'],
data,
} => Some(data.as_slice()),
_ => None,
})
.collect::<Vec<_>>();
return Some(v);
}
for c in children {
if let Some(v) = collect(c) {
return Some(v);
}
}
None
}
_ => None,
}
}
collect(&file.root).unwrap_or_default()
}
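// A binary PPM (P6) header is three newline-terminated lines (magic,
// dimensions, maxval); pixel data starts right after the third newline.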
fn find_ppm_data_start(ppm: &[u8]) -> usize {
let mut newlines = 0;
for (i, &b) in ppm.iter().enumerate() {
if b == b'\n' {
newlines += 1;
if newlines == 3 {
return i + 1;
}
}
}
0
}
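// Writes the golden file on first run; afterwards delegates to the
// byte-for-byte comparison below.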
fn assert_or_create_golden(actual_ppm: &[u8], golden_file: &str) {
let path = golden_path().join(golden_file);
if !path.exists() {
std::fs::write(&path, actual_ppm)
.unwrap_or_else(|e| panic!("failed to write golden {golden_file}: {e}"));
return;
}
assert_ppm_match(actual_ppm, golden_file);
}
fn assert_ppm_match(actual_ppm: &[u8], golden_file: &str) {
let expected_ppm = std::fs::read(golden_path().join(golden_file))
.unwrap_or_else(|_| panic!("golden file not found: {}", golden_file));
assert_eq!(
actual_ppm.len(),
expected_ppm.len(),
"PPM size mismatch for {}: got {} expected {}",
golden_file,
actual_ppm.len(),
expected_ppm.len()
);
if actual_ppm != expected_ppm {
let header_end = find_ppm_data_start(actual_ppm);
let actual_pixels = &actual_ppm[header_end..];
let expected_pixels = &expected_ppm[header_end..];
let total_pixels = actual_pixels.len() / 3;
let diff_pixels = actual_pixels
.chunks(3)
.zip(expected_pixels.chunks(3))
.filter(|(a, b)| a != b)
.count();
panic!(
"{} pixel mismatch: {}/{} pixels differ ({:.1}%)",
golden_file,
diff_pixels,
total_pixels,
diff_pixels as f64 / total_pixels as f64 * 100.0
);
}
}
#[test]
fn iw44_new_rejects_empty_chunk() {
let mut img = Iw44Image::new();
assert!(matches!(
img.decode_chunk(&[]),
Err(Iw44Error::ChunkTooShort)
));
}
#[test]
fn iw44_new_rejects_truncated_header() {
let mut img = Iw44Image::new();
assert!(matches!(
img.decode_chunk(&[0x00, 0x01, 0x00, 0x02, 0x00]),
Err(Iw44Error::HeaderTooShort)
));
}
#[test]
fn iw44_new_rejects_zero_dimension() {
let mut img = Iw44Image::new();
let header = [0x00u8, 0x01, 0x00, 0x02, 0x00, 0x00, 0x00, 0x64, 0x00];
assert!(matches!(
img.decode_chunk(&header),
Err(Iw44Error::ZeroDimension)
));
}
#[test]
fn iw44_new_rejects_subsequent_before_first() {
let mut img = Iw44Image::new();
assert!(matches!(
img.decode_chunk(&[0x01, 0x01]),
Err(Iw44Error::MissingFirstChunk)
));
}
#[test]
fn iw44_new_to_rgb_without_data_returns_error() {
let img = Iw44Image::new();
assert!(matches!(img.to_rgb(), Err(Iw44Error::MissingCodec)));
}
#[test]
fn iw44_new_subsample_zero_rejected() {
let img = Iw44Image::new();
assert!(matches!(
img.to_rgb_subsample(0),
Err(Iw44Error::InvalidSubsample)
));
}
#[test]
fn iw44_new_decode_boy_bg() {
let data = std::fs::read(assets_path().join("boy.djvu")).expect("boy.djvu not found");
let file = crate::iff::parse(&data).expect("failed to parse boy.djvu");
let chunks = extract_bg44_chunks(&file);
assert_eq!(chunks.len(), 1, "expected 1 BG44 chunk in boy.djvu");
let mut img = Iw44Image::new();
for c in &chunks {
img.decode_chunk(c).expect("decode_chunk failed");
}
assert_eq!(img.width, 192);
assert_eq!(img.height, 256);
let pm = img.to_rgb().expect("to_rgb failed");
assert_ppm_match(&pm.to_ppm(), "boy_bg.ppm");
}
#[test]
fn iw44_new_decode_chicken_bg() {
let data =
std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu not found");
let file = crate::iff::parse(&data).expect("failed to parse chicken.djvu");
let chunks = extract_bg44_chunks(&file);
assert_eq!(chunks.len(), 3, "expected 3 BG44 chunks in chicken.djvu");
let mut img = Iw44Image::new();
for c in &chunks {
img.decode_chunk(c).expect("decode_chunk failed");
}
assert_eq!(img.width, 181);
assert_eq!(img.height, 240);
let pm = img.to_rgb().expect("to_rgb failed");
assert_ppm_match(&pm.to_ppm(), "chicken_bg.ppm");
}
#[test]
fn iw44_new_decode_boy_sub2() {
let data = std::fs::read(assets_path().join("boy.djvu")).expect("boy.djvu not found");
let file = crate::iff::parse(&data).expect("failed to parse boy.djvu");
let chunks = extract_bg44_chunks(&file);
let mut img = Iw44Image::new();
for c in &chunks {
img.decode_chunk(c).expect("decode_chunk failed");
}
assert_eq!(img.width, 192);
assert_eq!(img.height, 256);
let pm = img.to_rgb_subsample(2).expect("to_rgb_subsample(2) failed");
assert_eq!(pm.width, 96, "sub=2 width must be ceil(192/2)");
assert_eq!(pm.height, 128, "sub=2 height must be ceil(256/2)");
assert_or_create_golden(&pm.to_ppm(), "boy_bg_sub2.ppm");
}
#[test]
fn iw44_new_decode_big_scanned_sub2() {
let data = std::fs::read(assets_path().join("big-scanned-page.djvu"))
.expect("big-scanned-page.djvu not found");
let file = crate::iff::parse(&data).expect("failed to parse big-scanned-page.djvu");
let chunks = extract_bg44_chunks(&file);
let mut img = Iw44Image::new();
for c in &chunks {
img.decode_chunk(c).expect("decode_chunk failed");
}
assert_eq!(img.width, 6780);
assert_eq!(img.height, 9148);
let pm = img.to_rgb_subsample(2).expect("to_rgb_subsample(2) failed");
assert_eq!(pm.width, 3390, "sub=2 width must be ceil(6780/2)");
assert_eq!(pm.height, 4574, "sub=2 height must be ceil(9148/2)");
assert_or_create_golden(&pm.to_ppm(), "big_scanned_sub2.ppm");
}
#[test]
fn iw44_new_decode_big_scanned_sub4() {
let data = std::fs::read(assets_path().join("big-scanned-page.djvu"))
.expect("big-scanned-page.djvu not found");
let file = crate::iff::parse(&data).expect("failed to parse big-scanned-page.djvu");
let chunks = extract_bg44_chunks(&file);
assert_eq!(chunks.len(), 4, "expected 4 BG44 chunks");
let mut img = Iw44Image::new();
for c in &chunks {
img.decode_chunk(c).expect("decode_chunk failed");
}
assert_eq!(img.width, 6780);
assert_eq!(img.height, 9148);
let pm = img.to_rgb_subsample(4).expect("to_rgb_subsample failed");
assert_ppm_match(&pm.to_ppm(), "big_scanned_sub4.ppm");
}
#[test]
fn iw44_new_progressive_matches_full_decode_chicken() {
let data =
std::fs::read(assets_path().join("chicken.djvu")).expect("chicken.djvu not found");
let file = crate::iff::parse(&data).expect("failed to parse");
let chunks = extract_bg44_chunks(&file);
assert!(
chunks.len() > 1,
"need multiple chunks for progressive test"
);
let mut full = Iw44Image::new();
for c in &chunks {
full.decode_chunk(c).expect("full decode failed");
}
let full_pm = full.to_rgb().expect("full to_rgb failed");
let mut prog = Iw44Image::new();
for c in chunks.iter().take(1) {
prog.decode_chunk(c).expect("progressive decode failed");
}
for c in chunks.iter().skip(1) {
prog.decode_chunk(c).expect("progressive decode failed");
}
let prog_pm = prog.to_rgb().expect("progressive to_rgb failed");
assert_eq!(
full_pm.data, prog_pm.data,
"progressive and full decode must produce identical pixels"
);
}
#[test]
fn chroma_half_allocates_half_size_plane() {
let data = std::fs::read(assets_path().join("carte.djvu")).expect("carte.djvu not found");
let file = crate::iff::parse(&data).expect("iff parse");
let chunks = extract_bg44_chunks(&file);
assert!(!chunks.is_empty(), "carte.djvu must have BG44 chunks");
let mut img = Iw44Image::new();
img.decode_chunk(chunks[0]).expect("decode_chunk");
assert!(img.is_color(), "carte.djvu must be a color image");
assert!(img.chroma_half(), "carte.djvu must have chroma_half=true");
let (cw, ch) = img
.chroma_plane_dims()
.expect("chroma plane must be allocated after first color chunk");
let lw = img.width as usize;
let lh = img.height as usize;
let expected_w = lw.div_ceil(2);
let expected_h = lh.div_ceil(2);
assert_eq!(
cw, expected_w,
"chroma plane width must be ceil(luma_w/2)={expected_w}, got {cw}"
);
assert_eq!(
ch, expected_h,
"chroma plane height must be ceil(luma_h/2)={expected_h}, got {ch}"
);
}
#[test]
fn iw44_new_decode_carte_bg_chroma_half() {
let data = std::fs::read(assets_path().join("carte.djvu")).expect("carte.djvu not found");
let file = crate::iff::parse(&data).expect("iff parse");
let chunks = extract_bg44_chunks(&file);
let mut img = Iw44Image::new();
for c in &chunks {
img.decode_chunk(c).expect("decode_chunk failed");
}
assert_eq!(img.width, 1400);
assert_eq!(img.height, 852);
let pm = img.to_rgb().expect("to_rgb failed");
assert_ppm_match(&pm.to_ppm(), "carte_bg.ppm");
}
#[test]
fn test_decode_empty_chunk() {
let mut img = Iw44Image::new();
let result = img.decode_chunk(&[]);
assert!(result.is_err());
}
#[test]
fn test_decode_truncated_header() {
let mut img = Iw44Image::new();
let result = img.decode_chunk(&[0x00, 0x01]);
assert!(result.is_err());
}
#[test]
fn test_to_rgb_before_decode() {
let img = Iw44Image::new();
let result = img.to_rgb();
assert!(result.is_err());
}
#[test]
fn test_to_rgb_subsample_zero() {
let img = Iw44Image::new();
let result = img.to_rgb_subsample(0);
assert!(result.is_err());
}
#[test]
fn simd_ycbcr_row_matches_scalar() {
let n = 20usize;
let ys: Vec<i32> = (0..n).map(|i| (i as i32 * 7) % 200 - 100).collect();
let bs: Vec<i32> = (0..n).map(|i| (i as i32 * 13) % 200 - 100).collect();
let rs: Vec<i32> = (0..n).map(|i| (i as i32 * 17) % 200 - 100).collect();
let mut expected = vec![0u8; n * 4];
for col in 0..n {
let y = ys[col];
let b = bs[col];
let r = rs[col];
let t2 = r + (r >> 1);
let t3 = y + 128 - (b >> 2);
expected[col * 4] = (y + 128 + t2).clamp(0, 255) as u8;
expected[col * 4 + 1] = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
expected[col * 4 + 2] = (t3 + (b << 1)).clamp(0, 255) as u8;
expected[col * 4 + 3] = 255;
}
let mut actual = vec![0u8; n * 4];
super::ycbcr_row_to_rgba(&ys, &bs, &rs, &mut actual);
assert_eq!(
expected, actual,
"SIMD must produce identical output to scalar"
);
}
#[test]
fn simd_ycbcr_row_clamps_correctly() {
let n = 8usize;
let ys: Vec<i32> = vec![127, -128, 127, -128, 0, 0, 0, 0];
let bs: Vec<i32> = vec![-128, 127, -128, 127, 0, 0, 0, 0];
let rs: Vec<i32> = vec![127, -128, -128, 127, 0, 0, 0, 0];
let mut simd_out = vec![0u8; n * 4];
super::ycbcr_row_to_rgba(&ys, &bs, &rs, &mut simd_out);
for chunk in simd_out.chunks_exact(4) {
assert_eq!(chunk[3], 255, "alpha must always be 255");
}
}
#[test]
fn simd_render_matches_subsampled_render_dimensions() {
let data = std::fs::read(assets_path().join("boy.djvu")).expect("boy.djvu not found");
let file = crate::iff::parse(&data).expect("parse failed");
let chunks = extract_bg44_chunks(&file);
let mut img = Iw44Image::new();
for c in &chunks {
img.decode_chunk(c).expect("decode_chunk failed");
}
let full = img.to_rgb().expect("to_rgb failed");
let half = img.to_rgb_subsample(2).expect("subsample(2) failed");
assert_eq!(full.width, img.width);
assert_eq!(full.height, img.height);
assert_eq!(half.width, img.width.div_ceil(2));
assert_eq!(half.height, img.height.div_ceil(2));
}
#[test]
fn simd_row_pass_matches_scalar() {
let width = 32usize;
let height = 16usize;
let stride = width;
let n = stride * height;
let initial: Vec<i16> = (0..n).map(|i| ((i * 7 + 13) % 511) as i16 - 255).collect();
let mut scalar_data = initial.clone();
super::row_pass_inner(&mut scalar_data, width, height, stride, 1, 0, false);
let mut simd_data = initial.clone();
super::row_pass_inner(&mut simd_data, width, height, stride, 1, 0, true);
assert_eq!(
scalar_data, simd_data,
"SIMD row pass must produce identical output to scalar"
);
}
#[test]
fn simd_row_pass_s2_matches_scalar() {
let width = 64usize;
let height = 32usize;
let stride = width;
let n = stride * height;
let s = 2usize;
let sd = 1usize;
let initial: Vec<i16> = (0..n).map(|i| ((i * 7 + 13) % 511) as i16 - 255).collect();
let mut scalar_data = initial.clone();
super::row_pass_inner(&mut scalar_data, width, height, stride, s, sd, false);
let mut simd_data = initial.clone();
super::row_pass_inner(&mut simd_data, width, height, stride, s, sd, true);
assert_eq!(
scalar_data, simd_data,
"SIMD row pass (s=2) must produce identical output to scalar"
);
}
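// Scalar reference conversions used to cross-check the AVX2 kernels.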
#[cfg(all(target_arch = "x86_64", feature = "std"))]
fn ycbcr_raw_scalar(y: &[i16], cb: &[i16], cr: &[i16], out: &mut [u8]) {
let w = y.len();
for col in 0..w {
let yn = super::normalize(y[col]);
let bn = super::normalize(cb[col]);
let rn = super::normalize(cr[col]);
let t2 = rn + (rn >> 1);
let t3 = yn + 128 - (bn >> 2);
out[col * 4] = (yn + 128 + t2).clamp(0, 255) as u8;
out[col * 4 + 1] = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
out[col * 4 + 2] = (t3 + (bn << 1)).clamp(0, 255) as u8;
out[col * 4 + 3] = 255;
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
fn ycbcr_raw_half_scalar(y: &[i16], cb: &[i16], cr: &[i16], out: &mut [u8]) {
let w = y.len();
for col in 0..w {
let yn = super::normalize(y[col]);
let bn = super::normalize(cb[col / 2]);
let rn = super::normalize(cr[col / 2]);
let t2 = rn + (rn >> 1);
let t3 = yn + 128 - (bn >> 2);
out[col * 4] = (yn + 128 + t2).clamp(0, 255) as u8;
out[col * 4 + 1] = (t3 - (t2 >> 1)).clamp(0, 255) as u8;
out[col * 4 + 2] = (t3 + (bn << 1)).clamp(0, 255) as u8;
out[col * 4 + 3] = 255;
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[test]
fn ycbcr_avx2_raw_matches_scalar() {
if !std::is_x86_feature_detected!("avx2") {
eprintln!("skipping: AVX2 not available on this host");
return;
}
let raw_vals: [i16; 8] = [-32768, -8192, -64, -1, 0, 63, 8191, 32767];
for &width in &[1usize, 7, 16, 17, 31, 32, 33, 47, 48, 64, 100] {
let n = width;
let make_seq = |seed: usize| -> Vec<i16> {
(0..n)
.map(|i| raw_vals[(i + seed) % raw_vals.len()])
.collect()
};
let y = make_seq(0);
let cb = make_seq(3);
let cr = make_seq(5);
let mut got = vec![0u8; n * 4];
#[allow(unsafe_code)]
unsafe {
super::ycbcr_avx2_raw(y.as_ptr(), cb.as_ptr(), cr.as_ptr(), got.as_mut_ptr(), n);
}
let mut want = vec![0u8; n * 4];
ycbcr_raw_scalar(&y, &cb, &cr, &mut want);
assert_eq!(got, want, "AVX2 raw mismatch at width {}", width);
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[test]
fn load8s_s1_avx2_matches_scalar() {
if !std::is_x86_feature_detected!("avx2") {
eprintln!("skipping: AVX2 not available on this host");
return;
}
let raw_vals: [i16; 8] = [-32768, -8192, -64, -1, 0, 63, 8191, 32767];
let n = 64;
let buf: Vec<i16> = (0..n).map(|i| raw_vals[i % raw_vals.len()]).collect();
for phys_off in 0..(n - 8) {
#[allow(unsafe_code)]
let got = unsafe { super::load8s_s1_avx2(&buf, phys_off) };
let want = super::load8s(&buf, phys_off, 1);
assert_eq!(
got.to_array(),
want.to_array(),
"AVX2 load8s_s1 mismatch at phys_off {}",
phys_off
);
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[test]
fn store8s_s1_avx2_matches_scalar() {
if !std::is_x86_feature_detected!("avx2") {
eprintln!("skipping: AVX2 not available on this host");
return;
}
let raw_vals: [i32; 8] = [i32::MIN, -100_000, -32768, -1, 0, 32767, 100_000, i32::MAX];
for offset in 0..8usize {
let mut input = [0i32; 8];
for j in 0..8 {
input[j] = raw_vals[(j + offset) % 8];
}
let v = wide::i32x8::from(input);
let mut buf_avx2 = vec![0xABCDu16 as i16; 32];
#[allow(unsafe_code)]
unsafe {
super::store8s_s1_avx2(&mut buf_avx2, 8, v);
}
let mut buf_scalar = vec![0xABCDu16 as i16; 32];
for j in 0..8 {
buf_scalar[8 + j] = input[j] as i16;
}
assert_eq!(buf_avx2, buf_scalar, "AVX2 store8s_s1 mismatch");
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[test]
fn prelim_flags_bucket_avx2_matches_scalar() {
if !std::is_x86_feature_detected!("avx2") {
eprintln!("skipping: AVX2 not available on this host");
return;
}
let test_vectors: &[[i16; 16]] = &[
[0; 16],
[
1, 0, -1, 0, 100, 0, -200, 0, 0, 1234, 0, -1234, 0, 32767, -32768, 0,
],
[1; 16],
[-1; 16],
[
32767, -32768, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 32767, -32768, 1, -1,
],
];
for &coefs in test_vectors {
let mut block = [0i16; 1024];
for &base in &[0usize, 16, 32, 1008] {
block[base..base + 16].copy_from_slice(&coefs);
let mut bucket_avx2 = [0u8; 16];
#[allow(unsafe_code)]
let bstate_avx2 =
unsafe { super::prelim_flags_bucket_avx2(&block, base, &mut bucket_avx2) };
let mut bucket_scalar = [0u8; 16];
let mut bstate_scalar = 0u8;
for k in 0..16 {
let f = if block[base + k] == 0 {
super::UNK
} else {
super::ACTIVE
};
bucket_scalar[k] = f;
bstate_scalar |= f;
}
assert_eq!(
bucket_avx2, bucket_scalar,
"bucket mismatch at base={base} coefs={coefs:?}"
);
assert_eq!(
bstate_avx2, bstate_scalar,
"bstatetmp mismatch at base={base}"
);
}
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[test]
fn prelim_flags_band0_avx2_matches_scalar() {
if !std::is_x86_feature_detected!("avx2") {
eprintln!("skipping: AVX2 not available on this host");
return;
}
let old_patterns: &[[u8; 16]] = &[
[super::ZERO; 16],
[super::UNK; 16],
[super::ACTIVE; 16],
[
super::ZERO,
super::UNK,
super::ACTIVE,
super::ZERO,
super::UNK,
super::ACTIVE,
super::ZERO,
super::UNK,
super::ACTIVE,
super::ZERO,
super::UNK,
super::ACTIVE,
super::ZERO,
super::UNK,
super::ACTIVE,
super::ZERO,
],
];
let coef_patterns: &[[i16; 16]] = &[
[0; 16],
[1; 16],
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
[
-32768, 0, 32767, 0, 100, 0, -100, 0, 0, 1, 0, -1, 0, 5, 0, -5,
],
];
for &old in old_patterns {
for &coefs in coef_patterns {
let mut block = [0i16; 1024];
block[..16].copy_from_slice(&coefs);
let mut flags_avx2 = old;
#[allow(unsafe_code)]
let bstate_avx2 =
unsafe { super::prelim_flags_band0_avx2(&block, &mut flags_avx2) };
let mut flags_scalar = old;
let mut bstate_scalar = 0u8;
for k in 0..16 {
if flags_scalar[k] != super::ZERO {
flags_scalar[k] = if block[k] == 0 {
super::UNK
} else {
super::ACTIVE
};
}
bstate_scalar |= flags_scalar[k];
}
assert_eq!(
flags_avx2, flags_scalar,
"flags mismatch old={old:?} coefs={coefs:?}"
);
assert_eq!(bstate_avx2, bstate_scalar, "bstatetmp mismatch");
}
}
}
#[cfg(all(target_arch = "x86_64", feature = "std"))]
#[test]
fn ycbcr_avx2_raw_half_matches_scalar() {
if !std::is_x86_feature_detected!("avx2") {
eprintln!("skipping: AVX2 not available on this host");
return;
}
let raw_vals: [i16; 8] = [-32768, -8192, -64, -1, 0, 63, 8191, 32767];
for &width in &[2usize, 8, 16, 18, 30, 32, 34, 48, 64, 96] {
let n = width;
let half = n.div_ceil(2);
let make_seq = |seed: usize, len: usize| -> Vec<i16> {
(0..len)
.map(|i| raw_vals[(i + seed) % raw_vals.len()])
.collect()
};
let y = make_seq(0, n);
let cb_half = make_seq(3, half);
let cr_half = make_seq(5, half);
let mut got = vec![0u8; n * 4];
#[allow(unsafe_code)]
unsafe {
super::ycbcr_avx2_raw_half(
y.as_ptr(),
cb_half.as_ptr(),
cr_half.as_ptr(),
got.as_mut_ptr(),
n,
);
}
let mut want = vec![0u8; n * 4];
ycbcr_raw_half_scalar(&y, &cb_half, &cr_half, &mut want);
assert_eq!(got, want, "AVX2 raw_half mismatch at width {}", width);
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[test]
fn load8s_s1_simd128_matches_scalar() {
let raw_vals: [i16; 8] = [-32768, -8192, -64, -1, 0, 63, 8191, 32767];
let n = 64;
let buf: alloc::vec::Vec<i16> = (0..n).map(|i| raw_vals[i % raw_vals.len()]).collect();
for phys_off in 0..(n - 8) {
#[allow(unsafe_code)]
let got = unsafe { super::load8s_s1_simd128(&buf, phys_off) };
let want = super::load8s(&buf, phys_off, 1);
assert_eq!(
got.to_array(),
want.to_array(),
"simd128 load8s_s1 mismatch at phys_off {}",
phys_off
);
}
}
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[test]
fn store8s_s1_simd128_matches_scalar() {
let raw_vals: [i32; 8] = [i32::MIN, -100_000, -32768, -1, 0, 32767, 100_000, i32::MAX];
for offset in 0..8usize {
let mut input = [0i32; 8];
for j in 0..8 {
input[j] = raw_vals[(j + offset) % 8];
}
let v = wide::i32x8::from(input);
let mut buf_simd128 = alloc::vec![0xABCDu16 as i16; 32];
#[allow(unsafe_code)]
unsafe {
super::store8s_s1_simd128(&mut buf_simd128, 8, v);
}
let mut buf_scalar = alloc::vec![0xABCDu16 as i16; 32];
for j in 0..8 {
buf_scalar[8 + j] = input[j] as i16;
}
assert_eq!(buf_simd128, buf_scalar, "simd128 store8s_s1 mismatch");
}
}
}