use super::block_based_image::ExpandedBlockData;
use wide::i32x8;
const _W1: i32 = 2841; const _W2: i32 = 2676; const _W3: i32 = 2408; const _W5: i32 = 1609; const _W6: i32 = 1108; const _W7: i32 = 565;
const W3: i32 = 2408; const W6: i32 = 1108; const W7: i32 = 565;
const W1PW7: i32 = _W1 + _W7;
const W1MW7: i32 = _W1 - _W7;
const W2PW6: i32 = _W2 + _W6;
const W2MW6: i32 = _W2 - _W6;
const W3PW5: i32 = _W3 + _W5;
const W3MW5: i32 = _W3 - _W5;
const R2: i32 = 181;
#[inline(always)]
fn get_raster<const IGNORE_DC: bool>(
offset: usize,
stride: usize,
block: &ExpandedBlockData,
) -> i32x8 {
return i32x8::new([
block.get_coefficient_raster(7 * stride + offset) as i32,
block.get_coefficient_raster(6 * stride + offset) as i32,
block.get_coefficient_raster(5 * stride + offset) as i32,
block.get_coefficient_raster(4 * stride + offset) as i32,
block.get_coefficient_raster(3 * stride + offset) as i32,
block.get_coefficient_raster(2 * stride + offset) as i32,
block.get_coefficient_raster(1 * stride + offset) as i32,
if IGNORE_DC && offset == 0 {
0
} else {
block.get_coefficient_raster(offset) as i32
},
]);
}
#[inline(always)]
pub fn get_q(offset: usize, stride: usize, q: &[u16; 64]) -> i32x8 {
return i32x8::new([
q[7 * stride + offset] as i32,
q[6 * stride + offset] as i32,
q[5 * stride + offset] as i32,
q[4 * stride + offset] as i32,
q[3 * stride + offset] as i32,
q[2 * stride + offset] as i32,
q[1 * stride + offset] as i32,
q[offset] as i32,
]);
}
#[inline(always)]
fn copy_to_output(row: i32x8, offset: usize, outp: &mut [i16; 64]) {
let r = row.as_array_ref();
for i in 0..8 {
outp[i + offset] = r[i] as i16;
}
}
#[inline(always)]
fn transpose(
index: usize,
row0: i32x8,
row1: i32x8,
row2: i32x8,
row3: i32x8,
row4: i32x8,
row5: i32x8,
row6: i32x8,
row7: i32x8,
) -> i32x8 {
return i32x8::new([
row0.as_array_ref()[7 - index],
row1.as_array_ref()[7 - index],
row2.as_array_ref()[7 - index],
row3.as_array_ref()[7 - index],
row4.as_array_ref()[7 - index],
row5.as_array_ref()[7 - index],
row6.as_array_ref()[7 - index],
row7.as_array_ref()[7 - index],
]);
}
#[inline(never)]
pub fn run_idct<const IGNORE_DC: bool>(
block: &ExpandedBlockData,
q: &[u16; 64],
outp: &mut [i16; 64],
) {
let mut xv0 = get_raster::<IGNORE_DC>(0, 8, block);
let mut xv1 = get_raster::<IGNORE_DC>(4, 8, block);
let mut xv2 = get_raster::<IGNORE_DC>(6, 8, block);
let mut xv3 = get_raster::<IGNORE_DC>(2, 8, block);
let mut xv4 = get_raster::<IGNORE_DC>(1, 8, block);
let mut xv5 = get_raster::<IGNORE_DC>(7, 8, block);
let mut xv6 = get_raster::<IGNORE_DC>(5, 8, block);
let mut xv7 = get_raster::<IGNORE_DC>(3, 8, block);
xv0 = ((xv0 * get_q(0, 8, q)) << 11) + 128;
xv1 = (xv1 * get_q(4, 8, q)) << 11;
xv2 *= get_q(6, 8, q);
xv3 *= get_q(2, 8, q);
xv4 *= get_q(1, 8, q);
xv5 *= get_q(7, 8, q);
xv6 *= get_q(5, 8, q);
xv7 *= get_q(3, 8, q);
let mut xv8 = _W7 * (xv4 + xv5);
xv4 = xv8 + (W1MW7 * xv4);
xv5 = xv8 - (W1PW7 * xv5);
xv8 = _W3 * (xv6 + xv7);
xv6 = xv8 - (W3MW5 * xv6);
xv7 = xv8 - (W3PW5 * xv7);
xv8 = xv0 + xv1;
xv0 -= xv1;
xv1 = W6 * (xv3 + xv2);
xv2 = xv1 - (W2PW6 * xv2);
xv3 = xv1 + (W2MW6 * xv3);
xv1 = xv4 + xv6;
xv4 -= xv6;
xv6 = xv5 + xv7;
xv5 -= xv7;
xv7 = xv8 + xv3;
xv8 -= xv3;
xv3 = xv0 + xv2;
xv0 -= xv2;
xv2 = ((R2 * (xv4 + xv5)) + 128) >> 8;
xv4 = ((R2 * (xv4 - xv5)) + 128) >> 8;
let row0: i32x8 = (xv7 + xv1) >> 8;
let row1: i32x8 = (xv3 + xv2) >> 8;
let row2: i32x8 = (xv0 + xv4) >> 8;
let row3: i32x8 = (xv8 + xv6) >> 8;
let row4: i32x8 = (xv8 - xv6) >> 8;
let row5: i32x8 = (xv0 - xv4) >> 8;
let row6: i32x8 = (xv3 - xv2) >> 8;
let row7: i32x8 = (xv7 - xv1) >> 8;
let mut yv0 = transpose(0, row0, row1, row2, row3, row4, row5, row6, row7);
let mut yv1 = transpose(4, row0, row1, row2, row3, row4, row5, row6, row7);
let mut yv2 = transpose(6, row0, row1, row2, row3, row4, row5, row6, row7);
let mut yv3 = transpose(2, row0, row1, row2, row3, row4, row5, row6, row7);
let mut yv4 = transpose(1, row0, row1, row2, row3, row4, row5, row6, row7);
let mut yv5 = transpose(7, row0, row1, row2, row3, row4, row5, row6, row7);
let mut yv6 = transpose(5, row0, row1, row2, row3, row4, row5, row6, row7);
let mut yv7 = transpose(3, row0, row1, row2, row3, row4, row5, row6, row7);
yv0 = (yv0 << 8) + 8192;
yv1 = yv1 << 8;
let mut yv8 = ((W7) * (yv4 + yv5)) + (4);
yv4 = (yv8 + ((W1MW7) * yv4)) >> 3;
yv5 = (yv8 - ((W1PW7) * yv5)) >> 3;
yv8 = ((W3) * (yv6 + yv7)) + (4);
yv6 = (yv8 - ((W3MW5) * yv6)) >> 3;
yv7 = (yv8 - ((W3PW5) * yv7)) >> 3;
yv8 = yv0 + yv1;
yv0 -= yv1;
yv1 = ((W6) * (yv3 + yv2)) + (4);
yv2 = (yv1 - ((W2PW6) * yv2)) >> 3;
yv3 = (yv1 + ((W2MW6) * yv3)) >> 3;
yv1 = yv4 + yv6;
yv4 -= yv6;
yv6 = yv5 + yv7;
yv5 -= yv7;
yv7 = yv8 + yv3;
yv8 -= yv3;
yv3 = yv0 + yv2;
yv0 -= yv2;
yv2 = (((R2) * (yv4 + yv5)) + (128)) >> 8;
yv4 = (((R2) * (yv4 - yv5)) + (128)) >> 8;
copy_to_output((yv7 + yv1) >> 11, 0, outp);
copy_to_output((yv3 + yv2) >> 11, 1 * 8, outp);
copy_to_output((yv0 + yv4) >> 11, 2 * 8, outp);
copy_to_output((yv8 + yv6) >> 11, 3 * 8, outp);
copy_to_output((yv8 - yv6) >> 11, 4 * 8, outp);
copy_to_output((yv0 - yv4) >> 11, 5 * 8, outp);
copy_to_output((yv3 - yv2) >> 11, 6 * 8, outp);
copy_to_output((yv7 - yv1) >> 11, 7 * 8, outp);
#[cfg(feature = "verify_old_dct")]
debug_assert!(verify_behavior(block, q, outp));
}
#[cfg(feature = "verify_old_dct")]
use std::num::Wrapping;
#[cfg(feature = "verify_old_dct")]
fn verify_behavior(block: &ExpandedBlockData, q: &[u16; 64], new_outp: &[i16; 64]) -> bool {
let mut outp_orig = [0; 64];
run_idct_old(block, q, &mut outp_orig, true);
return outp_orig[..] == new_outp[..];
}
#[cfg(feature = "verify_old_dct")]
fn mul(a: i16, b: u16) -> Wrapping<i32> {
return Wrapping(a as i32) * Wrapping(b as i32);
}
#[cfg(feature = "verify_old_dct")]
pub fn run_idct_old(
block: &ExpandedBlockData,
q: &[u16; 64],
outp: &mut [i16; 64],
ignore_dc: bool,
) {
let mut intermed = [Wrapping(0i32); 64];
for y in 0..8 {
let y8: usize = y * 8;
let mut x0 = if ignore_dc && y == 0 {
Wrapping(0)
} else {
mul(block.get_coefficient_raster(y8 + 0), q[y8 + 0]) << 11
} + Wrapping(128);
let mut x1 = mul(block.get_coefficient_raster(y8 + 4), q[y8 + 4]) << 11;
let mut x2 = mul(block.get_coefficient_raster(y8 + 6), q[y8 + 6]);
let mut x3 = mul(block.get_coefficient_raster(y8 + 2), q[y8 + 2]);
let mut x4 = mul(block.get_coefficient_raster(y8 + 1), q[y8 + 1]);
let mut x5 = mul(block.get_coefficient_raster(y8 + 7), q[y8 + 7]);
let mut x6 = mul(block.get_coefficient_raster(y8 + 5), q[y8 + 5]);
let mut x7 = mul(block.get_coefficient_raster(y8 + 3), q[y8 + 3]);
if x1 == Wrapping(0)
&& x2 == Wrapping(0)
&& x3 == Wrapping(0)
&& x4 == Wrapping(0)
&& x5 == Wrapping(0)
&& x6 == Wrapping(0)
&& x7 == Wrapping(0)
{
let dc = (x0 - Wrapping(128)) >> 8;
intermed[y8 + 0] = dc;
intermed[y8 + 1] = dc;
intermed[y8 + 2] = dc;
intermed[y8 + 3] = dc;
intermed[y8 + 4] = dc;
intermed[y8 + 5] = dc;
intermed[y8 + 6] = dc;
intermed[y8 + 7] = dc;
continue;
}
let mut x8 = Wrapping(W7) * (x4 + x5);
x4 = x8 + (Wrapping(W1MW7) * x4);
x5 = x8 - (Wrapping(W1PW7) * x5);
x8 = Wrapping(W3) * (x6 + x7);
x6 = x8 - (Wrapping(W3MW5) * x6);
x7 = x8 - (Wrapping(W3PW5) * x7);
x8 = x0 + x1;
x0 -= x1;
x1 = Wrapping(W6) * (x3 + x2);
x2 = x1 - (Wrapping(W2PW6) * x2);
x3 = x1 + (Wrapping(W2MW6) * x3);
x1 = x4 + x6;
x4 -= x6;
x6 = x5 + x7;
x5 -= x7;
x7 = x8 + x3;
x8 -= x3;
x3 = x0 + x2;
x0 -= x2;
x2 = ((Wrapping(R2) * (x4 + x5)) + Wrapping(128)) >> 8;
x4 = ((Wrapping(R2) * (x4 - x5)) + Wrapping(128)) >> 8;
intermed[y8 + 0] = (x7 + x1) >> 8;
intermed[y8 + 1] = (x3 + x2) >> 8;
intermed[y8 + 2] = (x0 + x4) >> 8;
intermed[y8 + 3] = (x8 + x6) >> 8;
intermed[y8 + 4] = (x8 - x6) >> 8;
intermed[y8 + 5] = (x0 - x4) >> 8;
intermed[y8 + 6] = (x3 - x2) >> 8;
intermed[y8 + 7] = (x7 - x1) >> 8;
}
for x in 0..8 {
let mut y0 = (intermed[(8 * 0) + x] << 8) + Wrapping(8192);
let mut y1 = intermed[(8 * 4) + x] << 8;
let mut y2 = intermed[(8 * 6) + x];
let mut y3 = intermed[(8 * 2) + x];
let mut y4 = intermed[(8 * 1) + x];
let mut y5 = intermed[(8 * 7) + x];
let mut y6 = intermed[(8 * 5) + x];
let mut y7 = intermed[(8 * 3) + x];
let mut y8 = (Wrapping(W7) * (y4 + y5)) + Wrapping(4);
y4 = (y8 + (Wrapping(W1MW7) * y4)) >> 3;
y5 = (y8 - (Wrapping(W1PW7) * y5)) >> 3;
y8 = (Wrapping(W3) * (y6 + y7)) + Wrapping(4);
y6 = (y8 - (Wrapping(W3MW5) * y6)) >> 3;
y7 = (y8 - (Wrapping(W3PW5) * y7)) >> 3;
y8 = y0 + y1;
y0 -= y1;
y1 = (Wrapping(W6) * (y3 + y2)) + Wrapping(4);
y2 = (y1 - (Wrapping(W2PW6) * y2)) >> 3;
y3 = (y1 + (Wrapping(W2MW6) * y3)) >> 3;
y1 = y4 + y6;
y4 -= y6;
y6 = y5 + y7;
y5 -= y7;
y7 = y8 + y3;
y8 -= y3;
y3 = y0 + y2;
y0 -= y2;
y2 = ((Wrapping(R2) * (y4 + y5)) + Wrapping(128)) >> 8;
y4 = ((Wrapping(R2) * (y4 - y5)) + Wrapping(128)) >> 8;
outp[(8 * 0) + x] = ((y7 + y1) >> 11).0 as i16;
outp[(8 * 1) + x] = ((y3 + y2) >> 11).0 as i16;
outp[(8 * 2) + x] = ((y0 + y4) >> 11).0 as i16;
outp[(8 * 3) + x] = ((y8 + y6) >> 11).0 as i16;
outp[(8 * 4) + x] = ((y8 - y6) >> 11).0 as i16;
outp[(8 * 5) + x] = ((y0 - y4) >> 11).0 as i16;
outp[(8 * 6) + x] = ((y3 - y2) >> 11).0 as i16;
outp[(8 * 7) + x] = ((y7 - y1) >> 11).0 as i16;
}
}