#![allow(clippy::needless_range_loop)]
#![allow(dead_code)]
use archmage::prelude::*;
#[cfg(target_arch = "x86")]
use archmage::intrinsics::x86 as simd_mem;
#[cfg(target_arch = "x86_64")]
use archmage::intrinsics::x86_64 as simd_mem;
use super::tables::{MAX_LEVEL, VP8_FREQ_SHARPENING};
/// Borrow a 16-element array as four non-overlapping `&[T; 4]` quarters.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn split4_ref<T>(arr: &[T; 16]) -> (&[T; 4], &[T; 4], &[T; 4], &[T; 4]) {
    // Split 16 -> 8 + 8, then each half -> 4 + 4. All sizes are statically
    // known, so every `try_into` is infallible.
    let (lo, hi) = arr.split_at(8);
    let (a, b) = lo.split_at(4);
    let (c, d) = hi.split_at(4);
    (
        a.try_into().unwrap(),
        b.try_into().unwrap(),
        c.try_into().unwrap(),
        d.try_into().unwrap(),
    )
}
/// Borrow a 16-element array as four non-overlapping `&mut [T; 4]` quarters.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn split4_mut<T>(arr: &mut [T; 16]) -> (&mut [T; 4], &mut [T; 4], &mut [T; 4], &mut [T; 4]) {
    // Two levels of `split_at_mut` keep all four borrows disjoint; the
    // conversions are infallible because the lengths are fixed.
    let (lo, hi) = arr.split_at_mut(8);
    let (a, b) = lo.split_at_mut(4);
    let (c, d) = hi.split_at_mut(4);
    (
        a.try_into().unwrap(),
        b.try_into().unwrap(),
        c.try_into().unwrap(),
        d.try_into().unwrap(),
    )
}
/// Borrow a 16-element array as two non-overlapping `&[T; 8]` halves.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn split2_ref<T>(arr: &[T; 16]) -> (&[T; 8], &[T; 8]) {
    let (lo, hi) = arr.split_at(8);
    (lo.try_into().unwrap(), hi.try_into().unwrap())
}
/// Fixed-point precision (Q17) used throughout the quantizer.
pub const QFIX: u32 = 17;

/// Scale a bias value `b` (in 1/256 units) into Q17 fixed point,
/// rounding to nearest.
#[inline]
pub const fn quantization_bias(b: u32) -> u32 {
    ((b << QFIX) + 128) >> 8
}

/// Fixed-point "division": `(coeff * iq + bias) >> QFIX`, computed in 64 bits
/// so the intermediate product cannot overflow.
#[inline]
pub fn quantdiv(coeff: u32, iq: u32, bias: u32) -> i32 {
    let acc = u64::from(coeff) * u64::from(iq) + u64::from(bias);
    (acc >> QFIX) as i32
}
/// Per-plane quantization matrix, 16 entries in coefficient order
/// (index 0 = DC, indices 1..16 = AC).
#[derive(Clone, Debug)]
pub struct VP8Matrix {
    // Quantizer steps (used for dequantization).
    pub q: [u16; 16],
    // Q17 fixed-point reciprocals of `q` (used for quantization).
    pub iq: [u32; 16],
    // Rounding bias added before the Q17 shift.
    pub bias: [u32; 16],
    // Largest |coefficient| that still quantizes to level 0.
    pub zthresh: [u32; 16],
    // Frequency sharpening offsets; nonzero only for Y1 matrices (see `new`).
    pub sharpen: [u16; 16],
}
impl VP8Matrix {
    /// Build a matrix from DC/AC quantizer steps for the given plane type.
    ///
    /// Derives `iq` (Q17 reciprocal), `bias`, and `zthresh` for the DC step
    /// (index 0) and the AC step (indices 1..16); Y1 matrices additionally
    /// get frequency-sharpening offsets from `VP8_FREQ_SHARPENING`.
    pub fn new(q_dc: u16, q_ac: u16, matrix_type: MatrixType) -> Self {
        // (DC, AC) rounding biases, in 1/256 units, per plane type.
        let bias_values = match matrix_type {
            MatrixType::Y1 => (96, 110),
            MatrixType::Y2 => (96, 108),
            MatrixType::UV => (110, 115),
        };
        let mut m = Self {
            q: [0; 16],
            iq: [0; 16],
            bias: [0; 16],
            zthresh: [0; 16],
            sharpen: [0; 16],
        };
        m.q[0] = q_dc;
        m.q[1] = q_ac;
        // Derive fixed-point helpers for the two distinct steps (DC at 0, AC at 1).
        for i in 0..2 {
            let is_ac = i > 0;
            let bias = if is_ac { bias_values.1 } else { bias_values.0 };
            m.iq[i] = ((1u64 << QFIX) / m.q[i] as u64) as u32;
            m.bias[i] = quantization_bias(bias);
            // Largest |coeff| for which (coeff * iq + bias) >> QFIX is still 0.
            m.zthresh[i] = ((1 << QFIX) - 1 - m.bias[i]) / m.iq[i];
        }
        // All remaining AC positions share the index-1 values.
        for i in 2..16 {
            m.q[i] = m.q[1];
            m.iq[i] = m.iq[1];
            m.bias[i] = m.bias[1];
            m.zthresh[i] = m.zthresh[1];
        }
        // Only Y1 uses sharpening; other types keep an all-zero table.
        if matches!(matrix_type, MatrixType::Y1) {
            const SHARPEN_BITS: u32 = 11;
            for (i, &freq_sharpen) in VP8_FREQ_SHARPENING.iter().enumerate() {
                m.sharpen[i] = ((freq_sharpen as u32 * m.q[i] as u32) >> SHARPEN_BITS) as u16;
            }
        }
        m
    }
    /// Rounded average of the 16 quantizer steps.
    pub fn average_q(&self) -> u32 {
        let sum: u32 = self.q.iter().map(|&x| x as u32).sum();
        (sum + 8) >> 4
    }
    /// Quantize a single coefficient at position `pos`: sharpen the
    /// magnitude, short-circuit to 0 below `zthresh`, then divide in fixed
    /// point and clamp to `MAX_LEVEL`, restoring the sign at the end.
    #[inline]
    pub fn quantize_coeff(&self, coeff: i32, pos: usize) -> i32 {
        let sign = coeff < 0;
        let abs_coeff = (if sign { -coeff } else { coeff } as u32) + self.sharpen[pos] as u32;
        if abs_coeff <= self.zthresh[pos] {
            return 0;
        }
        let level = quantdiv(abs_coeff, self.iq[pos], self.bias[pos]).min(MAX_LEVEL as i32);
        if sign { -level } else { level }
    }
    /// Quantize with a zero rounding bias and no sharpening, zero-threshold
    /// shortcut, or level clamp.
    #[inline]
    pub fn quantize_neutral(&self, coeff: i32, pos: usize) -> i32 {
        let sign = coeff < 0;
        let abs_coeff = if sign { -coeff } else { coeff } as u32;
        let neutral_bias = quantization_bias(0x00);
        let level = quantdiv(abs_coeff, self.iq[pos], neutral_bias);
        if sign { -level } else { level }
    }
    /// Reconstruct a coefficient from its quantized level.
    #[inline]
    pub fn dequantize(&self, level: i32, pos: usize) -> i32 {
        level * self.q[pos] as i32
    }
    /// Quantize all 16 coefficients in place (scalar reference path;
    /// same per-coefficient logic as `quantize_coeff`).
    #[inline]
    pub fn quantize(&self, coeffs: &mut [i32; 16]) {
        for (pos, coeff) in coeffs.iter_mut().enumerate() {
            let sign = *coeff < 0;
            let abs_coeff = (if sign { -*coeff } else { *coeff } as u32) + self.sharpen[pos] as u32;
            if abs_coeff <= self.zthresh[pos] {
                *coeff = 0;
                continue;
            }
            let level = quantdiv(abs_coeff, self.iq[pos], self.bias[pos]).min(MAX_LEVEL as i32);
            *coeff = if sign { -level } else { level };
        }
    }
    /// Quantize only the AC coefficients (positions 1..16); DC is untouched.
    #[allow(clippy::needless_range_loop)]
    pub fn quantize_ac_only(&self, coeffs: &mut [i32; 16]) {
        for pos in 1..16 {
            let sign = coeffs[pos] < 0;
            let abs_coeff =
                (if sign { -coeffs[pos] } else { coeffs[pos] } as u32) + self.sharpen[pos] as u32;
            if abs_coeff <= self.zthresh[pos] {
                coeffs[pos] = 0;
                continue;
            }
            let level = quantdiv(abs_coeff, self.iq[pos], self.bias[pos]).min(MAX_LEVEL as i32);
            coeffs[pos] = if sign { -level } else { level };
        }
    }
    /// Dequantize all 16 coefficients in place, dispatching to the best
    /// available implementation via `incant!` (SSE2 / NEON / wasm128 / scalar).
    #[inline(always)]
    pub fn dequantize_block(&self, coeffs: &mut [i32; 16]) {
        incant!(
            dequantize_block_dispatch(&self.q, coeffs),
            [v3, neon, wasm128, scalar]
        );
    }
    /// Dequantize only the AC coefficients; DC is untouched.
    #[allow(clippy::needless_range_loop)]
    pub fn dequantize_ac_only(&self, coeffs: &mut [i32; 16]) {
        for pos in 1..16 {
            coeffs[pos] *= self.q[pos] as i32;
        }
    }
}
// `incant!` target: on x86-64, route dequantization to the SSE2 kernel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn dequantize_block_dispatch_v3(_token: X64V3Token, q: &[u16; 16], coeffs: &mut [i32; 16]) {
    dequantize_block_sse2(_token, q, coeffs);
}
// `incant!` target: NEON dequantization kernel.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn dequantize_block_dispatch_neon(token: NeonToken, q: &[u16; 16], coeffs: &mut [i32; 16]) {
    crate::common::simd_neon::dequantize_block_neon(token, q, coeffs);
}
// `incant!` target: wasm SIMD128 dequantization kernel.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn dequantize_block_dispatch_wasm128(token: Wasm128Token, q: &[u16; 16], coeffs: &mut [i32; 16]) {
    crate::common::simd_wasm::dequantize_block_wasm_entry(token, q, coeffs);
}
// `incant!` fallback: portable scalar dequantization (`coeff *= q`).
#[inline(always)]
fn dequantize_block_dispatch_scalar(_token: ScalarToken, q: &[u16; 16], coeffs: &mut [i32; 16]) {
    for (pos, coeff) in coeffs.iter_mut().enumerate() {
        *coeff *= q[pos] as i32;
    }
}
/// SSE2 kernel: dequantize a 16-coefficient block in place
/// (`coeffs[i] *= q[i]`, keeping the low 32 bits of each product).
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn dequantize_block_sse2(_token: X64V3Token, q: &[u16; 16], coeffs: &mut [i32; 16]) {
    let (q_lo_arr, q_hi_arr) = split2_ref(q);
    let q_lo = simd_mem::_mm_loadu_si128(q_lo_arr);
    let q_hi = simd_mem::_mm_loadu_si128(q_hi_arr);
    let zero = _mm_setzero_si128();
    // Zero-extend the eight u16 steps of each half into four vectors of
    // four u32 lanes so they line up with the i32 coefficients.
    let q0_32 = _mm_unpacklo_epi16(q_lo, zero);
    let q1_32 = _mm_unpackhi_epi16(q_lo, zero);
    let q2_32 = _mm_unpacklo_epi16(q_hi, zero);
    let q3_32 = _mm_unpackhi_epi16(q_hi, zero);
    let (c0_arr, c1_arr, c2_arr, c3_arr) = split4_ref(coeffs);
    let c0 = simd_mem::_mm_loadu_si128(c0_arr);
    let c1 = simd_mem::_mm_loadu_si128(c1_arr);
    let c2 = simd_mem::_mm_loadu_si128(c2_arr);
    let c3 = simd_mem::_mm_loadu_si128(c3_arr);
    // SSE2 has no `_mm_mullo_epi32` (that is SSE4.1), so emulate a lane-wise
    // 32-bit multiply: 32x32->64 multiplies of the even lanes and of the odd
    // lanes (0xF5 shuffles odd lanes into even positions), then gather the
    // low 32 bits of each product (0x08) and re-interleave.
    macro_rules! mul_epi32_sse2 {
        ($a:expr, $b:expr) => {{
            let even = _mm_mul_epu32($a, $b);
            let a_odd = _mm_shuffle_epi32($a, 0xF5);
            let b_odd = _mm_shuffle_epi32($b, 0xF5);
            let odd = _mm_mul_epu32(a_odd, b_odd);
            let even_lo = _mm_shuffle_epi32(even, 0x08);
            let odd_lo = _mm_shuffle_epi32(odd, 0x08);
            _mm_unpacklo_epi32(even_lo, odd_lo)
        }};
    }
    let r0 = mul_epi32_sse2!(c0, q0_32);
    let r1 = mul_epi32_sse2!(c1, q1_32);
    let r2 = mul_epi32_sse2!(c2, q2_32);
    let r3 = mul_epi32_sse2!(c3, q3_32);
    let (s0, s1, s2, s3) = split4_mut(coeffs);
    simd_mem::_mm_storeu_si128(s0, r0);
    simd_mem::_mm_storeu_si128(s1, r1);
    simd_mem::_mm_storeu_si128(s2, r2);
    simd_mem::_mm_storeu_si128(s3, r3);
}
/// Plane type selecting the bias pair used by [`VP8Matrix::new`];
/// only `Y1` enables the sharpening table.
#[derive(Clone, Copy, Debug)]
pub enum MatrixType {
    Y1,
    Y2,
    UV,
}
/// Quantize a 16-coefficient block in place and return `true` if any
/// resulting level is nonzero. Dispatches via `incant!` to the best
/// available implementation (SSE2 / NEON / wasm128 / scalar).
pub fn quantize_block_simd(coeffs: &mut [i32; 16], matrix: &VP8Matrix, use_sharpen: bool) -> bool {
    incant!(
        quantize_block_dispatch(coeffs, matrix, use_sharpen),
        [v3, neon, wasm128, scalar]
    )
}
// `incant!` target: on x86-64, route quantization to the SSE2 kernel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn quantize_block_dispatch_v3(
    _token: X64V3Token,
    coeffs: &mut [i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
) -> bool {
    quantize_block_sse2(_token, coeffs, matrix, use_sharpen)
}
// `incant!` target: NEON quantization kernel.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn quantize_block_dispatch_neon(
    token: NeonToken,
    coeffs: &mut [i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
) -> bool {
    crate::common::simd_neon::quantize_block_neon(token, coeffs, matrix, use_sharpen)
}
// `incant!` target: wasm SIMD128 quantization kernel.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn quantize_block_dispatch_wasm128(
    token: Wasm128Token,
    coeffs: &mut [i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
) -> bool {
    crate::common::simd_wasm::quantize_block_wasm_entry(token, coeffs, matrix, use_sharpen)
}
// `incant!` fallback: scalar quantization via `VP8Matrix::quantize`.
//
// NOTE(review): `_use_sharpen` is ignored here — `VP8Matrix::quantize`
// always adds the stored `sharpen` table (all-zero except for Y1 matrices),
// whereas the SIMD paths skip the add when `use_sharpen` is false. For a Y1
// matrix with `use_sharpen == false` this fallback can therefore differ from
// the SIMD paths; confirm callers always pass `true` for Y1.
#[inline(always)]
fn quantize_block_dispatch_scalar(
    _token: ScalarToken,
    coeffs: &mut [i32; 16],
    matrix: &VP8Matrix,
    _use_sharpen: bool,
) -> bool {
    matrix.quantize(coeffs);
    coeffs.iter().any(|&c| c != 0)
}
/// SSE2 kernel for whole-block quantization; `VP8Matrix::quantize` is the
/// scalar reference.
///
/// Differences from the scalar path worth noting:
/// - input coefficients are saturated to i16 by `_mm_packs_epi32`
///   (assumes the DCT output fits in 16 bits — TODO confirm);
/// - `zthresh` is not consulted: by construction of `zthresh`, the
///   bias-and-shift below already yields level 0 for those magnitudes;
/// - `iq` values are truncated to 16 bits per lane, with the full unsigned
///   16x16->32 product reassembled via `_mm_mulhi_epu16`/`_mm_mullo_epi16`.
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn quantize_block_sse2(
    _token: X64V3Token,
    coeffs: &mut [i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
) -> bool {
    let max_coeff = _mm_set1_epi16(MAX_LEVEL as i16);
    let zero = _mm_setzero_si128();
    // Load the 16 i32 coefficients and narrow (saturating) to two vectors
    // of eight i16 lanes.
    let (c0_arr, c1_arr, c2_arr, c3_arr) = split4_ref(coeffs);
    let c0_32 = simd_mem::_mm_loadu_si128(c0_arr);
    let c1_32 = simd_mem::_mm_loadu_si128(c1_arr);
    let c2_32 = simd_mem::_mm_loadu_si128(c2_arr);
    let c3_32 = simd_mem::_mm_loadu_si128(c3_arr);
    let in0 = _mm_packs_epi32(c0_32, c1_32);
    let in8 = _mm_packs_epi32(c2_32, c3_32);
    // Reciprocal quantizer steps, low 16 bits per lane (`_mm_set_epi16`
    // takes arguments from the highest lane down).
    let iq0 = _mm_set_epi16(
        matrix.iq[7] as i16,
        matrix.iq[6] as i16,
        matrix.iq[5] as i16,
        matrix.iq[4] as i16,
        matrix.iq[3] as i16,
        matrix.iq[2] as i16,
        matrix.iq[1] as i16,
        matrix.iq[0] as i16,
    );
    let iq8 = _mm_set_epi16(
        matrix.iq[15] as i16,
        matrix.iq[14] as i16,
        matrix.iq[13] as i16,
        matrix.iq[12] as i16,
        matrix.iq[11] as i16,
        matrix.iq[10] as i16,
        matrix.iq[9] as i16,
        matrix.iq[8] as i16,
    );
    // sign = 0xFFFF for negative lanes; |x| = (x ^ sign) - sign.
    let sign0 = _mm_cmpgt_epi16(zero, in0);
    let sign8 = _mm_cmpgt_epi16(zero, in8);
    let mut coeff0 = _mm_sub_epi16(_mm_xor_si128(in0, sign0), sign0);
    let mut coeff8 = _mm_sub_epi16(_mm_xor_si128(in8, sign8), sign8);
    if use_sharpen {
        // Add the frequency-sharpening offsets to the magnitudes.
        let sharpen0 = _mm_set_epi16(
            matrix.sharpen[7] as i16,
            matrix.sharpen[6] as i16,
            matrix.sharpen[5] as i16,
            matrix.sharpen[4] as i16,
            matrix.sharpen[3] as i16,
            matrix.sharpen[2] as i16,
            matrix.sharpen[1] as i16,
            matrix.sharpen[0] as i16,
        );
        let sharpen8 = _mm_set_epi16(
            matrix.sharpen[15] as i16,
            matrix.sharpen[14] as i16,
            matrix.sharpen[13] as i16,
            matrix.sharpen[12] as i16,
            matrix.sharpen[11] as i16,
            matrix.sharpen[10] as i16,
            matrix.sharpen[9] as i16,
            matrix.sharpen[8] as i16,
        );
        coeff0 = _mm_add_epi16(coeff0, sharpen0);
        coeff8 = _mm_add_epi16(coeff8, sharpen8);
    }
    // Unsigned 16x16 -> 32-bit products: interleave each lane's low and high
    // product halves back into i32 lanes.
    let coeff_iq0_h = _mm_mulhi_epu16(coeff0, iq0);
    let coeff_iq0_l = _mm_mullo_epi16(coeff0, iq0);
    let coeff_iq8_h = _mm_mulhi_epu16(coeff8, iq8);
    let coeff_iq8_l = _mm_mullo_epi16(coeff8, iq8);
    let out_00 = _mm_unpacklo_epi16(coeff_iq0_l, coeff_iq0_h);
    let out_04 = _mm_unpackhi_epi16(coeff_iq0_l, coeff_iq0_h);
    let out_08 = _mm_unpacklo_epi16(coeff_iq8_l, coeff_iq8_h);
    let out_12 = _mm_unpackhi_epi16(coeff_iq8_l, coeff_iq8_h);
    // level = (|coeff| * iq + bias) >> QFIX, as in `quantdiv`.
    let (b0, b1, b2, b3) = split4_ref(&matrix.bias);
    let bias_00 = simd_mem::_mm_loadu_si128(b0);
    let bias_04 = simd_mem::_mm_loadu_si128(b1);
    let bias_08 = simd_mem::_mm_loadu_si128(b2);
    let bias_12 = simd_mem::_mm_loadu_si128(b3);
    let out_00 = _mm_add_epi32(out_00, bias_00);
    let out_04 = _mm_add_epi32(out_04, bias_04);
    let out_08 = _mm_add_epi32(out_08, bias_08);
    let out_12 = _mm_add_epi32(out_12, bias_12);
    let out_00 = _mm_srai_epi32(out_00, QFIX as i32);
    let out_04 = _mm_srai_epi32(out_04, QFIX as i32);
    let out_08 = _mm_srai_epi32(out_08, QFIX as i32);
    let out_12 = _mm_srai_epi32(out_12, QFIX as i32);
    // Narrow back to i16, clamp to MAX_LEVEL, and restore the signs.
    let mut out0 = _mm_packs_epi32(out_00, out_04);
    let mut out8 = _mm_packs_epi32(out_08, out_12);
    out0 = _mm_min_epi16(out0, max_coeff);
    out8 = _mm_min_epi16(out8, max_coeff);
    out0 = _mm_sub_epi16(_mm_xor_si128(out0, sign0), sign0);
    out8 = _mm_sub_epi16(_mm_xor_si128(out8, sign8), sign8);
    // Sign-extend the i16 levels back to i32 and store in place.
    let sign0_ext = _mm_cmpgt_epi16(zero, out0);
    let sign8_ext = _mm_cmpgt_epi16(zero, out8);
    let out0_lo = _mm_unpacklo_epi16(out0, sign0_ext);
    let out0_hi = _mm_unpackhi_epi16(out0, sign0_ext);
    let out8_lo = _mm_unpacklo_epi16(out8, sign8_ext);
    let out8_hi = _mm_unpackhi_epi16(out8, sign8_ext);
    let (s0, s1, s2, s3) = split4_mut(coeffs);
    simd_mem::_mm_storeu_si128(s0, out0_lo);
    simd_mem::_mm_storeu_si128(s1, out0_hi);
    simd_mem::_mm_storeu_si128(s2, out8_lo);
    simd_mem::_mm_storeu_si128(s3, out8_hi);
    // `true` iff any of the 16 levels is nonzero.
    let packed = _mm_packs_epi16(out0, out8);
    _mm_movemask_epi8(_mm_cmpeq_epi8(packed, zero)) != 0xffff
}
/// Quantize only the AC coefficients of a block with SIMD, preserving the
/// caller's DC value; returns whether the block has nonzero levels.
///
/// There is no AC-only SIMD kernel, so the whole block is quantized and the
/// original, unquantized DC is written back afterwards. This single
/// `cfg(any(...))` definition replaces three previously duplicated,
/// byte-identical per-architecture copies.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "wasm32"))]
pub fn quantize_ac_only_simd(
    coeffs: &mut [i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
) -> bool {
    let dc = coeffs[0];
    let has_nz = quantize_block_simd(coeffs, matrix, use_sharpen);
    coeffs[0] = dc;
    // NOTE(review): `has_nz` can be true solely because the (discarded)
    // quantized DC was nonzero, while the scalar fallback reports only the
    // AC check; behavior kept as-is pending confirmation.
    coeffs[1..].iter().any(|&c| c != 0) || has_nz
}
/// Scalar fallback for architectures without a SIMD quantizer: quantize the
/// AC coefficients in place and report whether any AC level is nonzero.
#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    target_arch = "wasm32"
)))]
pub fn quantize_ac_only_simd(
    coeffs: &mut [i32; 16],
    matrix: &VP8Matrix,
    _use_sharpen: bool,
) -> bool {
    matrix.quantize_ac_only(coeffs);
    coeffs.iter().skip(1).any(|&c| c != 0)
}
/// Fused quantize + dequantize of a 16-coefficient block: writes levels to
/// `quantized` and reconstructions to `dequantized`, returning `true` if any
/// level is nonzero. Dispatches via `incant!`.
pub fn quantize_dequantize_block_simd(
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    incant!(
        quantize_dequantize_block_dispatch(coeffs, matrix, use_sharpen, quantized, dequantized),
        [v3, neon, wasm128, scalar]
    )
}
/// Scalar reference for the fused quantize + dequantize path.
///
/// For each position, writes the quantized level and its reconstruction
/// (`level * q`) and reports whether any level is nonzero.
pub(crate) fn quantize_dequantize_block_scalar(
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    let mut any_nonzero = false;
    let outputs = quantized.iter_mut().zip(dequantized.iter_mut());
    for (pos, (&input, (level_out, recon_out))) in coeffs.iter().zip(outputs).enumerate() {
        let level = matrix.quantize_coeff(input, pos);
        *level_out = level;
        *recon_out = level * matrix.q[pos] as i32;
        any_nonzero |= level != 0;
    }
    any_nonzero
}
// `incant!` target: on x86-64, route the fused path to the SSE2 kernel.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn quantize_dequantize_block_dispatch_v3(
    _token: X64V3Token,
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    quantize_dequantize_block_sse2(_token, coeffs, matrix, use_sharpen, quantized, dequantized)
}
// `incant!` target: NEON fused kernel.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn quantize_dequantize_block_dispatch_neon(
    token: NeonToken,
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    crate::common::simd_neon::quantize_dequantize_block_neon(
        token,
        coeffs,
        matrix,
        use_sharpen,
        quantized,
        dequantized,
    )
}
// `incant!` target: wasm SIMD128 fused kernel.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn quantize_dequantize_block_dispatch_wasm128(
    token: Wasm128Token,
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    crate::common::simd_wasm::quantize_dequantize_block_wasm_entry(
        token,
        coeffs,
        matrix,
        use_sharpen,
        quantized,
        dequantized,
    )
}
// `incant!` fallback: scalar fused path.
//
// NOTE(review): `_use_sharpen` is ignored — `quantize_coeff` always adds the
// stored `sharpen` table (all-zero except for Y1 matrices), so for Y1 with
// `use_sharpen == false` this fallback can differ from the SIMD paths;
// confirm callers always pass `true` for Y1.
#[inline(always)]
fn quantize_dequantize_block_dispatch_scalar(
    _token: ScalarToken,
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    _use_sharpen: bool,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    quantize_dequantize_block_scalar(coeffs, matrix, quantized, dequantized)
}
/// SSE2 kernel for the fused quantize + dequantize path.
///
/// The quantization half mirrors `quantize_block_sse2` exactly (same i16
/// saturation and truncated-`iq` caveats); the dequantization half then
/// multiplies the clamped i16 levels by the u16 steps and sign-extends.
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn quantize_dequantize_block_sse2(
    _token: X64V3Token,
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    let max_coeff = _mm_set1_epi16(MAX_LEVEL as i16);
    let zero = _mm_setzero_si128();
    // Narrow the 16 i32 inputs (saturating) to two vectors of eight i16 lanes.
    let (c0_arr, c1_arr, c2_arr, c3_arr) = split4_ref(coeffs);
    let c0_32 = simd_mem::_mm_loadu_si128(c0_arr);
    let c1_32 = simd_mem::_mm_loadu_si128(c1_arr);
    let c2_32 = simd_mem::_mm_loadu_si128(c2_arr);
    let c3_32 = simd_mem::_mm_loadu_si128(c3_arr);
    let in0 = _mm_packs_epi32(c0_32, c1_32);
    let in8 = _mm_packs_epi32(c2_32, c3_32);
    // Reciprocal steps, low 16 bits per lane.
    let iq0 = _mm_set_epi16(
        matrix.iq[7] as i16,
        matrix.iq[6] as i16,
        matrix.iq[5] as i16,
        matrix.iq[4] as i16,
        matrix.iq[3] as i16,
        matrix.iq[2] as i16,
        matrix.iq[1] as i16,
        matrix.iq[0] as i16,
    );
    let iq8 = _mm_set_epi16(
        matrix.iq[15] as i16,
        matrix.iq[14] as i16,
        matrix.iq[13] as i16,
        matrix.iq[12] as i16,
        matrix.iq[11] as i16,
        matrix.iq[10] as i16,
        matrix.iq[9] as i16,
        matrix.iq[8] as i16,
    );
    // sign = 0xFFFF for negative lanes; |x| = (x ^ sign) - sign.
    let sign0 = _mm_cmpgt_epi16(zero, in0);
    let sign8 = _mm_cmpgt_epi16(zero, in8);
    let mut coeff0 = _mm_sub_epi16(_mm_xor_si128(in0, sign0), sign0);
    let mut coeff8 = _mm_sub_epi16(_mm_xor_si128(in8, sign8), sign8);
    if use_sharpen {
        // Add the frequency-sharpening offsets to the magnitudes.
        let sharpen0 = _mm_set_epi16(
            matrix.sharpen[7] as i16,
            matrix.sharpen[6] as i16,
            matrix.sharpen[5] as i16,
            matrix.sharpen[4] as i16,
            matrix.sharpen[3] as i16,
            matrix.sharpen[2] as i16,
            matrix.sharpen[1] as i16,
            matrix.sharpen[0] as i16,
        );
        let sharpen8 = _mm_set_epi16(
            matrix.sharpen[15] as i16,
            matrix.sharpen[14] as i16,
            matrix.sharpen[13] as i16,
            matrix.sharpen[12] as i16,
            matrix.sharpen[11] as i16,
            matrix.sharpen[10] as i16,
            matrix.sharpen[9] as i16,
            matrix.sharpen[8] as i16,
        );
        coeff0 = _mm_add_epi16(coeff0, sharpen0);
        coeff8 = _mm_add_epi16(coeff8, sharpen8);
    }
    // Rebuild the unsigned 16x16 -> 32-bit products from their hi/lo halves.
    let coeff_iq0_h = _mm_mulhi_epu16(coeff0, iq0);
    let coeff_iq0_l = _mm_mullo_epi16(coeff0, iq0);
    let coeff_iq8_h = _mm_mulhi_epu16(coeff8, iq8);
    let coeff_iq8_l = _mm_mullo_epi16(coeff8, iq8);
    let out_00 = _mm_unpacklo_epi16(coeff_iq0_l, coeff_iq0_h);
    let out_04 = _mm_unpackhi_epi16(coeff_iq0_l, coeff_iq0_h);
    let out_08 = _mm_unpacklo_epi16(coeff_iq8_l, coeff_iq8_h);
    let out_12 = _mm_unpackhi_epi16(coeff_iq8_l, coeff_iq8_h);
    // level = (|coeff| * iq + bias) >> QFIX, as in `quantdiv`.
    let (b0, b1, b2, b3) = split4_ref(&matrix.bias);
    let bias_00 = simd_mem::_mm_loadu_si128(b0);
    let bias_04 = simd_mem::_mm_loadu_si128(b1);
    let bias_08 = simd_mem::_mm_loadu_si128(b2);
    let bias_12 = simd_mem::_mm_loadu_si128(b3);
    let out_00 = _mm_srai_epi32(_mm_add_epi32(out_00, bias_00), QFIX as i32);
    let out_04 = _mm_srai_epi32(_mm_add_epi32(out_04, bias_04), QFIX as i32);
    let out_08 = _mm_srai_epi32(_mm_add_epi32(out_08, bias_08), QFIX as i32);
    let out_12 = _mm_srai_epi32(_mm_add_epi32(out_12, bias_12), QFIX as i32);
    // Narrow to i16, clamp to MAX_LEVEL, restore signs.
    let mut qout0 = _mm_packs_epi32(out_00, out_04);
    let mut qout8 = _mm_packs_epi32(out_08, out_12);
    qout0 = _mm_min_epi16(qout0, max_coeff);
    qout8 = _mm_min_epi16(qout8, max_coeff);
    qout0 = _mm_sub_epi16(_mm_xor_si128(qout0, sign0), sign0);
    qout8 = _mm_sub_epi16(_mm_xor_si128(qout8, sign8), sign8);
    // Dequantize: dq = level * q in 16-bit lanes.
    // NOTE(review): `_mm_mullo_epi16` keeps only the low 16 bits of each
    // product, and the sign-extension below uses the 16-bit result's sign;
    // this matches the 32-bit scalar path only while |level| * q fits in
    // i16 — presumably guaranteed by the input ranges, TODO confirm.
    let (q0_arr, q8_arr) = split2_ref(&matrix.q);
    let q0 = simd_mem::_mm_loadu_si128(q0_arr);
    let q8 = simd_mem::_mm_loadu_si128(q8_arr);
    let dq0 = _mm_mullo_epi16(qout0, q0);
    let dq8 = _mm_mullo_epi16(qout8, q8);
    // Sign-extend the i16 levels to i32 and store them.
    let qsign0 = _mm_cmpgt_epi16(zero, qout0);
    let qsign8 = _mm_cmpgt_epi16(zero, qout8);
    let (qs0, qs1, qs2, qs3) = split4_mut(quantized);
    simd_mem::_mm_storeu_si128(qs0, _mm_unpacklo_epi16(qout0, qsign0));
    simd_mem::_mm_storeu_si128(qs1, _mm_unpackhi_epi16(qout0, qsign0));
    simd_mem::_mm_storeu_si128(qs2, _mm_unpacklo_epi16(qout8, qsign8));
    simd_mem::_mm_storeu_si128(qs3, _mm_unpackhi_epi16(qout8, qsign8));
    // Sign-extend the i16 reconstructions to i32 and store them.
    let dsign0 = _mm_cmpgt_epi16(zero, dq0);
    let dsign8 = _mm_cmpgt_epi16(zero, dq8);
    let (ds0, ds1, ds2, ds3) = split4_mut(dequantized);
    simd_mem::_mm_storeu_si128(ds0, _mm_unpacklo_epi16(dq0, dsign0));
    simd_mem::_mm_storeu_si128(ds1, _mm_unpackhi_epi16(dq0, dsign0));
    simd_mem::_mm_storeu_si128(ds2, _mm_unpacklo_epi16(dq8, dsign8));
    simd_mem::_mm_storeu_si128(ds3, _mm_unpackhi_epi16(dq8, dsign8));
    // `true` iff any of the 16 levels is nonzero.
    let packed = _mm_packs_epi16(qout0, qout8);
    _mm_movemask_epi8(_mm_cmpeq_epi8(packed, zero)) != 0xffff
}
/// Fused AC-only quantize + dequantize with SIMD: run the whole block through
/// the fused kernel, then copy the caller's DC through unmodified in both
/// outputs; returns whether the block has nonzero levels.
///
/// This single `cfg(any(...))` definition replaces three previously
/// duplicated, byte-identical per-architecture copies.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "wasm32"))]
pub fn quantize_dequantize_ac_only_simd(
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    use_sharpen: bool,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    let has_nz =
        quantize_dequantize_block_simd(coeffs, matrix, use_sharpen, quantized, dequantized);
    // Pass the original (unquantized) DC straight through.
    quantized[0] = coeffs[0];
    dequantized[0] = coeffs[0];
    // NOTE(review): `has_nz` can be true solely because the (overwritten)
    // quantized DC was nonzero; behavior kept as-is pending confirmation.
    has_nz || quantized[1..].iter().any(|&c| c != 0)
}
/// Scalar fallback for the fused AC-only path on architectures without a
/// SIMD kernel.
///
/// Fix: the cfg previously also excluded `target_arch = "x86"`, but no SIMD
/// variant of this function covers 32-bit x86, so that target was left with
/// no definition at all. Dropping `x86` from the exclusion list makes the
/// coverage match `quantize_ac_only_simd`'s fallback.
#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    target_arch = "wasm32"
)))]
pub fn quantize_dequantize_ac_only_simd(
    coeffs: &[i32; 16],
    matrix: &VP8Matrix,
    _use_sharpen: bool,
    quantized: &mut [i32; 16],
    dequantized: &mut [i32; 16],
) -> bool {
    let has_nz = quantize_dequantize_block_scalar(coeffs, matrix, quantized, dequantized);
    // Pass the original (unquantized) DC straight through.
    quantized[0] = coeffs[0];
    dequantized[0] = coeffs[0];
    has_nz || quantized[1..].iter().any(|&c| c != 0)
}
#[cfg(test)]
mod tests {
    use super::*;
    // The fused quantize+dequantize path must produce exactly the same
    // levels, reconstructions, and nonzero flag as running the separate
    // quantize and dequantize kernels on the same input.
    #[test]
    fn test_quantize_dequantize_block_matches_separate() {
        let matrix = VP8Matrix::new(4, 6, MatrixType::Y1);
        let coeffs: [i32; 16] = [120, -45, 30, -15, 22, -8, 5, -3, 10, -6, 4, -2, 3, -1, 1, 0];
        let mut fused_quantized = [0i32; 16];
        let mut fused_dequantized = [0i32; 16];
        let fused_nz = quantize_dequantize_block_simd(
            &coeffs,
            &matrix,
            true,
            &mut fused_quantized,
            &mut fused_dequantized,
        );
        // Reference: quantize first, then dequantize the quantized levels.
        let mut sep_quantized = coeffs;
        quantize_block_simd(&mut sep_quantized, &matrix, true);
        let mut sep_dequantized = sep_quantized;
        matrix.dequantize_block(&mut sep_dequantized);
        let sep_nz = sep_quantized.iter().any(|&c| c != 0);
        assert_eq!(
            fused_quantized, sep_quantized,
            "Quantized mismatch.\nFused: {:?}\nSep: {:?}",
            fused_quantized, sep_quantized
        );
        assert_eq!(
            fused_dequantized, sep_dequantized,
            "Dequantized mismatch.\nFused: {:?}\nSep: {:?}",
            fused_dequantized, sep_dequantized
        );
        assert_eq!(fused_nz, sep_nz, "has_nz mismatch");
    }
    // An all-zero block must stay all-zero and report no nonzero levels.
    #[test]
    fn test_quantize_dequantize_block_all_zero() {
        let matrix = VP8Matrix::new(80, 100, MatrixType::UV);
        let coeffs = [0i32; 16];
        let mut quantized = [0i32; 16];
        let mut dequantized = [0i32; 16];
        let has_nz = quantize_dequantize_block_simd(
            &coeffs,
            &matrix,
            false,
            &mut quantized,
            &mut dequantized,
        );
        assert!(!has_nz);
        assert_eq!(quantized, [0; 16]);
        assert_eq!(dequantized, [0; 16]);
    }
}