#![allow(dead_code)]
#![allow(clippy::needless_range_loop)]
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
#[inline(always)]
fn split4_ref<T>(arr: &[T; 16]) -> (&[T; 4], &[T; 4], &[T; 4], &[T; 4]) {
    // Reborrow each quarter of the 16-element array as a fixed-size chunk.
    // Every conversion is infallible: the slice bounds are compile-time
    // constants of exactly four elements each.
    let q0: &[T; 4] = arr[0..4].try_into().unwrap();
    let q1: &[T; 4] = arr[4..8].try_into().unwrap();
    let q2: &[T; 4] = arr[8..12].try_into().unwrap();
    let q3: &[T; 4] = arr[12..16].try_into().unwrap();
    (q0, q1, q2, q3)
}
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
#[inline(always)]
fn split2_mut<T>(arr: &mut [T; 16]) -> (&mut [T; 8], &mut [T; 8]) {
    // Hand out two disjoint mutable halves of the array, eight elements each.
    // `split_at_mut` guarantees the halves do not alias; the array-reference
    // conversions cannot fail because both slices have length 8.
    let (lo, hi) = arr.split_at_mut(8);
    let lo: &mut [T; 8] = lo.try_into().unwrap();
    let hi: &mut [T; 8] = hi.try_into().unwrap();
    (lo, hi)
}
use archmage::prelude::*;
#[cfg(target_arch = "aarch64")]
use archmage::intrinsics::aarch64 as simd_mem;
#[cfg(target_arch = "x86_64")]
use archmage::intrinsics::x86_64 as simd_mem;
use super::cost::{LevelCosts, vp8_bit_cost};
use super::tables::{MAX_LEVEL, MAX_VARIABLE_LEVEL, VP8_ENC_BANDS, VP8_LEVEL_FIXED_COSTS};
use crate::common::types::TokenProbTables;
/// A 4x4 block of 16 quantized coefficients plus the scan metadata needed
/// to price it: where coding starts, where the last non-zero coefficient
/// sits, and which coefficient-type plane selects the probability tables.
pub struct Residual<'a> {
    /// Index of the first coefficient to code (callers below pass 1 for AC
    /// blocks whose DC is coded separately, 0 otherwise).
    pub first: usize,
    /// Index of the last non-zero coefficient, or -1 when the block is empty.
    pub last: i32,
    /// The 16 quantized coefficients in scan order.
    pub coeffs: &'a [i32; 16],
    /// Plane/type index into the probability and level-cost tables.
    pub coeff_type: usize,
}
impl<'a> Residual<'a> {
pub fn new(coeffs: &'a [i32; 16], coeff_type: usize, first: usize) -> Self {
let last = coeffs
.iter()
.rposition(|&c| c != 0)
.map(|i| i as i32)
.unwrap_or(-1);
Self {
first,
last,
coeffs,
coeff_type,
}
}
}
/// Returns the bit cost of coding `res` given the initial coding context
/// `ctx0` (0..=2). Dispatches through archmage's `incant!` to the best
/// available candidate path: x86-64-v3, NEON, wasm SIMD128, or scalar.
#[inline]
pub fn get_residual_cost(
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    incant!(
        get_residual_cost_dispatch(ctx0, res, costs, probs),
        [v3, neon, wasm128, scalar]
    )
}
/// x86-64-v3 candidate selected by `incant!`; forwards to the `#[arcane]`
/// entry point of the SSE2-based implementation.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn get_residual_cost_dispatch_v3(
    token: X64V3Token,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    get_residual_cost_entry(token, ctx0, res, costs, probs)
}
/// NEON candidate selected by `incant!`; forwards to the `#[arcane]` entry
/// point of the NEON implementation.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn get_residual_cost_dispatch_neon(
    token: NeonToken,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    get_residual_cost_neon_entry(token, ctx0, res, costs, probs)
}
/// wasm SIMD128 candidate selected by `incant!`; forwards to the `#[arcane]`
/// entry point of the wasm implementation.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn get_residual_cost_dispatch_wasm128(
    token: Wasm128Token,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    get_residual_cost_wasm_entry(token, ctx0, res, costs, probs)
}
/// Scalar fallback candidate selected by `incant!`; the token carries no
/// capabilities and is ignored.
#[inline(always)]
fn get_residual_cost_dispatch_scalar(
    _token: ScalarToken,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    get_residual_cost_scalar(ctx0, res, costs, probs)
}
/// Scalar reference implementation of the residual bit-cost computation.
///
/// Walks the coefficients from `res.first` through `res.last`, charging the
/// fixed per-level cost plus the context-dependent table cost for each one,
/// and finally the end-of-block bit when positions remain after the last
/// non-zero coefficient.
///
/// Changes vs. the previous revision: the `band` index was recomputed a
/// second time between the `p0` lookup and the cost-table lookup even though
/// `n` had not changed, and `ctx` was re-clamped with `.min(2)` although it
/// is already 0, 1, or 2 by construction. Both redundancies are removed;
/// behavior is unchanged.
#[inline]
pub(crate) fn get_residual_cost_scalar(
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    let ctype = res.coeff_type;
    let mut n = res.first;
    // Probability that the first scanned position is not end-of-block.
    let band = VP8_ENC_BANDS[n] as usize;
    let p0 = probs[ctype][band][ctx0][0];
    let mut ctx = ctx0;
    // For ctx0 == 0 the "not end-of-block" bit is charged up front; for
    // ctx0 > 0 that bit is accounted for elsewhere in the cost tables.
    let mut cost = if ctx0 == 0 {
        vp8_bit_cost(true, p0) as u32
    } else {
        0
    };
    // Empty residual: only the end-of-block bit is coded.
    if res.last < 0 {
        return vp8_bit_cost(false, p0) as u32;
    }
    let costs_for_type = &costs.level_cost[ctype];
    let mut t = &costs_for_type[band][ctx];
    let last = res.last as usize;
    // Every coefficient strictly before the last non-zero one.
    while n < last {
        let v = res.coeffs[n].unsigned_abs() as usize;
        cost +=
            VP8_LEVEL_FIXED_COSTS[v.min(MAX_LEVEL)] as u32 + t[v.min(MAX_VARIABLE_LEVEL)] as u32;
        // Context for the next position is the magnitude capped at 2.
        ctx = if v >= 2 { 2 } else { v };
        n += 1;
        // `& 7` clamps the band index to the 8-entry cost table.
        let next_band = (VP8_ENC_BANDS[n] as usize) & 7;
        t = &costs_for_type[next_band][ctx];
    }
    // The last non-zero coefficient itself, plus the end-of-block bit if any
    // positions remain after it (i.e. it is not at index 15).
    {
        let v = res.coeffs[n].unsigned_abs() as usize;
        debug_assert!(v != 0, "Last coefficient should be non-zero");
        cost +=
            VP8_LEVEL_FIXED_COSTS[v.min(MAX_LEVEL)] as u32 + t[v.min(MAX_VARIABLE_LEVEL)] as u32;
        if n < 15 {
            let next_band = VP8_ENC_BANDS[n + 1] as usize;
            let next_ctx = if v == 1 { 1 } else { 2 };
            let last_p0 = probs[ctype][next_band][next_ctx][0];
            cost += vp8_bit_cost(false, last_p0) as u32;
        }
    }
    cost
}
use super::cost::level_costs::LevelCostArray;
/// Shared scalar accumulation loop used by all SIMD paths.
///
/// The SIMD front-ends precompute, per position: `levels` (level clamped for
/// indexing the variable-cost table), `abs_levels` (un-clamped absolute
/// level, index into the fixed-cost table), and `ctxs` (next-position
/// context, magnitude capped at 2). This loop then walks positions
/// `first..=last`, summing fixed plus table-driven costs while threading the
/// band/context through to each next position's cost table.
#[inline(always)]
fn residual_cost_loop(
    first: usize,
    last: usize,
    initial_ctx: usize,
    levels: &[u8; 16],
    abs_levels: &[u16; 16],
    ctxs: &[u8; 16],
    costs_for_type: &[[LevelCostArray; 3]; 8],
) -> u32 {
    let mut cost = 0u32;
    let mut n = first;
    let mut band = VP8_ENC_BANDS[n] as usize;
    let mut t: &LevelCostArray = &costs_for_type[band][initial_ctx.min(2)];
    while n < last {
        // `& 0x7F` keeps the table index in range regardless of the stored
        // byte's high bit.
        let level = (levels[n] as usize) & 0x7F;
        let flevel = abs_levels[n] as usize;
        cost += VP8_LEVEL_FIXED_COSTS[flevel.min(MAX_LEVEL)] as u32 + t[level] as u32;
        let ctx = (ctxs[n] as usize).min(2);
        n += 1;
        // `& 7` clamps the band index to the 8-entry cost table.
        band = (VP8_ENC_BANDS[n] as usize) & 7;
        t = &costs_for_type[band][ctx];
    }
    // The final (last non-zero) coefficient, using the table selected by the
    // previous iteration.
    {
        let level = (levels[n] as usize) & 0x7F;
        let flevel = abs_levels[n] as usize;
        cost += VP8_LEVEL_FIXED_COSTS[flevel.min(MAX_LEVEL)] as u32 + t[level] as u32;
    }
    cost
}
/// `#[arcane]` entry shim for the x86-64 path; pairs the entry attribute
/// with the `#[rite]` implementation body.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn get_residual_cost_entry(
    _token: X64V3Token,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    get_residual_cost_sse2(_token, ctx0, res, costs, probs)
}
/// x86-64 implementation: vectorizes the per-coefficient context/level
/// precomputation with SSE2-class intrinsics, then prices the positions with
/// the shared scalar `residual_cost_loop`.
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn get_residual_cost_sse2(
    _token: X64V3Token,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    // Per-position scratch tables filled by the SIMD block below.
    let mut ctxs: [u8; 16] = [0; 16];
    let mut levels: [u8; 16] = [0; 16];
    let mut abs_levels: [u16; 16] = [0; 16];
    let ctype = res.coeff_type;
    let n = res.first;
    let band = VP8_ENC_BANDS[n] as usize;
    let p0 = probs[ctype][band][ctx0][0];
    let ctx = ctx0;
    // For ctx0 == 0 the "not end-of-block" bit is charged up front.
    let mut cost = if ctx0 == 0 {
        vp8_bit_cost(true, p0) as u32
    } else {
        0
    };
    // Empty block: only the end-of-block bit is coded.
    if res.last < 0 {
        return vp8_bit_cost(false, p0) as u32;
    }
    {
        let zero = _mm_setzero_si128();
        let k_cst2 = _mm_set1_epi8(2);
        let k_cst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL as i8);
        // Load the 16 i32 coefficients as four 128-bit vectors, then
        // saturating-pack down to 16 x i16.
        let (c0_arr, c1_arr, c2_arr, c3_arr) = split4_ref(res.coeffs);
        let c0_32 = simd_mem::_mm_loadu_si128(c0_arr);
        let c1_32 = simd_mem::_mm_loadu_si128(c1_arr);
        let c2_32 = simd_mem::_mm_loadu_si128(c2_arr);
        let c3_32 = simd_mem::_mm_loadu_si128(c3_arr);
        let c0 = _mm_packs_epi32(c0_32, c1_32);
        let c1 = _mm_packs_epi32(c2_32, c3_32);
        // Absolute value via max(x, -x) on the 16-bit lanes.
        let d0 = _mm_sub_epi16(zero, c0);
        let d1 = _mm_sub_epi16(zero, c1);
        let e0 = _mm_max_epi16(c0, d0);
        let e1 = _mm_max_epi16(c1, d1);
        // Pack the abs values to bytes, then derive per-position contexts
        // (capped at 2) and table levels (capped at MAX_VARIABLE_LEVEL).
        let f = _mm_packs_epi16(e0, e1);
        let g = _mm_min_epu8(f, k_cst2);
        let h = _mm_min_epu8(f, k_cst67);
        simd_mem::_mm_storeu_si128(&mut ctxs, g);
        simd_mem::_mm_storeu_si128(&mut levels, h);
        // Keep the un-clamped 16-bit abs levels for the fixed-cost table.
        let (al0, al1) = split2_mut(&mut abs_levels);
        simd_mem::_mm_storeu_si128(al0, e0);
        simd_mem::_mm_storeu_si128(al1, e1);
    }
    let costs_for_type = &costs.level_cost[ctype];
    let last = res.last as usize;
    cost += residual_cost_loop(n, last, ctx, &levels, &abs_levels, &ctxs, costs_for_type);
    // End-of-block bit after the last coefficient, unless it sits at 15.
    if last < 15 {
        let next_band = VP8_ENC_BANDS[last + 1] as usize;
        let next_ctx = ctxs[last] as usize;
        let last_p0 = probs[ctype][next_band][next_ctx][0];
        cost += vp8_bit_cost(false, last_p0) as u32;
    }
    cost
}
/// `#[arcane]` entry shim for the x86-64 last-non-zero scan.
#[cfg(target_arch = "x86_64")]
#[arcane]
#[allow(dead_code)]
fn find_last_nonzero_simd_entry(_token: X64V3Token, coeffs: &[i32; 16]) -> i32 {
    find_last_nonzero_simd(_token, coeffs)
}
/// Returns the index of the last non-zero coefficient, or -1 if all 16 are
/// zero. The i32 -> i16 -> i8 saturating packs preserve the zero/non-zero
/// status of each lane, so the byte-level compare is sufficient.
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(dead_code)]
fn find_last_nonzero_simd(_token: X64V3Token, coeffs: &[i32; 16]) -> i32 {
    let zero = _mm_setzero_si128();
    let (c0_arr, c1_arr, c2_arr, c3_arr) = split4_ref(coeffs);
    let c0_32 = simd_mem::_mm_loadu_si128(c0_arr);
    let c1_32 = simd_mem::_mm_loadu_si128(c1_arr);
    let c2_32 = simd_mem::_mm_loadu_si128(c2_arr);
    let c3_32 = simd_mem::_mm_loadu_si128(c3_arr);
    let c0 = _mm_packs_epi32(c0_32, c1_32);
    let c1 = _mm_packs_epi32(c2_32, c3_32);
    let m0 = _mm_packs_epi16(c0, c1);
    // m1 has 0xFF in every byte lane that equals zero; movemask gathers the
    // lane sign bits, and XOR with 0xFFFF flips the bitmap to "non-zero".
    let m1 = _mm_cmpeq_epi8(m0, zero);
    let mask = 0x0000ffff_u32 ^ (_mm_movemask_epi8(m1) as u32);
    if mask == 0 {
        -1
    } else {
        // Position of the highest set bit == last non-zero index.
        (31 - mask.leading_zeros()) as i32
    }
}
/// `#[arcane]` entry shim for the NEON path; pairs the entry attribute with
/// the `#[rite]` implementation body.
#[cfg(target_arch = "aarch64")]
#[arcane]
fn get_residual_cost_neon_entry(
    _token: NeonToken,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    get_residual_cost_neon(_token, ctx0, res, costs, probs)
}
/// NEON implementation: vectorizes the per-coefficient context/level
/// precomputation, then prices the positions with the shared scalar
/// `residual_cost_loop`. Mirrors the x86-64 path.
#[cfg(target_arch = "aarch64")]
#[rite]
pub(crate) fn get_residual_cost_neon(
    _token: NeonToken,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    // Per-position scratch tables filled by the SIMD block below.
    let mut ctxs: [u8; 16] = [0; 16];
    let mut levels: [u8; 16] = [0; 16];
    let mut abs_levels: [u16; 16] = [0; 16];
    let ctype = res.coeff_type;
    let n = res.first;
    let band = VP8_ENC_BANDS[n] as usize;
    let p0 = probs[ctype][band][ctx0][0];
    let ctx = ctx0;
    // For ctx0 == 0 the "not end-of-block" bit is charged up front.
    let mut cost = if ctx0 == 0 {
        vp8_bit_cost(true, p0) as u32
    } else {
        0
    };
    // Empty block: only the end-of-block bit is coded.
    if res.last < 0 {
        return vp8_bit_cost(false, p0) as u32;
    }
    {
        let k_cst2 = vdupq_n_u8(2);
        let k_cst67 = vdupq_n_u8(MAX_VARIABLE_LEVEL as u8);
        // Load the 16 i32 coefficients and saturating-narrow them to i16.
        let (c0_arr, c1_arr, c2_arr, c3_arr) = split4_ref(res.coeffs);
        let c0 = simd_mem::vld1q_s32(c0_arr);
        let c1 = simd_mem::vld1q_s32(c1_arr);
        let c2 = simd_mem::vld1q_s32(c2_arr);
        let c3 = simd_mem::vld1q_s32(c3_arr);
        let s0 = vcombine_s16(vqmovn_s32(c0), vqmovn_s32(c1));
        let s1 = vcombine_s16(vqmovn_s32(c2), vqmovn_s32(c3));
        // Absolute values on the 16-bit lanes, then saturating-narrow to u8.
        let e0 = vabsq_s16(s0);
        let e1 = vabsq_s16(s1);
        let f = vcombine_u8(vqmovun_s16(e0), vqmovun_s16(e1));
        // Per-position contexts (capped at 2) and table levels (capped at
        // MAX_VARIABLE_LEVEL).
        let g = vminq_u8(f, k_cst2);
        let h = vminq_u8(f, k_cst67);
        simd_mem::vst1q_u8(&mut ctxs, g);
        simd_mem::vst1q_u8(&mut levels, h);
        // Keep the un-clamped 16-bit abs levels for the fixed-cost table.
        let (al0, al1) = split2_mut(&mut abs_levels);
        simd_mem::vst1q_u16(al0, vreinterpretq_u16_s16(e0));
        simd_mem::vst1q_u16(al1, vreinterpretq_u16_s16(e1));
    }
    let costs_for_type = &costs.level_cost[ctype];
    let last = res.last as usize;
    cost += residual_cost_loop(n, last, ctx, &levels, &abs_levels, &ctxs, costs_for_type);
    // End-of-block bit after the last coefficient, unless it sits at 15.
    if last < 15 {
        let next_band = VP8_ENC_BANDS[last + 1] as usize;
        let next_ctx = ctxs[last] as usize;
        let last_p0 = probs[ctype][next_band][next_ctx][0];
        cost += vp8_bit_cost(false, last_p0) as u32;
    }
    cost
}
/// `#[arcane]` entry shim for the NEON last-non-zero scan.
#[cfg(target_arch = "aarch64")]
#[arcane]
#[allow(dead_code)]
fn find_last_nonzero_neon_entry(_token: NeonToken, coeffs: &[i32; 16]) -> i32 {
    find_last_nonzero_neon(_token, coeffs)
}
/// Returns the index of the last non-zero coefficient, or -1 if all 16 are
/// zero. Saturating narrows (i32 -> i16 -> i8) preserve each lane's
/// zero/non-zero status, so the byte-level compare is sufficient.
#[cfg(target_arch = "aarch64")]
#[rite]
#[allow(dead_code)]
fn find_last_nonzero_neon(_token: NeonToken, coeffs: &[i32; 16]) -> i32 {
    let zero = vdupq_n_s32(0);
    let (c0_arr, c1_arr, c2_arr, c3_arr) = split4_ref(coeffs);
    let c0 = simd_mem::vld1q_s32(c0_arr);
    let c1 = simd_mem::vld1q_s32(c1_arr);
    let c2 = simd_mem::vld1q_s32(c2_arr);
    let c3 = simd_mem::vld1q_s32(c3_arr);
    let s0 = vcombine_s16(vqmovn_s32(c0), vqmovn_s32(c1));
    let s1 = vcombine_s16(vqmovn_s32(c2), vqmovn_s32(c3));
    let m0 = vcombine_s8(vqmovn_s16(s0), vqmovn_s16(s1));
    // ne_zero has all bits set in each lane whose byte is non-zero.
    let eq_zero = vceqq_s8(m0, vdupq_n_s8(0));
    let ne_zero = vmvnq_s8(vreinterpretq_s8_u8(eq_zero));
    // Mask a 0..15 index vector by the non-zero lanes; the horizontal max is
    // then the highest non-zero index.
    let indices: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    let idx_vec = simd_mem::vld1q_u8(&indices);
    let masked = vandq_u8(idx_vec, vreinterpretq_u8_s8(ne_zero));
    let max_idx = vmaxvq_u8(masked);
    if max_idx > 0 {
        max_idx as i32
    } else {
        // max_idx == 0 is ambiguous: either only lane 0 is non-zero, or all
        // lanes are zero. Disambiguate with a direct scalar check.
        let _ = zero; if coeffs[0] != 0 { 0 } else { -1 }
    }
}
/// `#[arcane]` entry shim for the wasm SIMD128 path; pairs the entry
/// attribute with the `#[rite]` implementation body.
#[cfg(target_arch = "wasm32")]
#[arcane]
fn get_residual_cost_wasm_entry(
    _token: Wasm128Token,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    get_residual_cost_wasm(_token, ctx0, res, costs, probs)
}
/// wasm SIMD128 implementation: vectorizes the per-coefficient context/level
/// precomputation, then prices the positions with the shared scalar
/// `residual_cost_loop`. Results are moved to the scratch arrays by
/// per-lane extraction (NOTE(review): presumably to avoid unsafe
/// raw-pointer stores — confirm against the project's safety conventions).
#[cfg(target_arch = "wasm32")]
#[rite]
pub(crate) fn get_residual_cost_wasm(
    _token: Wasm128Token,
    ctx0: usize,
    res: &Residual,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    // Per-position scratch tables filled by the SIMD block below.
    let mut ctxs: [u8; 16] = [0; 16];
    let mut levels: [u8; 16] = [0; 16];
    let mut abs_levels: [u16; 16] = [0; 16];
    let ctype = res.coeff_type;
    let n = res.first;
    let band = VP8_ENC_BANDS[n] as usize;
    let p0 = probs[ctype][band][ctx0][0];
    let ctx = ctx0;
    // For ctx0 == 0 the "not end-of-block" bit is charged up front.
    let mut cost = if ctx0 == 0 {
        vp8_bit_cost(true, p0) as u32
    } else {
        0
    };
    // Empty block: only the end-of-block bit is coded.
    if res.last < 0 {
        return vp8_bit_cost(false, p0) as u32;
    }
    {
        let k_cst2 = u8x16_splat(2);
        let k_cst67 = u8x16_splat(MAX_VARIABLE_LEVEL as u8);
        // Build four i32x4 vectors from the coefficients.
        let c0 = i32x4(res.coeffs[0], res.coeffs[1], res.coeffs[2], res.coeffs[3]);
        let c1 = i32x4(res.coeffs[4], res.coeffs[5], res.coeffs[6], res.coeffs[7]);
        let c2 = i32x4(res.coeffs[8], res.coeffs[9], res.coeffs[10], res.coeffs[11]);
        let c3 = i32x4(
            res.coeffs[12],
            res.coeffs[13],
            res.coeffs[14],
            res.coeffs[15],
        );
        // Saturating-narrow to i16, take absolute values, narrow to u8.
        let s0 = i16x8_narrow_i32x4(c0, c1);
        let s1 = i16x8_narrow_i32x4(c2, c3);
        let e0 = i16x8_abs(s0);
        let e1 = i16x8_abs(s1);
        let f = u8x16_narrow_i16x8(e0, e1);
        // Per-position contexts (capped at 2) and table levels (capped at
        // MAX_VARIABLE_LEVEL).
        let g = u8x16_min(f, k_cst2);
        let h = u8x16_min(f, k_cst67);
        // Spill the context bytes lane by lane (lane index must be const).
        ctxs[0] = u8x16_extract_lane::<0>(g);
        ctxs[1] = u8x16_extract_lane::<1>(g);
        ctxs[2] = u8x16_extract_lane::<2>(g);
        ctxs[3] = u8x16_extract_lane::<3>(g);
        ctxs[4] = u8x16_extract_lane::<4>(g);
        ctxs[5] = u8x16_extract_lane::<5>(g);
        ctxs[6] = u8x16_extract_lane::<6>(g);
        ctxs[7] = u8x16_extract_lane::<7>(g);
        ctxs[8] = u8x16_extract_lane::<8>(g);
        ctxs[9] = u8x16_extract_lane::<9>(g);
        ctxs[10] = u8x16_extract_lane::<10>(g);
        ctxs[11] = u8x16_extract_lane::<11>(g);
        ctxs[12] = u8x16_extract_lane::<12>(g);
        ctxs[13] = u8x16_extract_lane::<13>(g);
        ctxs[14] = u8x16_extract_lane::<14>(g);
        ctxs[15] = u8x16_extract_lane::<15>(g);
        // Spill the clamped levels.
        levels[0] = u8x16_extract_lane::<0>(h);
        levels[1] = u8x16_extract_lane::<1>(h);
        levels[2] = u8x16_extract_lane::<2>(h);
        levels[3] = u8x16_extract_lane::<3>(h);
        levels[4] = u8x16_extract_lane::<4>(h);
        levels[5] = u8x16_extract_lane::<5>(h);
        levels[6] = u8x16_extract_lane::<6>(h);
        levels[7] = u8x16_extract_lane::<7>(h);
        levels[8] = u8x16_extract_lane::<8>(h);
        levels[9] = u8x16_extract_lane::<9>(h);
        levels[10] = u8x16_extract_lane::<10>(h);
        levels[11] = u8x16_extract_lane::<11>(h);
        levels[12] = u8x16_extract_lane::<12>(h);
        levels[13] = u8x16_extract_lane::<13>(h);
        levels[14] = u8x16_extract_lane::<14>(h);
        levels[15] = u8x16_extract_lane::<15>(h);
        // Spill the un-clamped 16-bit abs levels (e0 = lanes 0..8,
        // e1 = lanes 8..16).
        abs_levels[0] = u16x8_extract_lane::<0>(e0);
        abs_levels[1] = u16x8_extract_lane::<1>(e0);
        abs_levels[2] = u16x8_extract_lane::<2>(e0);
        abs_levels[3] = u16x8_extract_lane::<3>(e0);
        abs_levels[4] = u16x8_extract_lane::<4>(e0);
        abs_levels[5] = u16x8_extract_lane::<5>(e0);
        abs_levels[6] = u16x8_extract_lane::<6>(e0);
        abs_levels[7] = u16x8_extract_lane::<7>(e0);
        abs_levels[8] = u16x8_extract_lane::<0>(e1);
        abs_levels[9] = u16x8_extract_lane::<1>(e1);
        abs_levels[10] = u16x8_extract_lane::<2>(e1);
        abs_levels[11] = u16x8_extract_lane::<3>(e1);
        abs_levels[12] = u16x8_extract_lane::<4>(e1);
        abs_levels[13] = u16x8_extract_lane::<5>(e1);
        abs_levels[14] = u16x8_extract_lane::<6>(e1);
        abs_levels[15] = u16x8_extract_lane::<7>(e1);
    }
    let costs_for_type = &costs.level_cost[ctype];
    let last = res.last as usize;
    cost += residual_cost_loop(n, last, ctx, &levels, &abs_levels, &ctxs, costs_for_type);
    // End-of-block bit after the last coefficient, unless it sits at 15.
    if last < 15 {
        let next_band = VP8_ENC_BANDS[last + 1] as usize;
        let next_ctx = ctxs[last] as usize;
        let last_p0 = probs[ctype][next_band][next_ctx][0];
        cost += vp8_bit_cost(false, last_p0) as u32;
    }
    cost
}
/// Prices a single 4x4 luma block of quantized levels.
///
/// `top_nz` and `left_nz` are the non-zero flags of the neighbouring blocks
/// and together form the initial coding context (0..=2). Returns the bit
/// cost plus whether this block itself contains any non-zero coefficient.
pub fn get_cost_luma4(
    levels: &[i32; 16],
    top_nz: bool,
    left_nz: bool,
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> (u32, bool) {
    let initial_ctx = usize::from(top_nz) + usize::from(left_nz);
    let residual = Residual::new(levels, 3, 0);
    let bit_cost = get_residual_cost(initial_ctx, &residual, costs, probs);
    (bit_cost, residual.last >= 0)
}
/// Prices a full 16x16 luma macroblock: the DC block (context 0) plus the
/// sixteen AC blocks, each starting at coefficient 1 with a context derived
/// from its top and left neighbours' non-zero flags.
pub fn get_cost_luma16(
    dc_levels: &[i32; 16],
    ac_levels: &[[i32; 16]; 16],
    costs: &LevelCosts,
    probs: &TokenProbTables,
) -> u32 {
    let dc_res = Residual::new(dc_levels, 1, 0);
    let mut total = get_residual_cost(0, &dc_res, costs, probs);
    // Running non-zero flags per column (top) and per row (left).
    let mut top = [false; 4];
    let mut left = [false; 4];
    for (idx, block) in ac_levels.iter().enumerate() {
        let y = idx / 4;
        let x = idx % 4;
        let ctx = usize::from(top[x]) + usize::from(left[y]);
        let ac_res = Residual::new(block, 0, 1);
        total += get_residual_cost(ctx, &ac_res, costs, probs);
        let nz = ac_res.last >= 0;
        top[x] = nz;
        left[y] = nz;
    }
    total
}
/// Prices the eight 4x4 chroma blocks, each coded with a zero initial
/// context, and returns the summed bit cost.
pub fn get_cost_uv(uv_levels: &[[i32; 16]; 8], costs: &LevelCosts, probs: &TokenProbTables) -> u32 {
    uv_levels
        .iter()
        .map(|block| get_residual_cost(0, &Residual::new(block, 2, 0), costs, probs))
        .sum()
}