#![allow(dead_code)]
#![allow(clippy::needless_range_loop)]
use super::{BPS, C8DC8, C8TM8, I16DC16, I16TM16};
use archmage::prelude::*;
#[cfg(target_arch = "aarch64")]
use archmage::intrinsics::aarch64 as simd_mem;
#[cfg(target_arch = "x86_64")]
use archmage::intrinsics::x86_64 as simd_mem;
#[inline]
fn fill_block(dst: &mut [u8], value: u8, size: usize) {
    // Fill a `size`×`size` square at the top-left of `dst`, whose rows are
    // laid out with a stride of BPS bytes.
    for row in dst.chunks_mut(BPS).take(size) {
        row[..size].fill(value);
    }
}
#[inline]
fn vertical_pred(dst: &mut [u8], top: Option<&[u8]>, size: usize) {
    // Vertical prediction: replicate the row above the block into every row.
    // With no top row available, fall back to the flat default value 127.
    match top {
        Some(top) => {
            for row in dst.chunks_mut(BPS).take(size) {
                row[..size].copy_from_slice(&top[..size]);
            }
        }
        None => fill_block(dst, 127, size),
    }
}
#[inline]
fn horizontal_pred(dst: &mut [u8], left: Option<&[u8]>, size: usize) {
    // Horizontal prediction: each row is a solid run of its left-edge pixel.
    // With no left column available, fall back to the flat default value 129.
    match left {
        Some(left) => {
            for (row, &edge) in dst.chunks_mut(BPS).zip(left.iter()).take(size) {
                row[..size].fill(edge);
            }
        }
        None => fill_block(dst, 129, size),
    }
}
/// DC prediction for a 16x16 luma block: fill with the rounded average of the
/// available boundary pixels (top row and/or left column), or 0x80 when
/// neither edge exists.
pub fn pred_luma16_dc(dst: &mut [u8], left: Option<&[u8]>, top: Option<&[u8]>) {
    let sum16 = |edge: &[u8]| edge[..16].iter().map(|&px| u32::from(px)).sum::<u32>();
    let dc_val = match (top, left) {
        // Both edges: average of 32 pixels with rounding.
        (Some(t), Some(l)) => ((sum16(t) + sum16(l) + 16) >> 5) as u8,
        // One edge: double its sum so the same rounding shift applies.
        (Some(t), None) => ((2 * sum16(t) + 16) >> 5) as u8,
        (None, Some(l)) => ((2 * sum16(l) + 16) >> 5) as u8,
        (None, None) => 0x80,
    };
    for row in dst.chunks_mut(BPS).take(16) {
        row[..16].fill(dc_val);
    }
}
/// TrueMotion prediction for a 16x16 luma block.
///
/// `left_with_corner` holds the top-left corner at index 0 followed by the 16
/// left pixels. TM needs both edges; with only one edge it degrades to the
/// matching directional predictor, and with neither it fills with 129.
pub fn pred_luma16_tm(dst: &mut [u8], left_with_corner: Option<&[u8]>, top: Option<&[u8]>) {
    match (left_with_corner, top) {
        (Some(left), Some(top)) => {
            incant!(pred_luma16_tm_impl(dst, left, top), [v3, neon, scalar]);
        }
        (Some(left), None) => horizontal_pred(dst, Some(&left[1..17]), 16),
        (None, Some(top)) => vertical_pred(dst, Some(top), 16),
        (None, None) => fill_block(dst, 129, 16),
    }
}
// NOTE: the original carried a duplicated `#[cfg(target_arch = "x86_64")]`
// attribute; one copy is sufficient.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn pred_luma16_tm_impl_v3(token: X64V3Token, dst: &mut [u8], left: &[u8], top: &[u8]) {
    // `incant!` dispatch target for x86-64 v3: forward to the SSE2 kernel.
    pred_luma16_tm_sse2(token, dst, left, top);
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn pred_luma16_tm_impl_neon(tok: NeonToken, dst: &mut [u8], left: &[u8], top: &[u8]) {
    // `incant!` dispatch target for aarch64: forward to the NEON kernel.
    pred_luma16_tm_neon(tok, dst, left, top);
}
#[inline(always)]
fn pred_luma16_tm_impl_scalar(_tok: ScalarToken, dst: &mut [u8], left: &[u8], top: &[u8]) {
    // `incant!` fallback target used when no SIMD capability token is available.
    pred_luma16_tm_scalar(dst, left, top);
}
#[inline]
fn pred_luma16_tm_scalar(dst: &mut [u8], left: &[u8], top: &[u8]) {
    // TrueMotion: dst[y][x] = clamp_u8(left[1 + y] + top[x] - top_left),
    // where `left[0]` is the top-left corner pixel.
    let top_left = i32::from(left[0]);
    for (row, &edge) in dst.chunks_mut(BPS).zip(left[1..17].iter()) {
        // (left pixel - corner) is constant across the row; hoist it.
        let base = i32::from(edge) - top_left;
        for (out, &t) in row[..16].iter_mut().zip(top[..16].iter()) {
            *out = (base + i32::from(t)).clamp(0, 255) as u8;
        }
    }
}
#[cfg(target_arch = "x86_64")]
#[arcane]
// TrueMotion 16x16 SSE2 kernel: dst[y][x] = clamp_u8(left[1+y] + top[x] - left[0]).
// `left` carries the top-left corner pixel at index 0, then 16 left pixels.
fn pred_luma16_tm_sse2(_token: X64V3Token, dst: &mut [u8], left: &[u8], top: &[u8]) {
    let zero = _mm_setzero_si128();
    let top_arr = <&[u8; 16]>::try_from(&top[..16]).unwrap();
    let top_vec = simd_mem::_mm_loadu_si128(top_arr);
    // Widen the 16 top bytes into two vectors of eight i16 lanes so the
    // subtraction below cannot wrap.
    let top_lo = _mm_unpacklo_epi8(top_vec, zero);
    let top_hi = _mm_unpackhi_epi8(top_vec, zero);
    let tl = _mm_set1_epi16(i16::from(left[0]));
    // (top[x] - top_left) is loop-invariant; compute once before the row loop.
    let top_minus_tl_lo = _mm_sub_epi16(top_lo, tl);
    let top_minus_tl_hi = _mm_sub_epi16(top_hi, tl);
    for y in 0..16 {
        let l = _mm_set1_epi16(i16::from(left[1 + y]));
        let sum_lo = _mm_add_epi16(l, top_minus_tl_lo);
        let sum_hi = _mm_add_epi16(l, top_minus_tl_hi);
        // packus saturates i16 -> u8, providing the [0, 255] clamp for free.
        let packed = _mm_packus_epi16(sum_lo, sum_hi);
        let dst_arr = <&mut [u8; 16]>::try_from(&mut dst[y * BPS..y * BPS + 16]).unwrap();
        simd_mem::_mm_storeu_si128(dst_arr, packed);
    }
}
/// Compute both 16x16 luma predictors (DC and TM) into their slots in `yuv_p`.
/// DC takes the 16 left pixels without the corner; TM also needs the corner.
pub fn make_luma16_preds(yuv_p: &mut [u8], left_with_corner: Option<&[u8]>, top: Option<&[u8]>) {
    let left_no_corner = left_with_corner.map(|l| &l[1..17]);
    pred_luma16_dc(&mut yuv_p[I16DC16..], left_no_corner, top);
    pred_luma16_tm(&mut yuv_p[I16TM16..], left_with_corner, top);
}
/// DC prediction for an 8x8 chroma block: fill with the rounded average of
/// the available boundary pixels, or 0x80 when neither edge exists.
pub fn pred_chroma8_dc(dst: &mut [u8], left: Option<&[u8]>, top: Option<&[u8]>) {
    let sum8 = |edge: &[u8]| edge[..8].iter().map(|&px| u32::from(px)).sum::<u32>();
    let dc_val = match (top, left) {
        // Both edges: average of 16 pixels with rounding.
        (Some(t), Some(l)) => ((sum8(t) + sum8(l) + 8) >> 4) as u8,
        // One edge: double its sum so the same rounding shift applies.
        (Some(t), None) => ((2 * sum8(t) + 8) >> 4) as u8,
        (None, Some(l)) => ((2 * sum8(l) + 8) >> 4) as u8,
        (None, None) => 0x80,
    };
    for row in dst.chunks_mut(BPS).take(8) {
        row[..8].fill(dc_val);
    }
}
/// TrueMotion prediction for an 8x8 chroma block.
///
/// `left_with_corner` holds the top-left corner at index 0 followed by the 8
/// left pixels. TM needs both edges; with only one edge it degrades to the
/// matching directional predictor, and with neither it fills with 129.
pub fn pred_chroma8_tm(dst: &mut [u8], left_with_corner: Option<&[u8]>, top: Option<&[u8]>) {
    match (left_with_corner, top) {
        (Some(left), Some(top)) => {
            incant!(pred_chroma8_tm_impl(dst, left, top), [v3, neon, scalar]);
        }
        (Some(left), None) => horizontal_pred(dst, Some(&left[1..9]), 8),
        (None, Some(top)) => vertical_pred(dst, Some(top), 8),
        (None, None) => fill_block(dst, 129, 8),
    }
}
#[inline]
fn pred_chroma8_tm_scalar(dst: &mut [u8], left: &[u8], top: &[u8]) {
    // TrueMotion: dst[y][x] = clamp_u8(left[1 + y] + top[x] - top_left),
    // where `left[0]` is the top-left corner pixel.
    let top_left = i32::from(left[0]);
    for (row, &edge) in dst.chunks_mut(BPS).zip(left[1..9].iter()) {
        // (left pixel - corner) is constant across the row; hoist it.
        let base = i32::from(edge) - top_left;
        for (out, &t) in row[..8].iter_mut().zip(top[..8].iter()) {
            *out = (base + i32::from(t)).clamp(0, 255) as u8;
        }
    }
}
// NOTE: the original carried a duplicated `#[cfg(target_arch = "x86_64")]`
// attribute; one copy is sufficient.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn pred_chroma8_tm_impl_v3(token: X64V3Token, dst: &mut [u8], left: &[u8], top: &[u8]) {
    // `incant!` dispatch target for x86-64 v3: forward to the SSE2 kernel.
    pred_chroma8_tm_sse2(token, dst, left, top);
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn pred_chroma8_tm_impl_neon(tok: NeonToken, dst: &mut [u8], left: &[u8], top: &[u8]) {
    // `incant!` dispatch target for aarch64: forward to the NEON kernel.
    pred_chroma8_tm_neon(tok, dst, left, top);
}
#[inline(always)]
fn pred_chroma8_tm_impl_scalar(_tok: ScalarToken, dst: &mut [u8], left: &[u8], top: &[u8]) {
    // `incant!` fallback target used when no SIMD capability token is available.
    pred_chroma8_tm_scalar(dst, left, top);
}
#[cfg(target_arch = "x86_64")]
#[arcane]
// TrueMotion 8x8 SSE2 chroma kernel:
// dst[y][x] = clamp_u8(left[1+y] + top[x] - left[0]).
fn pred_chroma8_tm_sse2(_token: X64V3Token, dst: &mut [u8], left: &[u8], top: &[u8]) {
    let zero = _mm_setzero_si128();
    // Pad the 8 top bytes into a 16-byte buffer so the full 128-bit load stays
    // in bounds; only the low half matters after widening.
    let mut top_padded = [0u8; 16];
    top_padded[..8].copy_from_slice(&top[..8]);
    let top_bytes = simd_mem::_mm_loadu_si128(&top_padded);
    // Widen the 8 live bytes to i16 lanes so the subtraction cannot wrap.
    let top_i16 = _mm_unpacklo_epi8(top_bytes, zero);
    let tl = _mm_set1_epi16(i16::from(left[0]));
    // (top[x] - top_left) is loop-invariant; compute once before the row loop.
    let top_minus_tl = _mm_sub_epi16(top_i16, tl);
    for y in 0..8 {
        let l = _mm_set1_epi16(i16::from(left[1 + y]));
        let sum = _mm_add_epi16(l, top_minus_tl);
        // packus saturates i16 -> u8 (the clamp); the high 8 lanes are zeros.
        let packed = _mm_packus_epi16(sum, zero);
        // Bounce through a 16-byte scratch buffer so the store never writes
        // past the 8-pixel destination row.
        let mut tmp = [0u8; 16];
        simd_mem::_mm_storeu_si128(&mut tmp, packed);
        dst[y * BPS..y * BPS + 8].copy_from_slice(&tmp[..8]);
    }
}
#[cfg(target_arch = "aarch64")]
#[arcane]
// TrueMotion 16x16 NEON kernel: dst[y][x] = clamp_u8(left[1+y] + top[x] - left[0]).
// `left` carries the top-left corner pixel at index 0, then 16 left pixels.
fn pred_luma16_tm_neon(_token: NeonToken, dst: &mut [u8], left: &[u8], top: &[u8]) {
    let top_vec = simd_mem::vld1q_u8(<&[u8; 16]>::try_from(&top[..16]).unwrap());
    // Widen the 16 top bytes into two vectors of eight i16 lanes so the
    // subtraction below cannot wrap.
    let top_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_vec)));
    let top_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_vec)));
    let tl = vdupq_n_s16(i16::from(left[0]));
    // (top[x] - top_left) is loop-invariant; compute once before the row loop.
    let top_minus_tl_lo = vsubq_s16(top_lo, tl);
    let top_minus_tl_hi = vsubq_s16(top_hi, tl);
    for y in 0..16 {
        let l = vdupq_n_s16(i16::from(left[1 + y]));
        let sum_lo = vaddq_s16(l, top_minus_tl_lo);
        let sum_hi = vaddq_s16(l, top_minus_tl_hi);
        // vqmovun saturates i16 -> u8, providing the [0, 255] clamp for free.
        let packed = vcombine_u8(vqmovun_s16(sum_lo), vqmovun_s16(sum_hi));
        let dst_arr = <&mut [u8; 16]>::try_from(&mut dst[y * BPS..y * BPS + 16]).unwrap();
        simd_mem::vst1q_u8(dst_arr, packed);
    }
}
#[cfg(target_arch = "aarch64")]
#[arcane]
// TrueMotion 8x8 NEON chroma kernel:
// dst[y][x] = clamp_u8(left[1+y] + top[x] - left[0]).
fn pred_chroma8_tm_neon(_token: NeonToken, dst: &mut [u8], left: &[u8], top: &[u8]) {
    let top_bytes = simd_mem::vld1_u8(<&[u8; 8]>::try_from(&top[..8]).unwrap());
    // Widen the 8 top bytes to i16 lanes so the subtraction cannot wrap.
    let top_i16 = vreinterpretq_s16_u16(vmovl_u8(top_bytes));
    let tl = vdupq_n_s16(i16::from(left[0]));
    // (top[x] - top_left) is loop-invariant; compute once before the row loop.
    let top_minus_tl = vsubq_s16(top_i16, tl);
    for y in 0..8 {
        let l = vdupq_n_s16(i16::from(left[1 + y]));
        let sum = vaddq_s16(l, top_minus_tl);
        // vqmovun saturates i16 -> u8, providing the [0, 255] clamp for free.
        let packed = vqmovun_s16(sum);
        let dst_arr = <&mut [u8; 8]>::try_from(&mut dst[y * BPS..y * BPS + 8]).unwrap();
        simd_mem::vst1_u8(dst_arr, packed);
    }
}
/// Compute the 8x8 chroma predictors (DC and TM) for both U and V planes.
///
/// `uv_top` stores the top edge as U then V, 8 pixels each. The left edges
/// carry the top-left corner pixel at index 0; DC uses only the 8 pixels
/// after it, while TM needs the corner too.
pub fn make_chroma8_preds(
    yuv_p: &mut [u8],
    u_left_with_corner: Option<&[u8]>,
    v_left_with_corner: Option<&[u8]>,
    uv_top: Option<&[u8]>,
) {
    let u_top = uv_top.map(|t| &t[0..8]);
    let v_top = uv_top.map(|t| &t[8..16]);
    let u_left = u_left_with_corner.map(|l| &l[1..9]);
    let v_left = v_left_with_corner.map(|l| &l[1..9]);
    pred_chroma8_dc(&mut yuv_p[C8DC8..], u_left, u_top);
    pred_chroma8_dc(&mut yuv_p[C8DC8 + 8..], v_left, v_top);
    pred_chroma8_tm(&mut yuv_p[C8TM8..], u_left_with_corner, u_top);
    pred_chroma8_tm(&mut yuv_p[C8TM8 + 8..], v_left_with_corner, v_top);
}