#![allow(dead_code)]
use archmage::prelude::*;
#[cfg(target_arch = "x86_64")]
use archmage::intrinsics::x86_64 as simd_mem;
/// Weighted transform sum for one 4x4 block of `input`.
///
/// Thin wrapper: forwards `input`, `stride` and `w` unchanged to
/// [`t_transform_scalar`] and returns its result.
#[inline]
pub fn t_transform(input: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
t_transform_scalar(input, stride, w)
}
/// Scalar 4x4 Hadamard-style transform followed by a weighted sum of the
/// absolute transformed coefficients.
///
/// * `input`  - pixel data; row `r` of the block is read from
///   `input[r * stride .. r * stride + 4]`.
/// * `stride` - distance, in elements, between the starts of consecutive rows.
/// * `w`      - sixteen weights; the coefficient produced for output row `k`
///   and column `c` is multiplied by `w[4 * k + c]`.
///
/// Returns the accumulated weighted sum.
///
/// # Panics
///
/// Panics if any of the four 4-pixel rows lies outside `input`.
#[inline]
fn t_transform_scalar(input: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
    // Horizontal pass: one add/subtract butterfly per source row.
    let mut rows = [[0i32; 4]; 4];
    for (r, out) in rows.iter_mut().enumerate() {
        let base = r * stride;
        let px = &input[base..base + 4];
        let p0 = i32::from(px[0]);
        let p1 = i32::from(px[1]);
        let p2 = i32::from(px[2]);
        let p3 = i32::from(px[3]);

        let even_sum = p0 + p2;
        let odd_sum = p1 + p3;
        let odd_diff = p1 - p3;
        let even_diff = p0 - p2;

        *out = [
            even_sum + odd_sum,
            even_diff + odd_diff,
            even_diff - odd_diff,
            even_sum - odd_sum,
        ];
    }

    // Vertical pass: the same butterfly down each column, then weight and
    // accumulate the magnitudes.
    let mut acc = 0i32;
    for col in 0..4 {
        let even_sum = rows[0][col] + rows[2][col];
        let odd_sum = rows[1][col] + rows[3][col];
        let odd_diff = rows[1][col] - rows[3][col];
        let even_diff = rows[0][col] - rows[2][col];

        let coeffs = [
            even_sum + odd_sum,
            even_diff + odd_diff,
            even_diff - odd_diff,
            even_sum - odd_sum,
        ];
        for (k, coeff) in coeffs.iter().enumerate() {
            acc += i32::from(w[4 * k + col]) * coeff.abs();
        }
    }
    acc
}
/// Public entry point for the 4x4 distortion between blocks `a` and `b`.
///
/// Hands the call `tdisto_4x4_dispatch(a, b, stride, w)` to `incant!` with the
/// variant list `[v3, neon, wasm128, scalar]`.
///
/// NOTE(review): presumably `incant!` picks one of the
/// `tdisto_4x4_dispatch_{v3,neon,wasm128,scalar}` functions defined in this
/// file based on available CPU features — confirm against the archmage docs.
///
/// The scalar variant in this file returns
/// `|t_transform(b) - t_transform(a)| >> 5`.
#[inline]
pub fn tdisto_4x4(a: &[u8], b: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
incant!(
tdisto_4x4_dispatch(a, b, stride, w),
[v3, neon, wasm128, scalar]
)
}
/// `v3` variant of `tdisto_4x4_dispatch`; compiled only for `x86_64` targets.
///
/// Forwards the token and all arguments to
/// `crate::common::simd_sse::tdisto_4x4_fused_sse2`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn tdisto_4x4_dispatch_v3(
_token: X64V3Token,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
crate::common::simd_sse::tdisto_4x4_fused_sse2(_token, a, b, stride, w)
}
/// `neon` variant of `tdisto_4x4_dispatch`; compiled only for `aarch64` targets.
///
/// Forwards the token and all arguments to
/// `crate::common::simd_neon::tdisto_4x4_fused_neon`.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn tdisto_4x4_dispatch_neon(
token: NeonToken,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
crate::common::simd_neon::tdisto_4x4_fused_neon(token, a, b, stride, w)
}
/// `wasm128` variant of `tdisto_4x4_dispatch`; compiled only for `wasm32` targets.
///
/// Forwards the token and all arguments to
/// `crate::common::simd_wasm::tdisto_4x4_fused_wasm`.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn tdisto_4x4_dispatch_wasm128(
token: Wasm128Token,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
crate::common::simd_wasm::tdisto_4x4_fused_wasm(token, a, b, stride, w)
}
/// `scalar` variant of `tdisto_4x4_dispatch`.
///
/// Runs [`t_transform`] on `a` and on `b` with the same `stride` and weights
/// `w`, then returns the absolute difference of the two sums shifted right by
/// five bits. The token argument is not used.
#[inline(always)]
fn tdisto_4x4_dispatch_scalar(
    _token: ScalarToken,
    a: &[u8],
    b: &[u8],
    stride: usize,
    w: &[u16; 16],
) -> i32 {
    let weighted_a = t_transform(a, stride, w);
    let weighted_b = t_transform(b, stride, w);
    let delta = weighted_b - weighted_a;
    delta.abs() >> 5
}
/// Public entry point for the 16x16 distortion between blocks `a` and `b`.
///
/// Hands the call `tdisto_16x16_dispatch(a, b, stride, w)` to `incant!` with
/// the variant list `[v3, neon, wasm128, scalar]`.
///
/// NOTE(review): presumably `incant!` picks one of the
/// `tdisto_16x16_dispatch_{v3,neon,wasm128,scalar}` functions defined in this
/// file based on available CPU features — confirm against the archmage docs.
///
/// The scalar variant in this file sums [`tdisto_4x4`] over a 4x4 grid of
/// 4x4 sub-blocks.
#[inline]
pub fn tdisto_16x16(a: &[u8], b: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
incant!(
tdisto_16x16_dispatch(a, b, stride, w),
[v3, neon, wasm128, scalar]
)
}
/// `v3` variant of `tdisto_16x16_dispatch`; compiled only for `x86_64` targets.
///
/// Forwards the token and all arguments to `tdisto_16x16_sse2`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn tdisto_16x16_dispatch_v3(
token: X64V3Token,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
tdisto_16x16_sse2(token, a, b, stride, w)
}
/// `neon` variant of `tdisto_16x16_dispatch`; compiled only for `aarch64` targets.
///
/// Forwards the token and all arguments to `tdisto_16x16_neon`.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn tdisto_16x16_dispatch_neon(
token: NeonToken,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
tdisto_16x16_neon(token, a, b, stride, w)
}
/// `wasm128` variant of `tdisto_16x16_dispatch`; compiled only for `wasm32` targets.
///
/// Forwards the token and all arguments to `tdisto_16x16_wasm`.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn tdisto_16x16_dispatch_wasm128(
token: Wasm128Token,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
tdisto_16x16_wasm(token, a, b, stride, w)
}
/// `scalar` variant of `tdisto_16x16_dispatch`.
///
/// Walks a 4x4 grid of 4x4 sub-blocks in row-major order and adds up
/// [`tdisto_4x4`] for each one. Sub-block (`block_row`, `block_col`) starts at
/// `block_row * 4 * stride + block_col * 4` in both `a` and `b`. The token
/// argument is not used.
///
/// # Panics
///
/// Panics if a sub-block start offset lies beyond the end of `a` or `b`.
#[inline(always)]
fn tdisto_16x16_dispatch_scalar(
    _token: ScalarToken,
    a: &[u8],
    b: &[u8],
    stride: usize,
    w: &[u16; 16],
) -> i32 {
    let mut total = 0i32;
    for block_row in 0..4 {
        let row_base = block_row * 4 * stride;
        for block_col in 0..4 {
            let start = row_base + block_col * 4;
            total += tdisto_4x4(&a[start..], &b[start..], stride, w);
        }
    }
    total
}
/// 16x16 distortion built from sixteen calls to
/// `crate::common::simd_sse::tdisto_4x4_fused_sse2`.
///
/// Sub-blocks are visited in row-major order over a 4x4 grid; sub-block
/// (`block_row`, `block_col`) starts at
/// `block_row * 4 * stride + block_col * 4` in both `a` and `b`. The per-block
/// results are summed and returned.
///
/// # Panics
///
/// Panics if a sub-block start offset lies beyond the end of `a` or `b`.
#[arcane]
fn tdisto_16x16_sse2(_token: X64V3Token, a: &[u8], b: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
    let mut total = 0i32;
    for block_row in 0..4 {
        let row_base = block_row * 4 * stride;
        for block_col in 0..4 {
            let start = row_base + block_col * 4;
            let sub_a = &a[start..];
            let sub_b = &b[start..];
            total +=
                crate::common::simd_sse::tdisto_4x4_fused_sse2(_token, sub_a, sub_b, stride, w);
        }
    }
    total
}
/// Public entry point for the 8x8 distortion between blocks `a` and `b`.
///
/// Hands the call `tdisto_8x8_dispatch(a, b, stride, w)` to `incant!` with the
/// variant list `[v3, neon, wasm128, scalar]`.
///
/// NOTE(review): presumably `incant!` picks one of the
/// `tdisto_8x8_dispatch_{v3,neon,wasm128,scalar}` functions defined in this
/// file based on available CPU features — confirm against the archmage docs.
///
/// The scalar variant in this file sums [`tdisto_4x4`] over a 2x2 grid of
/// 4x4 sub-blocks.
#[inline]
pub fn tdisto_8x8(a: &[u8], b: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
incant!(
tdisto_8x8_dispatch(a, b, stride, w),
[v3, neon, wasm128, scalar]
)
}
/// `v3` variant of `tdisto_8x8_dispatch`; compiled only for `x86_64` targets.
///
/// Forwards the token and all arguments to `tdisto_8x8_sse2`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn tdisto_8x8_dispatch_v3(
token: X64V3Token,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
tdisto_8x8_sse2(token, a, b, stride, w)
}
/// `neon` variant of `tdisto_8x8_dispatch`; compiled only for `aarch64` targets.
///
/// Forwards the token and all arguments to `tdisto_8x8_neon`.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn tdisto_8x8_dispatch_neon(
token: NeonToken,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
tdisto_8x8_neon(token, a, b, stride, w)
}
/// `wasm128` variant of `tdisto_8x8_dispatch`; compiled only for `wasm32` targets.
///
/// Forwards the token and all arguments to `tdisto_8x8_wasm`.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn tdisto_8x8_dispatch_wasm128(
token: Wasm128Token,
a: &[u8],
b: &[u8],
stride: usize,
w: &[u16; 16],
) -> i32 {
tdisto_8x8_wasm(token, a, b, stride, w)
}
/// `scalar` variant of `tdisto_8x8_dispatch`.
///
/// Walks a 2x2 grid of 4x4 sub-blocks in row-major order and adds up
/// [`tdisto_4x4`] for each one. Sub-block (`block_row`, `block_col`) starts at
/// `block_row * 4 * stride + block_col * 4` in both `a` and `b`. The token
/// argument is not used.
///
/// # Panics
///
/// Panics if a sub-block start offset lies beyond the end of `a` or `b`.
#[inline(always)]
fn tdisto_8x8_dispatch_scalar(
    _token: ScalarToken,
    a: &[u8],
    b: &[u8],
    stride: usize,
    w: &[u16; 16],
) -> i32 {
    let mut total = 0i32;
    for block_row in 0..2 {
        let row_base = block_row * 4 * stride;
        for block_col in 0..2 {
            let start = row_base + block_col * 4;
            total += tdisto_4x4(&a[start..], &b[start..], stride, w);
        }
    }
    total
}
/// 16x16 distortion built from sixteen calls to
/// `crate::common::simd_neon::tdisto_4x4_fused_inner`.
///
/// Sub-blocks are visited in row-major order over a 4x4 grid; sub-block
/// (`block_row`, `block_col`) starts at
/// `block_row * 4 * stride + block_col * 4` in both `a` and `b`. The per-block
/// results are summed and returned.
///
/// # Panics
///
/// Panics if a sub-block start offset lies beyond the end of `a` or `b`.
#[arcane]
fn tdisto_16x16_neon(_token: NeonToken, a: &[u8], b: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
    let mut total = 0i32;
    for block_row in 0..4 {
        let row_base = block_row * 4 * stride;
        for block_col in 0..4 {
            let start = row_base + block_col * 4;
            let sub_a = &a[start..];
            let sub_b = &b[start..];
            total +=
                crate::common::simd_neon::tdisto_4x4_fused_inner(_token, sub_a, sub_b, stride, w);
        }
    }
    total
}
/// 8x8 distortion built from four calls to
/// `crate::common::simd_neon::tdisto_4x4_fused_inner`.
///
/// Sub-blocks are visited in row-major order over a 2x2 grid; sub-block
/// (`block_row`, `block_col`) starts at
/// `block_row * 4 * stride + block_col * 4` in both `a` and `b`. The per-block
/// results are summed and returned.
///
/// # Panics
///
/// Panics if a sub-block start offset lies beyond the end of `a` or `b`.
#[arcane]
fn tdisto_8x8_neon(_token: NeonToken, a: &[u8], b: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
    let mut total = 0i32;
    for block_row in 0..2 {
        let row_base = block_row * 4 * stride;
        for block_col in 0..2 {
            let start = row_base + block_col * 4;
            let sub_a = &a[start..];
            let sub_b = &b[start..];
            total +=
                crate::common::simd_neon::tdisto_4x4_fused_inner(_token, sub_a, sub_b, stride, w);
        }
    }
    total
}
/// 16x16 distortion built from sixteen calls to
/// `crate::common::simd_wasm::tdisto_4x4_fused_wasm`.
///
/// Sub-blocks are visited in row-major order over a 4x4 grid; sub-block
/// (`block_row`, `block_col`) starts at
/// `block_row * 4 * stride + block_col * 4` in both `a` and `b`. The per-block
/// results are summed and returned.
///
/// # Panics
///
/// Panics if a sub-block start offset lies beyond the end of `a` or `b`.
#[arcane]
fn tdisto_16x16_wasm(
    _token: Wasm128Token,
    a: &[u8],
    b: &[u8],
    stride: usize,
    w: &[u16; 16],
) -> i32 {
    let mut total = 0i32;
    for block_row in 0..4 {
        let row_base = block_row * 4 * stride;
        for block_col in 0..4 {
            let start = row_base + block_col * 4;
            let sub_a = &a[start..];
            let sub_b = &b[start..];
            total +=
                crate::common::simd_wasm::tdisto_4x4_fused_wasm(_token, sub_a, sub_b, stride, w);
        }
    }
    total
}
/// 8x8 distortion built from four calls to
/// `crate::common::simd_wasm::tdisto_4x4_fused_wasm`.
///
/// Sub-blocks are visited in row-major order over a 2x2 grid; sub-block
/// (`block_row`, `block_col`) starts at
/// `block_row * 4 * stride + block_col * 4` in both `a` and `b`. The per-block
/// results are summed and returned.
///
/// # Panics
///
/// Panics if a sub-block start offset lies beyond the end of `a` or `b`.
#[arcane]
fn tdisto_8x8_wasm(_token: Wasm128Token, a: &[u8], b: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
    let mut total = 0i32;
    for block_row in 0..2 {
        let row_base = block_row * 4 * stride;
        for block_col in 0..2 {
            let start = row_base + block_col * 4;
            let sub_a = &a[start..];
            let sub_b = &b[start..];
            total +=
                crate::common::simd_wasm::tdisto_4x4_fused_wasm(_token, sub_a, sub_b, stride, w);
        }
    }
    total
}
/// 8x8 distortion built from four calls to
/// `crate::common::simd_sse::tdisto_4x4_fused_sse2`.
///
/// Sub-blocks are visited in row-major order over a 2x2 grid; sub-block
/// (`block_row`, `block_col`) starts at
/// `block_row * 4 * stride + block_col * 4` in both `a` and `b`. The per-block
/// results are summed and returned.
///
/// # Panics
///
/// Panics if a sub-block start offset lies beyond the end of `a` or `b`.
#[arcane]
fn tdisto_8x8_sse2(_token: X64V3Token, a: &[u8], b: &[u8], stride: usize, w: &[u16; 16]) -> i32 {
    let mut total = 0i32;
    for block_row in 0..2 {
        let row_base = block_row * 4 * stride;
        for block_col in 0..2 {
            let start = row_base + block_col * 4;
            let sub_a = &a[start..];
            let sub_b = &b[start..];
            total +=
                crate::common::simd_sse::tdisto_4x4_fused_sse2(_token, sub_a, sub_b, stride, w);
        }
    }
    total
}
/// Public entry point for the 16x16 "flat source" test.
///
/// Hands the call `is_flat_source_16_impl(src, stride)` to `incant!` with the
/// variant list `[v3, neon, wasm128, scalar]`.
///
/// NOTE(review): presumably `incant!` picks one of the
/// `is_flat_source_16_impl_{v3,neon,wasm128,scalar}` functions defined in this
/// file based on available CPU features — confirm against the archmage docs.
///
/// The scalar variant returns `true` when every byte of the 16x16 area equals
/// `src[0]`.
#[inline]
pub fn is_flat_source_16(src: &[u8], stride: usize) -> bool {
incant!(
is_flat_source_16_impl(src, stride),
[v3, neon, wasm128, scalar]
)
}
/// `v3` variant of `is_flat_source_16_impl`; compiled only for `x86_64` targets.
///
/// Forwards the token and arguments to `is_flat_source_16_sse2`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn is_flat_source_16_impl_v3(_token: X64V3Token, src: &[u8], stride: usize) -> bool {
is_flat_source_16_sse2(_token, src, stride)
}
/// `neon` variant of `is_flat_source_16_impl`; compiled only for `aarch64` targets.
///
/// Forwards the token and arguments to
/// `crate::common::simd_neon::is_flat_source_16_neon`.
#[cfg(target_arch = "aarch64")]
fn is_flat_source_16_impl_neon(token: NeonToken, src: &[u8], stride: usize) -> bool {
crate::common::simd_neon::is_flat_source_16_neon(token, src, stride)
}
/// `wasm128` variant of `is_flat_source_16_impl`; compiled only for `wasm32` targets.
///
/// Forwards the token and arguments to
/// `crate::common::simd_wasm::is_flat_source_16_wasm`.
#[cfg(target_arch = "wasm32")]
fn is_flat_source_16_impl_wasm128(token: Wasm128Token, src: &[u8], stride: usize) -> bool {
crate::common::simd_wasm::is_flat_source_16_wasm(token, src, stride)
}
/// `scalar` variant of `is_flat_source_16_impl`.
///
/// Ignores the token and forwards `src` and `stride` to
/// [`is_flat_source_16_scalar`].
fn is_flat_source_16_impl_scalar(_token: ScalarToken, src: &[u8], stride: usize) -> bool {
is_flat_source_16_scalar(src, stride)
}
/// Scalar check that a 16x16 area holds a single byte value.
///
/// Every byte at `src[y * stride + x]` for `y, x` in `0..16` is compared with
/// `src[0]`, scanning row by row. Returns `false` at the first byte that
/// differs and `true` if none does.
///
/// # Panics
///
/// Panics if `src` is empty, or if the scan reaches an index outside `src`
/// before finding a differing byte.
#[inline]
pub fn is_flat_source_16_scalar(src: &[u8], stride: usize) -> bool {
    let reference = src[0];
    (0..16).all(|y| {
        let row_start = y * stride;
        // Index element by element so the scan stops (and stops touching
        // memory) at the first mismatch.
        (0..16).all(|x| src[row_start + x] == reference)
    })
}
/// 128-bit SIMD check that a 16x16 area holds a single byte value.
///
/// Broadcasts `src[0]` to all sixteen byte lanes, then for each of the sixteen
/// rows loads `src[y * stride .. y * stride + 16]` and compares it lane by
/// lane with the broadcast value. Returns `false` at the first row that is not
/// equal in all lanes, `true` otherwise.
///
/// # Panics
///
/// Panics if `src` is empty or if a 16-byte row lies outside `src`.
#[rite]
pub(crate) fn is_flat_source_16_sse2(_token: X64V3Token, src: &[u8], stride: usize) -> bool {
    let splat = _mm_set1_epi8(src[0] as i8);
    for y in 0..16 {
        let start = y * stride;
        let row_ref = <&[u8; 16]>::try_from(&src[start..start + 16]).unwrap();
        let row = simd_mem::_mm_loadu_si128(row_ref);
        let lanes_equal = _mm_cmpeq_epi8(row, splat);
        // One bit per byte lane; all sixteen set means the whole row matches.
        let equal_bits = _mm_movemask_epi8(lanes_equal) as u32;
        if equal_bits != 0xFFFF {
            return false;
        }
    }
    true
}
/// Public entry point for the coefficient flatness test.
///
/// Hands the call `is_flat_coeffs_dispatch(levels, num_blocks, thresh)` to
/// `incant!` with the variant list `[v3, neon, wasm128, scalar]`.
///
/// NOTE(review): presumably `incant!` picks one of the
/// `is_flat_coeffs_dispatch_{v3,neon,wasm128,scalar}` functions defined in
/// this file based on available CPU features — confirm against the archmage
/// docs.
///
/// The scalar and SSE2 implementations in this file treat `levels` as
/// `num_blocks` consecutive groups of 16 values and count the non-zero values
/// at positions 1..16 of each group; the result is `false` once that count
/// exceeds `thresh`.
#[inline]
pub fn is_flat_coeffs(levels: &[i16], num_blocks: usize, thresh: i32) -> bool {
incant!(
is_flat_coeffs_dispatch(levels, num_blocks, thresh),
[v3, neon, wasm128, scalar]
)
}
/// `v3` variant of `is_flat_coeffs_dispatch`; compiled only for `x86_64` targets.
///
/// Forwards the token and arguments to `is_flat_coeffs_sse2`.
#[cfg(target_arch = "x86_64")]
#[arcane]
fn is_flat_coeffs_dispatch_v3(
_token: X64V3Token,
levels: &[i16],
num_blocks: usize,
thresh: i32,
) -> bool {
is_flat_coeffs_sse2(_token, levels, num_blocks, thresh)
}
/// `neon` variant of `is_flat_coeffs_dispatch`; compiled only for `aarch64` targets.
///
/// Forwards the token and arguments to
/// `crate::common::simd_neon::is_flat_coeffs_neon`.
#[cfg(target_arch = "aarch64")]
fn is_flat_coeffs_dispatch_neon(
token: NeonToken,
levels: &[i16],
num_blocks: usize,
thresh: i32,
) -> bool {
crate::common::simd_neon::is_flat_coeffs_neon(token, levels, num_blocks, thresh)
}
/// `wasm128` variant of `is_flat_coeffs_dispatch`; compiled only for `wasm32` targets.
///
/// Forwards the token and arguments to
/// `crate::common::simd_wasm::is_flat_coeffs_wasm`.
#[cfg(target_arch = "wasm32")]
fn is_flat_coeffs_dispatch_wasm128(
token: Wasm128Token,
levels: &[i16],
num_blocks: usize,
thresh: i32,
) -> bool {
crate::common::simd_wasm::is_flat_coeffs_wasm(token, levels, num_blocks, thresh)
}
/// `scalar` variant of `is_flat_coeffs_dispatch`.
///
/// Ignores the token and forwards the remaining arguments to
/// `is_flat_coeffs_scalar`.
fn is_flat_coeffs_dispatch_scalar(
_token: ScalarToken,
levels: &[i16],
num_blocks: usize,
thresh: i32,
) -> bool {
is_flat_coeffs_scalar(levels, num_blocks, thresh)
}
/// 128-bit SIMD coefficient flatness test.
///
/// `levels` is read as `num_blocks` consecutive groups of 16 values. For each
/// group the non-zero values at positions 1..16 are counted (position 0 is
/// masked out) and added to a running total. Returns `false` as soon as the
/// total, checked once per group, exceeds `thresh`; returns `true` otherwise.
///
/// # Panics
///
/// Panics if a 16-value group lies outside `levels`.
#[rite]
pub(crate) fn is_flat_coeffs_sse2(
    _token: X64V3Token,
    levels: &[i16],
    num_blocks: usize,
    thresh: i32,
) -> bool {
    let zeros = _mm_setzero_si128();
    let mut nonzero_total = 0i32;
    for block in 0..num_blocks {
        let start = block * 16;
        let lo = simd_mem::_mm_loadu_si128(<&[i16; 8]>::try_from(&levels[start..start + 8]).unwrap());
        let hi = simd_mem::_mm_loadu_si128(
            <&[i16; 8]>::try_from(&levels[start + 8..start + 16]).unwrap(),
        );
        // Lanes equal to zero become all-ones.
        let lo_is_zero = _mm_cmpeq_epi16(lo, zeros);
        let hi_is_zero = _mm_cmpeq_epi16(hi, zeros);
        // Narrow the sixteen 16-bit results to bytes, then take one bit per
        // byte: bit i is set when value i of the group is zero.
        let is_zero_bytes = _mm_packs_epi16(lo_is_zero, hi_is_zero);
        let zero_bits = _mm_movemask_epi8(is_zero_bytes) as u32;
        // Invert to get the non-zero positions and drop bit 0 (first value).
        let nonzero_bits = !zero_bits & 0xFFFE;
        nonzero_total += nonzero_bits.count_ones() as i32;
        if nonzero_total > thresh {
            return false;
        }
    }
    true
}
/// Scalar coefficient flatness test.
///
/// `levels` is read as `num_blocks` consecutive groups of 16 values. For each
/// group the non-zero values at positions 1..16 are counted (position 0 is
/// skipped) and added to a running score. Returns `false` as soon as the
/// score exceeds `thresh`, `true` otherwise.
///
/// The threshold is tested once per group, including groups that contribute
/// nothing, so this fallback agrees with `is_flat_coeffs_sse2` for every
/// `thresh`: a negative `thresh` with at least one group yields `false`.
///
/// # Panics
///
/// Panics if a 16-value group lies outside `levels`.
fn is_flat_coeffs_scalar(levels: &[i16], num_blocks: usize, thresh: i32) -> bool {
    let mut score = 0i32;
    for block in 0..num_blocks {
        let base = block * 16;
        // Position 0 of each group is excluded from the count.
        let ac = &levels[base + 1..base + 16];
        // At most 15 hits per group, so the cast cannot truncate.
        score += ac.iter().filter(|&&coeff| coeff != 0).count() as i32;
        if score > thresh {
            return false;
        }
    }
    true
}
/// Flatness limit with the `I16` suffix (value 0).
/// NOTE(review): presumably passed as `thresh` to `is_flat_coeffs` for
/// 16x16 intra blocks — confirm against callers.
pub const FLATNESS_LIMIT_I16: i32 = 0;
/// Flatness limit with the `I4` suffix (value 3).
/// NOTE(review): presumably passed as `thresh` to `is_flat_coeffs` for
/// 4x4 intra blocks — confirm against callers.
pub const FLATNESS_LIMIT_I4: i32 = 3;
/// Flatness limit with the `UV` suffix (value 2).
/// NOTE(review): presumably passed as `thresh` to `is_flat_coeffs` for
/// chroma blocks — confirm against callers.
pub const FLATNESS_LIMIT_UV: i32 = 2;
/// Flatness penalty (value 140); not used anywhere in this file.
/// NOTE(review): units and where it is applied are not visible here —
/// confirm against callers.
pub const FLATNESS_PENALTY: u32 = 140;