#![allow(clippy::too_many_arguments)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::erasing_op)]
#![allow(clippy::identity_op)]
#![allow(dead_code)]
use archmage::prelude::*;
use core::convert::TryFrom;
#[cfg(target_arch = "aarch64")]
use archmage::intrinsics::aarch64 as simd_mem_neon;
#[cfg(target_arch = "x86_64")]
use archmage::intrinsics::x86_64 as simd_mem_x86;
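// Scalar reference implementation of a VP8-style loop filter (simple and
// "normal" filters, cf. RFC 6386 §15), followed by SSE2/AVX2 kernels for
// x86-64 that process 16 or 32 pixels per call.

/// Clamps a filter term to the signed 8-bit range `-128..=127`.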
#[inline]
fn c(val: i32) -> i32 {
val.clamp(-128, 127)
}
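/// Maps an unsigned pixel value (`0..=255`) into the signed domain (`-128..=127`).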
#[inline]
fn u2s(val: u8) -> i32 {
i32::from(val) - 128
}
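/// Clamps a signed filter result and maps it back to an unsigned pixel.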
#[inline]
fn s2u(val: i32) -> u8 {
(c(val) + 128) as u8
}
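/// Absolute difference between two pixel values.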
#[inline]
const fn diff(val1: u8, val2: u8) -> u8 {
u8::abs_diff(val1, val2)
}
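/// Common first-order adjustment shared by the simple and normal filters.
/// Computes `a = c(outer + 3 * (q0 - p0))`, where `outer = c(p1 - q1)` when
/// `use_outer_taps` is set and 0 otherwise, then applies `q0 -= c(a + 4) >> 3`
/// and `p0 += c(a + 3) >> 3`. The pixels form a column spaced `stride` apart
/// with `q0` at `point`; the returned `c(a + 4) >> 3` term lets the caller
/// derive the outer-tap update.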
fn common_adjust_vertical(
use_outer_taps: bool,
pixels: &mut [u8],
point: usize,
stride: usize,
) -> i32 {
let p1 = u2s(pixels[point - 2 * stride]);
let p0 = u2s(pixels[point - stride]);
let q0 = u2s(pixels[point]);
let q1 = u2s(pixels[point + stride]);
let outer = if use_outer_taps { c(p1 - q1) } else { 0 };
let a = c(outer + 3 * (q0 - p0));
let b = (c(a + 3)) >> 3;
let a = (c(a + 4)) >> 3;
pixels[point] = s2u(q0 - a);
pixels[point - stride] = s2u(p0 + b);
a
}
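/// Row-oriented variant of [`common_adjust_vertical`]: `pixels[0..8]` holds
/// `p3..q3` for one row, so `p1`, `p0`, `q0`, `q1` live at indices 2..=5.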
fn common_adjust_horizontal(use_outer_taps: bool, pixels: &mut [u8]) -> i32 {
let p1 = u2s(pixels[2]);
let p0 = u2s(pixels[3]);
let q0 = u2s(pixels[4]);
let q1 = u2s(pixels[5]);
let outer = if use_outer_taps { c(p1 - q1) } else { 0 };
let a = c(outer + 3 * (q0 - p0));
let b = (c(a + 3)) >> 3;
let a = (c(a + 4)) >> 3;
pixels[4] = s2u(q0 - a);
pixels[3] = s2u(p0 + b);
a
}
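/// Simple-filter decision: `2 * |p0 - q0| + |p1 - q1| / 2 <= filter_limit`.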
#[inline]
fn simple_threshold_vertical(
filter_limit: i32,
pixels: &[u8],
point: usize,
stride: usize,
) -> bool {
i32::from(diff(pixels[point - stride], pixels[point])) * 2
+ i32::from(diff(pixels[point - 2 * stride], pixels[point + stride])) / 2
<= filter_limit
}
#[inline]
fn simple_threshold_horizontal(filter_limit: i32, pixels: &[u8]) -> bool {
assert!(pixels.len() >= 6);
i32::from(diff(pixels[3], pixels[4])) * 2 + i32::from(diff(pixels[2], pixels[5])) / 2
<= filter_limit
}
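/// Normal-filter decision: the simple threshold against `edge_limit`, plus a
/// check that each of the six neighbouring pixel differences stays within
/// `interior_limit`.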
fn should_filter_vertical(
interior_limit: u8,
edge_limit: u8,
pixels: &[u8],
point: usize,
stride: usize,
) -> bool {
simple_threshold_vertical(i32::from(edge_limit), pixels, point, stride)
&& diff(pixels[point - 4 * stride], pixels[point - 3 * stride]) <= interior_limit
&& diff(pixels[point - 3 * stride], pixels[point - 2 * stride]) <= interior_limit
&& diff(pixels[point - 2 * stride], pixels[point - stride]) <= interior_limit
&& diff(pixels[point + 3 * stride], pixels[point + 2 * stride]) <= interior_limit
&& diff(pixels[point + 2 * stride], pixels[point + stride]) <= interior_limit
&& diff(pixels[point + stride], pixels[point]) <= interior_limit
}
fn should_filter_horizontal(interior_limit: u8, edge_limit: u8, pixels: &[u8]) -> bool {
assert!(pixels.len() >= 8);
simple_threshold_horizontal(i32::from(edge_limit), pixels)
&& diff(pixels[0], pixels[1]) <= interior_limit
&& diff(pixels[1], pixels[2]) <= interior_limit
&& diff(pixels[2], pixels[3]) <= interior_limit
&& diff(pixels[7], pixels[6]) <= interior_limit
&& diff(pixels[6], pixels[5]) <= interior_limit
&& diff(pixels[5], pixels[4]) <= interior_limit
}
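/// High edge variance (hev) test: either inner pixel differs from its edge
/// neighbour by more than `threshold`.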
#[inline]
fn high_edge_variance_vertical(threshold: u8, pixels: &[u8], point: usize, stride: usize) -> bool {
diff(pixels[point - 2 * stride], pixels[point - stride]) > threshold
|| diff(pixels[point + stride], pixels[point]) > threshold
}
#[inline]
fn high_edge_variance_horizontal(threshold: u8, pixels: &[u8]) -> bool {
diff(pixels[2], pixels[3]) > threshold || diff(pixels[5], pixels[4]) > threshold
}
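/// Simple loop filter for one column: applies the common adjustment (with
/// outer taps) wherever the simple threshold passes.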
pub(crate) fn simple_segment_vertical(
edge_limit: u8,
pixels: &mut [u8],
point: usize,
stride: usize,
) {
if simple_threshold_vertical(i32::from(edge_limit), pixels, point, stride) {
common_adjust_vertical(true, pixels, point, stride);
}
}
pub(crate) fn simple_segment_horizontal(edge_limit: u8, pixels: &mut [u8]) {
if simple_threshold_horizontal(i32::from(edge_limit), pixels) {
common_adjust_horizontal(true, pixels);
}
}
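/// Normal filter for subblock (interior) edges: the common adjustment, plus a
/// half-strength `(a + 1) >> 1` update of `p1`/`q1` when the edge is not
/// high-variance.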
pub(crate) fn subblock_filter_vertical(
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
pixels: &mut [u8],
point: usize,
stride: usize,
) {
if should_filter_vertical(interior_limit, edge_limit, pixels, point, stride) {
let hv = high_edge_variance_vertical(hev_threshold, pixels, point, stride);
let a = (common_adjust_vertical(hv, pixels, point, stride) + 1) >> 1;
if !hv {
pixels[point + stride] = s2u(u2s(pixels[point + stride]) - a);
pixels[point - 2 * stride] = s2u(u2s(pixels[point - 2 * stride]) + a);
}
}
}
pub(crate) fn subblock_filter_horizontal(
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
pixels: &mut [u8],
) {
if should_filter_horizontal(interior_limit, edge_limit, pixels) {
let hv = high_edge_variance_horizontal(hev_threshold, pixels);
let a = (common_adjust_horizontal(hv, pixels) + 1) >> 1;
if !hv {
pixels[5] = s2u(u2s(pixels[5]) - a);
pixels[2] = s2u(u2s(pixels[2]) + a);
}
}
}
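/// Normal filter for macroblock edges: low-variance edges get the wide
/// six-pixel update with taps `(27w + 63) >> 7`, `(18w + 63) >> 7` and
/// `(9w + 63) >> 7`; high-variance edges fall back to the common adjustment.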
pub(crate) fn macroblock_filter_vertical(
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
pixels: &mut [u8],
point: usize,
stride: usize,
) {
if should_filter_vertical(interior_limit, edge_limit, pixels, point, stride) {
if !high_edge_variance_vertical(hev_threshold, pixels, point, stride) {
let p2 = u2s(pixels[point - 3 * stride]);
let p1 = u2s(pixels[point - 2 * stride]);
let p0 = u2s(pixels[point - stride]);
let q0 = u2s(pixels[point]);
let q1 = u2s(pixels[point + stride]);
let q2 = u2s(pixels[point + 2 * stride]);
let w = c(c(p1 - q1) + 3 * (q0 - p0));
let a = c((27 * w + 63) >> 7);
pixels[point] = s2u(q0 - a);
pixels[point - stride] = s2u(p0 + a);
let a = c((18 * w + 63) >> 7);
pixels[point + stride] = s2u(q1 - a);
pixels[point - 2 * stride] = s2u(p1 + a);
let a = c((9 * w + 63) >> 7);
pixels[point + 2 * stride] = s2u(q2 - a);
pixels[point - 3 * stride] = s2u(p2 + a);
} else {
common_adjust_vertical(true, pixels, point, stride);
}
}
}
pub(crate) fn macroblock_filter_horizontal(
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
pixels: &mut [u8],
) {
assert!(pixels.len() >= 8);
if should_filter_horizontal(interior_limit, edge_limit, pixels) {
if !high_edge_variance_horizontal(hev_threshold, pixels) {
let p2 = u2s(pixels[1]);
let p1 = u2s(pixels[2]);
let p0 = u2s(pixels[3]);
let q0 = u2s(pixels[4]);
let q1 = u2s(pixels[5]);
let q2 = u2s(pixels[6]);
let w = c(c(p1 - q1) + 3 * (q0 - p0));
let a = c((27 * w + 63) >> 7);
pixels[4] = s2u(q0 - a);
pixels[3] = s2u(p0 + a);
let a = c((18 * w + 63) >> 7);
pixels[5] = s2u(q1 - a);
pixels[2] = s2u(p1 + a);
let a = c((9 * w + 63) >> 7);
pixels[6] = s2u(q2 - a);
pixels[1] = s2u(p2 + a);
} else {
common_adjust_horizontal(true, pixels);
}
}
}
#[cfg(all(test, feature = "_benchmarks"))]
mod benches {
use super::*;
use test::{Bencher, black_box};
#[rustfmt::skip]
const TEST_DATA: [u8; 8 * 8] = [
177, 192, 179, 181, 185, 174, 186, 193,
185, 180, 175, 179, 175, 190, 189, 190,
185, 181, 177, 190, 190, 174, 176, 188,
192, 179, 186, 175, 190, 184, 190, 175,
175, 183, 183, 190, 187, 186, 176, 181,
183, 177, 182, 185, 183, 179, 178, 181,
191, 183, 188, 181, 180, 193, 185, 180,
177, 182, 177, 178, 179, 178, 191, 178,
];
#[bench]
fn measure_horizontal_macroblock_filter(b: &mut Bencher) {
let hev_threshold = 5;
let interior_limit = 15;
let edge_limit = 15;
let mut data = TEST_DATA;
let stride = 8;
b.iter(|| {
for y in 0..8 {
black_box(macroblock_filter_horizontal(
hev_threshold,
interior_limit,
edge_limit,
&mut data[y * stride..][..8],
));
}
});
}
#[bench]
fn measure_vertical_macroblock_filter(b: &mut Bencher) {
let hev_threshold = 5;
let interior_limit = 15;
let edge_limit = 15;
let mut data = TEST_DATA;
let stride = 8;
b.iter(|| {
for x in 0..8 {
black_box(macroblock_filter_vertical(
hev_threshold,
interior_limit,
edge_limit,
&mut data,
4 * stride + x,
stride,
));
}
});
}
#[bench]
fn measure_horizontal_subblock_filter(b: &mut Bencher) {
let hev_threshold = 5;
let interior_limit = 15;
let edge_limit = 15;
let mut data = TEST_DATA;
let stride = 8;
b.iter(|| {
for y in 0usize..8 {
black_box(subblock_filter_horizontal(
hev_threshold,
interior_limit,
edge_limit,
&mut data[y * stride..][..8],
))
}
});
}
#[bench]
fn measure_vertical_subblock_filter(b: &mut Bencher) {
let hev_threshold = 5;
let interior_limit = 15;
let edge_limit = 15;
let mut data = TEST_DATA;
let stride = 8;
b.iter(|| {
for x in 0..8 {
black_box(subblock_filter_vertical(
hev_threshold,
interior_limit,
edge_limit,
&mut data,
4 * stride + x,
stride,
))
}
});
}
#[bench]
fn measure_simple_segment_horizontal_filter(b: &mut Bencher) {
let edge_limit = 15;
let mut data = TEST_DATA;
let stride = 8;
b.iter(|| {
for y in 0usize..8 {
black_box(simple_segment_horizontal(
edge_limit,
&mut data[y * stride..][..8],
))
}
});
}
#[bench]
fn measure_simple_segment_vertical_filter(b: &mut Bencher) {
let edge_limit = 15;
let mut data = TEST_DATA;
let stride = 8;
b.iter(|| {
for x in 0usize..16 {
black_box(simple_segment_vertical(
edge_limit,
&mut data,
4 * stride + x,
stride,
))
}
});
}
}
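// Upper bounds on the window each SIMD kernel touches, assuming
// `stride <= MAX_STRIDE` (asserted by every kernel). Converting the slice to
// a fixed-size array reference up front performs one length check and gives
// the optimizer visible bounds for the unaligned loads and stores inside.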
const MAX_STRIDE: usize = 16384;
const V_FILTER_REGION: usize = 3 * MAX_STRIDE + 16;
const V_FILTER_NORMAL_REGION: usize = 7 * MAX_STRIDE + 16;
const H_FILTER_SIMPLE_REGION: usize = 15 * MAX_STRIDE + 4;
const H_FILTER_NORMAL_REGION: usize = 15 * MAX_STRIDE + 8;
const H_FILTER_FUSED_REGION: usize = 15 * MAX_STRIDE + 16;
const H_FILTER_UV_REGION: usize = 7 * MAX_STRIDE + 8;
const V_FILTER_UV_REGION: usize = 7 * MAX_STRIDE + 16;
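/// Lane mask for the simple-filter decision: 0xFF where
/// `2 * |p0 - q0| + |p1 - q1| / 2 <= thresh`. Uses unsigned saturating
/// arithmetic; `_mm_subs_epu8(x, y)` is zero exactly when `x <= y`.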
#[cfg(target_arch = "x86_64")]
#[rite]
fn needs_filter_16(
_token: X64V3Token,
p1: __m128i,
p0: __m128i,
q0: __m128i,
q1: __m128i,
thresh: i32,
) -> __m128i {
let t = _mm_set1_epi8(thresh as i8);
let abs_p0_q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
let abs_p1_q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
let doubled = _mm_adds_epu8(abs_p0_q0, abs_p0_q0);
let halved = _mm_and_si128(_mm_srli_epi16(abs_p1_q1, 1), _mm_set1_epi8(0x7F));
let sum = _mm_adds_epu8(doubled, halved);
let exceeds = _mm_subs_epu8(sum, t);
_mm_cmpeq_epi8(exceeds, _mm_setzero_si128())
}
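/// Base filter delta `c(p1 - q1) + 3 * (q0 - p0)` per lane. Pixels are
/// re-biased into the signed domain by XOR with 0x80, and the saturating
/// signed adds stand in for the scalar `c()` clamps.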
#[cfg(target_arch = "x86_64")]
#[rite]
fn get_base_delta_16(
_token: X64V3Token,
p1: __m128i,
p0: __m128i,
q0: __m128i,
q1: __m128i,
) -> __m128i {
let sign = _mm_set1_epi8(-128i8);
let p1s = _mm_xor_si128(p1, sign);
let p0s = _mm_xor_si128(p0, sign);
let q0s = _mm_xor_si128(q0, sign);
let q1s = _mm_xor_si128(q1, sign);
let p1_q1 = _mm_subs_epi8(p1s, q1s);
let q0_p0 = _mm_subs_epi8(q0s, p0s);
let s1 = _mm_adds_epi8(p1_q1, q0_p0);
let s2 = _mm_adds_epi8(s1, q0_p0);
_mm_adds_epi8(s2, q0_p0)
}
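/// Per-byte arithmetic shift right by 3. SSE2 has no 8-bit arithmetic shift,
/// so each byte is widened by unpacking with itself, shifted as a 16-bit
/// lane, and re-packed.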
#[cfg(target_arch = "x86_64")]
#[rite]
fn signed_shift_right_3(_token: X64V3Token, v: __m128i) -> __m128i {
let lo = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 11);
let hi = _mm_srai_epi16(_mm_unpackhi_epi8(v, v), 11);
_mm_packs_epi16(lo, hi)
}
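/// Applies the simple-filter update in the signed domain:
/// `q0 -= (fl + 4) >> 3` and `p0 += (fl + 3) >> 3`, saturating.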
#[cfg(target_arch = "x86_64")]
#[rite]
fn do_simple_filter_16(_token: X64V3Token, p0: &mut __m128i, q0: &mut __m128i, fl: __m128i) {
let sign = _mm_set1_epi8(-128i8);
let k3 = _mm_set1_epi8(3);
let k4 = _mm_set1_epi8(4);
let v3 = _mm_adds_epi8(fl, k3);
let v4 = _mm_adds_epi8(fl, k4);
let v3 = signed_shift_right_3(_token, v3);
let v4 = signed_shift_right_3(_token, v4);
let mut p0s = _mm_xor_si128(*p0, sign);
let mut q0s = _mm_xor_si128(*q0, sign);
q0s = _mm_subs_epi8(q0s, v4);
p0s = _mm_adds_epi8(p0s, v3);
*p0 = _mm_xor_si128(p0s, sign);
*q0 = _mm_xor_si128(q0s, sign);
}
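/// Simple loop filter for 16 consecutive pixels across a horizontal edge;
/// `point` addresses the first `q0` pixel.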
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn simple_v_filter16(
_token: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
thresh: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let start = point - 2 * stride;
let region: &mut [u8; V_FILTER_REGION] =
<&mut [u8; V_FILTER_REGION]>::try_from(&mut pixels[start..start + V_FILTER_REGION])
.expect("simple_v_filter16: buffer too small (missing FILTER_PADDING?)");
let off_p1 = 0;
let off_p0 = stride;
let off_q0 = 2 * stride;
let off_q1 = 3 * stride;
let p1 = simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p1..][..16]).unwrap());
let mut p0 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p0..][..16]).unwrap());
let mut q0 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q0..][..16]).unwrap());
let q1 = simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q1..][..16]).unwrap());
let mask = needs_filter_16(_token, p1, p0, q0, q1, thresh);
let fl = get_base_delta_16(_token, p1, p0, q0, q1);
let fl_masked = _mm_and_si128(fl, mask);
do_simple_filter_16(_token, &mut p0, &mut q0, fl_masked);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_p0..][..16]).unwrap(),
p0,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_q0..][..16]).unwrap(),
q0,
);
}
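// AVX2 (32-lane) versions of the kernels above; the logic mirrors the
// 16-lane SSE2 code.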
const V_FILTER_REGION_32: usize = 3 * MAX_STRIDE + 32;
#[cfg(target_arch = "x86_64")]
#[rite]
fn needs_filter_32(
_token: X64V3Token,
p1: __m256i,
p0: __m256i,
q0: __m256i,
q1: __m256i,
thresh: i32,
) -> __m256i {
let t = _mm256_set1_epi8(thresh as i8);
let abs_p0_q0 = _mm256_or_si256(_mm256_subs_epu8(p0, q0), _mm256_subs_epu8(q0, p0));
let abs_p1_q1 = _mm256_or_si256(_mm256_subs_epu8(p1, q1), _mm256_subs_epu8(q1, p1));
let doubled = _mm256_adds_epu8(abs_p0_q0, abs_p0_q0);
let halved = _mm256_and_si256(_mm256_srli_epi16(abs_p1_q1, 1), _mm256_set1_epi8(0x7F));
let sum = _mm256_adds_epu8(doubled, halved);
let exceeds = _mm256_subs_epu8(sum, t);
_mm256_cmpeq_epi8(exceeds, _mm256_setzero_si256())
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn get_base_delta_32(
_token: X64V3Token,
p1: __m256i,
p0: __m256i,
q0: __m256i,
q1: __m256i,
) -> __m256i {
let sign = _mm256_set1_epi8(-128i8);
let p1s = _mm256_xor_si256(p1, sign);
let p0s = _mm256_xor_si256(p0, sign);
let q0s = _mm256_xor_si256(q0, sign);
let q1s = _mm256_xor_si256(q1, sign);
let p1_q1 = _mm256_subs_epi8(p1s, q1s);
let q0_p0 = _mm256_subs_epi8(q0s, p0s);
let s1 = _mm256_adds_epi8(p1_q1, q0_p0);
let s2 = _mm256_adds_epi8(s1, q0_p0);
_mm256_adds_epi8(s2, q0_p0)
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn signed_shift_right_3_avx2(_token: X64V3Token, v: __m256i) -> __m256i {
let lo = _mm256_srai_epi16(_mm256_unpacklo_epi8(v, v), 11);
let hi = _mm256_srai_epi16(_mm256_unpackhi_epi8(v, v), 11);
_mm256_packs_epi16(lo, hi)
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn do_simple_filter_32(_token: X64V3Token, p0: &mut __m256i, q0: &mut __m256i, fl: __m256i) {
let sign = _mm256_set1_epi8(-128i8);
let k3 = _mm256_set1_epi8(3);
let k4 = _mm256_set1_epi8(4);
let v3 = signed_shift_right_3_avx2(_token, _mm256_adds_epi8(fl, k3));
let v4 = signed_shift_right_3_avx2(_token, _mm256_adds_epi8(fl, k4));
let mut p0s = _mm256_xor_si256(*p0, sign);
let mut q0s = _mm256_xor_si256(*q0, sign);
q0s = _mm256_subs_epi8(q0s, v4);
p0s = _mm256_adds_epi8(p0s, v3);
*p0 = _mm256_xor_si256(p0s, sign);
*q0 = _mm256_xor_si256(q0s, sign);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn simple_v_filter32(
_token: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
thresh: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let start = point - 2 * stride;
let region: &mut [u8; V_FILTER_REGION_32] =
<&mut [u8; V_FILTER_REGION_32]>::try_from(&mut pixels[start..start + V_FILTER_REGION_32])
.expect("simple_v_filter32: buffer too small");
let off_p1 = 0;
let off_p0 = stride;
let off_q0 = 2 * stride;
let off_q1 = 3 * stride;
let p1 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p1..][..32]).unwrap());
let mut p0 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p0..][..32]).unwrap());
let mut q0 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q0..][..32]).unwrap());
let q1 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q1..][..32]).unwrap());
let mask = needs_filter_32(_token, p1, p0, q0, q1, thresh);
let fl = get_base_delta_32(_token, p1, p0, q0, q1);
let fl_masked = _mm256_and_si256(fl, mask);
do_simple_filter_32(_token, &mut p0, &mut q0, fl_masked);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_p0..][..32]).unwrap(),
p0,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_q0..][..32]).unwrap(),
q0,
);
}
const V_FILTER_NORMAL_REGION_32: usize = 7 * MAX_STRIDE + 32;
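/// 32-lane normal-filter decision mask: the simple threshold against
/// `edge_limit` plus the six neighbour differences within `interior_limit`
/// (vector analogue of `should_filter_*`).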
#[cfg(target_arch = "x86_64")]
#[rite]
fn needs_filter_normal_32(
_token: X64V3Token,
p3: __m256i,
p2: __m256i,
p1: __m256i,
p0: __m256i,
q0: __m256i,
q1: __m256i,
q2: __m256i,
q3: __m256i,
edge_limit: i32,
interior_limit: i32,
) -> __m256i {
let simple_mask = needs_filter_32(_token, p1, p0, q0, q1, edge_limit);
let i_limit = _mm256_set1_epi8(interior_limit as i8);
macro_rules! abs_diff {
($a:expr, $b:expr) => {
_mm256_or_si256(_mm256_subs_epu8($a, $b), _mm256_subs_epu8($b, $a))
};
}
let d_p3_p2 = abs_diff!(p3, p2);
let d_p2_p1 = abs_diff!(p2, p1);
let d_p1_p0 = abs_diff!(p1, p0);
let d_q0_q1 = abs_diff!(q0, q1);
let d_q1_q2 = abs_diff!(q1, q2);
let d_q2_q3 = abs_diff!(q2, q3);
let max1 = _mm256_max_epu8(d_p3_p2, d_p2_p1);
let max2 = _mm256_max_epu8(d_p1_p0, d_q0_q1);
let max3 = _mm256_max_epu8(d_q1_q2, d_q2_q3);
let max4 = _mm256_max_epu8(max1, max2);
let max_diff = _mm256_max_epu8(max3, max4);
let exceeds = _mm256_subs_epu8(max_diff, i_limit);
let interior_ok = _mm256_cmpeq_epi8(exceeds, _mm256_setzero_si256());
_mm256_and_si256(simple_mask, interior_ok)
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn high_edge_variance_32(
_token: X64V3Token,
p1: __m256i,
p0: __m256i,
q0: __m256i,
q1: __m256i,
hev_thresh: i32,
) -> __m256i {
let t = _mm256_set1_epi8(hev_thresh as i8);
let d_p1_p0 = _mm256_or_si256(_mm256_subs_epu8(p1, p0), _mm256_subs_epu8(p0, p1));
let d_q1_q0 = _mm256_or_si256(_mm256_subs_epu8(q1, q0), _mm256_subs_epu8(q0, q1));
let p_exceeds = _mm256_subs_epu8(d_p1_p0, t);
let q_exceeds = _mm256_subs_epu8(d_q1_q0, t);
let p_hev = _mm256_xor_si256(
_mm256_cmpeq_epi8(p_exceeds, _mm256_setzero_si256()),
_mm256_set1_epi8(-1),
);
let q_hev = _mm256_xor_si256(
_mm256_cmpeq_epi8(q_exceeds, _mm256_setzero_si256()),
_mm256_set1_epi8(-1),
);
_mm256_or_si256(p_hev, q_hev)
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn signed_shift_right_1_avx2(_token: X64V3Token, v: __m256i) -> __m256i {
let lo = _mm256_srai_epi16(_mm256_unpacklo_epi8(v, v), 9);
let hi = _mm256_srai_epi16(_mm256_unpackhi_epi8(v, v), 9);
_mm256_packs_epi16(lo, hi)
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn do_filter4_32(
_token: X64V3Token,
p1: &mut __m256i,
p0: &mut __m256i,
q0: &mut __m256i,
q1: &mut __m256i,
mask: __m256i,
hev: __m256i,
) {
let sign = _mm256_set1_epi8(-128i8);
let p1s = _mm256_xor_si256(*p1, sign);
let p0s = _mm256_xor_si256(*p0, sign);
let q0s = _mm256_xor_si256(*q0, sign);
let q1s = _mm256_xor_si256(*q1, sign);
let outer = _mm256_subs_epi8(p1s, q1s);
let outer_masked = _mm256_and_si256(outer, hev);
let q0_p0 = _mm256_subs_epi8(q0s, p0s);
let a = _mm256_adds_epi8(outer_masked, q0_p0);
let a = _mm256_adds_epi8(a, q0_p0);
let a = _mm256_adds_epi8(a, q0_p0);
let a = _mm256_and_si256(a, mask);
let k3 = _mm256_set1_epi8(3);
let k4 = _mm256_set1_epi8(4);
let f1 = _mm256_adds_epi8(a, k4);
let f2 = _mm256_adds_epi8(a, k3);
let f1 = signed_shift_right_3_avx2(_token, f1);
let f2 = signed_shift_right_3_avx2(_token, f2);
let new_p0s = _mm256_adds_epi8(p0s, f2);
let new_q0s = _mm256_subs_epi8(q0s, f1);
let a2 = _mm256_adds_epi8(f1, _mm256_set1_epi8(1));
let a2 = signed_shift_right_1_avx2(_token, a2);
let a2 = _mm256_andnot_si256(hev, a2);
let a2 = _mm256_and_si256(a2, mask);
let new_p1s = _mm256_adds_epi8(p1s, a2);
let new_q1s = _mm256_subs_epi8(q1s, a2);
*p0 = _mm256_xor_si256(new_p0s, sign);
*q0 = _mm256_xor_si256(new_q0s, sign);
*p1 = _mm256_xor_si256(new_p1s, sign);
*q1 = _mm256_xor_si256(new_q1s, sign);
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn filter6_wide_half_avx2(
_token: X64V3Token,
p2: __m256i,
p1: __m256i,
p0: __m256i,
q0: __m256i,
q1: __m256i,
q2: __m256i,
) -> (__m256i, __m256i, __m256i, __m256i, __m256i, __m256i) {
let p2_16 = _mm256_srai_epi16(p2, 8);
let p1_16 = _mm256_srai_epi16(p1, 8);
let p0_16 = _mm256_srai_epi16(p0, 8);
let q0_16 = _mm256_srai_epi16(q0, 8);
let q1_16 = _mm256_srai_epi16(q1, 8);
let q2_16 = _mm256_srai_epi16(q2, 8);
let p1_q1 = _mm256_sub_epi16(p1_16, q1_16);
let q0_p0 = _mm256_sub_epi16(q0_16, p0_16);
let three_q0_p0 = _mm256_add_epi16(_mm256_add_epi16(q0_p0, q0_p0), q0_p0);
let w = _mm256_add_epi16(p1_q1, three_q0_p0);
let w = _mm256_max_epi16(
_mm256_min_epi16(w, _mm256_set1_epi16(127)),
_mm256_set1_epi16(-128),
);
let k27 = _mm256_set1_epi16(27);
let k18 = _mm256_set1_epi16(18);
let k9 = _mm256_set1_epi16(9);
let k63 = _mm256_set1_epi16(63);
let a0 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(w, k27), k63), 7);
let a1 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(w, k18), k63), 7);
let a2 = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(w, k9), k63), 7);
let new_p0 = _mm256_add_epi16(p0_16, a0);
let new_q0 = _mm256_sub_epi16(q0_16, a0);
let new_p1 = _mm256_add_epi16(p1_16, a1);
let new_q1 = _mm256_sub_epi16(q1_16, a1);
let new_p2 = _mm256_add_epi16(p2_16, a2);
let new_q2 = _mm256_sub_epi16(q2_16, a2);
let clamp = |v: __m256i| {
_mm256_max_epi16(
_mm256_min_epi16(v, _mm256_set1_epi16(127)),
_mm256_set1_epi16(-128),
)
};
(
clamp(new_p2),
clamp(new_p1),
clamp(new_p0),
clamp(new_q0),
clamp(new_q1),
clamp(new_q2),
)
}
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(clippy::too_many_arguments)]
fn do_filter6_32(
_token: X64V3Token,
p2: &mut __m256i,
p1: &mut __m256i,
p0: &mut __m256i,
q0: &mut __m256i,
q1: &mut __m256i,
q2: &mut __m256i,
mask: __m256i,
hev: __m256i,
) {
let sign = _mm256_set1_epi8(-128i8);
let not_hev = _mm256_andnot_si256(hev, _mm256_set1_epi8(-1));
let p2s = _mm256_xor_si256(*p2, sign);
let p1s = _mm256_xor_si256(*p1, sign);
let p0s = _mm256_xor_si256(*p0, sign);
let q0s = _mm256_xor_si256(*q0, sign);
let q1s = _mm256_xor_si256(*q1, sign);
let q2s = _mm256_xor_si256(*q2, sign);
let outer = _mm256_subs_epi8(p1s, q1s);
let outer_hev = _mm256_and_si256(outer, hev);
let q0_p0 = _mm256_subs_epi8(q0s, p0s);
let a_hev = _mm256_adds_epi8(outer_hev, q0_p0);
let a_hev = _mm256_adds_epi8(a_hev, q0_p0);
let a_hev = _mm256_adds_epi8(a_hev, q0_p0);
let a_hev = _mm256_and_si256(a_hev, _mm256_and_si256(mask, hev));
let k3 = _mm256_set1_epi8(3);
let k4 = _mm256_set1_epi8(4);
let f1_hev = signed_shift_right_3_avx2(_token, _mm256_adds_epi8(a_hev, k4));
let f2_hev = signed_shift_right_3_avx2(_token, _mm256_adds_epi8(a_hev, k3));
let (new_p2_lo, new_p1_lo, new_p0_lo, new_q0_lo, new_q1_lo, new_q2_lo) = filter6_wide_half_avx2(
_token,
_mm256_unpacklo_epi8(p2s, p2s),
_mm256_unpacklo_epi8(p1s, p1s),
_mm256_unpacklo_epi8(p0s, p0s),
_mm256_unpacklo_epi8(q0s, q0s),
_mm256_unpacklo_epi8(q1s, q1s),
_mm256_unpacklo_epi8(q2s, q2s),
);
let (new_p2_hi, new_p1_hi, new_p0_hi, new_q0_hi, new_q1_hi, new_q2_hi) = filter6_wide_half_avx2(
_token,
_mm256_unpackhi_epi8(p2s, p2s),
_mm256_unpackhi_epi8(p1s, p1s),
_mm256_unpackhi_epi8(p0s, p0s),
_mm256_unpackhi_epi8(q0s, q0s),
_mm256_unpackhi_epi8(q1s, q1s),
_mm256_unpackhi_epi8(q2s, q2s),
);
let new_p2_wide = _mm256_packs_epi16(new_p2_lo, new_p2_hi);
let new_p1_wide = _mm256_packs_epi16(new_p1_lo, new_p1_hi);
let new_p0_wide = _mm256_packs_epi16(new_p0_lo, new_p0_hi);
let new_q0_wide = _mm256_packs_epi16(new_q0_lo, new_q0_hi);
let new_q1_wide = _mm256_packs_epi16(new_q1_lo, new_q1_hi);
let new_q2_wide = _mm256_packs_epi16(new_q2_lo, new_q2_hi);
let mask_not_hev = _mm256_and_si256(mask, not_hev);
let new_p0s = _mm256_adds_epi8(p0s, f2_hev);
let new_q0s = _mm256_subs_epi8(q0s, f1_hev);
let final_p0s = _mm256_blendv_epi8(new_p0s, new_p0_wide, mask_not_hev);
let final_q0s = _mm256_blendv_epi8(new_q0s, new_q0_wide, mask_not_hev);
let final_p1s = _mm256_blendv_epi8(p1s, new_p1_wide, mask_not_hev);
let final_q1s = _mm256_blendv_epi8(q1s, new_q1_wide, mask_not_hev);
let final_p2s = _mm256_blendv_epi8(p2s, new_p2_wide, mask_not_hev);
let final_q2s = _mm256_blendv_epi8(q2s, new_q2_wide, mask_not_hev);
*p0 = _mm256_xor_si256(final_p0s, sign);
*q0 = _mm256_xor_si256(final_q0s, sign);
*p1 = _mm256_xor_si256(final_p1s, sign);
*q1 = _mm256_xor_si256(final_q1s, sign);
*p2 = _mm256_xor_si256(final_p2s, sign);
*q2 = _mm256_xor_si256(final_q2s, sign);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn normal_v_filter32_inner(
_token: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let start = point - 4 * stride;
let region: &mut [u8; V_FILTER_NORMAL_REGION_32] =
<&mut [u8; V_FILTER_NORMAL_REGION_32]>::try_from(
&mut pixels[start..start + V_FILTER_NORMAL_REGION_32],
)
.expect("normal_v_filter32_inner: buffer too small");
let off_p3 = 0;
let off_p2 = stride;
let off_p1 = 2 * stride;
let off_p0 = 3 * stride;
let off_q0 = 4 * stride;
let off_q1 = 5 * stride;
let off_q2 = 6 * stride;
let off_q3 = 7 * stride;
let p3 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p3..][..32]).unwrap());
let p2 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p2..][..32]).unwrap());
let mut p1 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p1..][..32]).unwrap());
let mut p0 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p0..][..32]).unwrap());
let mut q0 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q0..][..32]).unwrap());
let mut q1 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q1..][..32]).unwrap());
let q2 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q2..][..32]).unwrap());
let q3 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q3..][..32]).unwrap());
let mask = needs_filter_normal_32(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_32(_token, p1, p0, q0, q1, hev_thresh);
do_filter4_32(_token, &mut p1, &mut p0, &mut q0, &mut q1, mask, hev);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_p1..][..32]).unwrap(),
p1,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_p0..][..32]).unwrap(),
p0,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_q0..][..32]).unwrap(),
q0,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_q1..][..32]).unwrap(),
q1,
);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn normal_v_filter32_edge(
_token: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let start = point - 4 * stride;
let region: &mut [u8; V_FILTER_NORMAL_REGION_32] =
<&mut [u8; V_FILTER_NORMAL_REGION_32]>::try_from(
&mut pixels[start..start + V_FILTER_NORMAL_REGION_32],
)
.expect("normal_v_filter32_edge: buffer too small");
let off_p3 = 0;
let off_p2 = stride;
let off_p1 = 2 * stride;
let off_p0 = 3 * stride;
let off_q0 = 4 * stride;
let off_q1 = 5 * stride;
let off_q2 = 6 * stride;
let off_q3 = 7 * stride;
let p3 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p3..][..32]).unwrap());
let mut p2 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p2..][..32]).unwrap());
let mut p1 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p1..][..32]).unwrap());
let mut p0 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_p0..][..32]).unwrap());
let mut q0 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q0..][..32]).unwrap());
let mut q1 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q1..][..32]).unwrap());
let mut q2 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q2..][..32]).unwrap());
let q3 =
simd_mem_x86::_mm256_loadu_si256(<&[u8; 32]>::try_from(®ion[off_q3..][..32]).unwrap());
let mask = needs_filter_normal_32(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_32(_token, p1, p0, q0, q1, hev_thresh);
do_filter6_32(
_token, &mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_p2..][..32]).unwrap(),
p2,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_p1..][..32]).unwrap(),
p1,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_p0..][..32]).unwrap(),
p0,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_q0..][..32]).unwrap(),
q0,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_q1..][..32]).unwrap(),
q1,
);
simd_mem_x86::_mm256_storeu_si256(
<&mut [u8; 32]>::try_from(&mut region[off_q2..][..32]).unwrap(),
q2,
);
}
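// Loads an 8-row x 4-column block at `base` and transposes it. Returns two
// vectors: the first holds columns 0 and 1 (eight rows each in its low and
// high halves), the second holds columns 2 and 3.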
macro_rules! load_8x4_impl {
($pixels:expr, $base:expr, $stride:expr) => {{
let base_ = $base;
let stride_ = $stride;
let pixels_: &[u8] = &*$pixels;
let r0 = i32::from_ne_bytes(<[u8; 4]>::try_from(&pixels_[base_..][..4]).unwrap());
let r1 = i32::from_ne_bytes(<[u8; 4]>::try_from(&pixels_[base_ + stride_..][..4]).unwrap());
let r2 =
i32::from_ne_bytes(<[u8; 4]>::try_from(&pixels_[base_ + 2 * stride_..][..4]).unwrap());
let r3 =
i32::from_ne_bytes(<[u8; 4]>::try_from(&pixels_[base_ + 3 * stride_..][..4]).unwrap());
let r4 =
i32::from_ne_bytes(<[u8; 4]>::try_from(&pixels_[base_ + 4 * stride_..][..4]).unwrap());
let r5 =
i32::from_ne_bytes(<[u8; 4]>::try_from(&pixels_[base_ + 5 * stride_..][..4]).unwrap());
let r6 =
i32::from_ne_bytes(<[u8; 4]>::try_from(&pixels_[base_ + 6 * stride_..][..4]).unwrap());
let r7 =
i32::from_ne_bytes(<[u8; 4]>::try_from(&pixels_[base_ + 7 * stride_..][..4]).unwrap());
let a0 = _mm_set_epi32(r6, r2, r4, r0);
let a1 = _mm_set_epi32(r7, r3, r5, r1);
let b0 = _mm_unpacklo_epi8(a0, a1);
let b1 = _mm_unpackhi_epi8(a0, a1);
let c0 = _mm_unpacklo_epi16(b0, b1);
let c1 = _mm_unpackhi_epi16(b0, b1);
(_mm_unpacklo_epi32(c0, c1), _mm_unpackhi_epi32(c0, c1))
}};
}
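// Scatters one 16-byte vector holding four transposed rows back to memory as
// four 4-byte rows.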
macro_rules! store_4x4_impl {
($pixels:expr, $base:expr, $stride:expr, $vals:expr) => {{
let base_ = $base;
let stride_ = $stride;
let vals_ = $vals;
let p_ = &mut *$pixels;
simd_mem_x86::_mm_storeu_si32(
<&mut [u8; 4]>::try_from(&mut p_[base_..][..4]).unwrap(),
vals_,
);
simd_mem_x86::_mm_storeu_si32(
<&mut [u8; 4]>::try_from(&mut p_[base_ + stride_..][..4]).unwrap(),
_mm_srli_si128(vals_, 4),
);
simd_mem_x86::_mm_storeu_si32(
<&mut [u8; 4]>::try_from(&mut p_[base_ + 2 * stride_..][..4]).unwrap(),
_mm_srli_si128(vals_, 8),
);
simd_mem_x86::_mm_storeu_si32(
<&mut [u8; 4]>::try_from(&mut p_[base_ + 3 * stride_..][..4]).unwrap(),
_mm_srli_si128(vals_, 12),
);
}};
}
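// Transposed 16-row x 4-column load: returns the four columns (e.g. p1, p0,
// q0, q1) as 16-lane vectors.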
macro_rules! load_16x4_impl {
($pixels:expr, $base:expr, $stride:expr) => {{
let (pq_lo_a, pq_hi_a) = load_8x4_impl!($pixels, $base, $stride);
let (pq_lo_b, pq_hi_b) = load_8x4_impl!($pixels, $base + 8 * $stride, $stride);
(
_mm_unpacklo_epi64(pq_lo_a, pq_lo_b),
_mm_unpackhi_epi64(pq_lo_a, pq_lo_b),
_mm_unpacklo_epi64(pq_hi_a, pq_hi_b),
_mm_unpackhi_epi64(pq_hi_a, pq_hi_b),
)
}};
}
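// Inverse of `load_16x4_impl`: re-interleaves four 16-lane columns and
// stores them as 16 rows of 4 bytes starting at `base`.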
macro_rules! store_16x4_impl {
($pixels:expr, $base:expr, $stride:expr, $col0:expr, $col1:expr, $col2:expr, $col3:expr) => {{
let t0_ = _mm_unpacklo_epi8($col0, $col1);
let t1_ = _mm_unpackhi_epi8($col0, $col1);
let t2_ = _mm_unpacklo_epi8($col2, $col3);
let t3_ = _mm_unpackhi_epi8($col2, $col3);
let r0_ = _mm_unpacklo_epi16(t0_, t2_);
let r1_ = _mm_unpackhi_epi16(t0_, t2_);
let r2_ = _mm_unpacklo_epi16(t1_, t3_);
let r3_ = _mm_unpackhi_epi16(t1_, t3_);
store_4x4_impl!($pixels, $base, $stride, r0_);
store_4x4_impl!($pixels, $base + 4 * $stride, $stride, r1_);
store_4x4_impl!($pixels, $base + 8 * $stride, $stride, r2_);
store_4x4_impl!($pixels, $base + 12 * $stride, $stride, r3_);
}};
}
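// Stores only the q1/q2 column pair as 16 two-byte rows; the macroblock edge
// filter writes 6 of the 8 loaded columns, so the last two go through this
// narrower path.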
macro_rules! store_q1q2_16_impl {
($pixels:expr, $base:expr, $stride:expr, $q1:expr, $q2:expr) => {{
let t0_ = _mm_unpacklo_epi8($q1, $q2);
let t1_ = _mm_unpackhi_epi8($q1, $q2);
macro_rules! _sq {
($r:expr, $l:expr, $row:expr) => {
let val_ = _mm_extract_epi16($r, $l) as u16;
let off_ = $base + $row * $stride;
$pixels[off_] = val_ as u8;
$pixels[off_ + 1] = (val_ >> 8) as u8;
};
}
_sq!(t0_, 0, 0);
_sq!(t0_, 1, 1);
_sq!(t0_, 2, 2);
_sq!(t0_, 3, 3);
_sq!(t0_, 4, 4);
_sq!(t0_, 5, 5);
_sq!(t0_, 6, 6);
_sq!(t0_, 7, 7);
_sq!(t1_, 0, 8);
_sq!(t1_, 1, 9);
_sq!(t1_, 2, 10);
_sq!(t1_, 3, 11);
_sq!(t1_, 4, 12);
_sq!(t1_, 5, 13);
_sq!(t1_, 6, 14);
_sq!(t1_, 7, 15);
}};
}
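// UV variant of `store_16x4_impl`: the 16 lanes hold 8 U rows followed by
// 8 V rows, written back to the two planes separately.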
macro_rules! store_uv_16x4_impl {
($u:expr, $v:expr, $base:expr, $stride:expr, $c0:expr, $c1:expr, $c2:expr, $c3:expr) => {{
let t0_ = _mm_unpacklo_epi8($c0, $c1);
let t1_ = _mm_unpackhi_epi8($c0, $c1);
let t2_ = _mm_unpacklo_epi8($c2, $c3);
let t3_ = _mm_unpackhi_epi8($c2, $c3);
let r0_ = _mm_unpacklo_epi16(t0_, t2_);
let r1_ = _mm_unpackhi_epi16(t0_, t2_);
let r2_ = _mm_unpacklo_epi16(t1_, t3_);
let r3_ = _mm_unpackhi_epi16(t1_, t3_);
store_4x4_impl!($u, $base, $stride, r0_);
store_4x4_impl!($u, $base + 4 * $stride, $stride, r1_);
store_4x4_impl!($v, $base, $stride, r2_);
store_4x4_impl!($v, $base + 4 * $stride, $stride, r3_);
}};
}
macro_rules! store_q1q2_uv_16_impl {
($u:expr, $v:expr, $base:expr, $stride:expr, $q1:expr, $q2:expr) => {{
let t0_ = _mm_unpacklo_epi8($q1, $q2);
let t1_ = _mm_unpackhi_epi8($q1, $q2);
macro_rules! _sq {
($r:expr, $l:expr, $buf:expr, $row:expr) => {
let val_ = _mm_extract_epi16($r, $l) as u16;
let off_ = $base + $row * $stride;
$buf[off_] = val_ as u8;
$buf[off_ + 1] = (val_ >> 8) as u8;
};
}
_sq!(t0_, 0, $u, 0);
_sq!(t0_, 1, $u, 1);
_sq!(t0_, 2, $u, 2);
_sq!(t0_, 3, $u, 3);
_sq!(t0_, 4, $u, 4);
_sq!(t0_, 5, $u, 5);
_sq!(t0_, 6, $u, 6);
_sq!(t0_, 7, $u, 7);
_sq!(t1_, 0, $v, 0);
_sq!(t1_, 1, $v, 1);
_sq!(t1_, 2, $v, 2);
_sq!(t1_, 3, $v, 3);
_sq!(t1_, 4, $v, 4);
_sq!(t1_, 5, $v, 5);
_sq!(t1_, 6, $v, 6);
_sq!(t1_, 7, $v, 7);
}};
}
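/// Transposes 16 byte rows of 8 pixels each into 8 column vectors whose 16
/// lanes are the rows.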
#[cfg(target_arch = "x86_64")]
#[rite]
fn transpose_8x16_to_16x8(_token: X64V3Token, rows: &[__m128i; 16]) -> [__m128i; 8] {
let t0 = _mm_unpacklo_epi8(rows[0], rows[1]);
let t1 = _mm_unpacklo_epi8(rows[2], rows[3]);
let t2 = _mm_unpacklo_epi8(rows[4], rows[5]);
let t3 = _mm_unpacklo_epi8(rows[6], rows[7]);
let t4 = _mm_unpacklo_epi8(rows[8], rows[9]);
let t5 = _mm_unpacklo_epi8(rows[10], rows[11]);
let t6 = _mm_unpacklo_epi8(rows[12], rows[13]);
let t7 = _mm_unpacklo_epi8(rows[14], rows[15]);
let u0 = _mm_unpacklo_epi16(t0, t1);
let u1 = _mm_unpackhi_epi16(t0, t1);
let u2 = _mm_unpacklo_epi16(t2, t3);
let u3 = _mm_unpackhi_epi16(t2, t3);
let u4 = _mm_unpacklo_epi16(t4, t5);
let u5 = _mm_unpackhi_epi16(t4, t5);
let u6 = _mm_unpacklo_epi16(t6, t7);
let u7 = _mm_unpackhi_epi16(t6, t7);
let v0 = _mm_unpacklo_epi32(u0, u2);
let v1 = _mm_unpackhi_epi32(u0, u2);
let v2 = _mm_unpacklo_epi32(u4, u6);
let v3 = _mm_unpackhi_epi32(u4, u6);
let v4 = _mm_unpacklo_epi32(u1, u3);
let v5 = _mm_unpackhi_epi32(u1, u3);
let v6 = _mm_unpacklo_epi32(u5, u7);
let v7 = _mm_unpackhi_epi32(u5, u7);
[
_mm_unpacklo_epi64(v0, v2),
_mm_unpackhi_epi64(v0, v2),
_mm_unpacklo_epi64(v1, v3),
_mm_unpackhi_epi64(v1, v3),
_mm_unpacklo_epi64(v4, v6),
_mm_unpackhi_epi64(v4, v6),
_mm_unpacklo_epi64(v5, v7),
_mm_unpackhi_epi64(v5, v7),
]
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn transpose_4x16_to_16x4(
_token: X64V3Token,
p1: __m128i,
p0: __m128i,
q0: __m128i,
q1: __m128i,
) -> [i32; 16] {
let p1p0_lo = _mm_unpacklo_epi8(p1, p0);
let p1p0_hi = _mm_unpackhi_epi8(p1, p0);
let q0q1_lo = _mm_unpacklo_epi8(q0, q1);
let q0q1_hi = _mm_unpackhi_epi8(q0, q1);
let r0 = _mm_unpacklo_epi16(p1p0_lo, q0q1_lo);
let r1 = _mm_unpackhi_epi16(p1p0_lo, q0q1_lo);
let r2 = _mm_unpacklo_epi16(p1p0_hi, q0q1_hi);
let r3 = _mm_unpackhi_epi16(p1p0_hi, q0q1_hi);
let mut result = [0i32; 16];
result[0] = _mm_extract_epi32(r0, 0);
result[1] = _mm_extract_epi32(r0, 1);
result[2] = _mm_extract_epi32(r0, 2);
result[3] = _mm_extract_epi32(r0, 3);
result[4] = _mm_extract_epi32(r1, 0);
result[5] = _mm_extract_epi32(r1, 1);
result[6] = _mm_extract_epi32(r1, 2);
result[7] = _mm_extract_epi32(r1, 3);
result[8] = _mm_extract_epi32(r2, 0);
result[9] = _mm_extract_epi32(r2, 1);
result[10] = _mm_extract_epi32(r2, 2);
result[11] = _mm_extract_epi32(r2, 3);
result[12] = _mm_extract_epi32(r3, 0);
result[13] = _mm_extract_epi32(r3, 1);
result[14] = _mm_extract_epi32(r3, 2);
result[15] = _mm_extract_epi32(r3, 3);
result
}
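/// Simple loop filter across a vertical edge at column `x`: loads the
/// `p1..=q1` columns for 16 rows via a transposed load, filters, and stores
/// the block back.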
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn simple_h_filter16(
_token: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
thresh: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let base = y_start * stride + x - 2;
let region: &mut [u8; H_FILTER_SIMPLE_REGION] = <&mut [u8; H_FILTER_SIMPLE_REGION]>::try_from(
&mut pixels[base..base + H_FILTER_SIMPLE_REGION],
)
.expect("simple_h_filter16: buffer too small (missing FILTER_PADDING?)");
let (p1, mut p0, mut q0, q1) = load_16x4_impl!(region, 0, stride);
let mask = needs_filter_16(_token, p1, p0, q0, q1, thresh);
let fl = get_base_delta_16(_token, p1, p0, q0, q1);
let fl_masked = _mm_and_si128(fl, mask);
do_simple_filter_16(_token, &mut p0, &mut q0, fl_masked);
store_16x4_impl!(region, 0, stride, p1, p0, q0, q1);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn simple_filter_mb_edge_v(
_token: X64V3Token,
pixels: &mut [u8],
mb_y: usize,
mb_x: usize,
stride: usize,
thresh: i32,
) {
let point = mb_y * 16 * stride + mb_x * 16;
simple_v_filter16(_token, pixels, point, stride, thresh);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn simple_filter_mb_edge_h(
_token: X64V3Token,
pixels: &mut [u8],
mb_y: usize,
mb_x: usize,
stride: usize,
thresh: i32,
) {
let x = mb_x * 16;
let y_start = mb_y * 16;
simple_h_filter16(_token, pixels, x, y_start, stride, thresh);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn simple_filter_subblock_edge_v(
_token: X64V3Token,
pixels: &mut [u8],
mb_y: usize,
mb_x: usize,
y_offset: usize,
stride: usize,
thresh: i32,
) {
let point = (mb_y * 16 + y_offset) * stride + mb_x * 16;
simple_v_filter16(_token, pixels, point, stride, thresh);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn simple_filter_subblock_edge_h(
_token: X64V3Token,
pixels: &mut [u8],
mb_y: usize,
mb_x: usize,
x_offset: usize,
stride: usize,
thresh: i32,
) {
let x = mb_x * 16 + x_offset;
let y_start = mb_y * 16;
simple_h_filter16(_token, pixels, x, y_start, stride, thresh);
}
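/// 16-lane normal-filter decision mask: the simple threshold against
/// `edge_limit` plus the six neighbour differences within `interior_limit`.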
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(clippy::too_many_arguments)]
fn needs_filter_normal_16(
_token: X64V3Token,
p3: __m128i,
p2: __m128i,
p1: __m128i,
p0: __m128i,
q0: __m128i,
q1: __m128i,
q2: __m128i,
q3: __m128i,
edge_limit: i32,
interior_limit: i32,
) -> __m128i {
let simple_mask = needs_filter_16(_token, p1, p0, q0, q1, edge_limit);
let i_limit = _mm_set1_epi8(interior_limit as i8);
macro_rules! abs_diff {
($a:expr, $b:expr) => {
_mm_or_si128(_mm_subs_epu8($a, $b), _mm_subs_epu8($b, $a))
};
}
let d_p3_p2 = abs_diff!(p3, p2);
let d_p2_p1 = abs_diff!(p2, p1);
let d_p1_p0 = abs_diff!(p1, p0);
let d_q0_q1 = abs_diff!(q0, q1);
let d_q1_q2 = abs_diff!(q1, q2);
let d_q2_q3 = abs_diff!(q2, q3);
let max1 = _mm_max_epu8(d_p3_p2, d_p2_p1);
let max2 = _mm_max_epu8(d_p1_p0, d_q0_q1);
let max3 = _mm_max_epu8(d_q1_q2, d_q2_q3);
let max4 = _mm_max_epu8(max1, max2);
let max_diff = _mm_max_epu8(max3, max4);
let exceeds = _mm_subs_epu8(max_diff, i_limit);
let interior_ok = _mm_cmpeq_epi8(exceeds, _mm_setzero_si128());
_mm_and_si128(simple_mask, interior_ok)
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn high_edge_variance_16(
_token: X64V3Token,
p1: __m128i,
p0: __m128i,
q0: __m128i,
q1: __m128i,
hev_thresh: i32,
) -> __m128i {
let t = _mm_set1_epi8(hev_thresh as i8);
let d_p1_p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
let d_q1_q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
let p_exceeds = _mm_subs_epu8(d_p1_p0, t);
let q_exceeds = _mm_subs_epu8(d_q1_q0, t);
let p_hev = _mm_xor_si128(
_mm_cmpeq_epi8(p_exceeds, _mm_setzero_si128()),
_mm_set1_epi8(-1),
);
let q_hev = _mm_xor_si128(
_mm_cmpeq_epi8(q_exceeds, _mm_setzero_si128()),
_mm_set1_epi8(-1),
);
_mm_or_si128(p_hev, q_hev)
}
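/// Vector "filter4" (subblock) update: the common adjustment on `p0`/`q0`,
/// with the outer tap included only on high-variance lanes, plus the
/// half-strength `p1`/`q1` update on lanes without high edge variance.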
#[cfg(target_arch = "x86_64")]
#[rite]
fn do_filter4_16(
_token: X64V3Token,
p1: &mut __m128i,
p0: &mut __m128i,
q0: &mut __m128i,
q1: &mut __m128i,
mask: __m128i,
hev: __m128i,
) {
let sign = _mm_set1_epi8(-128i8);
let p1s = _mm_xor_si128(*p1, sign);
let p0s = _mm_xor_si128(*p0, sign);
let q0s = _mm_xor_si128(*q0, sign);
let q1s = _mm_xor_si128(*q1, sign);
let outer = _mm_subs_epi8(p1s, q1s);
let outer_masked = _mm_and_si128(outer, hev);
let q0_p0 = _mm_subs_epi8(q0s, p0s);
let a = _mm_adds_epi8(outer_masked, q0_p0);
let a = _mm_adds_epi8(a, q0_p0);
let a = _mm_adds_epi8(a, q0_p0);
let a = _mm_and_si128(a, mask);
let k3 = _mm_set1_epi8(3);
let k4 = _mm_set1_epi8(4);
let f1 = _mm_adds_epi8(a, k4);
let f2 = _mm_adds_epi8(a, k3);
let f1 = signed_shift_right_3(_token, f1);
let f2 = signed_shift_right_3(_token, f2);
let new_p0s = _mm_adds_epi8(p0s, f2);
let new_q0s = _mm_subs_epi8(q0s, f1);
let a2 = _mm_adds_epi8(f1, _mm_set1_epi8(1));
let a2 = signed_shift_right_1(_token, a2);
let a2 = _mm_andnot_si128(hev, a2);
let a2 = _mm_and_si128(a2, mask);
let new_p1s = _mm_adds_epi8(p1s, a2);
let new_q1s = _mm_subs_epi8(q1s, a2);
*p0 = _mm_xor_si128(new_p0s, sign);
*q0 = _mm_xor_si128(new_q0s, sign);
*p1 = _mm_xor_si128(new_p1s, sign);
*q1 = _mm_xor_si128(new_q1s, sign);
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn signed_shift_right_1(_token: X64V3Token, v: __m128i) -> __m128i {
let lo = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 9);
let hi = _mm_srai_epi16(_mm_unpackhi_epi8(v, v), 9);
_mm_packs_epi16(lo, hi)
}
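/// Vector "filter6" (macroblock edge) update: high-variance lanes receive the
/// four-tap common adjustment, the remaining lanes receive the wide 27/18/9
/// result, selected per lane with `_mm_blendv_epi8`.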
#[cfg(target_arch = "x86_64")]
#[rite]
#[allow(clippy::too_many_arguments)]
fn do_filter6_16(
_token: X64V3Token,
p2: &mut __m128i,
p1: &mut __m128i,
p0: &mut __m128i,
q0: &mut __m128i,
q1: &mut __m128i,
q2: &mut __m128i,
mask: __m128i,
hev: __m128i,
) {
let sign = _mm_set1_epi8(-128i8);
let not_hev = _mm_andnot_si128(hev, _mm_set1_epi8(-1));
let p2s = _mm_xor_si128(*p2, sign);
let p1s = _mm_xor_si128(*p1, sign);
let p0s = _mm_xor_si128(*p0, sign);
let q0s = _mm_xor_si128(*q0, sign);
let q1s = _mm_xor_si128(*q1, sign);
let q2s = _mm_xor_si128(*q2, sign);
let outer = _mm_subs_epi8(p1s, q1s);
let outer_hev = _mm_and_si128(outer, hev);
let q0_p0 = _mm_subs_epi8(q0s, p0s);
let a_hev = _mm_adds_epi8(outer_hev, q0_p0);
let a_hev = _mm_adds_epi8(a_hev, q0_p0);
let a_hev = _mm_adds_epi8(a_hev, q0_p0);
let a_hev = _mm_and_si128(a_hev, _mm_and_si128(mask, hev));
let k3 = _mm_set1_epi8(3);
let k4 = _mm_set1_epi8(4);
let f1_hev = signed_shift_right_3(_token, _mm_adds_epi8(a_hev, k4));
let f2_hev = signed_shift_right_3(_token, _mm_adds_epi8(a_hev, k3));
let (new_p2_lo, new_p1_lo, new_p0_lo, new_q0_lo, new_q1_lo, new_q2_lo) = filter6_wide_half(
_token,
_mm_unpacklo_epi8(p2s, p2s),
_mm_unpacklo_epi8(p1s, p1s),
_mm_unpacklo_epi8(p0s, p0s),
_mm_unpacklo_epi8(q0s, q0s),
_mm_unpacklo_epi8(q1s, q1s),
_mm_unpacklo_epi8(q2s, q2s),
);
let (new_p2_hi, new_p1_hi, new_p0_hi, new_q0_hi, new_q1_hi, new_q2_hi) = filter6_wide_half(
_token,
_mm_unpackhi_epi8(p2s, p2s),
_mm_unpackhi_epi8(p1s, p1s),
_mm_unpackhi_epi8(p0s, p0s),
_mm_unpackhi_epi8(q0s, q0s),
_mm_unpackhi_epi8(q1s, q1s),
_mm_unpackhi_epi8(q2s, q2s),
);
let new_p2_wide = _mm_packs_epi16(new_p2_lo, new_p2_hi);
let new_p1_wide = _mm_packs_epi16(new_p1_lo, new_p1_hi);
let new_p0_wide = _mm_packs_epi16(new_p0_lo, new_p0_hi);
let new_q0_wide = _mm_packs_epi16(new_q0_lo, new_q0_hi);
let new_q1_wide = _mm_packs_epi16(new_q1_lo, new_q1_hi);
let new_q2_wide = _mm_packs_epi16(new_q2_lo, new_q2_hi);
let mask_not_hev = _mm_and_si128(mask, not_hev);
let new_p0s = _mm_adds_epi8(p0s, f2_hev);
let new_q0s = _mm_subs_epi8(q0s, f1_hev);
let final_p0s = _mm_blendv_epi8(new_p0s, new_p0_wide, mask_not_hev);
let final_q0s = _mm_blendv_epi8(new_q0s, new_q0_wide, mask_not_hev);
let final_p1s = _mm_blendv_epi8(p1s, new_p1_wide, mask_not_hev);
let final_q1s = _mm_blendv_epi8(q1s, new_q1_wide, mask_not_hev);
let final_p2s = _mm_blendv_epi8(p2s, new_p2_wide, mask_not_hev);
let final_q2s = _mm_blendv_epi8(q2s, new_q2_wide, mask_not_hev);
*p0 = _mm_xor_si128(final_p0s, sign);
*q0 = _mm_xor_si128(final_q0s, sign);
*p1 = _mm_xor_si128(final_p1s, sign);
*q1 = _mm_xor_si128(final_q1s, sign);
*p2 = _mm_xor_si128(final_p2s, sign);
*q2 = _mm_xor_si128(final_q2s, sign);
}
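/// One half of the wide (macroblock) filter on inputs pre-widened by
/// unpacking each byte with itself, so `_mm_srai_epi16(x, 8)` recovers the
/// signed value. Computes `w = clamp((p1 - q1) + 3 * (q0 - p0))`, applies the
/// taps as `(k * w + 63) >> 7` for `k` in {27, 18, 9}, and clamps the results
/// back to the signed 8-bit range.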
#[cfg(target_arch = "x86_64")]
#[rite]
fn filter6_wide_half(
_token: X64V3Token,
p2: __m128i,
p1: __m128i,
p0: __m128i,
q0: __m128i,
q1: __m128i,
q2: __m128i,
) -> (__m128i, __m128i, __m128i, __m128i, __m128i, __m128i) {
let p2_16 = _mm_srai_epi16(p2, 8);
let p1_16 = _mm_srai_epi16(p1, 8);
let p0_16 = _mm_srai_epi16(p0, 8);
let q0_16 = _mm_srai_epi16(q0, 8);
let q1_16 = _mm_srai_epi16(q1, 8);
let q2_16 = _mm_srai_epi16(q2, 8);
let p1_q1 = _mm_sub_epi16(p1_16, q1_16);
let q0_p0 = _mm_sub_epi16(q0_16, p0_16);
let three_q0_p0 = _mm_add_epi16(_mm_add_epi16(q0_p0, q0_p0), q0_p0);
let w = _mm_add_epi16(p1_q1, three_q0_p0);
let w = _mm_max_epi16(_mm_min_epi16(w, _mm_set1_epi16(127)), _mm_set1_epi16(-128));
let k27 = _mm_set1_epi16(27);
let k18 = _mm_set1_epi16(18);
let k9 = _mm_set1_epi16(9);
let k63 = _mm_set1_epi16(63);
let a0 = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(w, k27), k63), 7);
let a1 = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(w, k18), k63), 7);
let a2 = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(w, k9), k63), 7);
let new_p0 = _mm_add_epi16(p0_16, a0);
let new_q0 = _mm_sub_epi16(q0_16, a0);
let new_p1 = _mm_add_epi16(p1_16, a1);
let new_q1 = _mm_sub_epi16(q1_16, a1);
let new_p2 = _mm_add_epi16(p2_16, a2);
let new_q2 = _mm_sub_epi16(q2_16, a2);
let clamp =
|v: __m128i| _mm_max_epi16(_mm_min_epi16(v, _mm_set1_epi16(127)), _mm_set1_epi16(-128));
(
clamp(new_p2),
clamp(new_p1),
clamp(new_p0),
clamp(new_q0),
clamp(new_q1),
clamp(new_q2),
)
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn normal_v_filter16_inner(
_token: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let start = point - 4 * stride;
let region: &mut [u8; V_FILTER_NORMAL_REGION] = <&mut [u8; V_FILTER_NORMAL_REGION]>::try_from(
&mut pixels[start..start + V_FILTER_NORMAL_REGION],
)
.expect("normal_v_filter16_inner: buffer too small (missing FILTER_PADDING?)");
let off_p3 = 0;
let off_p2 = stride;
let off_p1 = 2 * stride;
let off_p0 = 3 * stride;
let off_q0 = 4 * stride;
let off_q1 = 5 * stride;
let off_q2 = 6 * stride;
let off_q3 = 7 * stride;
let p3 = simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p3..][..16]).unwrap());
let p2 = simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p2..][..16]).unwrap());
let mut p1 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p1..][..16]).unwrap());
let mut p0 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p0..][..16]).unwrap());
let mut q0 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q0..][..16]).unwrap());
let mut q1 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q1..][..16]).unwrap());
let q2 = simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q2..][..16]).unwrap());
let q3 = simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q3..][..16]).unwrap());
let mask = needs_filter_normal_16(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
do_filter4_16(_token, &mut p1, &mut p0, &mut q0, &mut q1, mask, hev);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_p1..][..16]).unwrap(),
p1,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_p0..][..16]).unwrap(),
p0,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_q0..][..16]).unwrap(),
q0,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_q1..][..16]).unwrap(),
q1,
);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn normal_v_filter16_edge(
_token: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let start = point - 4 * stride;
let region: &mut [u8; V_FILTER_NORMAL_REGION] = <&mut [u8; V_FILTER_NORMAL_REGION]>::try_from(
&mut pixels[start..start + V_FILTER_NORMAL_REGION],
)
.expect("normal_v_filter16_edge: buffer too small (missing FILTER_PADDING?)");
let off_p3 = 0;
let off_p2 = stride;
let off_p1 = 2 * stride;
let off_p0 = 3 * stride;
let off_q0 = 4 * stride;
let off_q1 = 5 * stride;
let off_q2 = 6 * stride;
let off_q3 = 7 * stride;
let p3 = simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p3..][..16]).unwrap());
let mut p2 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p2..][..16]).unwrap());
let mut p1 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p1..][..16]).unwrap());
let mut p0 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_p0..][..16]).unwrap());
let mut q0 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q0..][..16]).unwrap());
let mut q1 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q1..][..16]).unwrap());
let mut q2 =
simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q2..][..16]).unwrap());
let q3 = simd_mem_x86::_mm_loadu_si128(<&[u8; 16]>::try_from(®ion[off_q3..][..16]).unwrap());
let mask = needs_filter_normal_16(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
do_filter6_16(
_token, &mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_p2..][..16]).unwrap(),
p2,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_p1..][..16]).unwrap(),
p1,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_p0..][..16]).unwrap(),
p0,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_q0..][..16]).unwrap(),
q0,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_q1..][..16]).unwrap(),
q1,
);
simd_mem_x86::_mm_storeu_si128(
<&mut [u8; 16]>::try_from(&mut region[off_q2..][..16]).unwrap(),
q2,
);
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn transpose_6x16_to_16x6(
_token: X64V3Token,
p2: __m128i,
p1: __m128i,
p0: __m128i,
q0: __m128i,
q1: __m128i,
q2: __m128i,
) -> ([i32; 16], [i16; 16]) {
let p2p1_lo = _mm_unpacklo_epi8(p2, p1);
let p2p1_hi = _mm_unpackhi_epi8(p2, p1);
let p0q0_lo = _mm_unpacklo_epi8(p0, q0);
let p0q0_hi = _mm_unpackhi_epi8(p0, q0);
let q1q2_lo = _mm_unpacklo_epi8(q1, q2);
let q1q2_hi = _mm_unpackhi_epi8(q1, q2);
let r0_lo = _mm_unpacklo_epi16(p2p1_lo, p0q0_lo);
let r1_lo = _mm_unpackhi_epi16(p2p1_lo, p0q0_lo);
let r0_hi = _mm_unpacklo_epi16(p2p1_hi, p0q0_hi);
let r1_hi = _mm_unpackhi_epi16(p2p1_hi, p0q0_hi);
let mut result4 = [0i32; 16];
result4[0] = _mm_extract_epi32(r0_lo, 0);
result4[1] = _mm_extract_epi32(r0_lo, 1);
result4[2] = _mm_extract_epi32(r0_lo, 2);
result4[3] = _mm_extract_epi32(r0_lo, 3);
result4[4] = _mm_extract_epi32(r1_lo, 0);
result4[5] = _mm_extract_epi32(r1_lo, 1);
result4[6] = _mm_extract_epi32(r1_lo, 2);
result4[7] = _mm_extract_epi32(r1_lo, 3);
result4[8] = _mm_extract_epi32(r0_hi, 0);
result4[9] = _mm_extract_epi32(r0_hi, 1);
result4[10] = _mm_extract_epi32(r0_hi, 2);
result4[11] = _mm_extract_epi32(r0_hi, 3);
result4[12] = _mm_extract_epi32(r1_hi, 0);
result4[13] = _mm_extract_epi32(r1_hi, 1);
result4[14] = _mm_extract_epi32(r1_hi, 2);
result4[15] = _mm_extract_epi32(r1_hi, 3);
let mut result2 = [0i16; 16];
result2[0] = _mm_extract_epi16(q1q2_lo, 0) as i16;
result2[1] = _mm_extract_epi16(q1q2_lo, 1) as i16;
result2[2] = _mm_extract_epi16(q1q2_lo, 2) as i16;
result2[3] = _mm_extract_epi16(q1q2_lo, 3) as i16;
result2[4] = _mm_extract_epi16(q1q2_lo, 4) as i16;
result2[5] = _mm_extract_epi16(q1q2_lo, 5) as i16;
result2[6] = _mm_extract_epi16(q1q2_lo, 6) as i16;
result2[7] = _mm_extract_epi16(q1q2_lo, 7) as i16;
result2[8] = _mm_extract_epi16(q1q2_hi, 0) as i16;
result2[9] = _mm_extract_epi16(q1q2_hi, 1) as i16;
result2[10] = _mm_extract_epi16(q1q2_hi, 2) as i16;
result2[11] = _mm_extract_epi16(q1q2_hi, 3) as i16;
result2[12] = _mm_extract_epi16(q1q2_hi, 4) as i16;
result2[13] = _mm_extract_epi16(q1q2_hi, 5) as i16;
result2[14] = _mm_extract_epi16(q1q2_hi, 6) as i16;
result2[15] = _mm_extract_epi16(q1q2_hi, 7) as i16;
(result4, result2)
}
#[cfg(target_arch = "x86_64")]
#[allow(clippy::too_many_arguments)]
#[rite]
pub(crate) fn normal_h_filter16_inner(
_token: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let base = y_start * stride + x - 4;
let region: &mut [u8; H_FILTER_NORMAL_REGION] = <&mut [u8; H_FILTER_NORMAL_REGION]>::try_from(
&mut pixels[base..base + H_FILTER_NORMAL_REGION],
)
.expect("normal_h_filter16_inner: buffer too small (missing FILTER_PADDING?)");
let mut rows = [_mm_setzero_si128(); 16];
for (i, row) in rows.iter_mut().enumerate() {
let row_start = i * stride;
*row =
simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(®ion[row_start..][..8]).unwrap());
}
let cols = transpose_8x16_to_16x8(_token, &rows);
let p3 = cols[0];
let p2 = cols[1];
let mut p1 = cols[2];
let mut p0 = cols[3];
let mut q0 = cols[4];
let mut q1 = cols[5];
let q2 = cols[6];
let q3 = cols[7];
let mask = needs_filter_normal_16(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
do_filter4_16(_token, &mut p1, &mut p0, &mut q0, &mut q1, mask, hev);
store_16x4_impl!(region, 2, stride, p1, p0, q0, q1);
}
#[cfg(target_arch = "x86_64")]
#[allow(clippy::too_many_arguments)]
#[rite]
pub(crate) fn normal_h_filter16_edge(
_token: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let base = y_start * stride + x - 4;
let region: &mut [u8; H_FILTER_NORMAL_REGION] = <&mut [u8; H_FILTER_NORMAL_REGION]>::try_from(
&mut pixels[base..base + H_FILTER_NORMAL_REGION],
)
.expect("normal_h_filter16_edge: buffer too small (missing FILTER_PADDING?)");
let mut rows = [_mm_setzero_si128(); 16];
for (i, row) in rows.iter_mut().enumerate() {
let row_start = i * stride;
*row =
simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(®ion[row_start..][..8]).unwrap());
}
let cols = transpose_8x16_to_16x8(_token, &rows);
let p3 = cols[0];
let mut p2 = cols[1];
let mut p1 = cols[2];
let mut p0 = cols[3];
let mut q0 = cols[4];
let mut q1 = cols[5];
let mut q2 = cols[6];
let q3 = cols[7];
let mask = needs_filter_normal_16(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
do_filter6_16(
_token, &mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev,
);
store_16x4_impl!(region, 1, stride, p2, p1, p0, q0);
store_q1q2_16_impl!(region, 5, stride, q1, q2);
}
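/// Fused normal filter for the three interior vertical edges of a macroblock
/// (columns `x_start + 4`, `+ 8`, `+ 12`): the previous edge's columns stay
/// in registers so every 4-column group is loaded exactly once.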
#[cfg(target_arch = "x86_64")]
#[allow(clippy::too_many_arguments)]
#[rite]
pub(crate) fn normal_h_filter16i(
_token: X64V3Token,
pixels: &mut [u8],
x_start: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let base = y_start * stride + x_start;
let region: &mut [u8; H_FILTER_FUSED_REGION] = <&mut [u8; H_FILTER_FUSED_REGION]>::try_from(
&mut pixels[base..base + H_FILTER_FUSED_REGION],
)
.expect("normal_h_filter16i: buffer too small (missing FILTER_PADDING?)");
let (mut p3, mut p2, mut p1, mut p0) = load_16x4_impl!(region, 0, stride);
for k in 0..3 {
let edge_x = x_start + 4 + k * 4;
let d_p1_p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
let d_p3_p2 = _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3));
let d_p2_p1 = _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2));
let mut max_diff = _mm_max_epu8(d_p1_p0, _mm_max_epu8(d_p3_p2, d_p2_p1));
let next_off = edge_x - x_start;
// The next four columns are this edge's q0..q3.
let (q0, q1, q2, q3) = load_16x4_impl!(region, next_off, stride);
let d_q0_q1 = _mm_or_si128(_mm_subs_epu8(q0, q1), _mm_subs_epu8(q1, q0));
let d_q2_q3 = _mm_or_si128(_mm_subs_epu8(q2, q3), _mm_subs_epu8(q3, q2));
let d_q1_q2 = _mm_or_si128(_mm_subs_epu8(q1, q2), _mm_subs_epu8(q2, q1));
max_diff = _mm_max_epu8(max_diff, _mm_max_epu8(d_q0_q1, _mm_max_epu8(d_q2_q3, d_q1_q2)));
let i_limit = _mm_set1_epi8(interior_limit as i8);
let exceeds = _mm_subs_epu8(max_diff, i_limit);
let interior_ok = _mm_cmpeq_epi8(exceeds, _mm_setzero_si128());
let simple_mask = needs_filter_16(_token, p1, p0, q0, q1, edge_limit);
let mask = _mm_and_si128(simple_mask, interior_ok);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
let mut fp1 = p1;
let mut fp0 = p0;
let mut fq0 = q0;
let mut fq1 = q1;
do_filter4_16(_token, &mut fp1, &mut fp0, &mut fq0, &mut fq1, mask, hev);
let store_off = edge_x - x_start - 2;
store_16x4_impl!(region, store_off, stride, fp1, fp0, fq0, fq1);
// This edge's q columns become the next edge's p columns.
p3 = fq0;
p2 = fq1;
p1 = q2;
p0 = q3;
}
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn normal_h_filter_uv_edge(
_token: X64V3Token,
u_pixels: &mut [u8],
v_pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let base = y_start * stride + x - 4;
let u_region: &mut [u8; H_FILTER_UV_REGION] =
<&mut [u8; H_FILTER_UV_REGION]>::try_from(&mut u_pixels[base..base + H_FILTER_UV_REGION])
.expect("normal_h_filter_uv_edge: u_pixels buffer too small");
let v_region: &mut [u8; H_FILTER_UV_REGION] =
<&mut [u8; H_FILTER_UV_REGION]>::try_from(&mut v_pixels[base..base + H_FILTER_UV_REGION])
.expect("normal_h_filter_uv_edge: v_pixels buffer too small");
let mut rows = [_mm_setzero_si128(); 16];
for i in 0..8 {
let row_start = i * stride;
rows[i] = simd_mem_x86::_mm_loadu_si64(
<&[u8; 8]>::try_from(&u_region[row_start..][..8]).unwrap(),
);
rows[i + 8] = simd_mem_x86::_mm_loadu_si64(
<&[u8; 8]>::try_from(&v_region[row_start..][..8]).unwrap(),
);
}
let cols = transpose_8x16_to_16x8(_token, &rows);
let p3 = cols[0];
let mut p2 = cols[1];
let mut p1 = cols[2];
let mut p0 = cols[3];
let mut q0 = cols[4];
let mut q1 = cols[5];
let mut q2 = cols[6];
let q3 = cols[7];
let mask = needs_filter_normal_16(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
do_filter6_16(
_token, &mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev,
);
store_uv_16x4_impl!(u_region, v_region, 1, stride, p2, p1, p0, q0);
store_q1q2_uv_16_impl!(u_region, v_region, 5, stride, q1, q2);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn normal_h_filter_uv_inner(
_token: X64V3Token,
u_pixels: &mut [u8],
v_pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let base = y_start * stride + x - 4;
let u_region: &mut [u8; H_FILTER_UV_REGION] =
<&mut [u8; H_FILTER_UV_REGION]>::try_from(&mut u_pixels[base..base + H_FILTER_UV_REGION])
.expect("normal_h_filter_uv_inner: u_pixels buffer too small");
let v_region: &mut [u8; H_FILTER_UV_REGION] =
<&mut [u8; H_FILTER_UV_REGION]>::try_from(&mut v_pixels[base..base + H_FILTER_UV_REGION])
.expect("normal_h_filter_uv_inner: v_pixels buffer too small");
let mut rows = [_mm_setzero_si128(); 16];
for i in 0..8 {
let row_start = i * stride;
rows[i] = simd_mem_x86::_mm_loadu_si64(
<&[u8; 8]>::try_from(&u_region[row_start..][..8]).unwrap(),
);
rows[i + 8] = simd_mem_x86::_mm_loadu_si64(
<&[u8; 8]>::try_from(&v_region[row_start..][..8]).unwrap(),
);
}
let cols = transpose_8x16_to_16x8(_token, &rows);
let p3 = cols[0];
let p2 = cols[1];
let mut p1 = cols[2];
let mut p0 = cols[3];
let mut q0 = cols[4];
let mut q1 = cols[5];
let q2 = cols[6];
let q3 = cols[7];
let mask = needs_filter_normal_16(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
do_filter4_16(_token, &mut p1, &mut p0, &mut q0, &mut q1, mask, hev);
store_uv_16x4_impl!(u_region, v_region, 2, stride, p1, p0, q0, q1);
}
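/// Extends the 8x16 transpose to 32 rows: transposes each 16-row half
/// separately, then concatenates matching columns into 256-bit registers
/// (rows 0..16 in the low 128 bits, rows 16..32 in the high 128 bits).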
#[cfg(target_arch = "x86_64")]
#[rite]
fn transpose_8x32_to_32x8(
_token: X64V3Token,
rows_lo: &[__m128i; 16],
rows_hi: &[__m128i; 16],
) -> [__m256i; 8] {
let cols_lo = transpose_8x16_to_16x8(_token, rows_lo);
let cols_hi = transpose_8x16_to_16x8(_token, rows_hi);
[
_mm256_set_m128i(cols_hi[0], cols_lo[0]),
_mm256_set_m128i(cols_hi[1], cols_lo[1]),
_mm256_set_m128i(cols_hi[2], cols_lo[2]),
_mm256_set_m128i(cols_hi[3], cols_lo[3]),
_mm256_set_m128i(cols_hi[4], cols_lo[4]),
_mm256_set_m128i(cols_hi[5], cols_lo[5]),
_mm256_set_m128i(cols_hi[6], cols_lo[6]),
_mm256_set_m128i(cols_hi[7], cols_lo[7]),
]
}
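/// Packs four filtered 32-lane columns back into per-row form: each returned
/// i32 carries the four bytes (p1, p0, q0, q1) of one row, ready for a
/// single 4-byte store.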
#[cfg(target_arch = "x86_64")]
#[rite]
fn transpose_4x32_to_32x4(
_token: X64V3Token,
p1: __m256i,
p0: __m256i,
q0: __m256i,
q1: __m256i,
) -> [i32; 32] {
let p1_lo = _mm256_castsi256_si128(p1);
let p1_hi = _mm256_extracti128_si256(p1, 1);
let p0_lo = _mm256_castsi256_si128(p0);
let p0_hi = _mm256_extracti128_si256(p0, 1);
let q0_lo = _mm256_castsi256_si128(q0);
let q0_hi = _mm256_extracti128_si256(q0, 1);
let q1_lo = _mm256_castsi256_si128(q1);
let q1_hi = _mm256_extracti128_si256(q1, 1);
let lo = transpose_4x16_to_16x4(_token, p1_lo, p0_lo, q0_lo, q1_lo);
let hi = transpose_4x16_to_16x4(_token, p1_hi, p0_hi, q0_hi, q1_hi);
let mut result = [0i32; 32];
result[..16].copy_from_slice(&lo);
result[16..].copy_from_slice(&hi);
result
}
#[cfg(target_arch = "x86_64")]
#[rite]
fn transpose_6x32_to_32x6(
_token: X64V3Token,
p2: __m256i,
p1: __m256i,
p0: __m256i,
q0: __m256i,
q1: __m256i,
q2: __m256i,
) -> ([i32; 32], [i16; 32]) {
let p2_lo = _mm256_castsi256_si128(p2);
let p2_hi = _mm256_extracti128_si256(p2, 1);
let p1_lo = _mm256_castsi256_si128(p1);
let p1_hi = _mm256_extracti128_si256(p1, 1);
let p0_lo = _mm256_castsi256_si128(p0);
let p0_hi = _mm256_extracti128_si256(p0, 1);
let q0_lo = _mm256_castsi256_si128(q0);
let q0_hi = _mm256_extracti128_si256(q0, 1);
let q1_lo = _mm256_castsi256_si128(q1);
let q1_hi = _mm256_extracti128_si256(q1, 1);
let q2_lo = _mm256_castsi256_si128(q2);
let q2_hi = _mm256_extracti128_si256(q2, 1);
let (lo4, lo2) = transpose_6x16_to_16x6(_token, p2_lo, p1_lo, p0_lo, q0_lo, q1_lo, q2_lo);
let (hi4, hi2) = transpose_6x16_to_16x6(_token, p2_hi, p1_hi, p0_hi, q0_hi, q1_hi, q2_hi);
let mut result4 = [0i32; 32];
let mut result2 = [0i16; 32];
result4[..16].copy_from_slice(&lo4);
result4[16..].copy_from_slice(&hi4);
result2[..16].copy_from_slice(&lo2);
result2[16..].copy_from_slice(&hi2);
(result4, result2)
}
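/// AVX2 horizontal inner filter covering 32 rows per call. Loads 8 bytes
/// around the edge for each row, transposes to 32-lane columns, applies the
/// four-tap normal filter, and writes the four changed bytes per row back
/// with 32-bit stores.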
#[cfg(target_arch = "x86_64")]
#[allow(clippy::too_many_arguments)]
#[rite]
pub(crate) fn normal_h_filter32_inner(
_token: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
let validated_end = (y_start + 31) * stride + x + 4;
assert!(
x >= 4 && validated_end <= pixels.len(),
"normal_h_filter32_inner: bounds check failed"
);
let pixels = &mut pixels[..validated_end];
let mut rows_lo = [_mm_setzero_si128(); 16];
let mut rows_hi = [_mm_setzero_si128(); 16];
for (i, row) in rows_lo.iter_mut().enumerate() {
let row_start = (y_start + i) * stride + x - 4;
*row =
simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(&pixels[row_start..][..8]).unwrap());
}
for (i, row) in rows_hi.iter_mut().enumerate() {
let row_start = (y_start + 16 + i) * stride + x - 4;
*row =
simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(&pixels[row_start..][..8]).unwrap());
}
let cols = transpose_8x32_to_32x8(_token, &rows_lo, &rows_hi);
let p3 = cols[0];
let p2 = cols[1];
let mut p1 = cols[2];
let mut p0 = cols[3];
let mut q0 = cols[4];
let mut q1 = cols[5];
let q2 = cols[6];
let q3 = cols[7];
let mask = needs_filter_normal_32(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_32(_token, p1, p0, q0, q1, hev_thresh);
do_filter4_32(_token, &mut p1, &mut p0, &mut q0, &mut q1, mask, hev);
let packed = transpose_4x32_to_32x4(_token, p1, p0, q0, q1);
for (i, &val) in packed.iter().enumerate() {
let row_start = (y_start + i) * stride + x - 2;
simd_mem_x86::_mm_storeu_si32(
<&mut [u8; 4]>::try_from(&mut pixels[row_start..][..4]).unwrap(),
_mm_cvtsi32_si128(val),
);
}
}
#[cfg(target_arch = "x86_64")]
#[allow(clippy::too_many_arguments)]
#[rite]
pub(crate) fn normal_h_filter32_edge(
_token: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
let validated_end = (y_start + 31) * stride + x + 4;
assert!(
x >= 4 && validated_end <= pixels.len(),
"normal_h_filter32_edge: bounds check failed"
);
let pixels = &mut pixels[..validated_end];
let mut rows_lo = [_mm_setzero_si128(); 16];
let mut rows_hi = [_mm_setzero_si128(); 16];
for (i, row) in rows_lo.iter_mut().enumerate() {
let row_start = (y_start + i) * stride + x - 4;
*row =
simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(&pixels[row_start..][..8]).unwrap());
}
for (i, row) in rows_hi.iter_mut().enumerate() {
let row_start = (y_start + 16 + i) * stride + x - 4;
*row =
simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(&pixels[row_start..][..8]).unwrap());
}
let cols = transpose_8x32_to_32x8(_token, &rows_lo, &rows_hi);
let p3 = cols[0];
let mut p2 = cols[1];
let mut p1 = cols[2];
let mut p0 = cols[3];
let mut q0 = cols[4];
let mut q1 = cols[5];
let mut q2 = cols[6];
let q3 = cols[7];
let mask = needs_filter_normal_32(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_32(_token, p1, p0, q0, q1, hev_thresh);
do_filter6_32(
_token, &mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev,
);
let (packed4, packed2) = transpose_6x32_to_32x6(_token, p2, p1, p0, q0, q1, q2);
for (i, (&val4, &val2)) in packed4.iter().zip(packed2.iter()).enumerate() {
let row_start = (y_start + i) * stride + x - 3;
simd_mem_x86::_mm_storeu_si32(
<&mut [u8; 4]>::try_from(&mut pixels[row_start..][..4]).unwrap(),
_mm_cvtsi32_si128(val4),
);
simd_mem_x86::_mm_storeu_si16(
<&mut [u8; 2]>::try_from(&mut pixels[row_start + 4..][..2]).unwrap(),
_mm_cvtsi32_si128(val2 as i32),
);
}
}
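/// Vertical macroblock-edge filter for chroma: each 8-pixel U row and the
/// matching V row are packed into one 128-bit register (U in the low eight
/// lanes, V in the high eight), so both planes go through a single 16-lane
/// filter pass.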
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn normal_v_filter_uv_edge(
_token: X64V3Token,
u_pixels: &mut [u8],
v_pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let region_start = point - 4 * stride;
let u_region: &mut [u8; V_FILTER_UV_REGION] = <&mut [u8; V_FILTER_UV_REGION]>::try_from(
&mut u_pixels[region_start..region_start + V_FILTER_UV_REGION],
)
.expect("normal_v_filter_uv_edge: u_pixels buffer too small");
let v_region: &mut [u8; V_FILTER_UV_REGION] = <&mut [u8; V_FILTER_UV_REGION]>::try_from(
&mut v_pixels[region_start..region_start + V_FILTER_UV_REGION],
)
.expect("normal_v_filter_uv_edge: v_pixels buffer too small");
let load_uv_row = |u_reg: &[u8; V_FILTER_UV_REGION],
v_reg: &[u8; V_FILTER_UV_REGION],
offset: usize|
-> __m128i {
let u = simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(&u_reg[offset..][..8]).unwrap());
let v = simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(&v_reg[offset..][..8]).unwrap());
_mm_unpacklo_epi64(u, v)
};
let off_p3 = 0;
let off_p2 = stride;
let off_p1 = 2 * stride;
let off_p0 = 3 * stride;
let off_q0 = 4 * stride;
let off_q1 = 5 * stride;
let off_q2 = 6 * stride;
let off_q3 = 7 * stride;
let p3 = load_uv_row(u_region, v_region, off_p3);
let mut p2 = load_uv_row(u_region, v_region, off_p2);
let mut p1 = load_uv_row(u_region, v_region, off_p1);
let mut p0 = load_uv_row(u_region, v_region, off_p0);
let mut q0 = load_uv_row(u_region, v_region, off_q0);
let mut q1 = load_uv_row(u_region, v_region, off_q1);
let mut q2 = load_uv_row(u_region, v_region, off_q2);
let q3 = load_uv_row(u_region, v_region, off_q3);
let mask = needs_filter_normal_16(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
do_filter6_16(
_token, &mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev,
);
let store_uv_row = |u_reg: &mut [u8; V_FILTER_UV_REGION],
v_reg: &mut [u8; V_FILTER_UV_REGION],
offset: usize,
reg: __m128i| {
simd_mem_x86::_mm_storel_epi64(
<&mut [u8; 16]>::try_from(&mut u_reg[offset..][..16]).unwrap(),
reg,
);
simd_mem_x86::_mm_storel_epi64(
<&mut [u8; 16]>::try_from(&mut v_reg[offset..][..16]).unwrap(),
_mm_srli_si128(reg, 8),
);
};
store_uv_row(u_region, v_region, off_p2, p2);
store_uv_row(u_region, v_region, off_p1, p1);
store_uv_row(u_region, v_region, off_p0, p0);
store_uv_row(u_region, v_region, off_q0, q0);
store_uv_row(u_region, v_region, off_q1, q1);
store_uv_row(u_region, v_region, off_q2, q2);
}
#[cfg(target_arch = "x86_64")]
#[rite]
pub(crate) fn normal_v_filter_uv_inner(
_token: X64V3Token,
u_pixels: &mut [u8],
v_pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
assert!(stride <= MAX_STRIDE, "stride exceeds MAX_STRIDE");
let region_start = point - 4 * stride;
let u_region: &mut [u8; V_FILTER_UV_REGION] = <&mut [u8; V_FILTER_UV_REGION]>::try_from(
&mut u_pixels[region_start..region_start + V_FILTER_UV_REGION],
)
.expect("normal_v_filter_uv_inner: u_pixels buffer too small");
let v_region: &mut [u8; V_FILTER_UV_REGION] = <&mut [u8; V_FILTER_UV_REGION]>::try_from(
&mut v_pixels[region_start..region_start + V_FILTER_UV_REGION],
)
.expect("normal_v_filter_uv_inner: v_pixels buffer too small");
let load_uv_row = |u_reg: &[u8; V_FILTER_UV_REGION],
v_reg: &[u8; V_FILTER_UV_REGION],
offset: usize|
-> __m128i {
let u = simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(&u_reg[offset..][..8]).unwrap());
let v = simd_mem_x86::_mm_loadu_si64(<&[u8; 8]>::try_from(&v_reg[offset..][..8]).unwrap());
_mm_unpacklo_epi64(u, v)
};
let off_p3 = 0;
let off_p2 = stride;
let off_p1 = 2 * stride;
let off_p0 = 3 * stride;
let off_q0 = 4 * stride;
let off_q1 = 5 * stride;
let off_q2 = 6 * stride;
let off_q3 = 7 * stride;
let p3 = load_uv_row(u_region, v_region, off_p3);
let p2 = load_uv_row(u_region, v_region, off_p2);
let mut p1 = load_uv_row(u_region, v_region, off_p1);
let mut p0 = load_uv_row(u_region, v_region, off_p0);
let mut q0 = load_uv_row(u_region, v_region, off_q0);
let mut q1 = load_uv_row(u_region, v_region, off_q1);
let q2 = load_uv_row(u_region, v_region, off_q2);
let q3 = load_uv_row(u_region, v_region, off_q3);
let mask = needs_filter_normal_16(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
edge_limit,
interior_limit,
);
let hev = high_edge_variance_16(_token, p1, p0, q0, q1, hev_thresh);
do_filter4_16(_token, &mut p1, &mut p0, &mut q0, &mut q1, mask, hev);
let store_uv_row = |u_reg: &mut [u8; V_FILTER_UV_REGION],
v_reg: &mut [u8; V_FILTER_UV_REGION],
offset: usize,
reg: __m128i| {
simd_mem_x86::_mm_storel_epi64(
<&mut [u8; 16]>::try_from(&mut u_reg[offset..][..16]).unwrap(),
reg,
);
simd_mem_x86::_mm_storel_epi64(
<&mut [u8; 16]>::try_from(&mut v_reg[offset..][..16]).unwrap(),
_mm_srli_si128(reg, 8),
);
};
store_uv_row(u_region, v_region, off_p1, p1);
store_uv_row(u_region, v_region, off_p0, p0);
store_uv_row(u_region, v_region, off_q0, q0);
store_uv_row(u_region, v_region, off_q1, q1);
}
#[cfg(test)]
mod tests {
extern crate std;
use super::*;
use alloc::vec;
#[archmage::arcane]
fn call_simple_v_filter16(
t: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
thresh: i32,
) {
simple_v_filter16(t, pixels, point, stride, thresh);
}
#[archmage::arcane]
fn call_simple_v_filter32(
t: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
thresh: i32,
) {
simple_v_filter32(t, pixels, point, stride, thresh);
}
#[archmage::arcane]
fn call_simple_h_filter16(
t: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
thresh: i32,
) {
simple_h_filter16(t, pixels, x, y_start, stride, thresh);
}
#[archmage::arcane]
fn call_normal_v_filter16_inner(
t: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
normal_v_filter16_inner(
t,
pixels,
point,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
}
#[archmage::arcane]
fn call_normal_v_filter16_edge(
t: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
normal_v_filter16_edge(
t,
pixels,
point,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
}
#[archmage::arcane]
fn call_normal_v_filter32_inner(
t: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
normal_v_filter32_inner(
t,
pixels,
point,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
}
#[archmage::arcane]
fn call_normal_v_filter32_edge(
t: X64V3Token,
pixels: &mut [u8],
point: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
normal_v_filter32_edge(
t,
pixels,
point,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
}
#[archmage::arcane]
fn call_normal_h_filter16_inner(
t: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
normal_h_filter16_inner(
t,
pixels,
x,
y_start,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
}
#[archmage::arcane]
fn call_normal_h_filter16_edge(
t: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
normal_h_filter16_edge(
t,
pixels,
x,
y_start,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
}
#[archmage::arcane]
fn call_normal_h_filter32_inner(
t: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
normal_h_filter32_inner(
t,
pixels,
x,
y_start,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
}
#[archmage::arcane]
fn call_normal_h_filter32_edge(
t: X64V3Token,
pixels: &mut [u8],
x: usize,
y_start: usize,
stride: usize,
hev_thresh: i32,
interior_limit: i32,
edge_limit: i32,
) {
normal_h_filter32_edge(
t,
pixels,
x,
y_start,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
}
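/// Scalar reference model of the simple loop filter; the SIMD kernels are
/// checked byte-for-byte against it.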
fn scalar_simple_filter(p1: u8, p0: u8, q0: u8, q1: u8, thresh: i32) -> (u8, u8) {
let diff_p0_q0 = (p0 as i32 - q0 as i32).abs();
let diff_p1_q1 = (p1 as i32 - q1 as i32).abs();
if diff_p0_q0 * 2 + diff_p1_q1 / 2 > thresh {
return (p0, q0);
}
let p1s = p1 as i32 - 128;
let p0s = p0 as i32 - 128;
let q0s = q0 as i32 - 128;
let q1s = q1 as i32 - 128;
let a = (p1s - q1s + 3 * (q0s - p0s)).clamp(-128, 127);
let a_plus_4 = (a + 4) >> 3;
let a_plus_3 = (a + 3) >> 3;
let new_q0 = (q0s - a_plus_4).clamp(-128, 127) + 128;
let new_p0 = (p0s + a_plus_3).clamp(-128, 127) + 128;
(new_p0 as u8, new_q0 as u8)
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_simple_v_filter16_matches_scalar() {
let Some(token) = archmage::X64V3Token::summon() else {
return;
};
let stride = 32;
let mut pixels = vec![128u8; stride * 8 + V_FILTER_REGION];
let mut pixels_scalar = pixels.clone();
for x in 0..16 {
pixels[x] = 100;
pixels[stride + x] = 110;
pixels[2 * stride + x] = 140;
pixels[3 * stride + x] = 150;
pixels_scalar[x] = 100;
pixels_scalar[stride + x] = 110;
pixels_scalar[2 * stride + x] = 140;
pixels_scalar[3 * stride + x] = 150;
}
let thresh = 40;
call_simple_v_filter16(token, &mut pixels, 2 * stride, stride, thresh);
for x in 0..16 {
let p1 = pixels_scalar[x];
let p0 = pixels_scalar[stride + x];
let q0 = pixels_scalar[2 * stride + x];
let q1 = pixels_scalar[3 * stride + x];
let (new_p0, new_q0) = scalar_simple_filter(p1, p0, q0, q1, thresh);
pixels_scalar[stride + x] = new_p0;
pixels_scalar[2 * stride + x] = new_q0;
}
for x in 0..16 {
assert_eq!(
pixels[stride + x],
pixels_scalar[stride + x],
"p0 mismatch at x={}",
x
);
assert_eq!(
pixels[2 * stride + x],
pixels_scalar[2 * stride + x],
"q0 mismatch at x={}",
x
);
}
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_simple_h_filter16_matches_scalar() {
let Some(token) = archmage::X64V3Token::summon() else {
return;
};
let stride = 32;
let mut pixels = vec![128u8; stride * 20 + H_FILTER_SIMPLE_REGION];
let mut pixels_scalar = pixels.clone();
for y in 0..16 {
pixels[y * stride + 2] = 100;
pixels[y * stride + 3] = 110;
pixels[y * stride + 4] = 140;
pixels[y * stride + 5] = 150;
pixels_scalar[y * stride + 2] = 100;
pixels_scalar[y * stride + 3] = 110;
pixels_scalar[y * stride + 4] = 140;
pixels_scalar[y * stride + 5] = 150;
}
let thresh = 40;
call_simple_h_filter16(token, &mut pixels, 4, 0, stride, thresh);
for y in 0..16 {
let p1 = pixels_scalar[y * stride + 2];
let p0 = pixels_scalar[y * stride + 3];
let q0 = pixels_scalar[y * stride + 4];
let q1 = pixels_scalar[y * stride + 5];
let (new_p0, new_q0) = scalar_simple_filter(p1, p0, q0, q1, thresh);
pixels_scalar[y * stride + 3] = new_p0;
pixels_scalar[y * stride + 4] = new_q0;
}
for y in 0..16 {
assert_eq!(
pixels[y * stride + 3],
pixels_scalar[y * stride + 3],
"p0 mismatch at y={}",
y
);
assert_eq!(
pixels[y * stride + 4],
pixels_scalar[y * stride + 4],
"q0 mismatch at y={}",
y
);
}
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_normal_v_filter16_inner_matches_scalar() {
let Some(token) = archmage::X64V3Token::summon() else {
return;
};
let stride = 32;
let mut pixels = vec![128u8; stride * 12 + V_FILTER_NORMAL_REGION];
let mut pixels_scalar = pixels.clone();
for x in 0..16 {
pixels[0 * stride + x] = 100;
pixels[1 * stride + x] = 105;
pixels[2 * stride + x] = 110;
pixels[3 * stride + x] = 115;
pixels[4 * stride + x] = 145;
pixels[5 * stride + x] = 150;
pixels[6 * stride + x] = 155;
pixels[7 * stride + x] = 160;
pixels_scalar[0 * stride + x] = 100;
pixels_scalar[1 * stride + x] = 105;
pixels_scalar[2 * stride + x] = 110;
pixels_scalar[3 * stride + x] = 115;
pixels_scalar[4 * stride + x] = 145;
pixels_scalar[5 * stride + x] = 150;
pixels_scalar[6 * stride + x] = 155;
pixels_scalar[7 * stride + x] = 160;
}
let hev_thresh = 5;
let interior_limit = 15;
let edge_limit = 25;
call_normal_v_filter16_inner(
token,
&mut pixels,
4 * stride,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
for x in 0..16 {
crate::decoder::loop_filter::subblock_filter_vertical(
hev_thresh as u8,
interior_limit as u8,
edge_limit as u8,
&mut pixels_scalar,
4 * stride + x,
stride,
);
}
for x in 0..16 {
assert_eq!(
pixels[2 * stride + x],
pixels_scalar[2 * stride + x],
"p1 mismatch at x={}",
x
);
assert_eq!(
pixels[3 * stride + x],
pixels_scalar[3 * stride + x],
"p0 mismatch at x={}",
x
);
assert_eq!(
pixels[4 * stride + x],
pixels_scalar[4 * stride + x],
"q0 mismatch at x={}",
x
);
assert_eq!(
pixels[5 * stride + x],
pixels_scalar[5 * stride + x],
"q1 mismatch at x={}",
x
);
}
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_normal_v_filter16_edge_matches_scalar() {
let Some(token) = archmage::X64V3Token::summon() else {
return;
};
let stride = 32;
let mut pixels = vec![128u8; stride * 12 + V_FILTER_NORMAL_REGION];
let mut pixels_scalar = pixels.clone();
for x in 0..16 {
pixels[0 * stride + x] = 100;
pixels[1 * stride + x] = 105;
pixels[2 * stride + x] = 110;
pixels[3 * stride + x] = 115;
pixels[4 * stride + x] = 145;
pixels[5 * stride + x] = 150;
pixels[6 * stride + x] = 155;
pixels[7 * stride + x] = 160;
pixels_scalar[0 * stride + x] = 100;
pixels_scalar[1 * stride + x] = 105;
pixels_scalar[2 * stride + x] = 110;
pixels_scalar[3 * stride + x] = 115;
pixels_scalar[4 * stride + x] = 145;
pixels_scalar[5 * stride + x] = 150;
pixels_scalar[6 * stride + x] = 155;
pixels_scalar[7 * stride + x] = 160;
}
let hev_thresh = 5;
let interior_limit = 15;
let edge_limit = 40;
call_normal_v_filter16_edge(
token,
&mut pixels,
4 * stride,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
for x in 0..16 {
crate::decoder::loop_filter::macroblock_filter_vertical(
hev_thresh as u8,
interior_limit as u8,
edge_limit as u8,
&mut pixels_scalar,
4 * stride + x,
stride,
);
}
for x in 0..16 {
assert_eq!(
pixels[1 * stride + x],
pixels_scalar[1 * stride + x],
"p2 mismatch at x={}",
x
);
assert_eq!(
pixels[2 * stride + x],
pixels_scalar[2 * stride + x],
"p1 mismatch at x={}",
x
);
assert_eq!(
pixels[3 * stride + x],
pixels_scalar[3 * stride + x],
"p0 mismatch at x={}",
x
);
assert_eq!(
pixels[4 * stride + x],
pixels_scalar[4 * stride + x],
"q0 mismatch at x={}",
x
);
assert_eq!(
pixels[5 * stride + x],
pixels_scalar[5 * stride + x],
"q1 mismatch at x={}",
x
);
assert_eq!(
pixels[6 * stride + x],
pixels_scalar[6 * stride + x],
"q2 mismatch at x={}",
x
);
}
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_simple_v_filter32_matches_scalar() {
let Some(token) = archmage::X64V3Token::summon() else {
return;
};
let stride = 64;
let mut pixels = vec![128u8; stride * 8 + V_FILTER_REGION_32];
let mut pixels_scalar = pixels.clone();
for x in 0..32 {
pixels[x] = 100;
pixels[stride + x] = 110;
pixels[2 * stride + x] = 140;
pixels[3 * stride + x] = 150;
pixels_scalar[x] = 100;
pixels_scalar[stride + x] = 110;
pixels_scalar[2 * stride + x] = 140;
pixels_scalar[3 * stride + x] = 150;
}
let thresh = 40;
call_simple_v_filter32(token, &mut pixels, 2 * stride, stride, thresh);
for x in 0..32 {
let p1 = pixels_scalar[x];
let p0 = pixels_scalar[stride + x];
let q0 = pixels_scalar[2 * stride + x];
let q1 = pixels_scalar[3 * stride + x];
let (new_p0, new_q0) = scalar_simple_filter(p1, p0, q0, q1, thresh);
pixels_scalar[stride + x] = new_p0;
pixels_scalar[2 * stride + x] = new_q0;
}
for x in 0..32 {
assert_eq!(
pixels[stride + x],
pixels_scalar[stride + x],
"p0 mismatch at x={}",
x
);
assert_eq!(
pixels[2 * stride + x],
pixels_scalar[2 * stride + x],
"q0 mismatch at x={}",
x
);
}
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_normal_v_filter32_inner_matches_two_16() {
let Some(token) = archmage::X64V3Token::summon() else {
return;
};
let stride = 64;
let mut pixels = vec![128u8; stride * 12 + V_FILTER_NORMAL_REGION_32];
let mut pixels_16 = pixels.clone();
for x in 0..32 {
pixels[0 * stride + x] = 100;
pixels[1 * stride + x] = 105;
pixels[2 * stride + x] = 110;
pixels[3 * stride + x] = 115;
pixels[4 * stride + x] = 145;
pixels[5 * stride + x] = 150;
pixels[6 * stride + x] = 155;
pixels[7 * stride + x] = 160;
pixels_16[0 * stride + x] = 100;
pixels_16[1 * stride + x] = 105;
pixels_16[2 * stride + x] = 110;
pixels_16[3 * stride + x] = 115;
pixels_16[4 * stride + x] = 145;
pixels_16[5 * stride + x] = 150;
pixels_16[6 * stride + x] = 155;
pixels_16[7 * stride + x] = 160;
}
let hev_thresh = 5;
let interior_limit = 15;
let edge_limit = 25;
call_normal_v_filter32_inner(
token,
&mut pixels,
4 * stride,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
call_normal_v_filter16_inner(
token,
&mut pixels_16,
4 * stride,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
call_normal_v_filter16_inner(
token,
&mut pixels_16,
4 * stride + 16,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
for x in 0..32 {
assert_eq!(
pixels[2 * stride + x],
pixels_16[2 * stride + x],
"p1 mismatch at x={}",
x
);
assert_eq!(
pixels[3 * stride + x],
pixels_16[3 * stride + x],
"p0 mismatch at x={}",
x
);
assert_eq!(
pixels[4 * stride + x],
pixels_16[4 * stride + x],
"q0 mismatch at x={}",
x
);
assert_eq!(
pixels[5 * stride + x],
pixels_16[5 * stride + x],
"q1 mismatch at x={}",
x
);
}
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_normal_v_filter32_edge_matches_two_16() {
let Some(token) = archmage::X64V3Token::summon() else {
return;
};
let stride = 64;
let mut pixels = vec![128u8; stride * 12 + V_FILTER_NORMAL_REGION_32];
let mut pixels_16 = pixels.clone();
for x in 0..32 {
pixels[0 * stride + x] = 100;
pixels[1 * stride + x] = 105;
pixels[2 * stride + x] = 110;
pixels[3 * stride + x] = 115;
pixels[4 * stride + x] = 145;
pixels[5 * stride + x] = 150;
pixels[6 * stride + x] = 155;
pixels[7 * stride + x] = 160;
pixels_16[0 * stride + x] = 100;
pixels_16[1 * stride + x] = 105;
pixels_16[2 * stride + x] = 110;
pixels_16[3 * stride + x] = 115;
pixels_16[4 * stride + x] = 145;
pixels_16[5 * stride + x] = 150;
pixels_16[6 * stride + x] = 155;
pixels_16[7 * stride + x] = 160;
}
let hev_thresh = 5;
let interior_limit = 15;
let edge_limit = 25;
call_normal_v_filter32_edge(
token,
&mut pixels,
4 * stride,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
call_normal_v_filter16_edge(
token,
&mut pixels_16,
4 * stride,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
call_normal_v_filter16_edge(
token,
&mut pixels_16,
4 * stride + 16,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
for x in 0..32 {
assert_eq!(
pixels[1 * stride + x],
pixels_16[1 * stride + x],
"p2 mismatch at x={}",
x
);
assert_eq!(
pixels[2 * stride + x],
pixels_16[2 * stride + x],
"p1 mismatch at x={}",
x
);
assert_eq!(
pixels[3 * stride + x],
pixels_16[3 * stride + x],
"p0 mismatch at x={}",
x
);
assert_eq!(
pixels[4 * stride + x],
pixels_16[4 * stride + x],
"q0 mismatch at x={}",
x
);
assert_eq!(
pixels[5 * stride + x],
pixels_16[5 * stride + x],
"q1 mismatch at x={}",
x
);
assert_eq!(
pixels[6 * stride + x],
pixels_16[6 * stride + x],
"q2 mismatch at x={}",
x
);
}
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_normal_h_filter32_inner_matches_two_16() {
let Some(token) = X64V3Token::summon() else {
std::eprintln!("AVX2 not available, skipping test");
return;
};
let width = 64;
let height = 48;
let stride = width;
let mut pixels = vec![128u8; stride * height + H_FILTER_NORMAL_REGION];
let mut pixels_16 = pixels.clone();
for y in 0..32 {
for x in 0..8 {
let base = y * stride + 12 + x;
pixels[base] = (100 + x * 10) as u8;
pixels_16[base] = (100 + x * 10) as u8;
}
}
let hev_thresh = 5;
let interior_limit = 15;
let edge_limit = 25;
call_normal_h_filter32_inner(
token,
&mut pixels,
16,
0,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
call_normal_h_filter16_inner(
token,
&mut pixels_16,
16,
0,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
call_normal_h_filter16_inner(
token,
&mut pixels_16,
16,
16,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
for y in 0..32 {
for x in 14..18 {
assert_eq!(
pixels[y * stride + x],
pixels_16[y * stride + x],
"mismatch at y={}, x={}",
y,
x
);
}
}
}
#[test]
#[cfg(target_arch = "x86_64")]
fn test_normal_h_filter32_edge_matches_two_16() {
let Some(token) = X64V3Token::summon() else {
std::eprintln!("AVX2 not available, skipping test");
return;
};
let width = 64;
let height = 48;
let stride = width;
let mut pixels = vec![128u8; stride * height + H_FILTER_NORMAL_REGION];
let mut pixels_16 = pixels.clone();
for y in 0..32 {
for x in 0..8 {
let base = y * stride + 12 + x;
pixels[base] = (100 + x * 10) as u8;
pixels_16[base] = (100 + x * 10) as u8;
}
}
let hev_thresh = 5;
let interior_limit = 15;
let edge_limit = 25;
call_normal_h_filter32_edge(
token,
&mut pixels,
16,
0,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
call_normal_h_filter16_edge(
token,
&mut pixels_16,
16,
0,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
call_normal_h_filter16_edge(
token,
&mut pixels_16,
16,
16,
stride,
hev_thresh,
interior_limit,
edge_limit,
);
for y in 0..32 {
for x in 13..19 {
assert_eq!(
pixels[y * stride + x],
pixels_16[y * stride + x],
"mismatch at y={}, x={}",
y,
x
);
}
}
}
}
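/// NEON filter-condition mask: all-ones in lanes where
/// 2 * |p0 - q0| + |p1 - q1| / 2 <= thresh. Saturating byte adds keep large
/// differences from wrapping.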
#[rite]
fn needs_filter_neon(
_token: NeonToken,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
thresh: i32,
) -> uint8x16_t {
let thresh_v = vdupq_n_u8(thresh as u8);
let a_p0_q0 = vabdq_u8(p0, q0);
let a_p1_q1 = vabdq_u8(p1, q1);
let a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);
let a_p1_q1_2 = vshrq_n_u8::<1>(a_p1_q1);
let sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
vcgeq_u8(thresh_v, sum)
}
#[rite]
fn needs_filter2_neon(
_token: NeonToken,
p3: uint8x16_t,
p2: uint8x16_t,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
q2: uint8x16_t,
q3: uint8x16_t,
ithresh: i32,
thresh: i32,
) -> uint8x16_t {
let ithresh_v = vdupq_n_u8(ithresh as u8);
let a_p3_p2 = vabdq_u8(p3, p2);
let a_p2_p1 = vabdq_u8(p2, p1);
let a_p1_p0 = vabdq_u8(p1, p0);
let a_q3_q2 = vabdq_u8(q3, q2);
let a_q2_q1 = vabdq_u8(q2, q1);
let a_q1_q0 = vabdq_u8(q1, q0);
let max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
let max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
let max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
let max12 = vmaxq_u8(max1, max2);
let max123 = vmaxq_u8(max12, max3);
let mask2 = vcgeq_u8(ithresh_v, max123);
let mask1 = needs_filter_neon(_token, p1, p0, q0, q1, thresh);
vandq_u8(mask1, mask2)
}
#[rite]
fn needs_hev_neon(
_token: NeonToken,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
hev_thresh: i32,
) -> uint8x16_t {
let hev_thresh_v = vdupq_n_u8(hev_thresh as u8);
let a_p1_p0 = vabdq_u8(p1, p0);
let a_q1_q0 = vabdq_u8(q1, q0);
let a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
vcgtq_u8(a_max, hev_thresh_v)
}
#[rite]
fn flip_sign_neon(_token: NeonToken, v: uint8x16_t) -> int8x16_t {
let sign_bit = vdupq_n_u8(0x80);
vreinterpretq_s8_u8(veorq_u8(v, sign_bit))
}
#[rite]
fn flip_sign_back_neon(_token: NeonToken, v: int8x16_t) -> uint8x16_t {
let sign_bit = vdupq_n_s8(-128);
vreinterpretq_u8_s8(veorq_s8(v, sign_bit))
}
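/// Base filter delta clamp(p1 - q1) + 3 * (q0 - p0), built in the
/// sign-flipped i8 domain: the chained saturating adds of `q0_p0` accumulate
/// the 3x term while clamping every intermediate step.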
#[rite]
fn get_base_delta_neon(
_token: NeonToken,
p1s: int8x16_t,
p0s: int8x16_t,
q0s: int8x16_t,
q1s: int8x16_t,
) -> int8x16_t {
let q0_p0 = vqsubq_s8(q0s, p0s);
let p1_q1 = vqsubq_s8(p1s, q1s);
let s1 = vqaddq_s8(p1_q1, q0_p0);
let s2 = vqaddq_s8(q0_p0, s1);
vqaddq_s8(q0_p0, s2)
}
#[rite]
fn get_base_delta0_neon(_token: NeonToken, p0s: int8x16_t, q0s: int8x16_t) -> int8x16_t {
let q0_p0 = vqsubq_s8(q0s, p0s);
let s1 = vqaddq_s8(q0_p0, q0_p0);
vqaddq_s8(q0_p0, s1)
}
#[rite]
fn apply_filter2_no_flip_neon(
_token: NeonToken,
p0s: int8x16_t,
q0s: int8x16_t,
delta: int8x16_t,
) -> (int8x16_t, int8x16_t) {
let k3 = vdupq_n_s8(0x03);
let k4 = vdupq_n_s8(0x04);
let delta_p3 = vqaddq_s8(delta, k3);
let delta_p4 = vqaddq_s8(delta, k4);
let delta3 = vshrq_n_s8::<3>(delta_p3);
let delta4 = vshrq_n_s8::<3>(delta_p4);
let op0 = vqaddq_s8(p0s, delta3);
let oq0 = vqsubq_s8(q0s, delta4);
(op0, oq0)
}
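/// Simple-filter kernel: masks the base delta, then applies the
/// (a + 3) >> 3 / (a + 4) >> 3 split to p0/q0 and flips the sign bias back.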
#[rite]
fn do_filter2_neon(
_token: NeonToken,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
mask: uint8x16_t,
) -> (uint8x16_t, uint8x16_t) {
let p1s = flip_sign_neon(_token, p1);
let p0s = flip_sign_neon(_token, p0);
let q0s = flip_sign_neon(_token, q0);
let q1s = flip_sign_neon(_token, q1);
let delta0 = get_base_delta_neon(_token, p1s, p0s, q0s, q1s);
let delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
let k3 = vdupq_n_s8(0x03);
let k4 = vdupq_n_s8(0x04);
let delta_p3 = vqaddq_s8(delta1, k3);
let delta_p4 = vqaddq_s8(delta1, k4);
let delta3 = vshrq_n_s8::<3>(delta_p3);
let delta4 = vshrq_n_s8::<3>(delta_p4);
let sp0 = vqaddq_s8(p0s, delta3);
let sq0 = vqsubq_s8(q0s, delta4);
(
flip_sign_back_neon(_token, sp0),
flip_sign_back_neon(_token, sq0),
)
}
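/// Normal subblock filter: lanes flagged by `hev_mask` get the simple filter
/// with outer taps; the remaining masked lanes get the 3 * (q0 - p0) delta on
/// p0/q0 plus a rounded half-strength adjustment a3 = (a1 + 1) >> 1 on p1/q1.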
#[rite]
fn do_filter4_neon(
_token: NeonToken,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
mask: uint8x16_t,
hev_mask: uint8x16_t,
) -> (uint8x16_t, uint8x16_t, uint8x16_t, uint8x16_t) {
let p1s = flip_sign_neon(_token, p1);
let mut p0s = flip_sign_neon(_token, p0);
let mut q0s = flip_sign_neon(_token, q0);
let q1s = flip_sign_neon(_token, q1);
let simple_lf_mask = vandq_u8(mask, hev_mask);
{
let delta = get_base_delta_neon(_token, p1s, p0s, q0s, q1s);
let simple_lf_delta = vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
let (new_p0s, new_q0s) = apply_filter2_no_flip_neon(_token, p0s, q0s, simple_lf_delta);
p0s = new_p0s;
q0s = new_q0s;
}
let delta0 = get_base_delta0_neon(_token, p0s, q0s);
let complex_lf_mask = veorq_u8(simple_lf_mask, mask);
let complex_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
let k3 = vdupq_n_s8(0x03);
let k4 = vdupq_n_s8(0x04);
let delta1 = vqaddq_s8(complex_lf_delta, k4);
let delta2 = vqaddq_s8(complex_lf_delta, k3);
let a1 = vshrq_n_s8::<3>(delta1);
let a2 = vshrq_n_s8::<3>(delta2);
let a3 = vrshrq_n_s8::<1>(a1);
let op0 = flip_sign_back_neon(_token, vqaddq_s8(p0s, a2));
let oq0 = flip_sign_back_neon(_token, vqsubq_s8(q0s, a1));
let op1 = flip_sign_back_neon(_token, vqaddq_s8(p1s, a3));
let oq1 = flip_sign_back_neon(_token, vqsubq_s8(q1s, a3));
(op1, op0, oq0, oq1)
}
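/// Macroblock-edge filter: non-hev lanes receive the three graded deltas
/// a1 = (27w + 63) >> 7, a2 = (18w + 63) >> 7, and a3 = (9w + 63) >> 7 on
/// p0/q0, p1/q1, and p2/q2. The 9w and 27w products are formed with widening
/// multiply-accumulates and narrowed back with rounding, saturating shifts.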
#[rite]
#[allow(clippy::type_complexity)]
fn do_filter6_neon(
_token: NeonToken,
p2: uint8x16_t,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
q2: uint8x16_t,
mask: uint8x16_t,
hev_mask: uint8x16_t,
) -> (
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
) {
let p2s = flip_sign_neon(_token, p2);
let p1s = flip_sign_neon(_token, p1);
let mut p0s = flip_sign_neon(_token, p0);
let mut q0s = flip_sign_neon(_token, q0);
let q1s = flip_sign_neon(_token, q1);
let q2s = flip_sign_neon(_token, q2);
let simple_lf_mask = vandq_u8(mask, hev_mask);
let delta0 = get_base_delta_neon(_token, p1s, p0s, q0s, q1s);
{
let simple_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
let (new_p0s, new_q0s) = apply_filter2_no_flip_neon(_token, p0s, q0s, simple_lf_delta);
p0s = new_p0s;
q0s = new_q0s;
}
let complex_lf_mask = veorq_u8(simple_lf_mask, mask);
let complex_lf_delta = vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
let delta_lo = vget_low_s8(complex_lf_delta);
let delta_hi = vget_high_s8(complex_lf_delta);
let k9 = vdup_n_s8(9);
let km1 = vdupq_n_s16(-1);
let k18 = vdup_n_s8(18);
let s_lo = vmlal_s8(km1, k9, delta_lo);
let s_hi = vmlal_s8(km1, k9, delta_hi);
let z_lo = vmlal_s8(s_lo, k18, delta_lo);
let z_hi = vmlal_s8(s_hi, k18, delta_hi);
let a3_lo = vqrshrn_n_s16::<7>(s_lo);
let a3_hi = vqrshrn_n_s16::<7>(s_hi);
let a2_lo = vqrshrn_n_s16::<6>(s_lo);
let a2_hi = vqrshrn_n_s16::<6>(s_hi);
let a1_lo = vqrshrn_n_s16::<7>(z_lo);
let a1_hi = vqrshrn_n_s16::<7>(z_hi);
let a1 = vcombine_s8(a1_lo, a1_hi);
let a2 = vcombine_s8(a2_lo, a2_hi);
let a3 = vcombine_s8(a3_lo, a3_hi);
let op0 = flip_sign_back_neon(_token, vqaddq_s8(p0s, a1));
let oq0 = flip_sign_back_neon(_token, vqsubq_s8(q0s, a1));
let op1 = flip_sign_back_neon(_token, vqaddq_s8(p1s, a2));
let oq1 = flip_sign_back_neon(_token, vqsubq_s8(q1s, a2));
let op2 = flip_sign_back_neon(_token, vqaddq_s8(p2s, a3));
let oq2 = flip_sign_back_neon(_token, vqsubq_s8(q2s, a3));
(op2, op1, op0, oq0, oq1, oq2)
}
#[rite]
fn load_16x4_neon(
_token: NeonToken,
buf: &[u8],
point: usize,
stride: usize,
) -> (uint8x16_t, uint8x16_t, uint8x16_t, uint8x16_t) {
let p1 =
simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point - 2 * stride..][..16]).unwrap());
let p0 = simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point - stride..][..16]).unwrap());
let q0 = simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point..][..16]).unwrap());
let q1 = simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point + stride..][..16]).unwrap());
(p1, p0, q0, q1)
}
#[rite]
#[allow(clippy::type_complexity)]
fn load_16x8_neon(
_token: NeonToken,
buf: &[u8],
point: usize,
stride: usize,
) -> (
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
) {
let p3 =
simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point - 4 * stride..][..16]).unwrap());
let p2 =
simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point - 3 * stride..][..16]).unwrap());
let p1 =
simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point - 2 * stride..][..16]).unwrap());
let p0 = simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point - stride..][..16]).unwrap());
let q0 = simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point..][..16]).unwrap());
let q1 = simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point + stride..][..16]).unwrap());
let q2 =
simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point + 2 * stride..][..16]).unwrap());
let q3 =
simd_mem_neon::vld1q_u8(<&[u8; 16]>::try_from(&buf[point + 3 * stride..][..16]).unwrap());
(p3, p2, p1, p0, q0, q1, q2, q3)
}
#[rite]
fn store_16x2_neon(
_token: NeonToken,
buf: &mut [u8],
point: usize,
stride: usize,
p0: uint8x16_t,
q0: uint8x16_t,
) {
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point - stride..][..16]).unwrap(),
p0,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point..][..16]).unwrap(),
q0,
);
}
#[rite]
#[allow(clippy::type_complexity)]
fn load_8x8x2_neon(
_token: NeonToken,
u_buf: &[u8],
v_buf: &[u8],
point: usize,
stride: usize,
) -> (
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
) {
let p3 = vcombine_u8(
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&u_buf[point - 4 * stride..][..8]).unwrap()),
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&v_buf[point - 4 * stride..][..8]).unwrap()),
);
let p2 = vcombine_u8(
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&u_buf[point - 3 * stride..][..8]).unwrap()),
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&v_buf[point - 3 * stride..][..8]).unwrap()),
);
let p1 = vcombine_u8(
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&u_buf[point - 2 * stride..][..8]).unwrap()),
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&v_buf[point - 2 * stride..][..8]).unwrap()),
);
let p0 = vcombine_u8(
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&u_buf[point - stride..][..8]).unwrap()),
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&v_buf[point - stride..][..8]).unwrap()),
);
let q0 = vcombine_u8(
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&u_buf[point..][..8]).unwrap()),
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&v_buf[point..][..8]).unwrap()),
);
let q1 = vcombine_u8(
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&u_buf[point + stride..][..8]).unwrap()),
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&v_buf[point + stride..][..8]).unwrap()),
);
let q2 = vcombine_u8(
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&u_buf[point + 2 * stride..][..8]).unwrap()),
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&v_buf[point + 2 * stride..][..8]).unwrap()),
);
let q3 = vcombine_u8(
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&u_buf[point + 3 * stride..][..8]).unwrap()),
simd_mem_neon::vld1_u8(<&[u8; 8]>::try_from(&v_buf[point + 3 * stride..][..8]).unwrap()),
);
(p3, p2, p1, p0, q0, q1, q2, q3)
}
#[rite]
fn store_8x2x2_neon(
_token: NeonToken,
p0: uint8x16_t,
q0: uint8x16_t,
u_buf: &mut [u8],
v_buf: &mut [u8],
u_point: usize,
v_point: usize,
stride: usize,
) {
simd_mem_neon::vst1_u8(
<&mut [u8; 8]>::try_from(&mut u_buf[u_point - stride..][..8]).unwrap(),
vget_low_u8(p0),
);
simd_mem_neon::vst1_u8(
<&mut [u8; 8]>::try_from(&mut u_buf[u_point..][..8]).unwrap(),
vget_low_u8(q0),
);
simd_mem_neon::vst1_u8(
<&mut [u8; 8]>::try_from(&mut v_buf[v_point - stride..][..8]).unwrap(),
vget_high_u8(p0),
);
simd_mem_neon::vst1_u8(
<&mut [u8; 8]>::try_from(&mut v_buf[v_point..][..8]).unwrap(),
vget_high_u8(q0),
);
}
#[rite]
fn store_8x4x2_neon(
_token: NeonToken,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
u_buf: &mut [u8],
v_buf: &mut [u8],
u_point: usize,
v_point: usize,
stride: usize,
) {
store_8x2x2_neon(
_token,
p1,
p0,
u_buf,
v_buf,
u_point - stride,
v_point - stride,
stride,
);
store_8x2x2_neon(
_token,
q0,
q1,
u_buf,
v_buf,
u_point + stride,
v_point + stride,
stride,
);
}
#[rite]
fn store_8x6x2_neon(
_token: NeonToken,
op2: uint8x16_t,
op1: uint8x16_t,
op0: uint8x16_t,
oq0: uint8x16_t,
oq1: uint8x16_t,
oq2: uint8x16_t,
u_buf: &mut [u8],
v_buf: &mut [u8],
u_point: usize,
v_point: usize,
stride: usize,
) {
store_8x2x2_neon(
_token,
op2,
op1,
u_buf,
v_buf,
u_point - 2 * stride,
v_point - 2 * stride,
stride,
);
store_8x2x2_neon(_token, op0, oq0, u_buf, v_buf, u_point, v_point, stride);
store_8x2x2_neon(
_token,
oq1,
oq2,
u_buf,
v_buf,
u_point + 2 * stride,
v_point + 2 * stride,
stride,
);
}
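/// Gathers four adjacent columns (x0 - 2 ..= x0 + 1) across 16 consecutive
/// rows into byte arrays and reloads them as vectors: a scalar transpose that
/// lets the filter process the columns 16 lanes at a time.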
#[rite]
fn load_4x16_neon(
_token: NeonToken,
buf: &[u8],
x0: usize,
y_start: usize,
stride: usize,
) -> (uint8x16_t, uint8x16_t, uint8x16_t, uint8x16_t) {
let base = y_start * stride + x0 - 2;
let mut col0 = [0u8; 16];
let mut col1 = [0u8; 16];
let mut col2 = [0u8; 16];
let mut col3 = [0u8; 16];
for i in 0..16 {
let offset = base + i * stride;
col0[i] = buf[offset];
col1[i] = buf[offset + 1];
col2[i] = buf[offset + 2];
col3[i] = buf[offset + 3];
}
let p1 = simd_mem_neon::vld1q_u8(&col0);
let p0 = simd_mem_neon::vld1q_u8(&col1);
let q0 = simd_mem_neon::vld1q_u8(&col2);
let q1 = simd_mem_neon::vld1q_u8(&col3);
(p1, p0, q0, q1)
}
#[rite]
#[allow(clippy::type_complexity)]
fn load_8x16_neon(
_token: NeonToken,
buf: &[u8],
x0: usize,
y_start: usize,
stride: usize,
) -> (
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
) {
let (p3, p2, p1, p0) = load_4x16_neon(_token, buf, x0 - 2, y_start, stride);
let (q0, q1, q2, q3) = load_4x16_neon(_token, buf, x0 + 2, y_start, stride);
(p3, p2, p1, p0, q0, q1, q2, q3)
}
#[rite]
fn store_2x16_neon(
_token: NeonToken,
p0: uint8x16_t,
q0: uint8x16_t,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
) {
let mut p0_bytes = [0u8; 16];
let mut q0_bytes = [0u8; 16];
simd_mem_neon::vst1q_u8(&mut p0_bytes, p0);
simd_mem_neon::vst1q_u8(&mut q0_bytes, q0);
for i in 0..16 {
let offset = (y_start + i) * stride + x0 - 1;
buf[offset] = p0_bytes[i];
buf[offset + 1] = q0_bytes[i];
}
}
#[rite]
fn store_4x16_neon(
_token: NeonToken,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
) {
let mut p1_bytes = [0u8; 16];
let mut p0_bytes = [0u8; 16];
let mut q0_bytes = [0u8; 16];
let mut q1_bytes = [0u8; 16];
simd_mem_neon::vst1q_u8(&mut p1_bytes, p1);
simd_mem_neon::vst1q_u8(&mut p0_bytes, p0);
simd_mem_neon::vst1q_u8(&mut q0_bytes, q0);
simd_mem_neon::vst1q_u8(&mut q1_bytes, q1);
for i in 0..16 {
let offset = (y_start + i) * stride + x0 - 2;
buf[offset] = p1_bytes[i];
buf[offset + 1] = p0_bytes[i];
buf[offset + 2] = q0_bytes[i];
buf[offset + 3] = q1_bytes[i];
}
}
#[rite]
fn store_6x16_neon(
_token: NeonToken,
op2: uint8x16_t,
op1: uint8x16_t,
op0: uint8x16_t,
oq0: uint8x16_t,
oq1: uint8x16_t,
oq2: uint8x16_t,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
) {
let mut b2 = [0u8; 16];
let mut b1 = [0u8; 16];
let mut b0 = [0u8; 16];
let mut a0 = [0u8; 16];
let mut a1 = [0u8; 16];
let mut a2 = [0u8; 16];
simd_mem_neon::vst1q_u8(&mut b2, op2);
simd_mem_neon::vst1q_u8(&mut b1, op1);
simd_mem_neon::vst1q_u8(&mut b0, op0);
simd_mem_neon::vst1q_u8(&mut a0, oq0);
simd_mem_neon::vst1q_u8(&mut a1, oq1);
simd_mem_neon::vst1q_u8(&mut a2, oq2);
for i in 0..16 {
let offset = (y_start + i) * stride + x0 - 3;
buf[offset] = b2[i];
buf[offset + 1] = b1[i];
buf[offset + 2] = b0[i];
buf[offset + 3] = a0[i];
buf[offset + 4] = a1[i];
buf[offset + 5] = a2[i];
}
}
#[rite]
fn load_4x8x2_neon(
_token: NeonToken,
u_buf: &[u8],
v_buf: &[u8],
x0: usize,
y_start: usize,
stride: usize,
) -> (uint8x16_t, uint8x16_t, uint8x16_t, uint8x16_t) {
let base_u = y_start * stride + x0 - 2;
let base_v = y_start * stride + x0 - 2;
let mut col0 = [0u8; 16];
let mut col1 = [0u8; 16];
let mut col2 = [0u8; 16];
let mut col3 = [0u8; 16];
for i in 0..8 {
let u_off = base_u + i * stride;
let v_off = base_v + i * stride;
col0[i] = u_buf[u_off];
col1[i] = u_buf[u_off + 1];
col2[i] = u_buf[u_off + 2];
col3[i] = u_buf[u_off + 3];
col0[8 + i] = v_buf[v_off];
col1[8 + i] = v_buf[v_off + 1];
col2[8 + i] = v_buf[v_off + 2];
col3[8 + i] = v_buf[v_off + 3];
}
let p1 = simd_mem_neon::vld1q_u8(&col0);
let p0 = simd_mem_neon::vld1q_u8(&col1);
let q0 = simd_mem_neon::vld1q_u8(&col2);
let q1 = simd_mem_neon::vld1q_u8(&col3);
(p1, p0, q0, q1)
}
#[rite]
#[allow(clippy::type_complexity)]
fn load_8x8x2_h_neon(
_token: NeonToken,
u_buf: &[u8],
v_buf: &[u8],
x0: usize,
y_start: usize,
stride: usize,
) -> (
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
uint8x16_t,
) {
let (p3, p2, p1, p0) = load_4x8x2_neon(_token, u_buf, v_buf, x0 - 2, y_start, stride);
let (q0, q1, q2, q3) = load_4x8x2_neon(_token, u_buf, v_buf, x0 + 2, y_start, stride);
(p3, p2, p1, p0, q0, q1, q2, q3)
}
#[allow(dead_code)]
#[rite]
fn store_2x8x2_neon(
_token: NeonToken,
p0: uint8x16_t,
q0: uint8x16_t,
u_buf: &mut [u8],
v_buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
) {
let mut p0_bytes = [0u8; 16];
let mut q0_bytes = [0u8; 16];
simd_mem_neon::vst1q_u8(&mut p0_bytes, p0);
simd_mem_neon::vst1q_u8(&mut q0_bytes, q0);
for i in 0..8 {
let offset = (y_start + i) * stride + x0 - 1;
u_buf[offset] = p0_bytes[i];
u_buf[offset + 1] = q0_bytes[i];
}
for i in 0..8 {
let offset = (y_start + i) * stride + x0 - 1;
v_buf[offset] = p0_bytes[8 + i];
v_buf[offset + 1] = q0_bytes[8 + i];
}
}
#[rite]
fn store_4x8x2_neon(
_token: NeonToken,
p1: uint8x16_t,
p0: uint8x16_t,
q0: uint8x16_t,
q1: uint8x16_t,
u_buf: &mut [u8],
v_buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
) {
let mut b1 = [0u8; 16];
let mut b0 = [0u8; 16];
let mut a0 = [0u8; 16];
let mut a1 = [0u8; 16];
simd_mem_neon::vst1q_u8(&mut b1, p1);
simd_mem_neon::vst1q_u8(&mut b0, p0);
simd_mem_neon::vst1q_u8(&mut a0, q0);
simd_mem_neon::vst1q_u8(&mut a1, q1);
for i in 0..8 {
let offset = (y_start + i) * stride + x0 - 2;
u_buf[offset] = b1[i];
u_buf[offset + 1] = b0[i];
u_buf[offset + 2] = a0[i];
u_buf[offset + 3] = a1[i];
}
for i in 0..8 {
let offset = (y_start + i) * stride + x0 - 2;
v_buf[offset] = b1[8 + i];
v_buf[offset + 1] = b0[8 + i];
v_buf[offset + 2] = a0[8 + i];
v_buf[offset + 3] = a1[8 + i];
}
}
#[rite]
fn store_6x8x2_neon(
_token: NeonToken,
op2: uint8x16_t,
op1: uint8x16_t,
op0: uint8x16_t,
oq0: uint8x16_t,
oq1: uint8x16_t,
oq2: uint8x16_t,
u_buf: &mut [u8],
v_buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
) {
let mut b2 = [0u8; 16];
let mut b1 = [0u8; 16];
let mut b0 = [0u8; 16];
let mut a0 = [0u8; 16];
let mut a1 = [0u8; 16];
let mut a2 = [0u8; 16];
simd_mem_neon::vst1q_u8(&mut b2, op2);
simd_mem_neon::vst1q_u8(&mut b1, op1);
simd_mem_neon::vst1q_u8(&mut b0, op0);
simd_mem_neon::vst1q_u8(&mut a0, oq0);
simd_mem_neon::vst1q_u8(&mut a1, oq1);
simd_mem_neon::vst1q_u8(&mut a2, oq2);
for i in 0..8 {
let offset = (y_start + i) * stride + x0 - 3;
u_buf[offset] = b2[i];
u_buf[offset + 1] = b1[i];
u_buf[offset + 2] = b0[i];
u_buf[offset + 3] = a0[i];
u_buf[offset + 4] = a1[i];
u_buf[offset + 5] = a2[i];
}
for i in 0..8 {
let offset = (y_start + i) * stride + x0 - 3;
v_buf[offset] = b2[8 + i];
v_buf[offset + 1] = b1[8 + i];
v_buf[offset + 2] = b0[8 + i];
v_buf[offset + 3] = a0[8 + i];
v_buf[offset + 4] = a1[8 + i];
v_buf[offset + 5] = a2[8 + i];
}
}
#[rite]
pub(crate) fn simple_v_filter16_neon(
_token: NeonToken,
buf: &mut [u8],
point: usize,
stride: usize,
thresh: i32,
) {
let (p1, p0, q0, q1) = load_16x4_neon(_token, buf, point, stride);
let mask = needs_filter_neon(_token, p1, p0, q0, q1, thresh);
let (op0, oq0) = do_filter2_neon(_token, p1, p0, q0, q1, mask);
store_16x2_neon(_token, buf, point, stride, op0, oq0);
}
#[rite]
pub(crate) fn simple_h_filter16_neon(
_token: NeonToken,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
thresh: i32,
) {
let (p1, p0, q0, q1) = load_4x16_neon(_token, buf, x0, y_start, stride);
let mask = needs_filter_neon(_token, p1, p0, q0, q1, thresh);
let (op0, oq0) = do_filter2_neon(_token, p1, p0, q0, q1, mask);
store_2x16_neon(_token, op0, oq0, buf, x0, y_start, stride);
}
#[rite]
pub(crate) fn normal_v_filter16_inner_neon(
_token: NeonToken,
buf: &mut [u8],
point: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let (p3, p2, p1, p0, q0, q1, q2, q3) = load_16x8_neon(_token, buf, point, stride);
let mask = needs_filter2_neon(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
interior_limit,
edge_limit,
);
let hev_mask = needs_hev_neon(_token, p1, p0, q0, q1, hev_threshold);
let (op1, op0, oq0, oq1) = do_filter4_neon(_token, p1, p0, q0, q1, mask, hev_mask);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point - 2 * stride..][..16]).unwrap(),
op1,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point - stride..][..16]).unwrap(),
op0,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point..][..16]).unwrap(),
oq0,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point + stride..][..16]).unwrap(),
oq1,
);
}
#[rite]
pub(crate) fn normal_v_filter16_edge_neon(
_token: NeonToken,
buf: &mut [u8],
point: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let (p3, p2, p1, p0, q0, q1, q2, q3) = load_16x8_neon(_token, buf, point, stride);
let mask = needs_filter2_neon(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
interior_limit,
edge_limit,
);
let hev_mask = needs_hev_neon(_token, p1, p0, q0, q1, hev_threshold);
let (op2, op1, op0, oq0, oq1, oq2) =
do_filter6_neon(_token, p2, p1, p0, q0, q1, q2, mask, hev_mask);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point - 3 * stride..][..16]).unwrap(),
op2,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point - 2 * stride..][..16]).unwrap(),
op1,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point - stride..][..16]).unwrap(),
op0,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point..][..16]).unwrap(),
oq0,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point + stride..][..16]).unwrap(),
oq1,
);
simd_mem_neon::vst1q_u8(
<&mut [u8; 16]>::try_from(&mut buf[point + 2 * stride..][..16]).unwrap(),
oq2,
);
}
#[rite]
pub(crate) fn normal_h_filter16_inner_neon(
_token: NeonToken,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let (p3, p2, p1, p0, q0, q1, q2, q3) = load_8x16_neon(_token, buf, x0, y_start, stride);
let mask = needs_filter2_neon(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
interior_limit,
edge_limit,
);
let hev_mask = needs_hev_neon(_token, p1, p0, q0, q1, hev_threshold);
let (op1, op0, oq0, oq1) = do_filter4_neon(_token, p1, p0, q0, q1, mask, hev_mask);
store_4x16_neon(_token, op1, op0, oq0, oq1, buf, x0, y_start, stride);
}
#[rite]
pub(crate) fn normal_h_filter16_edge_neon(
_token: NeonToken,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let (p3, p2, p1, p0, q0, q1, q2, q3) = load_8x16_neon(_token, buf, x0, y_start, stride);
let mask = needs_filter2_neon(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
interior_limit,
edge_limit,
);
let hev_mask = needs_hev_neon(_token, p1, p0, q0, q1, hev_threshold);
let (op2, op1, op0, oq0, oq1, oq2) =
do_filter6_neon(_token, p2, p1, p0, q0, q1, q2, mask, hev_mask);
store_6x16_neon(
_token, op2, op1, op0, oq0, oq1, oq2, buf, x0, y_start, stride,
);
}
#[rite]
pub(crate) fn normal_v_filter_uv_edge_neon(
_token: NeonToken,
u_buf: &mut [u8],
v_buf: &mut [u8],
point: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let (p3, p2, p1, p0, q0, q1, q2, q3) = load_8x8x2_neon(_token, u_buf, v_buf, point, stride);
let mask = needs_filter2_neon(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
interior_limit,
edge_limit,
);
let hev_mask = needs_hev_neon(_token, p1, p0, q0, q1, hev_threshold);
let (op2, op1, op0, oq0, oq1, oq2) =
do_filter6_neon(_token, p2, p1, p0, q0, q1, q2, mask, hev_mask);
store_8x6x2_neon(
_token, op2, op1, op0, oq0, oq1, oq2, u_buf, v_buf, point, point, stride,
);
}
#[rite]
pub(crate) fn normal_v_filter_uv_inner_neon(
_token: NeonToken,
u_buf: &mut [u8],
v_buf: &mut [u8],
point: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let (p3, p2, p1, p0, q0, q1, q2, q3) = load_8x8x2_neon(_token, u_buf, v_buf, point, stride);
let mask = needs_filter2_neon(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
interior_limit,
edge_limit,
);
let hev_mask = needs_hev_neon(_token, p1, p0, q0, q1, hev_threshold);
let (op1, op0, oq0, oq1) = do_filter4_neon(_token, p1, p0, q0, q1, mask, hev_mask);
store_8x4x2_neon(
_token, op1, op0, oq0, oq1, u_buf, v_buf, point, point, stride,
);
}
#[rite]
pub(crate) fn normal_h_filter_uv_edge_neon(
_token: NeonToken,
u_buf: &mut [u8],
v_buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let (p3, p2, p1, p0, q0, q1, q2, q3) =
load_8x8x2_h_neon(_token, u_buf, v_buf, x0, y_start, stride);
let mask = needs_filter2_neon(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
interior_limit,
edge_limit,
);
let hev_mask = needs_hev_neon(_token, p1, p0, q0, q1, hev_threshold);
let (op2, op1, op0, oq0, oq1, oq2) =
do_filter6_neon(_token, p2, p1, p0, q0, q1, q2, mask, hev_mask);
store_6x8x2_neon(
_token, op2, op1, op0, oq0, oq1, oq2, u_buf, v_buf, x0, y_start, stride,
);
}
#[rite]
pub(crate) fn normal_h_filter_uv_inner_neon(
_token: NeonToken,
u_buf: &mut [u8],
v_buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let (p3, p2, p1, p0, q0, q1, q2, q3) =
load_8x8x2_h_neon(_token, u_buf, v_buf, x0, y_start, stride);
let mask = needs_filter2_neon(
_token,
p3,
p2,
p1,
p0,
q0,
q1,
q2,
q3,
interior_limit,
edge_limit,
);
let hev_mask = needs_hev_neon(_token, p1, p0, q0, q1, hev_threshold);
let (op1, op0, oq0, oq1) = do_filter4_neon(_token, p1, p0, q0, q1, mask, hev_mask);
store_4x8x2_neon(
_token, op1, op0, oq0, oq1, u_buf, v_buf, x0, y_start, stride,
);
}
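/// Builds a v128 from a 16-byte array lane by lane; the wasm32 SIMD128 port
/// of the same filter kernels starts here, with the filter arithmetic kept
/// in v128 registers.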
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn load_u8x16(a: &[u8; 16]) -> v128 {
u8x16(
a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11], a[12], a[13],
a[14], a[15],
)
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn store_u8x16(out: &mut [u8; 16], v: v128) {
out[0] = u8x16_extract_lane::<0>(v);
out[1] = u8x16_extract_lane::<1>(v);
out[2] = u8x16_extract_lane::<2>(v);
out[3] = u8x16_extract_lane::<3>(v);
out[4] = u8x16_extract_lane::<4>(v);
out[5] = u8x16_extract_lane::<5>(v);
out[6] = u8x16_extract_lane::<6>(v);
out[7] = u8x16_extract_lane::<7>(v);
out[8] = u8x16_extract_lane::<8>(v);
out[9] = u8x16_extract_lane::<9>(v);
out[10] = u8x16_extract_lane::<10>(v);
out[11] = u8x16_extract_lane::<11>(v);
out[12] = u8x16_extract_lane::<12>(v);
out[13] = u8x16_extract_lane::<13>(v);
out[14] = u8x16_extract_lane::<14>(v);
out[15] = u8x16_extract_lane::<15>(v);
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn load_row(buf: &[u8], offset: usize) -> v128 {
load_u8x16(<&[u8; 16]>::try_from(&buf[offset..offset + 16]).unwrap())
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn store_row(buf: &mut [u8], offset: usize, v: v128) {
store_u8x16(
<&mut [u8; 16]>::try_from(&mut buf[offset..offset + 16]).unwrap(),
v,
);
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn load_col(buf: &[u8], base: usize, stride: usize) -> v128 {
u8x16(
buf[base],
buf[base + stride],
buf[base + stride * 2],
buf[base + stride * 3],
buf[base + stride * 4],
buf[base + stride * 5],
buf[base + stride * 6],
buf[base + stride * 7],
buf[base + stride * 8],
buf[base + stride * 9],
buf[base + stride * 10],
buf[base + stride * 11],
buf[base + stride * 12],
buf[base + stride * 13],
buf[base + stride * 14],
buf[base + stride * 15],
)
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn store_col(buf: &mut [u8], base: usize, stride: usize, v: v128) {
buf[base] = u8x16_extract_lane::<0>(v);
buf[base + stride] = u8x16_extract_lane::<1>(v);
buf[base + stride * 2] = u8x16_extract_lane::<2>(v);
buf[base + stride * 3] = u8x16_extract_lane::<3>(v);
buf[base + stride * 4] = u8x16_extract_lane::<4>(v);
buf[base + stride * 5] = u8x16_extract_lane::<5>(v);
buf[base + stride * 6] = u8x16_extract_lane::<6>(v);
buf[base + stride * 7] = u8x16_extract_lane::<7>(v);
buf[base + stride * 8] = u8x16_extract_lane::<8>(v);
buf[base + stride * 9] = u8x16_extract_lane::<9>(v);
buf[base + stride * 10] = u8x16_extract_lane::<10>(v);
buf[base + stride * 11] = u8x16_extract_lane::<11>(v);
buf[base + stride * 12] = u8x16_extract_lane::<12>(v);
buf[base + stride * 13] = u8x16_extract_lane::<13>(v);
buf[base + stride * 14] = u8x16_extract_lane::<14>(v);
buf[base + stride * 15] = u8x16_extract_lane::<15>(v);
}
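// Absolute difference of unsigned lanes, computed as max(a, b) - min(a, b).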
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn abd_u8x16(a: v128, b: v128) -> v128 {
u8x16_sub(u8x16_max(a, b), u8x16_min(a, b))
}
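// Per-lane form of the simple filter threshold:
// 2 * |p0 - q0| + |p1 - q1| / 2 <= thresh.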
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn needs_filter(p1: v128, p0: v128, q0: v128, q1: v128, thresh: i32) -> v128 {
let thresh_v = u8x16_splat(thresh as u8);
let a_p0_q0 = abd_u8x16(p0, q0);
let a_p1_q1 = abd_u8x16(p1, q1);
let a_p0_q0_2 = u8x16_add_sat(a_p0_q0, a_p0_q0);
let a_p1_q1_2 = u8x16_shr(a_p1_q1, 1);
let sum = u8x16_add_sat(a_p0_q0_2, a_p1_q1_2);
u8x16_le(sum, thresh_v)
}
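// Full filter mask for the normal filter: the simple threshold plus the six
// interior-limit checks |p3-p2| .. |q2-q3|, mirroring the scalar
// should_filter tests.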
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn needs_filter2(
p3: v128,
p2: v128,
p1: v128,
p0: v128,
q0: v128,
q1: v128,
q2: v128,
q3: v128,
ithresh: i32,
thresh: i32,
) -> v128 {
let it = u8x16_splat(ithresh as u8);
let mask = needs_filter(p1, p0, q0, q1, thresh);
let m1 = u8x16_le(abd_u8x16(p3, p2), it);
let m2 = u8x16_le(abd_u8x16(p2, p1), it);
let m3 = u8x16_le(abd_u8x16(p1, p0), it);
let m4 = u8x16_le(abd_u8x16(q0, q1), it);
let m5 = u8x16_le(abd_u8x16(q1, q2), it);
let m6 = u8x16_le(abd_u8x16(q2, q3), it);
v128_and(
mask,
v128_and(
v128_and(m1, m2),
v128_and(v128_and(m3, m4), v128_and(m5, m6)),
),
)
}
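// High-edge-variance mask: set where |p1 - p0| or |q1 - q0| exceeds the
// threshold.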
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn hev(p1: v128, p0: v128, q0: v128, q1: v128, thresh: i32) -> v128 {
let t = u8x16_splat(thresh as u8);
let h1 = u8x16_gt(abd_u8x16(p1, p0), t);
let h2 = u8x16_gt(abd_u8x16(q1, q0), t);
v128_or(h1, h2)
}
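// Two-pixel (simple) filter. XORing with 0x80 maps the unsigned pixels into
// signed range so the adjustment can use saturating i8 arithmetic:
// a = clamp(p1 - q1) + 3 * clamp(q0 - p0), then q0 -= (a + 4) >> 3 and
// p0 += (a + 3) >> 3 on masked lanes.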
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn do_filter2(p1: v128, p0: &mut v128, q0: &mut v128, q1: v128, mask: v128) {
let sign = u8x16_splat(0x80);
let sp1 = v128_xor(p1, sign);
let sp0 = v128_xor(*p0, sign);
let sq0 = v128_xor(*q0, sign);
let sq1 = v128_xor(q1, sign);
let a0 = i8x16_sub_sat(sp1, sq1);
let a1 = i8x16_sub_sat(sq0, sp0);
let a2 = i8x16_add_sat(a1, a1);
let a3 = i8x16_add_sat(a2, a1);
let a = i8x16_add_sat(a0, a3);
let a_masked = v128_and(a, mask);
let f1 = i8x16_shr(i8x16_add_sat(a_masked, i8x16_splat(4)), 3);
let f2 = i8x16_shr(i8x16_add_sat(a_masked, i8x16_splat(3)), 3);
*q0 = v128_xor(i8x16_sub_sat(sq0, f1), sign);
*p0 = v128_xor(i8x16_add_sat(sp0, f2), sign);
}
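// Four-pixel (subblock) filter: the outer taps (p1 - q1) contribute only on
// high-variance lanes, and p1/q1 receive the extra (f1 + 1) >> 1 adjustment
// only on the remaining lanes.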
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn do_filter4(
p1: &mut v128,
p0: &mut v128,
q0: &mut v128,
q1: &mut v128,
mask: v128,
hev_mask: v128,
) {
let sign = u8x16_splat(0x80);
let sp1 = v128_xor(*p1, sign);
let sp0 = v128_xor(*p0, sign);
let sq0 = v128_xor(*q0, sign);
let sq1 = v128_xor(*q1, sign);
let a0 = i8x16_sub_sat(sp1, sq1);
let a0_hev = v128_and(a0, hev_mask);
let a1 = i8x16_sub_sat(sq0, sp0);
let a2 = i8x16_add_sat(a1, a1);
let a3 = i8x16_add_sat(a2, a1);
let a = i8x16_add_sat(a0_hev, a3);
let a_masked = v128_and(a, mask);
let f1 = i8x16_shr(i8x16_add_sat(a_masked, i8x16_splat(4)), 3);
let f2 = i8x16_shr(i8x16_add_sat(a_masked, i8x16_splat(3)), 3);
let new_q0 = v128_xor(i8x16_sub_sat(sq0, f1), sign);
let new_p0 = v128_xor(i8x16_add_sat(sp0, f2), sign);
let f3 = i8x16_add_sat(f1, i8x16_splat(1));
let f3 = i8x16_shr(f3, 1);
let not_hev = v128_not(hev_mask);
let f3_masked = v128_and(f3, not_hev);
*q0 = new_q0;
*p0 = new_p0;
*q1 = v128_xor(i8x16_sub_sat(sq1, f3_masked), sign);
*p1 = v128_xor(i8x16_add_sat(sp1, f3_masked), sign);
}
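// Six-pixel (macroblock-edge) filter: lanes flagged by hev_mask get the
// two-pixel adjustment above; the other masked lanes get the wide filter,
// with taps of 27, 18 and 9 (each rounded by 63 and shifted by 7) applied
// to p0/q0, p1/q1 and p2/q2 respectively.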
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn do_filter6(
p2: &mut v128,
p1: &mut v128,
p0: &mut v128,
q0: &mut v128,
q1: &mut v128,
q2: &mut v128,
mask: v128,
hev_mask: v128,
) {
let sign = u8x16_splat(0x80);
let sp2 = v128_xor(*p2, sign);
let sp1 = v128_xor(*p1, sign);
let sp0 = v128_xor(*p0, sign);
let sq0 = v128_xor(*q0, sign);
let sq1 = v128_xor(*q1, sign);
let sq2 = v128_xor(*q2, sign);
let a0 = i8x16_sub_sat(sp1, sq1);
let a1 = i8x16_sub_sat(sq0, sp0);
let a2 = i8x16_add_sat(a1, a1);
let a3 = i8x16_add_sat(a2, a1);
let a = i8x16_add_sat(a0, a3);
let a_masked = v128_and(a, mask);
let f1 = i8x16_shr(i8x16_add_sat(a_masked, i8x16_splat(4)), 3);
let f2 = i8x16_shr(i8x16_add_sat(a_masked, i8x16_splat(3)), 3);
let hev_q0 = v128_xor(i8x16_sub_sat(sq0, f1), sign);
let hev_p0 = v128_xor(i8x16_add_sat(sp0, f2), sign);
let not_hev = v128_and(mask, v128_not(hev_mask));
// Wide-filter delta: reuse a = clamp(p1 - q1) + 3 * clamp(q0 - p0) and zero
// it on lanes that fail the mask or are high-variance, so the wide taps
// leave those lanes unchanged.
let w = v128_and(a, not_hev);
let w_lo = i16x8_extend_low_i8x16(w);
let w_hi = i16x8_extend_high_i8x16(w);
let round = i16x8_splat(63);
let w27_lo = i16x8_mul(w_lo, i16x8_splat(27));
let w27_hi = i16x8_mul(w_hi, i16x8_splat(27));
let a_27_lo = i16x8_shr(i16x8_add(w27_lo, round), 7);
let a_27_hi = i16x8_shr(i16x8_add(w27_hi, round), 7);
let a27 = i8x16_narrow_i16x8(a_27_lo, a_27_hi);
let w18_lo = i16x8_mul(w_lo, i16x8_splat(18));
let w18_hi = i16x8_mul(w_hi, i16x8_splat(18));
let a_18_lo = i16x8_shr(i16x8_add(w18_lo, round), 7);
let a_18_hi = i16x8_shr(i16x8_add(w18_hi, round), 7);
let a18 = i8x16_narrow_i16x8(a_18_lo, a_18_hi);
let w9_lo = i16x8_mul(w_lo, i16x8_splat(9));
let w9_hi = i16x8_mul(w_hi, i16x8_splat(9));
let a_9_lo = i16x8_shr(i16x8_add(w9_lo, round), 7);
let a_9_hi = i16x8_shr(i16x8_add(w9_hi, round), 7);
let a9 = i8x16_narrow_i16x8(a_9_lo, a_9_hi);
let wide_q0 = v128_xor(i8x16_sub_sat(sq0, a27), sign);
let wide_p0 = v128_xor(i8x16_add_sat(sp0, a27), sign);
let wide_q1 = v128_xor(i8x16_sub_sat(sq1, a18), sign);
let wide_p1 = v128_xor(i8x16_add_sat(sp1, a18), sign);
let wide_q2 = v128_xor(i8x16_sub_sat(sq2, a9), sign);
let wide_p2 = v128_xor(i8x16_add_sat(sp2, a9), sign);
// v128_bitselect takes its first operand where the mask bit is set.
*q0 = v128_bitselect(hev_q0, wide_q0, hev_mask);
*p0 = v128_bitselect(hev_p0, wide_p0, hev_mask);
*q1 = v128_bitselect(wide_q1, *q1, not_hev);
*p1 = v128_bitselect(wide_p1, *p1, not_hev);
*q2 = v128_bitselect(wide_q2, *q2, not_hev);
*p2 = v128_bitselect(wide_p2, *p2, not_hev);
}
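// Simple-filter entry points over a 16-pixel luma edge. The _v_ variants
// read whole rows around `point`; the _h_ variants gather one 16-pixel
// column per tap.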
#[rite]
pub(crate) fn simple_v_filter16_wasm(
_token: Wasm128Token,
buf: &mut [u8],
point: usize,
stride: usize,
thresh: i32,
) {
let p1 = load_row(buf, point - 2 * stride);
let mut p0 = load_row(buf, point - stride);
let mut q0 = load_row(buf, point);
let q1 = load_row(buf, point + stride);
let mask = needs_filter(p1, p0, q0, q1, thresh);
do_filter2(p1, &mut p0, &mut q0, q1, mask);
store_row(buf, point - stride, p0);
store_row(buf, point, q0);
}
#[rite]
pub(crate) fn simple_h_filter16_wasm(
_token: Wasm128Token,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
thresh: i32,
) {
let base = y_start * stride + x0;
let p1 = load_col(buf, base - 2, stride);
let mut p0 = load_col(buf, base - 1, stride);
let mut q0 = load_col(buf, base, stride);
let q1 = load_col(buf, base + 1, stride);
let mask = needs_filter(p1, p0, q0, q1, thresh);
do_filter2(p1, &mut p0, &mut q0, q1, mask);
store_col(buf, base - 1, stride, p0);
store_col(buf, base, stride, q0);
}
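// Normal-filter entry points: load the eight taps around the edge, build
// the combined filter mask and the hev mask, then run the four-tap kernel
// on inner edges or the six-tap kernel on macroblock edges.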
#[rite]
pub(crate) fn normal_v_filter16_inner_wasm(
_token: Wasm128Token,
buf: &mut [u8],
point: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let p3 = load_row(buf, point - 4 * stride);
let p2 = load_row(buf, point - 3 * stride);
let mut p1 = load_row(buf, point - 2 * stride);
let mut p0 = load_row(buf, point - stride);
let mut q0 = load_row(buf, point);
let mut q1 = load_row(buf, point + stride);
let q2 = load_row(buf, point + 2 * stride);
let q3 = load_row(buf, point + 3 * stride);
let mask = needs_filter2(p3, p2, p1, p0, q0, q1, q2, q3, interior_limit, edge_limit);
let hev_mask = hev(p1, p0, q0, q1, hev_threshold);
do_filter4(&mut p1, &mut p0, &mut q0, &mut q1, mask, hev_mask);
store_row(buf, point - 2 * stride, p1);
store_row(buf, point - stride, p0);
store_row(buf, point, q0);
store_row(buf, point + stride, q1);
}
#[rite]
pub(crate) fn normal_v_filter16_edge_wasm(
_token: Wasm128Token,
buf: &mut [u8],
point: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let p3 = load_row(buf, point - 4 * stride);
let mut p2 = load_row(buf, point - 3 * stride);
let mut p1 = load_row(buf, point - 2 * stride);
let mut p0 = load_row(buf, point - stride);
let mut q0 = load_row(buf, point);
let mut q1 = load_row(buf, point + stride);
let mut q2 = load_row(buf, point + 2 * stride);
let q3 = load_row(buf, point + 3 * stride);
let mask = needs_filter2(p3, p2, p1, p0, q0, q1, q2, q3, interior_limit, edge_limit);
let hev_mask = hev(p1, p0, q0, q1, hev_threshold);
do_filter6(
&mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev_mask,
);
store_row(buf, point - 3 * stride, p2);
store_row(buf, point - 2 * stride, p1);
store_row(buf, point - stride, p0);
store_row(buf, point, q0);
store_row(buf, point + stride, q1);
store_row(buf, point + 2 * stride, q2);
}
#[rite]
pub(crate) fn normal_h_filter16_inner_wasm(
_token: Wasm128Token,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let base = y_start * stride + x0;
let p3 = load_col(buf, base - 4, stride);
let p2 = load_col(buf, base - 3, stride);
let mut p1 = load_col(buf, base - 2, stride);
let mut p0 = load_col(buf, base - 1, stride);
let mut q0 = load_col(buf, base, stride);
let mut q1 = load_col(buf, base + 1, stride);
let q2 = load_col(buf, base + 2, stride);
let q3 = load_col(buf, base + 3, stride);
let mask = needs_filter2(p3, p2, p1, p0, q0, q1, q2, q3, interior_limit, edge_limit);
let hev_mask = hev(p1, p0, q0, q1, hev_threshold);
do_filter4(&mut p1, &mut p0, &mut q0, &mut q1, mask, hev_mask);
store_col(buf, base - 2, stride, p1);
store_col(buf, base - 1, stride, p0);
store_col(buf, base, stride, q0);
store_col(buf, base + 1, stride, q1);
}
#[rite]
pub(crate) fn normal_h_filter16_edge_wasm(
_token: Wasm128Token,
buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let base = y_start * stride + x0;
let p3 = load_col(buf, base - 4, stride);
let mut p2 = load_col(buf, base - 3, stride);
let mut p1 = load_col(buf, base - 2, stride);
let mut p0 = load_col(buf, base - 1, stride);
let mut q0 = load_col(buf, base, stride);
let mut q1 = load_col(buf, base + 1, stride);
let mut q2 = load_col(buf, base + 2, stride);
let q3 = load_col(buf, base + 3, stride);
let mask = needs_filter2(p3, p2, p1, p0, q0, q1, q2, q3, interior_limit, edge_limit);
let hev_mask = hev(p1, p0, q0, q1, hev_threshold);
do_filter6(
&mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev_mask,
);
store_col(buf, base - 3, stride, p2);
store_col(buf, base - 2, stride, p1);
store_col(buf, base - 1, stride, p0);
store_col(buf, base, stride, q0);
store_col(buf, base + 1, stride, q1);
store_col(buf, base + 2, stride, q2);
}
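// Chroma helpers: U occupies lanes 0..8 and V lanes 8..16 of each vector,
// so a single kernel invocation filters both 8-pixel chroma edges at once.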
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn load_uv_col(u_buf: &[u8], v_buf: &[u8], u_base: usize, v_base: usize, stride: usize) -> v128 {
u8x16(
u_buf[u_base],
u_buf[u_base + stride],
u_buf[u_base + stride * 2],
u_buf[u_base + stride * 3],
u_buf[u_base + stride * 4],
u_buf[u_base + stride * 5],
u_buf[u_base + stride * 6],
u_buf[u_base + stride * 7],
v_buf[v_base],
v_buf[v_base + stride],
v_buf[v_base + stride * 2],
v_buf[v_base + stride * 3],
v_buf[v_base + stride * 4],
v_buf[v_base + stride * 5],
v_buf[v_base + stride * 6],
v_buf[v_base + stride * 7],
)
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn store_uv_col(
u_buf: &mut [u8],
v_buf: &mut [u8],
u_base: usize,
v_base: usize,
stride: usize,
v: v128,
) {
u_buf[u_base] = u8x16_extract_lane::<0>(v);
u_buf[u_base + stride] = u8x16_extract_lane::<1>(v);
u_buf[u_base + stride * 2] = u8x16_extract_lane::<2>(v);
u_buf[u_base + stride * 3] = u8x16_extract_lane::<3>(v);
u_buf[u_base + stride * 4] = u8x16_extract_lane::<4>(v);
u_buf[u_base + stride * 5] = u8x16_extract_lane::<5>(v);
u_buf[u_base + stride * 6] = u8x16_extract_lane::<6>(v);
u_buf[u_base + stride * 7] = u8x16_extract_lane::<7>(v);
v_buf[v_base] = u8x16_extract_lane::<8>(v);
v_buf[v_base + stride] = u8x16_extract_lane::<9>(v);
v_buf[v_base + stride * 2] = u8x16_extract_lane::<10>(v);
v_buf[v_base + stride * 3] = u8x16_extract_lane::<11>(v);
v_buf[v_base + stride * 4] = u8x16_extract_lane::<12>(v);
v_buf[v_base + stride * 5] = u8x16_extract_lane::<13>(v);
v_buf[v_base + stride * 6] = u8x16_extract_lane::<14>(v);
v_buf[v_base + stride * 7] = u8x16_extract_lane::<15>(v);
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn load_uv_row(u_buf: &[u8], v_buf: &[u8], u_off: usize, v_off: usize) -> v128 {
u8x16(
u_buf[u_off],
u_buf[u_off + 1],
u_buf[u_off + 2],
u_buf[u_off + 3],
u_buf[u_off + 4],
u_buf[u_off + 5],
u_buf[u_off + 6],
u_buf[u_off + 7],
v_buf[v_off],
v_buf[v_off + 1],
v_buf[v_off + 2],
v_buf[v_off + 3],
v_buf[v_off + 4],
v_buf[v_off + 5],
v_buf[v_off + 6],
v_buf[v_off + 7],
)
}
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn store_uv_row(u_buf: &mut [u8], v_buf: &mut [u8], u_off: usize, v_off: usize, v: v128) {
u_buf[u_off] = u8x16_extract_lane::<0>(v);
u_buf[u_off + 1] = u8x16_extract_lane::<1>(v);
u_buf[u_off + 2] = u8x16_extract_lane::<2>(v);
u_buf[u_off + 3] = u8x16_extract_lane::<3>(v);
u_buf[u_off + 4] = u8x16_extract_lane::<4>(v);
u_buf[u_off + 5] = u8x16_extract_lane::<5>(v);
u_buf[u_off + 6] = u8x16_extract_lane::<6>(v);
u_buf[u_off + 7] = u8x16_extract_lane::<7>(v);
v_buf[v_off] = u8x16_extract_lane::<8>(v);
v_buf[v_off + 1] = u8x16_extract_lane::<9>(v);
v_buf[v_off + 2] = u8x16_extract_lane::<10>(v);
v_buf[v_off + 3] = u8x16_extract_lane::<11>(v);
v_buf[v_off + 4] = u8x16_extract_lane::<12>(v);
v_buf[v_off + 5] = u8x16_extract_lane::<13>(v);
v_buf[v_off + 6] = u8x16_extract_lane::<14>(v);
v_buf[v_off + 7] = u8x16_extract_lane::<15>(v);
}
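// Chroma entry points: the same kernels as for luma, run on U and V
// together through the packed-lane helpers above.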
#[rite]
pub(crate) fn normal_v_filter_uv_edge_wasm(
_token: Wasm128Token,
u_buf: &mut [u8],
v_buf: &mut [u8],
point: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let p3 = load_uv_row(u_buf, v_buf, point - 4 * stride, point - 4 * stride);
let mut p2 = load_uv_row(u_buf, v_buf, point - 3 * stride, point - 3 * stride);
let mut p1 = load_uv_row(u_buf, v_buf, point - 2 * stride, point - 2 * stride);
let mut p0 = load_uv_row(u_buf, v_buf, point - stride, point - stride);
let mut q0 = load_uv_row(u_buf, v_buf, point, point);
let mut q1 = load_uv_row(u_buf, v_buf, point + stride, point + stride);
let mut q2 = load_uv_row(u_buf, v_buf, point + 2 * stride, point + 2 * stride);
let q3 = load_uv_row(u_buf, v_buf, point + 3 * stride, point + 3 * stride);
let mask = needs_filter2(p3, p2, p1, p0, q0, q1, q2, q3, interior_limit, edge_limit);
let hev_mask = hev(p1, p0, q0, q1, hev_threshold);
do_filter6(
&mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev_mask,
);
store_uv_row(u_buf, v_buf, point - 3 * stride, point - 3 * stride, p2);
store_uv_row(u_buf, v_buf, point - 2 * stride, point - 2 * stride, p1);
store_uv_row(u_buf, v_buf, point - stride, point - stride, p0);
store_uv_row(u_buf, v_buf, point, point, q0);
store_uv_row(u_buf, v_buf, point + stride, point + stride, q1);
store_uv_row(u_buf, v_buf, point + 2 * stride, point + 2 * stride, q2);
}
#[rite]
pub(crate) fn normal_v_filter_uv_inner_wasm(
_token: Wasm128Token,
u_buf: &mut [u8],
v_buf: &mut [u8],
point: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let p3 = load_uv_row(u_buf, v_buf, point - 4 * stride, point - 4 * stride);
let p2 = load_uv_row(u_buf, v_buf, point - 3 * stride, point - 3 * stride);
let mut p1 = load_uv_row(u_buf, v_buf, point - 2 * stride, point - 2 * stride);
let mut p0 = load_uv_row(u_buf, v_buf, point - stride, point - stride);
let mut q0 = load_uv_row(u_buf, v_buf, point, point);
let mut q1 = load_uv_row(u_buf, v_buf, point + stride, point + stride);
let q2 = load_uv_row(u_buf, v_buf, point + 2 * stride, point + 2 * stride);
let q3 = load_uv_row(u_buf, v_buf, point + 3 * stride, point + 3 * stride);
let mask = needs_filter2(p3, p2, p1, p0, q0, q1, q2, q3, interior_limit, edge_limit);
let hev_mask = hev(p1, p0, q0, q1, hev_threshold);
do_filter4(&mut p1, &mut p0, &mut q0, &mut q1, mask, hev_mask);
store_uv_row(u_buf, v_buf, point - 2 * stride, point - 2 * stride, p1);
store_uv_row(u_buf, v_buf, point - stride, point - stride, p0);
store_uv_row(u_buf, v_buf, point, point, q0);
store_uv_row(u_buf, v_buf, point + stride, point + stride, q1);
}
#[rite]
pub(crate) fn normal_h_filter_uv_edge_wasm(
_token: Wasm128Token,
u_buf: &mut [u8],
v_buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let u_base = y_start * stride + x0;
let v_base = y_start * stride + x0;
let p3 = load_uv_col(u_buf, v_buf, u_base - 4, v_base - 4, stride);
let mut p2 = load_uv_col(u_buf, v_buf, u_base - 3, v_base - 3, stride);
let mut p1 = load_uv_col(u_buf, v_buf, u_base - 2, v_base - 2, stride);
let mut p0 = load_uv_col(u_buf, v_buf, u_base - 1, v_base - 1, stride);
let mut q0 = load_uv_col(u_buf, v_buf, u_base, v_base, stride);
let mut q1 = load_uv_col(u_buf, v_buf, u_base + 1, v_base + 1, stride);
let mut q2 = load_uv_col(u_buf, v_buf, u_base + 2, v_base + 2, stride);
let q3 = load_uv_col(u_buf, v_buf, u_base + 3, v_base + 3, stride);
let mask = needs_filter2(p3, p2, p1, p0, q0, q1, q2, q3, interior_limit, edge_limit);
let hev_mask = hev(p1, p0, q0, q1, hev_threshold);
do_filter6(
&mut p2, &mut p1, &mut p0, &mut q0, &mut q1, &mut q2, mask, hev_mask,
);
store_uv_col(u_buf, v_buf, u_base - 3, v_base - 3, stride, p2);
store_uv_col(u_buf, v_buf, u_base - 2, v_base - 2, stride, p1);
store_uv_col(u_buf, v_buf, u_base - 1, v_base - 1, stride, p0);
store_uv_col(u_buf, v_buf, u_base, v_base, stride, q0);
store_uv_col(u_buf, v_buf, u_base + 1, v_base + 1, stride, q1);
store_uv_col(u_buf, v_buf, u_base + 2, v_base + 2, stride, q2);
}
#[rite]
pub(crate) fn normal_h_filter_uv_inner_wasm(
_token: Wasm128Token,
u_buf: &mut [u8],
v_buf: &mut [u8],
x0: usize,
y_start: usize,
stride: usize,
hev_threshold: i32,
interior_limit: i32,
edge_limit: i32,
) {
let u_base = y_start * stride + x0;
let v_base = y_start * stride + x0;
let p3 = load_uv_col(u_buf, v_buf, u_base - 4, v_base - 4, stride);
let p2 = load_uv_col(u_buf, v_buf, u_base - 3, v_base - 3, stride);
let mut p1 = load_uv_col(u_buf, v_buf, u_base - 2, v_base - 2, stride);
let mut p0 = load_uv_col(u_buf, v_buf, u_base - 1, v_base - 1, stride);
let mut q0 = load_uv_col(u_buf, v_buf, u_base, v_base, stride);
let mut q1 = load_uv_col(u_buf, v_buf, u_base + 1, v_base + 1, stride);
let q2 = load_uv_col(u_buf, v_buf, u_base + 2, v_base + 2, stride);
let q3 = load_uv_col(u_buf, v_buf, u_base + 3, v_base + 3, stride);
let mask = needs_filter2(p3, p2, p1, p0, q0, q1, q2, q3, interior_limit, edge_limit);
let hev_mask = hev(p1, p0, q0, q1, hev_threshold);
do_filter4(&mut p1, &mut p0, &mut q0, &mut q1, mask, hev_mask);
store_uv_col(u_buf, v_buf, u_base - 2, v_base - 2, stride, p1);
store_uv_col(u_buf, v_buf, u_base - 1, v_base - 1, stride, p0);
store_uv_col(u_buf, v_buf, u_base, v_base, stride, q0);
store_uv_col(u_buf, v_buf, u_base + 1, v_base + 1, stride, q1);
}
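/// Loop-filter parameters for one macroblock; `filter_level == 0` disables
/// filtering for that macroblock.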
#[derive(Clone, Copy)]
pub(crate) struct MbFilterParams {
pub filter_level: u8,
pub interior_limit: u8,
pub hev_threshold: u8,
pub mbedge_limit: u8,
pub sub_bedge_limit: u8,
pub do_subblock_filtering: bool,
}
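// Scalar helpers that apply the per-pixel filters along a full macroblock
// edge, one row or column at a time.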
#[inline]
pub(crate) fn simple_filter_horizontal_16_rows(
buf: &mut [u8],
y_start: usize,
x0: usize,
stride: usize,
edge_limit: u8,
) {
for y in 0usize..16 {
let y0 = y_start + y;
simple_segment_horizontal(edge_limit, &mut buf[y0 * stride + x0 - 4..][..8]);
}
}
#[inline]
pub(crate) fn simple_filter_vertical_16_cols(
buf: &mut [u8],
y0: usize,
x_start: usize,
stride: usize,
edge_limit: u8,
) {
for x in 0usize..16 {
let point = y0 * stride + x_start + x;
simple_segment_vertical(edge_limit, buf, point, stride);
}
}
#[inline]
pub(crate) fn normal_filter_vertical_mb_16_cols(
buf: &mut [u8],
y0: usize,
x_start: usize,
stride: usize,
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
) {
for x in 0usize..16 {
let point = y0 * stride + x_start + x;
macroblock_filter_vertical(
hev_threshold,
interior_limit,
edge_limit,
buf,
point,
stride,
);
}
}
#[inline]
pub(crate) fn normal_filter_vertical_sub_16_cols(
buf: &mut [u8],
y0: usize,
x_start: usize,
stride: usize,
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
) {
for x in 0usize..16 {
let point = y0 * stride + x_start + x;
subblock_filter_vertical(
hev_threshold,
interior_limit,
edge_limit,
buf,
point,
stride,
);
}
}
#[inline]
pub(crate) fn normal_filter_horizontal_mb_16_rows(
buf: &mut [u8],
y_start: usize,
x0: usize,
stride: usize,
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
) {
for y in 0usize..16 {
let row = y_start + y;
macroblock_filter_horizontal(
hev_threshold,
interior_limit,
edge_limit,
&mut buf[row * stride + x0 - 4..][..8],
);
}
}
#[inline]
pub(crate) fn normal_filter_horizontal_sub_16_rows(
buf: &mut [u8],
y_start: usize,
x0: usize,
stride: usize,
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
) {
for y in 0usize..16 {
let row = y_start + y;
subblock_filter_horizontal(
hev_threshold,
interior_limit,
edge_limit,
&mut buf[row * stride + x0 - 4..][..8],
);
}
}
#[inline]
pub(crate) fn normal_filter_horizontal_uv_mb(
u_buf: &mut [u8],
v_buf: &mut [u8],
y_start: usize,
x0: usize,
stride: usize,
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
) {
for y in 0usize..8 {
let row = y_start + y;
macroblock_filter_horizontal(
hev_threshold,
interior_limit,
edge_limit,
&mut u_buf[row * stride + x0 - 4..][..8],
);
macroblock_filter_horizontal(
hev_threshold,
interior_limit,
edge_limit,
&mut v_buf[row * stride + x0 - 4..][..8],
);
}
}
#[inline]
pub(crate) fn normal_filter_horizontal_uv_sub(
u_buf: &mut [u8],
v_buf: &mut [u8],
y_start: usize,
x0: usize,
stride: usize,
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
) {
for y in 0usize..8 {
let row = y_start + y;
subblock_filter_horizontal(
hev_threshold,
interior_limit,
edge_limit,
&mut u_buf[row * stride + x0 - 4..][..8],
);
subblock_filter_horizontal(
hev_threshold,
interior_limit,
edge_limit,
&mut v_buf[row * stride + x0 - 4..][..8],
);
}
}
#[inline]
pub(crate) fn normal_filter_vertical_uv_mb(
u_buf: &mut [u8],
v_buf: &mut [u8],
y0: usize,
x_start: usize,
stride: usize,
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
) {
for x in 0usize..8 {
let point = y0 * stride + x_start + x;
macroblock_filter_vertical(
hev_threshold,
interior_limit,
edge_limit,
u_buf,
point,
stride,
);
macroblock_filter_vertical(
hev_threshold,
interior_limit,
edge_limit,
v_buf,
point,
stride,
);
}
}
#[inline]
pub(crate) fn normal_filter_vertical_uv_sub(
u_buf: &mut [u8],
v_buf: &mut [u8],
y0: usize,
x_start: usize,
stride: usize,
hev_threshold: u8,
interior_limit: u8,
edge_limit: u8,
) {
for x in 0usize..8 {
let point = y0 * stride + x_start + x;
subblock_filter_vertical(
hev_threshold,
interior_limit,
edge_limit,
u_buf,
point,
stride,
);
subblock_filter_vertical(
hev_threshold,
interior_limit,
edge_limit,
v_buf,
point,
stride,
);
}
}
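// Filters one row of macroblocks (x86-64 path). For every macroblock with a
// nonzero filter level the edges are processed in VP8 order: left
// macroblock edge, inner vertical edges, top macroblock edge, inner
// horizontal edges. `filter_type` selects the simple filter instead of the
// normal one.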
#[archmage::arcane]
pub(crate) fn filter_row_simd(
_token: archmage::X64V3Token,
cache_y: &mut [u8],
cache_u: &mut [u8],
cache_v: &mut [u8],
cache_y_stride: usize,
cache_uv_stride: usize,
extra_y_rows: usize,
filter_type: bool,
mby: usize,
mb_params: &[MbFilterParams],
) {
let extra_uv_rows = extra_y_rows / 2;
let mbwidth = mb_params.len();
for mbx in 0..mbwidth {
let p = mb_params[mbx];
if p.filter_level == 0 {
continue;
}
let mbedge_limit_i = i32::from(p.mbedge_limit);
let sub_bedge_limit_i = i32::from(p.sub_bedge_limit);
let hev_i = i32::from(p.hev_threshold);
let interior_i = i32::from(p.interior_limit);
if mbx > 0 {
if filter_type {
simple_h_filter16(
_token,
cache_y,
mbx * 16,
extra_y_rows,
cache_y_stride,
mbedge_limit_i,
);
} else {
normal_h_filter16_edge(
_token,
cache_y,
mbx * 16,
extra_y_rows,
cache_y_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
normal_h_filter_uv_edge(
_token,
cache_u,
cache_v,
mbx * 8,
extra_uv_rows,
cache_uv_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
}
}
if p.do_subblock_filtering {
if filter_type {
for x in (4usize..16 - 1).step_by(4) {
simple_h_filter16(
_token,
cache_y,
mbx * 16 + x,
extra_y_rows,
cache_y_stride,
sub_bedge_limit_i,
);
}
} else {
// Inner vertical edges at x = 4, 8, 12, matching the NEON/wasm paths.
for x in (4usize..16 - 3).step_by(4) {
normal_h_filter16_inner(
_token,
cache_y,
mbx * 16 + x,
extra_y_rows,
cache_y_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
normal_h_filter_uv_inner(
_token,
cache_u,
cache_v,
mbx * 8 + 4,
extra_uv_rows,
cache_uv_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
}
if mby > 0 {
if filter_type {
let point = extra_y_rows * cache_y_stride + mbx * 16;
simple_v_filter16(_token, cache_y, point, cache_y_stride, mbedge_limit_i);
} else {
let point_y = extra_y_rows * cache_y_stride + mbx * 16;
normal_v_filter16_edge(
_token,
cache_y,
point_y,
cache_y_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
let point_uv = extra_uv_rows * cache_uv_stride + mbx * 8;
normal_v_filter_uv_edge(
_token,
cache_u,
cache_v,
point_uv,
cache_uv_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
}
}
if p.do_subblock_filtering {
if filter_type {
for y in (4usize..16 - 1).step_by(4) {
let point = (extra_y_rows + y) * cache_y_stride + mbx * 16;
simple_v_filter16(_token, cache_y, point, cache_y_stride, sub_bedge_limit_i);
}
} else {
for y in (4usize..16 - 3).step_by(4) {
let point = (extra_y_rows + y) * cache_y_stride + mbx * 16;
normal_v_filter16_inner(
_token,
cache_y,
point,
cache_y_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
let point_uv = (extra_uv_rows + 4) * cache_uv_stride + mbx * 8;
normal_v_filter_uv_inner(
_token,
cache_u,
cache_v,
point_uv,
cache_uv_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
}
}
}
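// NEON version of the row filter; same control flow as the x86-64 version,
// dispatching to the NEON kernels.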
#[archmage::arcane]
pub(crate) fn filter_row_simd(
_token: archmage::NeonToken,
cache_y: &mut [u8],
cache_u: &mut [u8],
cache_v: &mut [u8],
cache_y_stride: usize,
cache_uv_stride: usize,
extra_y_rows: usize,
filter_type: bool,
mby: usize,
mb_params: &[MbFilterParams],
) {
let extra_uv_rows = extra_y_rows / 2;
let mbwidth = mb_params.len();
for mbx in 0..mbwidth {
let p = mb_params[mbx];
if p.filter_level == 0 {
continue;
}
let mbedge_limit_i = i32::from(p.mbedge_limit);
let sub_bedge_limit_i = i32::from(p.sub_bedge_limit);
let hev_i = i32::from(p.hev_threshold);
let interior_i = i32::from(p.interior_limit);
if mbx > 0 {
if filter_type {
simple_h_filter16_neon(
_token,
cache_y,
mbx * 16,
extra_y_rows,
cache_y_stride,
mbedge_limit_i,
);
} else {
normal_h_filter16_edge_neon(
_token,
cache_y,
mbx * 16,
extra_y_rows,
cache_y_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
normal_h_filter_uv_edge_neon(
_token,
cache_u,
cache_v,
mbx * 8,
extra_uv_rows,
cache_uv_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
}
}
if p.do_subblock_filtering {
if filter_type {
for x in (4usize..16 - 1).step_by(4) {
simple_h_filter16_neon(
_token,
cache_y,
mbx * 16 + x,
extra_y_rows,
cache_y_stride,
sub_bedge_limit_i,
);
}
} else {
for x in (4usize..16 - 3).step_by(4) {
normal_h_filter16_inner_neon(
_token,
cache_y,
mbx * 16 + x,
extra_y_rows,
cache_y_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
normal_h_filter_uv_inner_neon(
_token,
cache_u,
cache_v,
mbx * 8 + 4,
extra_uv_rows,
cache_uv_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
}
if mby > 0 {
if filter_type {
let point = extra_y_rows * cache_y_stride + mbx * 16;
simple_v_filter16_neon(_token, cache_y, point, cache_y_stride, mbedge_limit_i);
} else {
let point_y = extra_y_rows * cache_y_stride + mbx * 16;
normal_v_filter16_edge_neon(
_token,
cache_y,
point_y,
cache_y_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
let point_uv = extra_uv_rows * cache_uv_stride + mbx * 8;
normal_v_filter_uv_edge_neon(
_token,
cache_u,
cache_v,
point_uv,
cache_uv_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
}
}
if p.do_subblock_filtering {
if filter_type {
for y in (4usize..16 - 1).step_by(4) {
let point = (extra_y_rows + y) * cache_y_stride + mbx * 16;
simple_v_filter16_neon(
_token,
cache_y,
point,
cache_y_stride,
sub_bedge_limit_i,
);
}
} else {
for y in (4usize..16 - 3).step_by(4) {
let point = (extra_y_rows + y) * cache_y_stride + mbx * 16;
normal_v_filter16_inner_neon(
_token,
cache_y,
point,
cache_y_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
let point_uv = (extra_uv_rows + 4) * cache_uv_stride + mbx * 8;
normal_v_filter_uv_inner_neon(
_token,
cache_u,
cache_v,
point_uv,
cache_uv_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
}
}
}
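// wasm32 SIMD version of the row filter.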
#[archmage::arcane]
pub(crate) fn filter_row_simd(
_token: archmage::Wasm128Token,
cache_y: &mut [u8],
cache_u: &mut [u8],
cache_v: &mut [u8],
cache_y_stride: usize,
cache_uv_stride: usize,
extra_y_rows: usize,
filter_type: bool,
mby: usize,
mb_params: &[MbFilterParams],
) {
let extra_uv_rows = extra_y_rows / 2;
let mbwidth = mb_params.len();
for mbx in 0..mbwidth {
let p = mb_params[mbx];
if p.filter_level == 0 {
continue;
}
let mbedge_limit_i = i32::from(p.mbedge_limit);
let sub_bedge_limit_i = i32::from(p.sub_bedge_limit);
let hev_i = i32::from(p.hev_threshold);
let interior_i = i32::from(p.interior_limit);
if mbx > 0 {
if filter_type {
simple_h_filter16_wasm(
_token,
cache_y,
mbx * 16,
extra_y_rows,
cache_y_stride,
mbedge_limit_i,
);
} else {
normal_h_filter16_edge_wasm(
_token,
cache_y,
mbx * 16,
extra_y_rows,
cache_y_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
normal_h_filter_uv_edge_wasm(
_token,
cache_u,
cache_v,
mbx * 8,
extra_uv_rows,
cache_uv_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
}
}
if p.do_subblock_filtering {
if filter_type {
for x in (4usize..16 - 1).step_by(4) {
simple_h_filter16_wasm(
_token,
cache_y,
mbx * 16 + x,
extra_y_rows,
cache_y_stride,
sub_bedge_limit_i,
);
}
} else {
for x in (4usize..16 - 3).step_by(4) {
normal_h_filter16_inner_wasm(
_token,
cache_y,
mbx * 16 + x,
extra_y_rows,
cache_y_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
normal_h_filter_uv_inner_wasm(
_token,
cache_u,
cache_v,
mbx * 8 + 4,
extra_uv_rows,
cache_uv_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
}
if mby > 0 {
if filter_type {
let point = extra_y_rows * cache_y_stride + mbx * 16;
simple_v_filter16_wasm(_token, cache_y, point, cache_y_stride, mbedge_limit_i);
} else {
let point_y = extra_y_rows * cache_y_stride + mbx * 16;
normal_v_filter16_edge_wasm(
_token,
cache_y,
point_y,
cache_y_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
let point_uv = extra_uv_rows * cache_uv_stride + mbx * 8;
normal_v_filter_uv_edge_wasm(
_token,
cache_u,
cache_v,
point_uv,
cache_uv_stride,
hev_i,
interior_i,
mbedge_limit_i,
);
}
}
if p.do_subblock_filtering {
if filter_type {
for y in (4usize..16 - 1).step_by(4) {
let point = (extra_y_rows + y) * cache_y_stride + mbx * 16;
simple_v_filter16_wasm(
_token,
cache_y,
point,
cache_y_stride,
sub_bedge_limit_i,
);
}
} else {
for y in (4usize..16 - 3).step_by(4) {
let point = (extra_y_rows + y) * cache_y_stride + mbx * 16;
normal_v_filter16_inner_wasm(
_token,
cache_y,
point,
cache_y_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
let point_uv = (extra_uv_rows + 4) * cache_uv_stride + mbx * 8;
normal_v_filter_uv_inner_wasm(
_token,
cache_u,
cache_v,
point_uv,
cache_uv_stride,
hev_i,
interior_i,
sub_bedge_limit_i,
);
}
}
}
}