#![cfg_attr(not(asm_loopfilter), forbid(unsafe_code))]
use crate::include::common::bitdepth::AsPrimitive;
use crate::include::common::bitdepth::BitDepth;
use crate::include::common::bitdepth::DynPixel;
use crate::include::common::intops::iclip;
use crate::include::dav1d::picture::PicOffset;
use crate::src::align::Align16;
use crate::src::cpu::CpuFlags;
use crate::src::ffi_safe::FFISafe;
use crate::src::internal::Rav1dFrameData;
use crate::src::lf_mask::Av1FilterLUT;
use crate::src::strided::Strided as _;
use crate::src::with_offset::WithOffset;
use crate::src::wrap_fn_ptr::wrap_fn_ptr;
use std::cmp;
use std::ffi::c_int;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::Ordering::Relaxed;
use strum::FromRepr;
/// C `ptrdiff_t` equivalent used in the `extern "C"` loop-filter signature below.
#[allow(non_camel_case_types)]
type ptrdiff_t = isize;
#[cfg(all(
asm_loopfilter,
not(any(target_arch = "riscv64", target_arch = "riscv32"))
))]
use crate::include::common::bitdepth::bd_fn;
#[cfg(not(asm_loopfilter))]
use crate::src::enum_map::DefaultValue;
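// Type-erased `extern "C"` loop-filter signature shared by the asm entry points and the
// Rust fallback; the trailing `FFISafe` arguments carry the typed views the fallback needs.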
wrap_fn_ptr!(pub unsafe extern "C" fn loopfilter_sb(
dst_ptr: *mut DynPixel,
stride: ptrdiff_t,
mask: &[u32; 3],
lvl_ptr: *const [u8; 4],
b4_stride: ptrdiff_t,
lut: &Align16<Av1FilterLUT>,
w: c_int,
bitdepth_max: c_int,
_dst: *const FFISafe<PicOffset>,
_lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) -> ());
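/// Scalar fallback: maps the (`is_y`, `is_v`) flags onto the const-generic
/// [`loop_filter_sb128_rust`] instantiations.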
#[cfg(not(asm_loopfilter))]
fn loopfilter_sb_scalar<BD: BitDepth>(
dst: PicOffset,
mask: &[u32; 3],
lvl: WithOffset<&[AtomicU8]>,
b4_stride: usize,
lut: &Align16<Av1FilterLUT>,
wh: c_int,
bd: BD,
is_y: bool,
is_v: bool,
) {
match (is_y, is_v) {
(true, false) => loop_filter_sb128_rust::<BD, { HV::H as usize }, { YUV::Y as usize }>(
dst, mask, lvl, b4_stride, lut, wh, bd,
),
(true, true) => loop_filter_sb128_rust::<BD, { HV::V as usize }, { YUV::Y as usize }>(
dst, mask, lvl, b4_stride, lut, wh, bd,
),
(false, false) => loop_filter_sb128_rust::<BD, { HV::H as usize }, { YUV::UV as usize }>(
dst, mask, lvl, b4_stride, lut, wh, bd,
),
(false, true) => loop_filter_sb128_rust::<BD, { HV::V as usize }, { YUV::UV as usize }>(
dst, mask, lvl, b4_stride, lut, wh, bd,
),
}
}
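/// Non-asm entry point: tries the safe-SIMD dispatch for the current target and
/// falls back to [`loopfilter_sb_scalar`] when it declines (or, under `simd_test`,
/// cross-checks the SIMD output against the scalar output).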
#[cfg(not(asm_loopfilter))]
fn loopfilter_sb_direct<BD: BitDepth>(
f: &Rav1dFrameData,
dst: PicOffset,
mask: &[u32; 3],
lvl: WithOffset<&[AtomicU8]>,
w: usize,
is_y: bool,
is_v: bool,
) {
let stride = dst.stride();
let b4_stride = f.b4_stride;
let lut = &f.lf.lim_lut;
let wh = w as c_int;
let bd_max = f.bitdepth_max;
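    // `simd_test`: snapshot the destination plane so the SIMD result can be
    // compared against the scalar result below.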
#[cfg(feature = "simd_test")]
let saved_buf = {
let (guard, _) = dst.full_guard::<BD>();
guard.to_vec()
};
let simd_handled = {
#[cfg(target_arch = "x86_64")]
{
crate::src::safe_simd::loopfilter::loopfilter_sb_dispatch::<BD>(
dst, stride, mask, lvl, b4_stride, lut, wh, bd_max, is_y, is_v,
)
}
#[cfg(target_arch = "aarch64")]
{
crate::src::safe_simd::loopfilter_arm::loopfilter_sb_dispatch::<BD>(
dst, stride, mask, lvl, b4_stride, lut, wh, bd_max, is_y, is_v,
)
}
#[cfg(target_arch = "wasm32")]
{
crate::src::safe_simd::loopfilter::loopfilter_sb_dispatch::<BD>(
dst, stride, mask, lvl, b4_stride, lut, wh, bd_max, is_y, is_v,
)
}
#[cfg(not(any(
target_arch = "x86_64",
target_arch = "aarch64",
target_arch = "wasm32"
)))]
{
let _ = (stride, &mask, &lvl, b4_stride, lut, wh, bd_max, is_y, is_v);
false
}
};
if simd_handled {
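        // `simd_test`: restore the pre-filter pixels, re-run the scalar path, and
        // panic on the first pixel where the SIMD and scalar outputs disagree.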
#[cfg(feature = "simd_test")]
{
let (guard, _) = dst.full_guard::<BD>();
let simd_buf = guard.to_vec();
drop(guard);
{
let (mut guard, _) = dst.full_guard_mut::<BD>();
guard.copy_from_slice(&saved_buf);
}
let bd = BD::from_c(bd_max);
loopfilter_sb_scalar::<BD>(dst, mask, lvl, b4_stride as usize, lut, wh, bd, is_y, is_v);
let (guard, _) = dst.full_guard::<BD>();
let scalar_buf = guard.to_vec();
drop(guard);
let pxstride = dst.pixel_stride::<BD>().unsigned_abs();
let mut diffs = 0u32;
let mut first_diff = None;
for (i, (&sv, &rv)) in simd_buf.iter().zip(scalar_buf.iter()).enumerate() {
let sv = sv.as_::<i32>();
let rv = rv.as_::<i32>();
if sv != rv {
diffs += 1;
if first_diff.is_none() {
let y = i / pxstride;
let x = i % pxstride;
first_diff = Some((i, x, y, sv, rv));
}
}
}
if let Some((idx, x, y, sv, rv)) = first_diff {
panic!(
"LF SIMD/scalar: {} pixel diffs, first at ({},{}) idx={}: simd={} scalar={} is_y={} is_v={} w={}",
diffs, x, y, idx, sv, rv, is_y, is_v, w
);
}
{
let (mut guard, _) = dst.full_guard_mut::<BD>();
guard.copy_from_slice(&simd_buf);
}
}
return;
}
#[allow(unreachable_code)]
{
let b4_stride = b4_stride as usize;
let bd = BD::from_c(bd_max);
loopfilter_sb_scalar::<BD>(dst, mask, lvl, b4_stride, lut, wh, bd, is_y, is_v);
}
}
impl loopfilter_sb::Fn {
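    /// Invoke the loop-filter function: asm builds forward raw pointers plus
    /// [`FFISafe`] handles across the `extern "C"` boundary, non-asm builds call
    /// [`loopfilter_sb_direct`] directly.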
#[allow(dead_code)]
pub fn call<BD: BitDepth>(
&self,
f: &Rav1dFrameData,
dst: PicOffset,
mask: &[u32; 3],
lvl: WithOffset<&[AtomicU8]>,
w: usize,
is_y: bool,
is_v: bool,
) {
cfg_if::cfg_if! {
if #[cfg(asm_loopfilter)] {
let _ = (is_y, is_v);
let dst_ptr = dst.as_mut_ptr::<BD>().cast();
let stride = dst.stride();
assert!(lvl.offset <= lvl.data.len());
let lvl_ptr = unsafe { (lvl.data.as_ptr() as *const u8).add(lvl.offset) };
let lvl_ptr = lvl_ptr.cast::<[u8; 4]>();
let b4_stride = f.b4_stride;
let lut = &f.lf.lim_lut;
let w = w as c_int;
let bd = f.bitdepth_max;
let dst = FFISafe::new(&dst);
let lvl = FFISafe::new(&lvl);
unsafe {
self.get()(
dst_ptr, stride, mask, lvl_ptr, b4_stride, lut, w, bd, dst, lvl,
)
}
} else {
loopfilter_sb_direct::<BD>(f, dst, mask, lvl, w, is_y, is_v)
}
}
}
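    /// Default (Rust) implementation used before CPU-specific asm is selected.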
#[cfg(asm_loopfilter)]
const fn default<BD: BitDepth, const HV: usize, const YUV: usize>() -> Self {
Self::new(loop_filter_sb128_c_erased::<BD, { HV }, { YUV }>)
}
}
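/// Horizontal/vertical edge filter pair.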
pub struct LoopFilterHVDSPContext {
pub h: loopfilter_sb::Fn,
pub v: loopfilter_sb::Fn,
}
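/// Luma/chroma filter pairs.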
pub struct LoopFilterYUVDSPContext {
pub y: LoopFilterHVDSPContext,
pub uv: LoopFilterHVDSPContext,
}
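/// Loop-filter DSP context: one superblock-edge filter per plane type and direction.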
pub struct Rav1dLoopFilterDSPContext {
pub loop_filter_sb: LoopFilterYUVDSPContext,
}
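/// Filter one 4-pixel edge segment.
///
/// `e`, `i` and `h` are the E/I/H thresholds (scaled to the bitdepth internally);
/// `wd` is the filter width (4, 6, 8 or 16 taps); `stridea` steps along the edge
/// between the 4 filtered positions and `strideb` steps across it to the p/q samples.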
#[inline(never)]
fn loop_filter<BD: BitDepth>(
dst: PicOffset,
e: u8,
i: u8,
h: u8,
stridea: ptrdiff_t,
strideb: ptrdiff_t,
wd: c_int,
bd: BD,
) {
let bitdepth_min_8 = bd.bitdepth() - 8;
let [f, e, i, h] = [1, e, i, h].map(|n| (n as i32) << bitdepth_min_8);
for idx in 0..4 {
let dst = dst + (idx * stridea);
let dst = |stride_index: isize| (dst + (strideb * stride_index)).index_mut::<BD>();
let get_dst = |stride_index| (*dst(stride_index)).as_::<i32>();
let set_dst = |stride_index, pixel: i32| {
*dst(stride_index) = pixel.as_::<BD::Pixel>();
};
let set_dst_clipped = |stride_index, pixel: i32| {
*dst(stride_index) = bd.iclip_pixel(pixel);
};
let mut p6 = 0;
let mut p5 = 0;
let mut p4 = 0;
let mut p3 = 0;
let mut p2 = 0;
let p1 = get_dst(-2);
let p0 = get_dst(-1);
let q0 = get_dst(0);
let q1 = get_dst(1);
let mut q2 = 0;
let mut q3 = 0;
let mut q4 = 0;
let mut q5 = 0;
let mut q6 = 0;
let mut flat8out = false;
let mut flat8in = false;
let mut fm = (p1 - p0).abs() <= i
&& (q1 - q0).abs() <= i
&& (p0 - q0).abs() * 2 + ((p1 - q1).abs() >> 1) <= e;
if wd > 4 {
p2 = get_dst(-3);
q2 = get_dst(2);
fm &= (p2 - p1).abs() <= i && (q2 - q1).abs() <= i;
if wd > 6 {
p3 = get_dst(-4);
q3 = get_dst(3);
fm &= (p3 - p2).abs() <= i && (q3 - q2).abs() <= i;
}
}
if !fm {
continue;
}
if wd >= 16 {
p6 = get_dst(-7);
p5 = get_dst(-6);
p4 = get_dst(-5);
q4 = get_dst(4);
q5 = get_dst(5);
q6 = get_dst(6);
flat8out = (p6 - p0).abs() <= f
&& (p5 - p0).abs() <= f
&& (p4 - p0).abs() <= f
&& (q4 - q0).abs() <= f
&& (q5 - q0).abs() <= f
&& (q6 - q0).abs() <= f;
}
if wd >= 6 {
flat8in = (p2 - p0).abs() <= f
&& (p1 - p0).abs() <= f
&& (q1 - q0).abs() <= f
&& (q2 - q0).abs() <= f;
}
if wd >= 8 {
flat8in &= (p3 - p0).abs() <= f && (q3 - q0).abs() <= f;
}
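        // Apply the widest filter allowed by `wd` and the flatness checks:
        // 13-tap (wd >= 16), 8-tap (wd >= 8), 6-tap (wd == 6), or the narrow
        // 4-tap filter with optional high-edge-variance handling.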
if wd >= 16 && flat8out && flat8in {
set_dst(
-6,
p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8 >> 4,
);
set_dst(
-5,
p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8 >> 4,
);
set_dst(
-4,
p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8 >> 4,
);
set_dst(
-3,
p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8 >> 4,
);
set_dst(
-2,
p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8 >> 4,
);
set_dst(
-1,
p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8 >> 4,
);
set_dst(
0,
p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8 >> 4,
);
set_dst(
1,
p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8 >> 4,
);
set_dst(
2,
p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8 >> 4,
);
set_dst(
3,
p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8 >> 4,
);
set_dst(
4,
p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8 >> 4,
);
set_dst(
5,
p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8 >> 4,
);
} else if wd >= 8 && flat8in {
set_dst(-3, p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4 >> 3);
set_dst(-2, p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 >> 3);
set_dst(-1, p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 >> 3);
set_dst(0, p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 >> 3);
set_dst(1, p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4 >> 3);
set_dst(2, p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4 >> 3);
} else if wd == 6 && flat8in {
set_dst(-2, p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4 >> 3);
set_dst(-1, p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4 >> 3);
set_dst(0, p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4 >> 3);
set_dst(1, p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4 >> 3);
} else {
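            // Narrow filter: with high edge variance only p0/q0 are adjusted,
            // otherwise p1/q1 are also nudged by half the clamped delta.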
let hev = (p1 - p0).abs() > h || (q1 - q0).abs() > h;
fn iclip_diff(v: c_int, bitdepth_min_8: u8) -> i32 {
iclip(
v,
-128 * (1 << bitdepth_min_8),
128 * (1 << bitdepth_min_8) - 1,
)
}
if hev {
let f = iclip_diff(p1 - q1, bitdepth_min_8);
let f = iclip_diff(3 * (q0 - p0) + f, bitdepth_min_8);
let f1 = cmp::min(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
let f2 = cmp::min(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
set_dst_clipped(-1, p0 + f2);
set_dst_clipped(0, q0 - f1);
} else {
let f = iclip_diff(3 * (q0 - p0), bitdepth_min_8);
let f1 = cmp::min(f + 4, (128 << bitdepth_min_8) - 1) >> 3;
let f2 = cmp::min(f + 3, (128 << bitdepth_min_8) - 1) >> 3;
set_dst_clipped(-1, p0 + f2);
set_dst_clipped(0, q0 - f1);
let f = (f1 + 1) >> 1;
set_dst_clipped(-2, p1 + f);
set_dst_clipped(1, q1 - f);
}
}
}
}
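/// Edge direction: horizontal or vertical filtering pass.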
#[derive(FromRepr)]
enum HV {
H,
V,
}
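/// Plane type: luma or chroma.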
#[derive(FromRepr)]
enum YUV {
Y,
UV,
}
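/// Filter the edges of one superblock row/column for a single direction and plane.
///
/// `vmask` has one bit per 4-pixel unit for each filter-width class; `lvl` holds the
/// per-block filter levels (4 bytes per block unit, `b4_stride` units per row).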
fn loop_filter_sb128_rust<BD: BitDepth, const HV: usize, const YUV: usize>(
mut dst: PicOffset,
vmask: &[u32; 3],
mut lvl: WithOffset<&[AtomicU8]>,
b4_stride: usize,
lut: &Align16<Av1FilterLUT>,
_wh: c_int,
bd: BD,
) {
let hv = HV::from_repr(HV).unwrap();
let yuv = YUV::from_repr(YUV).unwrap();
let stride = dst.pixel_stride::<BD>();
let (stridea, strideb) = match hv {
HV::H => (stride, 1),
HV::V => (1, stride),
};
let (b4_stridea, b4_strideb) = match hv {
HV::H => (b4_stride, 1),
HV::V => (1, b4_stride),
};
let vm = match yuv {
YUV::Y => vmask[0] | vmask[1] | vmask[2],
YUV::UV => vmask[0] | vmask[1],
};
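    // Walk the mask 4 pixels at a time; stop once no bits at or above `xy` remain.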
let mut xy = 1u32;
while vm & !xy.wrapping_sub(1) != 0 {
'block: {
if vm & xy == 0 {
break 'block;
}
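            // Filter level for this edge; if zero, fall back to the level of the
            // previous block across the edge.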
let l = lvl.data[lvl.offset].load(Relaxed);
let l = if l != 0 {
l
} else {
let lvl = lvl - 4 * b4_strideb;
lvl.data[lvl.offset].load(Relaxed)
};
if l == 0 {
break 'block;
}
let h = l >> 4;
let e = lut.e[l as usize];
let i = lut.i[l as usize];
            // Filter width in taps: luma edges use 4, 8 or 16; chroma edges use 4 or 6.
            let wd = match yuv {
                YUV::Y => {
                    let idx = if vmask[2] & xy != 0 {
                        2
                    } else {
                        (vmask[1] & xy != 0) as c_int
                    };
                    4 << idx
                }
                YUV::UV => {
                    let idx = (vmask[1] & xy != 0) as c_int;
                    4 + 2 * idx
                }
            };
            loop_filter(dst, e, i, h, stridea, strideb, wd, bd);
}
xy <<= 1;
dst += 4 * stridea;
lvl += 4 * b4_stridea;
}
}
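/// # Safety
///
/// `dst` and `lvl` must be valid [`FFISafe`] pointers created by
/// [`loopfilter_sb::Fn::call`] and must outlive this call.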
#[cfg(asm_loopfilter)]
#[deny(unsafe_op_in_unsafe_fn)]
unsafe extern "C" fn loop_filter_sb128_c_erased<BD: BitDepth, const HV: usize, const YUV: usize>(
_dst_ptr: *mut DynPixel,
_stride: ptrdiff_t,
vmask: &[u32; 3],
_lvl_ptr: *const [u8; 4],
b4_stride: isize,
lut: &Align16<Av1FilterLUT>,
wh: c_int,
bitdepth_max: c_int,
dst: *const FFISafe<PicOffset>,
lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
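    // SAFETY: `dst` and `lvl` were created by `FFISafe::new` in `loopfilter_sb::Fn::call`.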
let dst = *unsafe { FFISafe::get(dst) };
let lvl = *unsafe { FFISafe::get(lvl) };
let b4_stride = b4_stride as usize;
let bd = BD::from_c(bitdepth_max);
loop_filter_sb128_rust::<BD, { HV }, { YUV }>(dst, vmask, lvl, b4_stride, lut, wh, bd)
}
impl Rav1dLoopFilterDSPContext {
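    /// Build the default function table (no CPU-specific asm selected yet).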
pub const fn default<BD: BitDepth>() -> Self {
cfg_if::cfg_if! {
if #[cfg(asm_loopfilter)] {
use HV::*;
use YUV::*;
Self {
loop_filter_sb: LoopFilterYUVDSPContext {
y: LoopFilterHVDSPContext {
h: loopfilter_sb::Fn::default::<BD, { H as _ }, { Y as _ }>(),
v: loopfilter_sb::Fn::default::<BD, { V as _ }, { Y as _ }>(),
},
uv: LoopFilterHVDSPContext {
h: loopfilter_sb::Fn::default::<BD, { H as _ }, { UV as _ }>(),
v: loopfilter_sb::Fn::default::<BD, { V as _ }, { UV as _ }>(),
},
},
}
} else {
Self {
loop_filter_sb: LoopFilterYUVDSPContext {
y: LoopFilterHVDSPContext {
h: loopfilter_sb::Fn::DEFAULT,
v: loopfilter_sb::Fn::DEFAULT,
},
uv: LoopFilterHVDSPContext {
h: loopfilter_sb::Fn::DEFAULT,
v: loopfilter_sb::Fn::DEFAULT,
},
},
}
}
}
}
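    /// Select SSSE3/AVX2/AVX-512 asm implementations according to `flags`.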
#[cfg(all(asm_loopfilter, any(target_arch = "x86", target_arch = "x86_64")))]
#[inline(always)]
const fn init_x86<BD: BitDepth>(mut self, flags: CpuFlags) -> Self {
if !flags.contains(CpuFlags::SSSE3) {
return self;
}
self.loop_filter_sb.y.h = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_h_sb_y, ssse3);
self.loop_filter_sb.y.v = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_v_sb_y, ssse3);
self.loop_filter_sb.uv.h = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_h_sb_uv, ssse3);
self.loop_filter_sb.uv.v = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_v_sb_uv, ssse3);
#[cfg(target_arch = "x86_64")]
{
if !flags.contains(CpuFlags::AVX2) {
return self;
}
self.loop_filter_sb.y.h = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_h_sb_y, avx2);
self.loop_filter_sb.y.v = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_v_sb_y, avx2);
self.loop_filter_sb.uv.h = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_h_sb_uv, avx2);
self.loop_filter_sb.uv.v = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_v_sb_uv, avx2);
if !flags.contains(CpuFlags::AVX512ICL) {
return self;
}
self.loop_filter_sb.y.v = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_v_sb_y, avx512icl);
self.loop_filter_sb.uv.v = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_v_sb_uv, avx512icl);
if !flags.contains(CpuFlags::SLOW_GATHER) {
self.loop_filter_sb.y.h = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_h_sb_y, avx512icl);
self.loop_filter_sb.uv.h =
bd_fn!(loopfilter_sb::decl_fn, BD, lpf_h_sb_uv, avx512icl);
}
}
self
}
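    /// Select NEON asm implementations when available.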
#[cfg(all(asm_loopfilter, any(target_arch = "arm", target_arch = "aarch64")))]
#[inline(always)]
const fn init_arm<BD: BitDepth>(mut self, flags: CpuFlags) -> Self {
if !flags.contains(CpuFlags::NEON) {
return self;
}
self.loop_filter_sb.y.h = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_h_sb_y, neon);
self.loop_filter_sb.y.v = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_v_sb_y, neon);
self.loop_filter_sb.uv.h = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_h_sb_uv, neon);
self.loop_filter_sb.uv.v = bd_fn!(loopfilter_sb::decl_fn, BD, lpf_v_sb_uv, neon);
self
}
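    /// Apply CPU-specific initialization for the current target architecture.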
#[inline(always)]
const fn init<BD: BitDepth>(self, flags: CpuFlags) -> Self {
#[cfg(asm_loopfilter)]
{
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
return self.init_x86::<BD>(flags);
}
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
{
return self.init_arm::<BD>(flags);
}
}
        #[allow(unreachable_code)]
        {
            let _ = flags;
            self
        }
}
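    /// Create the loop-filter DSP context for the given CPU feature flags.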
pub const fn new<BD: BitDepth>(flags: CpuFlags) -> Self {
Self::default::<BD>().init::<BD>(flags)
}
}