#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#![allow(unused_imports)]
#[cfg(target_arch = "x86_64")]
use archmage::{Desktop64, SimdToken};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::include::common::bitdepth::AsPrimitive;
use crate::include::common::bitdepth::BitDepth;
use crate::include::common::bitdepth::DynPixel;
use crate::include::common::intops::iclip;
use crate::include::dav1d::picture::PicOffset;
use crate::src::align::Align16;
use crate::src::ffi_safe::FFISafe;
use crate::src::lf_mask::Av1FilterLUT;
use crate::src::with_offset::WithOffset;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::Ordering::Relaxed;
/// C-style name for a signed pointer-sized stride; the lowercase spelling is
/// why the `non_camel_case_types` lint is silenced here.
#[allow(non_camel_case_types)]
type ptrdiff_t = isize;
use std::cmp;
use std::ffi::c_int;
/// Clips `v` to the signed filter-delta range for the given bit depth.
///
/// The range is `[-128 * 2^bitdepth_min_8, 128 * 2^bitdepth_min_8 - 1]`, i.e.
/// `[-128, 127]` when `bitdepth_min_8` is 0.
#[inline(always)]
fn iclip_diff(v: i32, bitdepth_min_8: u8) -> i32 {
    let scale: i32 = 1 << bitdepth_min_8;
    let lower = -128 * scale;
    let upper = 128 * scale - 1;
    iclip(v, lower, upper)
}
/// Applies a signed displacement to an unsigned index.
///
/// The sum is formed in `isize` and cast back to `usize`, so a result below
/// zero comes back as a very large index instead of an error; callers rely on
/// the subsequent slice bounds check to reject it.
#[inline(always)]
fn signed_idx(base: usize, offset: isize) -> usize {
    let shifted = base as isize + offset;
    shifted as usize
}
/// Filters four consecutive lines across one edge of an 8-bit sample buffer.
///
/// `base` is the index in `buf` of the first sample on the "q" side of the
/// edge. Moving by `stridea` selects the next of the four lines; moving by
/// `strideb` steps across the edge, with negative taps on the "p" side.
/// `e`, `i` and `h` are the edge, interior and high-edge-variance thresholds.
/// `wd` selects the filter width (callers in this file pass 4, 6, 8 or 16).
/// Every stored value is clamped to `0..=bitdepth_max`.
///
/// A line whose samples fail the threshold test is left untouched.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn loop_filter_4_8bpc(
    buf: &mut [u8],
    base: usize,
    e: i32,
    i: i32,
    h: i32,
    stridea: isize,
    strideb: isize,
    wd: i32,
    bitdepth_max: i32,
) {
    // Flatness threshold: one code value for 8-bit samples.
    const FLAT: i32 = 1;
    let is_flat = |a: i32, b: i32| (a - b).abs() <= FLAT;
    let within = |a: i32, b: i32| (a - b).abs() <= i;

    for line in 0..4isize {
        let origin = signed_idx(base, line * stridea);
        let load = |tap: isize| -> i32 { buf[signed_idx(origin, strideb * tap)] as i32 };

        let p1 = load(-2);
        let p0 = load(-1);
        let q0 = load(0);
        let q1 = load(1);
        // Wider filters inspect more samples; taps that are not read stay at zero.
        let (p2, q2) = if wd > 4 { (load(-3), load(2)) } else { (0, 0) };
        let (p3, q3) = if wd > 6 { (load(-4), load(3)) } else { (0, 0) };

        let edge_ok = within(p1, p0)
            && within(q1, q0)
            && (p0 - q0).abs() * 2 + ((p1 - q1).abs() >> 1) <= e;
        let inner_ok = wd <= 4 || (within(p2, p1) && within(q2, q1));
        let outer_ok = wd <= 6 || (within(p3, p2) && within(q3, q2));
        if !(edge_ok && inner_ok && outer_ok) {
            continue;
        }

        let (p6, p5, p4, q4, q5, q6) = if wd >= 16 {
            (load(-7), load(-6), load(-5), load(4), load(5), load(6))
        } else {
            (0, 0, 0, 0, 0, 0)
        };
        let flat8out = wd >= 16
            && is_flat(p6, p0)
            && is_flat(p5, p0)
            && is_flat(p4, p0)
            && is_flat(q4, q0)
            && is_flat(q5, q0)
            && is_flat(q6, q0);
        let flat8in = wd >= 6
            && is_flat(p2, p0)
            && is_flat(p1, p0)
            && is_flat(q1, q0)
            && is_flat(q2, q0)
            && (wd < 8 || (is_flat(p3, p0) && is_flat(q3, q0)));

        // Takes the buffer as an argument so it does not hold a borrow between stores.
        let store = |buf: &mut [u8], tap: isize, val: i32| {
            buf[signed_idx(origin, strideb * tap)] = val.clamp(0, bitdepth_max) as u8;
        };

        if wd >= 16 && flat8out && flat8in {
            // 13-tap smoothing; the weights of every row sum to 16.
            let taps: [(isize, i32); 12] = [
                (-6, (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4),
                (-5, (5 * p6 + 2 * p5 + 2 * p4 + 2 * p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4),
                (-4, (4 * p6 + p5 + 2 * p4 + 2 * p3 + 2 * p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4),
                (-3, (3 * p6 + p5 + p4 + 2 * p3 + 2 * p2 + 2 * p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4),
                (-2, (2 * p6 + p5 + p4 + p3 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4),
                (-1, (p6 + p5 + p4 + p3 + p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4),
                (0, (p5 + p4 + p3 + p2 + p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4),
                (1, (p4 + p3 + p2 + p1 + p0 + 2 * q0 + 2 * q1 + 2 * q2 + q3 + q4 + q5 + 2 * q6 + 8) >> 4),
                (2, (p3 + p2 + p1 + p0 + q0 + 2 * q1 + 2 * q2 + 2 * q3 + q4 + q5 + 3 * q6 + 8) >> 4),
                (3, (p2 + p1 + p0 + q0 + q1 + 2 * q2 + 2 * q3 + 2 * q4 + q5 + 4 * q6 + 8) >> 4),
                (4, (p1 + p0 + q0 + q1 + q2 + 2 * q3 + 2 * q4 + 2 * q5 + 5 * q6 + 8) >> 4),
                (5, (p0 + q0 + q1 + q2 + q3 + 2 * q4 + 2 * q5 + 7 * q6 + 8) >> 4),
            ];
            for &(tap, val) in &taps {
                store(buf, tap, val);
            }
        } else if wd >= 8 && flat8in {
            // 7-tap smoothing; weights sum to 8.
            let taps: [(isize, i32); 6] = [
                (-3, (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3),
                (-2, (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3),
                (-1, (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3),
                (0, (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3),
                (1, (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3),
                (2, (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3),
            ];
            for &(tap, val) in &taps {
                store(buf, tap, val);
            }
        } else if wd == 6 && flat8in {
            // 5-tap smoothing; weights sum to 8.
            let taps: [(isize, i32); 4] = [
                (-2, (3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3),
                (-1, (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3),
                (0, (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3),
                (1, (p0 + 2 * q0 + 2 * q1 + 3 * q2 + 4) >> 3),
            ];
            for &(tap, val) in &taps {
                store(buf, tap, val);
            }
        } else {
            // Narrow filter. With high edge variance only p0/q0 move and the
            // p1 - q1 difference biases the delta; otherwise p1/q1 are nudged too.
            let hev = (p1 - p0).abs() > h || (q1 - q0).abs() > h;
            let bias = if hev { iclip_diff(p1 - q1, 0) } else { 0 };
            let delta = iclip_diff(3 * (q0 - p0) + bias, 0);
            let f1 = cmp::min(delta + 4, 127) >> 3;
            let f2 = cmp::min(delta + 3, 127) >> 3;
            store(buf, -1, p0 + f2);
            store(buf, 0, q0 - f1);
            if !hev {
                let half = (f1 + 1) >> 1;
                store(buf, -2, p1 + half);
                store(buf, 1, q1 - half);
            }
        }
    }
}
/// Loads byte `byte_idx` of the 4-byte level entry number `offset`.
///
/// Returns 0 when the computed position lies beyond the end of `lvl`.
#[inline(always)]
fn read_lvl(lvl: &[AtomicU8], offset: usize, byte_idx: usize) -> u8 {
    match lvl.get(offset * 4 + byte_idx) {
        Some(cell) => cell.load(Relaxed),
        None => 0,
    }
}
/// Walks the edge mask for the 8-bit Y plane with filter taps one sample apart
/// and successive lines `stride` samples apart.
///
/// Each set bit of `vmask[0] | vmask[1] | vmask[2]` marks a unit of four
/// lines. For such a unit the level byte is read from `lvl`; a zero level
/// falls back to the entry one position earlier. A non-zero level picks the
/// thresholds from `lut`, and the unit is filtered with width 16, 8 or 4
/// according to the highest `vmask` plane holding the bit. After every bit
/// `dst_offset` advances by four lines and the level position by `b4_stride`
/// entries. `_w` is unused.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn lpf_h_sb_y_8bpc_inner(
    buf: &mut [u8],
    mut dst_offset: usize,
    stride: isize,
    vmask: &[u32; 3],
    lvl: &[AtomicU8],
    lvl_base: usize,
    lvl_byte_idx: usize,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    _w: i32,
    bitdepth_max: i32,
) {
    let (line_step, cross_step) = (stride, 1isize);
    let (lvl_step, lvl_lookback) = (b4_stride as usize, 1usize);
    let active = vmask[0] | vmask[1] | vmask[2];
    let mut lvl_pos = lvl_base;
    // Visit every bit position up to and including the highest set bit.
    for n in 0..(32 - active.leading_zeros()) {
        let bit = 1u32 << n;
        if active & bit != 0 {
            let level = match read_lvl(lvl, lvl_pos, lvl_byte_idx) {
                0 if lvl_pos >= lvl_lookback => {
                    read_lvl(lvl, lvl_pos - lvl_lookback, lvl_byte_idx)
                }
                own => own,
            };
            if level != 0 {
                let entry = level as usize;
                let hev_thresh = (level >> 4) as i32;
                let edge_thresh = lut.e[entry] as i32;
                let inner_thresh = lut.i[entry] as i32;
                let wd = if vmask[2] & bit != 0 {
                    16
                } else if vmask[1] & bit != 0 {
                    8
                } else {
                    4
                };
                loop_filter_4_8bpc(
                    buf,
                    dst_offset,
                    edge_thresh,
                    inner_thresh,
                    hev_thresh,
                    line_step,
                    cross_step,
                    wd,
                    bitdepth_max,
                );
            }
        }
        dst_offset = signed_idx(dst_offset, 4 * line_step);
        lvl_pos += lvl_step;
    }
}
/// Walks the edge mask for the 8-bit Y plane with filter taps `stride` samples
/// apart and successive lines one sample apart.
///
/// Each set bit of `vmask[0] | vmask[1] | vmask[2]` marks a unit of four
/// lines. For such a unit the level byte is read from `lvl`; a zero level
/// falls back to the entry `b4_stride` positions earlier. A non-zero level
/// picks the thresholds from `lut`, and the unit is filtered with width 16, 8
/// or 4 according to the highest `vmask` plane holding the bit. After every
/// bit `dst_offset` advances by four samples and the level position by one
/// entry. `_w` is unused.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn lpf_v_sb_y_8bpc_inner(
    buf: &mut [u8],
    mut dst_offset: usize,
    stride: isize,
    vmask: &[u32; 3],
    lvl: &[AtomicU8],
    lvl_base: usize,
    lvl_byte_idx: usize,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    _w: i32,
    bitdepth_max: i32,
) {
    let (line_step, cross_step) = (1isize, stride);
    let (lvl_step, lvl_lookback) = (1usize, b4_stride as usize);
    let active = vmask[0] | vmask[1] | vmask[2];
    let mut lvl_pos = lvl_base;
    // Visit every bit position up to and including the highest set bit.
    for n in 0..(32 - active.leading_zeros()) {
        let bit = 1u32 << n;
        if active & bit != 0 {
            let level = match read_lvl(lvl, lvl_pos, lvl_byte_idx) {
                0 if lvl_pos >= lvl_lookback => {
                    read_lvl(lvl, lvl_pos - lvl_lookback, lvl_byte_idx)
                }
                own => own,
            };
            if level != 0 {
                let entry = level as usize;
                let hev_thresh = (level >> 4) as i32;
                let edge_thresh = lut.e[entry] as i32;
                let inner_thresh = lut.i[entry] as i32;
                let wd = if vmask[2] & bit != 0 {
                    16
                } else if vmask[1] & bit != 0 {
                    8
                } else {
                    4
                };
                loop_filter_4_8bpc(
                    buf,
                    dst_offset,
                    edge_thresh,
                    inner_thresh,
                    hev_thresh,
                    line_step,
                    cross_step,
                    wd,
                    bitdepth_max,
                );
            }
        }
        dst_offset = signed_idx(dst_offset, 4 * line_step);
        lvl_pos += lvl_step;
    }
}
/// Walks the edge mask for an 8-bit chroma plane with filter taps one sample
/// apart and successive lines `stride` samples apart.
///
/// Only `vmask[0]` and `vmask[1]` are consulted. Each set bit marks a unit of
/// four lines. For such a unit the level byte is read from `lvl`; a zero level
/// falls back to the entry one position earlier. A non-zero level picks the
/// thresholds from `lut`, and the unit is filtered with width 6 when the bit
/// is set in `vmask[1]`, otherwise 4. After every bit `dst_offset` advances by
/// four lines and the level position by `b4_stride` entries. `_w` is unused.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn lpf_h_sb_uv_8bpc_inner(
    buf: &mut [u8],
    mut dst_offset: usize,
    stride: isize,
    vmask: &[u32; 3],
    lvl: &[AtomicU8],
    lvl_base: usize,
    lvl_byte_idx: usize,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    _w: i32,
    bitdepth_max: i32,
) {
    let (line_step, cross_step) = (stride, 1isize);
    let (lvl_step, lvl_lookback) = (b4_stride as usize, 1usize);
    let active = vmask[0] | vmask[1];
    let mut lvl_pos = lvl_base;
    // Visit every bit position up to and including the highest set bit.
    for n in 0..(32 - active.leading_zeros()) {
        let bit = 1u32 << n;
        if active & bit != 0 {
            let level = match read_lvl(lvl, lvl_pos, lvl_byte_idx) {
                0 if lvl_pos >= lvl_lookback => {
                    read_lvl(lvl, lvl_pos - lvl_lookback, lvl_byte_idx)
                }
                own => own,
            };
            if level != 0 {
                let entry = level as usize;
                let hev_thresh = (level >> 4) as i32;
                let edge_thresh = lut.e[entry] as i32;
                let inner_thresh = lut.i[entry] as i32;
                let wd = if vmask[1] & bit != 0 { 6 } else { 4 };
                loop_filter_4_8bpc(
                    buf,
                    dst_offset,
                    edge_thresh,
                    inner_thresh,
                    hev_thresh,
                    line_step,
                    cross_step,
                    wd,
                    bitdepth_max,
                );
            }
        }
        dst_offset = signed_idx(dst_offset, 4 * line_step);
        lvl_pos += lvl_step;
    }
}
/// Walks the edge mask for an 8-bit chroma plane with filter taps `stride`
/// samples apart and successive lines one sample apart.
///
/// Only `vmask[0]` and `vmask[1]` are consulted. Each set bit marks a unit of
/// four lines. For such a unit the level byte is read from `lvl`; a zero level
/// falls back to the entry `b4_stride` positions earlier. A non-zero level
/// picks the thresholds from `lut`, and the unit is filtered with width 6 when
/// the bit is set in `vmask[1]`, otherwise 4. After every bit `dst_offset`
/// advances by four samples and the level position by one entry. `_w` is
/// unused.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn lpf_v_sb_uv_8bpc_inner(
    buf: &mut [u8],
    mut dst_offset: usize,
    stride: isize,
    vmask: &[u32; 3],
    lvl: &[AtomicU8],
    lvl_base: usize,
    lvl_byte_idx: usize,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    _w: i32,
    bitdepth_max: i32,
) {
    let (line_step, cross_step) = (1isize, stride);
    let (lvl_step, lvl_lookback) = (1usize, b4_stride as usize);
    let active = vmask[0] | vmask[1];
    let mut lvl_pos = lvl_base;
    // Visit every bit position up to and including the highest set bit.
    for n in 0..(32 - active.leading_zeros()) {
        let bit = 1u32 << n;
        if active & bit != 0 {
            let level = match read_lvl(lvl, lvl_pos, lvl_byte_idx) {
                0 if lvl_pos >= lvl_lookback => {
                    read_lvl(lvl, lvl_pos - lvl_lookback, lvl_byte_idx)
                }
                own => own,
            };
            if level != 0 {
                let entry = level as usize;
                let hev_thresh = (level >> 4) as i32;
                let edge_thresh = lut.e[entry] as i32;
                let inner_thresh = lut.i[entry] as i32;
                let wd = if vmask[1] & bit != 0 { 6 } else { 4 };
                loop_filter_4_8bpc(
                    buf,
                    dst_offset,
                    edge_thresh,
                    inner_thresh,
                    hev_thresh,
                    line_step,
                    cross_step,
                    wd,
                    bitdepth_max,
                );
            }
        }
        dst_offset = signed_idx(dst_offset, 4 * line_step);
        lvl_pos += lvl_step;
    }
}
/// `extern "C"` entry point that runs `lpf_h_sb_y_8bpc_inner` on raw pointers.
///
/// Slices are rebuilt from `dst_ptr` and `lvl_ptr` using the lengths estimated
/// by `compute_buf_len_u8` and `compute_lvl_len`. `_dst` and `_lvl` are unused.
///
/// # Safety
///
/// The caller must guarantee that `dst_ptr` is valid for reads and writes and
/// `lvl_ptr` for reads over the estimated lengths, and that AVX2 is available.
///
/// NOTE(review): both slices begin exactly at the incoming pointers and the
/// filter is started at offset 0. Taps on the "p" side use negative offsets,
/// which wrap in `signed_idx` and would fail the slice bounds check, and the
/// previous-level fallback has no entry in front of index 0 to read. Confirm
/// whether this entry point is still expected to be reached.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn lpf_h_sb_y_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl_ptr: *const [u8; 4],
    b4_stride: ptrdiff_t,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    _dst: *const FFISafe<PicOffset>,
    _lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
    let pixel_len = compute_buf_len_u8(stride, w);
    let level_len = compute_lvl_len(b4_stride, w) * 4;
    // SAFETY: validity of `dst_ptr` for `pixel_len` samples is the caller's obligation.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, pixel_len) };
    // SAFETY: validity of `lvl_ptr` for `level_len` bytes is the caller's obligation.
    let levels = unsafe { std::slice::from_raw_parts(lvl_ptr as *const AtomicU8, level_len) };
    lpf_h_sb_y_8bpc_inner(
        pixels,
        0,
        stride,
        mask,
        levels,
        0,
        0,
        b4_stride,
        lut,
        w,
        bitdepth_max,
    );
}
/// `extern "C"` entry point that runs `lpf_v_sb_y_8bpc_inner` on raw pointers.
///
/// Slices are rebuilt from `dst_ptr` and `lvl_ptr` using the lengths estimated
/// by `compute_buf_len_u8` and `compute_lvl_len`. `_dst` and `_lvl` are unused.
///
/// # Safety
///
/// The caller must guarantee that `dst_ptr` is valid for reads and writes and
/// `lvl_ptr` for reads over the estimated lengths, and that AVX2 is available.
///
/// NOTE(review): both slices begin exactly at the incoming pointers and the
/// filter is started at offset 0. Taps on the "p" side use negative offsets,
/// which wrap in `signed_idx` and would fail the slice bounds check, and the
/// previous-level fallback has no entry in front of index 0 to read. Confirm
/// whether this entry point is still expected to be reached.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn lpf_v_sb_y_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl_ptr: *const [u8; 4],
    b4_stride: ptrdiff_t,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    _dst: *const FFISafe<PicOffset>,
    _lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
    let pixel_len = compute_buf_len_u8(stride, w);
    let level_len = compute_lvl_len(b4_stride, w) * 4;
    // SAFETY: validity of `dst_ptr` for `pixel_len` samples is the caller's obligation.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, pixel_len) };
    // SAFETY: validity of `lvl_ptr` for `level_len` bytes is the caller's obligation.
    let levels = unsafe { std::slice::from_raw_parts(lvl_ptr as *const AtomicU8, level_len) };
    lpf_v_sb_y_8bpc_inner(
        pixels,
        0,
        stride,
        mask,
        levels,
        0,
        0,
        b4_stride,
        lut,
        w,
        bitdepth_max,
    );
}
/// `extern "C"` entry point that runs `lpf_h_sb_uv_8bpc_inner` on raw pointers.
///
/// Slices are rebuilt from `dst_ptr` and `lvl_ptr` using the lengths estimated
/// by `compute_buf_len_u8` and `compute_lvl_len`. `_dst` and `_lvl` are unused.
///
/// # Safety
///
/// The caller must guarantee that `dst_ptr` is valid for reads and writes and
/// `lvl_ptr` for reads over the estimated lengths, and that AVX2 is available.
///
/// NOTE(review): both slices begin exactly at the incoming pointers and the
/// filter is started at offset 0. Taps on the "p" side use negative offsets,
/// which wrap in `signed_idx` and would fail the slice bounds check, and the
/// previous-level fallback has no entry in front of index 0 to read. Confirm
/// whether this entry point is still expected to be reached.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn lpf_h_sb_uv_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl_ptr: *const [u8; 4],
    b4_stride: ptrdiff_t,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    _dst: *const FFISafe<PicOffset>,
    _lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
    let pixel_len = compute_buf_len_u8(stride, w);
    let level_len = compute_lvl_len(b4_stride, w) * 4;
    // SAFETY: validity of `dst_ptr` for `pixel_len` samples is the caller's obligation.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, pixel_len) };
    // SAFETY: validity of `lvl_ptr` for `level_len` bytes is the caller's obligation.
    let levels = unsafe { std::slice::from_raw_parts(lvl_ptr as *const AtomicU8, level_len) };
    lpf_h_sb_uv_8bpc_inner(
        pixels,
        0,
        stride,
        mask,
        levels,
        0,
        0,
        b4_stride,
        lut,
        w,
        bitdepth_max,
    );
}
/// `extern "C"` entry point that runs `lpf_v_sb_uv_8bpc_inner` on raw pointers.
///
/// Slices are rebuilt from `dst_ptr` and `lvl_ptr` using the lengths estimated
/// by `compute_buf_len_u8` and `compute_lvl_len`. `_dst` and `_lvl` are unused.
///
/// # Safety
///
/// The caller must guarantee that `dst_ptr` is valid for reads and writes and
/// `lvl_ptr` for reads over the estimated lengths, and that AVX2 is available.
///
/// NOTE(review): both slices begin exactly at the incoming pointers and the
/// filter is started at offset 0. Taps on the "p" side use negative offsets,
/// which wrap in `signed_idx` and would fail the slice bounds check, and the
/// previous-level fallback has no entry in front of index 0 to read. Confirm
/// whether this entry point is still expected to be reached.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn lpf_v_sb_uv_8bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl_ptr: *const [u8; 4],
    b4_stride: ptrdiff_t,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    _dst: *const FFISafe<PicOffset>,
    _lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
    let pixel_len = compute_buf_len_u8(stride, w);
    let level_len = compute_lvl_len(b4_stride, w) * 4;
    // SAFETY: validity of `dst_ptr` for `pixel_len` samples is the caller's obligation.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u8, pixel_len) };
    // SAFETY: validity of `lvl_ptr` for `level_len` bytes is the caller's obligation.
    let levels = unsafe { std::slice::from_raw_parts(lvl_ptr as *const AtomicU8, level_len) };
    lpf_v_sb_uv_8bpc_inner(
        pixels,
        0,
        stride,
        mask,
        levels,
        0,
        0,
        b4_stride,
        lut,
        w,
        bitdepth_max,
    );
}
/// Filters four consecutive lines across one edge of a 16-bit sample buffer.
///
/// `base` is the index in `buf` of the first sample on the "q" side of the
/// edge. Moving by `stridea` selects the next of the four lines; moving by
/// `strideb` steps across the edge, with negative taps on the "p" side.
/// `e`, `i` and `h` are the edge, interior and high-edge-variance thresholds
/// expressed for 8-bit content; they are scaled up here by the extra bit
/// depth, which is derived from `bitdepth_max` (shift 4 above 1023, shift 2
/// above 255, otherwise 0). `wd` selects the filter width (callers in this
/// file pass 4, 6, 8 or 16). Every stored value is clamped to
/// `0..=bitdepth_max`.
///
/// A line whose samples fail the threshold test is left untouched.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn loop_filter_4_16bpc(
    buf: &mut [u16],
    base: usize,
    e: i32,
    i: i32,
    h: i32,
    stridea: isize,
    strideb: isize,
    wd: i32,
    bitdepth_max: i32,
) {
    let bitdepth_min_8: u8 = if bitdepth_max > 1023 {
        4
    } else if bitdepth_max > 255 {
        2
    } else {
        0
    };
    let flat_thresh = 1i32 << bitdepth_min_8;
    let e = e << bitdepth_min_8;
    let i = i << bitdepth_min_8;
    let h = h << bitdepth_min_8;
    // Upper bound applied to the narrow-filter delta before it is shifted down.
    let diff_max: i32 = (128 << bitdepth_min_8) - 1;

    let is_flat = |a: i32, b: i32| (a - b).abs() <= flat_thresh;
    let within = |a: i32, b: i32| (a - b).abs() <= i;

    for line in 0..4isize {
        let origin = signed_idx(base, line * stridea);
        let load = |tap: isize| -> i32 { buf[signed_idx(origin, strideb * tap)] as i32 };

        let p1 = load(-2);
        let p0 = load(-1);
        let q0 = load(0);
        let q1 = load(1);
        // Wider filters inspect more samples; taps that are not read stay at zero.
        let (p2, q2) = if wd > 4 { (load(-3), load(2)) } else { (0, 0) };
        let (p3, q3) = if wd > 6 { (load(-4), load(3)) } else { (0, 0) };

        let edge_ok = within(p1, p0)
            && within(q1, q0)
            && (p0 - q0).abs() * 2 + ((p1 - q1).abs() >> 1) <= e;
        let inner_ok = wd <= 4 || (within(p2, p1) && within(q2, q1));
        let outer_ok = wd <= 6 || (within(p3, p2) && within(q3, q2));
        if !(edge_ok && inner_ok && outer_ok) {
            continue;
        }

        let (p6, p5, p4, q4, q5, q6) = if wd >= 16 {
            (load(-7), load(-6), load(-5), load(4), load(5), load(6))
        } else {
            (0, 0, 0, 0, 0, 0)
        };
        let flat8out = wd >= 16
            && is_flat(p6, p0)
            && is_flat(p5, p0)
            && is_flat(p4, p0)
            && is_flat(q4, q0)
            && is_flat(q5, q0)
            && is_flat(q6, q0);
        let flat8in = wd >= 6
            && is_flat(p2, p0)
            && is_flat(p1, p0)
            && is_flat(q1, q0)
            && is_flat(q2, q0)
            && (wd < 8 || (is_flat(p3, p0) && is_flat(q3, q0)));

        // The store clamps to the valid sample range, so callers below pass
        // unclamped sums. Takes the buffer as an argument so it does not hold
        // a borrow between stores.
        let store = |buf: &mut [u16], tap: isize, val: i32| {
            buf[signed_idx(origin, strideb * tap)] = val.clamp(0, bitdepth_max) as u16;
        };

        if wd >= 16 && flat8out && flat8in {
            // 13-tap smoothing; the weights of every row sum to 16.
            let taps: [(isize, i32); 12] = [
                (-6, (7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4),
                (-5, (5 * p6 + 2 * p5 + 2 * p4 + 2 * p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4),
                (-4, (4 * p6 + p5 + 2 * p4 + 2 * p3 + 2 * p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4),
                (-3, (3 * p6 + p5 + p4 + 2 * p3 + 2 * p2 + 2 * p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4),
                (-2, (2 * p6 + p5 + p4 + p3 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4),
                (-1, (p6 + p5 + p4 + p3 + p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4),
                (0, (p5 + p4 + p3 + p2 + p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4),
                (1, (p4 + p3 + p2 + p1 + p0 + 2 * q0 + 2 * q1 + 2 * q2 + q3 + q4 + q5 + 2 * q6 + 8) >> 4),
                (2, (p3 + p2 + p1 + p0 + q0 + 2 * q1 + 2 * q2 + 2 * q3 + q4 + q5 + 3 * q6 + 8) >> 4),
                (3, (p2 + p1 + p0 + q0 + q1 + 2 * q2 + 2 * q3 + 2 * q4 + q5 + 4 * q6 + 8) >> 4),
                (4, (p1 + p0 + q0 + q1 + q2 + 2 * q3 + 2 * q4 + 2 * q5 + 5 * q6 + 8) >> 4),
                (5, (p0 + q0 + q1 + q2 + q3 + 2 * q4 + 2 * q5 + 7 * q6 + 8) >> 4),
            ];
            for &(tap, val) in &taps {
                store(buf, tap, val);
            }
        } else if wd >= 8 && flat8in {
            // 7-tap smoothing; weights sum to 8.
            let taps: [(isize, i32); 6] = [
                (-3, (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3),
                (-2, (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3),
                (-1, (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3),
                (0, (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3),
                (1, (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3),
                (2, (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3),
            ];
            for &(tap, val) in &taps {
                store(buf, tap, val);
            }
        } else if wd == 6 && flat8in {
            // 5-tap smoothing; weights sum to 8. Same width test as the 8-bit kernel.
            let taps: [(isize, i32); 4] = [
                (-2, (3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3),
                (-1, (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3),
                (0, (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3),
                (1, (p0 + 2 * q0 + 2 * q1 + 3 * q2 + 4) >> 3),
            ];
            for &(tap, val) in &taps {
                store(buf, tap, val);
            }
        } else {
            // Narrow filter. With high edge variance only p0/q0 move and the
            // p1 - q1 difference biases the delta; otherwise p1/q1 are nudged too.
            let hev = (p1 - p0).abs() > h || (q1 - q0).abs() > h;
            let bias = if hev {
                iclip_diff(p1 - q1, bitdepth_min_8)
            } else {
                0
            };
            let delta = iclip_diff(3 * (q0 - p0) + bias, bitdepth_min_8);
            let f1 = cmp::min(delta + 4, diff_max) >> 3;
            let f2 = cmp::min(delta + 3, diff_max) >> 3;
            store(buf, -1, p0 + f2);
            store(buf, 0, q0 - f1);
            if !hev {
                let half = (f1 + 1) >> 1;
                store(buf, -2, p1 + half);
                store(buf, 1, q1 - half);
            }
        }
    }
}
/// Walks the edge mask for the 16-bit Y plane with filter taps one sample
/// apart and successive lines `stride_u16` samples apart.
///
/// Each set bit of `vmask[0] | vmask[1] | vmask[2]` marks a unit of four
/// lines. For such a unit the level byte is read from `lvl`; a zero level
/// falls back to the entry one position earlier. A non-zero level picks the
/// thresholds from `lut`, and the unit is filtered with width 16, 8 or 4
/// according to the highest `vmask` plane holding the bit. After every bit
/// `dst_offset` advances by four lines and the level position by `b4_stride`
/// entries. `_w` is unused.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn lpf_h_sb_y_16bpc_inner(
    buf: &mut [u16],
    mut dst_offset: usize,
    stride_u16: isize,
    vmask: &[u32; 3],
    lvl: &[AtomicU8],
    lvl_base: usize,
    lvl_byte_idx: usize,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    _w: i32,
    bitdepth_max: i32,
) {
    let (line_step, cross_step) = (stride_u16, 1isize);
    let (lvl_step, lvl_lookback) = (b4_stride as usize, 1usize);
    let active = vmask[0] | vmask[1] | vmask[2];
    let mut lvl_pos = lvl_base;
    // Visit every bit position up to and including the highest set bit.
    for n in 0..(32 - active.leading_zeros()) {
        let bit = 1u32 << n;
        if active & bit != 0 {
            let level = match read_lvl(lvl, lvl_pos, lvl_byte_idx) {
                0 if lvl_pos >= lvl_lookback => {
                    read_lvl(lvl, lvl_pos - lvl_lookback, lvl_byte_idx)
                }
                own => own,
            };
            if level != 0 {
                let entry = level as usize;
                let hev_thresh = (level >> 4) as i32;
                let edge_thresh = lut.e[entry] as i32;
                let inner_thresh = lut.i[entry] as i32;
                let wd = if vmask[2] & bit != 0 {
                    16
                } else if vmask[1] & bit != 0 {
                    8
                } else {
                    4
                };
                loop_filter_4_16bpc(
                    buf,
                    dst_offset,
                    edge_thresh,
                    inner_thresh,
                    hev_thresh,
                    line_step,
                    cross_step,
                    wd,
                    bitdepth_max,
                );
            }
        }
        dst_offset = signed_idx(dst_offset, 4 * line_step);
        lvl_pos += lvl_step;
    }
}
/// Walks the edge mask for the 16-bit Y plane with filter taps `stride_u16`
/// samples apart and successive lines one sample apart.
///
/// Each set bit of `vmask[0] | vmask[1] | vmask[2]` marks a unit of four
/// lines. For such a unit the level byte is read from `lvl`; a zero level
/// falls back to the entry `b4_stride` positions earlier. A non-zero level
/// picks the thresholds from `lut`, and the unit is filtered with width 16, 8
/// or 4 according to the highest `vmask` plane holding the bit. After every
/// bit `dst_offset` advances by four samples and the level position by one
/// entry. `_w` is unused.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn lpf_v_sb_y_16bpc_inner(
    buf: &mut [u16],
    mut dst_offset: usize,
    stride_u16: isize,
    vmask: &[u32; 3],
    lvl: &[AtomicU8],
    lvl_base: usize,
    lvl_byte_idx: usize,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    _w: i32,
    bitdepth_max: i32,
) {
    let (line_step, cross_step) = (1isize, stride_u16);
    let (lvl_step, lvl_lookback) = (1usize, b4_stride as usize);
    let active = vmask[0] | vmask[1] | vmask[2];
    let mut lvl_pos = lvl_base;
    // Visit every bit position up to and including the highest set bit.
    for n in 0..(32 - active.leading_zeros()) {
        let bit = 1u32 << n;
        if active & bit != 0 {
            let level = match read_lvl(lvl, lvl_pos, lvl_byte_idx) {
                0 if lvl_pos >= lvl_lookback => {
                    read_lvl(lvl, lvl_pos - lvl_lookback, lvl_byte_idx)
                }
                own => own,
            };
            if level != 0 {
                let entry = level as usize;
                let hev_thresh = (level >> 4) as i32;
                let edge_thresh = lut.e[entry] as i32;
                let inner_thresh = lut.i[entry] as i32;
                let wd = if vmask[2] & bit != 0 {
                    16
                } else if vmask[1] & bit != 0 {
                    8
                } else {
                    4
                };
                loop_filter_4_16bpc(
                    buf,
                    dst_offset,
                    edge_thresh,
                    inner_thresh,
                    hev_thresh,
                    line_step,
                    cross_step,
                    wd,
                    bitdepth_max,
                );
            }
        }
        dst_offset = signed_idx(dst_offset, 4 * line_step);
        lvl_pos += lvl_step;
    }
}
/// Walks the edge mask for a 16-bit chroma plane with filter taps one sample
/// apart and successive lines `stride_u16` samples apart.
///
/// Only `vmask[0]` and `vmask[1]` are consulted. Each set bit marks a unit of
/// four lines. For such a unit the level byte is read from `lvl`; a zero level
/// falls back to the entry one position earlier. A non-zero level picks the
/// thresholds from `lut`, and the unit is filtered with width 6 when the bit
/// is set in `vmask[1]`, otherwise 4. After every bit `dst_offset` advances by
/// four lines and the level position by `b4_stride` entries. `_w` is unused.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn lpf_h_sb_uv_16bpc_inner(
    buf: &mut [u16],
    mut dst_offset: usize,
    stride_u16: isize,
    vmask: &[u32; 3],
    lvl: &[AtomicU8],
    lvl_base: usize,
    lvl_byte_idx: usize,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    _w: i32,
    bitdepth_max: i32,
) {
    let (line_step, cross_step) = (stride_u16, 1isize);
    let (lvl_step, lvl_lookback) = (b4_stride as usize, 1usize);
    let active = vmask[0] | vmask[1];
    let mut lvl_pos = lvl_base;
    // Visit every bit position up to and including the highest set bit.
    for n in 0..(32 - active.leading_zeros()) {
        let bit = 1u32 << n;
        if active & bit != 0 {
            let level = match read_lvl(lvl, lvl_pos, lvl_byte_idx) {
                0 if lvl_pos >= lvl_lookback => {
                    read_lvl(lvl, lvl_pos - lvl_lookback, lvl_byte_idx)
                }
                own => own,
            };
            if level != 0 {
                let entry = level as usize;
                let hev_thresh = (level >> 4) as i32;
                let edge_thresh = lut.e[entry] as i32;
                let inner_thresh = lut.i[entry] as i32;
                let wd = if vmask[1] & bit != 0 { 6 } else { 4 };
                loop_filter_4_16bpc(
                    buf,
                    dst_offset,
                    edge_thresh,
                    inner_thresh,
                    hev_thresh,
                    line_step,
                    cross_step,
                    wd,
                    bitdepth_max,
                );
            }
        }
        dst_offset = signed_idx(dst_offset, 4 * line_step);
        lvl_pos += lvl_step;
    }
}
/// Walks the edge mask for a 16-bit chroma plane with filter taps
/// `stride_u16` samples apart and successive lines one sample apart.
///
/// Only `vmask[0]` and `vmask[1]` are consulted. Each set bit marks a unit of
/// four lines. For such a unit the level byte is read from `lvl`; a zero level
/// falls back to the entry `b4_stride` positions earlier. A non-zero level
/// picks the thresholds from `lut`, and the unit is filtered with width 6 when
/// the bit is set in `vmask[1]`, otherwise 4. After every bit `dst_offset`
/// advances by four samples and the level position by one entry. `_w` is
/// unused.
#[cfg(any(target_arch = "x86_64", target_arch = "wasm32"))]
fn lpf_v_sb_uv_16bpc_inner(
    buf: &mut [u16],
    mut dst_offset: usize,
    stride_u16: isize,
    vmask: &[u32; 3],
    lvl: &[AtomicU8],
    lvl_base: usize,
    lvl_byte_idx: usize,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    _w: i32,
    bitdepth_max: i32,
) {
    let (line_step, cross_step) = (1isize, stride_u16);
    let (lvl_step, lvl_lookback) = (1usize, b4_stride as usize);
    let active = vmask[0] | vmask[1];
    let mut lvl_pos = lvl_base;
    // Visit every bit position up to and including the highest set bit.
    for n in 0..(32 - active.leading_zeros()) {
        let bit = 1u32 << n;
        if active & bit != 0 {
            let level = match read_lvl(lvl, lvl_pos, lvl_byte_idx) {
                0 if lvl_pos >= lvl_lookback => {
                    read_lvl(lvl, lvl_pos - lvl_lookback, lvl_byte_idx)
                }
                own => own,
            };
            if level != 0 {
                let entry = level as usize;
                let hev_thresh = (level >> 4) as i32;
                let edge_thresh = lut.e[entry] as i32;
                let inner_thresh = lut.i[entry] as i32;
                let wd = if vmask[1] & bit != 0 { 6 } else { 4 };
                loop_filter_4_16bpc(
                    buf,
                    dst_offset,
                    edge_thresh,
                    inner_thresh,
                    hev_thresh,
                    line_step,
                    cross_step,
                    wd,
                    bitdepth_max,
                );
            }
        }
        dst_offset = signed_idx(dst_offset, 4 * line_step);
        lvl_pos += lvl_step;
    }
}
/// `extern "C"` entry point that runs `lpf_h_sb_y_16bpc_inner` on raw pointers.
///
/// Slices are rebuilt from `dst_ptr` and `lvl_ptr` using the lengths estimated
/// by `compute_buf_len_u16` and `compute_lvl_len`. `stride` is halved before
/// it is handed on as a `u16` element stride. `_dst` and `_lvl` are unused.
///
/// # Safety
///
/// The caller must guarantee that `dst_ptr` is valid for reads and writes and
/// `lvl_ptr` for reads over the estimated lengths, and that AVX2 is available.
///
/// NOTE(review): both slices begin exactly at the incoming pointers and the
/// filter is started at offset 0. Taps on the "p" side use negative offsets,
/// which wrap in `signed_idx` and would fail the slice bounds check, and the
/// previous-level fallback has no entry in front of index 0 to read. Confirm
/// whether this entry point is still expected to be reached.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn lpf_h_sb_y_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl_ptr: *const [u8; 4],
    b4_stride: ptrdiff_t,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    _dst: *const FFISafe<PicOffset>,
    _lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
    let pixel_len = compute_buf_len_u16(stride, w);
    let level_len = compute_lvl_len(b4_stride, w) * 4;
    // SAFETY: validity of `dst_ptr` for `pixel_len` samples is the caller's obligation.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, pixel_len) };
    // SAFETY: validity of `lvl_ptr` for `level_len` bytes is the caller's obligation.
    let levels = unsafe { std::slice::from_raw_parts(lvl_ptr as *const AtomicU8, level_len) };
    lpf_h_sb_y_16bpc_inner(
        pixels,
        0,
        stride / 2,
        mask,
        levels,
        0,
        0,
        b4_stride,
        lut,
        w,
        bitdepth_max,
    );
}
/// `extern "C"` entry point that runs `lpf_v_sb_y_16bpc_inner` on raw pointers.
///
/// Slices are rebuilt from `dst_ptr` and `lvl_ptr` using the lengths estimated
/// by `compute_buf_len_u16` and `compute_lvl_len`. `stride` is halved before
/// it is handed on as a `u16` element stride. `_dst` and `_lvl` are unused.
///
/// # Safety
///
/// The caller must guarantee that `dst_ptr` is valid for reads and writes and
/// `lvl_ptr` for reads over the estimated lengths, and that AVX2 is available.
///
/// NOTE(review): both slices begin exactly at the incoming pointers and the
/// filter is started at offset 0. Taps on the "p" side use negative offsets,
/// which wrap in `signed_idx` and would fail the slice bounds check, and the
/// previous-level fallback has no entry in front of index 0 to read. Confirm
/// whether this entry point is still expected to be reached.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn lpf_v_sb_y_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl_ptr: *const [u8; 4],
    b4_stride: ptrdiff_t,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    _dst: *const FFISafe<PicOffset>,
    _lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
    let pixel_len = compute_buf_len_u16(stride, w);
    let level_len = compute_lvl_len(b4_stride, w) * 4;
    // SAFETY: validity of `dst_ptr` for `pixel_len` samples is the caller's obligation.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, pixel_len) };
    // SAFETY: validity of `lvl_ptr` for `level_len` bytes is the caller's obligation.
    let levels = unsafe { std::slice::from_raw_parts(lvl_ptr as *const AtomicU8, level_len) };
    lpf_v_sb_y_16bpc_inner(
        pixels,
        0,
        stride / 2,
        mask,
        levels,
        0,
        0,
        b4_stride,
        lut,
        w,
        bitdepth_max,
    );
}
/// `extern "C"` entry point that runs `lpf_h_sb_uv_16bpc_inner` on raw pointers.
///
/// Slices are rebuilt from `dst_ptr` and `lvl_ptr` using the lengths estimated
/// by `compute_buf_len_u16` and `compute_lvl_len`. `stride` is halved before
/// it is handed on as a `u16` element stride. `_dst` and `_lvl` are unused.
///
/// # Safety
///
/// The caller must guarantee that `dst_ptr` is valid for reads and writes and
/// `lvl_ptr` for reads over the estimated lengths, and that AVX2 is available.
///
/// NOTE(review): both slices begin exactly at the incoming pointers and the
/// filter is started at offset 0. Taps on the "p" side use negative offsets,
/// which wrap in `signed_idx` and would fail the slice bounds check, and the
/// previous-level fallback has no entry in front of index 0 to read. Confirm
/// whether this entry point is still expected to be reached.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn lpf_h_sb_uv_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl_ptr: *const [u8; 4],
    b4_stride: ptrdiff_t,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    _dst: *const FFISafe<PicOffset>,
    _lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
    let pixel_len = compute_buf_len_u16(stride, w);
    let level_len = compute_lvl_len(b4_stride, w) * 4;
    // SAFETY: validity of `dst_ptr` for `pixel_len` samples is the caller's obligation.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, pixel_len) };
    // SAFETY: validity of `lvl_ptr` for `level_len` bytes is the caller's obligation.
    let levels = unsafe { std::slice::from_raw_parts(lvl_ptr as *const AtomicU8, level_len) };
    lpf_h_sb_uv_16bpc_inner(
        pixels,
        0,
        stride / 2,
        mask,
        levels,
        0,
        0,
        b4_stride,
        lut,
        w,
        bitdepth_max,
    );
}
/// `extern "C"` entry point that runs `lpf_v_sb_uv_16bpc_inner` on raw pointers.
///
/// Slices are rebuilt from `dst_ptr` and `lvl_ptr` using the lengths estimated
/// by `compute_buf_len_u16` and `compute_lvl_len`. `stride` is halved before
/// it is handed on as a `u16` element stride. `_dst` and `_lvl` are unused.
///
/// # Safety
///
/// The caller must guarantee that `dst_ptr` is valid for reads and writes and
/// `lvl_ptr` for reads over the estimated lengths, and that AVX2 is available.
///
/// NOTE(review): both slices begin exactly at the incoming pointers and the
/// filter is started at offset 0. Taps on the "p" side use negative offsets,
/// which wrap in `signed_idx` and would fail the slice bounds check, and the
/// previous-level fallback has no entry in front of index 0 to read. Confirm
/// whether this entry point is still expected to be reached.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe extern "C" fn lpf_v_sb_uv_16bpc_avx2(
    dst_ptr: *mut DynPixel,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl_ptr: *const [u8; 4],
    b4_stride: ptrdiff_t,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    _dst: *const FFISafe<PicOffset>,
    _lvl: *const FFISafe<WithOffset<&[AtomicU8]>>,
) {
    let pixel_len = compute_buf_len_u16(stride, w);
    let level_len = compute_lvl_len(b4_stride, w) * 4;
    // SAFETY: validity of `dst_ptr` for `pixel_len` samples is the caller's obligation.
    let pixels = unsafe { std::slice::from_raw_parts_mut(dst_ptr as *mut u16, pixel_len) };
    // SAFETY: validity of `lvl_ptr` for `level_len` bytes is the caller's obligation.
    let levels = unsafe { std::slice::from_raw_parts(lvl_ptr as *const AtomicU8, level_len) };
    lpf_v_sb_uv_16bpc_inner(
        pixels,
        0,
        stride / 2,
        mask,
        levels,
        0,
        0,
        b4_stride,
        lut,
        w,
        bitdepth_max,
    );
}
/// Estimates how many `u8` samples the FFI wrappers expose behind a
/// destination pointer: `|stride| * 128 + 8`. `_w` is ignored.
///
/// NOTE(review): presumably 128 lines plus a small tail; confirm against the
/// caller's allocation.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
fn compute_buf_len_u8(stride: isize, _w: i32) -> usize {
    let line_len = stride.unsigned_abs();
    line_len * 128 + 8
}
/// Estimates how many `u16` samples the FFI wrappers expose behind a
/// destination pointer: `(|stride| / 2) * 128 + 8`. `_w` is ignored.
///
/// NOTE(review): the halving suggests `stride` arrives in bytes; confirm
/// against the caller.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
fn compute_buf_len_u16(stride: isize, _w: i32) -> usize {
    let line_len = stride.unsigned_abs() / 2;
    line_len * 128 + 8
}
/// Estimates the number of 4-byte level entries the FFI wrappers expose behind
/// a level pointer: `|b4_stride| * 132 + 4`. `_w` is ignored.
#[cfg(all(feature = "asm", target_arch = "x86_64"))]
fn compute_lvl_len(b4_stride: isize, _w: i32) -> usize {
    let entries_per_row = b4_stride.unsigned_abs();
    entries_per_row * 132 + 4
}
#[cfg(target_arch = "x86_64")]
pub fn loopfilter_sb_dispatch<BD: BitDepth>(
dst: PicOffset,
stride: ptrdiff_t,
mask: &[u32; 3],
lvl: WithOffset<&[AtomicU8]>,
b4_stride: isize,
lut: &Align16<Av1FilterLUT>,
w: c_int,
bitdepth_max: c_int,
is_y: bool,
is_v: bool,
) -> bool {
use crate::include::common::bitdepth::BPC;
let Some(_token) = crate::src::cpu::summon_avx2() else {
return false;
};
assert!(lvl.offset <= lvl.data.len());
let b4_strideb_entries = if !is_v {
1usize
} else {
b4_stride.unsigned_abs() as usize
};
let lvl_lookback_bytes = b4_strideb_entries * 4;
let lvl_start = lvl.offset.saturating_sub(lvl_lookback_bytes) & !3;
let lvl_slice = &lvl.data[lvl_start..];
let lvl_byte_idx = lvl.offset % 4;
let lvl_base = (lvl.offset - lvl_byte_idx - lvl_start) / 4;
let vm = mask[0] | mask[1] | mask[2];
if vm == 0 {
return true; }
let max_iter = 32 - vm.leading_zeros() as usize;
match BD::BPC {
BPC::BPC8 => {
use crate::include::common::bitdepth::BitDepth8;
let byte_stride = stride.unsigned_abs() as usize;
let (reach_before, reach_after) = if !is_v {
(7, (max_iter * 4 - 1) * byte_stride + 16)
} else {
(7 * byte_stride, max_iter * 4 - 1 + 16 * byte_stride)
};
let buf_pixel_len = dst.data.pixel_len::<BitDepth8>();
if dst.offset < reach_before || dst.offset.saturating_add(reach_after) > buf_pixel_len {
return false;
}
let use_compact = crate::include::dav1d::picture::tile_threading_active();
let start_pixel = dst.offset - reach_before;
let total_pixels = (reach_before + reach_after).min(buf_pixel_len - start_pixel);
if use_compact {
let (cw, ch, cstart, cbase) = if !is_v {
(7 + 16, max_iter * 4, dst.offset - 7, 7usize)
} else {
let cw = max_iter * 4;
(
cw,
7 + 16, dst.offset.saturating_sub(7 * byte_stride),
7 * cw,
)
};
let lpf_pic = crate::src::with_offset::WithOffset {
data: dst.data,
offset: cstart,
};
let (mut cb, cs) = lpf_pic.compact_read_per_row::<BitDepth8>(cw, ch);
let buf: &mut [u8] = &mut cb;
let base = cbase;
let stride_i = cs as isize;
match (is_y, is_v) {
(true, false) => lpf_h_sb_y_8bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(true, true) => lpf_v_sb_y_8bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(false, false) => lpf_h_sb_uv_8bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(false, true) => lpf_v_sb_uv_8bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
}
lpf_pic.compact_write_back_per_row::<BitDepth8>(cw, ch, &cb);
} else {
let mut guard = dst
.data
.slice_mut::<BitDepth8, _>((start_pixel.., ..total_pixels));
let buf: &mut [u8] = &mut *guard;
let base = reach_before;
let stride_i = stride as isize;
match (is_y, is_v) {
(true, false) => lpf_h_sb_y_8bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(true, true) => lpf_v_sb_y_8bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(false, false) => lpf_h_sb_uv_8bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(false, true) => lpf_v_sb_uv_8bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
}
}
}
BPC::BPC16 => {
use crate::include::common::bitdepth::BitDepth16;
let u16_stride = (stride / 2).unsigned_abs() as usize;
let (reach_before, reach_after) = if !is_v {
(7, (max_iter * 4 - 1) * u16_stride + 16)
} else {
(7 * u16_stride, max_iter * 4 - 1 + 16 * u16_stride)
};
let buf_pixel_len = dst.data.pixel_len::<BitDepth16>();
if dst.offset < reach_before || dst.offset.saturating_add(reach_after) > buf_pixel_len {
return false;
}
let use_compact = crate::include::dav1d::picture::tile_threading_active();
if use_compact {
let (compact_w, compact_h, start_pixel, base) = if !is_v {
let w = 7 + 16; let h = max_iter * 4;
let start = dst.offset - 7;
(w, h, start, 7usize)
} else {
let w = max_iter * 4;
let h = 7 + 16; let start = dst.offset.saturating_sub(7 * u16_stride);
(w, h, start, 7 * w)
};
let lpf_pic = crate::src::with_offset::WithOffset {
data: dst.data,
offset: start_pixel,
};
let (mut compact, compact_stride) =
lpf_pic.compact_read_per_row::<BitDepth16>(compact_w, compact_h);
let buf: &mut [u16] =
zerocopy::FromBytes::mut_from_bytes(&mut compact[..]).unwrap();
let stride_i = (compact_stride / 2) as isize;
match (is_y, is_v) {
(true, false) => lpf_h_sb_y_16bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(true, true) => lpf_v_sb_y_16bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(false, false) => lpf_h_sb_uv_16bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(false, true) => lpf_v_sb_uv_16bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
}
lpf_pic.compact_write_back_per_row::<BitDepth16>(compact_w, compact_h, &compact);
} else {
let start_pixel = dst.offset - reach_before;
let total_pixels = (reach_before + reach_after).min(buf_pixel_len - start_pixel);
let mut guard = dst
.data
.slice_mut::<BitDepth16, _>((start_pixel.., ..total_pixels));
let buf: &mut [u16] = &mut *guard;
let base = reach_before;
let stride_i = stride as isize / 2;
match (is_y, is_v) {
(true, false) => lpf_h_sb_y_16bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(true, true) => lpf_v_sb_y_16bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(false, false) => lpf_h_sb_uv_16bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
(false, true) => lpf_v_sb_uv_16bpc_inner(
buf,
base,
stride_i,
mask,
lvl_slice,
lvl_base,
lvl_byte_idx,
b4_stride,
lut,
w,
bitdepth_max,
),
}
}
}
}
true
}
/// Runs the slice-based loop filter for one superblock edge run on wasm32.
///
/// Returns `false` without touching the picture when the window the filter
/// may touch does not fit inside the plane; the caller is then expected to use
/// another implementation. Returns `true` when the mask is empty or after
/// filtering.
///
/// `is_y` selects the Y-plane kernels over the chroma ones and `is_v` selects
/// the variants whose taps are `stride` apart. The plane is filtered in place
/// through a mutable slice guard.
#[cfg(target_arch = "wasm32")]
pub fn loopfilter_sb_dispatch<BD: BitDepth>(
    dst: PicOffset,
    stride: ptrdiff_t,
    mask: &[u32; 3],
    lvl: WithOffset<&[AtomicU8]>,
    b4_stride: isize,
    lut: &Align16<Av1FilterLUT>,
    w: c_int,
    bitdepth_max: c_int,
    is_y: bool,
    is_v: bool,
) -> bool {
    use crate::include::common::bitdepth::BPC;

    // Window budget around an edge: 7 samples in front of it and a 16-sample
    // allowance from the edge onwards.
    const TAPS_BEFORE: usize = 7;
    const TAPS_AFTER: usize = 16;

    assert!(lvl.offset <= lvl.data.len());

    // Start the level window one look-back entry early, rounded down to a
    // whole 4-byte entry, so the previous-level fallback stays inside it.
    let lookback_entries = if is_v { b4_stride.unsigned_abs() } else { 1usize };
    let lvl_start = lvl.offset.saturating_sub(lookback_entries * 4) & !3;
    let lvl_slice = &lvl.data[lvl_start..];
    let lvl_byte_idx = lvl.offset % 4;
    let lvl_base = (lvl.offset - lvl_byte_idx - lvl_start) / 4;

    let vm = mask[0] | mask[1] | mask[2];
    if vm == 0 {
        return true;
    }
    // Number of 4-line units up to the highest set mask bit.
    let max_iter = 32 - vm.leading_zeros() as usize;
    let lines = max_iter * 4;

    match BD::BPC {
        BPC::BPC8 => {
            use crate::include::common::bitdepth::BitDepth8;
            let px_stride = stride.unsigned_abs();
            let (reach_before, reach_after) = if is_v {
                (TAPS_BEFORE * px_stride, lines - 1 + TAPS_AFTER * px_stride)
            } else {
                (TAPS_BEFORE, (lines - 1) * px_stride + TAPS_AFTER)
            };
            let plane_len = dst.data.pixel_len::<BitDepth8>();
            if dst.offset < reach_before || dst.offset.saturating_add(reach_after) > plane_len {
                return false;
            }
            // Safe to subtract: the bounds check above guarantees offset >= reach_before.
            let window_start = dst.offset - reach_before;
            let window_len = (reach_before + reach_after).min(plane_len - window_start);
            let mut buf_guard = dst
                .data
                .slice_mut::<BitDepth8, _>((window_start.., ..window_len));
            let buf: &mut [u8] = &mut *buf_guard;
            let filter = |buf: &mut [u8], base: usize, stride_i: isize| match (is_y, is_v) {
                (true, false) => lpf_h_sb_y_8bpc_inner(
                    buf, base, stride_i, mask, lvl_slice, lvl_base, lvl_byte_idx, b4_stride, lut,
                    w, bitdepth_max,
                ),
                (true, true) => lpf_v_sb_y_8bpc_inner(
                    buf, base, stride_i, mask, lvl_slice, lvl_base, lvl_byte_idx, b4_stride, lut,
                    w, bitdepth_max,
                ),
                (false, false) => lpf_h_sb_uv_8bpc_inner(
                    buf, base, stride_i, mask, lvl_slice, lvl_base, lvl_byte_idx, b4_stride, lut,
                    w, bitdepth_max,
                ),
                (false, true) => lpf_v_sb_uv_8bpc_inner(
                    buf, base, stride_i, mask, lvl_slice, lvl_base, lvl_byte_idx, b4_stride, lut,
                    w, bitdepth_max,
                ),
            };
            filter(buf, reach_before, stride as isize);
        }
        BPC::BPC16 => {
            use crate::include::common::bitdepth::BitDepth16;
            // `stride` is halved to obtain the stride in `u16` samples.
            let px_stride = (stride / 2).unsigned_abs();
            let (reach_before, reach_after) = if is_v {
                (TAPS_BEFORE * px_stride, lines - 1 + TAPS_AFTER * px_stride)
            } else {
                (TAPS_BEFORE, (lines - 1) * px_stride + TAPS_AFTER)
            };
            let plane_len = dst.data.pixel_len::<BitDepth16>();
            if dst.offset < reach_before || dst.offset.saturating_add(reach_after) > plane_len {
                return false;
            }
            // Safe to subtract: the bounds check above guarantees offset >= reach_before.
            let window_start = dst.offset - reach_before;
            let window_len = (reach_before + reach_after).min(plane_len - window_start);
            let mut buf_guard = dst
                .data
                .slice_mut::<BitDepth16, _>((window_start.., ..window_len));
            let buf: &mut [u16] = &mut *buf_guard;
            let filter = |buf: &mut [u16], base: usize, stride_i: isize| match (is_y, is_v) {
                (true, false) => lpf_h_sb_y_16bpc_inner(
                    buf, base, stride_i, mask, lvl_slice, lvl_base, lvl_byte_idx, b4_stride, lut,
                    w, bitdepth_max,
                ),
                (true, true) => lpf_v_sb_y_16bpc_inner(
                    buf, base, stride_i, mask, lvl_slice, lvl_base, lvl_byte_idx, b4_stride, lut,
                    w, bitdepth_max,
                ),
                (false, false) => lpf_h_sb_uv_16bpc_inner(
                    buf, base, stride_i, mask, lvl_slice, lvl_base, lvl_byte_idx, b4_stride, lut,
                    w, bitdepth_max,
                ),
                (false, true) => lpf_v_sb_uv_16bpc_inner(
                    buf, base, stride_i, mask, lvl_slice, lvl_base, lvl_byte_idx, b4_stride, lut,
                    w, bitdepth_max,
                ),
            };
            filter(buf, reach_before, stride as isize / 2);
        }
    }
    true
}
#[cfg(test)]
mod tests {
    use super::*;

    /// 8-bit content: the delta range is `[-128, 127]`.
    #[test]
    fn test_iclip_diff() {
        // Values inside the range pass through unchanged.
        assert_eq!(iclip_diff(100, 0), 100);
        assert_eq!(iclip_diff(-100, 0), -100);
        // Values outside are pinned to the nearest bound.
        assert_eq!(iclip_diff(200, 0), 127);
        assert_eq!(iclip_diff(-200, 0), -128);
        // Exact boundaries and their immediate neighbours.
        assert_eq!(iclip_diff(127, 0), 127);
        assert_eq!(iclip_diff(128, 0), 127);
        assert_eq!(iclip_diff(-128, 0), -128);
        assert_eq!(iclip_diff(-129, 0), -128);
    }

    /// The 16-bit kernel calls `iclip_diff` with shifts 2 and 4, which widen
    /// the range to `[-512, 511]` and `[-2048, 2047]`.
    #[test]
    fn test_iclip_diff_high_bitdepth() {
        assert_eq!(iclip_diff(511, 2), 511);
        assert_eq!(iclip_diff(512, 2), 511);
        assert_eq!(iclip_diff(-512, 2), -512);
        assert_eq!(iclip_diff(-513, 2), -512);
        assert_eq!(iclip_diff(2047, 4), 2047);
        assert_eq!(iclip_diff(5000, 4), 2047);
        assert_eq!(iclip_diff(-2048, 4), -2048);
        assert_eq!(iclip_diff(-5000, 4), -2048);
    }

    /// Negative displacements move backwards; going below zero wraps to a
    /// huge index that later bounds checks reject.
    #[test]
    fn test_signed_idx() {
        assert_eq!(signed_idx(10, -3), 7);
        assert_eq!(signed_idx(0, 5), 5);
        assert_eq!(signed_idx(0, -1), usize::MAX);
    }

    /// Level entries are 4 bytes wide; reads past the end yield 0.
    #[test]
    fn test_read_lvl() {
        let lvl: Vec<AtomicU8> = (0u8..8).map(AtomicU8::new).collect();
        assert_eq!(read_lvl(&lvl, 0, 1), 1);
        assert_eq!(read_lvl(&lvl, 1, 3), 7);
        assert_eq!(read_lvl(&lvl, 2, 0), 0);
    }
}