cfg_if::cfg_if! {
if #[cfg(nasm_x86_64)] {
pub use crate::asm::x86::dist::*;
} else if #[cfg(asm_neon)] {
pub use crate::asm::aarch64::dist::*;
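// SATD and weighted SSE still come from the Rust fallbacks on aarch64.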
pub use self::rust::get_satd;
pub use self::rust::get_weighted_sse;
} else {
pub use self::rust::*;
}
}
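/// Pure-Rust fallback implementations of the distortion functions.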
pub(crate) mod rust {
use crate::cpu_features::CpuFeatureLevel;
use crate::partition::BlockSize;
use crate::tiling::*;
use crate::util::*;
use crate::encoder::IMPORTANCE_BLOCK_SIZE;
use crate::rdo::{DistortionScale, RawDistortion};
use simd_helpers::cold_for_target_arch;
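/// Sum of absolute differences (SAD) between a source block and a reference
/// block of the given block size.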
#[cold_for_target_arch("x86_64")]
pub fn get_sad<T: Pixel>(
plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>,
bsize: BlockSize, _bit_depth: usize, _cpu: CpuFeatureLevel,
) -> u32 {
let blk_w = bsize.width();
let blk_h = bsize.height();
let mut sum = 0u32;
for (slice_org, slice_ref) in
plane_org.rows_iter().take(blk_h).zip(plane_ref.rows_iter())
{
sum += slice_org
.iter()
.take(blk_w)
.zip(slice_ref)
.map(|(&a, &b)| (i32::cast_from(a) - i32::cast_from(b)).abs() as u32)
.sum::<u32>();
}
sum
}
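/// Returns the sum and difference of the two inputs (a single butterfly stage).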
#[inline(always)]
const fn butterfly(a: i32, b: i32) -> (i32, i32) {
((a + b), (a - b))
}
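/// In-place 4-point Hadamard transform over `n` lanes of `data`; `stride0`
/// steps between lanes and `stride1` between the four elements of a lane.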
#[inline(always)]
#[allow(clippy::identity_op, clippy::erasing_op)]
fn hadamard4_1d(data: &mut [i32], n: usize, stride0: usize, stride1: usize) {
for i in 0..n {
let sub: &mut [i32] = &mut data[i * stride0..];
let (a0, a1) = butterfly(sub[0 * stride1], sub[1 * stride1]);
let (a2, a3) = butterfly(sub[2 * stride1], sub[3 * stride1]);
let (b0, b2) = butterfly(a0, a2);
let (b1, b3) = butterfly(a1, a3);
sub[0 * stride1] = b0;
sub[1 * stride1] = b1;
sub[2 * stride1] = b2;
sub[3 * stride1] = b3;
}
}
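/// In-place 8-point Hadamard transform over `n` lanes of `data`; `stride0`
/// steps between lanes and `stride1` between the eight elements of a lane.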
#[inline(always)]
#[allow(clippy::identity_op, clippy::erasing_op)]
fn hadamard8_1d(data: &mut [i32], n: usize, stride0: usize, stride1: usize) {
for i in 0..n {
let sub: &mut [i32] = &mut data[i * stride0..];
let (a0, a1) = butterfly(sub[0 * stride1], sub[1 * stride1]);
let (a2, a3) = butterfly(sub[2 * stride1], sub[3 * stride1]);
let (a4, a5) = butterfly(sub[4 * stride1], sub[5 * stride1]);
let (a6, a7) = butterfly(sub[6 * stride1], sub[7 * stride1]);
let (b0, b2) = butterfly(a0, a2);
let (b1, b3) = butterfly(a1, a3);
let (b4, b6) = butterfly(a4, a6);
let (b5, b7) = butterfly(a5, a7);
let (c0, c4) = butterfly(b0, b4);
let (c1, c5) = butterfly(b1, b5);
let (c2, c6) = butterfly(b2, b6);
let (c3, c7) = butterfly(b3, b7);
sub[0 * stride1] = c0;
sub[1 * stride1] = c1;
sub[2 * stride1] = c2;
sub[3 * stride1] = c3;
sub[4 * stride1] = c4;
sub[5 * stride1] = c5;
sub[6 * stride1] = c6;
sub[7 * stride1] = c7;
}
}
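/// In-place 2-D Hadamard transform of a `w`-by-`h` block stored row-major in
/// `data` (only square 4x4 and 8x8 blocks are used here).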
#[inline(always)]
fn hadamard2d(data: &mut [i32], (w, h): (usize, usize)) {
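// Vertical pass: transform each of the `w` columns.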
let vert_func = if h == 4 { hadamard4_1d } else { hadamard8_1d };
vert_func(data, w, 1, h);
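// Horizontal pass: transform each of the `h` rows.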
let horz_func = if w == 4 { hadamard4_1d } else { hadamard8_1d };
horz_func(data, h, w, 1);
}
fn hadamard4x4(data: &mut [i32]) {
hadamard2d(data, (4, 4));
}
fn hadamard8x8(data: &mut [i32]) {
hadamard2d(data, (8, 8));
}
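/// Sum of absolute transformed differences (SATD) between a source block and
/// a reference block. The block is tiled with 4x4 or 8x8 Hadamard transforms
/// of the pixel differences, and the result is normalized by the transform size.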
#[cold_for_target_arch("x86_64")]
pub fn get_satd<T: Pixel>(
plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>,
bsize: BlockSize, _bit_depth: usize, _cpu: CpuFeatureLevel,
) -> u32 {
let blk_w = bsize.width();
let blk_h = bsize.height();
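// Transform size: 4x4 when the block's smaller dimension is 4, otherwise 8x8.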
let size: usize = blk_w.min(blk_h).min(8);
let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 };
let mut sum = 0u64;
for chunk_y in (0..blk_h).step_by(size) {
for chunk_x in (0..blk_w).step_by(size) {
let chunk_area: Area = Area::Rect {
x: chunk_x as isize,
y: chunk_y as isize,
width: size,
height: size,
};
let chunk_org = plane_org.subregion(chunk_area);
let chunk_ref = plane_ref.subregion(chunk_area);
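// Compute the pixel differences for this chunk into a scratch buffer.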
let buf: &mut [i32] = &mut [0; 8 * 8][..size * size];
for (row_diff, (row_org, row_ref)) in buf
.chunks_mut(size)
.zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter()))
{
for (diff, (a, b)) in
row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter()))
{
*diff = i32::cast_from(*a) - i32::cast_from(*b);
}
}
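// Transform the differences in place and accumulate the absolute coefficients.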
tx2d(buf);
sum += buf.iter().map(|a| a.abs() as u64).sum::<u64>();
}
}
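// Normalize by the transform size, rounding to nearest.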
let ln = msb(size as i32) as u64;
((sum + ((1 << ln) >> 1)) >> ln) as u32
}
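/// Sum of squared errors between `src1` and `src2`, weighted by a per-chunk
/// distortion scale taken from `scale` (one value per chunk, laid out with
/// `scale_stride`).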
#[inline(never)]
pub fn get_weighted_sse<T: Pixel>(
src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32],
scale_stride: usize, w: usize, h: usize, _bit_depth: usize,
_cpu: CpuFeatureLevel,
) -> u64 {
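// Accumulate SSE over square chunks of half the importance block size; each
// chunk is weighted by its own distortion scale.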
let chunk_size = IMPORTANCE_BLOCK_SIZE >> 1;
let mut sse: u64 = 0;
for block_y in 0..(h + chunk_size - 1) / chunk_size {
for block_x in 0..(w + chunk_size - 1) / chunk_size {
let mut block_sse: u32 = 0;
for j in 0..chunk_size {
let s1 = &src1[block_y * chunk_size + j]
[block_x * chunk_size..((block_x + 1) * chunk_size).min(w)];
let s2 = &src2[block_y * chunk_size + j]
[block_x * chunk_size..((block_x + 1) * chunk_size).min(w)];
block_sse += s1
.iter()
.zip(s2)
.map(|(&a, &b)| {
let c = (i16::cast_from(a) - i16::cast_from(b)) as i32;
(c * c) as u32
})
.sum::<u32>();
}
sse += (RawDistortion::new(block_sse as u64)
* DistortionScale(scale[block_y * scale_stride + block_x]))
.0;
}
}
sse
}
}
#[cfg(test)]
pub mod test {
use super::*;
use crate::cpu_features::CpuFeatureLevel;
use crate::frame::*;
use crate::partition::BlockSize;
use crate::partition::BlockSize::*;
use crate::tiling::Area;
use crate::util::Pixel;
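/// Creates two planes filled with deterministic diagonal gradients so that the
/// expected distortion values in the tests below are reproducible.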
fn setup_planes<T: Pixel>() -> (Plane<T>, Plane<T>) {
let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8);
let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8);
let xpad_off =
(input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32;
for (i, row) in
input_plane.data.chunks_mut(input_plane.cfg.stride).enumerate()
{
for (j, pixel) in row.iter_mut().enumerate() {
let val = ((j + i) as i32 - xpad_off) & 255i32;
assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
*pixel = T::cast_from(val);
}
}
for (i, row) in rec_plane.data.chunks_mut(rec_plane.cfg.stride).enumerate()
{
for (j, pixel) in row.iter_mut().enumerate() {
let val = (j as i32 - i as i32 - xpad_off) & 255i32;
assert!(val >= u8::MIN.into() && val <= u8::MAX.into());
*pixel = T::cast_from(val);
}
}
(input_plane, rec_plane)
}
fn get_sad_same_inner<T: Pixel>() {
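// Expected SAD for each block size over the planes from setup_planes().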
let blocks: Vec<(BlockSize, u32)> = vec![
(BLOCK_4X4, 1912),
(BLOCK_4X8, 4296),
(BLOCK_8X4, 3496),
(BLOCK_8X8, 7824),
(BLOCK_8X16, 16592),
(BLOCK_16X8, 14416),
(BLOCK_16X16, 31136),
(BLOCK_16X32, 60064),
(BLOCK_32X16, 59552),
(BLOCK_32X32, 120128),
(BLOCK_32X64, 186688),
(BLOCK_64X32, 250176),
(BLOCK_64X64, 438912),
(BLOCK_64X128, 654272),
(BLOCK_128X64, 1016768),
(BLOCK_128X128, 1689792),
(BLOCK_4X16, 8680),
(BLOCK_16X4, 6664),
(BLOCK_8X32, 31056),
(BLOCK_32X8, 27600),
(BLOCK_16X64, 93344),
(BLOCK_64X16, 116384),
];
let bit_depth: usize = 8;
let (input_plane, rec_plane) = setup_planes::<T>();
for block in blocks {
let area = Area::StartingAt { x: 32, y: 40 };
let input_region = input_plane.region(area);
let rec_region = rec_plane.region(area);
assert_eq!(
block.1,
get_sad(
&input_region,
&rec_region,
block.0,
bit_depth,
CpuFeatureLevel::default()
)
);
}
}
#[test]
fn get_sad_same_u8() {
get_sad_same_inner::<u8>();
}
#[test]
fn get_sad_same_u16() {
get_sad_same_inner::<u16>();
}
fn get_satd_same_inner<T: Pixel>() {
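// Expected SATD for each block size over the planes from setup_planes().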
let blocks: Vec<(BlockSize, u32)> = vec![
(BLOCK_4X4, 1408),
(BLOCK_4X8, 2016),
(BLOCK_8X4, 1816),
(BLOCK_8X8, 3984),
(BLOCK_8X16, 5136),
(BLOCK_16X8, 4864),
(BLOCK_16X16, 9984),
(BLOCK_16X32, 13824),
(BLOCK_32X16, 13760),
(BLOCK_32X32, 27952),
(BLOCK_32X64, 37168),
(BLOCK_64X32, 45104),
(BLOCK_64X64, 84176),
(BLOCK_64X128, 127920),
(BLOCK_128X64, 173680),
(BLOCK_128X128, 321456),
(BLOCK_4X16, 3136),
(BLOCK_16X4, 2632),
(BLOCK_8X32, 7056),
(BLOCK_32X8, 6624),
(BLOCK_16X64, 18432),
(BLOCK_64X16, 21312),
];
let bit_depth: usize = 8;
let (input_plane, rec_plane) = setup_planes::<T>();
for block in blocks {
let area = Area::StartingAt { x: 32, y: 40 };
let input_region = input_plane.region(area);
let rec_region = rec_plane.region(area);
assert_eq!(
block.1,
get_satd(
&input_region,
&rec_region,
block.0,
bit_depth,
CpuFeatureLevel::default()
)
);
}
}
#[test]
fn get_satd_same_u8() {
get_satd_same_inner::<u8>();
}
#[test]
fn get_satd_same_u16() {
get_satd_same_inner::<u16>();
}
}