use v_frame::pixel::Pixel;
use crate::{
data::{
hadamard::{hadamard4x4, hadamard8x8},
plane::{Area, PlaneRegion, Rect},
sad::get_sad,
},
math::msb,
};
/// Computes a SATD-style distortion (sum of absolute Hadamard-transformed
/// differences) between the top-left `w`x`h` areas of two plane regions.
///
/// The area is walked in square chunks of side `size = min(w, h, 8)`. For each
/// full chunk the per-pixel difference `org - ref` is transformed with
/// [`hadamard4x4`] or [`hadamard8x8`] and the absolute values of the result are
/// accumulated. Chunks that are narrower or shorter than `size` (which occur
/// when `w` or `h` is not a multiple of `size`) contribute their [`get_sad`]
/// value instead. The accumulated total is finally right-shifted by
/// `msb(size)` with round-to-nearest.
///
/// `bit_depth` is only forwarded to [`get_sad`] for partial chunks.
///
/// The function is marked `cold` when the `asm_x86_64` + AVX2 or `asm_neon`
/// cfgs are active.
/// NOTE(review): presumably because an assembly implementation is preferred in
/// those builds - confirm against the dispatching caller.
///
/// # Panics
///
/// * if `w` or `h` exceeds 128;
/// * if either region is smaller than `w`x`h`;
/// * if `min(w, h)` is not 4 and is smaller than 8 (this includes 0), because
///   no Hadamard transform of a matching size is available;
/// * if a pixel value cannot be converted to `i32`.
#[cfg_attr(all(asm_x86_64, target_feature = "avx2"), cold)]
#[cfg_attr(asm_neon, cold)]
pub(super) fn get_satd_internal<T: Pixel>(
    plane_org: &PlaneRegion<'_, T>,
    plane_ref: &PlaneRegion<'_, T>,
    w: usize,
    h: usize,
    bit_depth: usize,
) -> u32 {
    assert!(w <= 128 && h <= 128);
    assert!(plane_org.rect().width >= w && plane_org.rect().height >= h);
    assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h);

    // Side length of the Hadamard transform: 4 for 4x* / *x4 blocks, 8 otherwise.
    let size: usize = w.min(h).min(8);

    // Without this check any `size` other than 4 (e.g. 2 or 6) would select the
    // 8x8 transform and run it over a buffer of only `size * size` elements.
    // Rejecting such sizes up front keeps this safe function from handing an
    // undersized slice to the unsafe transform below.
    assert!(
        size == 4 || size == 8,
        "SATD requires min(w, h) to be 4 or at least 8, got w = {w}, h = {h}"
    );
    let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 };

    let mut sum: u64 = 0;

    // Visit the area in `size`x`size` steps; the last chunk of a row or column
    // may be smaller when `w` / `h` is not a multiple of `size`.
    for chunk_y in (0..h).step_by(size) {
        let chunk_h = (h - chunk_y).min(size);
        for chunk_x in (0..w).step_by(size) {
            let chunk_w = (w - chunk_x).min(size);
            let chunk_area = Area::Rect(Rect {
                x: chunk_x as isize,
                y: chunk_y as isize,
                width: chunk_w,
                height: chunk_h,
            });
            let chunk_org = plane_org.subregion(chunk_area);
            let chunk_ref = plane_ref.subregion(chunk_area);

            // A partial chunk cannot be fed to the fixed-size transform, so it
            // contributes its SAD instead.
            if chunk_w != size || chunk_h != size {
                sum += get_sad(&chunk_org, &chunk_ref, chunk_w, chunk_h, bit_depth) as u64;
                continue;
            }

            // Scratch space for the differences, trimmed to exactly
            // `size * size` elements (16 or 64).
            let buf: &mut [i32] = &mut [0; 8 * 8][..size * size];

            // Store `org - ref` row by row into the scratch buffer.
            for (row_diff, (row_org, row_ref)) in buf
                .chunks_mut(size)
                .zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter()))
            {
                for (diff, (a, b)) in row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter())) {
                    let a = a.to_i32().expect("value should fit in i32");
                    let b = b.to_i32().expect("value should fit in i32");
                    *diff = a - b;
                }
            }

            // SAFETY: `size` was asserted to be 4 or 8 and `tx2d` was selected
            // to match it, so `buf` holds exactly 16 elements for the 4x4
            // transform or 64 elements for the 8x8 transform.
            // NOTE(review): assumes the only requirement of `hadamard4x4` /
            // `hadamard8x8` is a slice of at least N*N elements - confirm
            // against their `# Safety` documentation.
            unsafe {
                tx2d(buf);
            }

            // Accumulate the absolute values of the transformed differences.
            sum += buf.iter().map(|a| a.unsigned_abs() as u64).sum::<u64>();
        }
    }

    // Normalize: shift right by `msb(size)` with round-to-nearest (half of the
    // divisor is added before shifting).
    let ln = msb(size as i32) as u64;
    ((sum + (1 << ln >> 1)) >> ln) as u32
}