use crate::simd::traits::SimdOps;
use crate::simd::types::{I16x8, I32x4, U8x16};
/// Motion-compensation routines parameterised over a SIMD backend `S`.
///
/// The wrapper only stores the backend; the block operations themselves live in the
/// `impl` that requires `S: SimdOps`. `Debug` and `Clone` are derived so the helper can be
/// logged and duplicated whenever the backend itself supports those traits.
#[derive(Debug, Clone)]
pub struct MotionCompSimd<S> {
    /// Backend through which the vector operations are dispatched.
    simd: S,
}
impl<S: SimdOps> MotionCompSimd<S> {
    /// Creates a motion-compensation helper that dispatches its vector work through `simd`.
    #[inline]
    pub const fn new(simd: S) -> Self {
        Self { simd }
    }

    /// Copies a `width` x `height` block of bytes from `src` to `dst`, one row at a time.
    ///
    /// Row `y` is read at `y * src_stride` and written at `y * dst_stride`. A row that does
    /// not fit completely inside either slice is skipped instead of panicking.
    pub fn copy_block(
        &self,
        src: &[u8],
        src_stride: usize,
        dst: &mut [u8],
        dst_stride: usize,
        width: usize,
        height: usize,
    ) {
        for y in 0..height {
            let src_start = y * src_stride;
            let dst_start = y * dst_stride;
            if let (Some(src_row), Some(dst_row)) = (
                src.get(src_start..src_start + width),
                dst.get_mut(dst_start..dst_start + width),
            ) {
                dst_row.copy_from_slice(src_row);
            }
        }
    }

    /// Applies an 8-tap horizontal filter to a `width` x `height` block.
    ///
    /// For the output at `(x, y)` the eight bytes starting at `y * src_stride + x` are
    /// combined with `coeffs` through the backend's `mul_i16x8` and `horizontal_sum_i16x8`;
    /// the sum is rounded with `(sum + 64) >> 7` and clamped to `0..=255`.
    ///
    /// Outputs whose 8-byte source window does not fit in `src`, or whose destination index
    /// lies outside `dst`, are left untouched.
    #[allow(clippy::too_many_arguments)]
    pub fn filter_h_8tap(
        &self,
        src: &[u8],
        src_stride: usize,
        dst: &mut [u8],
        dst_stride: usize,
        coeffs: &[i16; 8],
        width: usize,
        height: usize,
    ) {
        for y in 0..height {
            for x in 0..width {
                let src_offset = y * src_stride + x;
                let dst_offset = y * dst_stride + x;
                if dst_offset >= dst.len() {
                    continue;
                }
                // All eight taps must be available; partial windows are skipped.
                let window = match src.get(src_offset..src_offset + 8) {
                    Some(window) => window,
                    None => continue,
                };
                let mut pixels = I16x8::zero();
                for (lane, &px) in window.iter().enumerate() {
                    pixels[lane] = i16::from(px);
                }
                let filter = I16x8::from_array(*coeffs);
                let products = self.simd.mul_i16x8(pixels, filter);
                let sum = self.simd.horizontal_sum_i16x8(products);
                let result = (sum + 64) >> 7;
                dst[dst_offset] = result.clamp(0, 255) as u8;
            }
        }
    }

    /// Applies an 8-tap vertical filter to a `width` x `height` block.
    ///
    /// Tap `i` of the output at `(x, y)` reads `src[(y + i) * src_stride + x]`. Unlike
    /// [`Self::filter_h_8tap`], taps that fall outside `src` contribute zero rather than
    /// causing the output to be skipped. The sum is rounded with `(sum + 64) >> 7` and
    /// clamped to `0..=255`. Destination indices outside `dst` are skipped.
    #[allow(clippy::too_many_arguments)]
    pub fn filter_v_8tap(
        &self,
        src: &[u8],
        src_stride: usize,
        dst: &mut [u8],
        dst_stride: usize,
        coeffs: &[i16; 8],
        width: usize,
        height: usize,
    ) {
        for y in 0..height {
            for x in 0..width {
                let dst_offset = y * dst_stride + x;
                if dst_offset >= dst.len() {
                    continue;
                }
                let mut pixels = I16x8::zero();
                for i in 0..8 {
                    // Out-of-range taps keep the zero they were initialised with.
                    if let Some(&px) = src.get((y + i) * src_stride + x) {
                        pixels[i] = i16::from(px);
                    }
                }
                let filter = I16x8::from_array(*coeffs);
                let products = self.simd.mul_i16x8(pixels, filter);
                let sum = self.simd.horizontal_sum_i16x8(products);
                let result = (sum + 64) >> 7;
                dst[dst_offset] = result.clamp(0, 255) as u8;
            }
        }
    }

    /// Applies a separable 8-tap filter: horizontal with `h_coeffs`, then vertical with
    /// `v_coeffs`.
    ///
    /// The horizontal pass fills a temporary `(height + 7) x width` buffer of `i16` values
    /// (rounded with `(sum + 64) >> 7`, not clamped); source taps outside `src` contribute
    /// zero. The vertical pass combines rows `y..y + 8` of that buffer, rounds the same way
    /// and clamps to `0..=255`. Destination indices outside `dst` are skipped.
    ///
    /// NOTE(review): because the intermediate values are not clamped, products in the
    /// vertical pass may not fit in 16 bits, depending on how `mul_i16x8` treats overflow.
    /// Confirm against the backend.
    #[allow(clippy::too_many_arguments)]
    pub fn filter_2d_8tap(
        &self,
        src: &[u8],
        src_stride: usize,
        dst: &mut [u8],
        dst_stride: usize,
        h_coeffs: &[i16; 8],
        v_coeffs: &[i16; 8],
        width: usize,
        height: usize,
    ) {
        // The vertical pass reads rows y..y + 8, so seven extra rows are filtered.
        let temp_rows = height + 7;
        let mut temp = vec![0i16; temp_rows * width];

        for y in 0..temp_rows {
            for x in 0..width {
                let src_offset = y * src_stride + x;
                let mut pixels = I16x8::zero();
                for i in 0..8 {
                    if let Some(&px) = src.get(src_offset + i) {
                        pixels[i] = i16::from(px);
                    }
                }
                let filter = I16x8::from_array(*h_coeffs);
                let products = self.simd.mul_i16x8(pixels, filter);
                let sum = self.simd.horizontal_sum_i16x8(products);
                // y < temp_rows and x < width, so this index is always inside `temp`.
                temp[y * width + x] = ((sum + 64) >> 7) as i16;
            }
        }

        for y in 0..height {
            for x in 0..width {
                let dst_offset = y * dst_stride + x;
                if dst_offset >= dst.len() {
                    continue;
                }
                let mut pixels = I16x8::zero();
                for i in 0..8 {
                    // y + i <= height + 6, so the row always exists in `temp`.
                    pixels[i] = temp[(y + i) * width + x];
                }
                let filter = I16x8::from_array(*v_coeffs);
                let products = self.simd.mul_i16x8(pixels, filter);
                let sum = self.simd.horizontal_sum_i16x8(products);
                let result = (sum + 64) >> 7;
                dst[dst_offset] = result.clamp(0, 255) as u8;
            }
        }
    }

    /// Blends each pixel with its right-hand neighbour using a weight out of 64.
    ///
    /// The output is `(p0 * (64 - fraction) + p1 * fraction + 32) / 64`, where `p0` is the
    /// pixel at `(x, y)` and `p1` the one immediately after it in `src`. `fraction` is
    /// expected to be in `0..=64`; larger values are clamped to 64 so the complementary
    /// weight can never underflow. Positions where `p1` or the destination index is out of
    /// range are skipped.
    #[allow(clippy::too_many_arguments)]
    pub fn bilinear_h(
        &self,
        src: &[u8],
        src_stride: usize,
        dst: &mut [u8],
        dst_stride: usize,
        fraction: u8,
        width: usize,
        height: usize,
    ) {
        // Work in u32 and clamp: `64 - fraction` in u8 would underflow for fraction > 64.
        let w1 = u32::from(fraction.min(64));
        let w0 = 64 - w1;
        for y in 0..height {
            for x in 0..width {
                let src_offset = y * src_stride + x;
                let dst_offset = y * dst_stride + x;
                if src_offset + 1 >= src.len() || dst_offset >= dst.len() {
                    continue;
                }
                let p0 = u32::from(src[src_offset]);
                let p1 = u32::from(src[src_offset + 1]);
                // w0 + w1 == 64, so the quotient is at most 255 and the cast is lossless.
                let result = (p0 * w0 + p1 * w1 + 32) / 64;
                dst[dst_offset] = result as u8;
            }
        }
    }

    /// Blends each pixel with the one a row below it using a weight out of 64.
    ///
    /// The output is `(p0 * (64 - fraction) + p1 * fraction + 32) / 64`, where `p1` is read
    /// `src_stride` bytes after `p0`. `fraction` is expected to be in `0..=64`; larger
    /// values are clamped to 64 so the complementary weight can never underflow. Positions
    /// where `p1` or the destination index is out of range are skipped.
    #[allow(clippy::too_many_arguments)]
    pub fn bilinear_v(
        &self,
        src: &[u8],
        src_stride: usize,
        dst: &mut [u8],
        dst_stride: usize,
        fraction: u8,
        width: usize,
        height: usize,
    ) {
        // Work in u32 and clamp: `64 - fraction` in u8 would underflow for fraction > 64.
        let w1 = u32::from(fraction.min(64));
        let w0 = 64 - w1;
        for y in 0..height {
            for x in 0..width {
                let src_offset = y * src_stride + x;
                let dst_offset = y * dst_stride + x;
                if src_offset + src_stride >= src.len() || dst_offset >= dst.len() {
                    continue;
                }
                let p0 = u32::from(src[src_offset]);
                let p1 = u32::from(src[src_offset + src_stride]);
                // w0 + w1 == 64, so the quotient is at most 255 and the cast is lossless.
                let result = (p0 * w0 + p1 * w1 + 32) / 64;
                dst[dst_offset] = result as u8;
            }
        }
    }

    /// Averages two blocks that share one `stride` and writes the result to `dst`.
    ///
    /// Each row is processed in 16-byte groups through the backend's `avg_u8x16`; the
    /// remaining `width % 16` bytes use the scalar rounding average `(a + b + 1) / 2`.
    /// Any group or byte that does not fit in all three slices is skipped.
    pub fn average_blocks(
        &self,
        src1: &[u8],
        src2: &[u8],
        dst: &mut [u8],
        width: usize,
        height: usize,
        stride: usize,
    ) {
        let chunks = width / 16;
        for y in 0..height {
            let offset = y * stride;
            for i in 0..chunks {
                let pos = offset + i * 16;
                let end = pos + 16;
                if src1.len() < end || src2.len() < end || dst.len() < end {
                    continue;
                }
                let mut v1 = U8x16::zero();
                let mut v2 = U8x16::zero();
                v1.copy_from_slice(&src1[pos..end]);
                v2.copy_from_slice(&src2[pos..end]);
                let avg = self.simd.avg_u8x16(v1, v2);
                let avg_array = avg.to_array();
                dst[pos..end].copy_from_slice(&avg_array);
            }
            // Scalar tail for the bytes that do not fill a whole 16-byte group.
            for x in (chunks * 16)..width {
                let pos = offset + x;
                if src1.len() > pos && src2.len() > pos && dst.len() > pos {
                    dst[pos] = ((u16::from(src1[pos]) + u16::from(src2[pos]) + 1) / 2) as u8;
                }
            }
        }
    }

    /// Writes the rounded weighted average of two blocks to `dst`.
    ///
    /// Each output is `(p1 * weight1 + p2 * weight2 + total / 2) / total` with
    /// `total = weight1 + weight2`. When both weights are zero the average is undefined, so
    /// the function returns without touching `dst` instead of dividing by zero. Positions
    /// outside any of the three slices are skipped.
    #[allow(clippy::too_many_arguments)]
    pub fn weighted_pred(
        &self,
        src1: &[u8],
        src2: &[u8],
        dst: &mut [u8],
        weight1: u8,
        weight2: u8,
        width: usize,
        height: usize,
        stride: usize,
    ) {
        let w1 = u32::from(weight1);
        let w2 = u32::from(weight2);
        let total_weight = w1 + w2;
        if total_weight == 0 {
            // Nothing sensible to compute; avoid the division by zero below.
            return;
        }
        for y in 0..height {
            for x in 0..width {
                let offset = y * stride + x;
                if src1.len() <= offset || src2.len() <= offset || dst.len() <= offset {
                    continue;
                }
                let p1 = u32::from(src1[offset]) * w1;
                let p2 = u32::from(src2[offset]) * w2;
                let result = (p1 + p2 + total_weight / 2) / total_weight;
                dst[offset] = result.clamp(0, 255) as u8;
            }
        }
    }

    /// Blends `pred` with `obmc` using per-pixel weights out of 64.
    ///
    /// The weight for `(x, y)` is `weights[y * width + x]`; indices past the end of
    /// `weights` reuse its last entry. Each output is
    /// `(pred * w + obmc * (64 - w) + 32) / 64`. Weights above 64 are clamped to 64 so the
    /// complementary weight can never underflow, and an empty `weights` slice makes the
    /// call a no-op instead of panicking. Positions outside any of the pixel slices are
    /// skipped.
    #[allow(clippy::too_many_arguments)]
    pub fn obmc_blend(
        &self,
        pred: &[u8],
        obmc: &[u8],
        dst: &mut [u8],
        width: usize,
        height: usize,
        stride: usize,
        weights: &[u8],
    ) {
        if weights.is_empty() {
            // No weight to look up; indexing below would panic.
            return;
        }
        let last_weight = weights.len() - 1;
        for y in 0..height {
            for x in 0..width {
                let offset = y * stride + x;
                if pred.len() <= offset || obmc.len() <= offset || dst.len() <= offset {
                    continue;
                }
                let weight_idx = (y * width + x).min(last_weight);
                let w = u32::from(weights[weight_idx]).min(64);
                let p1 = u32::from(pred[offset]) * w;
                let p2 = u32::from(obmc[offset]) * (64 - w);
                // w + (64 - w) == 64, so the quotient is at most 255.
                let result = (p1 + p2 + 32) / 64;
                dst[offset] = result as u8;
            }
        }
    }

    /// Filters up to the first eight bytes of `src` with `coeffs` via the backend's
    /// `pmaddwd` and `horizontal_sum_i32x4`, rounds with `(sum + 64) >> 7` and clamps to
    /// `0..=255`. Missing bytes (when `src` is shorter than eight) count as zero.
    ///
    /// NOTE(review): all four returned bytes carry the same value; the name suggests four
    /// distinct outputs were intended. Confirm before putting this helper to use.
    #[allow(dead_code)]
    fn filter_h_4_simd(&self, src: &[u8], coeffs: &[i16; 8]) -> [u8; 4] {
        let mut pixels = I16x8::zero();
        for (lane, &px) in src.iter().take(8).enumerate() {
            pixels[lane] = i16::from(px);
        }
        let filter = I16x8::from_array(*coeffs);
        let result = self.simd.pmaddwd(pixels, filter);
        let sum = self.simd.horizontal_sum_i32x4(result);
        let final_val = (sum + 64) >> 7;
        let px = final_val.clamp(0, 255) as u8;
        [px; 4]
    }
}
/// Predefined 8-tap coefficient sets, each stored as `[i16; 8]`.
///
/// The 8-tap filters in this file round with `+ 64` and shift right by 7, i.e. they
/// normalise by 128. `REGULAR`, `SHARP` and `BILINEAR` each sum to 128.
///
/// NOTE(review): `SMOOTH` sums to 120 rather than 128, so under the same normalisation it
/// would scale values by 120/128. Confirm whether that is intended.
pub mod filter_coeffs {
/// Taps `[-1, 3, -7, 127, 8, -3, 1, 0]`; the entries sum to 128.
pub const REGULAR: [i16; 8] = [-1, 3, -7, 127, 8, -3, 1, 0];
/// Taps `[-1, 3, -8, 127, 8, -2, 1, 0]`; the entries sum to 128.
pub const SHARP: [i16; 8] = [-1, 3, -8, 127, 8, -2, 1, 0];
/// Taps `[-2, 6, -13, 120, 13, -6, 2, 0]`; the entries sum to 120 (see the module note).
pub const SMOOTH: [i16; 8] = [-2, 6, -13, 120, 13, -6, 2, 0];
/// A single tap of 128 at index 3 and zeros elsewhere, so a filter normalising by 128
/// passes the sample at tap index 3 through unchanged.
pub const BILINEAR: [i16; 8] = [0, 0, 0, 128, 0, 0, 0, 0];
}