use core::arch::x86_64::*;
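/// Sum of absolute differences over a 16-pixel-wide block of `h` rows:
/// one `psadbw` per row, which leaves a per-8-byte sum in each 64-bit
/// lane; the two lanes are combined at the end.
///
/// # Safety
///
/// Requires AVX2. Each slice must cover at least `(h - 1) * stride + 16`
/// bytes for its stride.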
#[target_feature(enable = "avx2")]
pub(super) unsafe fn sad_w16(
source: &[u8],
source_stride: usize,
pred: &[u8],
pred_stride: usize,
h: usize,
) -> u32 {
let src_ptr = source.as_ptr();
let prd_ptr = pred.as_ptr();
let mut acc = unsafe { _mm_setzero_si128() };
let mut y = 0;
while y < h {
let s = unsafe {
_mm_loadu_si128(src_ptr.add(y * source_stride) as *const __m128i)
};
let p = unsafe {
_mm_loadu_si128(prd_ptr.add(y * pred_stride) as *const __m128i)
};
let d = unsafe { _mm_sad_epu8(s, p) };
acc = unsafe { _mm_add_epi64(acc, d) };
y += 1;
}
let lo = unsafe { _mm_extract_epi64::<0>(acc) as u32 };
let hi = unsafe { _mm_extract_epi64::<1>(acc) as u32 };
lo + hi
}
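/// Sum of absolute differences over an 8-pixel-wide block of `h` rows.
/// Rows load into the low 64 bits only, so the SAD lands entirely in
/// lane 0 of the accumulator.
///
/// # Safety
///
/// Requires AVX2. Each slice must cover at least `(h - 1) * stride + 8`
/// bytes for its stride.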
#[target_feature(enable = "avx2")]
pub(super) unsafe fn sad_w8(
source: &[u8],
source_stride: usize,
pred: &[u8],
pred_stride: usize,
h: usize,
) -> u32 {
let src_ptr = source.as_ptr();
let prd_ptr = pred.as_ptr();
let mut acc = unsafe { _mm_setzero_si128() };
let mut y = 0;
while y < h {
let s = unsafe {
_mm_loadl_epi64(src_ptr.add(y * source_stride) as *const __m128i)
};
let p = unsafe {
_mm_loadl_epi64(prd_ptr.add(y * pred_stride) as *const __m128i)
};
let d = unsafe { _mm_sad_epu8(s, p) };
acc = unsafe { _mm_add_epi64(acc, d) };
y += 1;
}
unsafe { _mm_extract_epi64::<0>(acc) as u32 }
}
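/// 4x4 Hadamard transform of a residual tile, applied along both axes,
/// with no normalization.
///
/// # Safety
///
/// Requires AVX2.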
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn hadamard_4x4(input: &[[i32; 4]; 4]) -> [__m128i; 4] {
let r0 = unsafe { _mm_loadu_si128(input[0].as_ptr() as *const __m128i) };
let r1 = unsafe { _mm_loadu_si128(input[1].as_ptr() as *const __m128i) };
let r2 = unsafe { _mm_loadu_si128(input[2].as_ptr() as *const __m128i) };
let r3 = unsafe { _mm_loadu_si128(input[3].as_ptr() as *const __m128i) };
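    // First butterfly pass: element-wise across the four row vectors,
    // i.e. a 4-point Hadamard down each column.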
let p02 = _mm_add_epi32(r0, r2);
let m02 = _mm_sub_epi32(r0, r2);
let p13 = _mm_add_epi32(r1, r3);
let m13 = _mm_sub_epi32(r1, r3);
let a0 = _mm_add_epi32(p02, p13);
let a1 = _mm_add_epi32(m02, m13);
let a2 = _mm_sub_epi32(m02, m13);
let a3 = _mm_sub_epi32(p02, p13);
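    // Transpose so the second pass runs along the other axis.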
let t01_lo = _mm_unpacklo_epi32(a0, a1);
let t01_hi = _mm_unpackhi_epi32(a0, a1);
let t23_lo = _mm_unpacklo_epi32(a2, a3);
let t23_hi = _mm_unpackhi_epi32(a2, a3);
let b0 = _mm_unpacklo_epi64(t01_lo, t23_lo);
let b1 = _mm_unpackhi_epi64(t01_lo, t23_lo);
let b2 = _mm_unpacklo_epi64(t01_hi, t23_hi);
let b3 = _mm_unpackhi_epi64(t01_hi, t23_hi);
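    // Second butterfly pass: 4-point Hadamard along the original rows.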
let p02 = _mm_add_epi32(b0, b2);
let m02 = _mm_sub_epi32(b0, b2);
let p13 = _mm_add_epi32(b1, b3);
let m13 = _mm_sub_epi32(b1, b3);
let c0 = _mm_add_epi32(p02, p13);
let c1 = _mm_add_epi32(m02, m13);
let c2 = _mm_sub_epi32(m02, m13);
let c3 = _mm_sub_epi32(p02, p13);
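    // Transpose back to row order.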
let u01_lo = _mm_unpacklo_epi32(c0, c1);
let u01_hi = _mm_unpackhi_epi32(c0, c1);
let u23_lo = _mm_unpacklo_epi32(c2, c3);
let u23_hi = _mm_unpackhi_epi32(c2, c3);
let d0 = _mm_unpacklo_epi64(u01_lo, u23_lo);
let d1 = _mm_unpackhi_epi64(u01_lo, u23_lo);
let d2 = _mm_unpacklo_epi64(u01_hi, u23_hi);
let d3 = _mm_unpackhi_epi64(u01_hi, u23_hi);
[d0, d1, d2, d3]
}
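/// SATD over a `block_w x block_h` region: the residual is split into
/// 4x4 tiles and the absolute Hadamard coefficients of each tile are
/// summed. No final rounding shift is applied here.
///
/// # Safety
///
/// Requires AVX2. The slices must cover the addressed region for their
/// strides; `block_w` and `block_h` must be multiples of 4.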
#[target_feature(enable = "avx2")]
pub(super) unsafe fn satd_block_4x4_tiled(
source: &[u8],
source_stride: usize,
pred: &[u8],
pred_stride: usize,
block_w: usize,
block_h: usize,
) -> u32 {
debug_assert!(block_w.is_multiple_of(4));
debug_assert!(block_h.is_multiple_of(4));
let tiles_y = block_h / 4;
let tiles_x = block_w / 4;
let mut total: u32 = 0;
for by in 0..tiles_y {
for bx in 0..tiles_x {
let mut residual = [[0i32; 4]; 4];
for dy in 0..4 {
let sy = by * 4 + dy;
let sx = bx * 4;
for dx in 0..4 {
let s = source[sy * source_stride + sx + dx] as i32;
let p = pred[sy * pred_stride + sx + dx] as i32;
residual[dy][dx] = s - p;
}
}
let h = unsafe { hadamard_4x4(&residual) };
let s0 = _mm_abs_epi32(h[0]);
let s1 = _mm_abs_epi32(h[1]);
let s2 = _mm_abs_epi32(h[2]);
let s3 = _mm_abs_epi32(h[3]);
let row01 = _mm_add_epi32(s0, s1);
let row23 = _mm_add_epi32(s2, s3);
let row_sums = _mm_add_epi32(row01, row23);
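            // Horizontal reduction: 0b00_01_10_11 reverses the four lanes,
            // and the second shuffle pulls lane 1 down to lane 0, so lane 0
            // of `single` ends up holding the full tile sum.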
let pair = _mm_add_epi32(
row_sums,
_mm_shuffle_epi32(row_sums, 0b00_01_10_11),
);
let single = _mm_add_epi32(
pair,
_mm_shuffle_epi32(pair, 0b00_00_00_01),
);
let tile_sum = _mm_cvtsi128_si32(single) as u32;
total = total.saturating_add(tile_sum);
}
}
total
}
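/// Loads 8 unsigned bytes and zero-extends them to eight i16 lanes.
///
/// # Safety
///
/// Requires AVX2; `src` must be valid for an 8-byte read.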
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn load_8u_to_s16(src: *const u8) -> __m128i {
let v8 = unsafe { _mm_loadl_epi64(src as *const __m128i) };
unsafe { _mm_unpacklo_epi8(v8, _mm_setzero_si128()) }
}
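/// H.264 6-tap filter (1, -5, 20, 20, -5, 1) on i16 lanes:
/// `20*(s2+s3) - 5*(s1+s4) + (s0+s5)`. With 8-bit inputs the result
/// stays within i16 range (at most 20*510 + 510 = 10710).
///
/// # Safety
///
/// Requires AVX2.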
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn apply_6tap_s16(
s0: __m128i, s1: __m128i, s2: __m128i,
s3: __m128i, s4: __m128i, s5: __m128i,
) -> __m128i {
let s2p3 = _mm_add_epi16(s2, s3);
let s1p4 = _mm_add_epi16(s1, s4);
let s0p5 = _mm_add_epi16(s0, s5);
let term20 = _mm_mullo_epi16(s2p3, _mm_set1_epi16(20));
let term5 = _mm_mullo_epi16(s1p4, _mm_set1_epi16(5));
_mm_add_epi16(_mm_sub_epi16(term20, term5), s0p5)
}
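/// Rounds a 6-tap intermediate to pixel values, `(b1 + 16) >> 5`, and
/// packs to unsigned bytes with saturation (low 8 lanes of the result).
///
/// # Safety
///
/// Requires AVX2.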
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn b_from_b1(b1: __m128i) -> __m128i {
let off = _mm_add_epi16(b1, _mm_set1_epi16(16));
let shr = _mm_srai_epi16(off, 5);
_mm_packus_epi16(shr, _mm_setzero_si128())
}
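/// Copies a block at an integer motion vector, 16 or 8 bytes at a time.
/// Returns `false` without writing if the motion-compensated region lies
/// outside the plane.
///
/// # Safety
///
/// Requires AVX2. `y_plane` must cover `plane_w * plane_h` bytes and
/// `out` must cover `block_h` rows of `out_stride` bytes.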
#[target_feature(enable = "avx2")]
pub(super) unsafe fn mc_luma_integer_mv(
y_plane: &[u8],
plane_w: u32,
plane_h: u32,
block_x: u32,
block_y: u32,
block_w: u32,
block_h: u32,
mv_x_int: i32,
mv_y_int: i32,
out: &mut [u8],
out_stride: usize,
) -> bool {
let src_x_start = block_x as i32 + mv_x_int;
let src_y_start = block_y as i32 + mv_y_int;
if src_x_start < 0
|| src_y_start < 0
|| src_x_start as u32 + block_w > plane_w
|| src_y_start as u32 + block_h > plane_h
{
return false;
}
let src_x = src_x_start as usize;
let src_y = src_y_start as usize;
let plane_w_us = plane_w as usize;
    let w = block_w as usize;
    // The copy loops below have no scalar tail: block widths must be a
    // multiple of 8, or the trailing columns would be silently skipped.
    debug_assert!(w.is_multiple_of(8));
let src_ptr = y_plane.as_ptr();
let dst_ptr = out.as_mut_ptr();
for dy in 0..block_h as usize {
let src_off = (src_y + dy) * plane_w_us + src_x;
let dst_off = dy * out_stride;
let mut x = 0;
while x + 16 <= w {
let v = unsafe {
_mm_loadu_si128(src_ptr.add(src_off + x) as *const __m128i)
};
unsafe {
_mm_storeu_si128(dst_ptr.add(dst_off + x) as *mut __m128i, v);
}
x += 16;
}
while x + 8 <= w {
let v = unsafe {
_mm_loadl_epi64(src_ptr.add(src_off + x) as *const __m128i)
};
unsafe {
_mm_storel_epi64(dst_ptr.add(dst_off + x) as *mut __m128i, v);
}
x += 8;
}
}
true
}
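/// Horizontal-only sub-pel interpolation (`x_frac` in 1..=3): the 6-tap
/// half-pel value `b`, averaged with the nearest integer pixel for the
/// quarter-pel positions. Returns `false` if the filter footprint
/// (2 columns left and 3 columns right of the block) leaves the plane.
///
/// # Safety
///
/// Requires AVX2, plus the same slice-coverage requirements as
/// `mc_luma_integer_mv`.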
#[target_feature(enable = "avx2")]
pub(super) unsafe fn mc_luma_h_only(
y_plane: &[u8],
plane_w: u32,
plane_h: u32,
block_x: u32,
block_y: u32,
block_w: u32,
block_h: u32,
mv_x_int: i32,
mv_y_int: i32,
x_frac: u8,
out: &mut [u8],
out_stride: usize,
) -> bool {
let src_x_start = block_x as i32 + mv_x_int;
let src_y_start = block_y as i32 + mv_y_int;
if src_x_start - 2 < 0
|| src_y_start < 0
|| (src_x_start + block_w as i32 + 3) as u32 > plane_w
|| src_y_start as u32 + block_h > plane_h
{
return false;
}
let src_x = src_x_start as usize;
let src_y = src_y_start as usize;
let plane_w_us = plane_w as usize;
    let w = block_w as usize;
    // No scalar tail below: block widths must be a multiple of 8.
    debug_assert!(w.is_multiple_of(8));
let src_ptr = y_plane.as_ptr();
let dst_ptr = out.as_mut_ptr();
for dy in 0..block_h as usize {
let row_base = (src_y + dy) * plane_w_us + src_x;
let mut x = 0;
while x + 8 <= w {
let base = unsafe { src_ptr.add(row_base + x) };
let s0 = unsafe { load_8u_to_s16(base.wrapping_sub(2)) };
let s1 = unsafe { load_8u_to_s16(base.wrapping_sub(1)) };
let s2 = unsafe { load_8u_to_s16(base) };
let s3 = unsafe { load_8u_to_s16(base.add(1)) };
let s4 = unsafe { load_8u_to_s16(base.add(2)) };
let s5 = unsafe { load_8u_to_s16(base.add(3)) };
let b1 = unsafe { apply_6tap_s16(s0, s1, s2, s3, s4, s5) };
let b_packed = unsafe { b_from_b1(b1) };
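            // x_frac == 2 is the half-pel `b` itself; 1 and 3 are the
            // quarter-pel positions, averaging `b` with the integer pixel
            // to its left (`G`) or right (`H`).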
let out_xmm = match x_frac {
2 => b_packed,
1 => {
let g = unsafe {
_mm_loadl_epi64(base as *const __m128i)
};
unsafe { _mm_avg_epu8(g, b_packed) }
}
3 => {
let h_int = unsafe {
_mm_loadl_epi64(base.add(1) as *const __m128i)
};
unsafe { _mm_avg_epu8(h_int, b_packed) }
}
_ => unreachable!(),
};
unsafe {
_mm_storel_epi64(
dst_ptr.add(dy * out_stride + x) as *mut __m128i,
out_xmm,
);
}
x += 8;
}
}
true
}
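/// Vertical-only sub-pel interpolation (`y_frac` in 1..=3): the 6-tap
/// half-pel value `h`, averaged with the nearest integer pixel for the
/// quarter-pel positions. Returns `false` if the filter footprint
/// (2 rows above and 3 rows below the block) leaves the plane.
///
/// # Safety
///
/// Requires AVX2, plus the same slice-coverage requirements as
/// `mc_luma_integer_mv`.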
#[target_feature(enable = "avx2")]
pub(super) unsafe fn mc_luma_v_only(
y_plane: &[u8],
plane_w: u32,
plane_h: u32,
block_x: u32,
block_y: u32,
block_w: u32,
block_h: u32,
mv_x_int: i32,
mv_y_int: i32,
y_frac: u8,
out: &mut [u8],
out_stride: usize,
) -> bool {
let src_x_start = block_x as i32 + mv_x_int;
let src_y_start = block_y as i32 + mv_y_int;
if src_x_start < 0
|| src_y_start - 2 < 0
|| src_x_start as u32 + block_w > plane_w
|| (src_y_start + block_h as i32 + 3) as u32 > plane_h
{
return false;
}
let src_x = src_x_start as usize;
let src_y = src_y_start as usize;
let plane_w_us = plane_w as usize;
    let w = block_w as usize;
    // No scalar tail below: block widths must be a multiple of 8.
    debug_assert!(w.is_multiple_of(8));
let src_ptr = y_plane.as_ptr();
let dst_ptr = out.as_mut_ptr();
for dy in 0..block_h as usize {
let mut x = 0;
while x + 8 <= w {
let row_off = |delta: i32| -> *const u8 {
let off = ((src_y as i32 + dy as i32 + delta) as usize)
* plane_w_us
+ src_x + x;
unsafe { src_ptr.add(off) }
};
let s0 = unsafe { load_8u_to_s16(row_off(-2)) };
let s1 = unsafe { load_8u_to_s16(row_off(-1)) };
let s2 = unsafe { load_8u_to_s16(row_off(0)) };
let s3 = unsafe { load_8u_to_s16(row_off(1)) };
let s4 = unsafe { load_8u_to_s16(row_off(2)) };
let s5 = unsafe { load_8u_to_s16(row_off(3)) };
let h1 = unsafe { apply_6tap_s16(s0, s1, s2, s3, s4, s5) };
let h_packed = unsafe { b_from_b1(h1) };
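            // y_frac == 2 is the half-pel `h` itself; 1 and 3 are the
            // quarter-pel positions, averaging `h` with the integer pixel
            // above (`G`) or below (`M`).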
let out_xmm = match y_frac {
2 => h_packed,
1 => {
let g = unsafe {
_mm_loadl_epi64(row_off(0) as *const __m128i)
};
unsafe { _mm_avg_epu8(g, h_packed) }
}
3 => {
let m_int = unsafe {
_mm_loadl_epi64(row_off(1) as *const __m128i)
};
unsafe { _mm_avg_epu8(m_int, h_packed) }
}
_ => unreachable!(),
};
unsafe {
_mm_storel_epi64(
dst_ptr.add(dy * out_stride + x) as *mut __m128i,
out_xmm,
);
}
x += 8;
}
}
true
}
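/// Combined sub-pel interpolation for the nine positions with both
/// `x_frac` and `y_frac` non-zero, built from the half-pel values `b`/`s`
/// (horizontal), `h`/`m` (vertical) and the center value `j`, following
/// the H.264 quarter-pel scheme. Returns `false` if the 6-tap footprint
/// in either direction leaves the plane.
///
/// # Safety
///
/// Requires AVX2, plus the same slice-coverage requirements as
/// `mc_luma_integer_mv`.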
#[target_feature(enable = "avx2")]
pub(super) unsafe fn mc_luma_composite(
y_plane: &[u8],
plane_w: u32,
plane_h: u32,
block_x: u32,
block_y: u32,
block_w: u32,
block_h: u32,
mv_x_int: i32,
mv_y_int: i32,
x_frac: u8,
y_frac: u8,
out: &mut [u8],
out_stride: usize,
) -> bool {
debug_assert!(x_frac > 0 && y_frac > 0 && x_frac <= 3 && y_frac <= 3);
let src_x_start = block_x as i32 + mv_x_int;
let src_y_start = block_y as i32 + mv_y_int;
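    // Note: the horizontal taps read at most `block_w + 2` columns past
    // `src_x_start`, so the `+ 3 + 1` bound below is one column stricter
    // than the taps require; conservative, not unsafe.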
if src_x_start - 2 < 0
|| src_y_start - 2 < 0
|| (src_x_start + block_w as i32 + 3 + 1) as u32 > plane_w
|| (src_y_start + block_h as i32 + 3) as u32 > plane_h
{
return false;
}
let src_x = src_x_start as usize;
let src_y = src_y_start as usize;
let plane_w_us = plane_w as usize;
    let w = block_w as usize;
    // No scalar tail below: block widths must be a multiple of 8.
    debug_assert!(w.is_multiple_of(8));
let src_ptr = y_plane.as_ptr();
let dst_ptr = out.as_mut_ptr();
for dy in 0..block_h as usize {
let mut x = 0;
while x + 8 <= w {
let row_off = |delta: i32| -> usize {
((src_y as i32 + dy as i32 + delta) as usize) * plane_w_us
+ src_x + x
};
let h6 = |off: usize| -> __m128i {
let base = unsafe { src_ptr.add(off) };
let s0 = unsafe { load_8u_to_s16(base.wrapping_sub(2)) };
let s1 = unsafe { load_8u_to_s16(base.wrapping_sub(1)) };
let s2 = unsafe { load_8u_to_s16(base) };
let s3 = unsafe { load_8u_to_s16(base.add(1)) };
let s4 = unsafe { load_8u_to_s16(base.add(2)) };
let s5 = unsafe { load_8u_to_s16(base.add(3)) };
unsafe { apply_6tap_s16(s0, s1, s2, s3, s4, s5) }
};
let b1_m2 = h6(row_off(-2));
let b1_m1 = h6(row_off(-1));
let b1_0 = h6(row_off(0));
let b1_1 = h6(row_off(1));
let b1_2 = h6(row_off(2));
let b1_3 = h6(row_off(3));
let v6 = |col_extra: usize| -> __m128i {
let load = |delta: i32| -> __m128i {
let off = ((src_y as i32 + dy as i32 + delta) as usize)
* plane_w_us
+ src_x + x + col_extra;
unsafe { load_8u_to_s16(src_ptr.add(off)) }
};
let s0 = load(-2);
let s1 = load(-1);
let s2 = load(0);
let s3 = load(1);
let s4 = load(2);
let s5 = load(3);
unsafe { apply_6tap_s16(s0, s1, s2, s3, s4, s5) }
};
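            // Center position `j`: the vertical 6-tap runs over the
            // unrounded horizontal intermediates, so the math widens to
            // i32 and rounds once at the end with `(v + 512) >> 10`.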
let j_compute = || -> __m128i {
let sign_ext_lo = |v: __m128i| -> __m128i {
unsafe { _mm_unpacklo_epi16(v, _mm_srai_epi16(v, 15)) }
};
let sign_ext_hi = |v: __m128i| -> __m128i {
unsafe { _mm_unpackhi_epi16(v, _mm_srai_epi16(v, 15)) }
};
let one_half = |a_m2: __m128i, a_m1: __m128i, a_0: __m128i,
a_1: __m128i, a_2: __m128i, a_3: __m128i|
-> __m128i {
let sp01 = _mm_add_epi32(a_0, a_1);
let sp12 = _mm_add_epi32(a_m1, a_2);
let sp23 = _mm_add_epi32(a_m2, a_3);
let term20 = _mm_mullo_epi32(sp01, _mm_set1_epi32(20));
let term5 = _mm_mullo_epi32(sp12, _mm_set1_epi32(5));
_mm_add_epi32(_mm_sub_epi32(term20, term5), sp23)
};
let j1_lo = one_half(
sign_ext_lo(b1_m2), sign_ext_lo(b1_m1), sign_ext_lo(b1_0),
sign_ext_lo(b1_1), sign_ext_lo(b1_2), sign_ext_lo(b1_3),
);
let j1_hi = one_half(
sign_ext_hi(b1_m2), sign_ext_hi(b1_m1), sign_ext_hi(b1_0),
sign_ext_hi(b1_1), sign_ext_hi(b1_2), sign_ext_hi(b1_3),
);
let off = _mm_set1_epi32(512);
let j_lo = _mm_srai_epi32(_mm_add_epi32(j1_lo, off), 10);
let j_hi = _mm_srai_epi32(_mm_add_epi32(j1_hi, off), 10);
let s16 = _mm_packs_epi32(j_lo, j_hi);
_mm_packus_epi16(s16, _mm_setzero_si128())
};
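            // The nine quarter-pel positions of the H.264 scheme: half-pel
            // values are `b` (horizontal, row 0), `s` (horizontal, row +1),
            // `h` (vertical, col 0), `m` (vertical, col +1) and the center
            // `j`; each quarter-pel output is the rounded average of two.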
let result = match (x_frac, y_frac) {
(1, 1) => {
let b0 = unsafe { b_from_b1(b1_0) };
let h0 = unsafe { b_from_b1(v6(0)) };
unsafe { _mm_avg_epu8(b0, h0) }
}
(2, 1) => {
let b0 = unsafe { b_from_b1(b1_0) };
let j = j_compute();
unsafe { _mm_avg_epu8(b0, j) }
}
(3, 1) => {
let b0 = unsafe { b_from_b1(b1_0) };
let h1 = unsafe { b_from_b1(v6(1)) };
unsafe { _mm_avg_epu8(b0, h1) }
}
(1, 2) => {
let h0 = unsafe { b_from_b1(v6(0)) };
let j = j_compute();
unsafe { _mm_avg_epu8(h0, j) }
}
(2, 2) => j_compute(),
(3, 2) => {
let h1 = unsafe { b_from_b1(v6(1)) };
let j = j_compute();
unsafe { _mm_avg_epu8(j, h1) }
}
(1, 3) => {
let h0 = unsafe { b_from_b1(v6(0)) };
let b_one = unsafe { b_from_b1(b1_1) };
unsafe { _mm_avg_epu8(h0, b_one) }
}
(2, 3) => {
let j = j_compute();
let b_one = unsafe { b_from_b1(b1_1) };
unsafe { _mm_avg_epu8(j, b_one) }
}
(3, 3) => {
let h1 = unsafe { b_from_b1(v6(1)) };
let b_one = unsafe { b_from_b1(b1_1) };
unsafe { _mm_avg_epu8(h1, b_one) }
}
_ => unreachable!(),
};
unsafe {
_mm_storel_epi64(
dst_ptr.add(dy * out_stride + x) as *mut __m128i,
result,
);
}
x += 8;
}
}
true
}
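// A minimal sanity-check sketch, not part of the original module: it
// compares `sad_w16` against a straightforward scalar reference on fixed
// data. It assumes `std` is available under `cfg(test)` and guards the
// call with a runtime AVX2 check.
#[cfg(test)]
mod tests {
    use super::*;

    // Scalar reference: sum of |source - pred| over a `w x h` block.
    fn sad_scalar(
        source: &[u8],
        source_stride: usize,
        pred: &[u8],
        pred_stride: usize,
        w: usize,
        h: usize,
    ) -> u32 {
        let mut acc = 0u32;
        for y in 0..h {
            for x in 0..w {
                let s = i32::from(source[y * source_stride + x]);
                let p = i32::from(pred[y * pred_stride + x]);
                acc += (s - p).unsigned_abs();
            }
        }
        acc
    }

    #[test]
    fn sad_w16_matches_scalar() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let (stride, h) = (32usize, 16usize);
        let source: Vec<u8> = (0..stride * h).map(|i| (i * 7 % 251) as u8).collect();
        let pred: Vec<u8> = (0..stride * h).map(|i| (i * 13 % 241) as u8).collect();
        let simd = unsafe { sad_w16(&source, stride, &pred, stride, h) };
        assert_eq!(simd, sad_scalar(&source, stride, &pred, stride, 16, h));
    }
}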