#![allow(clippy::undocumented_unsafe_blocks)]
#![allow(unsafe_op_in_unsafe_fn)]
#[cfg(test)]
use std::mem::size_of;
use std::{arch::x86_64::*, num::NonZeroUsize};
#[cfg(test)]
use crate::util::Pixel;
use cpudetect::target_family;
use semisafe::slice::get_mut as semisafe_get_mut;
/// Test-only dispatcher that forwards a `width` x `height` block to the matching
/// const-generic SAD kernel.
///
/// The byte size of `T` selects the kernel family: 1-byte elements are routed to
/// [`get_sad_u8`] and 2-byte elements to [`get_sad_u16`]. Both slices are handed on
/// as raw byte pointers, together with the pitches exactly as received.
///
/// # Safety
///
/// Only the base pointers of `src` and `ref_` are forwarded; the slice lengths are
/// never checked here. The caller must guarantee that both slices cover every row
/// the selected kernel reads, and that the CPU supports the AVX2 instructions the
/// kernels use.
///
/// # Panics
///
/// Panics through `unreachable!` when the (element size, width, height)
/// combination is not one of the entries in the table below.
#[cfg(test)]
#[must_use]
#[target_family("x86_64_v3")]
pub(super) unsafe fn get_sad<T: Pixel>(
    width: NonZeroUsize,
    height: NonZeroUsize,
    src: &[T],
    src_pitch: NonZeroUsize,
    ref_: &[T],
    ref_pitch: NonZeroUsize,
) -> u64 {
    // Both kernel families take untyped byte pointers.
    let src_bytes: *const u8 = src.as_ptr().cast();
    let ref_bytes: *const u8 = ref_.as_ptr().cast();

    // Expands one (width, height) table into a match arm per pair for each of the
    // two supported element sizes.
    macro_rules! dispatch {
        ($(($w:literal, $h:literal)),+ $(,)?) => {
            match (size_of::<T>(), width.get(), height.get()) {
                $(
                    (1, $w, $h) => {
                        get_sad_u8::<$w, $h>(src_bytes, src_pitch, ref_bytes, ref_pitch)
                    }
                )+
                $(
                    (2, $w, $h) => {
                        get_sad_u16::<$w, $h>(src_bytes, src_pitch, ref_bytes, ref_pitch)
                    }
                )+
                _ => unreachable!("unsupported block size"),
            }
        };
    }

    dispatch!(
        (2, 2),
        (2, 4),
        (4, 2),
        (4, 4),
        (4, 8),
        (8, 1),
        (8, 2),
        (8, 4),
        (8, 8),
        (8, 16),
        (16, 1),
        (16, 2),
        (16, 4),
        (16, 8),
        (16, 16),
        (16, 32),
        (32, 8),
        (32, 16),
        (32, 32),
        (32, 64),
        (64, 16),
        (64, 32),
        (64, 64),
        (64, 128),
        (128, 32),
        (128, 64),
        (128, 128),
    )
}
/// Builds a 256-bit vector whose low 128 bits are `lo` and whose high 128 bits
/// are `hi`.
///
/// # Safety
///
/// Uses an AVX2 intrinsic, so the running CPU must support AVX2.
#[target_family("x86_64_v3")]
unsafe fn combine_m128i(lo: __m128i, hi: __m128i) -> __m256i {
    // The cast only fills the low half; the insert then writes `hi` into the
    // upper 128-bit lane.
    let widened = _mm256_castsi128_si256(lo);
    _mm256_inserti128_si256::<1>(widened, hi)
}
/// Gathers up to `32 / WIDTH` rows of `WIDTH` bytes each into one 256-bit vector.
///
/// Row `n` is read from `ptr + n * pitch` (pitch in bytes) and stored in the
/// `n`-th `WIDTH`-byte slot of the result. Slots belonging to rows at index
/// `rows` or above are zero. Only widths 16, 8, 4 and 2 are handled; any other
/// width hits `unreachable!`.
///
/// # Safety
///
/// `ptr` must be valid for reading `WIDTH` bytes at each of the first `rows`
/// row offsets. For widths 16, 8 and 4 the first row is read unconditionally.
/// The running CPU must support the AVX/AVX2 instructions used here.
#[target_family("x86_64_v3")]
unsafe fn pack_u8_rows<const WIDTH: usize>(ptr: *const u8, pitch: usize, rows: usize) -> __m256i {
    debug_assert!(rows > 0);
    debug_assert!(rows <= 32 / WIDTH);

    match WIDTH {
        16 => {
            // Two 16-byte rows fill the vector; the second is optional.
            let top = _mm_loadu_si128(ptr.cast());
            let bottom = if rows <= 1 {
                _mm_setzero_si128()
            } else {
                _mm_loadu_si128(ptr.add(pitch).cast())
            };
            combine_m128i(top, bottom)
        }
        8 => {
            // Rows past the requested count contribute a zero slot.
            let row_or_zero = |row: usize| {
                if rows > row {
                    _mm_loadl_epi64(ptr.add(pitch * row).cast())
                } else {
                    _mm_setzero_si128()
                }
            };
            let lower = _mm_unpacklo_epi64(_mm_loadl_epi64(ptr.cast()), row_or_zero(1));
            let upper = _mm_unpacklo_epi64(row_or_zero(2), row_or_zero(3));
            combine_m128i(lower, upper)
        }
        4 => {
            // Each row is four bytes, moved around as a single 32-bit lane.
            let read_row =
                |row: usize| ptr.add(pitch * row).cast::<u32>().read_unaligned() as i32;
            let row_or_zero = |row: usize| if rows > row { read_row(row) } else { 0 };
            let lower = _mm_setr_epi32(
                read_row(0),
                row_or_zero(1),
                row_or_zero(2),
                row_or_zero(3),
            );
            let upper = _mm_setr_epi32(
                row_or_zero(4),
                row_or_zero(5),
                row_or_zero(6),
                row_or_zero(7),
            );
            combine_m128i(lower, upper)
        }
        2 => {
            // Each row is two bytes; stage them in a zeroed 16-slot buffer so the
            // unused slots stay zero, then load the buffer as one vector.
            let mut slots = [0u16; 16];
            for row in 0..rows {
                let pair = ptr.add(row * pitch).cast::<u16>().read_unaligned();
                *semisafe_get_mut(&mut slots, row) = pair;
            }
            _mm256_loadu_si256(slots.as_ptr().cast())
        }
        _ => unreachable!(),
    }
}
/// Gathers up to `16 / WIDTH` rows of `WIDTH` 16-bit elements each into one
/// 256-bit vector.
///
/// Row `n` is read from `ptr + n * pitch` (pitch counted in `u16` elements) and
/// stored in the `n`-th slot of the result. Slots belonging to rows at index
/// `rows` or above are zero. Only widths 4 and 2 are handled; any other width
/// hits `unreachable!`.
///
/// # Safety
///
/// `ptr` must be valid for reading `WIDTH` elements at each of the first `rows`
/// row offsets; the loads are unaligned. For width 4 the first row is read
/// unconditionally. The running CPU must support the AVX/AVX2 instructions used
/// here.
#[target_family("x86_64_v3")]
unsafe fn pack_u16_rows<const WIDTH: usize>(ptr: *const u16, pitch: usize, rows: usize) -> __m256i {
    debug_assert!(rows > 0);
    debug_assert!(rows <= 16 / WIDTH);

    match WIDTH {
        4 => {
            // Four elements are eight bytes: one 64-bit load per row.
            let row_or_zero = |row: usize| {
                if rows > row {
                    _mm_loadl_epi64(ptr.add(pitch * row).cast())
                } else {
                    _mm_setzero_si128()
                }
            };
            let lower = _mm_unpacklo_epi64(_mm_loadl_epi64(ptr.cast()), row_or_zero(1));
            let upper = _mm_unpacklo_epi64(row_or_zero(2), row_or_zero(3));
            combine_m128i(lower, upper)
        }
        2 => {
            // Two elements are four bytes: one 32-bit lane per row, zero when the
            // row was not requested.
            let lane = |row: usize| -> i32 {
                if row >= rows {
                    return 0;
                }
                ptr.add(row * pitch).cast::<u32>().read_unaligned() as i32
            };
            _mm256_setr_epi32(
                lane(0),
                lane(1),
                lane(2),
                lane(3),
                lane(4),
                lane(5),
                lane(6),
                lane(7),
            )
        }
        _ => unreachable!(),
    }
}
/// Sum of absolute differences between two `WIDTH` x `HEIGHT` blocks of bytes.
///
/// Row `j` of each block starts at `base + j * pitch`, with the pitch in bytes.
/// Widths 128 and 64 read four and two 32-byte chunks per row respectively. Any
/// other width of at least 32 reads one 32-byte chunk per row. Narrower widths
/// pack several rows into one vector through `pack_u8_rows` before comparing.
///
/// # Safety
///
/// `src` and `ref_` must be valid for reading every byte of the block at the
/// given pitches; all loads are unaligned. The running CPU must support AVX2.
#[must_use]
#[target_family("x86_64_v3")]
pub(crate) unsafe fn get_sad_u8<const WIDTH: usize, const HEIGHT: usize>(
    src: *const u8,
    src_pitch: NonZeroUsize,
    ref_: *const u8,
    ref_pitch: NonZeroUsize,
) -> u64 {
    let src_stride = src_pitch.get();
    let ref_stride = ref_pitch.get();

    // Every arm leaves its result as four 64-bit partial sums in one vector.
    let partial: __m256i = match WIDTH {
        128 => {
            // One independent accumulator per 32-byte column chunk.
            let mut acc = [_mm256_setzero_si256(); 4];
            for row in 0..HEIGHT {
                let s = src.add(row * src_stride);
                let r = ref_.add(row * ref_stride);
                for (chunk, slot) in acc.iter_mut().enumerate() {
                    let offset = chunk * 32;
                    let sad = _mm256_sad_epu8(
                        _mm256_loadu_si256(s.add(offset).cast()),
                        _mm256_loadu_si256(r.add(offset).cast()),
                    );
                    *slot = _mm256_add_epi64(*slot, sad);
                }
            }
            let [a, b, c, d] = acc;
            _mm256_add_epi64(_mm256_add_epi64(a, b), _mm256_add_epi64(c, d))
        }
        64 => {
            let mut acc = [_mm256_setzero_si256(); 2];
            for row in 0..HEIGHT {
                let s = src.add(row * src_stride);
                let r = ref_.add(row * ref_stride);
                for (chunk, slot) in acc.iter_mut().enumerate() {
                    let offset = chunk * 32;
                    let sad = _mm256_sad_epu8(
                        _mm256_loadu_si256(s.add(offset).cast()),
                        _mm256_loadu_si256(r.add(offset).cast()),
                    );
                    *slot = _mm256_add_epi64(*slot, sad);
                }
            }
            let [a, b] = acc;
            _mm256_add_epi64(a, b)
        }
        _ if WIDTH >= 32 => {
            // One 32-byte chunk per row; even and odd rows use separate
            // accumulators.
            let mut even_rows = _mm256_setzero_si256();
            let mut odd_rows = _mm256_setzero_si256();
            for row in 0..HEIGHT {
                let sad = _mm256_sad_epu8(
                    _mm256_loadu_si256(src.add(row * src_stride).cast()),
                    _mm256_loadu_si256(ref_.add(row * ref_stride).cast()),
                );
                let target = if row % 2 == 0 {
                    &mut even_rows
                } else {
                    &mut odd_rows
                };
                *target = _mm256_add_epi64(*target, sad);
            }
            _mm256_add_epi64(even_rows, odd_rows)
        }
        _ => {
            // Narrow blocks: several rows share one 256-bit vector.
            let rows_per_vec = 32 / WIDTH;
            let leftover = HEIGHT % rows_per_vec;
            let full_rows = HEIGHT - leftover;

            let mut acc = _mm256_setzero_si256();
            let mut row = 0;
            while row < full_rows {
                let s = pack_u8_rows::<WIDTH>(src.add(row * src_stride), src_stride, rows_per_vec);
                let r =
                    pack_u8_rows::<WIDTH>(ref_.add(row * ref_stride), ref_stride, rows_per_vec);
                acc = _mm256_add_epi64(acc, _mm256_sad_epu8(s, r));
                row += rows_per_vec;
            }
            if leftover != 0 {
                // Final partial group; the unused slots are zero on both sides
                // and therefore add nothing.
                let s =
                    pack_u8_rows::<WIDTH>(src.add(full_rows * src_stride), src_stride, leftover);
                let r =
                    pack_u8_rows::<WIDTH>(ref_.add(full_rows * ref_stride), ref_stride, leftover);
                acc = _mm256_add_epi64(acc, _mm256_sad_epu8(s, r));
            }
            acc
        }
    };

    // Horizontal reduction: 256 -> 128 -> 64 bits.
    let halves = _mm_add_epi64(
        _mm256_castsi256_si128(partial),
        _mm256_extracti128_si256::<1>(partial),
    );
    let total = _mm_add_epi64(halves, _mm_unpackhi_epi64(halves, halves));
    _mm_cvtsi128_si64(total) as u64
}
/// Sum of absolute differences between two `WIDTH` x `HEIGHT` blocks of 16-bit
/// elements.
///
/// The byte pointers are reinterpreted as `u16` pointers, and row `j` starts at
/// `base + j * pitch` with the pitch counted in `u16` elements. Widths of 16 or
/// more read 16 elements per load, width 8 pairs two rows per vector, and widths
/// 4 and 2 pack rows through `pack_u16_rows`. Any other width hits
/// `unreachable!`.
///
/// Differences are accumulated in 32-bit lanes, and the returned value is the
/// 32-bit total zero-extended to `u64`.
///
/// # Safety
///
/// `src` and `ref_` must be valid for reading every element of the block at the
/// given pitches; all loads are unaligned. The running CPU must support AVX2.
#[must_use]
#[target_family("x86_64_v3")]
pub(crate) unsafe fn get_sad_u16<const WIDTH: usize, const HEIGHT: usize>(
    src: *const u8,
    src_pitch: NonZeroUsize,
    ref_: *const u8,
    ref_pitch: NonZeroUsize,
) -> u64 {
    let src = src.cast::<u16>();
    let ref_ = ref_.cast::<u16>();
    let src_stride = src_pitch.get();
    let ref_stride = ref_pitch.get();

    let zero = _mm256_setzero_si256();
    let mut sums_lo = zero;
    let mut sums_hi = zero;

    // |s - r| per 16-bit lane (max minus min avoids unsigned underflow), then
    // widened to 32 bits by interleaving with zero and added to the accumulators.
    macro_rules! accumulate {
        ($s:expr, $r:expr) => {{
            let (s, r) = ($s, $r);
            let diff = _mm256_sub_epi16(_mm256_max_epu16(s, r), _mm256_min_epu16(s, r));
            sums_lo = _mm256_add_epi32(sums_lo, _mm256_unpacklo_epi16(diff, zero));
            sums_hi = _mm256_add_epi32(sums_hi, _mm256_unpackhi_epi16(diff, zero));
        }};
    }

    match WIDTH {
        _ if WIDTH >= 16 => {
            for row in 0..HEIGHT {
                let s_row = src.add(row * src_stride);
                let r_row = ref_.add(row * ref_stride);
                let mut col = 0;
                while col < WIDTH {
                    accumulate!(
                        _mm256_loadu_si256(s_row.add(col).cast()),
                        _mm256_loadu_si256(r_row.add(col).cast())
                    );
                    col += 16;
                }
            }
        }
        8 => {
            // Two 8-element rows fill one vector. A trailing odd row is handled
            // after the 256 -> 128 fold below.
            let paired_rows = HEIGHT - HEIGHT % 2;
            let mut row = 0;
            while row < paired_rows {
                let s = combine_m128i(
                    _mm_loadu_si128(src.add(row * src_stride).cast()),
                    _mm_loadu_si128(src.add((row + 1) * src_stride).cast()),
                );
                let r = combine_m128i(
                    _mm_loadu_si128(ref_.add(row * ref_stride).cast()),
                    _mm_loadu_si128(ref_.add((row + 1) * ref_stride).cast()),
                );
                accumulate!(s, r);
                row += 2;
            }
        }
        4 | 2 => {
            let rows_per_vec = 16 / WIDTH;
            let leftover = HEIGHT % rows_per_vec;
            let full_rows = HEIGHT - leftover;

            let mut row = 0;
            while row < full_rows {
                accumulate!(
                    pack_u16_rows::<WIDTH>(src.add(row * src_stride), src_stride, rows_per_vec),
                    pack_u16_rows::<WIDTH>(ref_.add(row * ref_stride), ref_stride, rows_per_vec)
                );
                row += rows_per_vec;
            }
            if leftover != 0 {
                // Final partial group; unused slots are zero on both sides.
                accumulate!(
                    pack_u16_rows::<WIDTH>(src.add(full_rows * src_stride), src_stride, leftover),
                    pack_u16_rows::<WIDTH>(ref_.add(full_rows * ref_stride), ref_stride, leftover)
                );
            }
        }
        _ => unreachable!(),
    }

    // Fold the two accumulators together, then 256 -> 128 bits.
    let both = _mm256_add_epi32(sums_lo, sums_hi);
    let mut sum = _mm_add_epi32(
        _mm256_castsi256_si128(both),
        _mm256_extracti128_si256::<1>(both),
    );

    if WIDTH == 8 && HEIGHT % 2 != 0 {
        // The unpaired last row of an 8-wide block fits a single 128-bit vector.
        let last = HEIGHT - 1;
        let zero128 = _mm_setzero_si128();
        let s = _mm_loadu_si128(src.add(last * src_stride).cast());
        let r = _mm_loadu_si128(ref_.add(last * ref_stride).cast());
        let diff = _mm_sub_epi16(_mm_max_epu16(s, r), _mm_min_epu16(s, r));
        sum = _mm_add_epi32(sum, _mm_unpacklo_epi16(diff, zero128));
        sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(diff, zero128));
    }

    // Horizontal add of the four 32-bit lanes into lane 0.
    let sum = _mm_add_epi32(sum, _mm_srli_si128::<8>(sum));
    let sum = _mm_add_epi32(sum, _mm_srli_epi64::<32>(sum));
    u64::from(_mm_cvtsi128_si32(sum) as u32)
}