#![allow(clippy::undocumented_unsafe_blocks)]
#![allow(unsafe_op_in_unsafe_fn)]
use std::{
arch::x86_64::*,
num::{NonZeroU8, NonZeroUsize},
};
use crate::util::Pixel;
/// Runs the horizontal Wiener refinement on a plane of `T` samples by forwarding to the
/// AVX2 routine that matches the byte size of `T`.
///
/// One-byte samples are handled by `refine_horizontal_wiener_u8`, two-byte samples by
/// `refine_horizontal_wiener_u16`. The slices are reduced to their base pointers; `pitch`,
/// `width`, `height` and `bits_per_sample` are passed through unchanged.
///
/// # Safety
///
/// The CPU must support AVX2. Slice lengths are not checked here and the callees operate on
/// raw pointers, so `src` and `dest` must be large enough for every sample the callee
/// addresses from `pitch`, `width` and `height`.
///
/// # Panics
///
/// Panics if `size_of::<T>()` is neither 1 nor 2.
#[target_feature(enable = "avx2")]
pub(super) unsafe fn refine_horizontal_wiener<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let src_ptr = src.as_ptr();
    let dest_ptr = dest.as_mut_ptr();

    match size_of::<T>() {
        1 => refine_horizontal_wiener_u8(
            src_ptr.cast::<u8>(),
            dest_ptr.cast::<u8>(),
            pitch,
            width,
            height,
            bits_per_sample,
        ),
        2 => refine_horizontal_wiener_u16(
            src_ptr.cast::<u16>(),
            dest_ptr.cast::<u16>(),
            pitch,
            width,
            height,
            bits_per_sample,
        ),
        _ => unreachable!(),
    }
}
/// Runs the vertical Wiener refinement on a plane of `T` samples by forwarding to the AVX2
/// routine that matches the byte size of `T`.
///
/// One-byte samples are handled by `refine_vertical_wiener_u8`, two-byte samples by
/// `refine_vertical_wiener_u16`. The slices are reduced to their base pointers; `pitch`,
/// `width`, `height` and `bits_per_sample` are passed through unchanged.
///
/// # Safety
///
/// The CPU must support AVX2. Slice lengths are not checked here and the callees operate on
/// raw pointers, so `src` and `dest` must be large enough for every sample the callee
/// addresses from `pitch`, `width` and `height`.
///
/// # Panics
///
/// Panics if `size_of::<T>()` is neither 1 nor 2.
#[target_feature(enable = "avx2")]
pub(super) unsafe fn refine_vertical_wiener<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let src_ptr = src.as_ptr();
    let dest_ptr = dest.as_mut_ptr();

    match size_of::<T>() {
        1 => refine_vertical_wiener_u8(
            src_ptr.cast::<u8>(),
            dest_ptr.cast::<u8>(),
            pitch,
            width,
            height,
            bits_per_sample,
        ),
        2 => refine_vertical_wiener_u16(
            src_ptr.cast::<u16>(),
            dest_ptr.cast::<u16>(),
            pitch,
            width,
            height,
            bits_per_sample,
        ),
        _ => unreachable!(),
    }
}
/// Horizontally refines an 8-bit plane: every output sample approximates the value halfway
/// between a source sample and its right-hand neighbour.
///
/// For each row of `width` samples:
/// - columns `2..width - 4` use the 6-tap filter `(1, -5, 20, 20, -5, 1)` with rounding
///   (`+ 16`, `>> 5`) and clamping to `0..=255`; blocks of 32 columns go through
///   `apply_wiener_kernel_u8_fast`, the remainder through the scalar form of the same filter;
/// - the other columns except the last get the rounded average of the sample and its
///   right-hand neighbour;
/// - the last column is copied from `src`.
///
/// Rows narrower than three samples only touch the columns that exist, matching the 16-bit
/// routine. `_bits_per_sample` is ignored: the depth is fixed at 8.
///
/// # Safety
///
/// The CPU must support AVX2. For every `row < height`, `src` must be readable and `dest`
/// writable for `width` samples starting at `row * pitch`.
#[target_feature(enable = "avx2")]
unsafe fn refine_horizontal_wiener_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    _bits_per_sample: NonZeroU8,
) {
    let bits_per_sample: u8 = 8;
    let width = width.get();
    let height = height.get();
    let pitch = pitch.get();

    let max_value: i16 = (1i16 << bits_per_sample) - 1;
    let pixel_max = _mm256_set1_epi16(max_value);
    let sixteen = _mm256_set1_epi16(16);
    // Each 16-bit lane holds two signed byte weights (low byte first) for `maddubs`.
    let w01 = _mm256_set1_epi16(((u16::from((-5i8) as u8) << 8) | 1u16) as i16);
    let w23 = _mm256_set1_epi16(((20u16 << 8) | 20u16) as i16);
    let w45 = _mm256_set1_epi16(((1u16 << 8) | u16::from((-5i8) as u8)) as i16);

    let wiener_start = 2;
    let wiener_end = if width >= 4 { width - 4 } else { wiener_start };

    for y in 0..height {
        // Derive the row pointers from the base so no pointer past the last row is formed.
        let src = src.add(y * pitch);
        let dest = dest.add(y * pitch);

        // The first two columns lack the left context for the 6-tap filter. Guard them so
        // rows of one or two samples never read or write beyond `width`.
        if width >= 2 {
            *dest = ((u16::from(*src) + u16::from(*src.add(1)) + 1) >> 1) as u8;
            if width >= 3 {
                *dest.add(1) =
                    ((u16::from(*src.add(1)) + u16::from(*src.add(2)) + 1) >> 1) as u8;
            }
        }

        let mut x = wiener_start;
        while x + 32 <= wiener_end {
            let m0 = _mm256_loadu_si256(src.add(x - 2).cast());
            let m1 = _mm256_loadu_si256(src.add(x - 1).cast());
            let m2 = _mm256_loadu_si256(src.add(x).cast());
            let m3 = _mm256_loadu_si256(src.add(x + 1).cast());
            let m4 = _mm256_loadu_si256(src.add(x + 2).cast());
            let m5 = _mm256_loadu_si256(src.add(x + 3).cast());
            let result = apply_wiener_kernel_u8_fast(
                m0, m1, m2, m3, m4, m5, w01, w23, w45, sixteen, pixel_max,
            );
            _mm256_storeu_si256(dest.add(x).cast(), result);
            x += 32;
        }

        // Scalar form of the same filter for the columns the vector loop did not cover.
        while x < wiener_end {
            let m0 = i16::from(*src.add(x - 2));
            let m1 = i16::from(*src.add(x - 1));
            let m2 = i16::from(*src.add(x));
            let m3 = i16::from(*src.add(x + 1));
            let m4 = i16::from(*src.add(x + 2));
            let m5 = i16::from(*src.add(x + 3));
            let core = ((m2 + m3) * 4 - (m1 + m4)) * 5;
            let result = (m0 + m5 + core + 16) >> 5;
            *dest.add(x) = result.clamp(0, max_value) as u8;
            x += 1;
        }

        // Right edge: not enough context for six taps, so fall back to the two-tap average.
        for x in wiener_end..(width - 1) {
            *dest.add(x) = ((u16::from(*src.add(x)) + u16::from(*src.add(x + 1)) + 1) >> 1) as u8;
        }

        *dest.add(width - 1) = *src.add(width - 1);
    }
}
/// Horizontally refines a plane of 16-bit samples: every output sample approximates the
/// value halfway between a source sample and its right-hand neighbour.
///
/// For each row of `width` samples:
/// - columns `2..width - 4` use the 6-tap filter `(1, -5, 20, 20, -5, 1)` with rounding
///   (`+ 16`, `>> 5`) and clamping to `0..=(1 << bits_per_sample) - 1`;
/// - the other columns except the last get the rounded average of the sample and its
///   right-hand neighbour;
/// - the last column is copied from `src`.
///
/// Blocks of 16 columns are filtered with `apply_wiener_kernel_u16_fast` when
/// `bits_per_sample <= 15`; otherwise the samples are zero-extended to 32 bits and filtered
/// in two halves with `apply_wiener_kernel_u16`.
///
/// # Safety
///
/// The CPU must support AVX2. For every `row < height`, `src` must be readable and `dest`
/// writable for `width` samples starting at `row * pitch`.
#[target_feature(enable = "avx2")]
unsafe fn refine_horizontal_wiener_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    // Packs two signed 16-bit weights into one 32-bit lane, `lo` in the low half.
    const fn weight_pair(lo: i16, hi: i16) -> i32 {
        ((hi as i32) << 16) | ((lo as i32) & 0xFFFF)
    }

    let depth = bits_per_sample.get();
    let madd_path = depth <= 15;
    let sample_max = (1i32 << depth) - 1;
    let pixel_max = _mm256_set1_epi32(sample_max);
    let four = _mm256_set1_epi32(4);
    let five = _mm256_set1_epi32(5);
    let sixteen = _mm256_set1_epi32(16);
    let w01 = _mm256_set1_epi32(weight_pair(1, -5));
    let w23 = _mm256_set1_epi32(weight_pair(20, 20));
    let w45 = _mm256_set1_epi32(weight_pair(-5, 1));
    let zero = _mm256_setzero_si256();

    let cols = width.get();
    let rows = height.get();
    let stride = pitch.get();

    let filter_begin = 2;
    let filter_end = if cols >= 4 { cols - 4 } else { filter_begin };

    for row in 0..rows {
        let line_in = src.add(row * stride);
        let line_out = dest.add(row * stride);

        // Left edge: two-tap average for the columns that exist and have a right neighbour.
        if cols >= 2 {
            let first = u32::from(*line_in);
            let second = u32::from(*line_in.add(1));
            *line_out = ((first + second + 1) >> 1) as u16;
            if cols >= 3 {
                let third = u32::from(*line_in.add(2));
                *line_out.add(1) = ((second + third + 1) >> 1) as u16;
            }
        }

        // Vector part of the 6-tap range, 16 columns per step.
        let mut x = filter_begin;
        while x + 16 <= filter_end {
            let at = line_in.add(x);
            let m0 = _mm256_loadu_si256(at.sub(2).cast());
            let m1 = _mm256_loadu_si256(at.sub(1).cast());
            let m2 = _mm256_loadu_si256(at.cast());
            let m3 = _mm256_loadu_si256(at.add(1).cast());
            let m4 = _mm256_loadu_si256(at.add(2).cast());
            let m5 = _mm256_loadu_si256(at.add(3).cast());

            let filtered = if madd_path {
                apply_wiener_kernel_u16_fast(
                    m0, m1, m2, m3, m4, m5, w01, w23, w45, sixteen, pixel_max,
                )
            } else {
                // Zero-extend to 32 bits so full 16-bit samples are not read as negative.
                let low = apply_wiener_kernel_u16(
                    _mm256_unpacklo_epi16(m0, zero),
                    _mm256_unpacklo_epi16(m1, zero),
                    _mm256_unpacklo_epi16(m2, zero),
                    _mm256_unpacklo_epi16(m3, zero),
                    _mm256_unpacklo_epi16(m4, zero),
                    _mm256_unpacklo_epi16(m5, zero),
                    four,
                    five,
                    sixteen,
                    pixel_max,
                );
                let high = apply_wiener_kernel_u16(
                    _mm256_unpackhi_epi16(m0, zero),
                    _mm256_unpackhi_epi16(m1, zero),
                    _mm256_unpackhi_epi16(m2, zero),
                    _mm256_unpackhi_epi16(m3, zero),
                    _mm256_unpackhi_epi16(m4, zero),
                    _mm256_unpackhi_epi16(m5, zero),
                    four,
                    five,
                    sixteen,
                    pixel_max,
                );
                _mm256_packus_epi32(low, high)
            };

            _mm256_storeu_si256(line_out.add(x).cast(), filtered);
            x += 16;
        }

        // Scalar remainder of the 6-tap range.
        for x in x..filter_end {
            let at = line_in.add(x);
            let outer = i32::from(*at.sub(2)) + i32::from(*at.add(3));
            let near = i32::from(*at.sub(1)) + i32::from(*at.add(2));
            let centre = i32::from(*at) + i32::from(*at.add(1));
            let value = (outer + (centre * 4 - near) * 5 + 16) >> 5;
            *line_out.add(x) = value.clamp(0, sample_max) as u16;
        }

        // Right edge: two-tap average up to, but not including, the last column.
        for x in filter_end..cols - 1 {
            let here = u32::from(*line_in.add(x));
            let next = u32::from(*line_in.add(x + 1));
            *line_out.add(x) = ((here + next + 1) >> 1) as u16;
        }

        // The last column has no right neighbour and is copied.
        *line_out.add(cols - 1) = *line_in.add(cols - 1);
    }
}
/// Writes the rounded average of two 8-bit rows: `out[i] = (upper[i] + lower[i] + 1) >> 1`
/// for every `i < width`.
///
/// # Safety
///
/// The CPU must support AVX2. `upper` and `lower` must be readable and `out` writable for
/// `width` bytes.
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn average_row_pair_u8(upper: *const u8, lower: *const u8, out: *mut u8, width: usize) {
    let mut i = 0;
    while i + 32 <= width {
        let a = _mm256_loadu_si256(upper.add(i).cast());
        let b = _mm256_loadu_si256(lower.add(i).cast());
        // The unsigned byte average rounds up, i.e. it is exactly `(a + b + 1) >> 1`.
        _mm256_storeu_si256(out.add(i).cast(), _mm256_avg_epu8(a, b));
        i += 32;
    }
    while i < width {
        let a = u16::from(*upper.add(i));
        let b = u16::from(*lower.add(i));
        *out.add(i) = ((a + b + 1) >> 1) as u8;
        i += 1;
    }
}

/// Vertically refines an 8-bit plane: every output sample approximates the value halfway
/// between a source sample and the sample directly below it.
///
/// Row handling for a plane of `height` rows:
/// - rows `2..height - 4` use the 6-tap filter `(1, -5, 20, 20, -5, 1)` over rows
///   `y - 2..=y + 3`, with rounding (`+ 16`, `>> 5`) and clamping to `0..=255`; blocks of 32
///   columns go through `apply_wiener_kernel_u8_fast`, the remainder through the scalar form;
/// - the other rows except the last get the rounded average of the row and the row below;
/// - the last row is copied from `src`.
///
/// Planes shorter than four rows are handled without arithmetic underflow: they simply have
/// no 6-tap rows. `_bits_per_sample` is ignored: the depth is fixed at 8.
///
/// # Safety
///
/// The CPU must support AVX2. For every `row < height`, `src` must be readable and `dest`
/// writable for `width` samples starting at `row * pitch`.
#[target_feature(enable = "avx2")]
unsafe fn refine_vertical_wiener_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    _bits_per_sample: NonZeroU8,
) {
    let bits_per_sample: u8 = 8;
    let pitch = pitch.get();
    let width = width.get();
    let height = height.get();

    let max_value: i16 = (1i16 << bits_per_sample) - 1;
    let pixel_max = _mm256_set1_epi16(max_value);
    let sixteen = _mm256_set1_epi16(16);
    // Each 16-bit lane holds two signed byte weights (low byte first) for `maddubs`.
    let w01 = _mm256_set1_epi16(((u16::from((-5i8) as u8) << 8) | 1u16) as i16);
    let w23 = _mm256_set1_epi16(((20u16 << 8) | 20u16) as i16);
    let w45 = _mm256_set1_epi16(((1u16 << 8) | u16::from((-5i8) as u8)) as i16);

    let last_row = height - 1;
    // At most two averaged rows at the top, never including the last row.
    let top_rows = last_row.min(2);
    // `saturating_sub` keeps planes with fewer than four rows from underflowing.
    let wiener_end = height.saturating_sub(4).max(2);

    for y in 0..top_rows {
        let row = y * pitch;
        average_row_pair_u8(src.add(row), src.add(row + pitch), dest.add(row), width);
    }

    for y in 2..wiener_end {
        let row = y * pitch;
        let row_m0 = src.add(row - pitch * 2);
        let row_m1 = src.add(row - pitch);
        let row_m2 = src.add(row);
        let row_m3 = src.add(row + pitch);
        let row_m4 = src.add(row + pitch * 2);
        let row_m5 = src.add(row + pitch * 3);
        let row_dest = dest.add(row);

        let mut i = 0;
        while i + 32 <= width {
            let result = apply_wiener_kernel_u8_fast(
                _mm256_loadu_si256(row_m0.add(i).cast()),
                _mm256_loadu_si256(row_m1.add(i).cast()),
                _mm256_loadu_si256(row_m2.add(i).cast()),
                _mm256_loadu_si256(row_m3.add(i).cast()),
                _mm256_loadu_si256(row_m4.add(i).cast()),
                _mm256_loadu_si256(row_m5.add(i).cast()),
                w01,
                w23,
                w45,
                sixteen,
                pixel_max,
            );
            _mm256_storeu_si256(row_dest.add(i).cast(), result);
            i += 32;
        }

        // Scalar form of the same filter for the columns the vector loop did not cover.
        while i < width {
            let m0 = i16::from(*row_m0.add(i));
            let m1 = i16::from(*row_m1.add(i));
            let m2 = i16::from(*row_m2.add(i));
            let m3 = i16::from(*row_m3.add(i));
            let m4 = i16::from(*row_m4.add(i));
            let m5 = i16::from(*row_m5.add(i));
            let core = ((m2 + m3) * 4 - (m1 + m4)) * 5;
            let result = (m0 + m5 + core + 16) >> 5;
            *row_dest.add(i) = result.clamp(0, max_value) as u8;
            i += 1;
        }
    }

    // Bottom rows lack the three rows of look-ahead the 6-tap filter needs.
    for y in wiener_end..last_row {
        let row = y * pitch;
        average_row_pair_u8(src.add(row), src.add(row + pitch), dest.add(row), width);
    }

    // The last row has no row below it and is copied.
    std::ptr::copy_nonoverlapping(
        src.add(last_row * pitch),
        dest.add(last_row * pitch),
        width,
    );
}
/// Writes the rounded average of two 16-bit rows: `out[i] = (upper[i] + lower[i] + 1) >> 1`
/// for every `i < width`.
///
/// # Safety
///
/// The CPU must support AVX2. `upper` and `lower` must be readable and `out` writable for
/// `width` samples.
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn average_row_pair_u16(
    upper: *const u16,
    lower: *const u16,
    out: *mut u16,
    width: usize,
) {
    let mut i = 0;
    while i + 16 <= width {
        let a = _mm256_loadu_si256(upper.add(i).cast());
        let b = _mm256_loadu_si256(lower.add(i).cast());
        // The unsigned 16-bit average rounds up, i.e. it is exactly `(a + b + 1) >> 1`.
        _mm256_storeu_si256(out.add(i).cast(), _mm256_avg_epu16(a, b));
        i += 16;
    }
    while i < width {
        let a = u32::from(*upper.add(i));
        let b = u32::from(*lower.add(i));
        *out.add(i) = ((a + b + 1) >> 1) as u16;
        i += 1;
    }
}

/// Vertically refines a plane of 16-bit samples: every output sample approximates the value
/// halfway between a source sample and the sample directly below it.
///
/// Row handling for a plane of `height` rows:
/// - rows `2..height - 4` use the 6-tap filter `(1, -5, 20, 20, -5, 1)` over rows
///   `y - 2..=y + 3`, with rounding (`+ 16`, `>> 5`) and clamping to
///   `0..=(1 << bits_per_sample) - 1`;
/// - the other rows except the last get the rounded average of the row and the row below;
/// - the last row is copied from `src`.
///
/// Blocks of 16 columns are filtered with `apply_wiener_kernel_u16_fast` when
/// `bits_per_sample <= 15`; otherwise the samples are zero-extended to 32 bits and filtered
/// in two halves with `apply_wiener_kernel_u16`. Planes shorter than four rows are handled
/// without arithmetic underflow: they simply have no 6-tap rows.
///
/// # Safety
///
/// The CPU must support AVX2. For every `row < height`, `src` must be readable and `dest`
/// writable for `width` samples starting at `row * pitch`.
#[target_feature(enable = "avx2")]
unsafe fn refine_vertical_wiener_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pitch = pitch.get();
    let width = width.get();
    let height = height.get();

    let use_fast = bits_per_sample.get() <= 15;
    let max_value = (1i32 << bits_per_sample.get()) - 1;
    let pixel_max = _mm256_set1_epi32(max_value);
    let four = _mm256_set1_epi32(4);
    let five = _mm256_set1_epi32(5);
    let sixteen = _mm256_set1_epi32(16);
    // Each 32-bit lane holds two signed 16-bit weights (low half first) for `madd`.
    let w01 = _mm256_set1_epi32(((((-5i32) as u32 & 0xFFFF) << 16) | (1u32 & 0xFFFF)) as i32);
    let w23 = _mm256_set1_epi32((((20u32 & 0xFFFF) << 16) | (20u32 & 0xFFFF)) as i32);
    let w45 = _mm256_set1_epi32((((1u32 & 0xFFFF) << 16) | ((-5i32) as u32 & 0xFFFF)) as i32);
    let zero = _mm256_setzero_si256();

    let last_row = height - 1;
    // At most two averaged rows at the top, never including the last row.
    let top_rows = last_row.min(2);
    // `saturating_sub` keeps planes with fewer than four rows from underflowing.
    let wiener_end = height.saturating_sub(4).max(2);

    for y in 0..top_rows {
        let row = y * pitch;
        average_row_pair_u16(src.add(row), src.add(row + pitch), dest.add(row), width);
    }

    for y in 2..wiener_end {
        let row = y * pitch;
        let row_m0 = src.add(row - pitch * 2);
        let row_m1 = src.add(row - pitch);
        let row_m2 = src.add(row);
        let row_m3 = src.add(row + pitch);
        let row_m4 = src.add(row + pitch * 2);
        let row_m5 = src.add(row + pitch * 3);
        let row_dest = dest.add(row);

        let mut i = 0;
        while i + 16 <= width {
            let m0 = _mm256_loadu_si256(row_m0.add(i).cast());
            let m1 = _mm256_loadu_si256(row_m1.add(i).cast());
            let m2 = _mm256_loadu_si256(row_m2.add(i).cast());
            let m3 = _mm256_loadu_si256(row_m3.add(i).cast());
            let m4 = _mm256_loadu_si256(row_m4.add(i).cast());
            let m5 = _mm256_loadu_si256(row_m5.add(i).cast());

            let result = if use_fast {
                apply_wiener_kernel_u16_fast(
                    m0, m1, m2, m3, m4, m5, w01, w23, w45, sixteen, pixel_max,
                )
            } else {
                // Zero-extend to 32 bits so full 16-bit samples are not read as negative.
                let result_lo = apply_wiener_kernel_u16(
                    _mm256_unpacklo_epi16(m0, zero),
                    _mm256_unpacklo_epi16(m1, zero),
                    _mm256_unpacklo_epi16(m2, zero),
                    _mm256_unpacklo_epi16(m3, zero),
                    _mm256_unpacklo_epi16(m4, zero),
                    _mm256_unpacklo_epi16(m5, zero),
                    four,
                    five,
                    sixteen,
                    pixel_max,
                );
                let result_hi = apply_wiener_kernel_u16(
                    _mm256_unpackhi_epi16(m0, zero),
                    _mm256_unpackhi_epi16(m1, zero),
                    _mm256_unpackhi_epi16(m2, zero),
                    _mm256_unpackhi_epi16(m3, zero),
                    _mm256_unpackhi_epi16(m4, zero),
                    _mm256_unpackhi_epi16(m5, zero),
                    four,
                    five,
                    sixteen,
                    pixel_max,
                );
                _mm256_packus_epi32(result_lo, result_hi)
            };

            _mm256_storeu_si256(row_dest.add(i).cast(), result);
            i += 16;
        }

        // Scalar form of the same filter for the columns the vector loop did not cover.
        while i < width {
            let m0 = i32::from(*row_m0.add(i));
            let m1 = i32::from(*row_m1.add(i));
            let m2 = i32::from(*row_m2.add(i));
            let m3 = i32::from(*row_m3.add(i));
            let m4 = i32::from(*row_m4.add(i));
            let m5 = i32::from(*row_m5.add(i));
            let core = ((m2 + m3) * 4 - (m1 + m4)) * 5;
            let result = (m0 + m5 + core + 16) >> 5;
            *row_dest.add(i) = result.clamp(0, max_value) as u16;
            i += 1;
        }
    }

    // Bottom rows lack the three rows of look-ahead the 6-tap filter needs.
    for y in wiener_end..last_row {
        let row = y * pitch;
        average_row_pair_u16(src.add(row), src.add(row + pitch), dest.add(row), width);
    }

    // The last row has no row below it and is copied.
    std::ptr::copy_nonoverlapping(
        src.add(last_row * pitch),
        dest.add(last_row * pitch),
        width,
    );
}
/// Applies a 6-tap filter to 32 unsigned 8-bit samples at once.
///
/// `m0_bytes..m5_bytes` hold the six taps for each of the 32 output positions. Taps are
/// interleaved pairwise (`m0`/`m1`, `m2`/`m3`, `m4`/`m5`) and multiplied with the signed byte
/// weight pairs in `w01`, `w23` and `w45` via `_mm256_maddubs_epi16`, which sums each pair
/// into a signed 16-bit lane. The three partial sums plus `sixteen` are added, shifted right
/// arithmetically by 5, limited to `0..=pixel_max` and packed back to bytes.
///
/// Unpacking and packing both operate per 128-bit lane, so the output bytes are in the same
/// order as the input bytes.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_wiener_kernel_u8_fast(
    m0_bytes: __m256i,
    m1_bytes: __m256i,
    m2_bytes: __m256i,
    m3_bytes: __m256i,
    m4_bytes: __m256i,
    m5_bytes: __m256i,
    w01: __m256i,
    w23: __m256i,
    w45: __m256i,
    sixteen: __m256i,
    pixel_max: __m256i,
) -> __m256i {
    let floor = _mm256_setzero_si256();

    // Low eight bytes of each 128-bit lane.
    let outer_left_lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(m0_bytes, m1_bytes), w01);
    let centre_lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(m2_bytes, m3_bytes), w23);
    let outer_right_lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(m4_bytes, m5_bytes), w45);
    let sum_lo = _mm256_add_epi16(
        _mm256_add_epi16(outer_left_lo, centre_lo),
        _mm256_add_epi16(outer_right_lo, sixteen),
    );
    let shifted_lo = _mm256_srai_epi16(sum_lo, 5);
    let bounded_lo = _mm256_max_epi16(floor, _mm256_min_epi16(shifted_lo, pixel_max));

    // High eight bytes of each 128-bit lane.
    let outer_left_hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(m0_bytes, m1_bytes), w01);
    let centre_hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(m2_bytes, m3_bytes), w23);
    let outer_right_hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(m4_bytes, m5_bytes), w45);
    let sum_hi = _mm256_add_epi16(
        _mm256_add_epi16(outer_left_hi, centre_hi),
        _mm256_add_epi16(outer_right_hi, sixteen),
    );
    let shifted_hi = _mm256_srai_epi16(sum_hi, 5);
    let bounded_hi = _mm256_max_epi16(floor, _mm256_min_epi16(shifted_hi, pixel_max));

    _mm256_packus_epi16(bounded_lo, bounded_hi)
}
/// Applies a 6-tap filter to 16 samples stored as 16-bit lanes.
///
/// `m0..m5` hold the six taps for each of the 16 output positions. Taps are interleaved
/// pairwise (`m0`/`m1`, `m2`/`m3`, `m4`/`m5`) and multiplied with the signed 16-bit weight
/// pairs in `w01`, `w23` and `w45` via `_mm256_madd_epi16`, which sums each pair into a
/// signed 32-bit lane. The three partial sums plus `sixteen` are added, shifted right
/// arithmetically by 5, limited to `0..=pixel_max` and packed back to 16-bit lanes.
///
/// `_mm256_madd_epi16` reads the samples as signed 16-bit values, so samples of `0x8000` and
/// above would be treated as negative; the callers in this file select this kernel only for
/// bit depths of at most 15.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_wiener_kernel_u16_fast(
    m0: __m256i,
    m1: __m256i,
    m2: __m256i,
    m3: __m256i,
    m4: __m256i,
    m5: __m256i,
    w01: __m256i,
    w23: __m256i,
    w45: __m256i,
    sixteen: __m256i,
    pixel_max: __m256i,
) -> __m256i {
    let floor = _mm256_setzero_si256();

    // Low four samples of each 128-bit lane.
    let outer_left_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(m0, m1), w01);
    let centre_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(m2, m3), w23);
    let outer_right_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(m4, m5), w45);
    let sum_lo = _mm256_add_epi32(
        _mm256_add_epi32(outer_left_lo, centre_lo),
        _mm256_add_epi32(outer_right_lo, sixteen),
    );
    let shifted_lo = _mm256_srai_epi32(sum_lo, 5);
    let bounded_lo = _mm256_max_epi32(floor, _mm256_min_epi32(shifted_lo, pixel_max));

    // High four samples of each 128-bit lane.
    let outer_left_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(m0, m1), w01);
    let centre_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(m2, m3), w23);
    let outer_right_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(m4, m5), w45);
    let sum_hi = _mm256_add_epi32(
        _mm256_add_epi32(outer_left_hi, centre_hi),
        _mm256_add_epi32(outer_right_hi, sixteen),
    );
    let shifted_hi = _mm256_srai_epi32(sum_hi, 5);
    let bounded_hi = _mm256_max_epi32(floor, _mm256_min_epi32(shifted_hi, pixel_max));

    _mm256_packus_epi32(bounded_lo, bounded_hi)
}
/// Applies the 6-tap filter to eight samples held as 32-bit lanes.
///
/// Per lane this computes
/// `((m0 + m5) + ((m2 + m3) * four - (m1 + m4)) * five + sixteen) >> 5` with an arithmetic
/// shift and limits the result to `0..=pixel_max`. The result stays in 32-bit lanes; the
/// callers in this file pack two results back to 16 bits.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_wiener_kernel_u16(
    m0: __m256i,
    m1: __m256i,
    m2: __m256i,
    m3: __m256i,
    m4: __m256i,
    m5: __m256i,
    four: __m256i,
    five: __m256i,
    sixteen: __m256i,
    pixel_max: __m256i,
) -> __m256i {
    // Inner taps weighted by `four`, minus the neighbouring taps, all scaled by `five`.
    let centre = _mm256_mullo_epi32(_mm256_add_epi32(m2, m3), four);
    let near = _mm256_add_epi32(m1, m4);
    let weighted = _mm256_mullo_epi32(_mm256_sub_epi32(centre, near), five);

    // Outer taps have unit weight; add the rounding term before shifting.
    let far = _mm256_add_epi32(m0, m5);
    let total = _mm256_add_epi32(_mm256_add_epi32(far, weighted), sixteen);
    let shifted = _mm256_srai_epi32(total, 5);

    _mm256_max_epi32(_mm256_setzero_si256(), _mm256_min_epi32(shifted, pixel_max))
}