#![allow(clippy::undocumented_unsafe_blocks)]
use std::{
arch::x86_64::*,
num::{NonZeroU8, NonZeroUsize},
};
use crate::util::Pixel;
/// Horizontal half-pel bilinear refinement, dispatched on the sample size.
///
/// Forwards to the 8-bit kernel when `T` is one byte wide and to the 16-bit
/// kernel when it is two bytes wide. `_bits_per_sample` is accepted but not
/// used by this implementation.
///
/// No length checks are made on `dest` or `src` before their raw pointers are
/// handed to the kernels; the kernels index up to
/// `(height - 1) * pitch + width` elements in each buffer.
///
/// # Panics
///
/// Panics (via `unreachable!`) if `size_of::<T>()` is neither 1 nor 2.
#[target_feature(enable = "avx2")]
pub(super) fn refine_horizontal_bilinear<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    _bits_per_sample: NonZeroU8,
) {
    let input = src.as_ptr();
    let output = dest.as_mut_ptr();
    let sample_bytes = size_of::<T>();

    if sample_bytes == 1 {
        unsafe {
            refine_horizontal_bilinear_u8(
                input.cast::<u8>(),
                output.cast::<u8>(),
                pitch,
                width,
                height,
            );
        }
    } else if sample_bytes == 2 {
        unsafe {
            refine_horizontal_bilinear_u16(
                input.cast::<u16>(),
                output.cast::<u16>(),
                pitch,
                width,
                height,
            );
        }
    } else {
        unreachable!()
    }
}
/// Vertical half-pel bilinear refinement, dispatched on the sample size.
///
/// Forwards to the 8-bit kernel when `T` is one byte wide and to the 16-bit
/// kernel when it is two bytes wide. `_bits_per_sample` is accepted but not
/// used by this implementation.
///
/// No length checks are made on `dest` or `src` before their raw pointers are
/// handed to the kernels; the kernels index up to
/// `(height - 1) * pitch + width` elements in each buffer.
///
/// # Panics
///
/// Panics (via `unreachable!`) if `size_of::<T>()` is neither 1 nor 2.
#[target_feature(enable = "avx2")]
pub(super) fn refine_vertical_bilinear<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    _bits_per_sample: NonZeroU8,
) {
    let input = src.as_ptr();
    let output = dest.as_mut_ptr();
    let sample_bytes = size_of::<T>();

    if sample_bytes == 1 {
        unsafe {
            refine_vertical_bilinear_u8(
                input.cast::<u8>(),
                output.cast::<u8>(),
                pitch,
                width,
                height,
            );
        }
    } else if sample_bytes == 2 {
        unsafe {
            refine_vertical_bilinear_u16(
                input.cast::<u16>(),
                output.cast::<u16>(),
                pitch,
                width,
                height,
            );
        }
    } else {
        unreachable!()
    }
}
/// Diagonal half-pel bilinear refinement, dispatched on the sample size.
///
/// Forwards to the 8-bit kernel when `T` is one byte wide and to the 16-bit
/// kernel when it is two bytes wide. `_bits_per_sample` is accepted but not
/// used by this implementation.
///
/// No length checks are made on `dest` or `src` before their raw pointers are
/// handed to the kernels.
///
/// # Panics
///
/// Panics (via `unreachable!`) if `size_of::<T>()` is neither 1 nor 2.
#[target_feature(enable = "avx2")]
#[cfg(any(test, feature = "experimental"))]
pub(super) fn refine_diagonal_bilinear<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    _bits_per_sample: NonZeroU8,
) {
    let input = src.as_ptr();
    let output = dest.as_mut_ptr();
    let sample_bytes = size_of::<T>();

    if sample_bytes == 1 {
        unsafe {
            refine_diagonal_bilinear_u8(
                input.cast::<u8>(),
                output.cast::<u8>(),
                pitch,
                width,
                height,
            );
        }
    } else if sample_bytes == 2 {
        unsafe {
            refine_diagonal_bilinear_u16(
                input.cast::<u16>(),
                output.cast::<u16>(),
                pitch,
                width,
                height,
            );
        }
    } else {
        unreachable!()
    }
}
/// Horizontal half-pel bilinear kernel for 8-bit samples.
///
/// For every row, each output sample is the rounded average of the source
/// sample and its right-hand neighbour; the last sample of the row is copied
/// unchanged. Samples between `width` and `pitch` are not written.
///
/// # Safety
///
/// `src` must be valid for reads and `dest` for writes of
/// `(height - 1) * pitch + width` elements, and the CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn refine_horizontal_bilinear_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
) {
    const LANES: usize = 32;

    let stride = pitch.get();
    let cols = width.get();
    let last = cols - 1;

    for row in 0..height.get() {
        let src_row = src.add(row * stride);
        let dest_row = dest.add(row * stride);

        // The vector step also loads from `x + 1`, so it is only taken while a
        // full block plus one extra sample still fits inside the row.
        let mut x = 0;
        while x + LANES < cols {
            let left = _mm256_loadu_si256(src_row.add(x).cast());
            let right = _mm256_loadu_si256(src_row.add(x + 1).cast());
            _mm256_storeu_si256(dest_row.add(x).cast(), _mm256_avg_epu8(left, right));
            x += LANES;
        }

        // Scalar remainder, widened so the rounding add cannot overflow.
        for x in x..last {
            let left = u16::from(*src_row.add(x));
            let right = u16::from(*src_row.add(x + 1));
            *dest_row.add(x) = ((left + right + 1) >> 1) as u8;
        }

        // The final column has no right-hand neighbour.
        *dest_row.add(last) = *src_row.add(last);
    }
}
/// Horizontal half-pel bilinear kernel for 16-bit samples.
///
/// For every row, each output sample is the rounded average of the source
/// sample and its right-hand neighbour; the last sample of the row is copied
/// unchanged. Samples between `width` and `pitch` are not written.
///
/// # Safety
///
/// `src` must be valid for reads and `dest` for writes of
/// `(height - 1) * pitch + width` elements, and the CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn refine_horizontal_bilinear_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
) {
    const LANES: usize = 16;

    let stride = pitch.get();
    let cols = width.get();
    let last = cols - 1;

    for row in 0..height.get() {
        let src_row = src.add(row * stride);
        let dest_row = dest.add(row * stride);

        // The vector step also loads from `x + 1`, so it is only taken while a
        // full block plus one extra sample still fits inside the row.
        let mut x = 0;
        while x + LANES < cols {
            let left = _mm256_loadu_si256(src_row.add(x).cast());
            let right = _mm256_loadu_si256(src_row.add(x + 1).cast());
            _mm256_storeu_si256(dest_row.add(x).cast(), _mm256_avg_epu16(left, right));
            x += LANES;
        }

        // Scalar remainder, widened so the rounding add cannot overflow.
        for x in x..last {
            let left = u32::from(*src_row.add(x));
            let right = u32::from(*src_row.add(x + 1));
            *dest_row.add(x) = ((left + right + 1) >> 1) as u16;
        }

        // The final column has no right-hand neighbour.
        *dest_row.add(last) = *src_row.add(last);
    }
}
/// Vertical half-pel bilinear kernel for 8-bit samples.
///
/// Every row except the last receives the rounded average of the source row
/// and the row below it; the last row is copied unchanged. Samples between
/// `width` and `pitch` are not written.
///
/// # Safety
///
/// `src` must be valid for reads and `dest` for writes of
/// `(height - 1) * pitch + width` elements, and the CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn refine_vertical_bilinear_u8(
    mut src: *const u8,
    mut dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
) {
    let stride = pitch.get();
    let cols = width.get();
    // Largest multiple of 32 that fits in the row.
    let vector_cols = cols - cols % 32;

    // One iteration per row that has a row beneath it.
    for _ in 1..height.get() {
        let below = src.add(stride);

        let mut x = 0;
        while x < vector_cols {
            let upper = _mm256_loadu_si256(src.add(x).cast());
            let lower = _mm256_loadu_si256(below.add(x).cast());
            _mm256_storeu_si256(dest.add(x).cast(), _mm256_avg_epu8(upper, lower));
            x += 32;
        }
        while x < cols {
            let sum = u16::from(*src.add(x)) + u16::from(*below.add(x)) + 1;
            *dest.add(x) = (sum / 2) as u8;
            x += 1;
        }

        src = below;
        dest = dest.add(stride);
    }

    // `src`/`dest` now point at the last row, which has nothing below it.
    for col in 0..cols {
        dest.add(col).write(src.add(col).read());
    }
}
/// Vertical half-pel bilinear kernel for 16-bit samples.
///
/// Every row except the last receives the rounded average of the source row
/// and the row below it; the last row is copied unchanged. Samples between
/// `width` and `pitch` are not written.
///
/// # Safety
///
/// `src` must be valid for reads and `dest` for writes of
/// `(height - 1) * pitch + width` elements, the last rows of `src` and `dest`
/// must not overlap, and the CPU must support AVX2.
#[target_feature(enable = "avx2")]
unsafe fn refine_vertical_bilinear_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
) {
    let stride = pitch.get();
    let cols = width.get();
    // Largest multiple of 16 that fits in the row.
    let vector_cols = cols / 16 * 16;

    let mut upper_row = src;
    let mut out_row = dest;

    // One iteration per row that has a row beneath it.
    for _ in 1..height.get() {
        let lower_row = upper_row.add(stride);

        for x in (0..vector_cols).step_by(16) {
            let upper = _mm256_loadu_si256(upper_row.add(x).cast());
            let lower = _mm256_loadu_si256(lower_row.add(x).cast());
            _mm256_storeu_si256(out_row.add(x).cast(), _mm256_avg_epu16(upper, lower));
        }
        for x in vector_cols..cols {
            // Widened so the rounding add cannot overflow.
            let upper = u32::from(*upper_row.add(x));
            let lower = u32::from(*lower_row.add(x));
            *out_row.add(x) = ((upper + lower + 1) >> 1) as u16;
        }

        upper_row = lower_row;
        out_row = out_row.add(stride);
    }

    // Both cursors now sit on the last row, which has nothing below it.
    std::ptr::copy_nonoverlapping(upper_row, out_row, cols);
}
#[target_feature(enable = "avx2")]
#[inline]
#[cfg(any(test, feature = "experimental", feature = "avx512"))]
pub(super) unsafe fn apply_diagonal_bilinear_u8_avx2(
a: __m256i,
b: __m256i,
c: __m256i,
d: __m256i,
two: __m256i,
) -> __m128i {
let sum_ab = _mm256_add_epi16(a, b);
let sum_cd = _mm256_add_epi16(c, d);
let sum = _mm256_add_epi16(_mm256_add_epi16(sum_ab, sum_cd), two);
let result = _mm256_srli_epi16(sum, 2);
let packed = _mm256_packus_epi16(result, result);
let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
_mm256_castsi256_si128(packed)
}
#[target_feature(enable = "avx2")]
#[inline]
#[cfg(any(test, feature = "experimental", feature = "avx512"))]
pub(super) unsafe fn apply_diagonal_bilinear_u16_avx2(
a: __m256i,
b: __m256i,
c: __m256i,
d: __m256i,
two: __m256i,
) -> __m128i {
let sum_ab = _mm256_add_epi32(a, b);
let sum_cd = _mm256_add_epi32(c, d);
let sum = _mm256_add_epi32(_mm256_add_epi32(sum_ab, sum_cd), two);
let result = _mm256_srli_epi32(sum, 2);
let packed = _mm256_packus_epi32(result, result);
let packed = _mm256_permute4x64_epi64(packed, 0b11_01_10_00);
_mm256_castsi256_si128(packed)
}
/// Diagonal half-pel bilinear kernel for 8-bit samples.
///
/// For every row that has a row below it, each output sample is the rounded
/// average of the 2x2 source neighbourhood whose top-left corner is that
/// sample; the last column, which has no right-hand neighbour, averages only
/// the sample and the one below it. The bottom row, which has no row below
/// it, averages each sample with its right-hand neighbour, and its last
/// sample is copied unchanged. Samples between `width` and `pitch` are not
/// written.
///
/// # Safety
///
/// `src` must be valid for reads and `dest` for writes of
/// `(height - 1) * pitch + width` elements, and the CPU must support AVX2.
#[target_feature(enable = "avx2")]
#[cfg(any(test, feature = "experimental"))]
unsafe fn refine_diagonal_bilinear_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
) {
    let pitch = pitch.get();
    let width = width.get();
    let height = height.get();
    let last_col = width - 1;
    let two = _mm256_set1_epi16(2);

    let mut offset = 0;
    // Only the first `height - 1` rows have a row beneath them. Iterating over
    // all `height` rows here would read source row `height` and make the
    // bottom-row pass below write destination row `height`, both of which lie
    // outside a plane of `height` rows.
    for _ in 0..height - 1 {
        let mut i = 0;
        // The vector step also loads from `i + 1`, so it is only taken while
        // sixteen samples plus one extra still fit inside the row.
        while i + 16 < width {
            let a = _mm_loadu_si128(src.add(offset + i).cast::<__m128i>());
            let b = _mm_loadu_si128(src.add(offset + i + 1).cast::<__m128i>());
            let c = _mm_loadu_si128(src.add(offset + pitch + i).cast::<__m128i>());
            let d = _mm_loadu_si128(src.add(offset + pitch + i + 1).cast::<__m128i>());
            // Widen to 16 bits so the four-way sum cannot overflow.
            let result = apply_diagonal_bilinear_u8_avx2(
                _mm256_cvtepu8_epi16(a),
                _mm256_cvtepu8_epi16(b),
                _mm256_cvtepu8_epi16(c),
                _mm256_cvtepu8_epi16(d),
                two,
            );
            _mm_storeu_si128(dest.add(offset + i).cast::<__m128i>(), result);
            i += 16;
        }
        while i < last_col {
            let a = u16::from(*src.add(offset + i));
            let b = u16::from(*src.add(offset + i + 1));
            let c = u16::from(*src.add(offset + pitch + i));
            let d = u16::from(*src.add(offset + pitch + i + 1));
            *dest.add(offset + i) = ((a + b + c + d + 2) >> 2) as u8;
            i += 1;
        }

        // Last column: no right-hand neighbour, so average vertically only.
        let a = u16::from(*src.add(offset + last_col));
        let b = u16::from(*src.add(offset + last_col + pitch));
        *dest.add(offset + last_col) = ((a + b + 1) >> 1) as u8;

        offset += pitch;
    }

    // Bottom row: no row beneath it, so average horizontally only.
    for i in 0..last_col {
        let a = u16::from(*src.add(offset + i));
        let b = u16::from(*src.add(offset + i + 1));
        *dest.add(offset + i) = ((a + b + 1) >> 1) as u8;
    }
    *dest.add(offset + last_col) = *src.add(offset + last_col);
}
/// Diagonal half-pel bilinear kernel for 16-bit samples.
///
/// For every row that has a row below it, each output sample is the rounded
/// average of the 2x2 source neighbourhood whose top-left corner is that
/// sample; the last column, which has no right-hand neighbour, averages only
/// the sample and the one below it. The bottom row, which has no row below
/// it, averages each sample with its right-hand neighbour, and its last
/// sample is copied unchanged. Samples between `width` and `pitch` are not
/// written.
///
/// # Safety
///
/// `src` must be valid for reads and `dest` for writes of
/// `(height - 1) * pitch + width` elements, and the CPU must support AVX2.
#[target_feature(enable = "avx2")]
#[cfg(any(test, feature = "experimental"))]
unsafe fn refine_diagonal_bilinear_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
) {
    let pitch = pitch.get();
    let width = width.get();
    let height = height.get();
    let last_col = width - 1;
    let two = _mm256_set1_epi32(2);

    let mut offset = 0;
    // Only the first `height - 1` rows have a row beneath them. Iterating over
    // all `height` rows here would read source row `height` and make the
    // bottom-row pass below write destination row `height`, both of which lie
    // outside a plane of `height` rows.
    for _ in 0..height - 1 {
        let mut i = 0;
        // The vector step also loads from `i + 1`, so it is only taken while
        // eight samples plus one extra still fit inside the row.
        while i + 8 < width {
            let a = _mm_loadu_si128(src.add(offset + i).cast::<__m128i>());
            let b = _mm_loadu_si128(src.add(offset + i + 1).cast::<__m128i>());
            let c = _mm_loadu_si128(src.add(offset + pitch + i).cast::<__m128i>());
            let d = _mm_loadu_si128(src.add(offset + pitch + i + 1).cast::<__m128i>());
            // Widen to 32 bits so the four-way sum cannot overflow.
            let result = apply_diagonal_bilinear_u16_avx2(
                _mm256_cvtepu16_epi32(a),
                _mm256_cvtepu16_epi32(b),
                _mm256_cvtepu16_epi32(c),
                _mm256_cvtepu16_epi32(d),
                two,
            );
            _mm_storeu_si128(dest.add(offset + i).cast::<__m128i>(), result);
            i += 8;
        }
        while i < last_col {
            let a = u32::from(*src.add(offset + i));
            let b = u32::from(*src.add(offset + i + 1));
            let c = u32::from(*src.add(offset + pitch + i));
            let d = u32::from(*src.add(offset + pitch + i + 1));
            *dest.add(offset + i) = ((a + b + c + d + 2) >> 2) as u16;
            i += 1;
        }

        // Last column: no right-hand neighbour, so average vertically only.
        let a = u32::from(*src.add(offset + last_col));
        let b = u32::from(*src.add(offset + last_col + pitch));
        *dest.add(offset + last_col) = ((a + b + 1) >> 1) as u16;

        offset += pitch;
    }

    // Bottom row: no row beneath it, so average horizontally only.
    for i in 0..last_col {
        let a = u32::from(*src.add(offset + i));
        let b = u32::from(*src.add(offset + i + 1));
        *dest.add(offset + i) = ((a + b + 1) >> 1) as u16;
    }
    *dest.add(offset + last_col) = *src.add(offset + last_col);
}