#![allow(clippy::undocumented_unsafe_blocks)]
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::{mem::size_of, num::NonZeroUsize};
use crate::util::Pixel;
/// Halves a plane in both directions with the AVX2 bilinear kernels.
///
/// The work is done in two passes that share `dest` as scratch space:
///
/// 1. a vertical pass that combines source rows into `dest_height` rows of
///    `2 * dest_width` samples each, written at the start of every `dest` row;
/// 2. a horizontal pass that shrinks each of those rows in place down to
///    `dest_width` samples.
///
/// `dest_pitch` and `src_pitch` are row strides measured in samples (`T`), not
/// bytes.
///
/// Because the intermediate rows are twice as wide as the final output, each
/// `dest` row must have room for `2 * dest_width` samples, and every source
/// row must provide that many samples as well.
///
/// # Panics
///
/// Panics if `size_of::<T>()` is neither 1 nor 2. In builds with debug
/// assertions enabled it also panics when the slice lengths or pitches are too
/// small for the accesses described above.
#[target_feature(enable = "avx2")]
pub(super) fn reduce_bilinear<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    // `w + w` saturates exactly like `w * 2`, and needs no unchecked constructor.
    let intermediate_width = dest_width.saturating_add(dest_width.get());

    debug_assert!(src.len() >= src_pitch.get() * dest_height.get() * 2);
    debug_assert!(dest.len() >= dest_pitch.get() * dest_height.get());
    // The vertical pass reads and writes `intermediate_width` samples per row;
    // a smaller pitch would make rows overlap or run past the slices.
    debug_assert!(src_pitch.get() >= intermediate_width.get());
    debug_assert!(dest_pitch.get() >= intermediate_width.get());

    match size_of::<T>() {
        1 => {
            let dest_ptr = dest.as_mut_ptr() as *mut u8;
            let src_ptr = src.as_ptr() as *const u8;
            // SAFETY: `T` is one byte wide here, so the slices are viewed as
            // `u8` (assumes `Pixel` types are plain integers; confirm against
            // the trait). The kernels stay inside the bounds stated by the
            // debug assertions above; those are not enforced in release builds,
            // so callers must uphold them.
            unsafe {
                reduce_bilinear_vertical_u8(
                    dest_ptr,
                    src_ptr,
                    dest_pitch,
                    src_pitch,
                    intermediate_width,
                    dest_height,
                );
                reduce_bilinear_horizontal_inplace_u8(
                    dest_ptr,
                    dest_pitch,
                    dest_width,
                    dest_height,
                );
            }
        }
        2 => {
            let dest_ptr = dest.as_mut_ptr() as *mut u16;
            let src_ptr = src.as_ptr() as *const u16;
            // SAFETY: same reasoning as the one-byte arm, with `T` two bytes
            // wide and therefore viewed as `u16`.
            unsafe {
                reduce_bilinear_vertical_u16(
                    dest_ptr,
                    src_ptr,
                    dest_pitch,
                    src_pitch,
                    intermediate_width,
                    dest_height,
                );
                reduce_bilinear_horizontal_inplace_u16(
                    dest_ptr,
                    dest_pitch,
                    dest_width,
                    dest_height,
                );
            }
        }
        _ => unreachable!(),
    }
}
/// Vertical pass of the bilinear reduction for 8-bit samples.
///
/// Writes `dest_height` rows of `dest_width` samples. Output row `y` is built
/// from source rows around `2 * y`:
///
/// * the first and the last output row are the rounded average of source rows
///   `2 * y` and `2 * y + 1`;
/// * every other row is `(a + 3 * (b + c) + d + 4) / 8` where `a..d` are the
///   source rows `2 * y - 1 ..= 2 * y + 2`.
///
/// Pitches are row strides in samples.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be valid for reads of `dest_width` samples in each of the rows
///   `0 .. 2 * dest_height`, rows being `src_pitch` samples apart.
/// * `dest` must be valid for writes of `dest_width` samples in each of
///   `dest_height` rows, rows being `dest_pitch` samples apart.
/// * The source and destination regions must not overlap.
#[target_feature(enable = "avx2")]
unsafe fn reduce_bilinear_vertical_u8(
    dest: *mut u8,
    src: *const u8,
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let dest_pitch = dest_pitch.get();
    let src_pitch = src_pitch.get();
    let width = dest_width.get();
    let height = dest_height.get();

    // Top edge: there is no row above, so only two rows are blended.
    average_row_pair_u8(dest, src, src_pitch, width);

    // Interior rows use the four-tap 1-3-3-1 kernel starting one row above
    // the pair that lines up with the output row.
    for y in 1..height - 1 {
        filter_row_quad_u8(
            dest.add(y * dest_pitch),
            src.add((y * 2 - 1) * src_pitch),
            src_pitch,
            width,
        );
    }

    // Bottom edge: there is no row below, so again only two rows are blended.
    if height > 1 {
        let y = height - 1;
        average_row_pair_u8(
            dest.add(y * dest_pitch),
            src.add(y * 2 * src_pitch),
            src_pitch,
            width,
        );
    }
}

/// Stores `width` rounded averages of the row at `upper` and the row one
/// `src_pitch` below it into `dest_row`.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn average_row_pair_u8(
    dest_row: *mut u8,
    upper: *const u8,
    src_pitch: usize,
    width: usize,
) {
    let lower = upper.add(src_pitch);
    let mut x = 0;
    while x + 32 <= width {
        let top = _mm256_loadu_si256(upper.add(x) as *const __m256i);
        let bottom = _mm256_loadu_si256(lower.add(x) as *const __m256i);
        // `avg_epu8` is exactly `(a + b + 1) >> 1` without intermediate
        // overflow, so no widening to 16 bits is needed.
        _mm256_storeu_si256(
            dest_row.add(x) as *mut __m256i,
            _mm256_avg_epu8(top, bottom),
        );
        x += 32;
    }
    while x < width {
        let top = u16::from(*upper.add(x));
        let bottom = u16::from(*lower.add(x));
        *dest_row.add(x) = ((top + bottom + 1) >> 1) as u8;
        x += 1;
    }
}

/// Stores `width` samples of the 1-3-3-1 filter applied to the four
/// consecutive rows that start at `first_row` into `dest_row`.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn filter_row_quad_u8(
    dest_row: *mut u8,
    first_row: *const u8,
    src_pitch: usize,
    width: usize,
) {
    let zero = _mm256_setzero_si256();
    let mut x = 0;
    while x + 32 <= width {
        let column = first_row.add(x);
        let a = _mm256_loadu_si256(column as *const __m256i);
        let b = _mm256_loadu_si256(column.add(src_pitch) as *const __m256i);
        let c = _mm256_loadu_si256(column.add(src_pitch * 2) as *const __m256i);
        let d = _mm256_loadu_si256(column.add(src_pitch * 3) as *const __m256i);
        // Unpack and pack both work per 128-bit lane, so zero-extending with
        // unpacklo/unpackhi and re-packing restores the original byte order.
        let low = filter_1331_epi16(
            _mm256_unpacklo_epi8(a, zero),
            _mm256_unpacklo_epi8(b, zero),
            _mm256_unpacklo_epi8(c, zero),
            _mm256_unpacklo_epi8(d, zero),
        );
        let high = filter_1331_epi16(
            _mm256_unpackhi_epi8(a, zero),
            _mm256_unpackhi_epi8(b, zero),
            _mm256_unpackhi_epi8(c, zero),
            _mm256_unpackhi_epi8(d, zero),
        );
        _mm256_storeu_si256(
            dest_row.add(x) as *mut __m256i,
            _mm256_packus_epi16(low, high),
        );
        x += 32;
    }
    while x < width {
        let column = first_row.add(x);
        let a = u16::from(*column);
        let b = u16::from(*column.add(src_pitch));
        let c = u16::from(*column.add(src_pitch * 2));
        let d = u16::from(*column.add(src_pitch * 3));
        *dest_row.add(x) = ((a + (b + c) * 3 + d + 4) >> 3) as u8;
        x += 1;
    }
}

/// Computes `(a + 3 * (b + c) + d + 4) >> 3` in every 16-bit lane.
///
/// For lanes zero-extended from `u8` the sum is at most `255 * 8 + 4`, so it
/// cannot overflow 16 bits.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn filter_1331_epi16(a: __m256i, b: __m256i, c: __m256i, d: __m256i) -> __m256i {
    let inner = _mm256_add_epi16(b, c);
    let inner_x3 = _mm256_add_epi16(_mm256_add_epi16(inner, inner), inner);
    let outer = _mm256_add_epi16(_mm256_add_epi16(a, d), _mm256_set1_epi16(4));
    _mm256_srli_epi16(_mm256_add_epi16(inner_x3, outer), 3)
}
/// Horizontal pass of the bilinear reduction for 8-bit samples, done in place.
///
/// Each of the `dest_height` rows initially holds `2 * dest_width` samples;
/// after the call its first `dest_width` samples hold the reduced row. Output
/// sample `x` is
///
/// * the rounded average of inputs `2 * x` and `2 * x + 1` for the first and
///   last column, and
/// * `(a + 3 * (b + c) + d + 4) / 8` over inputs `2 * x - 1 ..= 2 * x + 2`
///   for every column in between.
///
/// Samples at index `dest_width` and beyond are read but never written.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `dest` must be valid for reads and writes of `2 * dest_width` samples in
///   each of `dest_height` rows, rows being `dest_pitch` samples apart.
#[target_feature(enable = "avx2")]
unsafe fn reduce_bilinear_horizontal_inplace_u8(
    dest: *mut u8,
    dest_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let pitch = dest_pitch.get();
    let width = dest_width.get();
    let rows = dest_height.get();
    let last = width - 1;

    // `maddubs` multiplies unsigned bytes by signed byte weights and sums each
    // adjacent pair into a 16-bit lane; the low byte of the weight word goes
    // with the first sample of the pair.
    let weights_1_3 = _mm256_set1_epi16(0x0301);
    let weights_3_1 = _mm256_set1_epi16(0x0103);
    let rounding = _mm256_set1_epi16(4);

    for row in 0..rows {
        let line = dest.add(row * pitch);

        // Computed up front and stored after the interior columns.
        let first = ((u16::from(*line) + u16::from(*line.add(1)) + 1) >> 1) as u8;

        // Interior columns, sixteen outputs per step. Output `col` lands at or
        // before every input that is still to be read, so in-place is safe.
        let mut col = 1;
        while col + 16 <= last {
            let taps = line.add(col * 2 - 1);
            let outer_inner = _mm256_loadu_si256(taps as *const __m256i);
            let inner_outer = _mm256_loadu_si256(taps.add(2) as *const __m256i);
            let weighted = _mm256_add_epi16(
                _mm256_maddubs_epi16(outer_inner, weights_1_3),
                _mm256_maddubs_epi16(inner_outer, weights_3_1),
            );
            let filtered = _mm256_srli_epi16(_mm256_add_epi16(weighted, rounding), 3);
            // Packing narrows per 128-bit lane; the permute then brings the
            // eight bytes of each lane together in the low 128 bits.
            let bytes = _mm256_packus_epi16(filtered, filtered);
            let gathered = _mm256_permute4x64_epi64(bytes, 0b1101_1000);
            _mm_storeu_si128(
                line.add(col) as *mut __m128i,
                _mm256_castsi256_si128(gathered),
            );
            col += 16;
        }

        // Remaining interior columns, one at a time.
        for i in col..last {
            let taps = line.add(i * 2 - 1);
            let a = u16::from(*taps);
            let b = u16::from(*taps.add(1));
            let c = u16::from(*taps.add(2));
            let d = u16::from(*taps.add(3));
            *line.add(i) = ((a + (b + c) * 3 + d + 4) >> 3) as u8;
        }

        *line = first;

        if width > 1 {
            let pair = line.add(last * 2);
            let left = u16::from(*pair);
            let right = u16::from(*pair.add(1));
            *line.add(last) = ((left + right + 1) >> 1) as u8;
        }
    }
}
/// Vertical pass of the bilinear reduction for 16-bit samples.
///
/// Writes `dest_height` rows of `dest_width` samples. Output row `y` is built
/// from source rows around `2 * y`:
///
/// * the first and the last output row are the rounded average of source rows
///   `2 * y` and `2 * y + 1`;
/// * every other row is `(a + 3 * (b + c) + d + 4) / 8` where `a..d` are the
///   source rows `2 * y - 1 ..= 2 * y + 2`.
///
/// Pitches are row strides in samples, not bytes.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be valid for reads of `dest_width` samples in each of the rows
///   `0 .. 2 * dest_height`, rows being `src_pitch` samples apart.
/// * `dest` must be valid for writes of `dest_width` samples in each of
///   `dest_height` rows, rows being `dest_pitch` samples apart.
/// * The source and destination regions must not overlap.
#[target_feature(enable = "avx2")]
unsafe fn reduce_bilinear_vertical_u16(
    dest: *mut u16,
    src: *const u16,
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let dest_pitch = dest_pitch.get();
    let src_pitch = src_pitch.get();
    let width = dest_width.get();
    let height = dest_height.get();

    // Top edge: there is no row above, so only two rows are blended.
    average_row_pair_u16(dest, src, src_pitch, width);

    // Interior rows use the four-tap 1-3-3-1 kernel starting one row above
    // the pair that lines up with the output row.
    for y in 1..height - 1 {
        filter_row_quad_u16(
            dest.add(y * dest_pitch),
            src.add((y * 2 - 1) * src_pitch),
            src_pitch,
            width,
        );
    }

    // Bottom edge: there is no row below, so again only two rows are blended.
    if height > 1 {
        let y = height - 1;
        average_row_pair_u16(
            dest.add(y * dest_pitch),
            src.add(y * 2 * src_pitch),
            src_pitch,
            width,
        );
    }
}

/// Stores `width` rounded averages of the row at `upper` and the row one
/// `src_pitch` below it into `dest_row`.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn average_row_pair_u16(
    dest_row: *mut u16,
    upper: *const u16,
    src_pitch: usize,
    width: usize,
) {
    let lower = upper.add(src_pitch);
    let mut x = 0;
    while x + 16 <= width {
        let top = _mm256_loadu_si256(upper.add(x) as *const __m256i);
        let bottom = _mm256_loadu_si256(lower.add(x) as *const __m256i);
        // `avg_epu16` is exactly `(a + b + 1) >> 1` without intermediate
        // overflow, so no widening to 32 bits is needed.
        _mm256_storeu_si256(
            dest_row.add(x) as *mut __m256i,
            _mm256_avg_epu16(top, bottom),
        );
        x += 16;
    }
    while x < width {
        let top = u32::from(*upper.add(x));
        let bottom = u32::from(*lower.add(x));
        *dest_row.add(x) = ((top + bottom + 1) >> 1) as u16;
        x += 1;
    }
}

/// Stores `width` samples of the 1-3-3-1 filter applied to the four
/// consecutive rows that start at `first_row` into `dest_row`.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn filter_row_quad_u16(
    dest_row: *mut u16,
    first_row: *const u16,
    src_pitch: usize,
    width: usize,
) {
    let zero = _mm256_setzero_si256();
    let mut x = 0;
    while x + 16 <= width {
        let column = first_row.add(x);
        let a = _mm256_loadu_si256(column as *const __m256i);
        let b = _mm256_loadu_si256(column.add(src_pitch) as *const __m256i);
        let c = _mm256_loadu_si256(column.add(src_pitch * 2) as *const __m256i);
        let d = _mm256_loadu_si256(column.add(src_pitch * 3) as *const __m256i);
        // Unpack and pack both work per 128-bit lane, so zero-extending with
        // unpacklo/unpackhi and re-packing restores the original sample order.
        let low = filter_1331_epi32(
            _mm256_unpacklo_epi16(a, zero),
            _mm256_unpacklo_epi16(b, zero),
            _mm256_unpacklo_epi16(c, zero),
            _mm256_unpacklo_epi16(d, zero),
        );
        let high = filter_1331_epi32(
            _mm256_unpackhi_epi16(a, zero),
            _mm256_unpackhi_epi16(b, zero),
            _mm256_unpackhi_epi16(c, zero),
            _mm256_unpackhi_epi16(d, zero),
        );
        _mm256_storeu_si256(
            dest_row.add(x) as *mut __m256i,
            _mm256_packus_epi32(low, high),
        );
        x += 16;
    }
    while x < width {
        let column = first_row.add(x);
        let a = u32::from(*column);
        let b = u32::from(*column.add(src_pitch));
        let c = u32::from(*column.add(src_pitch * 2));
        let d = u32::from(*column.add(src_pitch * 3));
        *dest_row.add(x) = ((a + (b + c) * 3 + d + 4) >> 3) as u16;
        x += 1;
    }
}

/// Computes `(a + 3 * (b + c) + d + 4) >> 3` in every 32-bit lane.
///
/// For lanes zero-extended from `u16` the sum is at most `65535 * 8 + 4`, so
/// it cannot overflow 32 bits.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn filter_1331_epi32(a: __m256i, b: __m256i, c: __m256i, d: __m256i) -> __m256i {
    let inner = _mm256_add_epi32(b, c);
    let inner_x3 = _mm256_add_epi32(_mm256_add_epi32(inner, inner), inner);
    let outer = _mm256_add_epi32(_mm256_add_epi32(a, d), _mm256_set1_epi32(4));
    _mm256_srli_epi32(_mm256_add_epi32(inner_x3, outer), 3)
}
/// Horizontal pass of the bilinear reduction for 16-bit samples, done in
/// place.
///
/// Each of the `dest_height` rows initially holds `2 * dest_width` samples;
/// after the call its first `dest_width` samples hold the reduced row. Output
/// sample `x` is
///
/// * the rounded average of inputs `2 * x` and `2 * x + 1` for the first and
///   last column, and
/// * `(a + 3 * (b + c) + d + 4) / 8` over inputs `2 * x - 1 ..= 2 * x + 2`
///   for every column in between.
///
/// Samples at index `dest_width` and beyond are read but never written.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `dest` must be valid for reads and writes of `2 * dest_width` samples in
///   each of `dest_height` rows, rows being `dest_pitch` samples apart.
#[target_feature(enable = "avx2")]
unsafe fn reduce_bilinear_horizontal_inplace_u16(
    dest: *mut u16,
    dest_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let pitch = dest_pitch.get();
    let width = dest_width.get();
    let rows = dest_height.get();
    let last = width - 1;

    // `madd_epi16` is a signed multiply, so samples are biased by -32768
    // (flipping the top bit) to fit in `i16`. Each weighted pair then carries
    // a bias of -4 * 32768, the two pairs together -262144, which the rounding
    // constant adds back along with the usual +4.
    let bias = _mm256_set1_epi16(i16::MIN);
    let weights_1_3 = _mm256_set1_epi32(0x0003_0001);
    let weights_3_1 = _mm256_set1_epi32(0x0001_0003);
    let unbias_and_round = _mm256_set1_epi32(262_148);

    for row in 0..rows {
        let line = dest.add(row * pitch);

        // Computed up front and stored after the interior columns.
        let first = ((u32::from(*line) + u32::from(*line.add(1)) + 1) >> 1) as u16;

        // Interior columns, eight outputs per step. Output `col` lands at or
        // before every input that is still to be read, so in-place is safe.
        let mut col = 1;
        while col + 8 <= last {
            let taps = line.add(col * 2 - 1);
            let outer_inner = _mm256_loadu_si256(taps as *const __m256i);
            let inner_outer = _mm256_loadu_si256(taps.add(2) as *const __m256i);
            let left = _mm256_madd_epi16(_mm256_xor_si256(outer_inner, bias), weights_1_3);
            let right = _mm256_madd_epi16(_mm256_xor_si256(inner_outer, bias), weights_3_1);
            let total = _mm256_add_epi32(_mm256_add_epi32(left, right), unbias_and_round);
            let filtered = _mm256_srli_epi32(total, 3);
            // Packing narrows per 128-bit lane; the permute then brings the
            // four samples of each lane together in the low 128 bits.
            let words = _mm256_packus_epi32(filtered, filtered);
            let gathered = _mm256_permute4x64_epi64(words, 0b1101_1000);
            _mm_storeu_si128(
                line.add(col) as *mut __m128i,
                _mm256_castsi256_si128(gathered),
            );
            col += 8;
        }

        // Remaining interior columns, one at a time.
        for i in col..last {
            let taps = line.add(i * 2 - 1);
            let a = u32::from(*taps);
            let b = u32::from(*taps.add(1));
            let c = u32::from(*taps.add(2));
            let d = u32::from(*taps.add(3));
            *line.add(i) = ((a + (b + c) * 3 + d + 4) >> 3) as u16;
        }

        *line = first;

        if width > 1 {
            let pair = line.add(last * 2);
            let left = u32::from(*pair);
            let right = u32::from(*pair.add(1));
            *line.add(last) = ((left + right + 1) >> 1) as u16;
        }
    }
}