#![allow(clippy::undocumented_unsafe_blocks)]
use std::{arch::x86_64::*, num::NonZeroUsize};
use crate::util::Pixel;
/// Shrinks an image to half its width and half its height using the AVX2 kernels in this file.
///
/// The work is done in two passes that both target `dest`:
///
/// 1. a vertical pass that turns `2 * dest_height` source rows into `dest_height` rows which
///    are still `2 * dest_width` samples wide, and
/// 2. an in-place horizontal pass that collapses each of those rows to `dest_width` samples.
///
/// `dest_pitch` and `src_pitch` are row strides measured in samples of `T`.
///
/// NOTE(review): because the vertical pass stores `2 * dest_width` samples per destination
/// row, and reads as many per source row, both pitches presumably have to be at least
/// `2 * dest_width`. Nothing here checks that, so confirm it against the callers.
///
/// # Panics
///
/// Panics if `T` is neither one nor two bytes wide. In builds with debug assertions it also
/// panics when `src` is shorter than `src_pitch * dest_height * 2` samples or `dest` is shorter
/// than `dest_pitch * dest_height` samples.
#[target_feature(enable = "avx2")]
pub(super) fn reduce_cubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    debug_assert!(src.len() >= src_pitch.get() * dest_height.get() * 2);
    debug_assert!(dest.len() >= dest_pitch.get() * dest_height.get());

    // Width of the intermediate rows left behind by the vertical pass: twice the final width,
    // saturating instead of wrapping.
    let intermediate_width = dest_width.saturating_add(dest_width.get());

    let sample_bytes = size_of::<T>();
    if sample_bytes == 1 {
        // SAFETY (as far as this function can tell): `T` is one byte wide, so both buffers are
        // addressed as `u8`. Staying in bounds relies on the debug assertions above and on the
        // pitch requirement described in the function documentation.
        unsafe {
            reduce_cubic_vertical_u8(
                dest.as_mut_ptr().cast::<u8>(),
                src.as_ptr().cast::<u8>(),
                dest_pitch,
                src_pitch,
                intermediate_width,
                dest_height,
            );
            reduce_cubic_horizontal_inplace_u8(
                dest.as_mut_ptr().cast::<u8>(),
                dest_pitch,
                dest_width,
                dest_height,
            );
        }
    } else if sample_bytes == 2 {
        // SAFETY (as far as this function can tell): `T` is two bytes wide, so both buffers
        // are addressed as `u16`. The same bounds caveats as in the 8-bit branch apply.
        unsafe {
            reduce_cubic_vertical_u16(
                dest.as_mut_ptr().cast::<u16>(),
                src.as_ptr().cast::<u16>(),
                dest_pitch,
                src_pitch,
                intermediate_width,
                dest_height,
            );
            reduce_cubic_horizontal_inplace_u16(
                dest.as_mut_ptr().cast::<u16>(),
                dest_pitch,
                dest_width,
                dest_height,
            );
        }
    } else {
        unreachable!()
    }
}
/// Vertical pass of the 2:1 cubic reduction for 8-bit samples.
///
/// Reads `2 * dest_height` rows from `src` and writes `dest_height` rows of `dest_width`
/// samples to `dest`. The first and the last output row are the rounded average of their two
/// source rows. Every other output row `y` is the rounded 6-tap filter
/// `[1, 5, 10, 10, 5, 1] / 32` over source rows `2y - 2 ..= 2y + 3`.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be valid for reads of `dest_width` samples in each of `2 * dest_height` rows
///   whose starts are `src_pitch` samples apart.
/// * `dest` must be valid for writes of `dest_width` samples in each of `dest_height` rows
///   whose starts are `dest_pitch` samples apart.
#[target_feature(enable = "avx2")]
unsafe fn reduce_cubic_vertical_u8(
    dest: *mut u8,
    src: *const u8,
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    /// Samples handled per 256-bit vector.
    const LANES: usize = 32;

    let out_stride = dest_pitch.get();
    let in_stride = src_pitch.get();
    let width = dest_width.get();
    let rows = dest_height.get();
    // Columns below this index are processed a whole vector at a time, the rest one by one.
    let vector_end = width - width % LANES;

    // Byte-pair weights for `_mm256_maddubs_epi16`; the low byte of each 16-bit constant is
    // applied to the first sample of an interleaved pair.
    let taps_1_5 = _mm256_set1_epi16(0x0501);
    let taps_10_10 = _mm256_set1_epi16(0x0A0A);
    let taps_5_1 = _mm256_set1_epi16(0x0105);
    let rounding = _mm256_set1_epi16(16);

    for row in 0..rows {
        let out = dest.add(row * out_stride);
        // Source rows `2 * row` and `2 * row + 1`: the two centre taps of the filter.
        let tap2 = src.add(row * 2 * in_stride);
        let tap3 = tap2.add(in_stride);

        if row == 0 || row == rows - 1 {
            // Border rows lack the outer taps, so they are a plain rounded average.
            let mut col = 0;
            while col < vector_end {
                let upper = _mm256_loadu_si256(tap2.add(col).cast::<__m256i>());
                let lower = _mm256_loadu_si256(tap3.add(col).cast::<__m256i>());
                _mm256_storeu_si256(
                    out.add(col).cast::<__m256i>(),
                    _mm256_avg_epu8(upper, lower),
                );
                col += LANES;
            }
            for col in vector_end..width {
                let pair_sum = u16::from(*tap2.add(col)) + u16::from(*tap3.add(col));
                *out.add(col) = ((pair_sum + 1) / 2) as u8;
            }
            continue;
        }

        let tap0 = tap2.sub(in_stride * 2);
        let tap1 = tap2.sub(in_stride);
        let tap4 = tap3.add(in_stride);
        let tap5 = tap4.add(in_stride);

        let mut col = 0;
        while col < vector_end {
            let r0 = _mm256_loadu_si256(tap0.add(col).cast::<__m256i>());
            let r1 = _mm256_loadu_si256(tap1.add(col).cast::<__m256i>());
            let r2 = _mm256_loadu_si256(tap2.add(col).cast::<__m256i>());
            let r3 = _mm256_loadu_si256(tap3.add(col).cast::<__m256i>());
            let r4 = _mm256_loadu_si256(tap4.add(col).cast::<__m256i>());
            let r5 = _mm256_loadu_si256(tap5.add(col).cast::<__m256i>());

            // Interleave neighbouring taps so one multiply-add yields `a * w_a + b * w_b`.
            let acc_lo = _mm256_add_epi16(
                _mm256_add_epi16(
                    _mm256_maddubs_epi16(_mm256_unpacklo_epi8(r0, r1), taps_1_5),
                    _mm256_maddubs_epi16(_mm256_unpacklo_epi8(r2, r3), taps_10_10),
                ),
                _mm256_maddubs_epi16(_mm256_unpacklo_epi8(r4, r5), taps_5_1),
            );
            let acc_hi = _mm256_add_epi16(
                _mm256_add_epi16(
                    _mm256_maddubs_epi16(_mm256_unpackhi_epi8(r0, r1), taps_1_5),
                    _mm256_maddubs_epi16(_mm256_unpackhi_epi8(r2, r3), taps_10_10),
                ),
                _mm256_maddubs_epi16(_mm256_unpackhi_epi8(r4, r5), taps_5_1),
            );

            let words_lo = _mm256_srli_epi16::<5>(_mm256_add_epi16(acc_lo, rounding));
            let words_hi = _mm256_srli_epi16::<5>(_mm256_add_epi16(acc_hi, rounding));
            // Unpack and pack both work per 128-bit lane, so the original order is restored.
            _mm256_storeu_si256(
                out.add(col).cast::<__m256i>(),
                _mm256_packus_epi16(words_lo, words_hi),
            );
            col += LANES;
        }
        for col in vector_end..width {
            let outer = u16::from(*tap0.add(col)) + u16::from(*tap5.add(col));
            let inner = u16::from(*tap1.add(col)) + u16::from(*tap4.add(col));
            let centre = u16::from(*tap2.add(col)) + u16::from(*tap3.add(col));
            let filtered = (outer + inner * 5 + centre * 10 + 16) >> 5;
            *out.add(col) = filtered.min(255) as u8;
        }
    }
}
/// In-place horizontal pass of the 2:1 cubic reduction for 8-bit samples.
///
/// Each of the `dest_height` rows initially holds `2 * dest_width` samples and ends up with
/// its first `dest_width` samples replaced by the reduced row. Output column 0 and, when
/// `dest_width > 1`, the last output column are the rounded average of their two input
/// samples. Every other output column `x` is the rounded 6-tap filter
/// `[1, 5, 10, 10, 5, 1] / 32` over input samples `2x - 2 ..= 2x + 3`.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `dest` must be valid for reads of `2 * dest_width` samples and writes of `dest_width`
///   samples in each of `dest_height` rows whose starts are `dest_pitch` samples apart.
#[target_feature(enable = "avx2")]
unsafe fn reduce_cubic_horizontal_inplace_u8(
    dest: *mut u8,
    dest_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    /// Output samples produced by one iteration of the main vector loop.
    const WIDE_STEP: usize = 32;
    /// Output samples produced by the single half-width vector step.
    const NARROW_STEP: usize = 16;

    let stride = dest_pitch.get();
    let width = dest_width.get();
    // Index of the last output column; the 6-tap filter covers columns `1..last_col`.
    let last_col = width - 1;

    // Byte-pair weights for `_mm256_maddubs_epi16`; the low byte of each 16-bit constant is
    // applied to the first sample of a pair.
    let taps_1_5 = _mm256_set1_epi16(0x0501);
    let taps_10_10 = _mm256_set1_epi16(0x0A0A);
    let taps_5_1 = _mm256_set1_epi16(0x0105);
    let rounding = _mm256_set1_epi16(16);

    for row_index in 0..dest_height.get() {
        let row = dest.add(row_index * stride);

        // Computed up front but stored last: the filter for column 1 still reads `row[0]`.
        let first_out = ((u16::from(*row) + u16::from(*row.add(1)) + 1) / 2) as u8;

        let mut col = 1;
        while col + WIDE_STEP <= last_col {
            // `window` points at input sample `2 * col - 2`, the first tap of column `col`.
            let window = row.add(col * 2 - 2);
            let pairs01_a = _mm256_loadu_si256(window.cast::<__m256i>());
            let pairs23_a = _mm256_loadu_si256(window.add(2).cast::<__m256i>());
            let pairs45_a = _mm256_loadu_si256(window.add(4).cast::<__m256i>());
            let pairs01_b = _mm256_loadu_si256(window.add(32).cast::<__m256i>());
            let pairs23_b = _mm256_loadu_si256(window.add(34).cast::<__m256i>());
            let pairs45_b = _mm256_loadu_si256(window.add(36).cast::<__m256i>());

            let acc_a = _mm256_add_epi16(
                _mm256_add_epi16(
                    _mm256_maddubs_epi16(pairs01_a, taps_1_5),
                    _mm256_maddubs_epi16(pairs23_a, taps_10_10),
                ),
                _mm256_maddubs_epi16(pairs45_a, taps_5_1),
            );
            let acc_b = _mm256_add_epi16(
                _mm256_add_epi16(
                    _mm256_maddubs_epi16(pairs01_b, taps_1_5),
                    _mm256_maddubs_epi16(pairs23_b, taps_10_10),
                ),
                _mm256_maddubs_epi16(pairs45_b, taps_5_1),
            );

            let words_a = _mm256_srli_epi16::<5>(_mm256_add_epi16(acc_a, rounding));
            let words_b = _mm256_srli_epi16::<5>(_mm256_add_epi16(acc_b, rounding));
            // The pack works per 128-bit lane and leaves the quarters as a0, b0, a1, b1;
            // swap the two middle quarters to get a0, a1, b0, b1.
            let interleaved = _mm256_packus_epi16(words_a, words_b);
            let ordered = _mm256_permute4x64_epi64::<0b11_01_10_00>(interleaved);
            // Every load of this step happened above, so overwriting the row here is safe.
            _mm256_storeu_si256(row.add(col).cast::<__m256i>(), ordered);
            col += WIDE_STEP;
        }

        if col + NARROW_STEP <= last_col {
            let window = row.add(col * 2 - 2);
            let pairs01 = _mm256_loadu_si256(window.cast::<__m256i>());
            let pairs23 = _mm256_loadu_si256(window.add(2).cast::<__m256i>());
            let pairs45 = _mm256_loadu_si256(window.add(4).cast::<__m256i>());

            let acc = _mm256_add_epi16(
                _mm256_add_epi16(
                    _mm256_maddubs_epi16(pairs01, taps_1_5),
                    _mm256_maddubs_epi16(pairs23, taps_10_10),
                ),
                _mm256_maddubs_epi16(pairs45, taps_5_1),
            );
            let words = _mm256_srli_epi16::<5>(_mm256_add_epi16(acc, rounding));
            // Move quarters 0 and 2 (the two distinct halves) into the low 128 bits.
            let bytes = _mm256_packus_epi16(words, words);
            let ordered = _mm256_permute4x64_epi64::<0b0000_1000>(bytes);
            _mm_storeu_si128(
                row.add(col).cast::<__m128i>(),
                _mm256_castsi256_si128(ordered),
            );
            col += NARROW_STEP;
        }

        for col in col..last_col {
            let window = row.add(col * 2 - 2);
            let outer = u16::from(*window) + u16::from(*window.add(5));
            let inner = u16::from(*window.add(1)) + u16::from(*window.add(4));
            let centre = u16::from(*window.add(2)) + u16::from(*window.add(3));
            let filtered = (outer + inner * 5 + centre * 10 + 16) >> 5;
            *row.add(col) = filtered.min(255) as u8;
        }

        *row = first_out;

        if width > 1 {
            let tail = row.add(last_col * 2);
            let pair_sum = u16::from(*tail) + u16::from(*tail.add(1));
            *row.add(last_col) = ((pair_sum + 1) / 2) as u8;
        }
    }
}
/// Vertical pass of the 2:1 cubic reduction for 16-bit samples.
///
/// Reads `2 * dest_height` rows from `src` and writes `dest_height` rows of `dest_width`
/// samples to `dest`. The first and the last output row are the rounded average of their two
/// source rows. Every other output row `y` is the rounded 6-tap filter
/// `[1, 5, 10, 10, 5, 1] / 32` over source rows `2y - 2 ..= 2y + 3`.
///
/// The full unsigned 16-bit sample range is supported by both the vector and the scalar paths.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `src` must be valid for reads of `dest_width` samples in each of `2 * dest_height` rows
///   whose starts are `src_pitch` samples apart.
/// * `dest` must be valid for writes of `dest_width` samples in each of `dest_height` rows
///   whose starts are `dest_pitch` samples apart.
#[target_feature(enable = "avx2")]
unsafe fn reduce_cubic_vertical_u16(
    dest: *mut u16,
    src: *const u16,
    dest_pitch: NonZeroUsize,
    src_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    let dest_pitch = dest_pitch.get();
    let src_pitch = src_pitch.get();
    let dest_width = dest_width.get();
    let dest_height = dest_height.get();

    // `_mm256_madd_epi16` multiplies signed 16-bit lanes. Flipping the top bit maps each
    // unsigned sample `v` to the signed value `v - 32768`, so the products cannot misread
    // large samples as negative numbers.
    let signed_bias = _mm256_set1_epi16(i16::MIN);
    // One weight per 16-bit half of every 32-bit lane, matching the interleaved sample pairs.
    let w05 = _mm256_set1_epi32(0x0001_0001);
    let w14 = _mm256_set1_epi32(0x0005_0005);
    let w23 = _mm256_set1_epi32(0x000A_000A);
    // The six weights sum to 32, so the bias removed above adds up to `32 * 32768`.
    let unsigned_correction = _mm256_set1_epi32(32 * 32768);
    let round16 = _mm256_set1_epi32(16);

    let mut dest_ptr = dest;

    // First output row: rounded average of source rows 0 and 1.
    let mut x = 0;
    while x + 16 <= dest_width {
        let a = _mm256_loadu_si256(src.add(x) as *const __m256i);
        let b = _mm256_loadu_si256(src.add(x + src_pitch) as *const __m256i);
        // `_mm256_avg_epu16` evaluates `(a + b + 1) >> 1` without wrapping at 16 bits. A plain
        // 16-bit add followed by a shift is wrong whenever `a + b + 1` exceeds 65535.
        let result = _mm256_avg_epu16(a, b);
        _mm256_storeu_si256(dest_ptr.add(x) as *mut __m256i, result);
        x += 16;
    }
    while x < dest_width {
        let a = *src.add(x) as u32;
        let b = *src.add(x + src_pitch) as u32;
        *dest_ptr.add(x) = ((a + b + 1) / 2) as u16;
        x += 1;
    }
    dest_ptr = dest_ptr.add(dest_pitch);

    // Interior output rows: full 6-tap filter. The range is empty for `dest_height <= 2`.
    for y in 1..(dest_height - 1) {
        let src_row_offset = y * 2 * src_pitch;
        let mut x = 0;
        while x + 16 <= dest_width {
            let m0 =
                _mm256_loadu_si256(src.add(src_row_offset + x - src_pitch * 2) as *const __m256i);
            let m1 = _mm256_loadu_si256(src.add(src_row_offset + x - src_pitch) as *const __m256i);
            let m2 = _mm256_loadu_si256(src.add(src_row_offset + x) as *const __m256i);
            let m3 = _mm256_loadu_si256(src.add(src_row_offset + x + src_pitch) as *const __m256i);
            let m4 =
                _mm256_loadu_si256(src.add(src_row_offset + x + src_pitch * 2) as *const __m256i);
            let m5 =
                _mm256_loadu_si256(src.add(src_row_offset + x + src_pitch * 3) as *const __m256i);

            let m0 = _mm256_xor_si256(m0, signed_bias);
            let m1 = _mm256_xor_si256(m1, signed_bias);
            let m2 = _mm256_xor_si256(m2, signed_bias);
            let m3 = _mm256_xor_si256(m3, signed_bias);
            let m4 = _mm256_xor_si256(m4, signed_bias);
            let m5 = _mm256_xor_si256(m5, signed_bias);

            // Interleave taps that share a weight so each multiply-add covers one pair.
            let pair05_lo = _mm256_unpacklo_epi16(m0, m5);
            let pair05_hi = _mm256_unpackhi_epi16(m0, m5);
            let pair14_lo = _mm256_unpacklo_epi16(m1, m4);
            let pair14_hi = _mm256_unpackhi_epi16(m1, m4);
            let pair23_lo = _mm256_unpacklo_epi16(m2, m3);
            let pair23_hi = _mm256_unpackhi_epi16(m2, m3);

            let term05_lo = _mm256_madd_epi16(pair05_lo, w05);
            let term05_hi = _mm256_madd_epi16(pair05_hi, w05);
            let term14_lo = _mm256_madd_epi16(pair14_lo, w14);
            let term14_hi = _mm256_madd_epi16(pair14_hi, w14);
            let term23_lo = _mm256_madd_epi16(pair23_lo, w23);
            let term23_hi = _mm256_madd_epi16(pair23_hi, w23);

            let sum_lo = _mm256_add_epi32(
                _mm256_add_epi32(_mm256_add_epi32(term05_lo, term14_lo), term23_lo),
                unsigned_correction,
            );
            let sum_hi = _mm256_add_epi32(
                _mm256_add_epi32(_mm256_add_epi32(term05_hi, term14_hi), term23_hi),
                unsigned_correction,
            );

            let result_lo = _mm256_srli_epi32(_mm256_add_epi32(sum_lo, round16), 5);
            let result_hi = _mm256_srli_epi32(_mm256_add_epi32(sum_hi, round16), 5);
            // Unpack and pack both work per 128-bit lane, so the original order is restored.
            let result = _mm256_packus_epi32(result_lo, result_hi);
            _mm256_storeu_si256(dest_ptr.add(x) as *mut __m256i, result);
            x += 16;
        }
        while x < dest_width {
            let m0 = *src.add(src_row_offset + x - src_pitch * 2) as u32;
            let m1 = *src.add(src_row_offset + x - src_pitch) as u32;
            let m2 = *src.add(src_row_offset + x) as u32;
            let m3 = *src.add(src_row_offset + x + src_pitch) as u32;
            let m4 = *src.add(src_row_offset + x + src_pitch * 2) as u32;
            let m5 = *src.add(src_row_offset + x + src_pitch * 3) as u32;
            let result = (m0 + m5 + (m1 + m4) * 5 + (m2 + m3) * 10 + 16) >> 5;
            *dest_ptr.add(x) = result.min(65535) as u16;
            x += 1;
        }
        dest_ptr = dest_ptr.add(dest_pitch);
    }

    // Last output row: rounded average of the final two source rows.
    if dest_height > 1 {
        let src_row_offset = (dest_height - 1) * 2 * src_pitch;
        let mut x = 0;
        while x + 16 <= dest_width {
            let a = _mm256_loadu_si256(src.add(src_row_offset + x) as *const __m256i);
            let b = _mm256_loadu_si256(src.add(src_row_offset + x + src_pitch) as *const __m256i);
            // Overflow-free rounded average, for the same reason as in the first row.
            let result = _mm256_avg_epu16(a, b);
            _mm256_storeu_si256(dest_ptr.add(x) as *mut __m256i, result);
            x += 16;
        }
        while x < dest_width {
            let a = *src.add(src_row_offset + x) as u32;
            let b = *src.add(src_row_offset + x + src_pitch) as u32;
            *dest_ptr.add(x) = ((a + b + 1) / 2) as u16;
            x += 1;
        }
    }
}
/// In-place horizontal pass of the 2:1 cubic reduction for 16-bit samples.
///
/// Each of the `dest_height` rows initially holds `2 * dest_width` samples and ends up with
/// its first `dest_width` samples replaced by the reduced row. Output column 0 and, when
/// `dest_width > 1`, the last output column are the rounded average of their two input
/// samples. Every other output column `x` is the rounded 6-tap filter
/// `[1, 5, 10, 10, 5, 1] / 32` over input samples `2x - 2 ..= 2x + 3`.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `dest` must be valid for reads of `2 * dest_width` samples and writes of `dest_width`
///   samples in each of `dest_height` rows whose starts are `dest_pitch` samples apart.
#[target_feature(enable = "avx2")]
unsafe fn reduce_cubic_horizontal_inplace_u16(
    dest: *mut u16,
    dest_pitch: NonZeroUsize,
    dest_width: NonZeroUsize,
    dest_height: NonZeroUsize,
) {
    /// Output samples produced by one vector iteration.
    const STEP: usize = 8;

    let stride = dest_pitch.get();
    let width = dest_width.get();
    // Index of the last output column; the 6-tap filter covers columns `1..last_col`.
    let last_col = width - 1;

    let all_zero = _mm256_setzero_si256();
    // Keeps the first (low) sample of every 32-bit pair; a 16-bit shift yields the second.
    let first_of_pair = _mm256_set1_epi32(0x0000_FFFF);
    let times_5 = _mm256_set1_epi32(5);
    let times_10 = _mm256_set1_epi32(10);
    let rounding = _mm256_set1_epi32(16);

    for row_index in 0..dest_height.get() {
        let row = dest.add(row_index * stride);

        // Computed up front but stored last: the filter for column 1 still reads `row[0]`.
        let first_out = ((u32::from(*row) + u32::from(*row.add(1)) + 1) / 2) as u16;

        let mut col = 1;
        while col + STEP <= last_col {
            // `window` points at input sample `2 * col - 2`, the first tap of column `col`.
            let window = row.add(col * 2 - 2);
            let pairs01 = _mm256_loadu_si256(window.cast::<__m256i>());
            let pairs23 = _mm256_loadu_si256(window.add(2).cast::<__m256i>());
            let pairs45 = _mm256_loadu_si256(window.add(4).cast::<__m256i>());

            // Widen every tap to its own 32-bit lane and add the taps that share a weight.
            let outer = _mm256_add_epi32(
                _mm256_and_si256(pairs01, first_of_pair),
                _mm256_srli_epi32::<16>(pairs45),
            );
            let inner = _mm256_add_epi32(
                _mm256_srli_epi32::<16>(pairs01),
                _mm256_and_si256(pairs45, first_of_pair),
            );
            let centre = _mm256_add_epi32(
                _mm256_and_si256(pairs23, first_of_pair),
                _mm256_srli_epi32::<16>(pairs23),
            );

            let weighted = _mm256_add_epi32(
                _mm256_add_epi32(outer, _mm256_mullo_epi32(inner, times_5)),
                _mm256_add_epi32(_mm256_mullo_epi32(centre, times_10), rounding),
            );
            let filtered = _mm256_srli_epi32::<5>(weighted);

            // The pack leaves four results in the low half of each 128-bit lane; glue the two
            // low halves together to obtain eight consecutive samples.
            let packed = _mm256_packus_epi32(filtered, all_zero);
            let joined = _mm_unpacklo_epi64(
                _mm256_castsi256_si128(packed),
                _mm256_extracti128_si256::<1>(packed),
            );
            // Every load of this step happened above, so overwriting the row here is safe.
            _mm_storeu_si128(row.add(col).cast::<__m128i>(), joined);
            col += STEP;
        }

        for col in col..last_col {
            let window = row.add(col * 2 - 2);
            let outer = u32::from(*window) + u32::from(*window.add(5));
            let inner = u32::from(*window.add(1)) + u32::from(*window.add(4));
            let centre = u32::from(*window.add(2)) + u32::from(*window.add(3));
            let filtered = (outer + inner * 5 + centre * 10 + 16) >> 5;
            *row.add(col) = filtered.min(65535) as u16;
        }

        *row = first_out;

        if width > 1 {
            let tail = row.add(last_col * 2);
            let pair_sum = u32::from(*tail) + u32::from(*tail.add(1));
            *row.add(last_col) = ((pair_sum + 1) / 2) as u16;
        }
    }
}