#![allow(clippy::undocumented_unsafe_blocks)]
use std::arch::x86_64::*;
use std::num::{NonZeroU8, NonZeroUsize};
use crate::util::Pixel;
/// Applies a 4-tap weighted sum to 32 unsigned 8-bit samples at once.
///
/// For every byte lane `k` the result is
/// `(a[k] * wa + b[k] * wb + c[k] * wc + d[k] * wd + eight) >> 4`, saturated to `0..=255`.
///
/// * `a`, `b`, `c`, `d` - the four taps, each holding 32 unsigned bytes.
/// * `w_ab` - signed byte weights; in every 16-bit element the low byte multiplies `a` and the
///   high byte multiplies `b`.
/// * `w_cd` - same layout as `w_ab`, applied to `c` (low byte) and `d` (high byte).
/// * `eight` - 16-bit rounding term added before the shift by 4.
///
/// The callers in this file pass the weights `(-1, 9)` / `(9, -1)` and a rounding term of 8.
///
/// Intermediate sums are 16-bit; `_mm256_maddubs_epi16` saturates each pair sum to `i16`.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_bicubic_kernel_u8_avx2(
    a: __m256i,
    b: __m256i,
    c: __m256i,
    d: __m256i,
    w_ab: __m256i,
    w_cd: __m256i,
    eight: __m256i,
) -> __m256i {
    // Interleave (a, b) and (c, d) bytes and multiply-accumulate each pair into 16-bit lanes.
    // "lo"/"hi" are the lower/upper 8 bytes of each 128-bit lane.
    let pair_ab_lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(a, b), w_ab);
    let pair_cd_lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(c, d), w_cd);
    let pair_ab_hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(a, b), w_ab);
    let pair_cd_hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(c, d), w_cd);

    // Add the rounding term, then divide by 16 with an arithmetic shift.
    let rounded_lo = _mm256_add_epi16(_mm256_add_epi16(pair_ab_lo, pair_cd_lo), eight);
    let rounded_hi = _mm256_add_epi16(_mm256_add_epi16(pair_ab_hi, pair_cd_hi), eight);
    let scaled_lo = _mm256_srai_epi16::<4>(rounded_lo);
    let scaled_hi = _mm256_srai_epi16::<4>(rounded_hi);

    // The pack works per 128-bit lane just like the unpacks above, so the original byte order
    // is restored; unsigned saturation clamps to 0..=255.
    _mm256_packus_epi16(scaled_lo, scaled_hi)
}
/// Applies a 4-tap weighted sum to 16 samples of 16 bits using `_mm256_madd_epi16`.
///
/// For every 16-bit lane `k` the result is
/// `clamp((a[k] * wa + b[k] * wb + c[k] * wc + d[k] * wd + eight) >> 4, 0, pixel_max)`.
///
/// * `a`, `b`, `c`, `d` - the four taps, 16 samples each. `_mm256_madd_epi16` reads them as
///   *signed* 16-bit values, so samples of 0x8000 and above would be treated as negative; the
///   callers in this file only take this path when `bits_per_sample <= 15`.
/// * `w_ab` - in every 32-bit element the low 16 bits multiply `a`, the high 16 bits multiply `b`.
/// * `w_cd` - same layout, applied to `c` (low) and `d` (high).
/// * `eight` - 32-bit rounding term added before the shift by 4.
/// * `pixel_max` - 32-bit upper clamp bound, broadcast to every element.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_bicubic_kernel_u16_fast_avx2(
    a: __m256i,
    b: __m256i,
    c: __m256i,
    d: __m256i,
    w_ab: __m256i,
    w_cd: __m256i,
    eight: __m256i,
    pixel_max: __m256i,
) -> __m256i {
    let floor = _mm256_setzero_si256();

    // Interleave the taps pairwise and multiply-accumulate each pair into a 32-bit lane.
    // "lo"/"hi" are the lower/upper 4 samples of each 128-bit lane.
    let pair_ab_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(a, b), w_ab);
    let pair_cd_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(c, d), w_cd);
    let pair_ab_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(a, b), w_ab);
    let pair_cd_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(c, d), w_cd);

    // Round and divide by 16.
    let rounded_lo = _mm256_add_epi32(_mm256_add_epi32(pair_ab_lo, pair_cd_lo), eight);
    let rounded_hi = _mm256_add_epi32(_mm256_add_epi32(pair_ab_hi, pair_cd_hi), eight);
    let scaled_lo = _mm256_srai_epi32::<4>(rounded_lo);
    let scaled_hi = _mm256_srai_epi32::<4>(rounded_hi);

    // Clamp to 0..=pixel_max while still in 32-bit precision.
    let bounded_lo = _mm256_min_epi32(pixel_max, _mm256_max_epi32(floor, scaled_lo));
    let bounded_hi = _mm256_min_epi32(pixel_max, _mm256_max_epi32(floor, scaled_hi));

    // Per-lane pack mirrors the per-lane unpack, restoring the original sample order.
    _mm256_packus_epi32(bounded_lo, bounded_hi)
}
/// Applies `((b + c) * nine - (a + d) + eight) >> 4` to 16 samples held as 32-bit integers.
///
/// Each tap arrives already widened to 32 bits and split in two registers: `*_lo` carries
/// samples 0..8 and `*_hi` carries samples 8..16 (this is how the callers in this file build
/// them, via `_mm256_cvtepu16_epi32` on the two 128-bit halves of a 16-sample load). Working in
/// 32 bits means the full unsigned 16-bit range is handled without sign problems.
///
/// * `nine` - 32-bit multiplier applied to `b + c`.
/// * `eight` - 32-bit rounding term added before the shift by 4.
/// * `pixel_max` - 32-bit upper clamp bound; results are clamped to `0..=pixel_max`.
///
/// Returns the 16 results packed back to 16 bits, in sample order.
///
/// # Safety
///
/// The CPU must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
pub(super) unsafe fn apply_bicubic_kernel_u16_exact_avx2(
    a_lo: __m256i,
    a_hi: __m256i,
    b_lo: __m256i,
    b_hi: __m256i,
    c_lo: __m256i,
    c_hi: __m256i,
    d_lo: __m256i,
    d_hi: __m256i,
    nine: __m256i,
    eight: __m256i,
    pixel_max: __m256i,
) -> __m256i {
    let floor = _mm256_setzero_si256();

    // Outer taps are summed, inner taps are summed and weighted.
    let outer_lo = _mm256_add_epi32(a_lo, d_lo);
    let outer_hi = _mm256_add_epi32(a_hi, d_hi);
    let inner_lo = _mm256_mullo_epi32(_mm256_add_epi32(b_lo, c_lo), nine);
    let inner_hi = _mm256_mullo_epi32(_mm256_add_epi32(b_hi, c_hi), nine);

    // Round and divide by 16.
    let rounded_lo = _mm256_add_epi32(_mm256_sub_epi32(inner_lo, outer_lo), eight);
    let rounded_hi = _mm256_add_epi32(_mm256_sub_epi32(inner_hi, outer_hi), eight);
    let scaled_lo = _mm256_srai_epi32::<4>(rounded_lo);
    let scaled_hi = _mm256_srai_epi32::<4>(rounded_hi);

    let bounded_lo = _mm256_min_epi32(pixel_max, _mm256_max_epi32(floor, scaled_lo));
    let bounded_hi = _mm256_min_epi32(pixel_max, _mm256_max_epi32(floor, scaled_hi));

    // `packus` works per 128-bit lane, yielding the 64-bit groups [0..4, 8..12, 4..8, 12..16];
    // swapping the two middle groups puts the samples back in linear order.
    let packed = _mm256_packus_epi32(bounded_lo, bounded_hi);
    _mm256_permute4x64_epi64::<0b11_01_10_00>(packed)
}
/// Dispatches the horizontal 4-tap filter to the 8-bit or 16-bit implementation.
///
/// The choice is made from `size_of::<T>()`: 1 byte selects `refine_horizontal_bicubic_u8`,
/// 2 bytes selects `refine_horizontal_bicubic_u16`, and the slices are reinterpreted as raw
/// `u8` / `u16` pointers accordingly. All other arguments are forwarded unchanged.
///
/// # Panics
///
/// Hits `unreachable!()` if `T` is neither 1 nor 2 bytes wide.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * The slice lengths are not checked here; the implementations access `width` samples at
///   offset `j * pitch` for every `j < height` through raw pointers, so both `src` and `dest`
///   must be long enough for that.
/// * NOTE(review): presumably every 1- or 2-byte `Pixel` type has the same layout as `u8` or
///   `u16`; `Pixel` is defined elsewhere, confirm there.
#[target_feature(enable = "avx2")]
pub(super) unsafe fn refine_horizontal_bicubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let input = src.as_ptr();
    let output = dest.as_mut_ptr();

    match size_of::<T>() {
        1 => unsafe {
            refine_horizontal_bicubic_u8(
                input.cast::<u8>(),
                output.cast::<u8>(),
                pitch,
                width,
                height,
                bits_per_sample,
            )
        },
        2 => unsafe {
            refine_horizontal_bicubic_u16(
                input.cast::<u16>(),
                output.cast::<u16>(),
                pitch,
                width,
                height,
                bits_per_sample,
            )
        },
        _ => unreachable!(),
    }
}
/// Dispatches the vertical 4-tap filter to the 8-bit or 16-bit implementation.
///
/// The choice is made from `size_of::<T>()`: 1 byte selects `refine_vertical_bicubic_u8`,
/// 2 bytes selects `refine_vertical_bicubic_u16`, and the slices are reinterpreted as raw
/// `u8` / `u16` pointers accordingly. All other arguments are forwarded unchanged.
///
/// # Panics
///
/// Hits `unreachable!()` if `T` is neither 1 nor 2 bytes wide.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * The slice lengths are not checked here; the implementations access `width` samples at
///   offset `j * pitch` for every `j < height` through raw pointers, so both `src` and `dest`
///   must be long enough for that.
/// * NOTE(review): presumably every 1- or 2-byte `Pixel` type has the same layout as `u8` or
///   `u16`; `Pixel` is defined elsewhere, confirm there.
#[target_feature(enable = "avx2")]
pub(super) unsafe fn refine_vertical_bicubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let input = src.as_ptr();
    let output = dest.as_mut_ptr();

    match size_of::<T>() {
        1 => unsafe {
            refine_vertical_bicubic_u8(
                input.cast::<u8>(),
                output.cast::<u8>(),
                pitch,
                width,
                height,
                bits_per_sample,
            )
        },
        2 => unsafe {
            refine_vertical_bicubic_u16(
                input.cast::<u16>(),
                output.cast::<u16>(),
                pitch,
                width,
                height,
                bits_per_sample,
            )
        },
        _ => unreachable!(),
    }
}
/// Horizontal 4-tap filter over an 8-bit plane, vectorised with AVX2.
///
/// Row `j` starts at element `j * pitch` in both `src` and `dest`. For each of the `height`
/// rows, `width` output samples are written (`s` is the source row):
///
/// * column 0 and the two columns before the last one:
///   `(s[i] + s[i + 1] + 1) / 2`;
/// * interior columns `1..width - 3`:
///   `(-(s[i - 1] + s[i + 2]) + 9 * (s[i] + s[i + 1]) + 8) >> 4`, clamped;
/// * last column: copied from the source.
///
/// Rows narrower than four samples have no interior columns: every column but the last gets the
/// two-sample average and the last is copied. The bounds use saturating subtraction so that such
/// widths neither panic nor wrap around into an out-of-bounds vector loop.
///
/// The scalar tail clamps to `0..=(1 << bits_per_sample) - 1`, whereas the vector path saturates
/// to `0..=255`; the two agree when `bits_per_sample` is 8.
/// NOTE(review): `1u16 << bits_per_sample` overflows for values of 16 and above; presumably this
/// function is only called with 8.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * For every `j < height`, `src` must be valid for reads and `dest` for writes of `width`
///   elements starting at offset `j * pitch`.
#[target_feature(enable = "avx2")]
unsafe fn refine_horizontal_bicubic_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u16 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();

    // Signed byte weights in memory order: (-1, 9) for the (a, b) pair, (9, -1) for (c, d).
    let w_ab = _mm256_set1_epi16(i16::from_le_bytes([-1i8 as u8, 9]));
    let w_cd = _mm256_set1_epi16(i16::from_le_bytes([9, -1i8 as u8]));
    let eight = _mm256_set1_epi16(8);

    // Row-invariant column bounds. `width >= 1` is guaranteed by `NonZeroUsize`; the other
    // subtractions saturate so that widths 1..=3 produce empty interior ranges.
    let last = width_val - 1;
    let bicubic_end = width_val.saturating_sub(3);
    // The vector loop handles whole groups of 32 interior columns starting at column 1. Its last
    // load begins at `i + 2` and spans 32 bytes, ending at column `simd_end + 1 <= width - 3`.
    let simd_end = 1 + (width_val.saturating_sub(4) / 32) * 32;

    for j in 0..height.get() {
        let row_offset = j * pitch_val;
        let src_row = src.add(row_offset);
        let dest_row = dest.add(row_offset);

        // Left edge: there is no left neighbour, so fall back to a two-sample average.
        // Skipped for single-column rows, which have no right neighbour to read either.
        if width_val > 1 {
            let a = *src_row.add(0) as u16;
            let b = *src_row.add(1) as u16;
            *dest_row.add(0) = ((a + b + 1) / 2) as u8;
        }

        let mut i = 1;
        while i < simd_end {
            let a = _mm256_loadu_si256(src_row.add(i - 1).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
            let c = _mm256_loadu_si256(src_row.add(i + 1).cast::<__m256i>());
            let d = _mm256_loadu_si256(src_row.add(i + 2).cast::<__m256i>());
            let result = apply_bicubic_kernel_u8_avx2(a, b, c, d, w_ab, w_cd, eight);
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
            i += 32;
        }

        // Scalar tail for the interior columns the vector loop did not reach.
        for x in i..bicubic_end {
            let a = *src_row.add(x - 1) as i16;
            let b = *src_row.add(x) as i16;
            let c = *src_row.add(x + 1) as i16;
            let d = *src_row.add(x + 2) as i16;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(x) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u16) as u8;
        }

        // Right edge: two-sample average for the columns before the last one.
        for x in bicubic_end..last {
            let a = *src_row.add(x) as u16;
            let b = *src_row.add(x + 1) as u16;
            *dest_row.add(x) = ((a + b + 1) / 2) as u8;
        }

        // The last column has no right neighbour and is copied unchanged.
        *dest_row.add(last) = *src_row.add(last);
    }
}
/// Horizontal 4-tap filter over a 16-bit plane, vectorised with AVX2.
///
/// Row `j` starts at element `j * pitch` in both `src` and `dest`. For each of the `height`
/// rows, `width` output samples are written (`s` is the source row):
///
/// * column 0 and the two columns before the last one:
///   `(s[i] + s[i + 1] + 1) / 2`;
/// * interior columns `1..width - 3`:
///   `(-(s[i - 1] + s[i + 2]) + 9 * (s[i] + s[i + 1]) + 8) >> 4`,
///   clamped to `0..=(1 << bits_per_sample) - 1`;
/// * last column: copied from the source.
///
/// Rows narrower than four samples have no interior columns: every column but the last gets the
/// two-sample average and the last is copied. The bounds use saturating subtraction so that such
/// widths neither panic nor wrap around into an out-of-bounds vector loop.
///
/// When `bits_per_sample <= 15` the vector loop uses the `_mm256_madd_epi16` based kernel, which
/// reads samples as signed 16-bit values; otherwise samples are widened to 32 bits first.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * For every `j < height`, `src` must be valid for reads and `dest` for writes of `width`
///   elements starting at offset `j * pitch`.
#[target_feature(enable = "avx2")]
unsafe fn refine_horizontal_bicubic_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u32 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    let use_fast = bits_per_sample.get() <= 15;

    // Signed 16-bit weight pairs: low half multiplies the first tap of a pair, high half the
    // second, i.e. (-1, 9) for (a, b) and (9, -1) for (c, d).
    let w_ab = _mm256_set1_epi32((9i32 << 16) | 0xFFFF);
    let w_cd = _mm256_set1_epi32((-1i32 << 16) | 9);
    let eight = _mm256_set1_epi32(8);
    let nine = _mm256_set1_epi32(9);
    let pixel_max_vec = _mm256_set1_epi32(pixel_max as i32);

    // Row-invariant column bounds. `width >= 1` is guaranteed by `NonZeroUsize`; the other
    // subtractions saturate so that widths 1..=3 produce empty interior ranges.
    let last = width_val - 1;
    let bicubic_end = width_val.saturating_sub(3);
    // The vector loop handles whole groups of 16 interior columns starting at column 1. Its last
    // load begins at `i + 2` and spans 16 samples, ending at column `simd_end + 1 <= width - 3`.
    let simd_end = 1 + (width_val.saturating_sub(4) / 16) * 16;

    for j in 0..height.get() {
        let row_offset = j * pitch_val;
        let src_row = src.add(row_offset);
        let dest_row = dest.add(row_offset);

        // Left edge: there is no left neighbour, so fall back to a two-sample average.
        // Skipped for single-column rows, which have no right neighbour to read either.
        if width_val > 1 {
            let a = *src_row.add(0) as u32;
            let b = *src_row.add(1) as u32;
            *dest_row.add(0) = ((a + b + 1) / 2) as u16;
        }

        let mut i = 1;
        while i < simd_end {
            let a = _mm256_loadu_si256(src_row.add(i - 1).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
            let c = _mm256_loadu_si256(src_row.add(i + 1).cast::<__m256i>());
            let d = _mm256_loadu_si256(src_row.add(i + 2).cast::<__m256i>());
            let result = if use_fast {
                apply_bicubic_kernel_u16_fast_avx2(a, b, c, d, w_ab, w_cd, eight, pixel_max_vec)
            } else {
                // Zero-extend each 128-bit half to eight 32-bit lanes so that samples of 0x8000
                // and above keep their unsigned value.
                apply_bicubic_kernel_u16_exact_avx2(
                    _mm256_cvtepu16_epi32(_mm256_castsi256_si128(a)),
                    _mm256_cvtepu16_epi32(_mm256_extracti128_si256(a, 1)),
                    _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b)),
                    _mm256_cvtepu16_epi32(_mm256_extracti128_si256(b, 1)),
                    _mm256_cvtepu16_epi32(_mm256_castsi256_si128(c)),
                    _mm256_cvtepu16_epi32(_mm256_extracti128_si256(c, 1)),
                    _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d)),
                    _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d, 1)),
                    nine,
                    eight,
                    pixel_max_vec,
                )
            };
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
            i += 16;
        }

        // Scalar tail for the interior columns the vector loop did not reach.
        for x in i..bicubic_end {
            let a = *src_row.add(x - 1) as i32;
            let b = *src_row.add(x) as i32;
            let c = *src_row.add(x + 1) as i32;
            let d = *src_row.add(x + 2) as i32;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(x) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u32) as u16;
        }

        // Right edge: two-sample average for the columns before the last one.
        for x in bicubic_end..last {
            let a = *src_row.add(x) as u32;
            let b = *src_row.add(x + 1) as u32;
            *dest_row.add(x) = ((a + b + 1) / 2) as u16;
        }

        // The last column has no right neighbour and is copied unchanged.
        *dest_row.add(last) = *src_row.add(last);
    }
}
/// Vertical 4-tap filter over an 8-bit plane, vectorised with AVX2.
///
/// Row `j` starts at element `j * pitch` in both `src` and `dest`; `width` samples are written
/// per row (`r[j]` is source row `j`, operations are per column):
///
/// * row 0 and the two rows before the last one: `(r[j] + r[j + 1] + 1) / 2`;
/// * interior rows `1..height - 3`:
///   `(-(r[j - 1] + r[j + 2]) + 9 * (r[j] + r[j + 1]) + 8) >> 4`, clamped;
/// * last row: copied from the source.
///
/// Planes shorter than four rows have no interior rows: every row but the last gets the two-row
/// average and the last is copied. The bound uses saturating subtraction so that heights 1 and 2
/// neither panic nor wrap around, and a single-row plane does not read a second row.
///
/// The scalar tail clamps to `0..=(1 << bits_per_sample) - 1`, whereas the vector path saturates
/// to `0..=255`; the two agree when `bits_per_sample` is 8.
/// NOTE(review): `1u16 << bits_per_sample` overflows for values of 16 and above; presumably this
/// function is only called with 8.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * For every `j < height`, `src` must be valid for reads and `dest` for writes of `width`
///   elements starting at offset `j * pitch`.
/// * The last row of `src` and the last row of `dest` must not overlap (they are copied with
///   `copy_nonoverlapping`).
#[target_feature(enable = "avx2")]
unsafe fn refine_vertical_bicubic_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u16 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    let height_val = height.get();

    // Signed byte weights in memory order: (-1, 9) for the (a, b) pair, (9, -1) for (c, d).
    let w_ab = _mm256_set1_epi16(i16::from_le_bytes([-1i8 as u8, 9]));
    let w_cd = _mm256_set1_epi16(i16::from_le_bytes([9, -1i8 as u8]));
    let eight = _mm256_set1_epi16(8);

    // `height >= 1` is guaranteed by `NonZeroUsize`; the interior bound saturates so that
    // heights 1..=3 produce an empty interior range.
    let last_row = height_val - 1;
    let bicubic_end = height_val.saturating_sub(3);
    // Number of columns covered by whole 32-byte vectors; the same for every row.
    let simd_end = (width_val / 32) * 32;

    // Top edge: there is no row above, so fall back to a two-row average.
    // Skipped for single-row planes, which have no row below to read either.
    if height_val > 1 {
        for i in 0..width_val {
            let a = *src.add(i) as u16;
            let b = *src.add(i + pitch_val) as u16;
            *dest.add(i) = ((a + b + 1) / 2) as u8;
        }
    }

    for j in 1..bicubic_end {
        let offset = j * pitch_val;
        let src_prev = src.add(offset - pitch_val);
        let src_curr = src.add(offset);
        let src_next = src.add(offset + pitch_val);
        let src_next2 = src.add(offset + pitch_val * 2);
        let dest_row = dest.add(offset);

        let mut i = 0;
        while i < simd_end {
            let a = _mm256_loadu_si256(src_prev.add(i).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_curr.add(i).cast::<__m256i>());
            let c = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
            let d = _mm256_loadu_si256(src_next2.add(i).cast::<__m256i>());
            let result = apply_bicubic_kernel_u8_avx2(a, b, c, d, w_ab, w_cd, eight);
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
            i += 32;
        }

        // Scalar tail for the columns that do not fill a whole vector.
        for x in i..width_val {
            let a = *src_prev.add(x) as i16;
            let b = *src_curr.add(x) as i16;
            let c = *src_next.add(x) as i16;
            let d = *src_next2.add(x) as i16;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(x) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u16) as u8;
        }
    }

    // Bottom edge: two-row average for the rows before the last one.
    for j in bicubic_end..last_row {
        let offset = j * pitch_val;
        for i in 0..width_val {
            let a = *src.add(offset + i) as u16;
            let b = *src.add(offset + i + pitch_val) as u16;
            *dest.add(offset + i) = ((a + b + 1) / 2) as u8;
        }
    }

    // The last row has no row below and is copied unchanged.
    let last_offset = last_row * pitch_val;
    std::ptr::copy_nonoverlapping(src.add(last_offset), dest.add(last_offset), width_val);
}
/// Vertical 4-tap filter over a 16-bit plane, vectorised with AVX2.
///
/// Row `j` starts at element `j * pitch` in both `src` and `dest`; `width` samples are written
/// per row (`r[j]` is source row `j`, operations are per column):
///
/// * row 0 and the two rows before the last one: `(r[j] + r[j + 1] + 1) / 2`;
/// * interior rows `1..height - 3`:
///   `(-(r[j - 1] + r[j + 2]) + 9 * (r[j] + r[j + 1]) + 8) >> 4`,
///   clamped to `0..=(1 << bits_per_sample) - 1`;
/// * last row: copied from the source.
///
/// Planes shorter than four rows have no interior rows: every row but the last gets the two-row
/// average and the last is copied. The bound uses saturating subtraction so that heights 1 and 2
/// neither panic nor wrap around, and a single-row plane does not read a second row.
///
/// When `bits_per_sample <= 15` the vector loop uses the `_mm256_madd_epi16` based kernel, which
/// reads samples as signed 16-bit values; otherwise samples are widened to 32 bits first.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * For every `j < height`, `src` must be valid for reads and `dest` for writes of `width`
///   elements starting at offset `j * pitch`.
/// * The last row of `src` and the last row of `dest` must not overlap (they are copied with
///   `copy_nonoverlapping`).
#[target_feature(enable = "avx2")]
unsafe fn refine_vertical_bicubic_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u32 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    let height_val = height.get();
    let use_fast = bits_per_sample.get() <= 15;

    // Signed 16-bit weight pairs: low half multiplies the first tap of a pair, high half the
    // second, i.e. (-1, 9) for (a, b) and (9, -1) for (c, d).
    let w_ab = _mm256_set1_epi32((9i32 << 16) | 0xFFFF);
    let w_cd = _mm256_set1_epi32((-1i32 << 16) | 9);
    let eight = _mm256_set1_epi32(8);
    let nine = _mm256_set1_epi32(9);
    let pixel_max_vec = _mm256_set1_epi32(pixel_max as i32);

    // `height >= 1` is guaranteed by `NonZeroUsize`; the interior bound saturates so that
    // heights 1..=3 produce an empty interior range.
    let last_row = height_val - 1;
    let bicubic_end = height_val.saturating_sub(3);
    // Number of columns covered by whole 16-sample vectors; the same for every row.
    let simd_end = (width_val / 16) * 16;

    // Top edge: there is no row above, so fall back to a two-row average.
    // Skipped for single-row planes, which have no row below to read either.
    if height_val > 1 {
        for i in 0..width_val {
            let a = *src.add(i) as u32;
            let b = *src.add(i + pitch_val) as u32;
            *dest.add(i) = ((a + b + 1) / 2) as u16;
        }
    }

    for j in 1..bicubic_end {
        let offset = j * pitch_val;
        let src_prev = src.add(offset - pitch_val);
        let src_curr = src.add(offset);
        let src_next = src.add(offset + pitch_val);
        let src_next2 = src.add(offset + pitch_val * 2);
        let dest_row = dest.add(offset);

        let mut i = 0;
        while i < simd_end {
            let a = _mm256_loadu_si256(src_prev.add(i).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_curr.add(i).cast::<__m256i>());
            let c = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
            let d = _mm256_loadu_si256(src_next2.add(i).cast::<__m256i>());
            let result = if use_fast {
                apply_bicubic_kernel_u16_fast_avx2(a, b, c, d, w_ab, w_cd, eight, pixel_max_vec)
            } else {
                // Zero-extend each 128-bit half to eight 32-bit lanes so that samples of 0x8000
                // and above keep their unsigned value.
                apply_bicubic_kernel_u16_exact_avx2(
                    _mm256_cvtepu16_epi32(_mm256_castsi256_si128(a)),
                    _mm256_cvtepu16_epi32(_mm256_extracti128_si256(a, 1)),
                    _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b)),
                    _mm256_cvtepu16_epi32(_mm256_extracti128_si256(b, 1)),
                    _mm256_cvtepu16_epi32(_mm256_castsi256_si128(c)),
                    _mm256_cvtepu16_epi32(_mm256_extracti128_si256(c, 1)),
                    _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d)),
                    _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d, 1)),
                    nine,
                    eight,
                    pixel_max_vec,
                )
            };
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
            i += 16;
        }

        // Scalar tail for the columns that do not fill a whole vector.
        for x in i..width_val {
            let a = *src_prev.add(x) as i32;
            let b = *src_curr.add(x) as i32;
            let c = *src_next.add(x) as i32;
            let d = *src_next2.add(x) as i32;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(x) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u32) as u16;
        }
    }

    // Bottom edge: two-row average for the rows before the last one.
    for j in bicubic_end..last_row {
        let offset = j * pitch_val;
        for i in 0..width_val {
            let a = *src.add(offset + i) as u32;
            let b = *src.add(offset + i + pitch_val) as u32;
            *dest.add(offset + i) = ((a + b + 1) / 2) as u16;
        }
    }

    // The last row has no row below and is copied unchanged.
    let last_offset = last_row * pitch_val;
    std::ptr::copy_nonoverlapping(src.add(last_offset), dest.add(last_offset), width_val);
}