#![allow(clippy::undocumented_unsafe_blocks)]
#![allow(unsafe_op_in_unsafe_fn)]
use std::{
arch::x86_64::*,
mem::size_of,
num::{NonZeroU8, NonZeroUsize},
};
use cpudetect::target_family;
use crate::util::Pixel;
/// Entry point for horizontal bicubic refinement, specialised by sample size.
///
/// Looks at `size_of::<T>()` and forwards to the 8-bit kernel (1 byte) or the
/// 16-bit kernel (2 bytes), handing the slices over as raw pointers.
///
/// # Safety
///
/// The slices are converted to raw pointers and no length check is performed
/// here; the kernels index rows at `j * pitch` and touch `width` samples per
/// row, so both slices must cover that region. The kernels use AVX-512
/// intrinsics, so the running CPU must support them.
///
/// # Panics
///
/// Reaches `unreachable!()` when `T` is neither 1 nor 2 bytes wide.
#[target_family("x86_64_v4")]
pub(super) unsafe fn refine_horizontal_bicubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let src_ptr = src.as_ptr();
    let dest_ptr = dest.as_mut_ptr();
    let sample_bytes = size_of::<T>();

    match sample_bytes {
        1 => unsafe {
            refine_horizontal_bicubic_u8(
                src_ptr.cast(),
                dest_ptr.cast(),
                pitch,
                width,
                height,
                bits_per_sample,
            )
        },
        2 => unsafe {
            refine_horizontal_bicubic_u16(
                src_ptr.cast(),
                dest_ptr.cast(),
                pitch,
                width,
                height,
                bits_per_sample,
            )
        },
        _ => unreachable!(),
    }
}
/// Entry point for vertical bicubic refinement, specialised by sample size.
///
/// Looks at `size_of::<T>()` and forwards to the 8-bit kernel (1 byte) or the
/// 16-bit kernel (2 bytes), handing the slices over as raw pointers.
///
/// # Safety
///
/// The slices are converted to raw pointers and no length check is performed
/// here; the kernels index rows at `j * pitch` and touch `width` samples per
/// row, so both slices must cover that region. The kernels use AVX-512
/// intrinsics, so the running CPU must support them.
///
/// # Panics
///
/// Reaches `unreachable!()` when `T` is neither 1 nor 2 bytes wide.
#[target_family("x86_64_v4")]
pub(super) unsafe fn refine_vertical_bicubic<T: Pixel>(
    dest: &mut [T],
    src: &[T],
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let src_ptr = src.as_ptr();
    let dest_ptr = dest.as_mut_ptr();
    let sample_bytes = size_of::<T>();

    match sample_bytes {
        1 => unsafe {
            refine_vertical_bicubic_u8(
                src_ptr.cast(),
                dest_ptr.cast(),
                pitch,
                width,
                height,
                bits_per_sample,
            )
        },
        2 => unsafe {
            refine_vertical_bicubic_u16(
                src_ptr.cast(),
                dest_ptr.cast(),
                pitch,
                width,
                height,
                bits_per_sample,
            )
        },
        _ => unreachable!(),
    }
}
/// Vertical bicubic refinement for 8-bit samples.
///
/// Output row `j` is produced from the source rows as follows:
/// - row 0 and rows `height - 3 .. height - 1`: rounded average of rows `j`
///   and `j + 1`;
/// - rows `1 .. height - 3`: `(-(a + d) + 9 * (b + c) + 8) >> 4`, where
///   `a, b, c, d` are the samples of rows `j - 1, j, j + 1, j + 2`;
/// - last row: copied from `src`.
///
/// The vector paths saturate the filtered value to `0..=255` when packing; the
/// scalar tail clamps to `[0, (1 << bits_per_sample) - 1]`.
///
/// Images with fewer than three rows are handled without underflow: only the
/// averaging and copy steps that have the rows they need are executed.
///
/// # Safety
///
/// - `src` must be valid for reads and `dest` for writes of
///   `pitch * (height - 1) + width` samples.
/// - `src` and `dest` must not overlap (the last row is copied with
///   `copy_nonoverlapping`).
/// - The CPU must support the AVX-512 intrinsics used here.
#[target_family("x86_64_v4")]
unsafe fn refine_vertical_bicubic_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u16 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    let height_val = height.get();
    // First row of the trailing averaged band. A plain `height_val - 3` would
    // underflow for images shorter than three rows.
    let edge_start = height_val.saturating_sub(3);
    // Filter taps packed as signed byte pairs for `maddubs`: after interleaving
    // rows (a, b) the pair (-1, 9) yields `-a + 9b`; for (c, d) the pair
    // (9, -1) yields `9c - d`.
    let w_ab = _mm512_set1_epi16(((9u16 << 8) | ((-1i8 as u8) as u16)) as i16);
    let w_cd = _mm512_set1_epi16(((((-1i8 as u8) as u16) << 8) | 9u16) as i16);
    let avx2_w_ab = _mm256_set1_epi16(((9u16 << 8) | ((-1i8 as u8) as u16)) as i16);
    let avx2_w_cd = _mm256_set1_epi16(((((-1i8 as u8) as u16) << 8) | 9u16) as i16);
    // Rounding term added before the `>> 4` normalisation.
    let eight = _mm512_set1_epi16(8);
    let avx2_eight = _mm256_set1_epi16(8);

    // Row 0: average with row 1. Skipped for a single-row image, where row 1
    // does not exist and the final copy below fills row 0.
    if height_val >= 2 {
        let mut i = 0;
        while i + 64 <= width_val {
            let a = _mm512_loadu_si512(src.add(i).cast::<__m512i>());
            let b = _mm512_loadu_si512(src.add(i + pitch_val).cast::<__m512i>());
            let avg = _mm512_avg_epu8(a, b);
            _mm512_storeu_si512(dest.add(i).cast::<__m512i>(), avg);
            i += 64;
        }
        while i + 32 <= width_val {
            let a = _mm256_loadu_si256(src.add(i).cast::<__m256i>());
            let b = _mm256_loadu_si256(src.add(i + pitch_val).cast::<__m256i>());
            let avg = _mm256_avg_epu8(a, b);
            _mm256_storeu_si256(dest.add(i).cast::<__m256i>(), avg);
            i += 32;
        }
        while i < width_val {
            let a = *src.add(i) as u16;
            let b = *src.add(i + pitch_val) as u16;
            *dest.add(i) = ((a + b + 1) / 2) as u8;
            i += 1;
        }
    }

    // Interior rows: four-tap filter over rows j-1, j, j+1, j+2.
    for j in 1..edge_start {
        let offset = j * pitch_val;
        let src_prev = src.add(offset - pitch_val);
        let src_curr = src.add(offset);
        let src_next = src.add(offset + pitch_val);
        let src_next2 = src.add(offset + pitch_val * 2);
        let dest_row = dest.add(offset);
        let mut i = 0;
        while i + 64 <= width_val {
            let a = _mm512_loadu_si512(src_prev.add(i).cast::<__m512i>());
            let b = _mm512_loadu_si512(src_curr.add(i).cast::<__m512i>());
            let c = _mm512_loadu_si512(src_next.add(i).cast::<__m512i>());
            let d = _mm512_loadu_si512(src_next2.add(i).cast::<__m512i>());
            // Unpack and pack both work per 128-bit lane, so the lo/hi split
            // here is undone exactly by the `packus` below.
            let ab_lo = _mm512_unpacklo_epi8(a, b);
            let ab_hi = _mm512_unpackhi_epi8(a, b);
            let cd_lo = _mm512_unpacklo_epi8(c, d);
            let cd_hi = _mm512_unpackhi_epi8(c, d);
            let sum_lo = _mm512_add_epi16(
                _mm512_maddubs_epi16(ab_lo, w_ab),
                _mm512_maddubs_epi16(cd_lo, w_cd),
            );
            let sum_hi = _mm512_add_epi16(
                _mm512_maddubs_epi16(ab_hi, w_ab),
                _mm512_maddubs_epi16(cd_hi, w_cd),
            );
            let result_lo = _mm512_srai_epi16(_mm512_add_epi16(sum_lo, eight), 4);
            let result_hi = _mm512_srai_epi16(_mm512_add_epi16(sum_hi, eight), 4);
            // Unsigned saturation clamps the result to 0..=255.
            let packed = _mm512_packus_epi16(result_lo, result_hi);
            _mm512_storeu_si512(dest_row.add(i).cast::<__m512i>(), packed);
            i += 64;
        }
        while i + 32 <= width_val {
            let a = _mm256_loadu_si256(src_prev.add(i).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_curr.add(i).cast::<__m256i>());
            let c = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
            let d = _mm256_loadu_si256(src_next2.add(i).cast::<__m256i>());
            let result = super::avx2::apply_bicubic_kernel_u8_avx2(
                a, b, c, d, avx2_w_ab, avx2_w_cd, avx2_eight,
            );
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
            i += 32;
        }
        while i < width_val {
            let a = *src_prev.add(i) as i16;
            let b = *src_curr.add(i) as i16;
            let c = *src_next.add(i) as i16;
            let d = *src_next2.add(i) as i16;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(i) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u16) as u8;
            i += 1;
        }
    }

    // Trailing band: rows below the filtered region are averaged with the row
    // beneath them.
    for j in edge_start..(height_val - 1) {
        let offset = j * pitch_val;
        let src_row = src.add(offset);
        let src_next = src.add(offset + pitch_val);
        let dest_row = dest.add(offset);
        let mut i = 0;
        while i + 64 <= width_val {
            let a = _mm512_loadu_si512(src_row.add(i).cast::<__m512i>());
            let b = _mm512_loadu_si512(src_next.add(i).cast::<__m512i>());
            let avg = _mm512_avg_epu8(a, b);
            _mm512_storeu_si512(dest_row.add(i).cast::<__m512i>(), avg);
            i += 64;
        }
        while i + 32 <= width_val {
            let a = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
            let avg = _mm256_avg_epu8(a, b);
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), avg);
            i += 32;
        }
        while i < width_val {
            let a = *src_row.add(i) as u16;
            let b = *src_next.add(i) as u16;
            *dest_row.add(i) = ((a + b + 1) / 2) as u8;
            i += 1;
        }
    }

    // The last row has no row below it, so it is copied unchanged.
    let last_offset = (height_val - 1) * pitch_val;
    std::ptr::copy_nonoverlapping(src.add(last_offset), dest.add(last_offset), width_val);
}
#[target_family("x86_64_v4")]
unsafe fn refine_vertical_bicubic_u16(
src: *const u16,
dest: *mut u16,
pitch: NonZeroUsize,
width: NonZeroUsize,
height: NonZeroUsize,
bits_per_sample: NonZeroU8,
) {
let pixel_max = (1u32 << bits_per_sample.get()) - 1;
let width_val = width.get();
let pitch_val = pitch.get();
let height_val = height.get();
let use_fast = bits_per_sample.get() <= 15;
let w_ab = _mm512_set1_epi32(((9i32) << 16) | (((-1i32) as u32) & 0xFFFF) as i32);
let w_cd = _mm512_set1_epi32((((((-1i32) as u32) & 0xFFFF) << 16) | 9u32) as i32);
let avx2_w_ab = _mm256_set1_epi32(((9i32) << 16) | (((-1i32) as u32) & 0xFFFF) as i32);
let avx2_w_cd = _mm256_set1_epi32((((((-1i32) as u32) & 0xFFFF) << 16) | 9u32) as i32);
let eight = _mm512_set1_epi32(8);
let avx2_eight = _mm256_set1_epi32(8);
let nine = _mm512_set1_epi32(9);
let avx2_nine = _mm256_set1_epi32(9);
let pixel_max_vec = _mm512_set1_epi32(pixel_max as i32);
let avx2_pixel_max_vec = _mm256_set1_epi32(pixel_max as i32);
let zero = _mm512_setzero_si512();
let permute = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
let mut i = 0;
while i + 32 <= width_val {
let a = _mm512_loadu_si512(src.add(i).cast::<__m512i>());
let b = _mm512_loadu_si512(src.add(i + pitch_val).cast::<__m512i>());
let avg = _mm512_avg_epu16(a, b);
_mm512_storeu_si512(dest.add(i).cast::<__m512i>(), avg);
i += 32;
}
while i + 16 <= width_val {
let a = _mm256_loadu_si256(src.add(i).cast::<__m256i>());
let b = _mm256_loadu_si256(src.add(i + pitch_val).cast::<__m256i>());
let avg = _mm256_avg_epu16(a, b);
_mm256_storeu_si256(dest.add(i).cast::<__m256i>(), avg);
i += 16;
}
while i < width_val {
let a = *src.add(i) as u32;
let b = *src.add(i + pitch_val) as u32;
*dest.add(i) = ((a + b + 1) / 2) as u16;
i += 1;
}
for j in 1..(height_val - 3) {
let offset = j * pitch_val;
let src_prev = src.add(offset - pitch_val);
let src_curr = src.add(offset);
let src_next = src.add(offset + pitch_val);
let src_next2 = src.add(offset + pitch_val * 2);
let dest_row = dest.add(offset);
let mut i = 0;
if use_fast {
while i + 32 <= width_val {
let a = _mm512_loadu_si512(src_prev.add(i).cast::<__m512i>());
let b = _mm512_loadu_si512(src_curr.add(i).cast::<__m512i>());
let c = _mm512_loadu_si512(src_next.add(i).cast::<__m512i>());
let d = _mm512_loadu_si512(src_next2.add(i).cast::<__m512i>());
let ab_lo = _mm512_unpacklo_epi16(a, b);
let ab_hi = _mm512_unpackhi_epi16(a, b);
let cd_lo = _mm512_unpacklo_epi16(c, d);
let cd_hi = _mm512_unpackhi_epi16(c, d);
let sum_lo = _mm512_add_epi32(
_mm512_madd_epi16(ab_lo, w_ab),
_mm512_madd_epi16(cd_lo, w_cd),
);
let sum_hi = _mm512_add_epi32(
_mm512_madd_epi16(ab_hi, w_ab),
_mm512_madd_epi16(cd_hi, w_cd),
);
let result_lo = _mm512_srai_epi32(_mm512_add_epi32(sum_lo, eight), 4);
let result_hi = _mm512_srai_epi32(_mm512_add_epi32(sum_hi, eight), 4);
let clamped_lo = _mm512_min_epi32(pixel_max_vec, _mm512_max_epi32(zero, result_lo));
let clamped_hi = _mm512_min_epi32(pixel_max_vec, _mm512_max_epi32(zero, result_hi));
let packed = _mm512_packus_epi32(clamped_lo, clamped_hi);
_mm512_storeu_si512(dest_row.add(i).cast::<__m512i>(), packed);
i += 32;
}
while i + 16 <= width_val {
let a = _mm256_loadu_si256(src_prev.add(i).cast::<__m256i>());
let b = _mm256_loadu_si256(src_curr.add(i).cast::<__m256i>());
let c = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
let d = _mm256_loadu_si256(src_next2.add(i).cast::<__m256i>());
let result = super::avx2::apply_bicubic_kernel_u16_fast_avx2(
a,
b,
c,
d,
avx2_w_ab,
avx2_w_cd,
avx2_eight,
avx2_pixel_max_vec,
);
_mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
i += 16;
}
} else {
while i + 16 <= width_val {
let a = _mm256_loadu_si256(src_prev.add(i).cast::<__m256i>());
let b = _mm256_loadu_si256(src_curr.add(i).cast::<__m256i>());
let c = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
let d = _mm256_loadu_si256(src_next2.add(i).cast::<__m256i>());
let a_32 = _mm512_cvtepu16_epi32(a);
let b_32 = _mm512_cvtepu16_epi32(b);
let c_32 = _mm512_cvtepu16_epi32(c);
let d_32 = _mm512_cvtepu16_epi32(d);
let bc = _mm512_mullo_epi32(_mm512_add_epi32(b_32, c_32), nine);
let ad = _mm512_add_epi32(a_32, d_32);
let result =
_mm512_srai_epi32(_mm512_add_epi32(_mm512_sub_epi32(bc, ad), eight), 4);
let result = _mm512_min_epi32(pixel_max_vec, _mm512_max_epi32(zero, result));
let packed = _mm512_packus_epi32(result, _mm512_setzero_si512());
let packed = _mm512_permutexvar_epi64(permute, packed);
_mm256_storeu_si256(
dest_row.add(i).cast::<__m256i>(),
_mm512_castsi512_si256(packed),
);
i += 16;
}
while i + 8 <= width_val {
let a = _mm_loadu_si128(src_prev.add(i).cast::<__m128i>());
let b = _mm_loadu_si128(src_curr.add(i).cast::<__m128i>());
let c = _mm_loadu_si128(src_next.add(i).cast::<__m128i>());
let d = _mm_loadu_si128(src_next2.add(i).cast::<__m128i>());
let a_lo = _mm256_cvtepu16_epi32(a);
let b_lo = _mm256_cvtepu16_epi32(b);
let c_lo = _mm256_cvtepu16_epi32(c);
let d_lo = _mm256_cvtepu16_epi32(d);
let result = super::avx2::apply_bicubic_kernel_u16_exact_avx2(
a_lo,
_mm256_setzero_si256(),
b_lo,
_mm256_setzero_si256(),
c_lo,
_mm256_setzero_si256(),
d_lo,
_mm256_setzero_si256(),
avx2_nine,
avx2_eight,
avx2_pixel_max_vec,
);
_mm_storeu_si128(
dest_row.add(i).cast::<__m128i>(),
_mm256_castsi256_si128(result),
);
i += 8;
}
}
while i < width_val {
let a = *src_prev.add(i) as i32;
let b = *src_curr.add(i) as i32;
let c = *src_next.add(i) as i32;
let d = *src_next2.add(i) as i32;
let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
*dest_row.add(i) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u32) as u16;
i += 1;
}
}
for j in (height_val - 3)..(height_val - 1) {
let offset = j * pitch_val;
let src_row = src.add(offset);
let src_next = src.add(offset + pitch_val);
let dest_row = dest.add(offset);
let mut i = 0;
while i + 32 <= width_val {
let a = _mm512_loadu_si512(src_row.add(i).cast::<__m512i>());
let b = _mm512_loadu_si512(src_next.add(i).cast::<__m512i>());
let avg = _mm512_avg_epu16(a, b);
_mm512_storeu_si512(dest_row.add(i).cast::<__m512i>(), avg);
i += 32;
}
while i + 16 <= width_val {
let a = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
let b = _mm256_loadu_si256(src_next.add(i).cast::<__m256i>());
let avg = _mm256_avg_epu16(a, b);
_mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), avg);
i += 16;
}
while i < width_val {
let a = *src_row.add(i) as u32;
let b = *src_next.add(i) as u32;
*dest_row.add(i) = ((a + b + 1) / 2) as u16;
i += 1;
}
}
let last_offset = (height_val - 1) * pitch_val;
std::ptr::copy_nonoverlapping(src.add(last_offset), dest.add(last_offset), width_val);
}
/// Horizontal bicubic refinement for 8-bit samples.
///
/// For every row, output column `i` is produced as follows:
/// - column 0 and columns `width - 3 .. width - 1`: rounded average of
///   samples `i` and `i + 1`;
/// - columns `1 .. width - 3`: `(-(a + d) + 9 * (b + c) + 8) >> 4`, where
///   `a, b, c, d` are samples `i - 1, i, i + 1, i + 2`;
/// - last column: copied from `src`.
///
/// The vector paths saturate the filtered value to `0..=255` when packing; the
/// scalar tail clamps to `[0, (1 << bits_per_sample) - 1]`.
///
/// Rows narrower than three samples skip the steps that would need missing
/// neighbours; a one-sample row is simply copied.
///
/// # Safety
///
/// - `src` must be valid for reads and `dest` for writes of
///   `pitch * (height - 1) + width` samples.
/// - The CPU must support the AVX-512 intrinsics used here.
#[target_family("x86_64_v4")]
unsafe fn refine_horizontal_bicubic_u8(
    src: *const u8,
    dest: *mut u8,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u16 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    // Filter taps packed as signed byte pairs for `maddubs`: after interleaving
    // (a, b) the pair (-1, 9) yields `-a + 9b`; for (c, d) the pair (9, -1)
    // yields `9c - d`.
    let w_ab = _mm512_set1_epi16(((9u16 << 8) | ((-1i8 as u8) as u16)) as i16);
    let w_cd = _mm512_set1_epi16(((((-1i8 as u8) as u16) << 8) | 9u16) as i16);
    let avx2_w_ab = _mm256_set1_epi16(((9u16 << 8) | ((-1i8 as u8) as u16)) as i16);
    let avx2_w_cd = _mm256_set1_epi16(((((-1i8 as u8) as u16) << 8) | 9u16) as i16);
    // Rounding term added before the `>> 4` normalisation.
    let eight = _mm512_set1_epi16(8);
    let avx2_eight = _mm256_set1_epi16(8);
    for j in 0..height.get() {
        let row_offset = j * pitch_val;
        let src_row = src.add(row_offset);
        let dest_row = dest.add(row_offset);
        // Column 0: average with its right neighbour. Skipped when the row has
        // a single sample, because reading `src_row[1]` would run past the
        // row; the final copy below fills column 0 in that case.
        if width_val >= 2 {
            let a = *src_row.add(0) as u16;
            let b = *src_row.add(1) as u16;
            *dest_row.add(0) = ((a + b + 1) / 2) as u8;
        }
        let wiener_start = 1;
        // For rows narrower than three samples the filtered range is empty.
        let wiener_end = if width_val >= 3 {
            width_val - 3
        } else {
            wiener_start
        };
        let mut i = wiener_start;
        while i + 64 <= wiener_end {
            // Four overlapping loads, one per tap, each shifted by one sample.
            let a = _mm512_loadu_si512(src_row.add(i - 1).cast::<__m512i>());
            let b = _mm512_loadu_si512(src_row.add(i).cast::<__m512i>());
            let c = _mm512_loadu_si512(src_row.add(i + 1).cast::<__m512i>());
            let d = _mm512_loadu_si512(src_row.add(i + 2).cast::<__m512i>());
            // Unpack and pack both work per 128-bit lane, so the lo/hi split
            // here is undone exactly by the `packus` below.
            let ab_lo = _mm512_unpacklo_epi8(a, b);
            let ab_hi = _mm512_unpackhi_epi8(a, b);
            let cd_lo = _mm512_unpacklo_epi8(c, d);
            let cd_hi = _mm512_unpackhi_epi8(c, d);
            let sum_lo = _mm512_add_epi16(
                _mm512_maddubs_epi16(ab_lo, w_ab),
                _mm512_maddubs_epi16(cd_lo, w_cd),
            );
            let sum_hi = _mm512_add_epi16(
                _mm512_maddubs_epi16(ab_hi, w_ab),
                _mm512_maddubs_epi16(cd_hi, w_cd),
            );
            let result_lo = _mm512_srai_epi16(_mm512_add_epi16(sum_lo, eight), 4);
            let result_hi = _mm512_srai_epi16(_mm512_add_epi16(sum_hi, eight), 4);
            // Unsigned saturation clamps the result to 0..=255.
            let packed = _mm512_packus_epi16(result_lo, result_hi);
            _mm512_storeu_si512(dest_row.add(i).cast::<__m512i>(), packed);
            i += 64;
        }
        while i + 32 <= wiener_end {
            let a = _mm256_loadu_si256(src_row.add(i - 1).cast::<__m256i>());
            let b = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
            let c = _mm256_loadu_si256(src_row.add(i + 1).cast::<__m256i>());
            let d = _mm256_loadu_si256(src_row.add(i + 2).cast::<__m256i>());
            let result = super::avx2::apply_bicubic_kernel_u8_avx2(
                a, b, c, d, avx2_w_ab, avx2_w_cd, avx2_eight,
            );
            _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
            i += 32;
        }
        while i < wiener_end {
            let a = *src_row.add(i - 1) as i16;
            let b = *src_row.add(i) as i16;
            let c = *src_row.add(i + 1) as i16;
            let d = *src_row.add(i + 2) as i16;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(i) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u16) as u8;
            i += 1;
        }
        // Columns just left of the last one: average with the right neighbour.
        if width_val >= 3 {
            for i in (width_val - 3)..(width_val - 1) {
                let a = *src_row.add(i) as u16;
                let b = *src_row.add(i + 1) as u16;
                *dest_row.add(i) = ((a + b + 1) / 2) as u8;
            }
        }
        // The last column has no right neighbour, so it is copied unchanged.
        *dest_row.add(width_val - 1) = *src_row.add(width_val - 1);
    }
}
/// Horizontal bicubic refinement for 16-bit samples.
///
/// For every row, output column `i` is produced as follows:
/// - column 0 and columns `width - 3 .. width - 1`: rounded average of
///   samples `i` and `i + 1`;
/// - columns `1 .. width - 3`: `(-(a + d) + 9 * (b + c) + 8) >> 4`, clamped
///   to `[0, (1 << bits_per_sample) - 1]`, where `a, b, c, d` are samples
///   `i - 1, i, i + 1, i + 2`;
/// - last column: copied from `src`.
///
/// When `bits_per_sample <= 15` the filter uses `madd_epi16`, which treats
/// its inputs as signed 16-bit values; otherwise samples are widened to
/// 32 bits first.
///
/// Rows narrower than three samples skip the steps that would need missing
/// neighbours; a one-sample row is simply copied.
///
/// # Safety
///
/// - `src` must be valid for reads and `dest` for writes of
///   `pitch * (height - 1) + width` samples.
/// - The CPU must support the AVX-512 intrinsics used here.
#[target_family("x86_64_v4")]
unsafe fn refine_horizontal_bicubic_u16(
    src: *const u16,
    dest: *mut u16,
    pitch: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    bits_per_sample: NonZeroU8,
) {
    let pixel_max = (1u32 << bits_per_sample.get()) - 1;
    let width_val = width.get();
    let pitch_val = pitch.get();
    // `madd_epi16` is a signed multiply, so it is only usable while every
    // sample fits in 15 bits.
    let use_fast = bits_per_sample.get() <= 15;
    // Filter taps packed as signed 16-bit pairs for `madd`: (-1, 9) applied to
    // interleaved (a, b) yields `-a + 9b`; (9, -1) applied to (c, d) yields
    // `9c - d`.
    let w_ab = _mm512_set1_epi32(((9i32) << 16) | (((-1i32) as u32) & 0xFFFF) as i32);
    let w_cd = _mm512_set1_epi32((((((-1i32) as u32) & 0xFFFF) << 16) | 9u32) as i32);
    let avx2_w_ab = _mm256_set1_epi32(((9i32) << 16) | (((-1i32) as u32) & 0xFFFF) as i32);
    let avx2_w_cd = _mm256_set1_epi32((((((-1i32) as u32) & 0xFFFF) << 16) | 9u32) as i32);
    // Rounding term added before the `>> 4` normalisation.
    let eight = _mm512_set1_epi32(8);
    let avx2_eight = _mm256_set1_epi32(8);
    let nine = _mm512_set1_epi32(9);
    let avx2_nine = _mm256_set1_epi32(9);
    let pixel_max_vec = _mm512_set1_epi32(pixel_max as i32);
    let avx2_pixel_max_vec = _mm256_set1_epi32(pixel_max as i32);
    let zero = _mm512_setzero_si512();
    // Gathers the even 64-bit elements (0, 2, 4, 6) into the low 256 bits.
    let permute = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
    for j in 0..height.get() {
        let row_offset = j * pitch_val;
        let src_row = src.add(row_offset);
        let dest_row = dest.add(row_offset);
        // Column 0: average with its right neighbour. Skipped when the row has
        // a single sample, because reading `src_row[1]` would run past the
        // row; the final copy below fills column 0 in that case.
        if width_val >= 2 {
            let a = *src_row.add(0) as u32;
            let b = *src_row.add(1) as u32;
            *dest_row.add(0) = ((a + b + 1) / 2) as u16;
        }
        let wiener_start = 1;
        // For rows narrower than three samples the filtered range is empty.
        let wiener_end = if width_val >= 3 {
            width_val - 3
        } else {
            wiener_start
        };
        let mut i = wiener_start;
        if use_fast {
            while i + 32 <= wiener_end {
                // Four overlapping loads, one per tap, each shifted by one
                // sample.
                let a = _mm512_loadu_si512(src_row.add(i - 1).cast::<__m512i>());
                let b = _mm512_loadu_si512(src_row.add(i).cast::<__m512i>());
                let c = _mm512_loadu_si512(src_row.add(i + 1).cast::<__m512i>());
                let d = _mm512_loadu_si512(src_row.add(i + 2).cast::<__m512i>());
                // Unpack and pack both work per 128-bit lane, so the lo/hi
                // split here is undone exactly by the `packus` below.
                let ab_lo = _mm512_unpacklo_epi16(a, b);
                let ab_hi = _mm512_unpackhi_epi16(a, b);
                let cd_lo = _mm512_unpacklo_epi16(c, d);
                let cd_hi = _mm512_unpackhi_epi16(c, d);
                let sum_lo = _mm512_add_epi32(
                    _mm512_madd_epi16(ab_lo, w_ab),
                    _mm512_madd_epi16(cd_lo, w_cd),
                );
                let sum_hi = _mm512_add_epi32(
                    _mm512_madd_epi16(ab_hi, w_ab),
                    _mm512_madd_epi16(cd_hi, w_cd),
                );
                let result_lo = _mm512_srai_epi32(_mm512_add_epi32(sum_lo, eight), 4);
                let result_hi = _mm512_srai_epi32(_mm512_add_epi32(sum_hi, eight), 4);
                let clamped_lo =
                    _mm512_min_epi32(pixel_max_vec, _mm512_max_epi32(zero, result_lo));
                let clamped_hi =
                    _mm512_min_epi32(pixel_max_vec, _mm512_max_epi32(zero, result_hi));
                let packed = _mm512_packus_epi32(clamped_lo, clamped_hi);
                _mm512_storeu_si512(dest_row.add(i).cast::<__m512i>(), packed);
                i += 32;
            }
            while i + 16 <= wiener_end {
                let a = _mm256_loadu_si256(src_row.add(i - 1).cast::<__m256i>());
                let b = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
                let c = _mm256_loadu_si256(src_row.add(i + 1).cast::<__m256i>());
                let d = _mm256_loadu_si256(src_row.add(i + 2).cast::<__m256i>());
                let result = super::avx2::apply_bicubic_kernel_u16_fast_avx2(
                    a,
                    b,
                    c,
                    d,
                    avx2_w_ab,
                    avx2_w_cd,
                    avx2_eight,
                    avx2_pixel_max_vec,
                );
                _mm256_storeu_si256(dest_row.add(i).cast::<__m256i>(), result);
                i += 16;
            }
        } else {
            while i + 16 <= wiener_end {
                let a = _mm256_loadu_si256(src_row.add(i - 1).cast::<__m256i>());
                let b = _mm256_loadu_si256(src_row.add(i).cast::<__m256i>());
                let c = _mm256_loadu_si256(src_row.add(i + 1).cast::<__m256i>());
                let d = _mm256_loadu_si256(src_row.add(i + 2).cast::<__m256i>());
                // Widen to 32 bits so full-range 16-bit samples stay unsigned.
                let a_32 = _mm512_cvtepu16_epi32(a);
                let b_32 = _mm512_cvtepu16_epi32(b);
                let c_32 = _mm512_cvtepu16_epi32(c);
                let d_32 = _mm512_cvtepu16_epi32(d);
                let bc = _mm512_mullo_epi32(_mm512_add_epi32(b_32, c_32), nine);
                let ad = _mm512_add_epi32(a_32, d_32);
                let result =
                    _mm512_srai_epi32(_mm512_add_epi32(_mm512_sub_epi32(bc, ad), eight), 4);
                let result = _mm512_min_epi32(pixel_max_vec, _mm512_max_epi32(zero, result));
                // Packing against zero leaves the useful 16-bit results in the
                // low half of each 128-bit lane; the permute collects them.
                let packed = _mm512_packus_epi32(result, _mm512_setzero_si512());
                let packed = _mm512_permutexvar_epi64(permute, packed);
                _mm256_storeu_si256(
                    dest_row.add(i).cast::<__m256i>(),
                    _mm512_castsi512_si256(packed),
                );
                i += 16;
            }
            while i + 8 <= wiener_end {
                let a = _mm_loadu_si128(src_row.add(i - 1).cast::<__m128i>());
                let b = _mm_loadu_si128(src_row.add(i).cast::<__m128i>());
                let c = _mm_loadu_si128(src_row.add(i + 1).cast::<__m128i>());
                let d = _mm_loadu_si128(src_row.add(i + 2).cast::<__m128i>());
                let a_lo = _mm256_cvtepu16_epi32(a);
                let b_lo = _mm256_cvtepu16_epi32(b);
                let c_lo = _mm256_cvtepu16_epi32(c);
                let d_lo = _mm256_cvtepu16_epi32(d);
                // NOTE(review): the helper presumably takes a lo/hi pair per
                // tap; zeros are passed for the second of each pair and only
                // the low 128 bits of its result are stored. Confirm against
                // the helper's definition.
                let result = super::avx2::apply_bicubic_kernel_u16_exact_avx2(
                    a_lo,
                    _mm256_setzero_si256(),
                    b_lo,
                    _mm256_setzero_si256(),
                    c_lo,
                    _mm256_setzero_si256(),
                    d_lo,
                    _mm256_setzero_si256(),
                    avx2_nine,
                    avx2_eight,
                    avx2_pixel_max_vec,
                );
                _mm_storeu_si128(
                    dest_row.add(i).cast::<__m128i>(),
                    _mm256_castsi256_si128(result),
                );
                i += 8;
            }
        }
        while i < wiener_end {
            let a = *src_row.add(i - 1) as i32;
            let b = *src_row.add(i) as i32;
            let c = *src_row.add(i + 1) as i32;
            let d = *src_row.add(i + 2) as i32;
            let result = (-(a + d) + (b + c) * 9 + 8) >> 4;
            *dest_row.add(i) = std::cmp::min(pixel_max, std::cmp::max(0, result) as u32) as u16;
            i += 1;
        }
        // Columns just left of the last one: average with the right neighbour.
        if width_val >= 3 {
            for i in (width_val - 3)..(width_val - 1) {
                let a = *src_row.add(i) as u32;
                let b = *src_row.add(i + 1) as u32;
                *dest_row.add(i) = ((a + b + 1) / 2) as u16;
            }
        }
        // The last column has no right neighbour, so it is copied unchanged.
        *dest_row.add(width_val - 1) = *src_row.add(width_val - 1);
    }
}