#![allow(clippy::undocumented_unsafe_blocks)]
#![allow(unsafe_op_in_unsafe_fn)]
#[cfg(any(test, feature = "experimental"))]
use std::{arch::x86_64::*, mem::size_of, num::NonZeroUsize};
#[cfg(any(test, feature = "experimental"))]
use cpudetect::target_family;
#[cfg(any(test, feature = "experimental"))]
use semisafe::result::unwrap as semisafe_res_unwrap;
/// Limits how far each 8-bit sample of `dest` may differ from the matching sample of `src`.
///
/// Every sample in the `width` x `height` region is replaced by its own value clamped to
/// `[src - limit, src + limit]`, where both bounds saturate at the ends of the `u8` range.
/// Each row is processed 32 samples at a time with 256-bit vectors, then with at most one
/// 16-sample step using 128-bit vectors, and finally sample by sample.
///
/// `limit` is narrowed to `u8` through `semisafe_res_unwrap`.
/// NOTE(review): what happens for a `limit` above `u8::MAX` depends on that helper; confirm
/// before passing such a value.
///
/// # Safety
///
/// * For every `y < height`, `src + y * src_stride_bytes` must be valid for reading `width`
///   bytes, and `dest + y * dest_stride_bytes` must be valid for reading and writing `width`
///   bytes.
/// * The body executes AVX2 intrinsics (`_mm256_*`). Presumably
///   `#[target_family("x86_64_v3")]` arranges for them to be enabled, but the executing CPU
///   must support them; verify against `cpudetect`.
#[target_family("x86_64_v3")]
#[cfg(any(test, feature = "experimental"))]
pub(super) unsafe fn limit_changes_u8(
    dest: *mut u8,
    dest_stride_bytes: NonZeroUsize,
    src: *const u8,
    src_stride_bytes: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    limit: u16,
) {
    // Number of `u8` samples covered by one 256-bit and one 128-bit vector respectively.
    const WIDE: usize = 32;
    const NARROW: usize = 16;

    let (cols, rows) = (width.get(), height.get());
    let (dest_pitch, src_pitch) = (dest_stride_bytes.get(), src_stride_bytes.get());

    let max_delta = semisafe_res_unwrap(u8::try_from(limit));
    // The intrinsics take signed lanes; only the bit pattern matters for the unsigned ops.
    let delta_bits = i8::from_ne_bytes([max_delta]);
    let delta_wide = _mm256_set1_epi8(delta_bits);
    let delta_narrow = _mm_set1_epi8(delta_bits);

    // Column where the 256-bit loop stops: the largest multiple of `WIDE` not above `cols`.
    let wide_end = cols - cols % WIDE;
    // A single 128-bit step follows when at least `NARROW` columns are still left.
    let narrow_end = if cols - wide_end >= NARROW {
        wide_end + NARROW
    } else {
        wide_end
    };

    for row in 0..rows {
        let reference = src.add(row * src_pitch);
        let target = dest.add(row * dest_pitch);

        for col in (0..wide_end).step_by(WIDE) {
            let anchor = _mm256_loadu_si256(reference.add(col).cast::<__m256i>());
            let current = _mm256_loadu_si256(target.add(col).cast::<__m256i>());
            let floor = _mm256_subs_epu8(anchor, delta_wide);
            let ceiling = _mm256_adds_epu8(anchor, delta_wide);
            let bounded = _mm256_min_epu8(_mm256_max_epu8(current, floor), ceiling);
            _mm256_storeu_si256(target.add(col).cast::<__m256i>(), bounded);
        }

        if narrow_end != wide_end {
            let anchor = _mm_loadu_si128(reference.add(wide_end).cast::<__m128i>());
            let current = _mm_loadu_si128(target.add(wide_end).cast::<__m128i>());
            let floor = _mm_subs_epu8(anchor, delta_narrow);
            let ceiling = _mm_adds_epu8(anchor, delta_narrow);
            let bounded = _mm_min_epu8(_mm_max_epu8(current, floor), ceiling);
            _mm_storeu_si128(target.add(wide_end).cast::<__m128i>(), bounded);
        }

        // Scalar tail for the columns that no vector step covered.
        for col in narrow_end..cols {
            let anchor = *reference.add(col);
            let current = *target.add(col);
            let floor = anchor.saturating_sub(max_delta);
            let ceiling = anchor.saturating_add(max_delta);
            *target.add(col) = current.max(floor).min(ceiling);
        }
    }
}
/// Limits how far each 16-bit sample of `dest` may differ from the matching sample of `src`.
///
/// Both buffers are passed as byte pointers and reinterpreted as `u16` samples. Every sample
/// in the `width` x `height` region is replaced by its own value clamped to
/// `[src - limit, src + limit]`, where both bounds saturate at the ends of the `u16` range.
/// Each row is processed 16 samples at a time with 256-bit vectors, then with at most one
/// 8-sample step using 128-bit vectors, and finally sample by sample.
///
/// The strides are given in bytes and converted to sample counts by integer division with
/// `size_of::<u16>()`, so an odd byte stride is rounded down.
///
/// # Safety
///
/// * For every `y < height`, the row starting `y * (stride_bytes / 2)` samples after `src`
///   must be valid for reading `width` `u16` values, and the corresponding `dest` row must be
///   valid for reading and writing `width` `u16` values.
/// * The scalar tail reads and writes through `*const u16` / `*mut u16`, so the row pointers
///   must be suitably aligned for `u16`.
/// * The body executes AVX2 intrinsics (`_mm256_*`). Presumably
///   `#[target_family("x86_64_v3")]` arranges for them to be enabled, but the executing CPU
///   must support them; verify against `cpudetect`.
#[target_family("x86_64_v3")]
#[cfg(any(test, feature = "experimental"))]
pub(super) unsafe fn limit_changes_u16(
    dest: *mut u8,
    dest_stride_bytes: NonZeroUsize,
    src: *const u8,
    src_stride_bytes: NonZeroUsize,
    width: NonZeroUsize,
    height: NonZeroUsize,
    limit: u16,
) {
    // Number of `u16` samples covered by one 256-bit and one 128-bit vector respectively.
    const WIDE: usize = 16;
    const NARROW: usize = 8;
    const SAMPLE_BYTES: usize = size_of::<u16>();

    let (cols, rows) = (width.get(), height.get());
    // Row pitches expressed in samples rather than bytes.
    let dest_pitch = dest_stride_bytes.get() / SAMPLE_BYTES;
    let src_pitch = src_stride_bytes.get() / SAMPLE_BYTES;
    let src_samples = src.cast::<u16>();
    let dest_samples = dest.cast::<u16>();

    // The intrinsics take signed lanes; only the bit pattern matters for the unsigned ops.
    let delta_bits = i16::from_ne_bytes(limit.to_ne_bytes());
    let delta_wide = _mm256_set1_epi16(delta_bits);
    let delta_narrow = _mm_set1_epi16(delta_bits);

    // Column where the 256-bit loop stops: the largest multiple of `WIDE` not above `cols`.
    let wide_end = cols - cols % WIDE;
    // A single 128-bit step follows when at least `NARROW` columns are still left.
    let narrow_end = if cols - wide_end >= NARROW {
        wide_end + NARROW
    } else {
        wide_end
    };

    for row in 0..rows {
        let reference = src_samples.add(row * src_pitch);
        let target = dest_samples.add(row * dest_pitch);

        for col in (0..wide_end).step_by(WIDE) {
            let anchor = _mm256_loadu_si256(reference.add(col).cast::<__m256i>());
            let current = _mm256_loadu_si256(target.add(col).cast::<__m256i>());
            let floor = _mm256_subs_epu16(anchor, delta_wide);
            let ceiling = _mm256_adds_epu16(anchor, delta_wide);
            let bounded = _mm256_min_epu16(_mm256_max_epu16(current, floor), ceiling);
            _mm256_storeu_si256(target.add(col).cast::<__m256i>(), bounded);
        }

        if narrow_end != wide_end {
            let anchor = _mm_loadu_si128(reference.add(wide_end).cast::<__m128i>());
            let current = _mm_loadu_si128(target.add(wide_end).cast::<__m128i>());
            let floor = _mm_subs_epu16(anchor, delta_narrow);
            let ceiling = _mm_adds_epu16(anchor, delta_narrow);
            let bounded = _mm_min_epu16(_mm_max_epu16(current, floor), ceiling);
            _mm_storeu_si128(target.add(wide_end).cast::<__m128i>(), bounded);
        }

        // Scalar tail for the columns that no vector step covered.
        for col in narrow_end..cols {
            let anchor = *reference.add(col);
            let current = *target.add(col);
            let floor = anchor.saturating_sub(limit);
            let ceiling = anchor.saturating_add(limit);
            *target.add(col) = current.max(floor).min(ceiling);
        }
    }
}