#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
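/// Alpha-blends `src` over `dst` in place (premultiplied-alpha "over":
/// `out = src + dst * (255 - src_alpha) / 256`), processing 32 bytes
/// (8 RGBA pixels) per iteration with AVX2.
///
/// # Safety
///
/// AVX2 must be available on the running CPU, and `dst` and `src` must be
/// equal-length RGBA buffers (only checked with `debug_assert!`).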
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
pub unsafe fn blend_over_avx2(dst: &mut [u8], src: &[u8]) {
debug_assert_eq!(dst.len(), src.len());
debug_assert_eq!(dst.len() % 4, 0);
let len = dst.len();
let simd_len = len - (len % 32);
let mut i = 0;
while i < simd_len {
let src_vec = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
let dst_vec = _mm256_loadu_si256(dst.as_ptr().add(i) as *const __m256i);
        // Broadcast each pixel's alpha byte (offset 3 within each RGBA
        // group) to all four bytes of that pixel, per 128-bit lane.
        let alpha_mask = _mm256_set_epi8(
            15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3, 15, 15, 15, 15, 11, 11,
            11, 11, 7, 7, 7, 7, 3, 3, 3, 3,
        );
let src_alpha = _mm256_shuffle_epi8(src_vec, alpha_mask);
let max_alpha = _mm256_set1_epi8(255u8 as i8);
let inv_alpha = _mm256_sub_epi8(max_alpha, src_alpha);
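        // Widen bytes to 16-bit lanes, multiply by inv_alpha, then shift
        // right by 8 to approximate division by 255 (same as the scalar
        // path's `>> 8`). Products fit in 16 bits (255 * 255 < 65536).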
let dst_scaled = _mm256_mullo_epi16(
_mm256_unpacklo_epi8(dst_vec, _mm256_setzero_si256()),
_mm256_unpacklo_epi8(inv_alpha, _mm256_setzero_si256()),
);
let dst_scaled_hi = _mm256_mullo_epi16(
_mm256_unpackhi_epi8(dst_vec, _mm256_setzero_si256()),
_mm256_unpackhi_epi8(inv_alpha, _mm256_setzero_si256()),
);
let dst_blended_lo = _mm256_srli_epi16(dst_scaled, 8);
let dst_blended_hi = _mm256_srli_epi16(dst_scaled_hi, 8);
let dst_blended = _mm256_packus_epi16(dst_blended_lo, dst_blended_hi);
let result = _mm256_adds_epu8(src_vec, dst_blended);
_mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, result);
i += 32;
}
while i < len {
let src_alpha = src[i + 3];
let inv_alpha = 255 - src_alpha;
dst[i] = src[i].saturating_add(((dst[i] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 1] = src[i + 1].saturating_add(((dst[i + 1] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 2] = src[i + 2].saturating_add(((dst[i + 2] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 3] = src[i + 3].saturating_add(((dst[i + 3] as u16 * inv_alpha as u16) >> 8) as u8);
i += 4;
}
}
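/// SSE4.1 variant of [`blend_over_avx2`], processing 16 bytes (4 RGBA
/// pixels) per iteration.
///
/// # Safety
///
/// SSE4.1 must be available on the running CPU, and `dst` and `src` must be
/// equal-length RGBA buffers (only checked with `debug_assert!`).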
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
#[inline]
pub unsafe fn blend_over_sse41(dst: &mut [u8], src: &[u8]) {
debug_assert_eq!(dst.len(), src.len());
debug_assert_eq!(dst.len() % 4, 0);
let len = dst.len();
let simd_len = len - (len % 16);
let mut i = 0;
while i < simd_len {
let src_vec = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
let dst_vec = _mm_loadu_si128(dst.as_ptr().add(i) as *const __m128i);
        // Broadcast each pixel's alpha byte (offset 3 within each RGBA
        // group) to all four bytes of that pixel.
        let alpha_mask = _mm_set_epi8(15, 15, 15, 15, 11, 11, 11, 11, 7, 7, 7, 7, 3, 3, 3, 3);
let src_alpha = _mm_shuffle_epi8(src_vec, alpha_mask);
let max_alpha = _mm_set1_epi8(255u8 as i8);
let inv_alpha = _mm_sub_epi8(max_alpha, src_alpha);
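        // Same widen/multiply/narrow sequence as the AVX2 path, on 128-bit
        // vectors.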
let dst_scaled = _mm_mullo_epi16(
_mm_unpacklo_epi8(dst_vec, _mm_setzero_si128()),
_mm_unpacklo_epi8(inv_alpha, _mm_setzero_si128()),
);
let dst_scaled_hi = _mm_mullo_epi16(
_mm_unpackhi_epi8(dst_vec, _mm_setzero_si128()),
_mm_unpackhi_epi8(inv_alpha, _mm_setzero_si128()),
);
let dst_blended_lo = _mm_srli_epi16(dst_scaled, 8);
let dst_blended_hi = _mm_srli_epi16(dst_scaled_hi, 8);
let dst_blended = _mm_packus_epi16(dst_blended_lo, dst_blended_hi);
let result = _mm_adds_epu8(src_vec, dst_blended);
_mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, result);
i += 16;
}
while i < len {
let src_alpha = src[i + 3];
let inv_alpha = 255 - src_alpha;
dst[i] = src[i].saturating_add(((dst[i] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 1] = src[i + 1].saturating_add(((dst[i + 1] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 2] = src[i + 2].saturating_add(((dst[i + 2] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 3] = src[i + 3].saturating_add(((dst[i + 3] as u16 * inv_alpha as u16) >> 8) as u8);
i += 4;
}
}
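/// NEON variant of the blend, processing 16 bytes (4 RGBA pixels) per
/// iteration. NEON is a baseline feature on aarch64, so no runtime
/// detection is needed.
///
/// # Safety
///
/// `dst` and `src` must be equal-length RGBA buffers (only checked with
/// `debug_assert!`).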
#[cfg(target_arch = "aarch64")]
#[inline]
pub unsafe fn blend_over_neon(dst: &mut [u8], src: &[u8]) {
use std::arch::aarch64::*;
debug_assert_eq!(dst.len(), src.len());
debug_assert_eq!(dst.len() % 4, 0);
let len = dst.len();
let simd_len = len - (len % 16);
let mut i = 0;
    while i < simd_len {
        let src_vec = vld1q_u8(src.as_ptr().add(i));
        let dst_vec = vld1q_u8(dst.as_ptr().add(i));
        // Broadcast each pixel's alpha byte to all four bytes of that pixel.
        let alpha_idx: [u8; 16] = [3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15];
        let src_alpha = vqtbl1q_u8(src_vec, vld1q_u8(alpha_idx.as_ptr()));
        let inv_alpha = vsubq_u8(vdupq_n_u8(255), src_alpha);
        // Widen to 16-bit, multiply by inv_alpha, shift right by 8, and
        // narrow back to bytes (same `>> 8` approximation as the other paths).
        let lo = vmull_u8(vget_low_u8(dst_vec), vget_low_u8(inv_alpha));
        let hi = vmull_u8(vget_high_u8(dst_vec), vget_high_u8(inv_alpha));
        let dst_blended = vcombine_u8(vshrn_n_u16::<8>(lo), vshrn_n_u16::<8>(hi));
        let result = vqaddq_u8(src_vec, dst_blended);
        vst1q_u8(dst.as_mut_ptr().add(i), result);
        i += 16;
    }
while i < len {
let src_alpha = src[i + 3];
let inv_alpha = 255 - src_alpha;
dst[i] = src[i].saturating_add(((dst[i] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 1] = src[i + 1].saturating_add(((dst[i + 1] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 2] = src[i + 2].saturating_add(((dst[i + 2] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 3] = src[i + 3].saturating_add(((dst[i + 3] as u16 * inv_alpha as u16) >> 8) as u8);
i += 4;
}
}
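/// Portable scalar fallback: blends `src` over `dst` one RGBA pixel at a
/// time. `>> 8` (division by 256) is used as a cheap approximation of
/// division by 255; the SIMD paths do the same, so all implementations
/// produce identical results.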
#[inline]
#[allow(dead_code)]
pub fn blend_over_scalar(dst: &mut [u8], src: &[u8]) {
debug_assert_eq!(dst.len(), src.len());
debug_assert_eq!(dst.len() % 4, 0);
for i in (0..dst.len()).step_by(4) {
let src_alpha = src[i + 3];
let inv_alpha = 255 - src_alpha;
dst[i] = src[i].saturating_add(((dst[i] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 1] = src[i + 1].saturating_add(((dst[i + 1] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 2] = src[i + 2].saturating_add(((dst[i + 2] as u16 * inv_alpha as u16) >> 8) as u8);
dst[i + 3] = src[i + 3].saturating_add(((dst[i + 3] as u16 * inv_alpha as u16) >> 8) as u8);
}
}
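/// Blends `src` over `dst`, dispatching to the widest SIMD implementation
/// the running CPU supports and falling back to scalar code otherwise.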
#[inline]
pub fn blend_over(dst: &mut [u8], src: &[u8]) {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            return unsafe { blend_over_avx2(dst, src) };
        }
        if is_x86_feature_detected!("sse4.1") {
            return unsafe { blend_over_sse41(dst, src) };
        }
    }
    #[cfg(target_arch = "aarch64")]
    return unsafe { blend_over_neon(dst, src) };
    #[cfg(not(target_arch = "aarch64"))]
    blend_over_scalar(dst, src);
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
    fn test_blend_over_scalar() {
        let mut dst = vec![100, 100, 100, 255, 50, 50, 50, 128];
        let src = vec![200, 200, 200, 128, 150, 150, 150, 64];
        blend_over_scalar(&mut dst, &src);
        // Pixel 0, red channel: inv_alpha = 255 - 128 = 127, so
        // 200 + ((100 * 127) >> 8) = 200 + 49 = 249.
        assert_eq!(dst[0], 249);
    }
#[test]
fn test_blend_consistency() {
let mut dst1 = vec![100; 1024];
let mut dst2 = dst1.clone();
let src = vec![200; 1024];
blend_over_scalar(&mut dst1, &src);
blend_over(&mut dst2, &src);
assert_eq!(dst1, dst2);
}
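    #[test]
    fn test_blend_opaque_source() {
        // With a fully opaque source (alpha = 255), inv_alpha is 0, so the
        // destination contribution vanishes and dst must equal src exactly,
        // on every dispatch path.
        let mut dst = vec![1u8, 2, 3, 4, 5, 6, 7, 8];
        let src = vec![10u8, 20, 30, 255, 40, 50, 60, 255];
        blend_over(&mut dst, &src);
        assert_eq!(dst, src);
    }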
#[test]
fn test_blend_performance() {
        let mut dst = vec![0u8; 1024 * 1024];
        let src = vec![128u8; 1024 * 1024];
let start = std::time::Instant::now();
for _ in 0..10 {
blend_over(&mut dst, &src);
}
let elapsed = start.elapsed();
let throughput = (dst.len() as f64 * 10.0) / elapsed.as_secs_f64() / 1_000_000_000.0;
println!("Blending throughput: {:.2} GB/s", throughput);
#[cfg(debug_assertions)]
        let min_throughput = 0.05;
        #[cfg(not(debug_assertions))]
let min_throughput = 0.35;
assert!(
throughput > min_throughput,
"Throughput too low: {:.2} GB/s (expected > {} GB/s)",
throughput,
min_throughput
);
}
}