#![cfg_attr(not(feature = "std"), allow(dead_code))]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::row::scalar::alpha_extract as scalar;
/// Copies the fourth byte of every 4-byte pixel from `packed` into `rgba_out`.
///
/// Only byte 3 of each output pixel is overwritten; bytes 0..=2 keep whatever
/// `rgba_out` already held. Full groups of 8 pixels (32 bytes) are merged with
/// one AVX2 byte blend each; the remaining `width % 8` pixels are delegated to
/// the scalar implementation of the same name.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `packed` and `rgba_out` must each hold at least `width * 4` bytes. The
///   vector loop accesses both through raw pointers and the lengths are only
///   verified by `debug_assert!`.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn copy_alpha_packed_u8x4_at_3(packed: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Number of complete 8-pixel groups the vector loop can handle.
    let full_groups = width / 8;

    // SAFETY: every access below covers bytes `[group * 32, group * 32 + 32)`
    // with `group < width / 8`, i.e. stays below `width * 4`, which the caller
    // guarantees is within both slices.
    unsafe {
        // 0xFF in byte 3 of every 32-bit pixel: blend takes that byte from the source.
        let take_alpha = _mm256_set1_epi32(0xFF00_0000u32 as i32);
        let src_ptr = packed.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();

        for group in 0..full_groups {
            let byte_off = group * 32;
            let incoming = _mm256_loadu_si256(src_ptr.add(byte_off).cast::<__m256i>());
            let out_vec = dst_ptr.add(byte_off).cast::<__m256i>();
            let current = _mm256_loadu_si256(out_vec);
            _mm256_storeu_si256(out_vec, _mm256_blendv_epi8(current, incoming, take_alpha));
        }
    }

    // Pixels that did not fill a whole vector go through the scalar path.
    let done = full_groups * 8;
    if done < width {
        scalar::copy_alpha_packed_u8x4_at_3(
            &packed[done * 4..width * 4],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Narrows the first `u16` of every 4-`u16` pixel in `packed` to a byte and
/// stores it as byte 3 of the matching 4-byte pixel in `rgba_out`.
///
/// The narrowing is a logical right shift by 8, so the high byte of the source
/// sample is kept. Bytes 0..=2 of each output pixel are left untouched. Full
/// groups of 8 pixels are processed with AVX2; the remaining `width % 8`
/// pixels are delegated to the scalar implementation of the same name.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `packed` must hold at least `width * 4` `u16` elements and `rgba_out` at
///   least `width * 4` bytes. The vector loop accesses both through raw
///   pointers and the lengths are only verified by `debug_assert!`.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
    packed: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Number of complete 8-pixel groups the vector loop can handle.
    let full_groups = width / 8;

    // SAFETY: for `group < width / 8` the source reads cover `u16` elements
    // `[group * 32, group * 32 + 32)` and the destination access covers bytes
    // `[group * 32, group * 32 + 32)`; both stay below `width * 4`, which the
    // caller guarantees is within the respective slice.
    unsafe {
        // 0xFF in byte 3 of every 32-bit output pixel.
        let take_alpha = _mm256_set1_epi32(0xFF00_0000u32 as i32);
        // Per 128-bit lane: move bytes 0/4/8/12 (first element of each of the
        // lane's four pixels) to bytes 3/7/11/15; every other byte becomes 0.
        let scatter = _mm256_setr_epi8(
            -1, -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, //
            -1, -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12,
        );
        let src_ptr = packed.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();

        for group in 0..full_groups {
            // Same numeric offset for both buffers: 32 u16 in, 32 bytes out.
            let offset = group * 32;
            let first_half = _mm256_loadu_si256(src_ptr.add(offset).cast::<__m256i>());
            let second_half = _mm256_loadu_si256(src_ptr.add(offset + 16).cast::<__m256i>());

            // After the shift every 16-bit value is <= 0xFF, so the saturating
            // pack is a plain truncation.
            let narrowed = _mm256_packus_epi16(
                _mm256_srli_epi16::<8>(first_half),
                _mm256_srli_epi16::<8>(second_half),
            );
            // The pack works per 128-bit lane and interleaves the two inputs;
            // swapping the middle quadwords restores source element order.
            let in_order = _mm256_permute4x64_epi64::<0xD8>(narrowed);
            let alpha_bytes = _mm256_shuffle_epi8(in_order, scatter);

            let out_vec = dst_ptr.add(offset).cast::<__m256i>();
            let current = _mm256_loadu_si256(out_vec);
            _mm256_storeu_si256(out_vec, _mm256_blendv_epi8(current, alpha_bytes, take_alpha));
        }
    }

    // Pixels that did not fill a whole vector go through the scalar path.
    let done = full_groups * 8;
    if done < width {
        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
            &packed[done * 4..width * 4],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Copies the first `u16` of every 4-`u16` pixel in `packed` into the fourth
/// `u16` of the matching pixel in `rgba_out`.
///
/// Elements 0..=2 of each output pixel are left untouched and the sample value
/// is copied unchanged. Full groups of 8 pixels are processed with AVX2 (two
/// 256-bit vectors of 4 pixels each); the remaining `width % 8` pixels are
/// delegated to the scalar implementation of the same name.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `packed` and `rgba_out` must each hold at least `width * 4` `u16`
///   elements. The vector loop accesses both through raw pointers and the
///   lengths are only verified by `debug_assert!`.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
    packed: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Number of complete 8-pixel groups the vector loop can handle.
    let full_groups = width / 8;

    // SAFETY: for `group < width / 8` every access covers 16 `u16` elements
    // starting at `group * 32` or `group * 32 + 16`, so it ends at or before
    // element `width * 4`, which the caller guarantees is within both slices.
    unsafe {
        // 0xFFFF in the fourth 16-bit element of every 64-bit pixel.
        let take_alpha = _mm256_set1_epi64x(0xFFFF_0000_0000_0000u64 as i64);
        // Per 128-bit lane (two pixels): move element 0 of each pixel to that
        // pixel's element 3; every other byte becomes 0.
        let move_to_slot3 = _mm256_setr_epi8(
            -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 8, 9, //
            -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 8, 9,
        );
        let src_ptr = packed.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();

        for group in 0..full_groups {
            let base = group * 32;
            // Two vectors of 4 pixels each make up the 8-pixel group.
            for elem_off in [base, base + 16] {
                let incoming = _mm256_loadu_si256(src_ptr.add(elem_off).cast::<__m256i>());
                let alpha_words = _mm256_shuffle_epi8(incoming, move_to_slot3);

                let out_vec = dst_ptr.add(elem_off).cast::<__m256i>();
                let current = _mm256_loadu_si256(out_vec);
                _mm256_storeu_si256(out_vec, _mm256_blendv_epi8(current, alpha_words, take_alpha));
            }
        }
    }

    // Pixels that did not fill a whole group go through the scalar path.
    let done = full_groups * 8;
    if done < width {
        scalar::copy_alpha_packed_u16x4_at_0::<false>(
            &packed[done * 4..width * 4],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Writes one alpha byte per pixel from the planar `alpha` row into byte 3 of
/// each 4-byte pixel in `rgba_out`.
///
/// Bytes 0..=2 of each output pixel are left untouched. Full groups of 8
/// pixels are processed with AVX2; the remaining `width % 8` pixels are
/// delegated to the scalar implementation of the same name.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `alpha` must hold at least `width` bytes and `rgba_out` at least
///   `width * 4` bytes. The vector loop accesses both through raw pointers and
///   the lengths are only verified by `debug_assert!`.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Number of complete 8-pixel groups the vector loop can handle.
    let full_groups = width / 8;

    // SAFETY: for `group < width / 8` the 8-byte alpha read covers
    // `[group * 8, group * 8 + 8)` and the output access covers bytes
    // `[group * 32, group * 32 + 32)`; both end at or before `width` and
    // `width * 4` respectively, which the caller guarantees are in bounds.
    unsafe {
        // 0xFF in byte 3 of every 32-bit output pixel.
        let take_alpha = _mm256_set1_epi32(0xFF00_0000u32 as i32);
        // Both lanes hold the same 8 alpha bytes after the broadcast: the low
        // lane places samples 0..=3, the high lane samples 4..=7, each at
        // byte 3 of its pixel.
        let scatter = _mm256_setr_epi8(
            -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, //
            -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7,
        );
        let src_ptr = alpha.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();

        for group in 0..full_groups {
            let first = group * 8;
            let samples = _mm_loadl_epi64(src_ptr.add(first).cast::<__m128i>());
            let both_lanes = _mm256_broadcastsi128_si256(samples);
            let alpha_bytes = _mm256_shuffle_epi8(both_lanes, scatter);

            let out_vec = dst_ptr.add(first * 4).cast::<__m256i>();
            let current = _mm256_loadu_si256(out_vec);
            _mm256_storeu_si256(out_vec, _mm256_blendv_epi8(current, alpha_bytes, take_alpha));
        }
    }

    // Pixels that did not fill a whole vector go through the scalar path.
    let done = full_groups * 8;
    if done < width {
        scalar::copy_alpha_plane_u8(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Narrows `BITS`-bit alpha samples from the planar `alpha` row to bytes and
/// writes them into byte 3 of each 4-byte pixel in `rgba_out`.
///
/// Each sample is masked to its low `BITS` bits and then shifted right by
/// `BITS - 8`. Bytes 0..=2 of each output pixel are left untouched. Full
/// groups of 8 pixels are processed with AVX2; the remaining `width % 8`
/// pixels are delegated to the scalar implementation of the same name.
///
/// `BITS` must be in `[8, 16]`; this is enforced at compile time.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `alpha` must hold at least `width` `u16` elements and `rgba_out` at least
///   `width * 4` bytes. The vector loop accesses both through raw pointers and
///   the lengths are only verified by `debug_assert!`.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
    alpha: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    const {
        assert!(BITS >= 8 && BITS <= 16, "BITS must be in [8, 16]");
    }
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Number of complete 8-pixel groups the vector loop can handle.
    let full_groups = width / 8;

    // SAFETY: for `group < width / 8` the alpha read covers `u16` elements
    // `[group * 8, group * 8 + 8)` and the output access covers bytes
    // `[group * 32, group * 32 + 32)`; both end at or before `width` and
    // `width * 4` respectively, which the caller guarantees are in bounds.
    unsafe {
        // Shift count lives in a register because it depends on the const generic.
        let narrow_by = _mm_cvtsi32_si128((BITS as i32) - 8);
        let keep_bits = _mm_set1_epi16(((1u32 << BITS) - 1) as i16);
        // 0xFF in byte 3 of every 32-bit output pixel.
        let take_alpha = _mm256_set1_epi32(0xFF00_0000u32 as i32);
        // Low lane places narrowed samples 0..=3, high lane samples 4..=7,
        // each at byte 3 of its pixel.
        let scatter = _mm256_setr_epi8(
            -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, //
            -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7,
        );
        let src_ptr = alpha.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();

        for group in 0..full_groups {
            let first = group * 8;
            let samples = _mm_loadu_si128(src_ptr.add(first).cast::<__m128i>());
            let in_range = _mm_and_si128(samples, keep_bits);
            let top_bits = _mm_srl_epi16(in_range, narrow_by);
            // Masked and shifted values fit in 8 bits, so the saturating pack
            // only truncates; the upper 8 bytes of the result are zero.
            let narrowed = _mm_packus_epi16(top_bits, _mm_setzero_si128());
            let both_lanes = _mm256_broadcastsi128_si256(narrowed);
            let alpha_bytes = _mm256_shuffle_epi8(both_lanes, scatter);

            let out_vec = dst_ptr.add(first * 4).cast::<__m256i>();
            let current = _mm256_loadu_si256(out_vec);
            _mm256_storeu_si256(out_vec, _mm256_blendv_epi8(current, alpha_bytes, take_alpha));
        }
    }

    // Pixels that did not fill a whole vector go through the scalar path.
    let done = full_groups * 8;
    if done < width {
        scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Writes `BITS`-bit alpha samples from the planar `alpha` row into the fourth
/// `u16` of each 4-`u16` pixel in `rgba_out`.
///
/// Each sample is masked to its low `BITS` bits before being stored; elements
/// 0..=2 of each output pixel are left untouched. Full groups of 8 pixels are
/// processed with AVX2 (two 256-bit output vectors of 4 pixels each); the
/// remaining `width % 8` pixels are delegated to the scalar implementation of
/// the same name.
///
/// `BITS` must be in `[1, 16]`; this is enforced at compile time.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `alpha` must hold at least `width` `u16` elements and `rgba_out` at least
///   `width * 4` `u16` elements. The vector loop accesses both through raw
///   pointers and the lengths are only verified by `debug_assert!`.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
    alpha: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    const {
        assert!(BITS > 0 && BITS <= 16, "BITS must be in [1, 16]");
    }
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Number of complete 8-pixel groups the vector loop can handle.
    let full_groups = width / 8;

    // SAFETY: for `group < width / 8` the alpha read covers `u16` elements
    // `[group * 8, group * 8 + 8)`, and each output access covers 16 `u16`
    // elements starting at `group * 32` or `group * 32 + 16`; these end at or
    // before `width` and `width * 4` respectively, which the caller guarantees
    // are in bounds.
    unsafe {
        let keep_bits = _mm_set1_epi16(((1u32 << BITS) - 1) as i16);
        // 0xFFFF in the fourth 16-bit element of every 64-bit pixel.
        let take_alpha = _mm256_set1_epi64x(0xFFFF_0000_0000_0000u64 as i64);
        // Both lanes hold the same 8 samples after the broadcast. The first
        // mask places samples 0..=3 (pixels 0..=3), the second samples 4..=7
        // (pixels 4..=7), each at element 3 of its pixel.
        let spread_first_four = _mm256_setr_epi8(
            -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, //
            -1, -1, -1, -1, -1, -1, 4, 5, -1, -1, -1, -1, -1, -1, 6, 7,
        );
        let spread_last_four = _mm256_setr_epi8(
            -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, //
            -1, -1, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, -1, -1, 14, 15,
        );
        let src_ptr = alpha.as_ptr();
        let dst_ptr = rgba_out.as_mut_ptr();

        for group in 0..full_groups {
            let first = group * 8;
            let samples = _mm_loadu_si128(src_ptr.add(first).cast::<__m128i>());
            let both_lanes = _mm256_broadcastsi128_si256(_mm_and_si128(samples, keep_bits));

            let out_base = first * 4;
            for (elem_off, spread) in [
                (out_base, spread_first_four),
                (out_base + 16, spread_last_four),
            ] {
                let alpha_words = _mm256_shuffle_epi8(both_lanes, spread);
                let out_vec = dst_ptr.add(elem_off).cast::<__m256i>();
                let current = _mm256_loadu_si256(out_vec);
                _mm256_storeu_si256(out_vec, _mm256_blendv_epi8(current, alpha_words, take_alpha));
            }
        }
    }

    // Pixels that did not fill a whole group go through the scalar path.
    let done = full_groups * 8;
    if done < width {
        scalar::copy_alpha_plane_u16::<BITS, false>(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Cross-checks every AVX2 kernel in this file against its scalar counterpart.
///
/// Each test carries the same feature gate as the kernel it calls, and the
/// module itself is only built when at least one of those features is on.
/// Without this, a test build that enables `std` but not the matching format
/// feature fails to compile because the `super::` kernel does not exist.
#[cfg(all(
    test,
    feature = "std",
    any(feature = "yuv-444-packed", feature = "gbr", feature = "yuva")
))]
mod tests {
    use crate::row::scalar::alpha_extract as scalar;

    /// Fills `out` with deterministic bytes from a linear congruential
    /// generator seeded with `seed` (bits 16..24 of the state per element).
    fn pseudo_random_u8(out: &mut [u8], seed: u32) {
        let mut state = seed;
        for v in out.iter_mut() {
            state = state.wrapping_mul(1664525).wrapping_add(1013904223);
            *v = (state >> 16) as u8;
        }
    }

    /// Fills `out` with deterministic 16-bit values from the same generator
    /// (bits 8..24 of the state per element).
    fn pseudo_random_u16(out: &mut [u16], seed: u32) {
        let mut state = seed;
        for v in out.iter_mut() {
            state = state.wrapping_mul(1664525).wrapping_add(1013904223);
            *v = (state >> 8) as u16;
        }
    }

    /// Row widths used by every test: values below, at, and just above
    /// multiples of the 8-pixel vector step, so both the vector loop and the
    /// scalar tail are exercised.
    const WIDTHS: &[usize] = &[
        1, 7, 8, 9, 15, 16, 17, 23, 24, 31, 32, 33, 47, 48, 64, 128, 130,
    ];

    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn avx2_copy_alpha_packed_u8x4_at_3_matches_scalar_widths() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        for &w in WIDTHS {
            let mut packed = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut packed, 0xC0FFEE);
            // Pre-fill the output so untouched colour bytes are checked too.
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xDECAF);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: AVX2 was detected above and both buffers hold `w * 4` bytes.
            unsafe { super::copy_alpha_packed_u8x4_at_3(&packed, &mut rgba_simd, w) };
            scalar::copy_alpha_packed_u8x4_at_3(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn avx2_copy_alpha_packed_u16x4_to_u8_at_0_matches_scalar_widths() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        for &w in WIDTHS {
            let mut packed = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut packed, 0xCAB00D);
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xFEED);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: AVX2 was detected above and both buffers hold `w * 4` elements.
            unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn avx2_copy_alpha_packed_u16x4_at_0_matches_scalar_widths() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        for &w in WIDTHS {
            let mut packed = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut packed, 0xBEEF11);
            let mut rgba_simd = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut rgba_simd, 0x1337);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: AVX2 was detected above and both buffers hold `w * 4` elements.
            unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn avx2_copy_alpha_plane_u8_matches_scalar_widths() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        for &w in WIDTHS {
            let mut alpha = std::vec![0u8; w];
            pseudo_random_u8(&mut alpha, 0xABCDEF);
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0x123456);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: AVX2 was detected above; `alpha` holds `w` bytes and the
            // output holds `w * 4` bytes.
            unsafe { super::copy_alpha_plane_u8(&alpha, &mut rgba_simd, w) };
            scalar::copy_alpha_plane_u8(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn avx2_copy_alpha_plane_u16_to_u8_matches_scalar_widths_bits10() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        for &w in WIDTHS {
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xC0DE);
            // Restrict the input to valid 10-bit samples.
            for v in alpha.iter_mut() {
                *v &= 0x03FF;
            }
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xBABE);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: AVX2 was detected above; `alpha` holds `w` samples and
            // the output holds `w * 4` bytes.
            unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) };
            scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn avx2_copy_alpha_plane_u16_to_u8_matches_scalar_widths_bits12() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        for &w in WIDTHS {
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xF00BAA);
            // Restrict the input to valid 12-bit samples.
            for v in alpha.iter_mut() {
                *v &= 0x0FFF;
            }
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0x5EED);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: AVX2 was detected above; `alpha` holds `w` samples and
            // the output holds `w * 4` bytes.
            unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) };
            scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn avx2_copy_alpha_plane_u16_matches_scalar_widths() {
        if !std::arch::is_x86_feature_detected!("avx2") {
            return;
        }
        for &w in WIDTHS {
            // The samples are deliberately left unmasked: bits above the
            // 10-bit range are set in the input, so this also checks that both
            // implementations treat out-of-range bits the same way.
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xDEADBE);
            let mut rgba_simd = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut rgba_simd, 0xFADE);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: AVX2 was detected above; `alpha` holds `w` samples and
            // the output holds `w * 4` elements.
            unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) };
            scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }
}