#![cfg_attr(not(feature = "std"), allow(dead_code))]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::row::scalar::alpha_extract as scalar;
/// Overwrites byte 3 of every 4-byte output pixel in `rgba_out` with byte 3 of
/// the matching 4-byte pixel in `packed`; the other three bytes of each output
/// pixel keep their current value.
///
/// Pixels are processed 16 at a time (one 512-bit vector). Any remaining pixels
/// are handed to the scalar implementation of the same name.
///
/// # Safety
///
/// * The running CPU must support AVX-512F and AVX-512BW.
/// * `packed` and `rgba_out` must each contain at least `width * 4` bytes. This
///   is only checked by `debug_assert!`; the vector loop uses unchecked pointer
///   arithmetic.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn copy_alpha_packed_u8x4_at_3(packed: &[u8], rgba_out: &mut [u8], width: usize) {
  debug_assert!(packed.len() >= width * 4, "packed too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

  const PIXELS_PER_VECTOR: usize = 16;
  // One mask bit per byte: bit 3 of every nibble selects byte 3 of each pixel.
  const ALPHA_BYTE_LANES: __mmask64 = 0x8888_8888_8888_8888u64;

  // Number of leading pixels covered by whole vectors.
  let vector_pixels = width - width % PIXELS_PER_VECTOR;

  // SAFETY: each block starts at pixel `block * 16` and `block * 16 + 16 <= width`,
  // so every 64-byte access stays inside the first `width * 4` bytes that the
  // caller guarantees for both slices.
  unsafe {
    for block in 0..vector_pixels / PIXELS_PER_VECTOR {
      let byte_off = block * PIXELS_PER_VECTOR * 4;
      let incoming = _mm512_loadu_si512(packed.as_ptr().add(byte_off).cast());
      let existing = _mm512_loadu_si512(rgba_out.as_ptr().add(byte_off).cast());
      // Take the masked bytes from `incoming`, everything else from `existing`.
      let blended = _mm512_mask_blend_epi8(ALPHA_BYTE_LANES, existing, incoming);
      _mm512_storeu_si512(rgba_out.as_mut_ptr().add(byte_off).cast(), blended);
    }
  }

  // Scalar tail for the pixels that do not fill a whole vector.
  if vector_pixels < width {
    scalar::copy_alpha_packed_u8x4_at_3(
      &packed[vector_pixels * 4..width * 4],
      &mut rgba_out[vector_pixels * 4..width * 4],
      width - vector_pixels,
    );
  }
}
/// For every pixel, takes the high byte of `u16` element 0 of the 4-element
/// group in `packed` and stores it into byte 3 of the matching 4-byte pixel in
/// `rgba_out`; the other three bytes of each output pixel keep their value.
///
/// Pixels are processed 16 at a time (two 512-bit source vectors, one 512-bit
/// destination vector). Any remaining pixels are handed to the scalar
/// implementation of the same name, instantiated with its const argument set to
/// `false`.
///
/// # Safety
///
/// * The running CPU must support AVX-512F and AVX-512BW.
/// * `packed` must contain at least `width * 4` `u16` elements and `rgba_out`
///   at least `width * 4` bytes. This is only checked by `debug_assert!`; the
///   vector loop uses unchecked pointer arithmetic.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
  packed: &[u16],
  rgba_out: &mut [u8],
  width: usize,
) {
  debug_assert!(packed.len() >= width * 4, "packed too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

  const PIXELS_PER_ITER: usize = 16;
  const U16_PER_VECTOR: usize = 32;
  // One mask bit per byte: bit 3 of every nibble selects byte 3 of each pixel.
  const ALPHA_BYTE_LANES: __mmask64 = 0x8888_8888_8888_8888u64;

  // Number of leading pixels covered by whole iterations.
  let vector_pixels = width - width % PIXELS_PER_ITER;

  // SAFETY: each block starts at pixel `block * 16` and `block * 16 + 16 <= width`,
  // so the two 32-element source loads and the 64-byte destination access stay
  // inside the first `width * 4` elements that the caller guarantees.
  unsafe {
    // `packus` interleaves its two inputs per 128-bit lane; this qword order
    // puts the bytes of the first input first, then those of the second.
    let qword_order = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
    // Per 128-bit lane: move bytes 0, 4, 8, 12 to bytes 3, 7, 11, 15 and zero
    // every other byte.
    #[rustfmt::skip]
    let slot0_to_byte3 = _mm512_set_epi8(
      12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0, -1, -1, -1,
      12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0, -1, -1, -1,
      12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0, -1, -1, -1,
      12, -1, -1, -1, 8, -1, -1, -1, 4, -1, -1, -1, 0, -1, -1, -1,
    );

    for block in 0..vector_pixels / PIXELS_PER_ITER {
      let elem_off = block * PIXELS_PER_ITER * 4;
      let src = packed.as_ptr().add(elem_off);

      // Keep only the high byte of every u16.
      let first_half = _mm512_srli_epi16::<8>(_mm512_loadu_si512(src.cast()));
      let second_half = _mm512_srli_epi16::<8>(_mm512_loadu_si512(src.add(U16_PER_VECTOR).cast()));

      // Narrow 64 u16 values to 64 bytes in source order.
      let interleaved = _mm512_packus_epi16(first_half, second_half);
      let in_order = _mm512_permutexvar_epi64(qword_order, interleaved);
      let alpha_bytes = _mm512_shuffle_epi8(in_order, slot0_to_byte3);

      let existing = _mm512_loadu_si512(rgba_out.as_ptr().add(elem_off).cast());
      let blended = _mm512_mask_blend_epi8(ALPHA_BYTE_LANES, existing, alpha_bytes);
      _mm512_storeu_si512(rgba_out.as_mut_ptr().add(elem_off).cast(), blended);
    }
  }

  // Scalar tail for the pixels that do not fill a whole iteration.
  if vector_pixels < width {
    scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
      &packed[vector_pixels * 4..width * 4],
      &mut rgba_out[vector_pixels * 4..width * 4],
      width - vector_pixels,
    );
  }
}
/// For every pixel, copies `u16` element 0 of the 4-element group in `packed`
/// into `u16` element 3 of the matching 4-element group in `rgba_out`; the
/// other three elements of each output pixel keep their value.
///
/// Pixels are processed 16 at a time, as two 512-bit halves of 8 pixels each.
/// Any remaining pixels are handed to the scalar implementation of the same
/// name, instantiated with its const argument set to `false`.
///
/// # Safety
///
/// * The running CPU must support AVX-512F and AVX-512BW.
/// * `packed` and `rgba_out` must each contain at least `width * 4` `u16`
///   elements. This is only checked by `debug_assert!`; the vector loop uses
///   unchecked pointer arithmetic.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
  packed: &[u16],
  rgba_out: &mut [u16],
  width: usize,
) {
  debug_assert!(packed.len() >= width * 4, "packed too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

  const PIXELS_PER_ITER: usize = 16;
  const U16_PER_VECTOR: usize = 32;
  // One mask bit per u16: bit 3 of every nibble selects element 3 of each pixel.
  const ALPHA_U16_LANES: __mmask32 = 0x8888_8888u32;

  // Number of leading pixels covered by whole iterations.
  let vector_pixels = width - width % PIXELS_PER_ITER;

  // SAFETY: each block starts at pixel `block * 16` and `block * 16 + 16 <= width`,
  // so both 32-element halves stay inside the first `width * 4` elements that
  // the caller guarantees for both slices.
  unsafe {
    // Per 128-bit lane (two pixels): move bytes 0..=1 to bytes 6..=7 and bytes
    // 8..=9 to bytes 14..=15, zeroing every other byte.
    #[rustfmt::skip]
    let slot0_to_slot3 = _mm512_set_epi8(
      9, 8, -1, -1, -1, -1, -1, -1, 1, 0, -1, -1, -1, -1, -1, -1,
      9, 8, -1, -1, -1, -1, -1, -1, 1, 0, -1, -1, -1, -1, -1, -1,
      9, 8, -1, -1, -1, -1, -1, -1, 1, 0, -1, -1, -1, -1, -1, -1,
      9, 8, -1, -1, -1, -1, -1, -1, 1, 0, -1, -1, -1, -1, -1, -1,
    );

    for block in 0..vector_pixels / PIXELS_PER_ITER {
      let block_off = block * PIXELS_PER_ITER * 4;
      // The two halves touch disjoint memory, so they are handled one after
      // the other with the same sequence of operations.
      for half_off in [0, U16_PER_VECTOR] {
        let elem_off = block_off + half_off;
        let incoming = _mm512_loadu_si512(packed.as_ptr().add(elem_off).cast());
        let existing = _mm512_loadu_si512(rgba_out.as_ptr().add(elem_off).cast());
        let alpha_in_slot3 = _mm512_shuffle_epi8(incoming, slot0_to_slot3);
        let blended = _mm512_mask_blend_epi16(ALPHA_U16_LANES, existing, alpha_in_slot3);
        _mm512_storeu_si512(rgba_out.as_mut_ptr().add(elem_off).cast(), blended);
      }
    }
  }

  // Scalar tail for the pixels that do not fill a whole iteration.
  if vector_pixels < width {
    scalar::copy_alpha_packed_u16x4_at_0::<false>(
      &packed[vector_pixels * 4..width * 4],
      &mut rgba_out[vector_pixels * 4..width * 4],
      width - vector_pixels,
    );
  }
}
/// Writes `alpha[i]` into byte 3 of the `i`-th 4-byte pixel of `rgba_out` for
/// `i` in `0..width`; the other three bytes of each pixel keep their value.
///
/// Pixels are processed 16 at a time (16 alpha bytes expanded to one 512-bit
/// vector). Any remaining pixels are handed to the scalar implementation of the
/// same name.
///
/// # Safety
///
/// * The running CPU must support AVX-512F and AVX-512BW.
/// * `alpha` must contain at least `width` bytes and `rgba_out` at least
///   `width * 4` bytes. This is only checked by `debug_assert!`; the vector
///   loop uses unchecked pointer arithmetic.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usize) {
  debug_assert!(alpha.len() >= width, "alpha plane too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

  const PIXELS_PER_VECTOR: usize = 16;
  // One mask bit per byte: bit 3 of every nibble selects byte 3 of each pixel.
  const ALPHA_BYTE_LANES: __mmask64 = 0x8888_8888_8888_8888u64;

  // Number of leading pixels covered by whole vectors.
  let vector_pixels = width - width % PIXELS_PER_VECTOR;

  // SAFETY: each block starts at pixel `first` with `first + 16 <= width`, so the
  // 16-byte alpha load and the 64-byte output access stay inside the lengths
  // the caller guarantees.
  unsafe {
    for block in 0..vector_pixels / PIXELS_PER_VECTOR {
      let first = block * PIXELS_PER_VECTOR;
      let out_off = first * 4;

      let alpha_bytes = _mm_loadu_si128(alpha.as_ptr().add(first).cast());
      // Zero-extend each byte to 32 bits, then move it to the top byte.
      let alpha_in_byte3 = _mm512_slli_epi32::<24>(_mm512_cvtepu8_epi32(alpha_bytes));

      let existing = _mm512_loadu_si512(rgba_out.as_ptr().add(out_off).cast());
      let blended = _mm512_mask_blend_epi8(ALPHA_BYTE_LANES, existing, alpha_in_byte3);
      _mm512_storeu_si512(rgba_out.as_mut_ptr().add(out_off).cast(), blended);
    }
  }

  // Scalar tail for the pixels that do not fill a whole vector.
  if vector_pixels < width {
    scalar::copy_alpha_plane_u8(
      &alpha[vector_pixels..width],
      &mut rgba_out[vector_pixels * 4..width * 4],
      width - vector_pixels,
    );
  }
}
/// Reduces each `u16` sample of `alpha` to 8 bits and writes it into byte 3 of
/// the matching 4-byte pixel of `rgba_out`; the other three bytes of each pixel
/// keep their value.
///
/// Each sample is first masked to its low `BITS` bits and then shifted right by
/// `BITS - 8`. `BITS` must lie in `[8, 16]`, which is enforced at compile time.
///
/// Pixels are processed 16 at a time. Any remaining pixels are handed to the
/// scalar implementation of the same name, instantiated with `BITS` and a
/// second const argument of `false`.
///
/// # Safety
///
/// * The running CPU must support AVX-512F and AVX-512BW.
/// * `alpha` must contain at least `width` `u16` elements and `rgba_out` at
///   least `width * 4` bytes. This is only checked by `debug_assert!`; the
///   vector loop uses unchecked pointer arithmetic.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
  alpha: &[u16],
  rgba_out: &mut [u8],
  width: usize,
) {
  const {
    assert!(BITS >= 8 && BITS <= 16, "BITS must be in [8, 16]");
  }
  debug_assert!(alpha.len() >= width, "alpha plane too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

  const PIXELS_PER_VECTOR: usize = 16;
  // One mask bit per byte: bit 3 of every nibble selects byte 3 of each pixel.
  const ALPHA_BYTE_LANES: __mmask64 = 0x8888_8888_8888_8888u64;

  // Number of leading pixels covered by whole vectors.
  let vector_pixels = width - width % PIXELS_PER_VECTOR;

  // SAFETY: each block starts at pixel `first` with `first + 16 <= width`, so the
  // 16-element alpha load and the 64-byte output access stay inside the
  // lengths the caller guarantees.
  unsafe {
    let down_shift = _mm_cvtsi32_si128((BITS as i32) - 8);
    let sample_mask = _mm256_set1_epi16(((1u32 << BITS) - 1) as i16);

    for block in 0..vector_pixels / PIXELS_PER_VECTOR {
      let first = block * PIXELS_PER_VECTOR;
      let out_off = first * 4;

      let raw = _mm256_loadu_si256(alpha.as_ptr().add(first).cast());
      let masked = _mm256_and_si256(raw, sample_mask);
      let reduced = _mm256_srl_epi16(masked, down_shift);

      // Narrow to bytes; `packus` works per 128-bit lane, so the qword
      // permute gathers the 16 result bytes into the low 128 bits in order.
      let per_lane = _mm256_packus_epi16(reduced, _mm256_setzero_si256());
      let gathered = _mm256_permute4x64_epi64::<0xD8>(per_lane);
      let alpha_bytes = _mm256_castsi256_si128(gathered);

      // Zero-extend each byte to 32 bits, then move it to the top byte.
      let alpha_in_byte3 = _mm512_slli_epi32::<24>(_mm512_cvtepu8_epi32(alpha_bytes));

      let existing = _mm512_loadu_si512(rgba_out.as_ptr().add(out_off).cast());
      let blended = _mm512_mask_blend_epi8(ALPHA_BYTE_LANES, existing, alpha_in_byte3);
      _mm512_storeu_si512(rgba_out.as_mut_ptr().add(out_off).cast(), blended);
    }
  }

  // Scalar tail for the pixels that do not fill a whole vector.
  if vector_pixels < width {
    scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
      &alpha[vector_pixels..width],
      &mut rgba_out[vector_pixels * 4..width * 4],
      width - vector_pixels,
    );
  }
}
/// Writes each `u16` sample of `alpha`, masked to its low `BITS` bits, into
/// `u16` element 3 of the matching 4-element pixel of `rgba_out`; the other
/// three elements of each pixel keep their value.
///
/// `BITS` must lie in `[1, 16]`, which is enforced at compile time.
///
/// Pixels are processed 16 at a time, as two groups of 8. Any remaining pixels
/// are handed to the scalar implementation of the same name, instantiated with
/// `BITS` and a second const argument of `false`.
///
/// # Safety
///
/// * The running CPU must support AVX-512F and AVX-512BW.
/// * `alpha` must contain at least `width` `u16` elements and `rgba_out` at
///   least `width * 4` `u16` elements. This is only checked by `debug_assert!`;
///   the vector loop uses unchecked pointer arithmetic.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
  alpha: &[u16],
  rgba_out: &mut [u16],
  width: usize,
) {
  const {
    assert!(BITS > 0 && BITS <= 16, "BITS must be in [1, 16]");
  }
  debug_assert!(alpha.len() >= width, "alpha plane too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

  const PIXELS_PER_ITER: usize = 16;
  const U16_PER_VECTOR: usize = 32;
  // One mask bit per u16: bit 3 of every nibble selects element 3 of each pixel.
  const ALPHA_U16_LANES: __mmask32 = 0x8888_8888u32;

  // Number of leading pixels covered by whole iterations.
  let vector_pixels = width - width % PIXELS_PER_ITER;

  // SAFETY: each block starts at pixel `first` with `first + 16 <= width`, so the
  // 16-element alpha load and both 32-element output accesses stay inside the
  // lengths the caller guarantees.
  unsafe {
    let sample_mask = _mm256_set1_epi16(((1u32 << BITS) - 1) as i16);

    for block in 0..vector_pixels / PIXELS_PER_ITER {
      let first = block * PIXELS_PER_ITER;
      let out_first = first * 4;
      let out_second = out_first + U16_PER_VECTOR;

      let raw = _mm256_loadu_si256(alpha.as_ptr().add(first).cast());
      let masked = _mm256_and_si256(raw, sample_mask);

      // Zero-extend each group of 8 samples to 64 bits and move every sample
      // to the top 16 bits of its 64-bit pixel.
      let first_samples = _mm256_castsi256_si128(masked);
      let second_samples = _mm256_extracti128_si256::<1>(masked);
      let first_in_slot3 = _mm512_slli_epi64::<48>(_mm512_cvtepu16_epi64(first_samples));
      let second_in_slot3 = _mm512_slli_epi64::<48>(_mm512_cvtepu16_epi64(second_samples));

      let existing_first = _mm512_loadu_si512(rgba_out.as_ptr().add(out_first).cast());
      let existing_second = _mm512_loadu_si512(rgba_out.as_ptr().add(out_second).cast());
      let blended_first = _mm512_mask_blend_epi16(ALPHA_U16_LANES, existing_first, first_in_slot3);
      let blended_second =
        _mm512_mask_blend_epi16(ALPHA_U16_LANES, existing_second, second_in_slot3);

      _mm512_storeu_si512(rgba_out.as_mut_ptr().add(out_first).cast(), blended_first);
      _mm512_storeu_si512(rgba_out.as_mut_ptr().add(out_second).cast(), blended_second);
    }
  }

  // Scalar tail for the pixels that do not fill a whole iteration.
  if vector_pixels < width {
    scalar::copy_alpha_plane_u16::<BITS, false>(
      &alpha[vector_pixels..width],
      &mut rgba_out[vector_pixels * 4..width * 4],
      width - vector_pixels,
    );
  }
}
// Every kernel in this file is compiled only when its cargo feature is
// enabled. Each test below is gated on the same feature(s) as the kernel it
// calls, and the module on their union, so the test build still compiles when
// only a subset of those features is turned on.
#[cfg(all(
  test,
  feature = "std",
  any(feature = "yuv-444-packed", feature = "gbr", feature = "yuva")
))]
mod tests {
  use crate::row::scalar::alpha_extract as scalar;

  /// Fills `out` with deterministic bytes from a linear congruential generator
  /// started at `seed`.
  fn pseudo_random_u8(out: &mut [u8], seed: u32) {
    let mut state = seed;
    for v in out.iter_mut() {
      state = state.wrapping_mul(1664525).wrapping_add(1013904223);
      *v = (state >> 16) as u8;
    }
  }

  /// Fills `out` with deterministic `u16` values from a linear congruential
  /// generator started at `seed`.
  fn pseudo_random_u16(out: &mut [u16], seed: u32) {
    let mut state = seed;
    for v in out.iter_mut() {
      state = state.wrapping_mul(1664525).wrapping_add(1013904223);
      *v = (state >> 8) as u16;
    }
  }

  /// Returns `true` when the running CPU supports both instruction-set
  /// extensions the kernels are compiled for.
  fn avx512_available() -> bool {
    std::arch::is_x86_feature_detected!("avx512f")
      && std::arch::is_x86_feature_detected!("avx512bw")
  }

  /// Row widths below, at, and above multiples of the 16-pixel vector step, so
  /// both the vector loop and the scalar tail are exercised.
  const WIDTHS: &[usize] = &[1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 47, 48, 64, 128, 130];

  #[cfg(feature = "yuv-444-packed")]
  #[test]
  #[cfg_attr(
    miri,
    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
  )]
  fn avx512_copy_alpha_packed_u8x4_at_3_matches_scalar_widths() {
    if !avx512_available() {
      return;
    }
    for &w in WIDTHS {
      let mut packed = std::vec![0u8; w * 4];
      pseudo_random_u8(&mut packed, 0xC0FFEE);
      let mut rgba_simd = std::vec![0u8; w * 4];
      pseudo_random_u8(&mut rgba_simd, 0xDECAF);
      let mut rgba_scalar = rgba_simd.clone();
      // SAFETY: CPU support was checked above; both buffers hold `w * 4` bytes.
      unsafe { super::copy_alpha_packed_u8x4_at_3(&packed, &mut rgba_simd, w) };
      scalar::copy_alpha_packed_u8x4_at_3(&packed, &mut rgba_scalar, w);
      assert_eq!(rgba_simd, rgba_scalar, "width={w}");
    }
  }

  #[cfg(feature = "yuv-444-packed")]
  #[test]
  #[cfg_attr(
    miri,
    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
  )]
  fn avx512_copy_alpha_packed_u16x4_to_u8_at_0_matches_scalar_widths() {
    if !avx512_available() {
      return;
    }
    for &w in WIDTHS {
      let mut packed = std::vec![0u16; w * 4];
      pseudo_random_u16(&mut packed, 0xCAB00D);
      let mut rgba_simd = std::vec![0u8; w * 4];
      pseudo_random_u8(&mut rgba_simd, 0xFEED);
      let mut rgba_scalar = rgba_simd.clone();
      // SAFETY: CPU support was checked above; both buffers hold `w * 4` elements.
      unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
      scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
      assert_eq!(rgba_simd, rgba_scalar, "width={w}");
    }
  }

  #[cfg(feature = "yuv-444-packed")]
  #[test]
  #[cfg_attr(
    miri,
    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
  )]
  fn avx512_copy_alpha_packed_u16x4_at_0_matches_scalar_widths() {
    if !avx512_available() {
      return;
    }
    for &w in WIDTHS {
      let mut packed = std::vec![0u16; w * 4];
      pseudo_random_u16(&mut packed, 0xBEEF11);
      let mut rgba_simd = std::vec![0u16; w * 4];
      pseudo_random_u16(&mut rgba_simd, 0x1337);
      let mut rgba_scalar = rgba_simd.clone();
      // SAFETY: CPU support was checked above; both buffers hold `w * 4` elements.
      unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
      scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
      assert_eq!(rgba_simd, rgba_scalar, "width={w}");
    }
  }

  #[cfg(any(feature = "gbr", feature = "yuva"))]
  #[test]
  #[cfg_attr(
    miri,
    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
  )]
  fn avx512_copy_alpha_plane_u8_matches_scalar_widths() {
    if !avx512_available() {
      return;
    }
    for &w in WIDTHS {
      let mut alpha = std::vec![0u8; w];
      pseudo_random_u8(&mut alpha, 0xABCDEF);
      let mut rgba_simd = std::vec![0u8; w * 4];
      pseudo_random_u8(&mut rgba_simd, 0x123456);
      let mut rgba_scalar = rgba_simd.clone();
      // SAFETY: CPU support was checked above; `alpha` holds `w` bytes and the
      // output holds `w * 4` bytes.
      unsafe { super::copy_alpha_plane_u8(&alpha, &mut rgba_simd, w) };
      scalar::copy_alpha_plane_u8(&alpha, &mut rgba_scalar, w);
      assert_eq!(rgba_simd, rgba_scalar, "width={w}");
    }
  }

  #[cfg(any(feature = "gbr", feature = "yuva"))]
  #[test]
  #[cfg_attr(
    miri,
    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
  )]
  fn avx512_copy_alpha_plane_u16_to_u8_matches_scalar_widths_bits10() {
    if !avx512_available() {
      return;
    }
    for &w in WIDTHS {
      let mut alpha = std::vec![0u16; w];
      pseudo_random_u16(&mut alpha, 0xC0DE);
      // Restrict the samples to 10 significant bits.
      for v in alpha.iter_mut() {
        *v &= 0x03FF;
      }
      let mut rgba_simd = std::vec![0u8; w * 4];
      pseudo_random_u8(&mut rgba_simd, 0xBABE);
      let mut rgba_scalar = rgba_simd.clone();
      // SAFETY: CPU support was checked above; `alpha` holds `w` elements and
      // the output holds `w * 4` bytes.
      unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) };
      scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
      assert_eq!(rgba_simd, rgba_scalar, "width={w}");
    }
  }

  #[cfg(any(feature = "gbr", feature = "yuva"))]
  #[test]
  #[cfg_attr(
    miri,
    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
  )]
  fn avx512_copy_alpha_plane_u16_to_u8_matches_scalar_widths_bits12() {
    if !avx512_available() {
      return;
    }
    for &w in WIDTHS {
      let mut alpha = std::vec![0u16; w];
      pseudo_random_u16(&mut alpha, 0xF00BAA);
      // Restrict the samples to 12 significant bits.
      for v in alpha.iter_mut() {
        *v &= 0x0FFF;
      }
      let mut rgba_simd = std::vec![0u8; w * 4];
      pseudo_random_u8(&mut rgba_simd, 0x5EED);
      let mut rgba_scalar = rgba_simd.clone();
      // SAFETY: CPU support was checked above; `alpha` holds `w` elements and
      // the output holds `w * 4` bytes.
      unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) };
      scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
      assert_eq!(rgba_simd, rgba_scalar, "width={w}");
    }
  }

  #[cfg(any(feature = "gbr", feature = "yuva"))]
  #[test]
  #[cfg_attr(
    miri,
    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
  )]
  fn avx512_copy_alpha_plane_u16_matches_scalar_widths() {
    if !avx512_available() {
      return;
    }
    for &w in WIDTHS {
      // The input is deliberately left unmasked: the samples may have bits set
      // above bit 9, and the SIMD and scalar results must still agree.
      let mut alpha = std::vec![0u16; w];
      pseudo_random_u16(&mut alpha, 0xDEADBE);
      let mut rgba_simd = std::vec![0u16; w * 4];
      pseudo_random_u16(&mut rgba_simd, 0xFADE);
      let mut rgba_scalar = rgba_simd.clone();
      // SAFETY: CPU support was checked above; `alpha` holds `w` elements and
      // the output holds `w * 4` elements.
      unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) };
      scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
      assert_eq!(rgba_simd, rgba_scalar, "width={w}");
    }
  }
}