#![cfg_attr(not(feature = "std"), allow(dead_code))]
use core::arch::aarch64::*;
use crate::row::scalar::alpha_extract as scalar;
/// Overwrites byte 3 of every 4-byte pixel in `rgba_out` with byte 3 of the
/// corresponding 4-byte pixel in `packed`, for the first `width` pixels.
///
/// Bytes 0..=2 of each output pixel are read and written back unchanged.
/// Whole groups of 16 pixels are processed with NEON; any leftover pixels are
/// forwarded to the scalar kernel of the same name.
///
/// # Safety
///
/// The caller must ensure that NEON is available on the executing CPU and that
/// both `packed` and `rgba_out` hold at least `width * 4` elements. The length
/// requirement is only verified by `debug_assert!`.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn copy_alpha_packed_u8x4_at_3(packed: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    const PIXELS_PER_ITER: usize = 16;
    // Largest multiple of the vector width that fits in `width`.
    let vector_end = width - width % PIXELS_PER_ITER;
    let mut done = 0usize;

    // SAFETY: every iteration touches elements `base .. base + 64` of both
    // slices with `base + 64 <= width * 4`, which the caller guarantees is in
    // bounds; NEON availability is part of this function's contract.
    unsafe {
        while done < vector_end {
            let base = done * 4;
            // De-interleaving loads: field `.N` holds byte N of 16 pixels.
            let source_px = vld4q_u8(packed.as_ptr().add(base));
            let output_px = vld4q_u8(rgba_out.as_ptr().add(base));
            vst4q_u8(
                rgba_out.as_mut_ptr().add(base),
                uint8x16x4_t(output_px.0, output_px.1, output_px.2, source_px.3),
            );
            done += PIXELS_PER_ITER;
        }
    }

    let leftover = width - done;
    if leftover > 0 {
        let tail = done * 4..width * 4;
        scalar::copy_alpha_packed_u8x4_at_3(&packed[tail.clone()], &mut rgba_out[tail], leftover);
    }
}
/// Takes element 0 of every 4-element `u16` pixel in `packed`, keeps its high
/// 8 bits, and stores the result as byte 3 of the corresponding 4-byte pixel
/// in `rgba_out`, for the first `width` pixels.
///
/// Bytes 0..=2 of each output pixel are read and written back unchanged.
/// Whole groups of 8 pixels are processed with NEON; any leftover pixels are
/// forwarded to the scalar kernel of the same name, instantiated with its
/// const flag set to `false`.
///
/// # Safety
///
/// The caller must ensure that NEON is available on the executing CPU and that
/// both `packed` and `rgba_out` hold at least `width * 4` elements. The length
/// requirement is only verified by `debug_assert!`.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
    packed: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    const PIXELS_PER_ITER: usize = 8;
    // Largest multiple of the vector width that fits in `width`.
    let vector_end = width - width % PIXELS_PER_ITER;
    let mut done = 0usize;

    // SAFETY: every iteration touches elements `base .. base + 32` of both
    // slices with `base + 32 <= width * 4`, which the caller guarantees is in
    // bounds; NEON availability is part of this function's contract.
    unsafe {
        while done < vector_end {
            // Source and destination share the same element offset: both
            // layouts use four elements per pixel.
            let base = done * 4;
            let source_px = vld4q_u16(packed.as_ptr().add(base));
            // Narrowing right shift by 8: keeps the high byte of each sample.
            let alpha_u8 = vshrn_n_u16::<8>(source_px.0);
            let output_px = vld4_u8(rgba_out.as_ptr().add(base));
            vst4_u8(
                rgba_out.as_mut_ptr().add(base),
                uint8x8x4_t(output_px.0, output_px.1, output_px.2, alpha_u8),
            );
            done += PIXELS_PER_ITER;
        }
    }

    let leftover = width - done;
    if leftover > 0 {
        let tail = done * 4..width * 4;
        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
            &packed[tail.clone()],
            &mut rgba_out[tail],
            leftover,
        );
    }
}
/// Copies element 0 of every 4-element `u16` pixel in `packed` into element 3
/// of the corresponding 4-element `u16` pixel in `rgba_out`, for the first
/// `width` pixels.
///
/// Elements 0..=2 of each output pixel are read and written back unchanged.
/// Whole groups of 8 pixels are processed with NEON; any leftover pixels are
/// forwarded to the scalar kernel of the same name, instantiated with its
/// const flag set to `false`.
///
/// # Safety
///
/// The caller must ensure that NEON is available on the executing CPU and that
/// both `packed` and `rgba_out` hold at least `width * 4` elements. The length
/// requirement is only verified by `debug_assert!`.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
    packed: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    const PIXELS_PER_ITER: usize = 8;
    // Largest multiple of the vector width that fits in `width`.
    let vector_end = width - width % PIXELS_PER_ITER;
    let mut done = 0usize;

    // SAFETY: every iteration touches elements `base .. base + 32` of both
    // slices with `base + 32 <= width * 4`, which the caller guarantees is in
    // bounds; NEON availability is part of this function's contract.
    unsafe {
        while done < vector_end {
            let base = done * 4;
            // De-interleaving loads: field `.N` holds element N of 8 pixels.
            let source_px = vld4q_u16(packed.as_ptr().add(base));
            let output_px = vld4q_u16(rgba_out.as_ptr().add(base));
            vst4q_u16(
                rgba_out.as_mut_ptr().add(base),
                uint16x8x4_t(output_px.0, output_px.1, output_px.2, source_px.0),
            );
            done += PIXELS_PER_ITER;
        }
    }

    let leftover = width - done;
    if leftover > 0 {
        let tail = done * 4..width * 4;
        scalar::copy_alpha_packed_u16x4_at_0::<false>(
            &packed[tail.clone()],
            &mut rgba_out[tail],
            leftover,
        );
    }
}
/// Writes `alpha[i]` into byte 3 of the `i`-th 4-byte pixel of `rgba_out`, for
/// the first `width` pixels.
///
/// Bytes 0..=2 of each output pixel are read and written back unchanged.
/// Whole groups of 16 pixels are processed with NEON; any leftover pixels are
/// forwarded to the scalar kernel of the same name.
///
/// # Safety
///
/// The caller must ensure that NEON is available on the executing CPU, that
/// `alpha` holds at least `width` elements, and that `rgba_out` holds at least
/// `width * 4` elements. The length requirements are only verified by
/// `debug_assert!`.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    const PIXELS_PER_ITER: usize = 16;
    // Largest multiple of the vector width that fits in `width`.
    let vector_end = width - width % PIXELS_PER_ITER;
    let mut done = 0usize;

    // SAFETY: every iteration reads `alpha[done .. done + 16]` and touches
    // `rgba_out[base .. base + 64]` with `done + 16 <= width`, which the
    // caller guarantees is in bounds; NEON availability is part of this
    // function's contract.
    unsafe {
        while done < vector_end {
            let plane_alpha = vld1q_u8(alpha.as_ptr().add(done));
            let base = done * 4;
            let output_px = vld4q_u8(rgba_out.as_ptr().add(base));
            vst4q_u8(
                rgba_out.as_mut_ptr().add(base),
                uint8x16x4_t(output_px.0, output_px.1, output_px.2, plane_alpha),
            );
            done += PIXELS_PER_ITER;
        }
    }

    let leftover = width - done;
    if leftover > 0 {
        scalar::copy_alpha_plane_u8(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            leftover,
        );
    }
}
/// Reduces each `BITS`-bit sample of `alpha` to 8 bits and writes it into
/// byte 3 of the matching 4-byte pixel of `rgba_out`, for the first `width`
/// pixels.
///
/// Each sample is masked to its low `BITS` bits and then shifted right by
/// `BITS - 8`. Bytes 0..=2 of each output pixel are read and written back
/// unchanged. Whole groups of 8 pixels are processed with NEON; any leftover
/// pixels are forwarded to the scalar kernel of the same name, instantiated
/// with `BITS` and its const flag set to `false`.
///
/// `BITS` must lie in `[8, 16]`; this is enforced at compile time.
///
/// # Safety
///
/// The caller must ensure that NEON is available on the executing CPU, that
/// `alpha` holds at least `width` elements, and that `rgba_out` holds at least
/// `width * 4` elements. The length requirements are only verified by
/// `debug_assert!`.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
    alpha: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    const {
        assert!(BITS >= 8 && BITS <= 16, "BITS must be in [8, 16]");
    }
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    const PIXELS_PER_ITER: usize = 8;
    // Largest multiple of the vector width that fits in `width`.
    let vector_end = width - width % PIXELS_PER_ITER;
    let mut done = 0usize;

    // SAFETY: every iteration reads `alpha[done .. done + 8]` and touches
    // `rgba_out[base .. base + 32]` with `done + 8 <= width`, which the caller
    // guarantees is in bounds; NEON availability is part of this function's
    // contract.
    unsafe {
        let low_bits = vdupq_n_u16(((1u32 << BITS) - 1) as u16);
        // `vshlq_u16` shifts right when the per-lane count is negative, so a
        // count of `8 - BITS` performs a right shift by `BITS - 8`.
        let right_shift = vdupq_n_s16(8 - (BITS as i16));

        while done < vector_end {
            let samples = vld1q_u16(alpha.as_ptr().add(done));
            let reduced = vshlq_u16(vandq_u16(samples, low_bits), right_shift);
            // After the shift each lane fits in 8 bits; take the low byte.
            let alpha_u8 = vmovn_u16(reduced);
            let base = done * 4;
            let output_px = vld4_u8(rgba_out.as_ptr().add(base));
            vst4_u8(
                rgba_out.as_mut_ptr().add(base),
                uint8x8x4_t(output_px.0, output_px.1, output_px.2, alpha_u8),
            );
            done += PIXELS_PER_ITER;
        }
    }

    let leftover = width - done;
    if leftover > 0 {
        scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            leftover,
        );
    }
}
/// Writes each sample of `alpha`, masked to its low `BITS` bits, into element
/// 3 of the matching 4-element `u16` pixel of `rgba_out`, for the first
/// `width` pixels.
///
/// Elements 0..=2 of each output pixel are read and written back unchanged.
/// Whole groups of 8 pixels are processed with NEON; any leftover pixels are
/// forwarded to the scalar kernel of the same name, instantiated with `BITS`
/// and its const flag set to `false`.
///
/// `BITS` must lie in `[1, 16]`; this is enforced at compile time.
///
/// # Safety
///
/// The caller must ensure that NEON is available on the executing CPU, that
/// `alpha` holds at least `width` elements, and that `rgba_out` holds at least
/// `width * 4` elements. The length requirements are only verified by
/// `debug_assert!`.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
    alpha: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    const {
        assert!(BITS > 0 && BITS <= 16, "BITS must be in [1, 16]");
    }
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    const PIXELS_PER_ITER: usize = 8;
    // Largest multiple of the vector width that fits in `width`.
    let vector_end = width - width % PIXELS_PER_ITER;
    let mut done = 0usize;

    // SAFETY: every iteration reads `alpha[done .. done + 8]` and touches
    // `rgba_out[base .. base + 32]` with `done + 8 <= width`, which the caller
    // guarantees is in bounds; NEON availability is part of this function's
    // contract.
    unsafe {
        let low_bits = vdupq_n_u16(((1u32 << BITS) - 1) as u16);

        while done < vector_end {
            let samples = vld1q_u16(alpha.as_ptr().add(done));
            let plane_alpha = vandq_u16(samples, low_bits);
            let base = done * 4;
            let output_px = vld4q_u16(rgba_out.as_ptr().add(base));
            vst4q_u16(
                rgba_out.as_mut_ptr().add(base),
                uint16x8x4_t(output_px.0, output_px.1, output_px.2, plane_alpha),
            );
            done += PIXELS_PER_ITER;
        }
    }

    let leftover = width - done;
    if leftover > 0 {
        scalar::copy_alpha_plane_u16::<BITS, false>(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            leftover,
        );
    }
}
// Equivalence tests: each NEON kernel must produce exactly the same output
// buffer as its scalar counterpart for a range of row widths.
//
// Every item below carries the same feature gate as the kernel it exercises,
// so the test build still compiles when only a subset of the kernel features
// is enabled.
#[cfg(all(test, feature = "std"))]
mod tests {
    #[cfg(any(feature = "yuv-444-packed", feature = "gbr", feature = "yuva"))]
    use crate::row::scalar::alpha_extract as scalar;

    /// Fills `out` with deterministic pseudo-random bytes from a linear
    /// congruential generator seeded with `seed`.
    #[cfg(any(feature = "yuv-444-packed", feature = "gbr", feature = "yuva"))]
    fn pseudo_random_u8(out: &mut [u8], seed: u32) {
        let mut state = seed;
        for v in out.iter_mut() {
            state = state.wrapping_mul(1664525).wrapping_add(1013904223);
            *v = (state >> 16) as u8;
        }
    }

    /// Fills `out` with deterministic pseudo-random 16-bit values from a
    /// linear congruential generator seeded with `seed`.
    #[cfg(any(feature = "yuv-444-packed", feature = "gbr", feature = "yuva"))]
    fn pseudo_random_u16(out: &mut [u16], seed: u32) {
        let mut state = seed;
        for v in out.iter_mut() {
            state = state.wrapping_mul(1664525).wrapping_add(1013904223);
            *v = (state >> 8) as u16;
        }
    }

    /// Row widths under test: values below, at, and just above multiples of
    /// the 8- and 16-pixel vector steps, so both the SIMD loop and the scalar
    /// tail are exercised.
    #[cfg(any(feature = "yuv-444-packed", feature = "gbr", feature = "yuva"))]
    const WIDTHS: &[usize] = &[1, 7, 8, 9, 15, 16, 17, 23, 24, 31, 32, 33, 128, 130];

    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn neon_copy_alpha_packed_u8x4_at_3_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut packed = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut packed, 0xC0FFEE);
            let mut rgba_simd = std::vec![1u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xDECAF);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: both buffers hold exactly `w * 4` elements.
            unsafe { super::copy_alpha_packed_u8x4_at_3(&packed, &mut rgba_simd, w) };
            scalar::copy_alpha_packed_u8x4_at_3(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn neon_copy_alpha_packed_u16x4_to_u8_at_0_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut packed = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut packed, 0xCAB00D);
            let mut rgba_simd = std::vec![1u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xFEED);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: both buffers hold exactly `w * 4` elements.
            unsafe { super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w) };
            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn neon_copy_alpha_packed_u16x4_at_0_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut packed = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut packed, 0xBEEF11);
            let mut rgba_simd = std::vec![1u16; w * 4];
            pseudo_random_u16(&mut rgba_simd, 0x1337);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: both buffers hold exactly `w * 4` elements.
            unsafe { super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w) };
            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn neon_copy_alpha_plane_u8_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut alpha = std::vec![0u8; w];
            pseudo_random_u8(&mut alpha, 0xABCDEF);
            let mut rgba_simd = std::vec![1u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0x123456);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: `alpha` holds `w` elements and the output `w * 4`.
            unsafe { super::copy_alpha_plane_u8(&alpha, &mut rgba_simd, w) };
            scalar::copy_alpha_plane_u8(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn neon_copy_alpha_plane_u16_to_u8_matches_scalar_widths_bits10() {
        for &w in WIDTHS {
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xC0DE);
            // Restrict the samples to the 10-bit range under test.
            for v in alpha.iter_mut() {
                *v &= 0x03FF;
            }
            let mut rgba_simd = std::vec![1u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xBABE);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: `alpha` holds `w` elements and the output `w * 4`.
            unsafe { super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w) };
            scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn neon_copy_alpha_plane_u16_to_u8_matches_scalar_widths_bits12() {
        for &w in WIDTHS {
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xF00BAA);
            // Restrict the samples to the 12-bit range under test.
            for v in alpha.iter_mut() {
                *v &= 0x0FFF;
            }
            let mut rgba_simd = std::vec![1u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0x5EED);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: `alpha` holds `w` elements and the output `w * 4`.
            unsafe { super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w) };
            scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn neon_copy_alpha_plane_u16_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xDEADBE);
            let mut rgba_simd = std::vec![1u16; w * 4];
            pseudo_random_u16(&mut rgba_simd, 0xFADE);
            let mut rgba_scalar = rgba_simd.clone();
            // SAFETY: `alpha` holds `w` elements and the output `w * 4`.
            unsafe { super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w) };
            scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }
}