#![cfg_attr(not(feature = "std"), allow(dead_code))]
use core::arch::wasm32::*;
use crate::row::scalar::alpha_extract as scalar;
/// Copies the alpha byte of each packed 4-byte pixel into an RGBA row.
///
/// For every pixel in `0..width`, byte 3 of the pixel in `packed` replaces
/// byte 3 of the corresponding pixel in `rgba_out`; bytes 0..=2 of the output
/// are left untouched. Four pixels are processed per 128-bit vector, and any
/// remaining 1..=3 pixels are delegated to the scalar implementation.
///
/// # Safety
///
/// * The `simd128` target feature must be available.
/// * `packed` and `rgba_out` must each hold at least `width * 4` bytes. This
///   is only checked with `debug_assert!`; the vector loop reads and writes
///   through raw pointers.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn copy_alpha_packed_u8x4_at_3(packed: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // All-ones in byte 3 of each 4-byte pixel: those bits come from the
    // source, everything else is kept from the destination.
    let select_alpha = i8x16(0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1);

    // Number of pixels covered by whole 4-pixel vectors.
    let vector_pixels = width / 4 * 4;
    let mut done = 0usize;

    // SAFETY: every iteration touches bytes `done * 4 .. done * 4 + 16` with
    // `done + 4 <= width`, and the caller guarantees both slices hold at
    // least `width * 4` bytes.
    unsafe {
        while done < vector_pixels {
            let byte_off = done * 4;
            let out_ptr = rgba_out.as_mut_ptr().add(byte_off).cast::<v128>();

            let incoming = v128_load(packed.as_ptr().add(byte_off).cast::<v128>());
            let existing = v128_load(out_ptr);
            v128_store(out_ptr, v128_bitselect(incoming, existing, select_alpha));

            done += 4;
        }
    }

    // Scalar tail for the trailing pixels that do not fill a vector.
    if done < width {
        scalar::copy_alpha_packed_u8x4_at_3(
            &packed[done * 4..width * 4],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Copies the alpha sample of each packed 4×u16 pixel into an 8-bit RGBA row.
///
/// For every pixel in `0..width`, the u16 at slot 0 of the pixel in `packed`
/// is shifted right by 8 (keeping its high byte) and written to byte 3 of the
/// corresponding 4-byte pixel in `rgba_out`; bytes 0..=2 of the output are
/// left untouched. Four pixels are processed per iteration, and any remaining
/// 1..=3 pixels are delegated to the scalar implementation.
///
/// # Safety
///
/// * The `simd128` target feature must be available.
/// * `packed` must hold at least `width * 4` u16 elements and `rgba_out` at
///   least `width * 4` bytes. This is only checked with `debug_assert!`; the
///   vector loop reads and writes through raw pointers.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn copy_alpha_packed_u16x4_to_u8_at_0(
    packed: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Byte 3 of each output pixel is taken from the computed alpha vector.
    let select_alpha = i8x16(0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1);
    // After narrowing, the alpha of pixel `i` sits at byte `4 * i`; move it
    // to byte `4 * i + 3`. Out-of-range indices (-1) produce zero bytes.
    let scatter_alpha = i8x16(
        -1, -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12,
    );

    // Number of pixels covered by whole 4-pixel iterations.
    let vector_pixels = width / 4 * 4;
    let mut done = 0usize;

    // SAFETY: every iteration reads u16 elements `done * 4 .. done * 4 + 16`
    // and touches output bytes `done * 4 .. done * 4 + 16` with
    // `done + 4 <= width`; the caller guarantees both slices hold at least
    // `width * 4` elements.
    unsafe {
        while done < vector_pixels {
            let elem_off = done * 4;
            let src = packed.as_ptr().add(elem_off);

            // Two vectors of 8 u16 each: pixels 0-1 and pixels 2-3.
            let first_pair = v128_load(src.cast::<v128>());
            let second_pair = v128_load(src.add(8).cast::<v128>());

            // Keep the high byte of every u16, then pack to 16 bytes.
            let high_bytes =
                u8x16_narrow_i16x8(u16x8_shr(first_pair, 8), u16x8_shr(second_pair, 8));
            let alpha_bytes = u8x16_swizzle(high_bytes, scatter_alpha);

            let out_ptr = rgba_out.as_mut_ptr().add(elem_off).cast::<v128>();
            let existing = v128_load(out_ptr);
            v128_store(out_ptr, v128_bitselect(alpha_bytes, existing, select_alpha));

            done += 4;
        }
    }

    // Scalar tail for the trailing pixels that do not fill an iteration.
    if done < width {
        scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(
            &packed[done * 4..width * 4],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Copies the alpha sample of each packed 4×u16 pixel into a 16-bit RGBA row.
///
/// For every pixel in `0..width`, the u16 at slot 0 of the pixel in `packed`
/// is written unchanged to slot 3 of the corresponding pixel in `rgba_out`;
/// slots 0..=2 of the output are left untouched. Four pixels (two 128-bit
/// vectors) are processed per iteration, and any remaining 1..=3 pixels are
/// delegated to the scalar implementation.
///
/// # Safety
///
/// * The `simd128` target feature must be available.
/// * `packed` and `rgba_out` must each hold at least `width * 4` u16
///   elements. This is only checked with `debug_assert!`; the vector loop
///   reads and writes through raw pointers.
#[cfg(feature = "yuv-444-packed")]
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn copy_alpha_packed_u16x4_at_0(
    packed: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 4, "packed too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Each vector holds two 8-byte pixels; bytes 6-7 and 14-15 are slot 3.
    let select_alpha = i8x16(0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1);
    // Moves slot 0 of each pixel (bytes 0-1 and 8-9) into slot 3. The same
    // pattern applies to both vectors of an iteration.
    let slot0_to_slot3 = i8x16(
        -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 8, 9,
    );

    // Number of pixels covered by whole 4-pixel iterations.
    let vector_pixels = width / 4 * 4;
    let mut done = 0usize;

    // SAFETY: every iteration touches u16 elements
    // `done * 4 .. done * 4 + 16` of both slices with `done + 4 <= width`;
    // the caller guarantees both slices hold at least `width * 4` elements.
    unsafe {
        while done < vector_pixels {
            let elem_off = done * 4;
            let src = packed.as_ptr().add(elem_off);
            let out = rgba_out.as_mut_ptr().add(elem_off);
            let out_first = out.cast::<v128>();
            let out_second = out.add(8).cast::<v128>();

            let src_first = v128_load(src.cast::<v128>());
            let src_second = v128_load(src.add(8).cast::<v128>());
            let dst_first = v128_load(out_first);
            let dst_second = v128_load(out_second);

            let alpha_first = u8x16_swizzle(src_first, slot0_to_slot3);
            let alpha_second = u8x16_swizzle(src_second, slot0_to_slot3);

            v128_store(out_first, v128_bitselect(alpha_first, dst_first, select_alpha));
            v128_store(out_second, v128_bitselect(alpha_second, dst_second, select_alpha));

            done += 4;
        }
    }

    // Scalar tail for the trailing pixels that do not fill an iteration.
    if done < width {
        scalar::copy_alpha_packed_u16x4_at_0::<false>(
            &packed[done * 4..width * 4],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Copies an 8-bit alpha plane into the alpha byte of an 8-bit RGBA row.
///
/// For every pixel `i` in `0..width`, `alpha[i]` is written to byte 3 of
/// pixel `i` in `rgba_out`; bytes 0..=2 of the output are left untouched.
/// Four pixels are processed per 128-bit vector, and any remaining 1..=3
/// pixels are delegated to the scalar implementation.
///
/// # Safety
///
/// * The `simd128` target feature must be available.
/// * `alpha` must hold at least `width` bytes and `rgba_out` at least
///   `width * 4` bytes. This is only checked with `debug_assert!`; the
///   vector loop reads and writes through raw pointers.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn copy_alpha_plane_u8(alpha: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Byte 3 of each output pixel is taken from the scattered alpha vector.
    let select_alpha = i8x16(0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1);
    // Spreads the four loaded alpha bytes 0..=3 to bytes 3, 7, 11 and 15.
    let scatter_alpha = i8x16(-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3);

    // Number of pixels covered by whole 4-pixel vectors.
    let vector_pixels = width / 4 * 4;
    let mut done = 0usize;

    // SAFETY: every iteration reads `alpha[done .. done + 4]` and touches
    // output bytes `done * 4 .. done * 4 + 16` with `done + 4 <= width`; the
    // caller guarantees `alpha` holds `width` bytes and `rgba_out` holds
    // `width * 4` bytes.
    unsafe {
        while done < vector_pixels {
            // Four alpha bytes in the low 32 bits, upper lanes zeroed.
            let four_alpha = v128_load32_zero(alpha.as_ptr().add(done).cast::<u32>());
            let alpha_bytes = u8x16_swizzle(four_alpha, scatter_alpha);

            let out_ptr = rgba_out.as_mut_ptr().add(done * 4).cast::<v128>();
            let existing = v128_load(out_ptr);
            v128_store(out_ptr, v128_bitselect(alpha_bytes, existing, select_alpha));

            done += 4;
        }
    }

    // Scalar tail for the trailing pixels that do not fill a vector.
    if done < width {
        scalar::copy_alpha_plane_u8(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Copies a `BITS`-bit alpha plane (stored as u16) into the alpha byte of an
/// 8-bit RGBA row.
///
/// For every pixel `i` in `0..width`, `alpha[i]` is masked to its low `BITS`
/// bits, shifted right by `BITS - 8`, and written to byte 3 of pixel `i` in
/// `rgba_out`; bytes 0..=2 of the output are left untouched. Four pixels are
/// processed per 128-bit vector, and any remaining 1..=3 pixels are delegated
/// to the scalar implementation.
///
/// `BITS` must be in `[8, 16]`; this is enforced at compile time.
///
/// # Safety
///
/// * The `simd128` target feature must be available.
/// * `alpha` must hold at least `width` u16 elements and `rgba_out` at least
///   `width * 4` bytes. This is only checked with `debug_assert!`; the
///   vector loop reads and writes through raw pointers.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn copy_alpha_plane_u16_to_u8<const BITS: u32>(
    alpha: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    const {
        assert!(BITS >= 8 && BITS <= 16, "BITS must be in [8, 16]");
    }
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Byte 3 of each output pixel is taken from the scattered alpha vector.
    let select_alpha = i8x16(0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1);
    // Spreads the four narrowed alpha bytes 0..=3 to bytes 3, 7, 11 and 15.
    let scatter_alpha = i8x16(-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3);
    // Keeps only the low `BITS` bits of every sample.
    let sample_mask = u16x8_splat(((1u32 << BITS) - 1) as u16);
    // Number of low bits dropped to reduce a `BITS`-bit sample to 8 bits.
    let drop_bits: u32 = BITS - 8;
    // Upper half fed to the narrowing step; those lanes are never selected.
    let zero_lanes = i16x8_splat(0);

    // Number of pixels covered by whole 4-pixel vectors.
    let vector_pixels = width / 4 * 4;
    let mut done = 0usize;

    // SAFETY: every iteration reads `alpha[done .. done + 4]` and touches
    // output bytes `done * 4 .. done * 4 + 16` with `done + 4 <= width`; the
    // caller guarantees `alpha` holds `width` elements and `rgba_out` holds
    // `width * 4` bytes.
    unsafe {
        while done < vector_pixels {
            // Four u16 samples in the low 64 bits, upper lanes zeroed.
            let raw = v128_load64_zero(alpha.as_ptr().add(done).cast::<u64>());
            let reduced = u16x8_shr(v128_and(raw, sample_mask), drop_bits);
            let narrowed = u8x16_narrow_i16x8(reduced, zero_lanes);
            let alpha_bytes = u8x16_swizzle(narrowed, scatter_alpha);

            let out_ptr = rgba_out.as_mut_ptr().add(done * 4).cast::<v128>();
            let existing = v128_load(out_ptr);
            v128_store(out_ptr, v128_bitselect(alpha_bytes, existing, select_alpha));

            done += 4;
        }
    }

    // Scalar tail for the trailing pixels that do not fill a vector.
    if done < width {
        scalar::copy_alpha_plane_u16_to_u8::<BITS, false>(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
/// Copies a `BITS`-bit alpha plane (stored as u16) into the alpha slot of a
/// 16-bit RGBA row.
///
/// For every pixel `i` in `0..width`, `alpha[i]` is masked to its low `BITS`
/// bits and written to u16 slot 3 of pixel `i` in `rgba_out`; slots 0..=2 of
/// the output are left untouched. Four pixels (two 128-bit output vectors)
/// are processed per iteration, and any remaining 1..=3 pixels are delegated
/// to the scalar implementation.
///
/// `BITS` must be in `[1, 16]`; this is enforced at compile time.
///
/// # Safety
///
/// * The `simd128` target feature must be available.
/// * `alpha` must hold at least `width` u16 elements and `rgba_out` at least
///   `width * 4` u16 elements. This is only checked with `debug_assert!`;
///   the vector loop reads and writes through raw pointers.
#[cfg(any(feature = "gbr", feature = "yuva"))]
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn copy_alpha_plane_u16<const BITS: u32>(
    alpha: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    const {
        assert!(BITS > 0 && BITS <= 16, "BITS must be in [1, 16]");
    }
    debug_assert!(alpha.len() >= width, "alpha plane too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");

    // Each output vector holds two 8-byte pixels; bytes 6-7 and 14-15 are
    // slot 3 of those pixels.
    let select_alpha = i8x16(0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, -1, -1);
    // Samples 0 and 1 (bytes 0-3) go to slot 3 of the first output vector.
    let place_first = i8x16(
        -1, -1, -1, -1, -1, -1, 0, 1, -1, -1, -1, -1, -1, -1, 2, 3,
    );
    // Samples 2 and 3 (bytes 4-7) go to slot 3 of the second output vector.
    let place_second = i8x16(
        -1, -1, -1, -1, -1, -1, 4, 5, -1, -1, -1, -1, -1, -1, 6, 7,
    );
    // Keeps only the low `BITS` bits of every sample.
    let sample_mask = u16x8_splat(((1u32 << BITS) - 1) as u16);

    // Number of pixels covered by whole 4-pixel iterations.
    let vector_pixels = width / 4 * 4;
    let mut done = 0usize;

    // SAFETY: every iteration reads `alpha[done .. done + 4]` and touches
    // output elements `done * 4 .. done * 4 + 16` with `done + 4 <= width`;
    // the caller guarantees `alpha` holds `width` elements and `rgba_out`
    // holds `width * 4` elements.
    unsafe {
        while done < vector_pixels {
            // Four u16 samples in the low 64 bits, upper lanes zeroed.
            let raw = v128_load64_zero(alpha.as_ptr().add(done).cast::<u64>());
            let samples = v128_and(raw, sample_mask);
            let alpha_first = u8x16_swizzle(samples, place_first);
            let alpha_second = u8x16_swizzle(samples, place_second);

            let out = rgba_out.as_mut_ptr().add(done * 4);
            let out_first = out.cast::<v128>();
            let out_second = out.add(8).cast::<v128>();

            let dst_first = v128_load(out_first);
            let dst_second = v128_load(out_second);
            v128_store(out_first, v128_bitselect(alpha_first, dst_first, select_alpha));
            v128_store(out_second, v128_bitselect(alpha_second, dst_second, select_alpha));

            done += 4;
        }
    }

    // Scalar tail for the trailing pixels that do not fill an iteration.
    if done < width {
        scalar::copy_alpha_plane_u16::<BITS, false>(
            &alpha[done..width],
            &mut rgba_out[done * 4..width * 4],
            width - done,
        );
    }
}
// The kernels under test only exist when their cargo feature is enabled, so
// the module is compiled only when at least one of those features is on, and
// every test carries the same `cfg` as the kernel it exercises. Without this,
// building the tests with `std` plus a subset of the features fails on
// unresolved `super::…` names.
#[cfg(all(
    test,
    feature = "std",
    any(feature = "yuv-444-packed", feature = "gbr", feature = "yuva")
))]
mod tests {
    use crate::row::scalar::alpha_extract as scalar;

    /// Fills `out` with deterministic pseudo-random bytes from a linear
    /// congruential generator seeded with `seed`.
    fn pseudo_random_u8(out: &mut [u8], seed: u32) {
        let mut state = seed;
        for v in out.iter_mut() {
            state = state.wrapping_mul(1664525).wrapping_add(1013904223);
            *v = (state >> 16) as u8;
        }
    }

    /// Fills `out` with deterministic pseudo-random u16 values from a linear
    /// congruential generator seeded with `seed`.
    fn pseudo_random_u16(out: &mut [u16], seed: u32) {
        let mut state = seed;
        for v in out.iter_mut() {
            state = state.wrapping_mul(1664525).wrapping_add(1013904223);
            *v = (state >> 8) as u16;
        }
    }

    /// Row widths covering the scalar-only case (< 4), exact multiples of the
    /// 4-pixel vector step, and widths that leave a 1..=3 pixel tail.
    const WIDTHS: &[usize] = &[
        1, 3, 4, 5, 7, 8, 9, 15, 16, 17, 23, 24, 31, 32, 33, 128, 130,
    ];

    /// The SIMD packed u8 kernel must produce the same row as the scalar one.
    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn wasm_simd128_copy_alpha_packed_u8x4_at_3_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut packed = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut packed, 0xC0FFEE);
            // Pre-fill the output so untouched bytes are checked too.
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xDECAF);
            let mut rgba_scalar = rgba_simd.clone();
            unsafe {
                super::copy_alpha_packed_u8x4_at_3(&packed, &mut rgba_simd, w);
            }
            scalar::copy_alpha_packed_u8x4_at_3(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    /// The SIMD packed u16 -> u8 kernel must match the scalar one.
    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn wasm_simd128_copy_alpha_packed_u16x4_to_u8_at_0_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut packed = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut packed, 0xCAB00D);
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xFEED);
            let mut rgba_scalar = rgba_simd.clone();
            unsafe {
                super::copy_alpha_packed_u16x4_to_u8_at_0(&packed, &mut rgba_simd, w);
            }
            scalar::copy_alpha_packed_u16x4_to_u8_at_0::<false>(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    /// The SIMD packed u16 -> u16 kernel must match the scalar one.
    #[cfg(feature = "yuv-444-packed")]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn wasm_simd128_copy_alpha_packed_u16x4_at_0_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut packed = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut packed, 0xBEEF11);
            let mut rgba_simd = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut rgba_simd, 0x1337);
            let mut rgba_scalar = rgba_simd.clone();
            unsafe {
                super::copy_alpha_packed_u16x4_at_0(&packed, &mut rgba_simd, w);
            }
            scalar::copy_alpha_packed_u16x4_at_0::<false>(&packed, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    /// The SIMD u8 plane kernel must match the scalar one.
    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn wasm_simd128_copy_alpha_plane_u8_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut alpha = std::vec![0u8; w];
            pseudo_random_u8(&mut alpha, 0xABCDEF);
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0x123456);
            let mut rgba_scalar = rgba_simd.clone();
            unsafe {
                super::copy_alpha_plane_u8(&alpha, &mut rgba_simd, w);
            }
            scalar::copy_alpha_plane_u8(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    /// The SIMD u16 -> u8 plane kernel must match the scalar one for 10-bit
    /// input (samples pre-masked to 10 bits).
    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn wasm_simd128_copy_alpha_plane_u16_to_u8_matches_scalar_widths_bits10() {
        for &w in WIDTHS {
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xC0DE);
            for v in alpha.iter_mut() {
                *v &= 0x03FF;
            }
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0xBABE);
            let mut rgba_scalar = rgba_simd.clone();
            unsafe {
                super::copy_alpha_plane_u16_to_u8::<10>(&alpha, &mut rgba_simd, w);
            }
            scalar::copy_alpha_plane_u16_to_u8::<10, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    /// The SIMD u16 -> u8 plane kernel must match the scalar one for 12-bit
    /// input (samples pre-masked to 12 bits).
    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn wasm_simd128_copy_alpha_plane_u16_to_u8_matches_scalar_widths_bits12() {
        for &w in WIDTHS {
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xF00BAA);
            for v in alpha.iter_mut() {
                *v &= 0x0FFF;
            }
            let mut rgba_simd = std::vec![0u8; w * 4];
            pseudo_random_u8(&mut rgba_simd, 0x5EED);
            let mut rgba_scalar = rgba_simd.clone();
            unsafe {
                super::copy_alpha_plane_u16_to_u8::<12>(&alpha, &mut rgba_simd, w);
            }
            scalar::copy_alpha_plane_u16_to_u8::<12, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }

    /// The SIMD u16 -> u16 plane kernel must match the scalar one with
    /// `BITS = 10`. The alpha samples are left unmasked here.
    #[cfg(any(feature = "gbr", feature = "yuva"))]
    #[test]
    #[cfg_attr(
        miri,
        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
    )]
    fn wasm_simd128_copy_alpha_plane_u16_matches_scalar_widths() {
        for &w in WIDTHS {
            let mut alpha = std::vec![0u16; w];
            pseudo_random_u16(&mut alpha, 0xDEADBE);
            let mut rgba_simd = std::vec![0u16; w * 4];
            pseudo_random_u16(&mut rgba_simd, 0xFADE);
            let mut rgba_scalar = rgba_simd.clone();
            unsafe {
                super::copy_alpha_plane_u16::<10>(&alpha, &mut rgba_simd, w);
            }
            scalar::copy_alpha_plane_u16::<10, false>(&alpha, &mut rgba_scalar, w);
            assert_eq!(rgba_simd, rgba_scalar, "width={w}");
        }
    }
}