#![cfg_attr(not(feature = "std"), allow(dead_code))]
use core::arch::wasm32::*;
use crate::row::{
arch::wasm_simd128::endian::{load_endian_u16x8, load_endian_u32x4},
scalar::{bits_mask, gray as scalar, grayf32, ya8, ya16},
};
/// wasm `simd128` entry point for the 8-bit gray → RGB row conversion.
///
/// This function contains no vector code of its own: after the debug-only
/// length checks it forwards every argument to `scalar::gray8_to_rgb_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 3` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray8_to_rgb_row(
    y_plane: &[u8],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    scalar::gray8_to_rgb_row(y_plane, out, width, full_range);
}
/// wasm `simd128` entry point for the 8-bit gray → RGBA row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray8_to_rgba_row` once the debug-only length checks pass.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 4` bytes; both are
/// verified in debug builds only.
#[allow(dead_code)]
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray8_to_rgba_row(
    y_plane: &[u8],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    scalar::gray8_to_rgba_row(y_plane, out, width, full_range);
}
/// wasm `simd128` conversion of an 8-bit gray row into planar H, S and V.
///
/// Limited-range input (`full_range == false`) is forwarded whole to
/// `scalar::gray8_to_hsv_row`. For full-range input the vector loop handles
/// 16 pixels per iteration: it stores zeros to `h_out` and `s_out` and copies
/// the gray samples verbatim into `v_out`. The remaining `width % 16` pixels
/// are finished by the scalar routine.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane`, `h_out`,
/// `s_out` and `v_out` must each hold at least `width` bytes. The vector loop
/// writes through raw pointers, so a short output plane is an out-of-bounds
/// write, not a panic. All four lengths are verified in debug builds only.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gray8_to_hsv_row(
    y_plane: &[u8],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    // The stores below bypass slice bounds checks, so assert the output
    // planes the same way the sibling kernels assert `out`.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        return scalar::gray8_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range);
    }
    let mut x = 0usize;
    // SAFETY: each iteration has `x + 16 <= width`, and the caller guarantees
    // that all four slices hold at least `width` bytes, so every 16-byte load
    // and store stays inside its slice.
    unsafe {
        let zero = i64x2(0, 0);
        while x + 16 <= width {
            let v = v128_load(y_plane.as_ptr().add(x).cast());
            v128_store(h_out.as_mut_ptr().add(x).cast(), zero);
            v128_store(s_out.as_mut_ptr().add(x).cast(), zero);
            v128_store(v_out.as_mut_ptr().add(x).cast(), v);
            x += 16;
        }
    }
    // Scalar tail for the pixels that did not fill a whole vector.
    if x < width {
        scalar::gray8_to_hsv_row(
            &y_plane[x..width],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
            true,
        );
    }
}
/// wasm `simd128` entry point for the `BITS`-bit gray → 8-bit RGB row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray_n_to_rgb_row::<BITS, BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 3` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    scalar::gray_n_to_rgb_row::<BITS, BE>(y_plane, out, width, full_range);
}
/// wasm `simd128` entry point for the `BITS`-bit gray → 8-bit RGBA row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray_n_to_rgba_row::<BITS, BE>` after the debug-only length
/// checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 4` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    scalar::gray_n_to_rgba_row::<BITS, BE>(y_plane, out, width, full_range);
}
/// wasm `simd128` entry point for the `BITS`-bit gray → 16-bit RGB row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray_n_to_rgb_u16_row::<BITS, BE>` after the debug-only length
/// checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 3` elements; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray_n_to_rgb_u16_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    scalar::gray_n_to_rgb_u16_row::<BITS, BE>(y_plane, out, width, full_range);
}
/// wasm `simd128` entry point for the `BITS`-bit gray → 16-bit RGBA row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray_n_to_rgba_u16_row::<BITS, BE>` after the debug-only length
/// checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 4` elements; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray_n_to_rgba_u16_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    scalar::gray_n_to_rgba_u16_row::<BITS, BE>(y_plane, out, width, full_range);
}
/// wasm `simd128` reduction of a `BITS`-bit gray row to 8-bit luma.
///
/// Eight samples per iteration are loaded via `load_endian_u16x8::<BE>`,
/// ANDed with `bits_mask::<BITS>()`, shifted right by `BITS - 8` and narrowed
/// to bytes. The leftover `width % 8` samples are handled by
/// `scalar::gray_n_to_luma_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width` bytes; both are verified
/// in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray_n_to_luma_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    let sample_mask = bits_mask::<BITS>();
    let down_shift = BITS - 8;
    // First pixel index that no longer fills a whole 8-lane vector.
    let simd_end = width - width % 8;

    // SAFETY: every `base` satisfies `base + 8 <= width`, and the caller
    // guarantees `y_plane.len() >= width`, so the eight-sample load stays in
    // bounds (assumes `load_endian_u16x8` reads exactly eight `u16`s).
    unsafe {
        let mask_lanes = u16x8_splat(sample_mask);
        let zeros = i64x2(0, 0);
        for base in (0..simd_end).step_by(8) {
            let src = y_plane.as_ptr().cast::<u8>().add(base * 2);
            let samples = v128_and(load_endian_u16x8::<BE>(src), mask_lanes);
            // After the shift the sixteen-bit lanes are narrowed to bytes;
            // the eight results sit in the low 64 bits of the vector.
            let packed = u8x16_narrow_i16x8(u16x8_shr(samples, down_shift), zeros);
            let low_half = i64x2_extract_lane::<0>(packed) as u64;
            out[base..base + 8].copy_from_slice(&low_half.to_le_bytes());
        }
    }

    // Scalar tail.
    if simd_end < width {
        scalar::gray_n_to_luma_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` copy of a `BITS`-bit gray row into a 16-bit luma row.
///
/// Eight samples per iteration are loaded via `load_endian_u16x8::<BE>`,
/// ANDed with `bits_mask::<BITS>()` and stored to `out` as-is. The leftover
/// `width % 8` samples are handled by `scalar::gray_n_to_luma_u16_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` and `out` must
/// each hold at least `width` elements. The vector store goes through a raw
/// pointer, and the lengths are verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    let sample_mask = bits_mask::<BITS>();
    // First pixel index that no longer fills a whole 8-lane vector.
    let simd_end = width - width % 8;

    // SAFETY: every `base` satisfies `base + 8 <= width`, and the caller
    // guarantees both slices hold at least `width` elements, so the
    // eight-element load and store stay in bounds.
    unsafe {
        let mask_lanes = u16x8_splat(sample_mask);
        for base in (0..simd_end).step_by(8) {
            let src = y_plane.as_ptr().cast::<u8>().add(base * 2);
            let samples = v128_and(load_endian_u16x8::<BE>(src), mask_lanes);
            v128_store(out.as_mut_ptr().add(base).cast(), samples);
        }
    }

    // Scalar tail.
    if simd_end < width {
        scalar::gray_n_to_luma_u16_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` conversion of a `BITS`-bit gray row into planar 8-bit
/// H, S and V.
///
/// Limited-range input (`full_range == false`) is forwarded whole to
/// `scalar::gray_n_to_hsv_row`. For full-range input each iteration masks
/// eight samples with `bits_mask::<BITS>()`, shifts them right by `BITS - 8`,
/// narrows them to bytes and writes them to `v_out`; `h_out` and `s_out` are
/// filled with zeros. The leftover `width % 8` pixels go through the scalar
/// routine.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and each of `h_out`, `s_out`, `v_out` at least
/// `width` bytes; all are verified in debug builds only.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gray_n_to_hsv_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    // State the output-plane contract up front, as the sibling kernels do
    // for `out`.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        return scalar::gray_n_to_hsv_row::<BITS, BE>(y_plane, h_out, s_out, v_out, width, full_range);
    }
    let mask = bits_mask::<BITS>();
    let shift = BITS - 8;
    let mut x = 0usize;
    // SAFETY: each iteration has `x + 8 <= width`, and the caller guarantees
    // `y_plane.len() >= width`, so the eight-sample load stays in bounds
    // (assumes `load_endian_u16x8` reads exactly eight `u16`s). All writes go
    // through bounds-checked slice indexing.
    unsafe {
        let mask_v = u16x8_splat(mask);
        let zero = i64x2(0, 0);
        while x + 8 <= width {
            let raw = load_endian_u16x8::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
            let masked = v128_and(raw, mask_v);
            let shifted = u16x8_shr(masked, shift);
            // The eight narrowed bytes sit in the low 64 bits.
            let narrowed = u8x16_narrow_i16x8(shifted, zero);
            let val = i64x2_extract_lane::<0>(narrowed) as u64;
            let bytes = val.to_le_bytes();
            h_out[x..x + 8].fill(0);
            s_out[x..x + 8].fill(0);
            v_out[x..x + 8].copy_from_slice(&bytes);
            x += 8;
        }
    }
    // Scalar tail.
    if x < width {
        scalar::gray_n_to_hsv_row::<BITS, BE>(
            &y_plane[x..width],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
            true,
        );
    }
}
/// wasm `simd128` entry point for the 16-bit gray → 8-bit RGB row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray16_to_rgb_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 3` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray16_to_rgb_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    scalar::gray16_to_rgb_row::<BE>(y_plane, out, width, full_range);
}
/// wasm `simd128` entry point for the 16-bit gray → 8-bit RGBA row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray16_to_rgba_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 4` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray16_to_rgba_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    scalar::gray16_to_rgba_row::<BE>(y_plane, out, width, full_range);
}
/// wasm `simd128` entry point for the 16-bit gray → 16-bit RGB row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray16_to_rgb_u16_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 3` elements; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray16_to_rgb_u16_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    scalar::gray16_to_rgb_u16_row::<BE>(y_plane, out, width, full_range);
}
/// wasm `simd128` entry point for the 16-bit gray → 16-bit RGBA row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `scalar::gray16_to_rgba_u16_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 4` elements; both are
/// verified in debug builds only.
#[allow(dead_code)]
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray16_to_rgba_u16_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
    full_range: bool,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    scalar::gray16_to_rgba_u16_row::<BE>(y_plane, out, width, full_range);
}
/// wasm `simd128` reduction of a 16-bit gray row to 8-bit luma.
///
/// Eight samples per iteration are loaded via `load_endian_u16x8::<BE>`,
/// shifted right by 8 (keeping the high byte of each sample) and narrowed to
/// bytes. The leftover `width % 8` samples are handled by
/// `scalar::gray16_to_luma_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width` bytes; both are verified
/// in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray16_to_luma_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    // First pixel index that no longer fills a whole 8-lane vector.
    let simd_end = width - width % 8;

    // SAFETY: every `base` satisfies `base + 8 <= width`, and the caller
    // guarantees `y_plane.len() >= width`, so the eight-sample load stays in
    // bounds (assumes `load_endian_u16x8` reads exactly eight `u16`s).
    unsafe {
        let zeros = i64x2(0, 0);
        for base in (0..simd_end).step_by(8) {
            let src = y_plane.as_ptr().cast::<u8>().add(base * 2);
            let high_bytes = u16x8_shr(load_endian_u16x8::<BE>(src), 8);
            // The eight narrowed bytes sit in the low 64 bits.
            let packed = u8x16_narrow_i16x8(high_bytes, zeros);
            let low_half = i64x2_extract_lane::<0>(packed) as u64;
            out[base..base + 8].copy_from_slice(&low_half.to_le_bytes());
        }
    }

    // Scalar tail.
    if simd_end < width {
        scalar::gray16_to_luma_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` copy of a 16-bit gray row into a 16-bit luma row.
///
/// Eight samples per iteration are loaded via `load_endian_u16x8::<BE>` and
/// stored to `out` without further arithmetic. The leftover `width % 8`
/// samples are handled by `scalar::gray16_to_luma_u16_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` and `out` must
/// each hold at least `width` elements. The vector store goes through a raw
/// pointer, and the lengths are verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn gray16_to_luma_u16_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    // First pixel index that no longer fills a whole 8-lane vector.
    let simd_end = width - width % 8;

    // SAFETY: every `base` satisfies `base + 8 <= width`, and the caller
    // guarantees both slices hold at least `width` elements, so the
    // eight-element load and store stay in bounds.
    unsafe {
        for base in (0..simd_end).step_by(8) {
            let src = y_plane.as_ptr().cast::<u8>().add(base * 2);
            let samples = load_endian_u16x8::<BE>(src);
            v128_store(out.as_mut_ptr().add(base).cast(), samples);
        }
    }

    // Scalar tail.
    if simd_end < width {
        scalar::gray16_to_luma_u16_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` conversion of a 16-bit gray row into planar 8-bit H, S
/// and V.
///
/// Limited-range input (`full_range == false`) is forwarded whole to
/// `scalar::gray16_to_hsv_row`. For full-range input each iteration shifts
/// eight samples right by 8, narrows them to bytes and writes them to
/// `v_out`; `h_out` and `s_out` are filled with zeros. The leftover
/// `width % 8` pixels go through the scalar routine.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and each of `h_out`, `s_out`, `v_out` at least
/// `width` bytes; all are verified in debug builds only.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn gray16_to_hsv_row<const BE: bool>(
    y_plane: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    // State the output-plane contract up front, as the sibling kernels do
    // for `out`.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        return scalar::gray16_to_hsv_row::<BE>(y_plane, h_out, s_out, v_out, width, full_range);
    }
    let mut x = 0usize;
    // SAFETY: each iteration has `x + 8 <= width`, and the caller guarantees
    // `y_plane.len() >= width`, so the eight-sample load stays in bounds
    // (assumes `load_endian_u16x8` reads exactly eight `u16`s). All writes go
    // through bounds-checked slice indexing.
    unsafe {
        let zero = i64x2(0, 0);
        while x + 8 <= width {
            let raw = load_endian_u16x8::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 2));
            let shifted = u16x8_shr(raw, 8);
            // The eight narrowed bytes sit in the low 64 bits.
            let narrowed = u8x16_narrow_i16x8(shifted, zero);
            let val = i64x2_extract_lane::<0>(narrowed) as u64;
            let bytes = val.to_le_bytes();
            h_out[x..x + 8].fill(0);
            s_out[x..x + 8].fill(0);
            v_out[x..x + 8].copy_from_slice(&bytes);
            x += 8;
        }
    }
    // Scalar tail.
    if x < width {
        scalar::gray16_to_hsv_row::<BE>(
            &y_plane[x..width],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
            true,
        );
    }
}
/// wasm `simd128` entry point for the `f32` gray → 8-bit RGB row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `grayf32::grayf32_to_rgb_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 3` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn grayf32_to_rgb_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    grayf32::grayf32_to_rgb_row::<BE>(y_plane, out, width);
}
/// wasm `simd128` entry point for the `f32` gray → 8-bit RGBA row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `grayf32::grayf32_to_rgba_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 4` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn grayf32_to_rgba_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    grayf32::grayf32_to_rgba_row::<BE>(y_plane, out, width);
}
/// wasm `simd128` entry point for the `f32` gray → 16-bit RGB row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `grayf32::grayf32_to_rgb_u16_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 3` elements; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn grayf32_to_rgb_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    grayf32::grayf32_to_rgb_u16_row::<BE>(y_plane, out, width);
}
/// wasm `simd128` entry point for the `f32` gray → 16-bit RGBA row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `grayf32::grayf32_to_rgba_u16_row::<BE>` after the debug-only length
/// checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 4` elements; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn grayf32_to_rgba_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    grayf32::grayf32_to_rgba_u16_row::<BE>(y_plane, out, width);
}
/// wasm `simd128` entry point for the `f32` gray → `f32` RGB row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `grayf32::grayf32_to_rgb_f32_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width * 3` elements; both are
/// verified in debug builds only.
#[allow(dead_code)]
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn grayf32_to_rgb_f32_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [f32],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    grayf32::grayf32_to_rgb_f32_row::<BE>(y_plane, out, width);
}
/// wasm `simd128` quantisation of an `f32` gray row to 8-bit luma.
///
/// Four samples per iteration are loaded via `load_endian_u32x4::<BE>`,
/// clamped to `[0.0, 1.0]`, multiplied by 255, offset by 0.5 and truncated to
/// integers, then narrowed to bytes. The leftover `width % 4` samples are
/// handled by `grayf32::grayf32_to_luma_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and `out` at least `width` bytes; both are verified
/// in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn grayf32_to_luma_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    let max_code = f32x4_splat(255.0);
    let floor = f32x4_splat(0.0);
    let ceiling = f32x4_splat(1.0);
    let round_bias = f32x4_splat(0.5);
    let zeros = i64x2(0, 0);
    // First pixel index that no longer fills a whole 4-lane vector.
    let simd_end = width - width % 4;

    // SAFETY: every `base` satisfies `base + 4 <= width`, and the caller
    // guarantees `y_plane.len() >= width`, so the four-sample load stays in
    // bounds (assumes `load_endian_u32x4` reads exactly four 32-bit values).
    unsafe {
        for base in (0..simd_end).step_by(4) {
            let src = y_plane.as_ptr().cast::<u8>().add(base * 4);
            let unit = f32x4_min(f32x4_max(load_endian_u32x4::<BE>(src), floor), ceiling);
            // Adding 0.5 before the truncating conversion rounds the
            // non-negative product to the nearest integer.
            let biased = f32x4_add(f32x4_mul(unit, max_code), round_bias);
            let ints = i32x4_trunc_sat_f32x4(biased);
            // Two narrowing steps (32 → 16 → 8 bits) leave the four result
            // bytes in the low 32 bits.
            let as_bytes = u8x16_narrow_i16x8(i16x8_narrow_i32x4(ints, zeros), zeros);
            let quad = i32x4_extract_lane::<0>(as_bytes) as u32;
            out[base..base + 4].copy_from_slice(&quad.to_le_bytes());
        }
    }

    // Scalar tail.
    if simd_end < width {
        grayf32::grayf32_to_luma_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` quantisation of an `f32` gray row to 16-bit luma.
///
/// Four samples per iteration are loaded via `load_endian_u32x4::<BE>`,
/// clamped to `[0.0, 1.0]`, multiplied by 65535, offset by 0.5 and truncated
/// to integers, then narrowed to unsigned 16-bit lanes and written one lane at
/// a time. The leftover `width % 4` samples are handled by
/// `grayf32::grayf32_to_luma_u16_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` and `out` must
/// each hold at least `width` elements; both are verified in debug builds
/// only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn grayf32_to_luma_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    let max_code = f32x4_splat(65535.0);
    let floor = f32x4_splat(0.0);
    let ceiling = f32x4_splat(1.0);
    let round_bias = f32x4_splat(0.5);
    let zeros = i64x2(0, 0);
    // First pixel index that no longer fills a whole 4-lane vector.
    let simd_end = width - width % 4;

    // SAFETY: every `base` satisfies `base + 4 <= width`, and the caller
    // guarantees `y_plane.len() >= width`, so the four-sample load stays in
    // bounds (assumes `load_endian_u32x4` reads exactly four 32-bit values).
    unsafe {
        for base in (0..simd_end).step_by(4) {
            let src = y_plane.as_ptr().cast::<u8>().add(base * 4);
            let unit = f32x4_min(f32x4_max(load_endian_u32x4::<BE>(src), floor), ceiling);
            // Adding 0.5 before the truncating conversion rounds the
            // non-negative product to the nearest integer.
            let biased = f32x4_add(f32x4_mul(unit, max_code), round_bias);
            let ints = i32x4_trunc_sat_f32x4(biased);
            // The unsigned narrow puts the four results in lanes 0..=3.
            let words = u16x8_narrow_i32x4(ints, zeros);
            out[base] = u16x8_extract_lane::<0>(words);
            out[base + 1] = u16x8_extract_lane::<1>(words);
            out[base + 2] = u16x8_extract_lane::<2>(words);
            out[base + 3] = u16x8_extract_lane::<3>(words);
        }
    }

    // Scalar tail.
    if simd_end < width {
        grayf32::grayf32_to_luma_u16_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` copy of an `f32` gray row into an `f32` luma row.
///
/// Four samples per iteration are loaded via `load_endian_u32x4::<BE>` and
/// stored to `out` without further arithmetic. The leftover `width % 4`
/// samples are handled by `grayf32::grayf32_to_luma_f32_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` and `out` must
/// each hold at least `width` elements. The vector store goes through a raw
/// pointer, and the lengths are verified in debug builds only.
#[allow(dead_code)]
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn grayf32_to_luma_f32_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [f32],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);

    // First pixel index that no longer fills a whole 4-lane vector.
    let simd_end = width - width % 4;

    // SAFETY: every `base` satisfies `base + 4 <= width`, and the caller
    // guarantees both slices hold at least `width` elements, so the
    // four-element load and store stay in bounds.
    unsafe {
        for base in (0..simd_end).step_by(4) {
            let src = y_plane.as_ptr().cast::<u8>().add(base * 4);
            let samples = load_endian_u32x4::<BE>(src);
            v128_store(out.as_mut_ptr().add(base).cast(), samples);
        }
    }

    // Scalar tail.
    if simd_end < width {
        grayf32::grayf32_to_luma_f32_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` conversion of an `f32` gray row into planar 8-bit H, S
/// and V.
///
/// Each iteration clamps four samples to `[0.0, 1.0]`, multiplies by 255,
/// adds 0.5, truncates to integers and narrows them to bytes that are written
/// to `v_out`; `h_out` and `s_out` are filled with zeros. The leftover
/// `width % 4` pixels are handled by `grayf32::grayf32_to_hsv_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `y_plane` must hold at
/// least `width` samples and each of `h_out`, `s_out`, `v_out` at least
/// `width` bytes; all are verified in debug builds only.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn grayf32_to_hsv_row<const BE: bool>(
    y_plane: &[f32],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    // State the output-plane contract up front, as the sibling kernels do
    // for `out`.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    let scale = f32x4_splat(255.0);
    let zero4 = f32x4_splat(0.0);
    let one4 = f32x4_splat(1.0);
    let half = f32x4_splat(0.5);
    let zero16 = i64x2(0, 0);
    let mut x = 0usize;
    // SAFETY: each iteration has `x + 4 <= width`, and the caller guarantees
    // `y_plane.len() >= width`, so the four-sample load stays in bounds
    // (assumes `load_endian_u32x4` reads exactly four 32-bit values). All
    // writes go through bounds-checked slice indexing.
    unsafe {
        while x + 4 <= width {
            let y = load_endian_u32x4::<BE>(y_plane.as_ptr().cast::<u8>().add(x * 4));
            let clamped = f32x4_min(f32x4_max(y, zero4), one4);
            let scaled = f32x4_mul(clamped, scale);
            // Adding 0.5 before truncation rounds to the nearest integer.
            let rounded = i32x4_trunc_sat_f32x4(f32x4_add(scaled, half));
            let narrow16 = i16x8_narrow_i32x4(rounded, zero16);
            let narrow8 = u8x16_narrow_i16x8(narrow16, zero16);
            // The four result bytes sit in the low 32 bits.
            let val = i32x4_extract_lane::<0>(narrow8) as u32;
            let bytes = val.to_le_bytes();
            h_out[x..x + 4].fill(0);
            s_out[x..x + 4].fill(0);
            v_out[x..x + 4].copy_from_slice(&bytes);
            x += 4;
        }
    }
    // Scalar tail.
    if x < width {
        grayf32::grayf32_to_hsv_row::<BE>(
            &y_plane[x..width],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
        );
    }
}
/// wasm `simd128` entry point for the packed 8-bit gray+alpha → 8-bit RGB row
/// conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `ya8::ya8_to_rgb_row` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` bytes and `out` at least `width * 3` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya8_to_rgb_row(packed: &[u8], out: &mut [u8], width: usize) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    ya8::ya8_to_rgb_row(packed, out, width);
}
/// wasm `simd128` entry point for the packed 8-bit gray+alpha → 8-bit RGBA
/// row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `ya8::ya8_to_rgba_row` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` bytes and `out` at least `width * 4` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya8_to_rgba_row(packed: &[u8], out: &mut [u8], width: usize) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    ya8::ya8_to_rgba_row(packed, out, width);
}
/// wasm `simd128` entry point for the packed 8-bit gray+alpha → 16-bit RGB
/// row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `ya8::ya8_to_rgb_u16_row` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` bytes and `out` at least `width * 3` elements; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya8_to_rgb_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    ya8::ya8_to_rgb_u16_row(packed, out, width);
}
/// wasm `simd128` entry point for the packed 8-bit gray+alpha → 16-bit RGBA
/// row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `ya8::ya8_to_rgba_u16_row` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` bytes and `out` at least `width * 4` elements; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya8_to_rgba_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    ya8::ya8_to_rgba_u16_row(packed, out, width);
}
/// wasm `simd128` extraction of 8-bit luma from a packed two-byte-per-pixel
/// row.
///
/// Each iteration loads 16 bytes (eight pixels) and uses a byte swizzle to
/// gather the even-indexed bytes (the first byte of every pixel) into the
/// low half of the vector, which is then copied to `out`. The leftover
/// `width % 8` pixels are handled by `ya8::ya8_to_luma_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` bytes and `out` at least `width` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya8_to_luma_row(packed: &[u8], out: &mut [u8], width: usize) {
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);

    // Swizzle indices: even source bytes first; the -1 entries are out of
    // range and therefore select zero.
    let pick_even = i8x16(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
    // First pixel index that no longer fills a whole 8-pixel vector.
    let simd_end = width - width % 8;

    // SAFETY: every `base` satisfies `base + 8 <= width`, and the caller
    // guarantees `packed.len() >= width * 2`, so the 16-byte load starting at
    // byte `base * 2` stays in bounds.
    unsafe {
        for base in (0..simd_end).step_by(8) {
            let pairs = v128_load(packed.as_ptr().add(base * 2).cast());
            let luma = i8x16_swizzle(pairs, pick_even);
            let low_half = i64x2_extract_lane::<0>(luma) as u64;
            out[base..base + 8].copy_from_slice(&low_half.to_le_bytes());
        }
    }

    // Scalar tail.
    if simd_end < width {
        ya8::ya8_to_luma_row(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` extraction of luma from a packed two-byte-per-pixel row
/// into a 16-bit output row.
///
/// Each iteration loads 16 bytes (eight pixels), gathers the even-indexed
/// bytes (the first byte of every pixel) with a byte swizzle, zero-extends
/// them to 16-bit lanes and stores the result. The leftover `width % 8`
/// pixels are handled by `ya8::ya8_to_luma_u16_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` bytes and `out` at least `width` elements. The vector
/// store goes through a raw pointer, and the lengths are verified in debug
/// builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya8_to_luma_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);

    // Swizzle indices: even source bytes first; the -1 entries are out of
    // range and therefore select zero.
    let pick_even = i8x16(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
    // First pixel index that no longer fills a whole 8-pixel vector.
    let simd_end = width - width % 8;

    // SAFETY: every `base` satisfies `base + 8 <= width`; the caller
    // guarantees `packed.len() >= width * 2` and `out.len() >= width`, so the
    // 16-byte load and the eight-element store stay in bounds.
    unsafe {
        for base in (0..simd_end).step_by(8) {
            let pairs = v128_load(packed.as_ptr().add(base * 2).cast());
            let luma = i8x16_swizzle(pairs, pick_even);
            let widened = u16x8_extend_low_u8x16(luma);
            v128_store(out.as_mut_ptr().add(base).cast(), widened);
        }
    }

    // Scalar tail.
    if simd_end < width {
        ya8::ya8_to_luma_u16_row(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` conversion of a packed two-byte-per-pixel row into planar
/// 8-bit H, S and V.
///
/// Each iteration loads 16 bytes (eight pixels), gathers the even-indexed
/// bytes (the first byte of every pixel) with a byte swizzle and writes them
/// to `v_out`; `h_out` and `s_out` are filled with zeros. The leftover
/// `width % 8` pixels are handled by `ya8::ya8_to_hsv_row`.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` bytes and each of `h_out`, `s_out`, `v_out` at least
/// `width` bytes; all are verified in debug builds only.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn ya8_to_hsv_row(
    packed: &[u8],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 2);
    // State the output-plane contract up front, as the sibling kernels do
    // for `out`.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    // Swizzle indices: even source bytes first; the -1 entries are out of
    // range and therefore select zero.
    let shuf = i8x16(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
    let mut x = 0usize;
    // SAFETY: each iteration has `x + 8 <= width`, and the caller guarantees
    // `packed.len() >= width * 2`, so the 16-byte load starting at byte
    // `x * 2` stays in bounds. All writes go through bounds-checked slice
    // indexing.
    unsafe {
        while x + 8 <= width {
            let src = v128_load(packed.as_ptr().add(x * 2).cast());
            let y8 = i8x16_swizzle(src, shuf);
            let val = i64x2_extract_lane::<0>(y8) as u64;
            let bytes = val.to_le_bytes();
            h_out[x..x + 8].fill(0);
            s_out[x..x + 8].fill(0);
            v_out[x..x + 8].copy_from_slice(&bytes);
            x += 8;
        }
    }
    // Scalar tail.
    if x < width {
        ya8::ya8_to_hsv_row(
            &packed[x * 2..width * 2],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
        );
    }
}
/// `true` when the crate is compiled for a big-endian target.
///
/// The `ya16_*` luma/HSV kernels compare their `BE` parameter against this
/// constant and take the vector path only when the two agree; otherwise they
/// defer to the scalar routines.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// wasm `simd128` entry point for the packed 16-bit gray+alpha → 8-bit RGB
/// row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `ya16::ya16_to_rgb_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` elements and `out` at least `width * 3` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya16_to_rgb_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u8],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    ya16::ya16_to_rgb_row::<BE>(packed, out, width);
}
/// wasm `simd128` entry point for the packed 16-bit gray+alpha → 8-bit RGBA
/// row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `ya16::ya16_to_rgba_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` elements and `out` at least `width * 4` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya16_to_rgba_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u8],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    ya16::ya16_to_rgba_row::<BE>(packed, out, width);
}
/// wasm `simd128` entry point for the packed 16-bit gray+alpha → 16-bit RGB
/// row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `ya16::ya16_to_rgb_u16_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` elements and `out` at least `width * 3` elements; both
/// are verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya16_to_rgb_u16_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u16],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);

    // Straight hand-off to the scalar kernel.
    ya16::ya16_to_rgb_u16_row::<BE>(packed, out, width);
}
/// wasm `simd128` entry point for the packed 16-bit gray+alpha → 16-bit RGBA
/// row conversion.
///
/// No vector path exists here; the call is forwarded unchanged to
/// `ya16::ya16_to_rgba_u16_row::<BE>` after the debug-only length checks.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` elements and `out` at least `width * 4` elements; both
/// are verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya16_to_rgba_u16_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u16],
    width: usize,
) {
    // Caller contract, checked only when debug assertions are enabled.
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);

    // Straight hand-off to the scalar kernel.
    ya16::ya16_to_rgba_u16_row::<BE>(packed, out, width);
}
/// wasm `simd128` extraction of 8-bit luma from a packed row of two `u16`s
/// per pixel.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to
/// `ya16::ya16_to_luma_row`. Otherwise each iteration loads two vectors
/// (eight pixels), gathers the even-indexed `u16` of every pixel into one
/// vector, shifts each lane right by 8 to keep its high byte, and narrows the
/// result to bytes. The leftover `width % 8` pixels go through the scalar
/// routine.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` elements and `out` at least `width` bytes; both are
/// verified in debug builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya16_to_luma_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);
    if BE != HOST_NATIVE_BE {
        return ya16::ya16_to_luma_row::<BE>(packed, out, width);
    }

    // Gathers the four even-indexed u16s of a vector into its low 8 bytes;
    // the -1 entries are out of range and therefore select zero.
    let gather_low = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
    // Moves the low 8 bytes of a vector into its high 8 bytes.
    let lift_high = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7);
    let zeros = i64x2(0, 0);
    // First pixel index that no longer fills a whole 8-pixel group.
    let simd_end = width - width % 8;

    // SAFETY: every `base` satisfies `base + 8 <= width`, and the caller
    // guarantees `packed.len() >= width * 2`, so the two 16-byte loads
    // covering elements `base * 2 .. base * 2 + 16` stay in bounds.
    unsafe {
        for base in (0..simd_end).step_by(8) {
            let src = packed.as_ptr().add(base * 2);
            let first = v128_load(src.cast::<v128>());
            let second = v128_load(src.add(8).cast::<v128>());
            let low_four = i8x16_swizzle(first, gather_low);
            let high_four = i8x16_swizzle(i8x16_swizzle(second, gather_low), lift_high);
            let luma16 = v128_or(low_four, high_four);
            // The eight narrowed bytes sit in the low 64 bits.
            let packed8 = u8x16_narrow_i16x8(u16x8_shr(luma16, 8), zeros);
            let low_half = i64x2_extract_lane::<0>(packed8) as u64;
            out[base..base + 8].copy_from_slice(&low_half.to_le_bytes());
        }
    }

    // Scalar tail.
    if simd_end < width {
        ya16::ya16_to_luma_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` extraction of 16-bit luma from a packed row of two `u16`s
/// per pixel.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to
/// `ya16::ya16_to_luma_u16_row`. Otherwise each iteration loads two vectors
/// (eight pixels), gathers the even-indexed `u16` of every pixel into one
/// vector and stores it to `out`. The leftover `width % 8` pixels go through
/// the scalar routine.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` elements and `out` at least `width` elements. The vector
/// store goes through a raw pointer, and the lengths are verified in debug
/// builds only.
#[target_feature(enable = "simd128")]
#[inline]
pub(crate) unsafe fn ya16_to_luma_u16_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);
    if BE != HOST_NATIVE_BE {
        return ya16::ya16_to_luma_u16_row::<BE>(packed, out, width);
    }

    // Gathers the four even-indexed u16s of a vector into its low 8 bytes;
    // the -1 entries are out of range and therefore select zero.
    let gather_low = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
    // Moves the low 8 bytes of a vector into its high 8 bytes.
    let lift_high = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7);
    // First pixel index that no longer fills a whole 8-pixel group.
    let simd_end = width - width % 8;

    // SAFETY: every `base` satisfies `base + 8 <= width`; the caller
    // guarantees `packed.len() >= width * 2` and `out.len() >= width`, so the
    // two 16-byte loads and the eight-element store stay in bounds.
    unsafe {
        for base in (0..simd_end).step_by(8) {
            let src = packed.as_ptr().add(base * 2);
            let first = v128_load(src.cast::<v128>());
            let second = v128_load(src.add(8).cast::<v128>());
            let low_four = i8x16_swizzle(first, gather_low);
            let high_four = i8x16_swizzle(i8x16_swizzle(second, gather_low), lift_high);
            let luma16 = v128_or(low_four, high_four);
            v128_store(out.as_mut_ptr().add(base).cast(), luma16);
        }
    }

    // Scalar tail.
    if simd_end < width {
        ya16::ya16_to_luma_u16_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// wasm `simd128` conversion of a packed row of two `u16`s per pixel into
/// planar 8-bit H, S and V.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to
/// `ya16::ya16_to_hsv_row`. Otherwise each iteration loads two vectors (eight
/// pixels), gathers the even-indexed `u16` of every pixel, keeps its high
/// byte (shift right by 8, then narrow) and writes the eight bytes to
/// `v_out`; `h_out` and `s_out` are filled with zeros. The leftover
/// `width % 8` pixels go through the scalar routine.
///
/// # Safety
///
/// The `simd128` target feature must be available. `packed` must hold at
/// least `width * 2` elements and each of `h_out`, `s_out`, `v_out` at least
/// `width` bytes; all are verified in debug builds only.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn ya16_to_hsv_row<const BE: bool>(
    packed: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    debug_assert!(packed.len() >= width * 2);
    // State the output-plane contract up front, as the sibling kernels do
    // for `out`.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if BE != HOST_NATIVE_BE {
        return ya16::ya16_to_hsv_row::<BE>(packed, h_out, s_out, v_out, width);
    }
    // Gathers the four even-indexed u16s of a vector into its low 8 bytes;
    // the -1 entries are out of range and therefore select zero.
    let shuf_lo = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1);
    // Moves the low 8 bytes of a vector into its high 8 bytes. Built once
    // here because it does not change between iterations.
    let shuf_hi = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7);
    let zero16 = i64x2(0, 0);
    let mut x = 0usize;
    // SAFETY: each iteration has `x + 8 <= width`, and the caller guarantees
    // `packed.len() >= width * 2`, so the two 16-byte loads covering elements
    // `x * 2 .. x * 2 + 16` stay in bounds. All writes go through
    // bounds-checked slice indexing.
    unsafe {
        while x + 8 <= width {
            let src0 = v128_load(packed.as_ptr().add(x * 2).cast::<v128>());
            let src1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast::<v128>());
            let y0 = i8x16_swizzle(src0, shuf_lo);
            let y1 = i8x16_swizzle(src1, shuf_lo);
            let y_words = v128_or(y0, i8x16_swizzle(y1, shuf_hi));
            let shifted = u16x8_shr(y_words, 8);
            // The eight narrowed bytes sit in the low 64 bits.
            let narrowed = u8x16_narrow_i16x8(shifted, zero16);
            let val = i64x2_extract_lane::<0>(narrowed) as u64;
            let bytes = val.to_le_bytes();
            h_out[x..x + 8].fill(0);
            s_out[x..x + 8].fill(0);
            v_out[x..x + 8].copy_from_slice(&bytes);
            x += 8;
        }
    }
    // Scalar tail.
    if x < width {
        ya16::ya16_to_hsv_row::<BE>(
            &packed[x * 2..width * 2],
            &mut h_out[x..width],
            &mut s_out[x..width],
            &mut v_out[x..width],
            width - x,
        );
    }
}