#![allow(dead_code)]
use core::arch::wasm32::*;
use super::*;
/// Splits eight interleaved three-channel 16-bit pixels into three planar vectors.
///
/// `v0`, `v1` and `v2` are the three consecutive 16-byte vectors that together
/// hold 24 `u16` samples laid out as `c0 c1 c2 c0 c1 c2 ...`. The return value is
/// `(channel 0, channel 1, channel 2)`; each vector carries the eight samples of
/// that channel in pixel order.
///
/// Every output is built from one byte swizzle per input vector. A `-1` lane in a
/// mask is an out-of-range index, for which `u8x16_swizzle` produces a zero byte,
/// so the three partial results occupy disjoint bytes and can simply be OR-ed.
#[inline(always)]
unsafe fn deinterleave_rgb48_8px(v0: v128, v1: v128, v2: v128) -> (v128, v128, v128) {
    // Mask lane that makes `u8x16_swizzle` emit a zero byte.
    const Z: i8 = -1;

    // Channel 0: samples 0, 3, 6 (in v0), 9, 12, 15 (in v1), 18, 21 (in v2).
    let first = v128_or(
        u8x16_swizzle(v0, i8x16(0, 1, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z)),
        v128_or(
            u8x16_swizzle(v1, i8x16(Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, 14, 15, Z, Z, Z, Z)),
            u8x16_swizzle(v2, i8x16(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 4, 5, 10, 11)),
        ),
    );

    // Channel 1: samples 1, 4, 7 (in v0), 10, 13 (in v1), 16, 19, 22 (in v2).
    let second = v128_or(
        u8x16_swizzle(v0, i8x16(2, 3, 8, 9, 14, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z)),
        v128_or(
            u8x16_swizzle(v1, i8x16(Z, Z, Z, Z, Z, Z, 4, 5, 10, 11, Z, Z, Z, Z, Z, Z)),
            u8x16_swizzle(v2, i8x16(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 1, 6, 7, 12, 13)),
        ),
    );

    // Channel 2: samples 2, 5 (in v0), 8, 11, 14 (in v1), 17, 20, 23 (in v2).
    let third = v128_or(
        u8x16_swizzle(v0, i8x16(4, 5, 10, 11, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z)),
        v128_or(
            u8x16_swizzle(v1, i8x16(Z, Z, Z, Z, 0, 1, 6, 7, 12, 13, Z, Z, Z, Z, Z, Z)),
            u8x16_swizzle(v2, i8x16(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 3, 8, 9, 14, 15)),
        ),
    );

    (first, second, third)
}
/// Splits eight interleaved four-channel 16-bit pixels into four planar vectors.
///
/// `raw0..raw3` are four consecutive 16-byte vectors, each holding two pixels as
/// `c0 c1 c2 c3 c0 c1 c2 c3`. The return value is
/// `(channel 0, channel 1, channel 2, channel 3)`, each with that channel's eight
/// samples in pixel order.
///
/// The transpose is done in two shuffle passes: the first groups channels within
/// each pair of input vectors (four pixels), the second joins the matching halves
/// of the two pairs.
#[inline(always)]
unsafe fn deinterleave_rgba64_8px(
    raw0: v128,
    raw1: v128,
    raw2: v128,
    raw3: v128,
) -> (v128, v128, v128, v128) {
    // Pass 1, channels 0 and 1: low half = channel 0, high half = channel 1.
    let front_c01 = i16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(raw0, raw1);
    let back_c01 = i16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(raw2, raw3);
    // Pass 1, channels 2 and 3: low half = channel 2, high half = channel 3.
    let front_c23 = i16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(raw0, raw1);
    let back_c23 = i16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(raw2, raw3);

    // Pass 2: low halves of both groups form one channel, high halves the next.
    (
        i16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(front_c01, back_c01),
        i16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(front_c01, back_c01),
        i16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(front_c23, back_c23),
        i16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(front_c23, back_c23),
    )
}
/// Reduces eight unsigned 16-bit lanes to their high bytes.
///
/// The low eight bytes of the result are `lane >> 8` for each input lane, in
/// order; the high eight bytes are zero.
#[inline(always)]
unsafe fn narrow_u16x8_to_u8x8(v: v128) -> v128 {
    // After the shift every lane is in 0..=255, so the saturating narrow below
    // cannot clamp anything.
    let high_bytes = u16x8_shr(v, 8);
    // The second operand fills the upper eight output bytes; zero it out.
    u8x16_narrow_i16x8(high_bytes, u16x8_splat(0))
}
/// `true` when the compilation target is big-endian.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

/// Swaps the two bytes of every 16-bit lane when `BE` disagrees with the host.
///
/// Returns `v` untouched when `BE == HOST_NATIVE_BE`; otherwise returns `v` with
/// bytes `2k` and `2k + 1` exchanged for each of the eight 16-bit lanes.
/// `BE` presumably flags that the samples are stored big-endian (confirm with
/// the callers' contract).
#[inline(always)]
unsafe fn byteswap_if_be<const BE: bool>(v: v128) -> v128 {
    // Both sides are compile-time constants, so one arm is always dead code.
    if BE == HOST_NATIVE_BE {
        return v;
    }
    let swap_pairs = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
    u8x16_swizzle(v, swap_pairs)
}
/// Converts a row of `width` pixels from 16-bit RGB (`rgb48`, three `u16` per
/// pixel) to 8-bit RGB (`rgb_out`, three bytes per pixel).
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>`, split into planes, reduced to their high
/// bytes by `narrow_u16x8_to_u8x8` and handed to `write_rgb_16`. That helper
/// gets a 48-byte scratch buffer; only its first 24 bytes (8 pixels x 3) are
/// copied to `rgb_out`, so the store stays inside the current block.
/// The remaining `width % 8` pixels go through `scalar::rgb48_to_rgb_row`.
///
/// # Safety
///
/// `rgb48` must hold at least `width * 3` samples and `rgb_out` at least
/// `width * 3` bytes. This is only checked by `debug_assert!`, and the vector
/// loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_rgb48_to_rgb_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 3 samples = 24 `u16` = three 16-byte vectors.
            let src = rgb48.as_ptr().add(px * 3).cast::<v128>();
            let lo = byteswap_if_be::<BE>(v128_load(src));
            let mid = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let hi = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let (r, g, b) = deinterleave_rgb48_8px(lo, mid, hi);
            let mut scratch = [0u8; 48];
            write_rgb_16(
                narrow_u16x8_to_u8x8(r),
                narrow_u16x8_to_u8x8(g),
                narrow_u16x8_to_u8x8(b),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(px * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::rgb48_to_rgb_row::<BE>(
                &rgb48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit RGB (`rgb48`, three `u16` per
/// pixel) to 8-bit RGBA (`rgba_out`, four bytes per pixel) with alpha `0xFF`.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>`, split into planes, reduced to their high
/// bytes and handed to `write_rgba_16` together with an all-`0xFF` alpha
/// vector. That helper gets a 64-byte scratch buffer; only its first 32 bytes
/// (8 pixels x 4) are copied to `rgba_out`.
/// The remaining `width % 8` pixels go through `scalar::rgb48_to_rgba_row`.
///
/// # Safety
///
/// `rgb48` must hold at least `width * 3` samples and `rgba_out` at least
/// `width * 4` bytes. This is only checked by `debug_assert!`, and the vector
/// loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_rgb48_to_rgba_row<const BE: bool>(
    rgb48: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let alpha = u8x16_splat(0xFF);
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 3 samples = 24 `u16` = three 16-byte vectors.
            let src = rgb48.as_ptr().add(px * 3).cast::<v128>();
            let lo = byteswap_if_be::<BE>(v128_load(src));
            let mid = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let hi = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let (r, g, b) = deinterleave_rgb48_8px(lo, mid, hi);
            let mut scratch = [0u8; 64];
            write_rgba_16(
                narrow_u16x8_to_u8x8(r),
                narrow_u16x8_to_u8x8(g),
                narrow_u16x8_to_u8x8(b),
                alpha,
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::rgb48_to_rgba_row::<BE>(
                &rgb48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit RGB (`rgb48`) to 16-bit RGB
/// (`rgb_out`), both three `u16` per pixel.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>`, split into planes and handed unmodified to
/// `write_rgb_u16_8`, which writes straight into `rgb_out` at the current
/// pixel. The remaining `width % 8` pixels go through
/// `scalar::rgb48_to_rgb_u16_row`.
///
/// # Safety
///
/// `rgb48` and `rgb_out` must each hold at least `width * 3` samples. This is
/// only checked by `debug_assert!`, and the vector loop reads and writes
/// through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 3 samples = 24 `u16` = three 16-byte vectors.
            let src = rgb48.as_ptr().add(px * 3).cast::<v128>();
            let lo = byteswap_if_be::<BE>(v128_load(src));
            let mid = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let hi = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let (r, g, b) = deinterleave_rgb48_8px(lo, mid, hi);
            let dst = rgb_out.as_mut_ptr().add(px * 3);
            write_rgb_u16_8(r, g, b, dst);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::rgb48_to_rgb_u16_row::<BE>(
                &rgb48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit RGB (`rgb48`, three `u16` per
/// pixel) to 16-bit RGBA (`rgba_out`, four `u16` per pixel) with alpha
/// `0xFFFF`.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>`, split into planes and handed to
/// `write_rgba_u16_8` together with an all-`0xFFFF` alpha vector; that helper
/// writes straight into `rgba_out` at the current pixel. The remaining
/// `width % 8` pixels go through `scalar::rgb48_to_rgba_u16_row`.
///
/// # Safety
///
/// `rgb48` must hold at least `width * 3` samples and `rgba_out` at least
/// `width * 4`. This is only checked by `debug_assert!`, and the vector loop
/// reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row<const BE: bool>(
    rgb48: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let alpha = u16x8_splat(0xFFFF);
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 3 samples = 24 `u16` = three 16-byte vectors.
            let src = rgb48.as_ptr().add(px * 3).cast::<v128>();
            let lo = byteswap_if_be::<BE>(v128_load(src));
            let mid = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let hi = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let (r, g, b) = deinterleave_rgb48_8px(lo, mid, hi);
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            write_rgba_u16_8(r, g, b, alpha, dst);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::rgb48_to_rgba_u16_row::<BE>(
                &rgb48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit BGR (`bgr48`, three `u16` per
/// pixel) to 8-bit RGB (`rgb_out`, three bytes per pixel).
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into planes, which are bound as
/// blue, green, red in source order. The planes are reduced to their high
/// bytes and handed to `write_rgb_16` in red, green, blue order. That helper
/// gets a 48-byte scratch buffer; only its first 24 bytes (8 pixels x 3) are
/// copied to `rgb_out`.
/// The remaining `width % 8` pixels go through `scalar::bgr48_to_rgb_row`.
///
/// # Safety
///
/// `bgr48` must hold at least `width * 3` samples and `rgb_out` at least
/// `width * 3` bytes. This is only checked by `debug_assert!`, and the vector
/// loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_bgr48_to_rgb_row<const BE: bool>(
    bgr48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 3 samples = 24 `u16` = three 16-byte vectors.
            let src = bgr48.as_ptr().add(px * 3).cast::<v128>();
            let lo = byteswap_if_be::<BE>(v128_load(src));
            let mid = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let hi = byteswap_if_be::<BE>(v128_load(src.add(2)));
            // Source order is B, G, R.
            let (b, g, r) = deinterleave_rgb48_8px(lo, mid, hi);
            let mut scratch = [0u8; 48];
            write_rgb_16(
                narrow_u16x8_to_u8x8(r),
                narrow_u16x8_to_u8x8(g),
                narrow_u16x8_to_u8x8(b),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(px * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::bgr48_to_rgb_row::<BE>(
                &bgr48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit BGR (`bgr48`, three `u16` per
/// pixel) to 8-bit RGBA (`rgba_out`, four bytes per pixel) with alpha `0xFF`.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into planes, which are bound as
/// blue, green, red in source order. The planes are reduced to their high
/// bytes and handed to `write_rgba_16` in red, green, blue order plus an
/// all-`0xFF` alpha vector. That helper gets a 64-byte scratch buffer; only
/// its first 32 bytes (8 pixels x 4) are copied to `rgba_out`.
/// The remaining `width % 8` pixels go through `scalar::bgr48_to_rgba_row`.
///
/// # Safety
///
/// `bgr48` must hold at least `width * 3` samples and `rgba_out` at least
/// `width * 4` bytes. This is only checked by `debug_assert!`, and the vector
/// loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_bgr48_to_rgba_row<const BE: bool>(
    bgr48: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let alpha = u8x16_splat(0xFF);
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 3 samples = 24 `u16` = three 16-byte vectors.
            let src = bgr48.as_ptr().add(px * 3).cast::<v128>();
            let lo = byteswap_if_be::<BE>(v128_load(src));
            let mid = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let hi = byteswap_if_be::<BE>(v128_load(src.add(2)));
            // Source order is B, G, R.
            let (b, g, r) = deinterleave_rgb48_8px(lo, mid, hi);
            let mut scratch = [0u8; 64];
            write_rgba_16(
                narrow_u16x8_to_u8x8(r),
                narrow_u16x8_to_u8x8(g),
                narrow_u16x8_to_u8x8(b),
                alpha,
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::bgr48_to_rgba_row::<BE>(
                &bgr48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit BGR (`bgr48`) to 16-bit RGB
/// (`rgb_out`), both three `u16` per pixel.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into planes, which are bound as
/// blue, green, red in source order and handed to `write_rgb_u16_8` in red,
/// green, blue order. That helper writes straight into `rgb_out` at the
/// current pixel. The remaining `width % 8` pixels go through
/// `scalar::bgr48_to_rgb_u16_row`.
///
/// # Safety
///
/// `bgr48` and `rgb_out` must each hold at least `width * 3` samples. This is
/// only checked by `debug_assert!`, and the vector loop reads and writes
/// through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row<const BE: bool>(
    bgr48: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 3 samples = 24 `u16` = three 16-byte vectors.
            let src = bgr48.as_ptr().add(px * 3).cast::<v128>();
            let lo = byteswap_if_be::<BE>(v128_load(src));
            let mid = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let hi = byteswap_if_be::<BE>(v128_load(src.add(2)));
            // Source order is B, G, R.
            let (b, g, r) = deinterleave_rgb48_8px(lo, mid, hi);
            let dst = rgb_out.as_mut_ptr().add(px * 3);
            write_rgb_u16_8(r, g, b, dst);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::bgr48_to_rgb_u16_row::<BE>(
                &bgr48[done * 3..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit BGR (`bgr48`, three `u16` per
/// pixel) to 16-bit RGBA (`rgba_out`, four `u16` per pixel) with alpha
/// `0xFFFF`.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into planes, which are bound as
/// blue, green, red in source order and handed to `write_rgba_u16_8` in red,
/// green, blue order plus an all-`0xFFFF` alpha vector. That helper writes
/// straight into `rgba_out` at the current pixel. The remaining `width % 8`
/// pixels go through `scalar::bgr48_to_rgba_u16_row`.
///
/// # Safety
///
/// `bgr48` must hold at least `width * 3` samples and `rgba_out` at least
/// `width * 4`. This is only checked by `debug_assert!`, and the vector loop
/// reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row<const BE: bool>(
    bgr48: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let alpha = u16x8_splat(0xFFFF);
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 3 samples = 24 `u16` = three 16-byte vectors.
            let src = bgr48.as_ptr().add(px * 3).cast::<v128>();
            let lo = byteswap_if_be::<BE>(v128_load(src));
            let mid = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let hi = byteswap_if_be::<BE>(v128_load(src.add(2)));
            // Source order is B, G, R.
            let (b, g, r) = deinterleave_rgb48_8px(lo, mid, hi);
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            write_rgba_u16_8(r, g, b, alpha, dst);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::bgr48_to_rgba_u16_row::<BE>(
                &bgr48[done * 3..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit RGBA (`rgba64`, four `u16` per
/// pixel) to 8-bit RGB (`rgb_out`, three bytes per pixel), dropping alpha.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into four planes. The first three
/// are reduced to their high bytes and handed to `write_rgb_16`; the fourth is
/// ignored. That helper gets a 48-byte scratch buffer; only its first 24 bytes
/// (8 pixels x 3) are copied to `rgb_out`.
/// The remaining `width % 8` pixels go through `scalar::rgba64_to_rgb_row`.
///
/// # Safety
///
/// `rgba64` must hold at least `width * 4` samples and `rgb_out` at least
/// `width * 3` bytes. This is only checked by `debug_assert!`, and the vector
/// loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_rgba64_to_rgb_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 4 samples = 32 `u16` = four 16-byte vectors.
            let src = rgba64.as_ptr().add(px * 4).cast::<v128>();
            let q0 = byteswap_if_be::<BE>(v128_load(src));
            let q1 = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let q2 = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let q3 = byteswap_if_be::<BE>(v128_load(src.add(3)));
            let (r, g, b, _) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let mut scratch = [0u8; 48];
            write_rgb_16(
                narrow_u16x8_to_u8x8(r),
                narrow_u16x8_to_u8x8(g),
                narrow_u16x8_to_u8x8(b),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(px * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::rgba64_to_rgb_row::<BE>(
                &rgba64[done * 4..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit RGBA (`rgba64`, four `u16` per
/// pixel) to 8-bit RGBA (`rgba_out`, four bytes per pixel).
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into four planes. All four,
/// alpha included, are reduced to their high bytes and handed to
/// `write_rgba_16`. That helper gets a 64-byte scratch buffer; only its first
/// 32 bytes (8 pixels x 4) are copied to `rgba_out`.
/// The remaining `width % 8` pixels go through `scalar::rgba64_to_rgba_row`.
///
/// # Safety
///
/// `rgba64` must hold at least `width * 4` samples and `rgba_out` at least
/// `width * 4` bytes. This is only checked by `debug_assert!`, and the vector
/// loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_rgba64_to_rgba_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 4 samples = 32 `u16` = four 16-byte vectors.
            let src = rgba64.as_ptr().add(px * 4).cast::<v128>();
            let q0 = byteswap_if_be::<BE>(v128_load(src));
            let q1 = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let q2 = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let q3 = byteswap_if_be::<BE>(v128_load(src.add(3)));
            let (r, g, b, a) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let mut scratch = [0u8; 64];
            write_rgba_16(
                narrow_u16x8_to_u8x8(r),
                narrow_u16x8_to_u8x8(g),
                narrow_u16x8_to_u8x8(b),
                narrow_u16x8_to_u8x8(a),
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::rgba64_to_rgba_row::<BE>(
                &rgba64[done * 4..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit RGBA (`rgba64`, four `u16` per
/// pixel) to 16-bit RGB (`rgb_out`, three `u16` per pixel), dropping alpha.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into four planes. The first three
/// are handed unmodified to `write_rgb_u16_8`, which writes straight into
/// `rgb_out` at the current pixel; the fourth is ignored. The remaining
/// `width % 8` pixels go through `scalar::rgba64_to_rgb_u16_row`.
///
/// # Safety
///
/// `rgba64` must hold at least `width * 4` samples and `rgb_out` at least
/// `width * 3`. This is only checked by `debug_assert!`, and the vector loop
/// reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 4 samples = 32 `u16` = four 16-byte vectors.
            let src = rgba64.as_ptr().add(px * 4).cast::<v128>();
            let q0 = byteswap_if_be::<BE>(v128_load(src));
            let q1 = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let q2 = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let q3 = byteswap_if_be::<BE>(v128_load(src.add(3)));
            let (r, g, b, _) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let dst = rgb_out.as_mut_ptr().add(px * 3);
            write_rgb_u16_8(r, g, b, dst);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::rgba64_to_rgb_u16_row::<BE>(
                &rgba64[done * 4..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit RGBA (`rgba64`) to 16-bit RGBA
/// (`rgba_out`), both four `u16` per pixel.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into four planes, which are handed
/// unmodified to `write_rgba_u16_8`. That helper writes straight into
/// `rgba_out` at the current pixel. The remaining `width % 8` pixels go
/// through `scalar::rgba64_to_rgba_u16_row`.
///
/// # Safety
///
/// `rgba64` and `rgba_out` must each hold at least `width * 4` samples. This
/// is only checked by `debug_assert!`, and the vector loop reads and writes
/// through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 4 samples = 32 `u16` = four 16-byte vectors.
            let src = rgba64.as_ptr().add(px * 4).cast::<v128>();
            let q0 = byteswap_if_be::<BE>(v128_load(src));
            let q1 = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let q2 = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let q3 = byteswap_if_be::<BE>(v128_load(src.add(3)));
            let (r, g, b, a) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            write_rgba_u16_8(r, g, b, a, dst);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::rgba64_to_rgba_u16_row::<BE>(
                &rgba64[done * 4..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit BGRA (`bgra64`, four `u16` per
/// pixel) to 8-bit RGB (`rgb_out`, three bytes per pixel), dropping alpha.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into four planes, which are bound
/// as blue, green, red, alpha in source order. Red, green and blue are reduced
/// to their high bytes and handed to `write_rgb_16`; alpha is ignored. That
/// helper gets a 48-byte scratch buffer; only its first 24 bytes
/// (8 pixels x 3) are copied to `rgb_out`.
/// The remaining `width % 8` pixels go through `scalar::bgra64_to_rgb_row`.
///
/// # Safety
///
/// `bgra64` must hold at least `width * 4` samples and `rgb_out` at least
/// `width * 3` bytes. This is only checked by `debug_assert!`, and the vector
/// loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_bgra64_to_rgb_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 4 samples = 32 `u16` = four 16-byte vectors.
            let src = bgra64.as_ptr().add(px * 4).cast::<v128>();
            let q0 = byteswap_if_be::<BE>(v128_load(src));
            let q1 = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let q2 = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let q3 = byteswap_if_be::<BE>(v128_load(src.add(3)));
            // Source order is B, G, R, A.
            let (b, g, r, _) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let mut scratch = [0u8; 48];
            write_rgb_16(
                narrow_u16x8_to_u8x8(r),
                narrow_u16x8_to_u8x8(g),
                narrow_u16x8_to_u8x8(b),
                scratch.as_mut_ptr(),
            );
            let dst = rgb_out.as_mut_ptr().add(px * 3);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 24);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::bgra64_to_rgb_row::<BE>(
                &bgra64[done * 4..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit BGRA (`bgra64`, four `u16` per
/// pixel) to 8-bit RGBA (`rgba_out`, four bytes per pixel).
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into four planes, which are bound
/// as blue, green, red, alpha in source order. All four are reduced to their
/// high bytes and handed to `write_rgba_16` in red, green, blue, alpha order.
/// That helper gets a 64-byte scratch buffer; only its first 32 bytes
/// (8 pixels x 4) are copied to `rgba_out`.
/// The remaining `width % 8` pixels go through `scalar::bgra64_to_rgba_row`.
///
/// # Safety
///
/// `bgra64` must hold at least `width * 4` samples and `rgba_out` at least
/// `width * 4` bytes. This is only checked by `debug_assert!`, and the vector
/// loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_bgra64_to_rgba_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 4 samples = 32 `u16` = four 16-byte vectors.
            let src = bgra64.as_ptr().add(px * 4).cast::<v128>();
            let q0 = byteswap_if_be::<BE>(v128_load(src));
            let q1 = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let q2 = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let q3 = byteswap_if_be::<BE>(v128_load(src.add(3)));
            // Source order is B, G, R, A.
            let (b, g, r, a) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let mut scratch = [0u8; 64];
            write_rgba_16(
                narrow_u16x8_to_u8x8(r),
                narrow_u16x8_to_u8x8(g),
                narrow_u16x8_to_u8x8(b),
                narrow_u16x8_to_u8x8(a),
                scratch.as_mut_ptr(),
            );
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            core::ptr::copy_nonoverlapping(scratch.as_ptr(), dst, 32);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::bgra64_to_rgba_row::<BE>(
                &bgra64[done * 4..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit BGRA (`bgra64`, four `u16` per
/// pixel) to 16-bit RGB (`rgb_out`, three `u16` per pixel), dropping alpha.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into four planes, which are bound
/// as blue, green, red, alpha in source order. Red, green and blue are handed
/// unmodified to `write_rgb_u16_8`, which writes straight into `rgb_out` at
/// the current pixel; alpha is ignored. The remaining `width % 8` pixels go
/// through `scalar::bgra64_to_rgb_u16_row`.
///
/// # Safety
///
/// `bgra64` must hold at least `width * 4` samples and `rgb_out` at least
/// `width * 3`. This is only checked by `debug_assert!`, and the vector loop
/// reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 4 samples = 32 `u16` = four 16-byte vectors.
            let src = bgra64.as_ptr().add(px * 4).cast::<v128>();
            let q0 = byteswap_if_be::<BE>(v128_load(src));
            let q1 = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let q2 = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let q3 = byteswap_if_be::<BE>(v128_load(src.add(3)));
            // Source order is B, G, R, A.
            let (b, g, r, _) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let dst = rgb_out.as_mut_ptr().add(px * 3);
            write_rgb_u16_8(r, g, b, dst);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::bgra64_to_rgb_u16_row::<BE>(
                &bgra64[done * 4..],
                &mut rgb_out[done * 3..],
                width - done,
            );
        }
    }
}
/// Converts a row of `width` pixels from 16-bit BGRA (`bgra64`) to 16-bit RGBA
/// (`rgba_out`), both four `u16` per pixel.
///
/// Eight pixels are handled per iteration: the samples are loaded, passed
/// through `byteswap_if_be::<BE>` and split into four planes, which are bound
/// as blue, green, red, alpha in source order and handed unmodified to
/// `write_rgba_u16_8` in red, green, blue, alpha order. That helper writes
/// straight into `rgba_out` at the current pixel. The remaining `width % 8`
/// pixels go through `scalar::bgra64_to_rgba_u16_row`.
///
/// # Safety
///
/// `bgra64` and `rgba_out` must each hold at least `width * 4` samples. This
/// is only checked by `debug_assert!`, and the vector loop reads and writes
/// through raw pointers.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    unsafe {
        let blocks = width / 8;
        for blk in 0..blocks {
            let px = blk * 8;
            // 8 pixels x 4 samples = 32 `u16` = four 16-byte vectors.
            let src = bgra64.as_ptr().add(px * 4).cast::<v128>();
            let q0 = byteswap_if_be::<BE>(v128_load(src));
            let q1 = byteswap_if_be::<BE>(v128_load(src.add(1)));
            let q2 = byteswap_if_be::<BE>(v128_load(src.add(2)));
            let q3 = byteswap_if_be::<BE>(v128_load(src.add(3)));
            // Source order is B, G, R, A.
            let (b, g, r, a) = deinterleave_rgba64_8px(q0, q1, q2, q3);
            let dst = rgba_out.as_mut_ptr().add(px * 4);
            write_rgba_u16_8(r, g, b, a, dst);
        }
        // Pixels already produced by the vector loop.
        let done = blocks * 8;
        if done < width {
            scalar::bgra64_to_rgba_u16_row::<BE>(
                &bgra64[done * 4..],
                &mut rgba_out[done * 4..],
                width - done,
            );
        }
    }
}