use core::arch::wasm32::*;
use super::{endian::load_endian_u32x4, scalar};
/// `true` when this crate is compiled for a big-endian target, `false` otherwise.
///
/// Compared against (or passed as) the `BE` const parameter of the row kernels
/// in this file to express "the data is already in host byte order".
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Clamps every `f32` lane of `v` into `[zero, one]`, multiplies it by
/// `scale`, rounds with `f32x4_nearest` and converts the result to `i32`
/// lanes using the saturating truncation intrinsic.
#[inline(always)]
fn clamp_scale_to_i32(v: v128, zero: v128, one: v128, scale: v128) -> v128 {
    // Lower bound first, then upper bound — same order as a `max`/`min` clamp.
    let lower_bounded = f32x4_max(v, zero);
    let unit_range = f32x4_min(lower_bounded, one);
    i32x4_trunc_sat_f32x4(f32x4_nearest(f32x4_mul(unit_range, scale)))
}
/// Loads four consecutive `f32` values starting at `ptr` by delegating to
/// `load_endian_u32x4::<BE>` on the same address viewed as bytes.
///
/// # Safety
/// `ptr` is forwarded unchanged, so the caller must satisfy whatever
/// `load_endian_u32x4` requires (presumably 16 readable bytes at `ptr` —
/// confirm against its definition).
#[inline(always)]
unsafe fn load_f32x4<const BE: bool>(ptr: *const f32) -> v128 {
    let byte_ptr = ptr.cast::<u8>();
    // SAFETY: the caller's guarantee about `ptr` is passed straight through.
    unsafe { load_endian_u32x4::<BE>(byte_ptr) }
}
/// Converts `width` packed RGB `f32` pixels into packed 8-bit RGB.
///
/// Every channel goes through `clamp_scale_to_i32` with a scale of 255 and is
/// then narrowed to `u8`. Four pixels (12 channels) are processed per SIMD
/// iteration; any remaining pixels are handed to `scalar::rgbf32_to_rgb_row`.
/// `BE` is forwarded to the endian-aware loader and to the scalar fallback.
///
/// # Safety
/// `rgb_in` and `rgb_out` must each contain at least `width * 3` elements.
/// The lengths are only verified by `debug_assert!`, while the SIMD loop
/// reads and writes without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf32_to_rgb_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf32 row too short");
    debug_assert!(width * 3 <= rgb_out.len(), "rgb_out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let max_code = f32x4_splat(255.0);

    // Number of complete 12-channel (4-pixel) groups in the row.
    let full_blocks = (width * 3) / 12;
    for block in 0..full_blocks {
        let lane = block * 12;
        // SAFETY: `lane + 12 <= width * 3`, which the caller guarantees is
        // within both slices; `staging` is a local 16-byte buffer.
        unsafe {
            let src = rgb_in.as_ptr().add(lane);
            let q0 = clamp_scale_to_i32(load_f32x4::<BE>(src), lo, hi, max_code);
            let q1 = clamp_scale_to_i32(load_f32x4::<BE>(src.add(4)), lo, hi, max_code);
            let q2 = clamp_scale_to_i32(load_f32x4::<BE>(src.add(8)), lo, hi, max_code);

            // The third vector is duplicated to fill the narrow; only the
            // first 12 of the 16 resulting bytes are meaningful.
            let packed = u8x16_narrow_i16x8(
                i16x8_narrow_i32x4(q0, q1),
                i16x8_narrow_i32x4(q2, q2),
            );

            let mut staging = [0u8; 16];
            v128_store(staging.as_mut_ptr().cast::<v128>(), packed);
            rgb_out
                .get_unchecked_mut(lane..lane + 12)
                .copy_from_slice(&staging[..12]);
        }
    }

    let pix_done = full_blocks * 4;
    if pix_done >= width {
        return;
    }
    let tail = pix_done * 3..width * 3;
    scalar::rgbf32_to_rgb_row::<BE>(
        &rgb_in[tail.clone()],
        &mut rgb_out[tail],
        width - pix_done,
    );
}
/// Converts `width` packed RGB `f32` pixels into packed 8-bit RGBA with the
/// alpha byte set to `0xFF`.
///
/// Channels are clamped/scaled by `clamp_scale_to_i32` (scale 255), narrowed
/// to bytes, and then expanded from 3 to 4 bytes per pixel. Four pixels are
/// processed per SIMD iteration; leftovers go to `scalar::rgbf32_to_rgba_row`.
/// `BE` is forwarded to the endian-aware loader and to the scalar fallback.
///
/// # Safety
/// `rgb_in` must contain at least `width * 3` elements and `rgba_out` at
/// least `width * 4`. The lengths are only verified by `debug_assert!`, while
/// the SIMD loop accesses both slices without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf32_to_rgba_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf32 row too short");
    debug_assert!(width * 4 <= rgba_out.len(), "rgba_out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let max_code = f32x4_splat(255.0);

    // Number of complete 12-channel (4-pixel) groups in the row.
    let full_blocks = (width * 3) / 12;
    for block in 0..full_blocks {
        let in_at = block * 12;
        let out_at = block * 16;
        // SAFETY: `in_at + 12 <= width * 3` and `out_at + 16 <= width * 4`,
        // both of which the caller guarantees are within the slices;
        // `staging` is a local 16-byte buffer.
        unsafe {
            let src = rgb_in.as_ptr().add(in_at);
            let q0 = clamp_scale_to_i32(load_f32x4::<BE>(src), lo, hi, max_code);
            let q1 = clamp_scale_to_i32(load_f32x4::<BE>(src.add(4)), lo, hi, max_code);
            let q2 = clamp_scale_to_i32(load_f32x4::<BE>(src.add(8)), lo, hi, max_code);

            let packed = u8x16_narrow_i16x8(
                i16x8_narrow_i32x4(q0, q1),
                i16x8_narrow_i32x4(q2, q2),
            );

            let mut staging = [0u8; 16];
            v128_store(staging.as_mut_ptr().cast::<v128>(), packed);

            // Expand RGB triples into RGBA quads with an opaque alpha.
            let dst = rgba_out.get_unchecked_mut(out_at..out_at + 16);
            for (quad, triple) in dst.chunks_exact_mut(4).zip(staging.chunks_exact(3)) {
                quad[..3].copy_from_slice(triple);
                quad[3] = 0xFF;
            }
        }
    }

    let pix = full_blocks * 4;
    if pix >= width {
        return;
    }
    scalar::rgbf32_to_rgba_row::<BE>(
        &rgb_in[pix * 3..width * 3],
        &mut rgba_out[pix * 4..width * 4],
        width - pix,
    );
}
/// Converts `width` packed RGB `f32` pixels into packed 16-bit RGB.
///
/// Every channel goes through `clamp_scale_to_i32` with a scale of 65535 and
/// is then narrowed to `u16`. Four pixels (12 channels) are processed per
/// SIMD iteration; leftovers go to `scalar::rgbf32_to_rgb_u16_row`.
/// `BE` is forwarded to the endian-aware loader and to the scalar fallback.
///
/// # Safety
/// `rgb_in` and `rgb_out` must each contain at least `width * 3` elements.
/// The lengths are only verified by `debug_assert!`, while the SIMD loop
/// reads and writes without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf32_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf32 row too short");
    debug_assert!(width * 3 <= rgb_out.len(), "rgb_u16_out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let max_code = f32x4_splat(65535.0);

    // Number of complete 12-channel (4-pixel) groups in the row.
    let full_blocks = (width * 3) / 12;
    for block in 0..full_blocks {
        let lane = block * 12;
        // SAFETY: `lane + 12 <= width * 3`, which the caller guarantees is
        // within both slices; `staging` holds 16 `u16`s, i.e. two full vectors.
        unsafe {
            let src = rgb_in.as_ptr().add(lane);
            let q0 = clamp_scale_to_i32(load_f32x4::<BE>(src), lo, hi, max_code);
            let q1 = clamp_scale_to_i32(load_f32x4::<BE>(src.add(4)), lo, hi, max_code);
            let q2 = clamp_scale_to_i32(load_f32x4::<BE>(src.add(8)), lo, hi, max_code);

            let first_eight = u16x8_narrow_i32x4(q0, q1);
            // Only the low four lanes of this vector are copied out below.
            let last_four = u16x8_narrow_i32x4(q2, q2);

            let mut staging = [0u16; 16];
            let staging_ptr = staging.as_mut_ptr();
            v128_store(staging_ptr.cast::<v128>(), first_eight);
            v128_store(staging_ptr.add(8).cast::<v128>(), last_four);
            rgb_out
                .get_unchecked_mut(lane..lane + 12)
                .copy_from_slice(&staging[..12]);
        }
    }

    let pix_done = full_blocks * 4;
    if pix_done >= width {
        return;
    }
    let tail = pix_done * 3..width * 3;
    scalar::rgbf32_to_rgb_u16_row::<BE>(
        &rgb_in[tail.clone()],
        &mut rgb_out[tail],
        width - pix_done,
    );
}
/// Converts `width` packed RGB `f32` pixels into packed 16-bit RGBA with the
/// alpha channel set to `0xFFFF`.
///
/// Channels are clamped/scaled by `clamp_scale_to_i32` (scale 65535),
/// narrowed to `u16`, and then expanded from 3 to 4 values per pixel. Four
/// pixels are processed per SIMD iteration; leftovers go to
/// `scalar::rgbf32_to_rgba_u16_row`. `BE` is forwarded to the endian-aware
/// loader and to the scalar fallback.
///
/// # Safety
/// `rgb_in` must contain at least `width * 3` elements and `rgba_out` at
/// least `width * 4`. The lengths are only verified by `debug_assert!`, while
/// the SIMD loop accesses both slices without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf32_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf32 row too short");
    debug_assert!(width * 4 <= rgba_out.len(), "rgba_u16_out row too short");

    let lo = f32x4_splat(0.0);
    let hi = f32x4_splat(1.0);
    let max_code = f32x4_splat(65535.0);

    // Number of complete 12-channel (4-pixel) groups in the row.
    let full_blocks = (width * 3) / 12;
    for block in 0..full_blocks {
        let in_at = block * 12;
        let out_at = block * 16;
        // SAFETY: `in_at + 12 <= width * 3` and `out_at + 16 <= width * 4`,
        // both of which the caller guarantees are within the slices;
        // `staging` holds 16 `u16`s, i.e. two full vectors.
        unsafe {
            let src = rgb_in.as_ptr().add(in_at);
            let q0 = clamp_scale_to_i32(load_f32x4::<BE>(src), lo, hi, max_code);
            let q1 = clamp_scale_to_i32(load_f32x4::<BE>(src.add(4)), lo, hi, max_code);
            let q2 = clamp_scale_to_i32(load_f32x4::<BE>(src.add(8)), lo, hi, max_code);

            let first_eight = u16x8_narrow_i32x4(q0, q1);
            let last_four = u16x8_narrow_i32x4(q2, q2);

            let mut staging = [0u16; 16];
            let staging_ptr = staging.as_mut_ptr();
            v128_store(staging_ptr.cast::<v128>(), first_eight);
            v128_store(staging_ptr.add(8).cast::<v128>(), last_four);

            // Expand RGB triples into RGBA quads with an opaque alpha.
            let dst = rgba_out.get_unchecked_mut(out_at..out_at + 16);
            for (quad, triple) in dst.chunks_exact_mut(4).zip(staging.chunks_exact(3)) {
                quad[..3].copy_from_slice(triple);
                quad[3] = 0xFFFF;
            }
        }
    }

    let pix = full_blocks * 4;
    if pix >= width {
        return;
    }
    scalar::rgbf32_to_rgba_u16_row::<BE>(
        &rgb_in[pix * 3..width * 3],
        &mut rgba_out[pix * 4..width * 4],
        width - pix,
    );
}
/// Copies `width` packed RGB `f32` pixels from `rgb_in` to `rgb_out`,
/// converting the stored byte order to host order when they differ.
///
/// When `BE` equals `HOST_NATIVE_BE` the data is copied verbatim. Otherwise
/// full groups of four values are loaded through `load_f32x4::<BE>` and the
/// remaining values are converted one at a time with `u32::from_be` /
/// `u32::from_le`.
///
/// # Safety
/// `rgb_in` and `rgb_out` must each contain at least `width * 3` elements.
/// The lengths are only verified by `debug_assert!`, while every access in
/// the body is unchecked.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf32_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf32 row too short");
    debug_assert!(width * 3 <= rgb_out.len(), "rgb_f32_out row too short");

    let total = width * 3;
    // Values covered by whole 4-lane vectors; the rest is handled per element.
    let vector_end = (total / 4) * 4;

    if BE == HOST_NATIVE_BE {
        for at in (0..vector_end).step_by(4) {
            // SAFETY: `at + 4 <= total`, which the caller guarantees is within
            // both slices.
            unsafe {
                let chunk = v128_load(rgb_in.as_ptr().add(at).cast::<v128>());
                v128_store(rgb_out.as_mut_ptr().add(at).cast::<v128>(), chunk);
            }
        }
        for at in vector_end..total {
            // SAFETY: `at < total`, which the caller guarantees is in bounds.
            unsafe {
                *rgb_out.get_unchecked_mut(at) = *rgb_in.get_unchecked(at);
            }
        }
        return;
    }

    for at in (0..vector_end).step_by(4) {
        // SAFETY: `at + 4 <= total`, which the caller guarantees is within
        // both slices.
        unsafe {
            let host_order = load_f32x4::<BE>(rgb_in.as_ptr().add(at));
            v128_store(rgb_out.as_mut_ptr().add(at).cast::<v128>(), host_order);
        }
    }
    for at in vector_end..total {
        // SAFETY: `at < total`, which the caller guarantees is in bounds.
        unsafe {
            let stored = rgb_in.get_unchecked(at).to_bits();
            let host_bits = match BE {
                true => u32::from_be(stored),
                false => u32::from_le(stored),
            };
            *rgb_out.get_unchecked_mut(at) = f32::from_bits(host_bits);
        }
    }
}
/// Converts `width` packed RGB `half::f16` pixels into packed 8-bit RGB.
///
/// Four pixels at a time are widened to host-order `f32` (interpreting the
/// stored bits as big-endian when `BE` is true, little-endian otherwise) and
/// then passed to `rgbf32_to_rgb_row`. Leftover pixels go to
/// `scalar::rgbf16_to_rgb_row`.
///
/// # Safety
/// `rgb_in` and `rgb_out` must each contain at least `width * 3` elements.
/// The lengths are only verified by `debug_assert!`, while the main loop
/// accesses both slices without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf16_to_rgb_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf16 row too short");
    debug_assert!(width * 3 <= rgb_out.len(), "rgb_out row too short");

    // Number of complete 12-channel (4-pixel) groups in the row.
    let full_blocks = (width * 3) / 12;
    for block in 0..full_blocks {
        let lane = block * 12;

        // SAFETY: `lane + 12 <= width * 3`, which the caller guarantees is
        // within `rgb_in`.
        let halves = unsafe { rgb_in.get_unchecked(lane..lane + 12) };
        let mut widened = [0.0f32; 12];
        for (wide, half_val) in widened.iter_mut().zip(halves) {
            let stored = half_val.to_bits();
            let host_bits = match BE {
                true => u16::from_be(stored),
                false => u16::from_le(stored),
            };
            *wide = half::f16::from_bits(host_bits).to_f32();
        }

        // SAFETY: the output range is within `rgb_out` per the caller's
        // guarantee, and both slices passed on hold exactly 4 pixels.
        unsafe {
            let dst = rgb_out.get_unchecked_mut(lane..lane + 12);
            rgbf32_to_rgb_row::<HOST_NATIVE_BE>(&widened, dst, 4);
        }
    }

    let pix_done = full_blocks * 4;
    if pix_done >= width {
        return;
    }
    let tail = pix_done * 3..width * 3;
    scalar::rgbf16_to_rgb_row::<BE>(
        &rgb_in[tail.clone()],
        &mut rgb_out[tail],
        width - pix_done,
    );
}
/// Converts `width` packed RGB `half::f16` pixels into packed 8-bit RGBA.
///
/// Four pixels at a time are widened to host-order `f32` (interpreting the
/// stored bits as big-endian when `BE` is true, little-endian otherwise) and
/// then passed to `rgbf32_to_rgba_row`, which also writes the alpha byte.
/// Leftover pixels go to `scalar::rgbf16_to_rgba_row`.
///
/// # Safety
/// `rgb_in` must contain at least `width * 3` elements and `rgba_out` at
/// least `width * 4`. The lengths are only verified by `debug_assert!`, while
/// the main loop accesses both slices without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf16_to_rgba_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf16 row too short");
    debug_assert!(width * 4 <= rgba_out.len(), "rgba_out row too short");

    // Number of complete 12-channel (4-pixel) groups in the row.
    let full_blocks = (width * 3) / 12;
    for block in 0..full_blocks {
        let in_at = block * 12;
        let out_at = block * 16;

        // SAFETY: `in_at + 12 <= width * 3`, which the caller guarantees is
        // within `rgb_in`.
        let halves = unsafe { rgb_in.get_unchecked(in_at..in_at + 12) };
        let mut widened = [0.0f32; 12];
        for (wide, half_val) in widened.iter_mut().zip(halves) {
            let stored = half_val.to_bits();
            let host_bits = match BE {
                true => u16::from_be(stored),
                false => u16::from_le(stored),
            };
            *wide = half::f16::from_bits(host_bits).to_f32();
        }

        // SAFETY: `out_at + 16 <= width * 4` is within `rgba_out` per the
        // caller's guarantee, and both slices passed on hold exactly 4 pixels.
        unsafe {
            let dst = rgba_out.get_unchecked_mut(out_at..out_at + 16);
            rgbf32_to_rgba_row::<HOST_NATIVE_BE>(&widened, dst, 4);
        }
    }

    let pix = full_blocks * 4;
    if pix >= width {
        return;
    }
    scalar::rgbf16_to_rgba_row::<BE>(
        &rgb_in[pix * 3..width * 3],
        &mut rgba_out[pix * 4..width * 4],
        width - pix,
    );
}
/// Converts `width` packed RGB `half::f16` pixels into packed 16-bit RGB.
///
/// Four pixels at a time are widened to host-order `f32` (interpreting the
/// stored bits as big-endian when `BE` is true, little-endian otherwise) and
/// then passed to `rgbf32_to_rgb_u16_row`. Leftover pixels go to
/// `scalar::rgbf16_to_rgb_u16_row`.
///
/// # Safety
/// `rgb_in` and `rgb_out` must each contain at least `width * 3` elements.
/// The lengths are only verified by `debug_assert!`, while the main loop
/// accesses both slices without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf16_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf16 row too short");
    debug_assert!(width * 3 <= rgb_out.len(), "rgb_u16_out row too short");

    // Number of complete 12-channel (4-pixel) groups in the row.
    let full_blocks = (width * 3) / 12;
    for block in 0..full_blocks {
        let lane = block * 12;

        // SAFETY: `lane + 12 <= width * 3`, which the caller guarantees is
        // within `rgb_in`.
        let halves = unsafe { rgb_in.get_unchecked(lane..lane + 12) };
        let mut widened = [0.0f32; 12];
        for (wide, half_val) in widened.iter_mut().zip(halves) {
            let stored = half_val.to_bits();
            let host_bits = match BE {
                true => u16::from_be(stored),
                false => u16::from_le(stored),
            };
            *wide = half::f16::from_bits(host_bits).to_f32();
        }

        // SAFETY: the output range is within `rgb_out` per the caller's
        // guarantee, and both slices passed on hold exactly 4 pixels.
        unsafe {
            let dst = rgb_out.get_unchecked_mut(lane..lane + 12);
            rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>(&widened, dst, 4);
        }
    }

    let pix_done = full_blocks * 4;
    if pix_done >= width {
        return;
    }
    let tail = pix_done * 3..width * 3;
    scalar::rgbf16_to_rgb_u16_row::<BE>(
        &rgb_in[tail.clone()],
        &mut rgb_out[tail],
        width - pix_done,
    );
}
/// Converts `width` packed RGB `half::f16` pixels into packed 16-bit RGBA.
///
/// Four pixels at a time are widened to host-order `f32` (interpreting the
/// stored bits as big-endian when `BE` is true, little-endian otherwise) and
/// then passed to `rgbf32_to_rgba_u16_row`, which also writes the alpha
/// channel. Leftover pixels go to `scalar::rgbf16_to_rgba_u16_row`.
///
/// # Safety
/// `rgb_in` must contain at least `width * 3` elements and `rgba_out` at
/// least `width * 4`. The lengths are only verified by `debug_assert!`, while
/// the main loop accesses both slices without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf16_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf16 row too short");
    debug_assert!(width * 4 <= rgba_out.len(), "rgba_u16_out row too short");

    // Number of complete 12-channel (4-pixel) groups in the row.
    let full_blocks = (width * 3) / 12;
    for block in 0..full_blocks {
        let in_at = block * 12;
        let out_at = block * 16;

        // SAFETY: `in_at + 12 <= width * 3`, which the caller guarantees is
        // within `rgb_in`.
        let halves = unsafe { rgb_in.get_unchecked(in_at..in_at + 12) };
        let mut widened = [0.0f32; 12];
        for (wide, half_val) in widened.iter_mut().zip(halves) {
            let stored = half_val.to_bits();
            let host_bits = match BE {
                true => u16::from_be(stored),
                false => u16::from_le(stored),
            };
            *wide = half::f16::from_bits(host_bits).to_f32();
        }

        // SAFETY: `out_at + 16 <= width * 4` is within `rgba_out` per the
        // caller's guarantee, and both slices passed on hold exactly 4 pixels.
        unsafe {
            let dst = rgba_out.get_unchecked_mut(out_at..out_at + 16);
            rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>(&widened, dst, 4);
        }
    }

    let pix = full_blocks * 4;
    if pix >= width {
        return;
    }
    scalar::rgbf16_to_rgba_u16_row::<BE>(
        &rgb_in[pix * 3..width * 3],
        &mut rgba_out[pix * 4..width * 4],
        width - pix,
    );
}
/// Widens `width` packed RGB `half::f16` pixels to host-order `f32`.
///
/// Each stored 16-bit pattern is interpreted as big-endian when `BE` is true
/// and little-endian otherwise, then converted with `half::f16::to_f32`. The
/// conversion is done one element at a time.
///
/// # Safety
/// `rgb_in` and `rgb_out` must each contain at least `width * 3` elements.
/// The lengths are only verified by `debug_assert!`, while the body slices
/// both buffers without bounds checks.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf16_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf16 row too short");
    debug_assert!(width * 3 <= rgb_out.len(), "rgb_f32_out row too short");

    let total_lanes = width * 3;
    // SAFETY: the caller guarantees both slices hold at least `total_lanes`
    // elements.
    let (halves, floats) = unsafe {
        (
            rgb_in.get_unchecked(..total_lanes),
            rgb_out.get_unchecked_mut(..total_lanes),
        )
    };

    for (wide, half_val) in floats.iter_mut().zip(halves) {
        let stored = half_val.to_bits();
        let host_bits = match BE {
            true => u16::from_be(stored),
            false => u16::from_le(stored),
        };
        *wide = half::f16::from_bits(host_bits).to_f32();
    }
}
/// Packed RGB `half::f16` → packed RGB `half::f16` row conversion.
///
/// This entry point has no SIMD path of its own: after the debug-only length
/// checks it hands the whole row to `scalar::rgbf16_to_rgb_f16_row::<BE>`.
///
/// # Safety
/// The body performs no unsafe operations itself; the function is `unsafe`
/// like the other `simd128` kernels in this file. Callers should still pass
/// slices of at least `width * 3` elements, as the debug assertions expect.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgbf16_to_rgb_f16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [half::f16],
    width: usize,
) {
    debug_assert!(width * 3 <= rgb_in.len(), "rgbf16 row too short");
    debug_assert!(width * 3 <= rgb_out.len(), "rgb_f16_out row too short");

    scalar::rgbf16_to_rgb_f16_row::<BE>(rgb_in, rgb_out, width)
}