use core::arch::wasm32::*;
use super::endian::load_endian_u16x8;
use crate::{
DcpTargetGamut,
row::scalar::{
self,
xyz12::{oetf_srgb, smpte428_inverse_oetf},
xyz12_constants::xyz_to_rgb_matrix,
},
};
const PIXELS_PER_ITER: usize = 8;
const SAMPLE_MASK_U16: u16 = 0x0FFF;
#[inline(always)]
unsafe fn deinterleave_xyz12_8px(v0: v128, v1: v128, v2: v128) -> (v128, v128, v128) {
let x_v0 = i8x16(0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
let x_v1 = i8x16(-1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15, -1, -1, -1, -1);
let x_v2 = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 10, 11);
let x = v128_or(
v128_or(u8x16_swizzle(v0, x_v0), u8x16_swizzle(v1, x_v1)),
u8x16_swizzle(v2, x_v2),
);
let y_v0 = i8x16(2, 3, 8, 9, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
let y_v1 = i8x16(-1, -1, -1, -1, -1, -1, 4, 5, 10, 11, -1, -1, -1, -1, -1, -1);
let y_v2 = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 6, 7, 12, 13);
let y = v128_or(
v128_or(u8x16_swizzle(v0, y_v0), u8x16_swizzle(v1, y_v1)),
u8x16_swizzle(v2, y_v2),
);
let z_v0 = i8x16(4, 5, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
let z_v1 = i8x16(-1, -1, -1, -1, 0, 1, 6, 7, 12, 13, -1, -1, -1, -1, -1, -1);
let z_v2 = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 8, 9, 14, 15);
let z = v128_or(
v128_or(u8x16_swizzle(v0, z_v0), u8x16_swizzle(v1, z_v1)),
u8x16_swizzle(v2, z_v2),
);
(x, y, z)
}
#[inline(always)]
unsafe fn u16x8_to_f32x4_pair(v: v128) -> (v128, v128) {
let lo_i32 = i32x4_extend_low_i16x8(v);
let hi_i32 = i32x4_extend_high_i16x8(v);
(f32x4_convert_i32x4(lo_i32), f32x4_convert_i32x4(hi_i32))
}
#[inline(always)]
unsafe fn smpte428_inv_oetf_scalar4(v: v128) -> v128 {
unsafe {
let mut buf = [0.0_f32; 4];
v128_store(buf.as_mut_ptr() as *mut v128, v);
for slot in &mut buf {
*slot = smpte428_inverse_oetf(*slot as u16);
}
v128_load(buf.as_ptr() as *const v128)
}
}
#[inline(always)]
unsafe fn oetf_srgb_scalar4(v: v128) -> v128 {
unsafe {
let mut buf = [0.0_f32; 4];
v128_store(buf.as_mut_ptr() as *mut v128, v);
for slot in &mut buf {
*slot = oetf_srgb(*slot);
}
v128_load(buf.as_ptr() as *const v128)
}
}
#[inline(always)]
unsafe fn matmul_xyz_to_rgb_4lane(
m: &[[f32; 3]; 3],
x: v128,
y: v128,
z: v128,
) -> (v128, v128, v128) {
let m00 = f32x4_splat(m[0][0]);
let m01 = f32x4_splat(m[0][1]);
let m02 = f32x4_splat(m[0][2]);
let m10 = f32x4_splat(m[1][0]);
let m11 = f32x4_splat(m[1][1]);
let m12 = f32x4_splat(m[1][2]);
let m20 = f32x4_splat(m[2][0]);
let m21 = f32x4_splat(m[2][1]);
let m22 = f32x4_splat(m[2][2]);
let r = f32x4_add(
f32x4_add(f32x4_mul(m00, x), f32x4_mul(m01, y)),
f32x4_mul(m02, z),
);
let g = f32x4_add(
f32x4_add(f32x4_mul(m10, x), f32x4_mul(m11, y)),
f32x4_mul(m12, z),
);
let b = f32x4_add(
f32x4_add(f32x4_mul(m20, x), f32x4_mul(m21, y)),
f32x4_mul(m22, z),
);
(r, g, b)
}
#[inline(always)]
unsafe fn load_and_matmul_8px<const BE: bool>(
p: *const u8,
m: &[[f32; 3]; 3],
) -> ((v128, v128), (v128, v128), (v128, v128)) {
unsafe {
let v0 = load_endian_u16x8::<BE>(p);
let v1 = load_endian_u16x8::<BE>(p.add(16));
let v2 = load_endian_u16x8::<BE>(p.add(32));
let (x_u, y_u, z_u) = deinterleave_xyz12_8px(v0, v1, v2);
let mask = u16x8_splat(SAMPLE_MASK_U16);
let x_shr = v128_and(u16x8_shr(x_u, 4), mask);
let y_shr = v128_and(u16x8_shr(y_u, 4), mask);
let z_shr = v128_and(u16x8_shr(z_u, 4), mask);
let (x_lo, x_hi) = u16x8_to_f32x4_pair(x_shr);
let (y_lo, y_hi) = u16x8_to_f32x4_pair(y_shr);
let (z_lo, z_hi) = u16x8_to_f32x4_pair(z_shr);
let x_lo = smpte428_inv_oetf_scalar4(x_lo);
let x_hi = smpte428_inv_oetf_scalar4(x_hi);
let y_lo = smpte428_inv_oetf_scalar4(y_lo);
let y_hi = smpte428_inv_oetf_scalar4(y_hi);
let z_lo = smpte428_inv_oetf_scalar4(z_lo);
let z_hi = smpte428_inv_oetf_scalar4(z_hi);
let (r_lo, g_lo, b_lo) = matmul_xyz_to_rgb_4lane(m, x_lo, y_lo, z_lo);
let (r_hi, g_hi, b_hi) = matmul_xyz_to_rgb_4lane(m, x_hi, y_hi, z_hi);
((r_lo, r_hi), (g_lo, g_hi), (b_lo, b_hi))
}
}
#[inline(always)]
unsafe fn load_xyz_linear_8px<const BE: bool>(
p: *const u8,
) -> ((v128, v128), (v128, v128), (v128, v128)) {
unsafe {
let v0 = load_endian_u16x8::<BE>(p);
let v1 = load_endian_u16x8::<BE>(p.add(16));
let v2 = load_endian_u16x8::<BE>(p.add(32));
let (x_u, y_u, z_u) = deinterleave_xyz12_8px(v0, v1, v2);
let mask = u16x8_splat(SAMPLE_MASK_U16);
let x_shr = v128_and(u16x8_shr(x_u, 4), mask);
let y_shr = v128_and(u16x8_shr(y_u, 4), mask);
let z_shr = v128_and(u16x8_shr(z_u, 4), mask);
let (x_lo, x_hi) = u16x8_to_f32x4_pair(x_shr);
let (y_lo, y_hi) = u16x8_to_f32x4_pair(y_shr);
let (z_lo, z_hi) = u16x8_to_f32x4_pair(z_shr);
(
(
smpte428_inv_oetf_scalar4(x_lo),
smpte428_inv_oetf_scalar4(x_hi),
),
(
smpte428_inv_oetf_scalar4(y_lo),
smpte428_inv_oetf_scalar4(y_hi),
),
(
smpte428_inv_oetf_scalar4(z_lo),
smpte428_inv_oetf_scalar4(z_hi),
),
)
}
}
#[inline(always)]
unsafe fn clamp_scale_to_i32x4(v: v128, zero: v128, one: v128, scale: v128) -> v128 {
let half = f32x4_splat(0.5);
let clamped = f32x4_min(f32x4_max(v, zero), one);
let scaled = f32x4_add(f32x4_mul(clamped, scale), half);
i32x4_trunc_sat_f32x4(scaled)
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn xyz12_to_rgb_row<const BE: bool>(
xyz: &[u16],
rgb_out: &mut [u8],
width: usize,
target_gamut: DcpTargetGamut,
) {
debug_assert!(xyz.len() >= width * 3, "xyz row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
let m = xyz_to_rgb_matrix(target_gamut);
unsafe {
let zero_ps = f32x4_splat(0.0);
let one_ps = f32x4_splat(1.0);
let scale = f32x4_splat(255.0);
let zero_si = i32x4_splat(0);
let mut x = 0usize;
while x + PIXELS_PER_ITER <= width {
let p = xyz.as_ptr().add(x * 3) as *const u8;
let ((r_lo, r_hi), (g_lo, g_hi), (b_lo, b_hi)) = load_and_matmul_8px::<BE>(p, &m);
let r_lo = oetf_srgb_scalar4(r_lo);
let r_hi = oetf_srgb_scalar4(r_hi);
let g_lo = oetf_srgb_scalar4(g_lo);
let g_hi = oetf_srgb_scalar4(g_hi);
let b_lo = oetf_srgb_scalar4(b_lo);
let b_hi = oetf_srgb_scalar4(b_hi);
let r_lo_i = clamp_scale_to_i32x4(r_lo, zero_ps, one_ps, scale);
let r_hi_i = clamp_scale_to_i32x4(r_hi, zero_ps, one_ps, scale);
let g_lo_i = clamp_scale_to_i32x4(g_lo, zero_ps, one_ps, scale);
let g_hi_i = clamp_scale_to_i32x4(g_hi, zero_ps, one_ps, scale);
let b_lo_i = clamp_scale_to_i32x4(b_lo, zero_ps, one_ps, scale);
let b_hi_i = clamp_scale_to_i32x4(b_hi, zero_ps, one_ps, scale);
let r_u16 = u16x8_narrow_i32x4(r_lo_i, r_hi_i);
let g_u16 = u16x8_narrow_i32x4(g_lo_i, g_hi_i);
let b_u16 = u16x8_narrow_i32x4(b_lo_i, b_hi_i);
let r_u8 = u8x16_narrow_i16x8(r_u16, zero_si);
let g_u8 = u8x16_narrow_i16x8(g_u16, zero_si);
let b_u8 = u8x16_narrow_i16x8(b_u16, zero_si);
let mut tmp_r = [0u8; 16];
let mut tmp_g = [0u8; 16];
let mut tmp_b = [0u8; 16];
v128_store(tmp_r.as_mut_ptr() as *mut v128, r_u8);
v128_store(tmp_g.as_mut_ptr() as *mut v128, g_u8);
v128_store(tmp_b.as_mut_ptr() as *mut v128, b_u8);
let dst = rgb_out.as_mut_ptr().add(x * 3);
for i in 0..PIXELS_PER_ITER {
*dst.add(i * 3) = tmp_r[i];
*dst.add(i * 3 + 1) = tmp_g[i];
*dst.add(i * 3 + 2) = tmp_b[i];
}
x += PIXELS_PER_ITER;
}
if x < width {
scalar::xyz12::xyz12_to_rgb_row::<BE>(
&xyz[x * 3..width * 3],
&mut rgb_out[x * 3..width * 3],
width - x,
target_gamut,
);
}
}
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn xyz12_to_rgba_row<const BE: bool>(
xyz: &[u16],
rgba_out: &mut [u8],
width: usize,
target_gamut: DcpTargetGamut,
) {
debug_assert!(xyz.len() >= width * 3, "xyz row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
let m = xyz_to_rgb_matrix(target_gamut);
unsafe {
let zero_ps = f32x4_splat(0.0);
let one_ps = f32x4_splat(1.0);
let scale = f32x4_splat(255.0);
let zero_si = i32x4_splat(0);
let mut x = 0usize;
while x + PIXELS_PER_ITER <= width {
let p = xyz.as_ptr().add(x * 3) as *const u8;
let ((r_lo, r_hi), (g_lo, g_hi), (b_lo, b_hi)) = load_and_matmul_8px::<BE>(p, &m);
let r_lo = oetf_srgb_scalar4(r_lo);
let r_hi = oetf_srgb_scalar4(r_hi);
let g_lo = oetf_srgb_scalar4(g_lo);
let g_hi = oetf_srgb_scalar4(g_hi);
let b_lo = oetf_srgb_scalar4(b_lo);
let b_hi = oetf_srgb_scalar4(b_hi);
let r_lo_i = clamp_scale_to_i32x4(r_lo, zero_ps, one_ps, scale);
let r_hi_i = clamp_scale_to_i32x4(r_hi, zero_ps, one_ps, scale);
let g_lo_i = clamp_scale_to_i32x4(g_lo, zero_ps, one_ps, scale);
let g_hi_i = clamp_scale_to_i32x4(g_hi, zero_ps, one_ps, scale);
let b_lo_i = clamp_scale_to_i32x4(b_lo, zero_ps, one_ps, scale);
let b_hi_i = clamp_scale_to_i32x4(b_hi, zero_ps, one_ps, scale);
let r_u16 = u16x8_narrow_i32x4(r_lo_i, r_hi_i);
let g_u16 = u16x8_narrow_i32x4(g_lo_i, g_hi_i);
let b_u16 = u16x8_narrow_i32x4(b_lo_i, b_hi_i);
let r_u8 = u8x16_narrow_i16x8(r_u16, zero_si);
let g_u8 = u8x16_narrow_i16x8(g_u16, zero_si);
let b_u8 = u8x16_narrow_i16x8(b_u16, zero_si);
let mut tmp_r = [0u8; 16];
let mut tmp_g = [0u8; 16];
let mut tmp_b = [0u8; 16];
v128_store(tmp_r.as_mut_ptr() as *mut v128, r_u8);
v128_store(tmp_g.as_mut_ptr() as *mut v128, g_u8);
v128_store(tmp_b.as_mut_ptr() as *mut v128, b_u8);
let dst = rgba_out.as_mut_ptr().add(x * 4);
for i in 0..PIXELS_PER_ITER {
*dst.add(i * 4) = tmp_r[i];
*dst.add(i * 4 + 1) = tmp_g[i];
*dst.add(i * 4 + 2) = tmp_b[i];
*dst.add(i * 4 + 3) = 0xFF;
}
x += PIXELS_PER_ITER;
}
if x < width {
scalar::xyz12::xyz12_to_rgba_row::<BE>(
&xyz[x * 3..width * 3],
&mut rgba_out[x * 4..width * 4],
width - x,
target_gamut,
);
}
}
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn xyz12_to_rgb_u16_row<const BE: bool>(
xyz: &[u16],
rgb_out: &mut [u16],
width: usize,
target_gamut: DcpTargetGamut,
) {
debug_assert!(xyz.len() >= width * 3, "xyz row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
let m = xyz_to_rgb_matrix(target_gamut);
unsafe {
let zero_ps = f32x4_splat(0.0);
let one_ps = f32x4_splat(1.0);
let scale = f32x4_splat(65535.0);
let mut x = 0usize;
while x + PIXELS_PER_ITER <= width {
let p = xyz.as_ptr().add(x * 3) as *const u8;
let ((r_lo, r_hi), (g_lo, g_hi), (b_lo, b_hi)) = load_and_matmul_8px::<BE>(p, &m);
let r_lo = oetf_srgb_scalar4(r_lo);
let r_hi = oetf_srgb_scalar4(r_hi);
let g_lo = oetf_srgb_scalar4(g_lo);
let g_hi = oetf_srgb_scalar4(g_hi);
let b_lo = oetf_srgb_scalar4(b_lo);
let b_hi = oetf_srgb_scalar4(b_hi);
let r_lo_i = clamp_scale_to_i32x4(r_lo, zero_ps, one_ps, scale);
let r_hi_i = clamp_scale_to_i32x4(r_hi, zero_ps, one_ps, scale);
let g_lo_i = clamp_scale_to_i32x4(g_lo, zero_ps, one_ps, scale);
let g_hi_i = clamp_scale_to_i32x4(g_hi, zero_ps, one_ps, scale);
let b_lo_i = clamp_scale_to_i32x4(b_lo, zero_ps, one_ps, scale);
let b_hi_i = clamp_scale_to_i32x4(b_hi, zero_ps, one_ps, scale);
let r_u16 = u16x8_narrow_i32x4(r_lo_i, r_hi_i);
let g_u16 = u16x8_narrow_i32x4(g_lo_i, g_hi_i);
let b_u16 = u16x8_narrow_i32x4(b_lo_i, b_hi_i);
let mut tmp_r = [0u16; 8];
let mut tmp_g = [0u16; 8];
let mut tmp_b = [0u16; 8];
v128_store(tmp_r.as_mut_ptr() as *mut v128, r_u16);
v128_store(tmp_g.as_mut_ptr() as *mut v128, g_u16);
v128_store(tmp_b.as_mut_ptr() as *mut v128, b_u16);
let dst = rgb_out.as_mut_ptr().add(x * 3);
for i in 0..PIXELS_PER_ITER {
*dst.add(i * 3) = tmp_r[i];
*dst.add(i * 3 + 1) = tmp_g[i];
*dst.add(i * 3 + 2) = tmp_b[i];
}
x += PIXELS_PER_ITER;
}
if x < width {
scalar::xyz12::xyz12_to_rgb_u16_row::<BE>(
&xyz[x * 3..width * 3],
&mut rgb_out[x * 3..width * 3],
width - x,
target_gamut,
);
}
}
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn xyz12_to_rgba_u16_row<const BE: bool>(
xyz: &[u16],
rgba_out: &mut [u16],
width: usize,
target_gamut: DcpTargetGamut,
) {
debug_assert!(xyz.len() >= width * 3, "xyz row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
let m = xyz_to_rgb_matrix(target_gamut);
unsafe {
let zero_ps = f32x4_splat(0.0);
let one_ps = f32x4_splat(1.0);
let scale = f32x4_splat(65535.0);
let mut x = 0usize;
while x + PIXELS_PER_ITER <= width {
let p = xyz.as_ptr().add(x * 3) as *const u8;
let ((r_lo, r_hi), (g_lo, g_hi), (b_lo, b_hi)) = load_and_matmul_8px::<BE>(p, &m);
let r_lo = oetf_srgb_scalar4(r_lo);
let r_hi = oetf_srgb_scalar4(r_hi);
let g_lo = oetf_srgb_scalar4(g_lo);
let g_hi = oetf_srgb_scalar4(g_hi);
let b_lo = oetf_srgb_scalar4(b_lo);
let b_hi = oetf_srgb_scalar4(b_hi);
let r_lo_i = clamp_scale_to_i32x4(r_lo, zero_ps, one_ps, scale);
let r_hi_i = clamp_scale_to_i32x4(r_hi, zero_ps, one_ps, scale);
let g_lo_i = clamp_scale_to_i32x4(g_lo, zero_ps, one_ps, scale);
let g_hi_i = clamp_scale_to_i32x4(g_hi, zero_ps, one_ps, scale);
let b_lo_i = clamp_scale_to_i32x4(b_lo, zero_ps, one_ps, scale);
let b_hi_i = clamp_scale_to_i32x4(b_hi, zero_ps, one_ps, scale);
let r_u16 = u16x8_narrow_i32x4(r_lo_i, r_hi_i);
let g_u16 = u16x8_narrow_i32x4(g_lo_i, g_hi_i);
let b_u16 = u16x8_narrow_i32x4(b_lo_i, b_hi_i);
let mut tmp_r = [0u16; 8];
let mut tmp_g = [0u16; 8];
let mut tmp_b = [0u16; 8];
v128_store(tmp_r.as_mut_ptr() as *mut v128, r_u16);
v128_store(tmp_g.as_mut_ptr() as *mut v128, g_u16);
v128_store(tmp_b.as_mut_ptr() as *mut v128, b_u16);
let dst = rgba_out.as_mut_ptr().add(x * 4);
for i in 0..PIXELS_PER_ITER {
*dst.add(i * 4) = tmp_r[i];
*dst.add(i * 4 + 1) = tmp_g[i];
*dst.add(i * 4 + 2) = tmp_b[i];
*dst.add(i * 4 + 3) = 0xFFFF;
}
x += PIXELS_PER_ITER;
}
if x < width {
scalar::xyz12::xyz12_to_rgba_u16_row::<BE>(
&xyz[x * 3..width * 3],
&mut rgba_out[x * 4..width * 4],
width - x,
target_gamut,
);
}
}
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn xyz12_to_rgb_f32_row<const BE: bool>(
xyz: &[u16],
rgb_out: &mut [f32],
width: usize,
target_gamut: DcpTargetGamut,
) {
debug_assert!(xyz.len() >= width * 3, "xyz row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
let m = xyz_to_rgb_matrix(target_gamut);
unsafe {
let mut x = 0usize;
while x + PIXELS_PER_ITER <= width {
let p = xyz.as_ptr().add(x * 3) as *const u8;
let ((r_lo, r_hi), (g_lo, g_hi), (b_lo, b_hi)) = load_and_matmul_8px::<BE>(p, &m);
let mut rb = [0.0_f32; 8];
let mut gb = [0.0_f32; 8];
let mut bb = [0.0_f32; 8];
v128_store(rb.as_mut_ptr() as *mut v128, r_lo);
v128_store(rb.as_mut_ptr().add(4) as *mut v128, r_hi);
v128_store(gb.as_mut_ptr() as *mut v128, g_lo);
v128_store(gb.as_mut_ptr().add(4) as *mut v128, g_hi);
v128_store(bb.as_mut_ptr() as *mut v128, b_lo);
v128_store(bb.as_mut_ptr().add(4) as *mut v128, b_hi);
let dst = rgb_out.as_mut_ptr().add(x * 3);
for i in 0..PIXELS_PER_ITER {
*dst.add(i * 3) = rb[i];
*dst.add(i * 3 + 1) = gb[i];
*dst.add(i * 3 + 2) = bb[i];
}
x += PIXELS_PER_ITER;
}
if x < width {
scalar::xyz12::xyz12_to_rgb_f32_row::<BE>(
&xyz[x * 3..width * 3],
&mut rgb_out[x * 3..width * 3],
width - x,
target_gamut,
);
}
}
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn xyz12_to_xyz_f32_row<const BE: bool>(
xyz: &[u16],
xyz_out: &mut [f32],
width: usize,
) {
debug_assert!(xyz.len() >= width * 3, "xyz row too short");
debug_assert!(xyz_out.len() >= width * 3, "xyz_out row too short");
unsafe {
let mut x = 0usize;
while x + PIXELS_PER_ITER <= width {
let p = xyz.as_ptr().add(x * 3) as *const u8;
let ((x_lo, x_hi), (y_lo, y_hi), (z_lo, z_hi)) = load_xyz_linear_8px::<BE>(p);
let mut xb = [0.0_f32; 8];
let mut yb = [0.0_f32; 8];
let mut zb = [0.0_f32; 8];
v128_store(xb.as_mut_ptr() as *mut v128, x_lo);
v128_store(xb.as_mut_ptr().add(4) as *mut v128, x_hi);
v128_store(yb.as_mut_ptr() as *mut v128, y_lo);
v128_store(yb.as_mut_ptr().add(4) as *mut v128, y_hi);
v128_store(zb.as_mut_ptr() as *mut v128, z_lo);
v128_store(zb.as_mut_ptr().add(4) as *mut v128, z_hi);
let dst = xyz_out.as_mut_ptr().add(x * 3);
for i in 0..PIXELS_PER_ITER {
*dst.add(i * 3) = xb[i];
*dst.add(i * 3 + 1) = yb[i];
*dst.add(i * 3 + 2) = zb[i];
}
x += PIXELS_PER_ITER;
}
if x < width {
scalar::xyz12::xyz12_to_xyz_f32_row::<BE>(
&xyz[x * 3..width * 3],
&mut xyz_out[x * 3..width * 3],
width - x,
);
}
}
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn xyz12_to_rgb_f16_row<const BE: bool>(
xyz: &[u16],
rgb_out: &mut [half::f16],
width: usize,
target_gamut: DcpTargetGamut,
) {
debug_assert!(xyz.len() >= width * 3, "xyz row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
let m = xyz_to_rgb_matrix(target_gamut);
unsafe {
let zero_ps = f32x4_splat(0.0);
let one_ps = f32x4_splat(1.0);
let mut x = 0usize;
while x + PIXELS_PER_ITER <= width {
let p = xyz.as_ptr().add(x * 3) as *const u8;
let ((r_lo, r_hi), (g_lo, g_hi), (b_lo, b_hi)) = load_and_matmul_8px::<BE>(p, &m);
let r_lo = oetf_srgb_scalar4(r_lo);
let r_hi = oetf_srgb_scalar4(r_hi);
let g_lo = oetf_srgb_scalar4(g_lo);
let g_hi = oetf_srgb_scalar4(g_hi);
let b_lo = oetf_srgb_scalar4(b_lo);
let b_hi = oetf_srgb_scalar4(b_hi);
let r_lo = f32x4_min(f32x4_max(r_lo, zero_ps), one_ps);
let r_hi = f32x4_min(f32x4_max(r_hi, zero_ps), one_ps);
let g_lo = f32x4_min(f32x4_max(g_lo, zero_ps), one_ps);
let g_hi = f32x4_min(f32x4_max(g_hi, zero_ps), one_ps);
let b_lo = f32x4_min(f32x4_max(b_lo, zero_ps), one_ps);
let b_hi = f32x4_min(f32x4_max(b_hi, zero_ps), one_ps);
let mut rb = [0.0_f32; 8];
let mut gb = [0.0_f32; 8];
let mut bb = [0.0_f32; 8];
v128_store(rb.as_mut_ptr() as *mut v128, r_lo);
v128_store(rb.as_mut_ptr().add(4) as *mut v128, r_hi);
v128_store(gb.as_mut_ptr() as *mut v128, g_lo);
v128_store(gb.as_mut_ptr().add(4) as *mut v128, g_hi);
v128_store(bb.as_mut_ptr() as *mut v128, b_lo);
v128_store(bb.as_mut_ptr().add(4) as *mut v128, b_hi);
for i in 0..PIXELS_PER_ITER {
let oi = (x + i) * 3;
rgb_out[oi] = half::f16::from_f32(rb[i]);
rgb_out[oi + 1] = half::f16::from_f32(gb[i]);
rgb_out[oi + 2] = half::f16::from_f32(bb[i]);
}
x += PIXELS_PER_ITER;
}
if x < width {
scalar::xyz12::xyz12_to_rgb_f16_row::<BE>(
&xyz[x * 3..width * 3],
&mut rgb_out[x * 3..width * 3],
width - x,
target_gamut,
);
}
}
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn xyz12_to_rgba_f16_row<const BE: bool>(
xyz: &[u16],
rgba_out: &mut [half::f16],
width: usize,
target_gamut: DcpTargetGamut,
) {
debug_assert!(xyz.len() >= width * 3, "xyz row too short");
debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
let m = xyz_to_rgb_matrix(target_gamut);
let one_f16 = half::f16::from_f32(1.0);
unsafe {
let zero_ps = f32x4_splat(0.0);
let one_ps = f32x4_splat(1.0);
let mut x = 0usize;
while x + PIXELS_PER_ITER <= width {
let p = xyz.as_ptr().add(x * 3) as *const u8;
let ((r_lo, r_hi), (g_lo, g_hi), (b_lo, b_hi)) = load_and_matmul_8px::<BE>(p, &m);
let r_lo = oetf_srgb_scalar4(r_lo);
let r_hi = oetf_srgb_scalar4(r_hi);
let g_lo = oetf_srgb_scalar4(g_lo);
let g_hi = oetf_srgb_scalar4(g_hi);
let b_lo = oetf_srgb_scalar4(b_lo);
let b_hi = oetf_srgb_scalar4(b_hi);
let r_lo = f32x4_min(f32x4_max(r_lo, zero_ps), one_ps);
let r_hi = f32x4_min(f32x4_max(r_hi, zero_ps), one_ps);
let g_lo = f32x4_min(f32x4_max(g_lo, zero_ps), one_ps);
let g_hi = f32x4_min(f32x4_max(g_hi, zero_ps), one_ps);
let b_lo = f32x4_min(f32x4_max(b_lo, zero_ps), one_ps);
let b_hi = f32x4_min(f32x4_max(b_hi, zero_ps), one_ps);
let mut rb = [0.0_f32; 8];
let mut gb = [0.0_f32; 8];
let mut bb = [0.0_f32; 8];
v128_store(rb.as_mut_ptr() as *mut v128, r_lo);
v128_store(rb.as_mut_ptr().add(4) as *mut v128, r_hi);
v128_store(gb.as_mut_ptr() as *mut v128, g_lo);
v128_store(gb.as_mut_ptr().add(4) as *mut v128, g_hi);
v128_store(bb.as_mut_ptr() as *mut v128, b_lo);
v128_store(bb.as_mut_ptr().add(4) as *mut v128, b_hi);
for i in 0..PIXELS_PER_ITER {
let oi = (x + i) * 4;
rgba_out[oi] = half::f16::from_f32(rb[i]);
rgba_out[oi + 1] = half::f16::from_f32(gb[i]);
rgba_out[oi + 2] = half::f16::from_f32(bb[i]);
rgba_out[oi + 3] = one_f16;
}
x += PIXELS_PER_ITER;
}
if x < width {
scalar::xyz12::xyz12_to_rgba_f16_row::<BE>(
&xyz[x * 3..width * 3],
&mut rgba_out[x * 4..width * 4],
width - x,
target_gamut,
);
}
}
}