use core::arch::aarch64::*;
use crate::{
DcpTargetGamut,
row::scalar::{
self,
xyz12::{oetf_srgb, smpte428_inverse_oetf},
xyz12_constants::xyz_to_rgb_matrix,
},
};
/// Number of pixels handled per vector iteration; the row kernels advance
/// their pixel index by this amount and stage per-lane values in arrays of
/// this length.
const LANES: usize = 4;
/// Mask that keeps the low 12 bits of a `u16`; applied after the right shift
/// by four in `shift_mask_widen_cvt`.
const SAMPLE_MASK_U16: u16 = 0x0FFF;
/// `true` when compiling for a big-endian target. The row kernels compare
/// their `BE` parameter against this and only take the vector path when the
/// two are equal.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Loads four interleaved three-component pixels (12 consecutive `u16`
/// values) starting at `p` and returns them de-interleaved: one 4-lane vector
/// per component, in component order.
///
/// # Safety
///
/// `p` must be valid for reading 12 `u16` values.
#[inline(always)]
unsafe fn load_xyz4(p: *const u16) -> (uint16x4_t, uint16x4_t, uint16x4_t) {
    // SAFETY: the caller guarantees `p` points at 12 readable `u16` values.
    let uint16x4x3_t(first, second, third) = unsafe { vld3_u16(p) };
    (first, second, third)
}
/// Turns four raw `u16` samples into `f32` code values.
///
/// Each lane is shifted right by four bits, masked with `SAMPLE_MASK_U16`,
/// zero-extended to 32 bits and converted to `f32`.
///
/// # Safety
///
/// Only performs register-to-register NEON operations; it is `unsafe` so it
/// can be used uniformly with the other helpers in this file.
#[inline(always)]
unsafe fn shift_mask_widen_cvt(v: uint16x4_t) -> float32x4_t {
    // SAFETY: pure lane-wise arithmetic, no memory access.
    unsafe {
        let low_bits = vand_u16(vshr_n_u16::<4>(v), vdup_n_u16(SAMPLE_MASK_U16));
        vcvtq_f32_u32(vmovl_u16(low_bits))
    }
}
/// Applies the scalar `smpte428_inverse_oetf` to each of the four lanes.
///
/// The lanes are spilled to a stack array, each value is cast to `u16` and
/// passed through the scalar function, and the results are reloaded into a
/// vector in the same lane order.
///
/// # Safety
///
/// Touches only a local stack buffer; it is `unsafe` because it uses NEON
/// load/store intrinsics.
#[inline(always)]
unsafe fn smpte428_inv_oetf_scalar4(v: float32x4_t) -> float32x4_t {
    // SAFETY: `codes` and `linear` are local arrays of exactly `LANES` (4)
    // `f32` values, matching the 128-bit store and load.
    unsafe {
        let mut codes = [0.0_f32; LANES];
        vst1q_f32(codes.as_mut_ptr(), v);
        let linear = codes.map(|code| smpte428_inverse_oetf(code as u16));
        vld1q_f32(linear.as_ptr())
    }
}
/// Applies the scalar `oetf_srgb` to each of the four lanes.
///
/// The vector is spilled to a stack array, transformed in place lane by lane,
/// and reloaded in the same lane order.
///
/// # Safety
///
/// Touches only a local stack buffer; it is `unsafe` because it uses NEON
/// load/store intrinsics.
#[inline(always)]
unsafe fn oetf_srgb_scalar4(v: float32x4_t) -> float32x4_t {
    // SAFETY: `lanes` is a local array of exactly `LANES` (4) `f32` values,
    // matching the 128-bit store and load.
    unsafe {
        let mut lanes = [0.0_f32; LANES];
        vst1q_f32(lanes.as_mut_ptr(), v);
        lanes.iter_mut().for_each(|lane| *lane = oetf_srgb(*lane));
        vld1q_f32(lanes.as_ptr())
    }
}
/// Multiplies four pixels by the 3x3 matrix `m`, one pixel per lane.
///
/// For each matrix row `[a, b, c]` the output lane is `(a*x + b*y) + c*z`,
/// computed with separate multiplies and adds in exactly that order. Returns
/// the three output channels in row order.
///
/// # Safety
///
/// Only performs register-to-register NEON operations; it is `unsafe` so it
/// can be used uniformly with the other helpers in this file.
#[inline(always)]
unsafe fn matmul_xyz_to_rgb(
    m: &[[f32; 3]; 3],
    x: float32x4_t,
    y: float32x4_t,
    z: float32x4_t,
) -> (float32x4_t, float32x4_t, float32x4_t) {
    let [r_row, g_row, b_row] = m;
    // SAFETY: pure lane-wise arithmetic, no memory access.
    unsafe {
        // One output channel: broadcast each coefficient, then accumulate
        // the x and y products first and add the z product last.
        let dot = |coef: &[f32; 3]| {
            let xy = vaddq_f32(
                vmulq_f32(vdupq_n_f32(coef[0]), x),
                vmulq_f32(vdupq_n_f32(coef[1]), y),
            );
            vaddq_f32(xy, vmulq_f32(vdupq_n_f32(coef[2]), z))
        };
        (dot(r_row), dot(g_row), dot(b_row))
    }
}
/// Loads four pixels from `p`, decodes each component with
/// `smpte428_inv_oetf_scalar4`, and multiplies the result by `m`.
///
/// The load-and-decode step is the one implemented by `load_xyz_linear`; the
/// decoded vectors are then passed to `matmul_xyz_to_rgb`.
///
/// `BE` has no effect on the loaded data here: no byte swap is performed.
/// The row kernels in this file only call this when `BE == HOST_NATIVE_BE`.
///
/// # Safety
///
/// `p` must be valid for reading 12 `u16` values.
#[inline(always)]
unsafe fn load_and_matmul<const BE: bool>(
    p: *const u16,
    m: &[[f32; 3]; 3],
) -> (float32x4_t, float32x4_t, float32x4_t) {
    // SAFETY: the caller's guarantee on `p` is forwarded unchanged.
    unsafe {
        let (x_lin, y_lin, z_lin) = load_xyz_linear::<BE>(p);
        matmul_xyz_to_rgb(m, x_lin, y_lin, z_lin)
    }
}
/// Loads four pixels from `p` and returns their three components after
/// `shift_mask_widen_cvt` followed by `smpte428_inv_oetf_scalar4`.
///
/// `BE` has no effect on the loaded data here: no byte swap is performed.
/// The row kernels in this file only call this when `BE == HOST_NATIVE_BE`.
///
/// # Safety
///
/// `p` must be valid for reading 12 `u16` values.
#[inline(always)]
unsafe fn load_xyz_linear<const BE: bool>(
    p: *const u16,
) -> (float32x4_t, float32x4_t, float32x4_t) {
    // `BE` is intentionally unused; keep it referenced so that is explicit.
    let _ = BE;
    // SAFETY: the caller guarantees `p` points at 12 readable `u16` values.
    unsafe {
        let decode = |raw: uint16x4_t| smpte428_inv_oetf_scalar4(shift_mask_widen_cvt(raw));
        let (raw_x, raw_y, raw_z) = load_xyz4(p);
        (decode(raw_x), decode(raw_y), decode(raw_z))
    }
}
/// Clamps each lane to `[0, 1]`, multiplies by `scale`, adds 0.5 and converts
/// to an unsigned integer, then narrows to `u16` with saturation.
///
/// # Safety
///
/// Only performs register-to-register NEON operations; it is `unsafe` so it
/// can be used uniformly with the other helpers in this file.
#[inline(always)]
unsafe fn clamp_scale_to_u16x4(v: float32x4_t, scale: float32x4_t) -> uint16x4_t {
    // SAFETY: pure lane-wise arithmetic, no memory access.
    unsafe {
        let unit = vminq_f32(vmaxq_f32(v, vdupq_n_f32(0.0)), vdupq_n_f32(1.0));
        // Adding 0.5 before the float-to-integer conversion rounds half up.
        let biased = vaddq_f32(vmulq_f32(unit, scale), vdupq_n_f32(0.5));
        vqmovn_u32(vcvtq_u32_f32(biased))
    }
}
/// Converts one row of XYZ12 samples to interleaved 8-bit RGB.
///
/// Whole groups of `LANES` pixels take the NEON path when `BE` equals
/// `HOST_NATIVE_BE`; all remaining pixels (the entire row when the two
/// differ) are delegated to `scalar::xyz12::xyz12_to_rgb_row`.
///
/// Vector path per group: `load_and_matmul`, `oetf_srgb_scalar4`, then
/// `clamp_scale_to_u16x4` with a scale of 255, narrowed to `u8`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xyz` and `rgb_out` must each hold at least `width * 3` elements. This
///   is only checked by `debug_assert!`; the vector path reads through a raw
///   pointer and writes with `get_unchecked_mut`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xyz12_to_rgb_row<const BE: bool>(
    xyz: &[u16],
    rgb_out: &mut [u8],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel that is NOT covered by a full vector group.
    let simd_end = if BE == HOST_NATIVE_BE {
        width / LANES * LANES
    } else {
        0
    };
    // SAFETY: every vector group starts at `px <= width - LANES`, so the 12
    // elements read and written per group lie inside the first `width * 3`
    // elements the caller guarantees for both slices.
    unsafe {
        let scale = vdupq_n_f32(255.0);
        for px in (0..simd_end).step_by(LANES) {
            let (r, g, b) = load_and_matmul::<BE>(xyz.as_ptr().add(px * 3), &matrix);
            let r = clamp_scale_to_u16x4(oetf_srgb_scalar4(r), scale);
            let g = clamp_scale_to_u16x4(oetf_srgb_scalar4(g), scale);
            let b = clamp_scale_to_u16x4(oetf_srgb_scalar4(b), scale);
            // Duplicate each 4-lane vector to fill the 8-lane narrow; only
            // the first four pixels of the interleaved store are kept.
            let planes = uint8x8x3_t(
                vqmovn_u16(vcombine_u16(r, r)),
                vqmovn_u16(vcombine_u16(g, g)),
                vqmovn_u16(vcombine_u16(b, b)),
            );
            let mut staged = [0u8; 24];
            vst3_u8(staged.as_mut_ptr(), planes);
            let base = px * 3;
            rgb_out
                .get_unchecked_mut(base..base + 12)
                .copy_from_slice(&staged[..12]);
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgb_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgb_out[simd_end * 3..width * 3],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of XYZ12 samples to interleaved 8-bit RGBA with alpha
/// set to `0xFF`.
///
/// Whole groups of `LANES` pixels take the NEON path when `BE` equals
/// `HOST_NATIVE_BE`; all remaining pixels (the entire row when the two
/// differ) are delegated to `scalar::xyz12::xyz12_to_rgba_row`.
///
/// Vector path per group: `load_and_matmul`, `oetf_srgb_scalar4`, then
/// `clamp_scale_to_u16x4` with a scale of 255, narrowed to `u8`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xyz` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`. This is only checked by `debug_assert!`; the vector path
///   reads through a raw pointer and writes with `get_unchecked_mut`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xyz12_to_rgba_row<const BE: bool>(
    xyz: &[u16],
    rgba_out: &mut [u8],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel that is NOT covered by a full vector group.
    let simd_end = if BE == HOST_NATIVE_BE {
        width / LANES * LANES
    } else {
        0
    };
    // SAFETY: every vector group starts at `px <= width - LANES`, so the 12
    // input elements and 16 output bytes per group lie inside the lengths the
    // caller guarantees.
    unsafe {
        let scale = vdupq_n_f32(255.0);
        let opaque = vdup_n_u8(0xFF);
        for px in (0..simd_end).step_by(LANES) {
            let (r, g, b) = load_and_matmul::<BE>(xyz.as_ptr().add(px * 3), &matrix);
            let r = clamp_scale_to_u16x4(oetf_srgb_scalar4(r), scale);
            let g = clamp_scale_to_u16x4(oetf_srgb_scalar4(g), scale);
            let b = clamp_scale_to_u16x4(oetf_srgb_scalar4(b), scale);
            // Duplicate each 4-lane vector to fill the 8-lane narrow; only
            // the first four pixels of the interleaved store are kept.
            let planes = uint8x8x4_t(
                vqmovn_u16(vcombine_u16(r, r)),
                vqmovn_u16(vcombine_u16(g, g)),
                vqmovn_u16(vcombine_u16(b, b)),
                opaque,
            );
            let mut staged = [0u8; 32];
            vst4_u8(staged.as_mut_ptr(), planes);
            let base = px * 4;
            rgba_out
                .get_unchecked_mut(base..base + 16)
                .copy_from_slice(&staged[..16]);
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgba_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgba_out[simd_end * 4..width * 4],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of XYZ12 samples to interleaved 16-bit RGB.
///
/// Whole groups of `LANES` pixels take the NEON path when `BE` equals
/// `HOST_NATIVE_BE`; all remaining pixels (the entire row when the two
/// differ) are delegated to `scalar::xyz12::xyz12_to_rgb_u16_row`.
///
/// Vector path per group: `load_and_matmul`, `oetf_srgb_scalar4`, then
/// `clamp_scale_to_u16x4` with a scale of 65535, stored interleaved.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xyz` and `rgb_out` must each hold at least `width * 3` elements. This
///   is only checked by `debug_assert!`; the vector path reads and writes
///   through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xyz12_to_rgb_u16_row<const BE: bool>(
    xyz: &[u16],
    rgb_out: &mut [u16],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel that is NOT covered by a full vector group.
    let simd_end = if BE == HOST_NATIVE_BE {
        width / LANES * LANES
    } else {
        0
    };
    // SAFETY: every vector group starts at `px <= width - LANES`, so the 12
    // elements read and written per group lie inside the first `width * 3`
    // elements the caller guarantees for both slices.
    unsafe {
        let scale = vdupq_n_f32(65535.0);
        for px in (0..simd_end).step_by(LANES) {
            let offset = px * 3;
            let (r, g, b) = load_and_matmul::<BE>(xyz.as_ptr().add(offset), &matrix);
            let planes = uint16x4x3_t(
                clamp_scale_to_u16x4(oetf_srgb_scalar4(r), scale),
                clamp_scale_to_u16x4(oetf_srgb_scalar4(g), scale),
                clamp_scale_to_u16x4(oetf_srgb_scalar4(b), scale),
            );
            vst3_u16(rgb_out.as_mut_ptr().add(offset), planes);
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgb_u16_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgb_out[simd_end * 3..width * 3],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of XYZ12 samples to interleaved 16-bit RGBA with alpha
/// set to `0xFFFF`.
///
/// Whole groups of `LANES` pixels take the NEON path when `BE` equals
/// `HOST_NATIVE_BE`; all remaining pixels (the entire row when the two
/// differ) are delegated to `scalar::xyz12::xyz12_to_rgba_u16_row`.
///
/// Vector path per group: `load_and_matmul`, `oetf_srgb_scalar4`, then
/// `clamp_scale_to_u16x4` with a scale of 65535, stored interleaved.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xyz` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`. This is only checked by `debug_assert!`; the vector path
///   reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xyz12_to_rgba_u16_row<const BE: bool>(
    xyz: &[u16],
    rgba_out: &mut [u16],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel that is NOT covered by a full vector group.
    let simd_end = if BE == HOST_NATIVE_BE {
        width / LANES * LANES
    } else {
        0
    };
    // SAFETY: every vector group starts at `px <= width - LANES`, so the 12
    // input elements and 16 output elements per group lie inside the lengths
    // the caller guarantees.
    unsafe {
        let scale = vdupq_n_f32(65535.0);
        let opaque = vdup_n_u16(0xFFFF);
        for px in (0..simd_end).step_by(LANES) {
            let (r, g, b) = load_and_matmul::<BE>(xyz.as_ptr().add(px * 3), &matrix);
            let planes = uint16x4x4_t(
                clamp_scale_to_u16x4(oetf_srgb_scalar4(r), scale),
                clamp_scale_to_u16x4(oetf_srgb_scalar4(g), scale),
                clamp_scale_to_u16x4(oetf_srgb_scalar4(b), scale),
                opaque,
            );
            vst4_u16(rgba_out.as_mut_ptr().add(px * 4), planes);
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgba_u16_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgba_out[simd_end * 4..width * 4],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of XYZ12 samples to interleaved `f32` RGB.
///
/// Whole groups of `LANES` pixels take the NEON path when `BE` equals
/// `HOST_NATIVE_BE`; all remaining pixels (the entire row when the two
/// differ) are delegated to `scalar::xyz12::xyz12_to_rgb_f32_row`.
///
/// The vector path stores the output of `load_and_matmul` as-is: no OETF and
/// no clamp are applied to it.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xyz` and `rgb_out` must each hold at least `width * 3` elements. This
///   is only checked by `debug_assert!`; the vector path reads and writes
///   through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xyz12_to_rgb_f32_row<const BE: bool>(
    xyz: &[u16],
    rgb_out: &mut [f32],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel that is NOT covered by a full vector group.
    let simd_end = if BE == HOST_NATIVE_BE {
        width / LANES * LANES
    } else {
        0
    };
    // SAFETY: every vector group starts at `px <= width - LANES`, so the 12
    // elements read and written per group lie inside the first `width * 3`
    // elements the caller guarantees for both slices.
    unsafe {
        for px in (0..simd_end).step_by(LANES) {
            let offset = px * 3;
            let (r, g, b) = load_and_matmul::<BE>(xyz.as_ptr().add(offset), &matrix);
            vst3q_f32(rgb_out.as_mut_ptr().add(offset), float32x4x3_t(r, g, b));
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgb_f32_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgb_out[simd_end * 3..width * 3],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of XYZ12 samples to interleaved `f32` XYZ.
///
/// Whole groups of `LANES` pixels take the NEON path when `BE` equals
/// `HOST_NATIVE_BE`; all remaining pixels (the entire row when the two
/// differ) are delegated to `scalar::xyz12::xyz12_to_xyz_f32_row`.
///
/// The vector path stores the output of `load_xyz_linear` directly; no
/// matrix is applied.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xyz` and `xyz_out` must each hold at least `width * 3` elements. This
///   is only checked by `debug_assert!`; the vector path reads and writes
///   through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xyz12_to_xyz_f32_row<const BE: bool>(
    xyz: &[u16],
    xyz_out: &mut [f32],
    width: usize,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(xyz_out.len() >= width * 3, "xyz_out row too short");
    // First pixel that is NOT covered by a full vector group.
    let simd_end = if BE == HOST_NATIVE_BE {
        width / LANES * LANES
    } else {
        0
    };
    // SAFETY: every vector group starts at `px <= width - LANES`, so the 12
    // elements read and written per group lie inside the first `width * 3`
    // elements the caller guarantees for both slices.
    unsafe {
        for px in (0..simd_end).step_by(LANES) {
            let offset = px * 3;
            let (cx, cy, cz) = load_xyz_linear::<BE>(xyz.as_ptr().add(offset));
            vst3q_f32(xyz_out.as_mut_ptr().add(offset), float32x4x3_t(cx, cy, cz));
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_xyz_f32_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut xyz_out[simd_end * 3..width * 3],
                width - simd_end,
            );
        }
    }
}
/// Converts one row of XYZ12 samples to interleaved half-precision RGB.
///
/// Whole groups of `LANES` pixels take the NEON path when `BE` equals
/// `HOST_NATIVE_BE`; all remaining pixels (the entire row when the two
/// differ) are delegated to `scalar::xyz12::xyz12_to_rgb_f16_row`.
///
/// Vector path per group: `load_and_matmul`, `oetf_srgb_scalar4`, clamp to
/// `[0, 1]`, then a per-lane `half::f16::from_f32` conversion written with
/// bounds-checked indexing.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xyz` must hold at least `width * 3` elements; this is only checked by
///   `debug_assert!` and the vector path reads it through a raw pointer.
///
/// # Panics
///
/// Panics on an out-of-bounds index if `rgb_out` is shorter than
/// `width * 3` (in debug builds the `debug_assert!` fires first).
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xyz12_to_rgb_f16_row<const BE: bool>(
    xyz: &[u16],
    rgb_out: &mut [half::f16],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    // First pixel that is NOT covered by a full vector group.
    let simd_end = if BE == HOST_NATIVE_BE {
        width / LANES * LANES
    } else {
        0
    };
    // SAFETY: every vector group starts at `px <= width - LANES`, so the 12
    // elements read per group lie inside the first `width * 3` elements the
    // caller guarantees for `xyz`; the spill buffers are local 4-lane arrays.
    unsafe {
        let lo = vdupq_n_f32(0.0);
        let hi = vdupq_n_f32(1.0);
        for px in (0..simd_end).step_by(LANES) {
            let (r, g, b) = load_and_matmul::<BE>(xyz.as_ptr().add(px * 3), &matrix);
            let r = vminq_f32(vmaxq_f32(oetf_srgb_scalar4(r), lo), hi);
            let g = vminq_f32(vmaxq_f32(oetf_srgb_scalar4(g), lo), hi);
            let b = vminq_f32(vmaxq_f32(oetf_srgb_scalar4(b), lo), hi);
            // Spill to the stack: the f32 -> f16 conversion is done per lane.
            let mut r_lanes = [0.0_f32; LANES];
            let mut g_lanes = [0.0_f32; LANES];
            let mut b_lanes = [0.0_f32; LANES];
            vst1q_f32(r_lanes.as_mut_ptr(), r);
            vst1q_f32(g_lanes.as_mut_ptr(), g);
            vst1q_f32(b_lanes.as_mut_ptr(), b);
            let channels = r_lanes.iter().zip(&g_lanes).zip(&b_lanes);
            for (lane, ((&rv, &gv), &bv)) in channels.enumerate() {
                let out = (px + lane) * 3;
                rgb_out[out] = half::f16::from_f32(rv);
                rgb_out[out + 1] = half::f16::from_f32(gv);
                rgb_out[out + 2] = half::f16::from_f32(bv);
            }
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgb_f16_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgb_out[simd_end * 3..width * 3],
                width - simd_end,
                target_gamut,
            );
        }
    }
}
/// Converts one row of XYZ12 samples to interleaved half-precision RGBA with
/// alpha set to `half::f16::from_f32(1.0)`.
///
/// Whole groups of `LANES` pixels take the NEON path when `BE` equals
/// `HOST_NATIVE_BE`; all remaining pixels (the entire row when the two
/// differ) are delegated to `scalar::xyz12::xyz12_to_rgba_f16_row`.
///
/// Vector path per group: `load_and_matmul`, `oetf_srgb_scalar4`, clamp to
/// `[0, 1]`, then a per-lane `half::f16::from_f32` conversion written with
/// bounds-checked indexing.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xyz` must hold at least `width * 3` elements; this is only checked by
///   `debug_assert!` and the vector path reads it through a raw pointer.
///
/// # Panics
///
/// Panics on an out-of-bounds index if `rgba_out` is shorter than
/// `width * 4` (in debug builds the `debug_assert!` fires first).
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xyz12_to_rgba_f16_row<const BE: bool>(
    xyz: &[u16],
    rgba_out: &mut [half::f16],
    width: usize,
    target_gamut: DcpTargetGamut,
) {
    debug_assert!(xyz.len() >= width * 3, "xyz row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let matrix = xyz_to_rgb_matrix(target_gamut);
    let opaque = half::f16::from_f32(1.0);
    // First pixel that is NOT covered by a full vector group.
    let simd_end = if BE == HOST_NATIVE_BE {
        width / LANES * LANES
    } else {
        0
    };
    // SAFETY: every vector group starts at `px <= width - LANES`, so the 12
    // elements read per group lie inside the first `width * 3` elements the
    // caller guarantees for `xyz`; the spill buffers are local 4-lane arrays.
    unsafe {
        let lo = vdupq_n_f32(0.0);
        let hi = vdupq_n_f32(1.0);
        for px in (0..simd_end).step_by(LANES) {
            let (r, g, b) = load_and_matmul::<BE>(xyz.as_ptr().add(px * 3), &matrix);
            let r = vminq_f32(vmaxq_f32(oetf_srgb_scalar4(r), lo), hi);
            let g = vminq_f32(vmaxq_f32(oetf_srgb_scalar4(g), lo), hi);
            let b = vminq_f32(vmaxq_f32(oetf_srgb_scalar4(b), lo), hi);
            // Spill to the stack: the f32 -> f16 conversion is done per lane.
            let mut r_lanes = [0.0_f32; LANES];
            let mut g_lanes = [0.0_f32; LANES];
            let mut b_lanes = [0.0_f32; LANES];
            vst1q_f32(r_lanes.as_mut_ptr(), r);
            vst1q_f32(g_lanes.as_mut_ptr(), g);
            vst1q_f32(b_lanes.as_mut_ptr(), b);
            let channels = r_lanes.iter().zip(&g_lanes).zip(&b_lanes);
            for (lane, ((&rv, &gv), &bv)) in channels.enumerate() {
                let out = (px + lane) * 4;
                rgba_out[out] = half::f16::from_f32(rv);
                rgba_out[out + 1] = half::f16::from_f32(gv);
                rgba_out[out + 2] = half::f16::from_f32(bv);
                rgba_out[out + 3] = opaque;
            }
        }
        if simd_end < width {
            scalar::xyz12::xyz12_to_rgba_f16_row::<BE>(
                &xyz[simd_end * 3..width * 3],
                &mut rgba_out[simd_end * 4..width * 4],
                width - simd_end,
                target_gamut,
            );
        }
    }
}