use core::arch::aarch64::*;
use super::{endian::load_endian_u32x4, scalar};
/// Loads four consecutive `f32` lanes starting at `ptr` into a NEON register.
///
/// The bytes are fetched through `load_endian_u32x4::<BE>` and the resulting
/// `u32` lanes are reinterpreted bit-for-bit as `f32`; no numeric conversion
/// takes place here. `BE` is forwarded untouched — presumably `true` means the
/// source words are big-endian (confirm against `endian::load_endian_u32x4`).
///
/// # Safety
///
/// `ptr` must be valid for reading four `f32` values (16 bytes). Any further
/// requirements are those of `load_endian_u32x4`.
#[inline(always)]
unsafe fn load_f32x4<const BE: bool>(ptr: *const f32) -> float32x4_t {
    // SAFETY: the caller guarantees `ptr` addresses four readable `f32`s; the
    // reinterpret is a pure register-level bit cast.
    unsafe { vreinterpretq_f32_u32(load_endian_u32x4::<BE>(ptr.cast::<u8>())) }
}
/// Converts a row of packed RGB `f32` samples into packed 8-bit RGB.
///
/// Each sample is clamped to `[0.0, 1.0]`, multiplied by 255, converted to an
/// unsigned integer with round-to-nearest (`vcvtnq_u32_f32`) and narrowed with
/// saturation down to `u8`. Twelve samples (four pixels) are processed per
/// vector iteration; the remaining pixels are handed to
/// `scalar::rgbf32_to_rgb_row`.
///
/// `BE` is forwarded to `load_f32x4` and to the scalar fallback.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgb_in` and `rgb_out` must each hold at least `width * 3` elements; this
///   is only verified by `debug_assert!`, and the vector loop reads and writes
///   without bounds checks.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgbf32_to_rgb_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: per the caller contract both rows hold `width * 3` samples, and
    // every vector access below stays inside `[done, done + 12)` with
    // `done + 12 <= width * 3`.
    unsafe {
        let lo = vdupq_n_f32(0.0);
        let hi = vdupq_n_f32(1.0);
        let gain = vdupq_n_f32(255.0);
        let sample_count = width * 3;
        let src = rgb_in.as_ptr();
        let mut done = 0usize;
        while done + 12 <= sample_count {
            // Three independent 4-sample quads make up four RGB pixels.
            for quad in 0..3 {
                let at = done + quad * 4;
                let clamped = vminq_f32(vmaxq_f32(load_f32x4::<BE>(src.add(at)), lo), hi);
                let words = vcvtnq_u32_f32(vmulq_f32(clamped, gain));
                let halves = vqmovn_u32(words);
                // Duplicate the four u16 lanes so the 8-lane narrow can be used;
                // only the low four result bytes are meaningful.
                let bytes = vqmovn_u16(vcombine_u16(halves, halves));
                let mut spill = [0u8; 8];
                vst1_u8(spill.as_mut_ptr(), bytes);
                rgb_out
                    .get_unchecked_mut(at..at + 4)
                    .copy_from_slice(&spill[..4]);
            }
            done += 12;
        }
        // `done` is a multiple of 12, hence also a whole number of pixels.
        if done < sample_count {
            scalar::rgbf32_to_rgb_row::<BE>(
                &rgb_in[done..sample_count],
                &mut rgb_out[done..sample_count],
                width - done / 3,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples into packed 8-bit RGBA with an
/// alpha of `0xFF`.
///
/// Sixteen pixels are produced per vector iteration: four groups of four
/// pixels are de-interleaved into R/G/B lanes, clamped to `[0.0, 1.0]`, scaled
/// by 255, rounded to nearest (`vcvtnq_u32_f32`) and narrowed with saturation
/// to `u8`. The collected planes are then written interleaved with `vst4q_u8`.
/// Pixels left over after the last full group of sixteen go through
/// `scalar::rgbf32_to_rgba_row`.
///
/// When `BE` equals `HOST_NATIVE_BE` the de-interleave uses `vld3q_f32`
/// directly; otherwise the samples are loaded through `load_f32x4::<BE>` and
/// de-interleaved via a small stack buffer.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgb_in` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`; this is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgbf32_to_rgba_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: per the caller contract the rows are long enough; each vector
    // iteration touches pixels `[px, px + 16)` with `px + 16 <= width`.
    unsafe {
        let lo = vdupq_n_f32(0.0);
        let hi = vdupq_n_f32(1.0);
        let gain = vdupq_n_f32(255.0);
        let opaque = vdupq_n_u8(0xFF);
        let src = rgb_in.as_ptr();
        let mut px = 0usize;
        while px + 16 <= width {
            // planes[0] = R, planes[1] = G, planes[2] = B for 16 pixels.
            let mut planes = [[0u8; 16]; 3];
            for group in 0..4 {
                let first = (px + group * 4) * 3;
                let channels: [float32x4_t; 3] = if BE == HOST_NATIVE_BE {
                    let deint = vld3q_f32(src.add(first));
                    [deint.0, deint.1, deint.2]
                } else {
                    // Decode 12 samples into one flat buffer, then pick every
                    // third element per channel.
                    let mut flat = [0.0f32; 12];
                    for part in 0..3 {
                        vst1q_f32(
                            flat.as_mut_ptr().add(part * 4),
                            load_f32x4::<BE>(src.add(first + part * 4)),
                        );
                    }
                    let reds = [flat[0], flat[3], flat[6], flat[9]];
                    let greens = [flat[1], flat[4], flat[7], flat[10]];
                    let blues = [flat[2], flat[5], flat[8], flat[11]];
                    [
                        vld1q_f32(reds.as_ptr()),
                        vld1q_f32(greens.as_ptr()),
                        vld1q_f32(blues.as_ptr()),
                    ]
                };
                let slot = group * 4;
                for (plane, chan) in planes.iter_mut().zip(channels.iter().copied()) {
                    let scaled = vmulq_f32(vminq_f32(vmaxq_f32(chan, lo), hi), gain);
                    let halves = vqmovn_u32(vcvtnq_u32_f32(scaled));
                    // Only the low four bytes of the narrowed vector are used.
                    let bytes = vqmovn_u16(vcombine_u16(halves, halves));
                    let mut spill = [0u8; 8];
                    vst1_u8(spill.as_mut_ptr(), bytes);
                    plane[slot..slot + 4].copy_from_slice(&spill[..4]);
                }
            }
            let interleaved = uint8x16x4_t(
                vld1q_u8(planes[0].as_ptr()),
                vld1q_u8(planes[1].as_ptr()),
                vld1q_u8(planes[2].as_ptr()),
                opaque,
            );
            vst4q_u8(rgba_out.as_mut_ptr().add(px * 4), interleaved);
            px += 16;
        }
        if px < width {
            scalar::rgbf32_to_rgba_row::<BE>(
                &rgb_in[px * 3..width * 3],
                &mut rgba_out[px * 4..width * 4],
                width - px,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples into packed 16-bit RGB.
///
/// Each sample is clamped to `[0.0, 1.0]`, multiplied by 65535, rounded to
/// nearest (`vcvtnq_u32_f32`) and narrowed with saturation to `u16`. Twelve
/// samples (four pixels) are processed per vector iteration; the remaining
/// pixels are handed to `scalar::rgbf32_to_rgb_u16_row`.
///
/// `BE` is forwarded to `load_f32x4` and to the scalar fallback.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgb_in` and `rgb_out` must each hold at least `width * 3` elements; this
///   is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgbf32_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short");
    // SAFETY: per the caller contract both rows hold `width * 3` samples and
    // the vector loop never goes past `vector_end <= width * 3`.
    unsafe {
        let lo = vdupq_n_f32(0.0);
        let hi = vdupq_n_f32(1.0);
        let gain = vdupq_n_f32(65535.0);
        let sample_count = width * 3;
        // Largest multiple of 12 samples (4 pixels) that fits in the row.
        let vector_end = sample_count - sample_count % 12;
        let src = rgb_in.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for lane in (0..vector_end).step_by(12) {
            for quad in 0..3 {
                let at = lane + quad * 4;
                let clamped = vminq_f32(vmaxq_f32(load_f32x4::<BE>(src.add(at)), lo), hi);
                let words = vcvtnq_u32_f32(vmulq_f32(clamped, gain));
                vst1_u16(dst.add(at), vqmovn_u32(words));
            }
        }
        if vector_end < sample_count {
            scalar::rgbf32_to_rgb_u16_row::<BE>(
                &rgb_in[vector_end..sample_count],
                &mut rgb_out[vector_end..sample_count],
                width - vector_end / 3,
            );
        }
    }
}
/// Converts a row of packed RGB `f32` samples into packed 16-bit RGBA with an
/// alpha of `0xFFFF`.
///
/// Eight pixels are produced per vector iteration: two groups of four pixels
/// are de-interleaved into R/G/B lanes, clamped to `[0.0, 1.0]`, scaled by
/// 65535, rounded to nearest (`vcvtnq_u32_f32`) and narrowed with saturation
/// to `u16`. The planes are written interleaved with `vst4q_u16`. Pixels left
/// over after the last full group of eight go through
/// `scalar::rgbf32_to_rgba_u16_row`.
///
/// When `BE` equals `HOST_NATIVE_BE` the de-interleave uses `vld3q_f32`
/// directly; otherwise the samples are loaded through `load_f32x4::<BE>` and
/// de-interleaved via a small stack buffer.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgb_in` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`; this is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgbf32_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[f32],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short");
    // SAFETY: per the caller contract the rows are long enough; each vector
    // iteration touches pixels `[px, px + 8)` with `px + 8 <= width`.
    unsafe {
        let lo = vdupq_n_f32(0.0);
        let hi = vdupq_n_f32(1.0);
        let gain = vdupq_n_f32(65535.0);
        let opaque = vdupq_n_u16(0xFFFF);
        let src = rgb_in.as_ptr();
        let mut px = 0usize;
        while px + 8 <= width {
            // planes[0] = R, planes[1] = G, planes[2] = B for 8 pixels.
            let mut planes = [[0u16; 8]; 3];
            for group in 0..2 {
                let first = (px + group * 4) * 3;
                let channels: [float32x4_t; 3] = if BE == HOST_NATIVE_BE {
                    let deint = vld3q_f32(src.add(first));
                    [deint.0, deint.1, deint.2]
                } else {
                    // Decode 12 samples into one flat buffer, then pick every
                    // third element per channel.
                    let mut flat = [0.0f32; 12];
                    for part in 0..3 {
                        vst1q_f32(
                            flat.as_mut_ptr().add(part * 4),
                            load_f32x4::<BE>(src.add(first + part * 4)),
                        );
                    }
                    let reds = [flat[0], flat[3], flat[6], flat[9]];
                    let greens = [flat[1], flat[4], flat[7], flat[10]];
                    let blues = [flat[2], flat[5], flat[8], flat[11]];
                    [
                        vld1q_f32(reds.as_ptr()),
                        vld1q_f32(greens.as_ptr()),
                        vld1q_f32(blues.as_ptr()),
                    ]
                };
                let slot = group * 4;
                for (plane, chan) in planes.iter_mut().zip(channels.iter().copied()) {
                    let scaled = vmulq_f32(vminq_f32(vmaxq_f32(chan, lo), hi), gain);
                    let narrowed = vqmovn_u32(vcvtnq_u32_f32(scaled));
                    vst1_u16(plane.as_mut_ptr().add(slot), narrowed);
                }
            }
            let interleaved = uint16x8x4_t(
                vld1q_u16(planes[0].as_ptr()),
                vld1q_u16(planes[1].as_ptr()),
                vld1q_u16(planes[2].as_ptr()),
                opaque,
            );
            vst4q_u16(rgba_out.as_mut_ptr().add(px * 4), interleaved);
            px += 8;
        }
        if px < width {
            scalar::rgbf32_to_rgba_u16_row::<BE>(
                &rgb_in[px * 3..width * 3],
                &mut rgba_out[px * 4..width * 4],
                width - px,
            );
        }
    }
}
/// Copies a row of packed RGB `f32` samples into `rgb_out`, converting the
/// element byte order to the host's when necessary.
///
/// * If `BE` equals `HOST_NATIVE_BE`, the first `width * 3` samples are copied
///   unchanged (four at a time, then one at a time).
/// * Otherwise full groups of four are loaded through `load_f32x4::<BE>` and
///   the remaining samples have their bit patterns converted with
///   `u32::from_be` (`BE == true`) or `u32::from_le` (`BE == false`).
///
/// No clamping or scaling is applied.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgb_in` and `rgb_out` must each hold at least `width * 3` elements; this
///   is only verified by `debug_assert!`, and all accesses are unchecked.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgbf32_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[f32],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf32 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
    // SAFETY: per the caller contract both rows hold `width * 3` samples and
    // every index used below is `< width * 3`.
    unsafe {
        let count = width * 3;
        // Largest multiple of four samples that fits in the row.
        let vector_end = count - count % 4;
        let src = rgb_in.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let mut at = 0usize;
        if BE == HOST_NATIVE_BE {
            while at < vector_end {
                vst1q_f32(dst.add(at), vld1q_f32(src.add(at)));
                at += 4;
            }
            for rest in vector_end..count {
                *dst.add(rest) = *src.add(rest);
            }
        } else {
            while at < vector_end {
                vst1q_f32(dst.add(at), load_f32x4::<BE>(src.add(at)));
                at += 4;
            }
            for rest in vector_end..count {
                let stored = (*src.add(rest)).to_bits();
                let native = if BE {
                    u32::from_be(stored)
                } else {
                    u32::from_le(stored)
                };
                *dst.add(rest) = f32::from_bits(native);
            }
        }
    }
}
use super::endian::load_endian_u16x4;
/// `true` when the compilation target is big-endian.
///
/// Kernels in this file compare their `BE` const parameter against this value
/// to decide whether source data is already in host byte order, and pass it as
/// `BE` when handing host-order `f32` buffers to the f32 kernels.
const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
/// Widens four consecutive `f16` values at `ptr` to `f32` and stores them at
/// `out`.
///
/// The 16-bit words are fetched through `load_endian_u16x4::<BE>`,
/// reinterpreted as half-precision lanes, converted with `vcvt_f32_f16` and
/// written with a single four-lane store.
///
/// # Safety
///
/// * `ptr` must be valid for reading four `f16` values (8 bytes); any further
///   requirements are those of `load_endian_u16x4`.
/// * `out` must be valid for writing four `f32` values.
/// * The callers in this file enable `neon,fp16`; presumably the conversion
///   intrinsic relies on that — confirm before calling from elsewhere.
#[inline(always)]
unsafe fn widen_f16x4<const BE: bool>(ptr: *const half::f16, out: *mut f32) {
    // SAFETY: the caller guarantees four readable `f16`s at `ptr` and four
    // writable `f32`s at `out`.
    unsafe {
        let halves = vreinterpret_f16_u16(load_endian_u16x4::<BE>(ptr.cast::<u8>()));
        vst1q_f32(out, vcvt_f32_f16(halves));
    }
}
/// Scalar companion of `widen_f16x4`: widens the first `n` elements of `src`
/// to `f32` in `dst`.
///
/// Each element's stored bits are converted to host order with `u16::from_be`
/// (`BE == true`) or `u16::from_le` (`BE == false`) before being interpreted
/// as an `f16` and converted to `f32`.
///
/// # Safety
///
/// `src` and `dst` must each contain at least `n` elements; indexing is
/// unchecked.
#[inline(always)]
unsafe fn widen_f16_tail<const BE: bool>(src: &[half::f16], dst: &mut [f32], n: usize) {
    for idx in 0..n {
        // SAFETY: the caller guarantees `idx < n <= src.len()`.
        let stored = unsafe { src.get_unchecked(idx) }.to_bits();
        let native = match BE {
            true => u16::from_be(stored),
            false => u16::from_le(stored),
        };
        let widened = half::f16::from_bits(native).to_f32();
        // SAFETY: the caller guarantees `idx < n <= dst.len()`.
        unsafe {
            *dst.get_unchecked_mut(idx) = widened;
        }
    }
}
/// Converts a row of packed RGB `f16` samples into packed 8-bit RGB.
///
/// Four pixels (twelve samples) at a time are widened to host-order `f32` in a
/// stack buffer via `widen_f16x4::<BE>` and then quantized by
/// `rgbf32_to_rgb_row::<HOST_NATIVE_BE>`. Pixels left over after the last full
/// group of four go through `scalar::rgbf16_to_rgb_row`.
///
/// # Safety
///
/// * NEON and FP16 conversion support must be available on the executing CPU.
/// * `rgb_in` and `rgb_out` must each hold at least `width * 3` elements; this
///   is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon,fp16")]
pub(crate) unsafe fn rgbf16_to_rgb_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let sample_count = width * 3;
    let mut done = 0usize;
    while done + 12 <= sample_count {
        let mut widened = [0.0f32; 12];
        // SAFETY: `done + 12 <= width * 3`, which the caller guarantees is
        // within both rows; `widened` has room for all twelve lanes.
        unsafe {
            for quad in 0..3 {
                widen_f16x4::<BE>(
                    rgb_in.as_ptr().add(done + quad * 4),
                    widened.as_mut_ptr().add(quad * 4),
                );
            }
            // The buffer is already in host order, hence HOST_NATIVE_BE.
            rgbf32_to_rgb_row::<HOST_NATIVE_BE>(
                &widened,
                rgb_out.get_unchecked_mut(done..done + 12),
                4,
            );
        }
        done += 12;
    }
    // `done` is a multiple of 12, hence also a whole number of pixels.
    if done < sample_count {
        scalar::rgbf16_to_rgb_row::<BE>(
            &rgb_in[done..sample_count],
            &mut rgb_out[done..sample_count],
            width - done / 3,
        );
    }
}
/// Converts a row of packed RGB `f16` samples into packed 8-bit RGBA.
///
/// Four pixels (twelve samples) at a time are widened to host-order `f32` in a
/// stack buffer via `widen_f16x4::<BE>` and then passed to
/// `rgbf32_to_rgba_row::<HOST_NATIVE_BE>`, which also supplies the alpha
/// channel. Pixels left over after the last full group of four go through
/// `scalar::rgbf16_to_rgba_row`.
///
/// NOTE(review): the f32 RGBA kernel in this file only enters its vector loop
/// for spans of at least 16 pixels, so with the 4-pixel spans passed here the
/// quantization ends up in that kernel's scalar tail — confirm this is
/// intended.
///
/// # Safety
///
/// * NEON and FP16 conversion support must be available on the executing CPU.
/// * `rgb_in` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`; this is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon,fp16")]
pub(crate) unsafe fn rgbf16_to_rgba_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let mut px = 0usize;
    while px + 4 <= width {
        let first = px * 3;
        let mut widened = [0.0f32; 12];
        // SAFETY: `px + 4 <= width`, so samples `[first, first + 12)` and
        // output bytes `[px * 4, px * 4 + 16)` are inside the rows the caller
        // guarantees; `widened` has room for all twelve lanes.
        unsafe {
            for quad in 0..3 {
                widen_f16x4::<BE>(
                    rgb_in.as_ptr().add(first + quad * 4),
                    widened.as_mut_ptr().add(quad * 4),
                );
            }
            // The buffer is already in host order, hence HOST_NATIVE_BE.
            rgbf32_to_rgba_row::<HOST_NATIVE_BE>(
                &widened,
                rgba_out.get_unchecked_mut(px * 4..px * 4 + 16),
                4,
            );
        }
        px += 4;
    }
    if px < width {
        scalar::rgbf16_to_rgba_row::<BE>(
            &rgb_in[px * 3..width * 3],
            &mut rgba_out[px * 4..width * 4],
            width - px,
        );
    }
}
/// Converts a row of packed RGB `f16` samples into packed 16-bit RGB.
///
/// Four pixels (twelve samples) at a time are widened to host-order `f32` in a
/// stack buffer via `widen_f16x4::<BE>` and then quantized by
/// `rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>`. Pixels left over after the last
/// full group of four go through `scalar::rgbf16_to_rgb_u16_row`.
///
/// # Safety
///
/// * NEON and FP16 conversion support must be available on the executing CPU.
/// * `rgb_in` and `rgb_out` must each hold at least `width * 3` elements; this
///   is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon,fp16")]
pub(crate) unsafe fn rgbf16_to_rgb_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_u16_out row too short");
    let sample_count = width * 3;
    let mut done = 0usize;
    while done + 12 <= sample_count {
        let mut widened = [0.0f32; 12];
        // SAFETY: `done + 12 <= width * 3`, which the caller guarantees is
        // within both rows; `widened` has room for all twelve lanes.
        unsafe {
            for quad in 0..3 {
                widen_f16x4::<BE>(
                    rgb_in.as_ptr().add(done + quad * 4),
                    widened.as_mut_ptr().add(quad * 4),
                );
            }
            // The buffer is already in host order, hence HOST_NATIVE_BE.
            rgbf32_to_rgb_u16_row::<HOST_NATIVE_BE>(
                &widened,
                rgb_out.get_unchecked_mut(done..done + 12),
                4,
            );
        }
        done += 12;
    }
    // `done` is a multiple of 12, hence also a whole number of pixels.
    if done < sample_count {
        scalar::rgbf16_to_rgb_u16_row::<BE>(
            &rgb_in[done..sample_count],
            &mut rgb_out[done..sample_count],
            width - done / 3,
        );
    }
}
/// Converts a row of packed RGB `f16` samples into packed 16-bit RGBA.
///
/// Four pixels (twelve samples) at a time are widened to host-order `f32` in a
/// stack buffer via `widen_f16x4::<BE>` and then passed to
/// `rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>`, which also supplies the alpha
/// channel. Pixels left over after the last full group of four go through
/// `scalar::rgbf16_to_rgba_u16_row`.
///
/// NOTE(review): the f32 RGBA-u16 kernel in this file only enters its vector
/// loop for spans of at least 8 pixels, so with the 4-pixel spans passed here
/// the quantization ends up in that kernel's scalar tail — confirm this is
/// intended.
///
/// # Safety
///
/// * NEON and FP16 conversion support must be available on the executing CPU.
/// * `rgb_in` must hold at least `width * 3` elements and `rgba_out` at least
///   `width * 4`; this is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon,fp16")]
pub(crate) unsafe fn rgbf16_to_rgba_u16_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_u16_out row too short");
    let mut px = 0usize;
    while px + 4 <= width {
        let first = px * 3;
        let mut widened = [0.0f32; 12];
        // SAFETY: `px + 4 <= width`, so samples `[first, first + 12)` and
        // output elements `[px * 4, px * 4 + 16)` are inside the rows the
        // caller guarantees; `widened` has room for all twelve lanes.
        unsafe {
            for quad in 0..3 {
                widen_f16x4::<BE>(
                    rgb_in.as_ptr().add(first + quad * 4),
                    widened.as_mut_ptr().add(quad * 4),
                );
            }
            // The buffer is already in host order, hence HOST_NATIVE_BE.
            rgbf32_to_rgba_u16_row::<HOST_NATIVE_BE>(
                &widened,
                rgba_out.get_unchecked_mut(px * 4..px * 4 + 16),
                4,
            );
        }
        px += 4;
    }
    if px < width {
        scalar::rgbf16_to_rgba_u16_row::<BE>(
            &rgb_in[px * 3..width * 3],
            &mut rgba_out[px * 4..width * 4],
            width - px,
        );
    }
}
/// Widens a row of packed RGB `f16` samples to packed RGB `f32`.
///
/// Full groups of four samples are converted with `widen_f16x4::<BE>` straight
/// into `rgb_out`; the remaining samples are converted one by one with
/// `widen_f16_tail::<BE>`. No clamping or scaling is applied.
///
/// # Safety
///
/// * NEON and FP16 conversion support must be available on the executing CPU.
/// * `rgb_in` and `rgb_out` must each hold at least `width * 3` elements; this
///   is only verified by `debug_assert!`, and all accesses are unchecked.
#[inline]
#[target_feature(enable = "neon,fp16")]
pub(crate) unsafe fn rgbf16_to_rgb_f32_row<const BE: bool>(
    rgb_in: &[half::f16],
    rgb_out: &mut [f32],
    width: usize,
) {
    debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_f32_out row too short");
    let sample_count = width * 3;
    // Largest multiple of four samples that fits in the row.
    let vector_end = sample_count - sample_count % 4;
    for at in (0..vector_end).step_by(4) {
        // SAFETY: `at + 4 <= vector_end <= width * 3`, which the caller
        // guarantees is within both rows.
        unsafe {
            widen_f16x4::<BE>(rgb_in.as_ptr().add(at), rgb_out.as_mut_ptr().add(at));
        }
    }
    let leftover = sample_count - vector_end;
    // SAFETY: `vector_end <= width * 3` is within both rows, and the
    // sub-slices contain at least `leftover` elements each.
    unsafe {
        widen_f16_tail::<BE>(
            rgb_in.get_unchecked(vector_end..),
            rgb_out.get_unchecked_mut(vector_end..),
            leftover,
        );
    }
}
/// Produces a row of packed RGB `f16` samples from a row of packed RGB `f16`
/// samples by delegating to `scalar::rgbf16_to_rgb_f16_row::<BE>`.
///
/// This entry point contains no vector code of its own; it exists alongside
/// the other NEON kernels and forwards all three arguments unchanged.
///
/// # Safety
///
/// The function is compiled with the `neon` target feature, so NEON must be
/// available on the executing CPU. `rgb_in` and `rgb_out` are expected to hold
/// at least `width * 3` elements each (checked only by `debug_assert!` here;
/// what the scalar routine does with shorter rows is not visible from this
/// file).
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgbf16_to_rgb_f16_row<const BE: bool>(
rgb_in: &[half::f16],
rgb_out: &mut [half::f16],
width: usize,
) {
debug_assert!(rgb_in.len() >= width * 3, "rgbf16 row too short");
debug_assert!(rgb_out.len() >= width * 3, "rgb_f16_out row too short");
// All work, including any byte-order handling selected by `BE`, happens in
// the scalar implementation.
scalar::rgbf16_to_rgb_f16_row::<BE>(rgb_in, rgb_out, width);
}