use core::arch::aarch64::*;
use crate::row::scalar::pal8 as scalar_pal8;
/// Looks up 16 consecutive palette indices and packs the results into a
/// 64-byte staging buffer of interleaved `R, G, B, A` bytes.
///
/// Each palette entry is destructured as `[b, g, r, a]`, so the first and
/// third bytes of every entry are swapped on the way into the buffer.
///
/// # Safety
///
/// `indices` must be valid for reads of 16 consecutive bytes.
#[inline(always)]
unsafe fn gather_16_rgba(indices: *const u8, palette: &[[u8; 4]; 256]) -> [u8; 64] {
    let mut staged = [0u8; 64];
    for (lane, pixel) in staged.chunks_exact_mut(4).enumerate() {
        // SAFETY: `lane` is in 0..16 and the caller guarantees 16 readable
        // bytes starting at `indices`.
        let entry = usize::from(unsafe { indices.add(lane).read() });
        // A `u8` index can never exceed the 256-entry palette.
        let [b, g, r, a] = palette[entry];
        pixel.copy_from_slice(&[r, g, b, a]);
    }
    staged
}
/// Expands a row of 8-bit palette indices into packed 8-bit `R, G, B` bytes.
///
/// Full groups of 16 pixels go through NEON: the palette entries are staged
/// by `gather_16_rgba`, split into per-channel vectors with `vld4q_u8`, and
/// the first three channels are written interleaved with `vst3q_u8` (the
/// alpha plane is dropped). Any remaining pixels (fewer than 16) are handed
/// to `scalar_pal8::pal8_to_rgb_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgb_out` must hold at least `3 * indices.len()` bytes. This is only
///   checked by a `debug_assert!`; the vector stores go through a raw
///   pointer and are not bounds-checked.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn pal8_to_rgb_row(indices: &[u8], palette: &[[u8; 4]; 256], rgb_out: &mut [u8]) {
    let w = indices.len();
    debug_assert!(rgb_out.len() >= 3 * w, "rgb_out too short");
    let groups = indices.chunks_exact(16);
    let tail = groups.remainder();
    // First pixel that is not covered by a full 16-pixel group.
    let tail_start = w - tail.len();
    // SAFETY: every group is exactly 16 readable indices, and the caller
    // guarantees `rgb_out` has room for 3 bytes per pixel, so each 48-byte
    // store at `first_px * 3` stays inside the output.
    unsafe {
        let dst = rgb_out.as_mut_ptr();
        for (group_no, group) in groups.enumerate() {
            let first_px = group_no * 16;
            let staged = gather_16_rgba(group.as_ptr(), palette);
            let planes = vld4q_u8(staged.as_ptr());
            vst3q_u8(
                dst.add(first_px * 3),
                uint8x16x3_t(planes.0, planes.1, planes.2),
            );
        }
        if !tail.is_empty() {
            scalar_pal8::pal8_to_rgb_row(tail, palette, &mut rgb_out[tail_start * 3..w * 3]);
        }
    }
}
/// Expands a row of 8-bit palette indices into packed 8-bit `R, G, B, A`
/// bytes.
///
/// Full groups of 16 pixels are staged by `gather_16_rgba`, loaded as four
/// channel vectors with `vld4q_u8`, and written back interleaved with
/// `vst4q_u8`. Any remaining pixels (fewer than 16) are handed to
/// `scalar_pal8::pal8_to_rgba_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgba_out` must hold at least `4 * indices.len()` bytes. This is only
///   checked by a `debug_assert!`; the vector stores go through a raw
///   pointer and are not bounds-checked.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn pal8_to_rgba_row(
    indices: &[u8],
    palette: &[[u8; 4]; 256],
    rgba_out: &mut [u8],
) {
    let w = indices.len();
    debug_assert!(rgba_out.len() >= 4 * w, "rgba_out too short");
    let groups = indices.chunks_exact(16);
    let tail = groups.remainder();
    // First pixel that is not covered by a full 16-pixel group.
    let tail_start = w - tail.len();
    // SAFETY: every group is exactly 16 readable indices, and the caller
    // guarantees `rgba_out` has room for 4 bytes per pixel, so each 64-byte
    // store at `first_px * 4` stays inside the output.
    unsafe {
        let dst = rgba_out.as_mut_ptr();
        for (group_no, group) in groups.enumerate() {
            let first_px = group_no * 16;
            let staged = gather_16_rgba(group.as_ptr(), palette);
            let planes = vld4q_u8(staged.as_ptr());
            vst4q_u8(dst.add(first_px * 4), planes);
        }
        if !tail.is_empty() {
            scalar_pal8::pal8_to_rgba_row(tail, palette, &mut rgba_out[tail_start * 4..w * 4]);
        }
    }
}
/// Expands a row of 8-bit palette indices into packed 16-bit `R, G, B`
/// samples.
///
/// Every 8-bit channel value `v` is widened to `(v << 8) | v`, so `0x00`
/// maps to `0x0000` and `0xFF` maps to `0xFFFF`. Full groups of 16 pixels
/// are processed with NEON as two 8-pixel halves; any remaining pixels
/// (fewer than 16) are handed to `scalar_pal8::pal8_to_rgb_u16_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgb_u16_out` must hold at least `3 * indices.len()` elements. This is
///   only checked by a `debug_assert!`; the vector stores go through a raw
///   pointer and are not bounds-checked.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn pal8_to_rgb_u16_row(
    indices: &[u8],
    palette: &[[u8; 4]; 256],
    rgb_u16_out: &mut [u16],
) {
    let w = indices.len();
    debug_assert!(rgb_u16_out.len() >= 3 * w, "rgb_u16_out too short");
    let groups = indices.chunks_exact(16);
    let tail = groups.remainder();
    // First pixel that is not covered by a full 16-pixel group.
    let tail_start = w - tail.len();
    // SAFETY: every group is exactly 16 readable indices, and the caller
    // guarantees `rgb_u16_out` has room for 3 samples per pixel, so the two
    // 24-element stores at `first_px * 3` and `first_px * 3 + 24` stay
    // inside the output.
    unsafe {
        // Zero-extends each half of a 16-lane byte vector to u16 and copies
        // the byte into the upper half of the lane as well: v -> (v << 8) | v.
        let widen = |chan: uint8x16_t| -> (uint16x8_t, uint16x8_t) {
            let lo = vmovl_u8(vget_low_u8(chan));
            let hi = vmovl_u8(vget_high_u8(chan));
            (
                vorrq_u16(vshlq_n_u16::<8>(lo), lo),
                vorrq_u16(vshlq_n_u16::<8>(hi), hi),
            )
        };

        let dst = rgb_u16_out.as_mut_ptr();
        for (group_no, group) in groups.enumerate() {
            let first_px = group_no * 16;
            let staged = gather_16_rgba(group.as_ptr(), palette);
            let planes = vld4q_u8(staged.as_ptr());

            let (r_lo, r_hi) = widen(planes.0);
            let (g_lo, g_hi) = widen(planes.1);
            let (b_lo, b_hi) = widen(planes.2);

            // Pixels 0..8 of the group, then pixels 8..16 (8 px * 3 = 24).
            let base = dst.add(first_px * 3);
            vst3q_u16(base, uint16x8x3_t(r_lo, g_lo, b_lo));
            vst3q_u16(base.add(24), uint16x8x3_t(r_hi, g_hi, b_hi));
        }
        if !tail.is_empty() {
            scalar_pal8::pal8_to_rgb_u16_row(
                tail,
                palette,
                &mut rgb_u16_out[tail_start * 3..w * 3],
            );
        }
    }
}
/// Expands a row of 8-bit palette indices into packed 16-bit `R, G, B, A`
/// samples.
///
/// Every 8-bit channel value `v` (alpha included) is widened to
/// `(v << 8) | v`, so `0x00` maps to `0x0000` and `0xFF` maps to `0xFFFF`.
/// Full groups of 16 pixels are processed with NEON as two 8-pixel halves;
/// any remaining pixels (fewer than 16) are handed to
/// `scalar_pal8::pal8_to_rgba_u16_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgba_u16_out` must hold at least `4 * indices.len()` elements. This
///   is only checked by a `debug_assert!`; the vector stores go through a
///   raw pointer and are not bounds-checked.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn pal8_to_rgba_u16_row(
    indices: &[u8],
    palette: &[[u8; 4]; 256],
    rgba_u16_out: &mut [u16],
) {
    let w = indices.len();
    debug_assert!(rgba_u16_out.len() >= 4 * w, "rgba_u16_out too short");
    let groups = indices.chunks_exact(16);
    let tail = groups.remainder();
    // First pixel that is not covered by a full 16-pixel group.
    let tail_start = w - tail.len();
    // SAFETY: every group is exactly 16 readable indices, and the caller
    // guarantees `rgba_u16_out` has room for 4 samples per pixel, so the two
    // 32-element stores at `first_px * 4` and `first_px * 4 + 32` stay
    // inside the output.
    unsafe {
        // Zero-extends each half of a 16-lane byte vector to u16 and copies
        // the byte into the upper half of the lane as well: v -> (v << 8) | v.
        let widen = |chan: uint8x16_t| -> (uint16x8_t, uint16x8_t) {
            let lo = vmovl_u8(vget_low_u8(chan));
            let hi = vmovl_u8(vget_high_u8(chan));
            (
                vorrq_u16(vshlq_n_u16::<8>(lo), lo),
                vorrq_u16(vshlq_n_u16::<8>(hi), hi),
            )
        };

        let dst = rgba_u16_out.as_mut_ptr();
        for (group_no, group) in groups.enumerate() {
            let first_px = group_no * 16;
            let staged = gather_16_rgba(group.as_ptr(), palette);
            let planes = vld4q_u8(staged.as_ptr());

            let (r_lo, r_hi) = widen(planes.0);
            let (g_lo, g_hi) = widen(planes.1);
            let (b_lo, b_hi) = widen(planes.2);
            let (a_lo, a_hi) = widen(planes.3);

            // Pixels 0..8 of the group, then pixels 8..16 (8 px * 4 = 32).
            let base = dst.add(first_px * 4);
            vst4q_u16(base, uint16x8x4_t(r_lo, g_lo, b_lo, a_lo));
            vst4q_u16(base.add(32), uint16x8x4_t(r_hi, g_hi, b_hi, a_hi));
        }
        if !tail.is_empty() {
            scalar_pal8::pal8_to_rgba_u16_row(
                tail,
                palette,
                &mut rgba_u16_out[tail_start * 4..w * 4],
            );
        }
    }
}