use core::arch::aarch64::*;
use super::endian::load_endian_u32x4;
use crate::row::scalar;
/// Exchanges the first and third byte of every 3-byte pixel in a row
/// (BGR <-> RGB), leaving the middle byte where it is.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::bgr_rgb_swap_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `input` and `output` must each hold at least `width * 3` bytes. This is
///   only verified by `debug_assert!`, so release builds rely on the caller.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn bgr_rgb_swap_row(input: &[u8], output: &mut [u8], width: usize) {
    debug_assert!(input.len() >= width * 3, "input row too short");
    debug_assert!(output.len() >= width * 3, "output row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 48 .. i * 48 + 48`, which ends at
    // or before `width * 3`; the caller guarantees both slices are that long.
    unsafe {
        let src = input.as_ptr();
        let dst = output.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 48;
            let uint8x16x3_t(first, middle, last) = vld3q_u8(src.add(offset));
            vst3q_u8(dst.add(offset), uint8x16x3_t(last, middle, first));
        }
        if done < width {
            scalar::bgr_rgb_swap_row(
                &input[done * 3..width * 3],
                &mut output[done * 3..width * 3],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte RGBA pixels to 3-byte RGB by dropping the fourth
/// byte of every pixel.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::rgba_to_rgb_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgba` must hold at least `width * 4` bytes and `rgb_out` at least
///   `width * 3` bytes. This is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgba_to_rgb_row(rgba: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(rgba.len() >= width * 4, "rgba row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` reads 64 bytes at `i * 64` and writes 48 bytes at
    // `i * 48`; both ranges stay inside the lengths the caller guarantees.
    unsafe {
        let src = rgba.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for block in 0..blocks {
            let uint8x16x4_t(r, g, b, _) = vld4q_u8(src.add(block * 64));
            vst3q_u8(dst.add(block * 48), uint8x16x3_t(r, g, b));
        }
        if done < width {
            scalar::rgba_to_rgb_row(
                &rgba[done * 4..width * 4],
                &mut rgb_out[done * 3..width * 3],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte BGRA pixels to RGBA by exchanging the first and
/// third byte of every pixel; the second and fourth bytes are copied as-is.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::bgra_to_rgba_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `bgra` and `rgba_out` must each hold at least `width * 4` bytes. This is
///   only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn bgra_to_rgba_row(bgra: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(bgra.len() >= width * 4, "bgra row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let src = bgra.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            let uint8x16x4_t(b, g, r, a) = vld4q_u8(src.add(offset));
            vst4q_u8(dst.add(offset), uint8x16x4_t(r, g, b, a));
        }
        if done < width {
            scalar::bgra_to_rgba_row(
                &bgra[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte BGRA pixels to 3-byte RGB: the first three bytes
/// of every pixel are written in reverse order and the fourth is dropped.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::bgra_to_rgb_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `bgra` must hold at least `width * 4` bytes and `rgb_out` at least
///   `width * 3` bytes. This is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn bgra_to_rgb_row(bgra: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(bgra.len() >= width * 4, "bgra row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` reads 64 bytes at `i * 64` and writes 48 bytes at
    // `i * 48`; both ranges stay inside the lengths the caller guarantees.
    unsafe {
        let src = bgra.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for block in 0..blocks {
            let uint8x16x4_t(b, g, r, _) = vld4q_u8(src.add(block * 64));
            vst3q_u8(dst.add(block * 48), uint8x16x3_t(r, g, b));
        }
        if done < width {
            scalar::bgra_to_rgb_row(
                &bgra[done * 4..width * 4],
                &mut rgb_out[done * 3..width * 3],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte ARGB pixels to 3-byte RGB by dropping the first
/// byte of every pixel and keeping the remaining three in order.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::argb_to_rgb_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `argb` must hold at least `width * 4` bytes and `rgb_out` at least
///   `width * 3` bytes. This is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn argb_to_rgb_row(argb: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(argb.len() >= width * 4, "argb row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` reads 64 bytes at `i * 64` and writes 48 bytes at
    // `i * 48`; both ranges stay inside the lengths the caller guarantees.
    unsafe {
        let src = argb.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for block in 0..blocks {
            let uint8x16x4_t(_, r, g, b) = vld4q_u8(src.add(block * 64));
            vst3q_u8(dst.add(block * 48), uint8x16x3_t(r, g, b));
        }
        if done < width {
            scalar::argb_to_rgb_row(
                &argb[done * 4..width * 4],
                &mut rgb_out[done * 3..width * 3],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte ABGR pixels to 3-byte RGB: the first byte of
/// every pixel is dropped and the remaining three are written in reverse.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::abgr_to_rgb_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `abgr` must hold at least `width * 4` bytes and `rgb_out` at least
///   `width * 3` bytes. This is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn abgr_to_rgb_row(abgr: &[u8], rgb_out: &mut [u8], width: usize) {
    debug_assert!(abgr.len() >= width * 4, "abgr row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` reads 64 bytes at `i * 64` and writes 48 bytes at
    // `i * 48`; both ranges stay inside the lengths the caller guarantees.
    unsafe {
        let src = abgr.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for block in 0..blocks {
            let uint8x16x4_t(_, b, g, r) = vld4q_u8(src.add(block * 64));
            vst3q_u8(dst.add(block * 48), uint8x16x3_t(r, g, b));
        }
        if done < width {
            scalar::abgr_to_rgb_row(
                &abgr[done * 4..width * 4],
                &mut rgb_out[done * 3..width * 3],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte ARGB pixels to RGBA by rotating every pixel one
/// byte to the left, so the leading byte ends up last.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::argb_to_rgba_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `argb` and `rgba_out` must each hold at least `width * 4` bytes. This is
///   only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn argb_to_rgba_row(argb: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(argb.len() >= width * 4, "argb row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let src = argb.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            let uint8x16x4_t(a, r, g, b) = vld4q_u8(src.add(offset));
            vst4q_u8(dst.add(offset), uint8x16x4_t(r, g, b, a));
        }
        if done < width {
            scalar::argb_to_rgba_row(
                &argb[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte ABGR pixels to RGBA by reversing the byte order
/// inside every pixel.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::abgr_to_rgba_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `abgr` and `rgba_out` must each hold at least `width * 4` bytes. This is
///   only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn abgr_to_rgba_row(abgr: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(abgr.len() >= width * 4, "abgr row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let src = abgr.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            let uint8x16x4_t(a, b, g, r) = vld4q_u8(src.add(offset));
            vst4q_u8(dst.add(offset), uint8x16x4_t(r, g, b, a));
        }
        if done < width {
            scalar::abgr_to_rgba_row(
                &abgr[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte XRGB pixels to RGBA: the leading padding byte is
/// discarded, the other three bytes move to the front, and the fourth output
/// byte is set to `0xFF`.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::xrgb_to_rgba_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xrgb` and `rgba_out` must each hold at least `width * 4` bytes. This is
///   only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xrgb_to_rgba_row(xrgb: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(xrgb.len() >= width * 4, "xrgb row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let opaque = vdupq_n_u8(0xFF);
        let src = xrgb.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            let uint8x16x4_t(_, r, g, b) = vld4q_u8(src.add(offset));
            vst4q_u8(dst.add(offset), uint8x16x4_t(r, g, b, opaque));
        }
        if done < width {
            scalar::xrgb_to_rgba_row(
                &xrgb[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte RGBX pixels to RGBA: the first three bytes are
/// copied and the trailing padding byte is replaced with `0xFF`.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::rgbx_to_rgba_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `rgbx` and `rgba_out` must each hold at least `width * 4` bytes. This is
///   only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgbx_to_rgba_row(rgbx: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(rgbx.len() >= width * 4, "rgbx row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let opaque = vdupq_n_u8(0xFF);
        let src = rgbx.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            let uint8x16x4_t(r, g, b, _) = vld4q_u8(src.add(offset));
            vst4q_u8(dst.add(offset), uint8x16x4_t(r, g, b, opaque));
        }
        if done < width {
            scalar::rgbx_to_rgba_row(
                &rgbx[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte XBGR pixels to RGBA: the leading padding byte is
/// discarded, the other three bytes are written in reverse order, and the
/// fourth output byte is set to `0xFF`.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::xbgr_to_rgba_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `xbgr` and `rgba_out` must each hold at least `width * 4` bytes. This is
///   only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn xbgr_to_rgba_row(xbgr: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(xbgr.len() >= width * 4, "xbgr row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let opaque = vdupq_n_u8(0xFF);
        let src = xbgr.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            let uint8x16x4_t(_, b, g, r) = vld4q_u8(src.add(offset));
            vst4q_u8(dst.add(offset), uint8x16x4_t(r, g, b, opaque));
        }
        if done < width {
            scalar::xbgr_to_rgba_row(
                &xbgr[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Converts a row of 4-byte BGRX pixels to RGBA: the first three bytes are
/// written in reverse order and the trailing padding byte is replaced with
/// `0xFF`.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::bgrx_to_rgba_row`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `bgrx` and `rgba_out` must each hold at least `width * 4` bytes. This is
///   only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: usize) {
    debug_assert!(bgrx.len() >= width * 4, "bgrx row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let opaque = vdupq_n_u8(0xFF);
        let src = bgrx.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            let uint8x16x4_t(b, g, r, _) = vld4q_u8(src.add(offset));
            vst4q_u8(dst.add(offset), uint8x16x4_t(r, g, b, opaque));
        }
        if done < width {
            scalar::bgrx_to_rgba_row(
                &bgrx[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Pulls the 8 bits starting at bit `shift` out of every 32-bit lane of `pix`
/// and narrows the four results to 16-bit lanes.
///
/// `shift` must be 22, 12 or 2; any other value hits `unreachable!()`.
///
/// # Safety
///
/// Uses NEON intrinsics, so NEON must be available on the executing CPU.
#[inline(always)]
unsafe fn x2_extract_10bit_u8_lane(pix: uint32x4_t, shift: i32) -> uint16x4_t {
    // SAFETY: only register-to-register NEON intrinsics are used; the caller
    // guarantees NEON is available.
    unsafe {
        let low_byte = vdupq_n_u32(0xFF);
        // The shift intrinsic takes its amount as a constant, hence one arm
        // per supported value.
        let aligned = match shift {
            2 => vshrq_n_u32::<2>(pix),
            12 => vshrq_n_u32::<12>(pix),
            22 => vshrq_n_u32::<22>(pix),
            _ => unreachable!(),
        };
        vqmovn_u32(vandq_u32(aligned, low_byte))
    }
}
/// Pulls the 10 bits starting at bit `shift` out of every 32-bit lane of
/// `pix` and narrows the four results to 16-bit lanes.
///
/// `shift` must be 20, 10 or 0; any other value hits `unreachable!()`.
///
/// # Safety
///
/// Uses NEON intrinsics, so NEON must be available on the executing CPU.
#[inline(always)]
unsafe fn x2_extract_10bit_u16_lane(pix: uint32x4_t, shift: i32) -> uint16x4_t {
    // SAFETY: only register-to-register NEON intrinsics are used; the caller
    // guarantees NEON is available.
    unsafe {
        let ten_bits = vdupq_n_u32(0x3FF);
        // The shift intrinsic takes its amount as a constant, hence one arm
        // per supported value; a shift of zero needs no instruction at all.
        let aligned = match shift {
            0 => pix,
            10 => vshrq_n_u32::<10>(pix),
            20 => vshrq_n_u32::<20>(pix),
            _ => unreachable!(),
        };
        vqmovn_u32(vandq_u32(aligned, ten_bits))
    }
}
/// Converts a row of packed 32-bit X2RGB10 pixels to 3-byte RGB.
///
/// For every pixel word, red is taken from bits 22..=29, green from bits
/// 12..=19 and blue from bits 2..=9 (see `x2_extract_10bit_u8_lane`).
/// Words are read through `load_endian_u32x4::<BE>`; presumably `BE` selects
/// big-endian input words — confirm in `super::endian`.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::x2rgb10_to_rgb_row::<BE>`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `x2rgb10` must hold at least `width * 4` bytes and `rgb_out` at least
///   `width * 3` bytes. This is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn x2rgb10_to_rgb_row<const BE: bool>(
    x2rgb10: &[u8],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` reads 64 bytes at `i * 64` and writes 48 bytes at
    // `i * 48`; both ranges stay inside the lengths the caller guarantees.
    unsafe {
        let src = x2rgb10.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for block in 0..blocks {
            // Four vectors of four pixel words each.
            let words = src.add(block * 64);
            let w0 = load_endian_u32x4::<BE>(words);
            let w1 = load_endian_u32x4::<BE>(words.add(16));
            let w2 = load_endian_u32x4::<BE>(words.add(32));
            let w3 = load_endian_u32x4::<BE>(words.add(48));
            let red = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 22),
                    x2_extract_10bit_u8_lane(w1, 22),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 22),
                    x2_extract_10bit_u8_lane(w3, 22),
                )),
            );
            let green = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 12),
                    x2_extract_10bit_u8_lane(w1, 12),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 12),
                    x2_extract_10bit_u8_lane(w3, 12),
                )),
            );
            let blue = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 2),
                    x2_extract_10bit_u8_lane(w1, 2),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 2),
                    x2_extract_10bit_u8_lane(w3, 2),
                )),
            );
            vst3q_u8(dst.add(block * 48), uint8x16x3_t(red, green, blue));
        }
        if done < width {
            scalar::x2rgb10_to_rgb_row::<BE>(
                &x2rgb10[done * 4..width * 4],
                &mut rgb_out[done * 3..width * 3],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 32-bit X2RGB10 pixels to 4-byte RGBA with the
/// fourth byte fixed at `0xFF`.
///
/// For every pixel word, red is taken from bits 22..=29, green from bits
/// 12..=19 and blue from bits 2..=9 (see `x2_extract_10bit_u8_lane`).
/// Words are read through `load_endian_u32x4::<BE>`; presumably `BE` selects
/// big-endian input words — confirm in `super::endian`.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::x2rgb10_to_rgba_row::<BE>`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `x2rgb10` and `rgba_out` must each hold at least `width * 4` bytes. This
///   is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn x2rgb10_to_rgba_row<const BE: bool>(
    x2rgb10: &[u8],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let opaque = vdupq_n_u8(0xFF);
        let src = x2rgb10.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            // Four vectors of four pixel words each.
            let words = src.add(offset);
            let w0 = load_endian_u32x4::<BE>(words);
            let w1 = load_endian_u32x4::<BE>(words.add(16));
            let w2 = load_endian_u32x4::<BE>(words.add(32));
            let w3 = load_endian_u32x4::<BE>(words.add(48));
            let red = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 22),
                    x2_extract_10bit_u8_lane(w1, 22),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 22),
                    x2_extract_10bit_u8_lane(w3, 22),
                )),
            );
            let green = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 12),
                    x2_extract_10bit_u8_lane(w1, 12),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 12),
                    x2_extract_10bit_u8_lane(w3, 12),
                )),
            );
            let blue = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 2),
                    x2_extract_10bit_u8_lane(w1, 2),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 2),
                    x2_extract_10bit_u8_lane(w3, 2),
                )),
            );
            vst4q_u8(dst.add(offset), uint8x16x4_t(red, green, blue, opaque));
        }
        if done < width {
            scalar::x2rgb10_to_rgba_row::<BE>(
                &x2rgb10[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 32-bit X2RGB10 pixels to three `u16` samples per
/// pixel, keeping all 10 bits of every channel.
///
/// For every pixel word, red is taken from bits 20..=29, green from bits
/// 10..=19 and blue from bits 0..=9 (see `x2_extract_10bit_u16_lane`).
/// Words are read through `load_endian_u32x4::<BE>`; presumably `BE` selects
/// big-endian input words — confirm in `super::endian`.
///
/// Whole groups of 8 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::x2rgb10_to_rgb_u16_row::<BE>`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `x2rgb10` must hold at least `width * 4` bytes and `rgb_out` at least
///   `width * 3` elements. This is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn x2rgb10_to_rgb_u16_row<const BE: bool>(
    x2rgb10: &[u8],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let blocks = width / 8;
    let done = blocks * 8;
    // SAFETY: block `i` reads 32 bytes at `i * 32` and writes 24 `u16`s at
    // `i * 24`; both ranges stay inside the lengths the caller guarantees.
    unsafe {
        let src = x2rgb10.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for block in 0..blocks {
            // Two vectors of four pixel words each.
            let words = src.add(block * 32);
            let w0 = load_endian_u32x4::<BE>(words);
            let w1 = load_endian_u32x4::<BE>(words.add(16));
            let red = vcombine_u16(
                x2_extract_10bit_u16_lane(w0, 20),
                x2_extract_10bit_u16_lane(w1, 20),
            );
            let green = vcombine_u16(
                x2_extract_10bit_u16_lane(w0, 10),
                x2_extract_10bit_u16_lane(w1, 10),
            );
            let blue = vcombine_u16(
                x2_extract_10bit_u16_lane(w0, 0),
                x2_extract_10bit_u16_lane(w1, 0),
            );
            vst3q_u16(dst.add(block * 24), uint16x8x3_t(red, green, blue));
        }
        if done < width {
            scalar::x2rgb10_to_rgb_u16_row::<BE>(
                &x2rgb10[done * 4..width * 4],
                &mut rgb_out[done * 3..width * 3],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 32-bit X2BGR10 pixels to 3-byte RGB.
///
/// For every pixel word, red is taken from bits 2..=9, green from bits
/// 12..=19 and blue from bits 22..=29 (see `x2_extract_10bit_u8_lane`).
/// Words are read through `load_endian_u32x4::<BE>`; presumably `BE` selects
/// big-endian input words — confirm in `super::endian`.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::x2bgr10_to_rgb_row::<BE>`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `x2bgr10` must hold at least `width * 4` bytes and `rgb_out` at least
///   `width * 3` bytes. This is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn x2bgr10_to_rgb_row<const BE: bool>(
    x2bgr10: &[u8],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` reads 64 bytes at `i * 64` and writes 48 bytes at
    // `i * 48`; both ranges stay inside the lengths the caller guarantees.
    unsafe {
        let src = x2bgr10.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for block in 0..blocks {
            // Four vectors of four pixel words each.
            let words = src.add(block * 64);
            let w0 = load_endian_u32x4::<BE>(words);
            let w1 = load_endian_u32x4::<BE>(words.add(16));
            let w2 = load_endian_u32x4::<BE>(words.add(32));
            let w3 = load_endian_u32x4::<BE>(words.add(48));
            let red = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 2),
                    x2_extract_10bit_u8_lane(w1, 2),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 2),
                    x2_extract_10bit_u8_lane(w3, 2),
                )),
            );
            let green = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 12),
                    x2_extract_10bit_u8_lane(w1, 12),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 12),
                    x2_extract_10bit_u8_lane(w3, 12),
                )),
            );
            let blue = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 22),
                    x2_extract_10bit_u8_lane(w1, 22),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 22),
                    x2_extract_10bit_u8_lane(w3, 22),
                )),
            );
            vst3q_u8(dst.add(block * 48), uint8x16x3_t(red, green, blue));
        }
        if done < width {
            scalar::x2bgr10_to_rgb_row::<BE>(
                &x2bgr10[done * 4..width * 4],
                &mut rgb_out[done * 3..width * 3],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 32-bit X2BGR10 pixels to 4-byte RGBA with the
/// fourth byte fixed at `0xFF`.
///
/// For every pixel word, red is taken from bits 2..=9, green from bits
/// 12..=19 and blue from bits 22..=29 (see `x2_extract_10bit_u8_lane`).
/// Words are read through `load_endian_u32x4::<BE>`; presumably `BE` selects
/// big-endian input words — confirm in `super::endian`.
///
/// Whole groups of 16 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::x2bgr10_to_rgba_row::<BE>`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `x2bgr10` and `rgba_out` must each hold at least `width * 4` bytes. This
///   is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn x2bgr10_to_rgba_row<const BE: bool>(
    x2bgr10: &[u8],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    let blocks = width / 16;
    let done = blocks * 16;
    // SAFETY: block `i` touches bytes `i * 64 .. i * 64 + 64`, which ends at
    // or before `width * 4`; the caller guarantees both slices are that long.
    unsafe {
        let opaque = vdupq_n_u8(0xFF);
        let src = x2bgr10.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        for block in 0..blocks {
            let offset = block * 64;
            // Four vectors of four pixel words each.
            let words = src.add(offset);
            let w0 = load_endian_u32x4::<BE>(words);
            let w1 = load_endian_u32x4::<BE>(words.add(16));
            let w2 = load_endian_u32x4::<BE>(words.add(32));
            let w3 = load_endian_u32x4::<BE>(words.add(48));
            let red = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 2),
                    x2_extract_10bit_u8_lane(w1, 2),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 2),
                    x2_extract_10bit_u8_lane(w3, 2),
                )),
            );
            let green = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 12),
                    x2_extract_10bit_u8_lane(w1, 12),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 12),
                    x2_extract_10bit_u8_lane(w3, 12),
                )),
            );
            let blue = vcombine_u8(
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w0, 22),
                    x2_extract_10bit_u8_lane(w1, 22),
                )),
                vqmovn_u16(vcombine_u16(
                    x2_extract_10bit_u8_lane(w2, 22),
                    x2_extract_10bit_u8_lane(w3, 22),
                )),
            );
            vst4q_u8(dst.add(offset), uint8x16x4_t(red, green, blue, opaque));
        }
        if done < width {
            scalar::x2bgr10_to_rgba_row::<BE>(
                &x2bgr10[done * 4..width * 4],
                &mut rgba_out[done * 4..width * 4],
                width - done,
            );
        }
    }
}
/// Converts a row of packed 32-bit X2BGR10 pixels to three `u16` samples per
/// pixel, keeping all 10 bits of every channel.
///
/// For every pixel word, red is taken from bits 0..=9, green from bits
/// 10..=19 and blue from bits 20..=29 (see `x2_extract_10bit_u16_lane`).
/// Words are read through `load_endian_u32x4::<BE>`; presumably `BE` selects
/// big-endian input words — confirm in `super::endian`.
///
/// Whole groups of 8 pixels are processed with NEON; any remaining pixels
/// are forwarded to `scalar::x2bgr10_to_rgb_u16_row::<BE>`.
///
/// # Safety
///
/// * NEON must be available on the executing CPU.
/// * `x2bgr10` must hold at least `width * 4` bytes and `rgb_out` at least
///   `width * 3` elements. This is only verified by `debug_assert!`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn x2bgr10_to_rgb_u16_row<const BE: bool>(
    x2bgr10: &[u8],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    let blocks = width / 8;
    let done = blocks * 8;
    // SAFETY: block `i` reads 32 bytes at `i * 32` and writes 24 `u16`s at
    // `i * 24`; both ranges stay inside the lengths the caller guarantees.
    unsafe {
        let src = x2bgr10.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        for block in 0..blocks {
            // Two vectors of four pixel words each.
            let words = src.add(block * 32);
            let w0 = load_endian_u32x4::<BE>(words);
            let w1 = load_endian_u32x4::<BE>(words.add(16));
            let red = vcombine_u16(
                x2_extract_10bit_u16_lane(w0, 0),
                x2_extract_10bit_u16_lane(w1, 0),
            );
            let green = vcombine_u16(
                x2_extract_10bit_u16_lane(w0, 10),
                x2_extract_10bit_u16_lane(w1, 10),
            );
            let blue = vcombine_u16(
                x2_extract_10bit_u16_lane(w0, 20),
                x2_extract_10bit_u16_lane(w1, 20),
            );
            vst3q_u16(dst.add(block * 24), uint16x8x3_t(red, green, blue));
        }
        if done < width {
            scalar::x2bgr10_to_rgb_u16_row::<BE>(
                &x2bgr10[done * 4..width * 4],
                &mut rgb_out[done * 3..width * 3],
                width - done,
            );
        }
    }
}