#![allow(dead_code)]
use core::arch::aarch64::*;
use super::bswap_u16x8_if_be;
use crate::row::scalar;
/// Narrows a row of packed three-channel 16-bit pixels (`rgb48`) to packed
/// three-channel 8-bit pixels, keeping the channel order.
///
/// Eight pixels are processed per NEON iteration: the three channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and a narrowing right shift by 8 keeps the
/// upper byte of every sample. Pixels left after the last full group of eight
/// are delegated to `scalar::rgb48_to_rgb_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available and that `rgb48` and `rgb_out`
/// each hold at least `width * 3` elements; the lengths are only verified by
/// `debug_assert!`, while the vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_rgb48_to_rgb_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = rgb48.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x3_t(c0, c1, c2) = vld3q_u16(src.add(base * 3));
            let narrowed = uint8x8x3_t(
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c0)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c1)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c2)),
            );
            vst3_u8(dst.add(base * 3), narrowed);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::rgb48_to_rgb_row::<BE>(
                &rgb48[tail_start * 3..],
                &mut rgb_out[tail_start * 3..],
                width - tail_start,
            );
        }
    }
}
/// Narrows a row of packed three-channel 16-bit pixels (`rgb48`) to packed
/// four-channel 8-bit pixels, appending a constant `0xFF` fourth channel.
///
/// Eight pixels are processed per NEON iteration: the three channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and a narrowing right shift by 8 keeps the
/// upper byte of every sample. Pixels left after the last full group of eight
/// are delegated to `scalar::rgb48_to_rgba_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available, that `rgb48` holds at least
/// `width * 3` elements and that `rgba_out` holds at least `width * 4`
/// elements; the lengths are only verified by `debug_assert!`, while the
/// vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_rgb48_to_rgba_row<const BE: bool>(
    rgb48: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let opaque = vdup_n_u8(0xFF);
        let src = rgb48.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x3_t(c0, c1, c2) = vld3q_u16(src.add(base * 3));
            let narrowed = uint8x8x4_t(
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c0)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c1)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c2)),
                opaque,
            );
            vst4_u8(dst.add(base * 4), narrowed);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::rgb48_to_rgba_row::<BE>(
                &rgb48[tail_start * 3..],
                &mut rgba_out[tail_start * 4..],
                width - tail_start,
            );
        }
    }
}
/// Copies a row of packed three-channel 16-bit pixels (`rgb48`) to a packed
/// three-channel 16-bit output, keeping the channel order.
///
/// Eight pixels are processed per NEON iteration: the three channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and the planes are re-interleaved on store.
/// Pixels left after the last full group of eight are delegated to
/// `scalar::rgb48_to_rgb_u16_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available and that `rgb48` and `rgb_out`
/// each hold at least `width * 3` elements; the lengths are only verified by
/// `debug_assert!`, while the vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row<const BE: bool>(
    rgb48: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = rgb48.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x3_t(c0, c1, c2) = vld3q_u16(src.add(base * 3));
            let planes = uint16x8x3_t(
                bswap_u16x8_if_be::<BE>(c0),
                bswap_u16x8_if_be::<BE>(c1),
                bswap_u16x8_if_be::<BE>(c2),
            );
            vst3q_u16(dst.add(base * 3), planes);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::rgb48_to_rgb_u16_row::<BE>(
                &rgb48[tail_start * 3..],
                &mut rgb_out[tail_start * 3..],
                width - tail_start,
            );
        }
    }
}
/// Expands a row of packed three-channel 16-bit pixels (`rgb48`) to a packed
/// four-channel 16-bit output, appending a constant `0xFFFF` fourth channel.
///
/// Eight pixels are processed per NEON iteration: the three channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and the planes plus the constant fourth
/// plane are interleaved on store. Pixels left after the last full group of
/// eight are delegated to `scalar::rgb48_to_rgba_u16_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available, that `rgb48` holds at least
/// `width * 3` elements and that `rgba_out` holds at least `width * 4`
/// elements; the lengths are only verified by `debug_assert!`, while the
/// vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row<const BE: bool>(
    rgb48: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let opaque = vdupq_n_u16(0xFFFF);
        let src = rgb48.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x3_t(c0, c1, c2) = vld3q_u16(src.add(base * 3));
            let planes = uint16x8x4_t(
                bswap_u16x8_if_be::<BE>(c0),
                bswap_u16x8_if_be::<BE>(c1),
                bswap_u16x8_if_be::<BE>(c2),
                opaque,
            );
            vst4q_u16(dst.add(base * 4), planes);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::rgb48_to_rgba_u16_row::<BE>(
                &rgb48[tail_start * 3..],
                &mut rgba_out[tail_start * 4..],
                width - tail_start,
            );
        }
    }
}
/// Narrows a row of packed three-channel 16-bit pixels (`bgr48`) to packed
/// three-channel 8-bit pixels, swapping the first and third channels.
///
/// Eight pixels are processed per NEON iteration: the three channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and a narrowing right shift by 8 keeps the
/// upper byte of every sample. The planes are stored in reverse order.
/// Pixels left after the last full group of eight are delegated to
/// `scalar::bgr48_to_rgb_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available and that `bgr48` and `rgb_out`
/// each hold at least `width * 3` elements; the lengths are only verified by
/// `debug_assert!`, while the vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_bgr48_to_rgb_row<const BE: bool>(
    bgr48: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = bgr48.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x3_t(c0, c1, c2) = vld3q_u16(src.add(base * 3));
            // Output order is (third, second, first) source channel.
            let narrowed = uint8x8x3_t(
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c2)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c1)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c0)),
            );
            vst3_u8(dst.add(base * 3), narrowed);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::bgr48_to_rgb_row::<BE>(
                &bgr48[tail_start * 3..],
                &mut rgb_out[tail_start * 3..],
                width - tail_start,
            );
        }
    }
}
/// Narrows a row of packed three-channel 16-bit pixels (`bgr48`) to packed
/// four-channel 8-bit pixels, swapping the first and third channels and
/// appending a constant `0xFF` fourth channel.
///
/// Eight pixels are processed per NEON iteration: the three channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and a narrowing right shift by 8 keeps the
/// upper byte of every sample. Pixels left after the last full group of eight
/// are delegated to `scalar::bgr48_to_rgba_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available, that `bgr48` holds at least
/// `width * 3` elements and that `rgba_out` holds at least `width * 4`
/// elements; the lengths are only verified by `debug_assert!`, while the
/// vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_bgr48_to_rgba_row<const BE: bool>(
    bgr48: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let opaque = vdup_n_u8(0xFF);
        let src = bgr48.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x3_t(c0, c1, c2) = vld3q_u16(src.add(base * 3));
            // Output order is (third, second, first) source channel.
            let narrowed = uint8x8x4_t(
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c2)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c1)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c0)),
                opaque,
            );
            vst4_u8(dst.add(base * 4), narrowed);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::bgr48_to_rgba_row::<BE>(
                &bgr48[tail_start * 3..],
                &mut rgba_out[tail_start * 4..],
                width - tail_start,
            );
        }
    }
}
/// Copies a row of packed three-channel 16-bit pixels (`bgr48`) to a packed
/// three-channel 16-bit output, swapping the first and third channels.
///
/// Eight pixels are processed per NEON iteration: the three channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and the planes are re-interleaved in
/// reverse order on store. Pixels left after the last full group of eight
/// are delegated to `scalar::bgr48_to_rgb_u16_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available and that `bgr48` and `rgb_out`
/// each hold at least `width * 3` elements; the lengths are only verified by
/// `debug_assert!`, while the vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row<const BE: bool>(
    bgr48: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = bgr48.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x3_t(c0, c1, c2) = vld3q_u16(src.add(base * 3));
            // Output order is (third, second, first) source channel.
            let planes = uint16x8x3_t(
                bswap_u16x8_if_be::<BE>(c2),
                bswap_u16x8_if_be::<BE>(c1),
                bswap_u16x8_if_be::<BE>(c0),
            );
            vst3q_u16(dst.add(base * 3), planes);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::bgr48_to_rgb_u16_row::<BE>(
                &bgr48[tail_start * 3..],
                &mut rgb_out[tail_start * 3..],
                width - tail_start,
            );
        }
    }
}
/// Expands a row of packed three-channel 16-bit pixels (`bgr48`) to a packed
/// four-channel 16-bit output, swapping the first and third channels and
/// appending a constant `0xFFFF` fourth channel.
///
/// Eight pixels are processed per NEON iteration: the three channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and the reordered planes plus the constant
/// fourth plane are interleaved on store. Pixels left after the last full
/// group of eight are delegated to `scalar::bgr48_to_rgba_u16_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available, that `bgr48` holds at least
/// `width * 3` elements and that `rgba_out` holds at least `width * 4`
/// elements; the lengths are only verified by `debug_assert!`, while the
/// vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row<const BE: bool>(
    bgr48: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let opaque = vdupq_n_u16(0xFFFF);
        let src = bgr48.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x3_t(c0, c1, c2) = vld3q_u16(src.add(base * 3));
            // Output order is (third, second, first) source channel.
            let planes = uint16x8x4_t(
                bswap_u16x8_if_be::<BE>(c2),
                bswap_u16x8_if_be::<BE>(c1),
                bswap_u16x8_if_be::<BE>(c0),
                opaque,
            );
            vst4q_u16(dst.add(base * 4), planes);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::bgr48_to_rgba_u16_row::<BE>(
                &bgr48[tail_start * 3..],
                &mut rgba_out[tail_start * 4..],
                width - tail_start,
            );
        }
    }
}
/// Narrows a row of packed four-channel 16-bit pixels (`rgba64`) to packed
/// three-channel 8-bit pixels, dropping the fourth channel.
///
/// Eight pixels are processed per NEON iteration: the four channels are
/// de-interleaved on load, the first three planes go through
/// `bswap_u16x8_if_be::<BE>` (presumably a per-lane byte swap selected by
/// `BE` — confirm against the helper in the parent module), and a narrowing
/// right shift by 8 keeps the upper byte of every sample. Pixels left after
/// the last full group of eight are delegated to `scalar::rgba64_to_rgb_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available, that `rgba64` holds at least
/// `width * 4` elements and that `rgb_out` holds at least `width * 3`
/// elements; the lengths are only verified by `debug_assert!`, while the
/// vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_rgba64_to_rgb_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = rgba64.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            // The fourth plane is loaded but not used.
            let uint16x8x4_t(c0, c1, c2, _) = vld4q_u16(src.add(base * 4));
            let narrowed = uint8x8x3_t(
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c0)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c1)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c2)),
            );
            vst3_u8(dst.add(base * 3), narrowed);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::rgba64_to_rgb_row::<BE>(
                &rgba64[tail_start * 4..],
                &mut rgb_out[tail_start * 3..],
                width - tail_start,
            );
        }
    }
}
/// Narrows a row of packed four-channel 16-bit pixels (`rgba64`) to packed
/// four-channel 8-bit pixels, keeping the channel order.
///
/// Eight pixels are processed per NEON iteration: the four channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and a narrowing right shift by 8 keeps the
/// upper byte of every sample. Pixels left after the last full group of eight
/// are delegated to `scalar::rgba64_to_rgba_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available and that `rgba64` and `rgba_out`
/// each hold at least `width * 4` elements; the lengths are only verified by
/// `debug_assert!`, while the vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_rgba64_to_rgba_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = rgba64.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x4_t(c0, c1, c2, c3) = vld4q_u16(src.add(base * 4));
            let narrowed = uint8x8x4_t(
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c0)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c1)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c2)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c3)),
            );
            vst4_u8(dst.add(base * 4), narrowed);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::rgba64_to_rgba_row::<BE>(
                &rgba64[tail_start * 4..],
                &mut rgba_out[tail_start * 4..],
                width - tail_start,
            );
        }
    }
}
/// Copies a row of packed four-channel 16-bit pixels (`rgba64`) to a packed
/// three-channel 16-bit output, dropping the fourth channel.
///
/// Eight pixels are processed per NEON iteration: the four channels are
/// de-interleaved on load, the first three planes go through
/// `bswap_u16x8_if_be::<BE>` (presumably a per-lane byte swap selected by
/// `BE` — confirm against the helper in the parent module), and those three
/// planes are interleaved on store. Pixels left after the last full group of
/// eight are delegated to `scalar::rgba64_to_rgb_u16_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available, that `rgba64` holds at least
/// `width * 4` elements and that `rgb_out` holds at least `width * 3`
/// elements; the lengths are only verified by `debug_assert!`, while the
/// vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = rgba64.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            // The fourth plane is loaded but not used.
            let uint16x8x4_t(c0, c1, c2, _) = vld4q_u16(src.add(base * 4));
            let planes = uint16x8x3_t(
                bswap_u16x8_if_be::<BE>(c0),
                bswap_u16x8_if_be::<BE>(c1),
                bswap_u16x8_if_be::<BE>(c2),
            );
            vst3q_u16(dst.add(base * 3), planes);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::rgba64_to_rgb_u16_row::<BE>(
                &rgba64[tail_start * 4..],
                &mut rgb_out[tail_start * 3..],
                width - tail_start,
            );
        }
    }
}
/// Copies a row of packed four-channel 16-bit pixels (`rgba64`) to a packed
/// four-channel 16-bit output, keeping the channel order.
///
/// Eight pixels are processed per NEON iteration: the four channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and the planes are re-interleaved on store.
/// Pixels left after the last full group of eight are delegated to
/// `scalar::rgba64_to_rgba_u16_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available and that `rgba64` and `rgba_out`
/// each hold at least `width * 4` elements; the lengths are only verified by
/// `debug_assert!`, while the vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row<const BE: bool>(
    rgba64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = rgba64.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x4_t(c0, c1, c2, c3) = vld4q_u16(src.add(base * 4));
            let planes = uint16x8x4_t(
                bswap_u16x8_if_be::<BE>(c0),
                bswap_u16x8_if_be::<BE>(c1),
                bswap_u16x8_if_be::<BE>(c2),
                bswap_u16x8_if_be::<BE>(c3),
            );
            vst4q_u16(dst.add(base * 4), planes);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::rgba64_to_rgba_u16_row::<BE>(
                &rgba64[tail_start * 4..],
                &mut rgba_out[tail_start * 4..],
                width - tail_start,
            );
        }
    }
}
/// Narrows a row of packed four-channel 16-bit pixels (`bgra64`) to packed
/// three-channel 8-bit pixels, swapping the first and third channels and
/// dropping the fourth.
///
/// Eight pixels are processed per NEON iteration: the four channels are
/// de-interleaved on load, the first three planes go through
/// `bswap_u16x8_if_be::<BE>` (presumably a per-lane byte swap selected by
/// `BE` — confirm against the helper in the parent module), and a narrowing
/// right shift by 8 keeps the upper byte of every sample. Pixels left after
/// the last full group of eight are delegated to `scalar::bgra64_to_rgb_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available, that `bgra64` holds at least
/// `width * 4` elements and that `rgb_out` holds at least `width * 3`
/// elements; the lengths are only verified by `debug_assert!`, while the
/// vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_bgra64_to_rgb_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = bgra64.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            // The fourth plane is loaded but not used.
            let uint16x8x4_t(c0, c1, c2, _) = vld4q_u16(src.add(base * 4));
            // Output order is (third, second, first) source channel.
            let narrowed = uint8x8x3_t(
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c2)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c1)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c0)),
            );
            vst3_u8(dst.add(base * 3), narrowed);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::bgra64_to_rgb_row::<BE>(
                &bgra64[tail_start * 4..],
                &mut rgb_out[tail_start * 3..],
                width - tail_start,
            );
        }
    }
}
/// Narrows a row of packed four-channel 16-bit pixels (`bgra64`) to packed
/// four-channel 8-bit pixels, swapping the first and third channels and
/// keeping the fourth in place.
///
/// Eight pixels are processed per NEON iteration: the four channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and a narrowing right shift by 8 keeps the
/// upper byte of every sample. Pixels left after the last full group of eight
/// are delegated to `scalar::bgra64_to_rgba_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available and that `bgra64` and `rgba_out`
/// each hold at least `width * 4` elements; the lengths are only verified by
/// `debug_assert!`, while the vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_bgra64_to_rgba_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = bgra64.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x4_t(c0, c1, c2, c3) = vld4q_u16(src.add(base * 4));
            // Output order is (third, second, first, fourth) source channel.
            let narrowed = uint8x8x4_t(
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c2)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c1)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c0)),
                vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(c3)),
            );
            vst4_u8(dst.add(base * 4), narrowed);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::bgra64_to_rgba_row::<BE>(
                &bgra64[tail_start * 4..],
                &mut rgba_out[tail_start * 4..],
                width - tail_start,
            );
        }
    }
}
/// Copies a row of packed four-channel 16-bit pixels (`bgra64`) to a packed
/// three-channel 16-bit output, swapping the first and third channels and
/// dropping the fourth.
///
/// Eight pixels are processed per NEON iteration: the four channels are
/// de-interleaved on load, the first three planes go through
/// `bswap_u16x8_if_be::<BE>` (presumably a per-lane byte swap selected by
/// `BE` — confirm against the helper in the parent module), and those planes
/// are interleaved in reverse order on store. Pixels left after the last full
/// group of eight are delegated to `scalar::bgra64_to_rgb_u16_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available, that `bgra64` holds at least
/// `width * 4` elements and that `rgb_out` holds at least `width * 3`
/// elements; the lengths are only verified by `debug_assert!`, while the
/// vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgb_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = bgra64.as_ptr();
        let dst = rgb_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            // The fourth plane is loaded but not used.
            let uint16x8x4_t(c0, c1, c2, _) = vld4q_u16(src.add(base * 4));
            // Output order is (third, second, first) source channel.
            let planes = uint16x8x3_t(
                bswap_u16x8_if_be::<BE>(c2),
                bswap_u16x8_if_be::<BE>(c1),
                bswap_u16x8_if_be::<BE>(c0),
            );
            vst3q_u16(dst.add(base * 3), planes);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::bgra64_to_rgb_u16_row::<BE>(
                &bgra64[tail_start * 4..],
                &mut rgb_out[tail_start * 3..],
                width - tail_start,
            );
        }
    }
}
/// Copies a row of packed four-channel 16-bit pixels (`bgra64`) to a packed
/// four-channel 16-bit output, swapping the first and third channels and
/// keeping the fourth in place.
///
/// Eight pixels are processed per NEON iteration: the four channels are
/// de-interleaved on load, each plane goes through `bswap_u16x8_if_be::<BE>`
/// (presumably a per-lane byte swap selected by `BE` — confirm against the
/// helper in the parent module), and the reordered planes are interleaved on
/// store. Pixels left after the last full group of eight are delegated to
/// `scalar::bgra64_to_rgba_u16_row`.
///
/// # Safety
///
/// The caller must ensure NEON is available and that `bgra64` and `rgba_out`
/// each hold at least `width * 4` elements; the lengths are only verified by
/// `debug_assert!`, while the vector path uses raw pointer accesses.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row<const BE: bool>(
    bgra64: &[u16],
    rgba_out: &mut [u16],
    width: usize,
) {
    debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
    // SAFETY: every vector iteration covers pixels `base..base + 8` with
    // `base + 8 <= width`, so all raw accesses stay inside the lengths the
    // caller guarantees (checked above in debug builds).
    unsafe {
        let src = bgra64.as_ptr();
        let dst = rgba_out.as_mut_ptr();
        let groups = width / 8;
        for group in 0..groups {
            let base = group * 8;
            let uint16x8x4_t(c0, c1, c2, c3) = vld4q_u16(src.add(base * 4));
            // Output order is (third, second, first, fourth) source channel.
            let planes = uint16x8x4_t(
                bswap_u16x8_if_be::<BE>(c2),
                bswap_u16x8_if_be::<BE>(c1),
                bswap_u16x8_if_be::<BE>(c0),
                bswap_u16x8_if_be::<BE>(c3),
            );
            vst4q_u16(dst.add(base * 4), planes);
        }
        // Fewer than eight pixels remain: finish them with the scalar kernel.
        let tail_start = groups * 8;
        if tail_start != width {
            scalar::bgra64_to_rgba_u16_row::<BE>(
                &bgra64[tail_start * 4..],
                &mut rgba_out[tail_start * 4..],
                width - tail_start,
            );
        }
    }
}