use core::arch::aarch64::*;
use crate::row::scalar;
use super::endian::load_endian_u16x8;
/// Converts planar G/B/R rows of `BITS`-bit samples (held in `u16`) into packed
/// 8-bit RGB.
///
/// Eight pixels are processed per NEON iteration: every plane is loaded via
/// `load_endian_u16x8::<BE>`, masked down to its low `BITS` bits, shifted right
/// by `BITS - 8`, narrowed to bytes and stored interleaved in R, G, B order.
/// Pixels remaining after the last full group of eight are handed to
/// `scalar::gbr_to_rgb_high_bit_row`.
///
/// # Safety
///
/// - NEON must be available on the executing CPU.
/// - `g`, `b` and `r` must contain at least `width` elements and `rgb_out` at
///   least `width * 3` bytes. This is only checked by `debug_assert!`; the
///   vector loop accesses memory through raw pointers.
/// - NOTE(review): `BITS` is presumably within `8..=16` (`BITS - 8` is computed
///   as `u32` and the mask is truncated to `u16`) — confirm against callers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gbr_to_rgb_high_bit_row<const BITS: u32, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
    rgb_out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");

    // Whole 8-pixel groups go through NEON; `simd_len` is where the tail begins.
    let full_groups = width / 8;
    let simd_len = full_groups * 8;

    unsafe {
        // A negative per-lane count turns `vshlq_u16` into a right shift.
        let down_shift = vdupq_n_s16(-((BITS - 8) as i16));
        let keep_bits = vdupq_n_u16(((1u32 << BITS) - 1) as u16);

        for group in 0..full_groups {
            let px = group * 8;

            let green = vandq_u16(
                load_endian_u16x8::<BE>(g.as_ptr().add(px).cast()),
                keep_bits,
            );
            let blue = vandq_u16(
                load_endian_u16x8::<BE>(b.as_ptr().add(px).cast()),
                keep_bits,
            );
            let red = vandq_u16(
                load_endian_u16x8::<BE>(r.as_ptr().add(px).cast()),
                keep_bits,
            );

            // Scale each channel to 8 bits and interleave as R, G, B.
            let packed = uint8x8x3_t(
                vqmovn_u16(vshlq_u16(red, down_shift)),
                vqmovn_u16(vshlq_u16(green, down_shift)),
                vqmovn_u16(vshlq_u16(blue, down_shift)),
            );
            vst3_u8(rgb_out.as_mut_ptr().add(px * 3), packed);
        }

        // Fewer than eight pixels remain: let the scalar path finish the row.
        let tail = width - simd_len;
        if tail > 0 {
            scalar::gbr_to_rgb_high_bit_row::<BITS, BE>(
                &g[simd_len..width],
                &b[simd_len..width],
                &r[simd_len..width],
                &mut rgb_out[simd_len * 3..width * 3],
                tail,
            );
        }
    }
}
/// Converts planar G/B/R rows of `BITS`-bit samples (held in `u16`) into packed
/// 8-bit RGBA with a constant alpha of `0xFF`.
///
/// Eight pixels are processed per NEON iteration: every plane is loaded via
/// `load_endian_u16x8::<BE>`, masked down to its low `BITS` bits, shifted right
/// by `BITS - 8`, narrowed to bytes and stored interleaved in R, G, B, A order.
/// Pixels remaining after the last full group of eight are handed to
/// `scalar::gbr_to_rgba_opaque_high_bit_row`.
///
/// # Safety
///
/// - NEON must be available on the executing CPU.
/// - `g`, `b` and `r` must contain at least `width` elements and `rgba_out` at
///   least `width * 4` bytes. This is only checked by `debug_assert!`; the
///   vector loop accesses memory through raw pointers.
/// - NOTE(review): `BITS` is presumably within `8..=16` (`BITS - 8` is computed
///   as `u32` and the mask is truncated to `u16`) — confirm against callers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gbr_to_rgba_opaque_high_bit_row<const BITS: u32, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");

    // Whole 8-pixel groups go through NEON; `simd_len` is where the tail begins.
    let full_groups = width / 8;
    let simd_len = full_groups * 8;

    unsafe {
        // A negative per-lane count turns `vshlq_u16` into a right shift.
        let down_shift = vdupq_n_s16(-((BITS - 8) as i16));
        let keep_bits = vdupq_n_u16(((1u32 << BITS) - 1) as u16);
        // Fully opaque alpha, identical for every pixel.
        let alpha = vdup_n_u8(0xFF);

        for group in 0..full_groups {
            let px = group * 8;

            let green = vandq_u16(
                load_endian_u16x8::<BE>(g.as_ptr().add(px).cast()),
                keep_bits,
            );
            let blue = vandq_u16(
                load_endian_u16x8::<BE>(b.as_ptr().add(px).cast()),
                keep_bits,
            );
            let red = vandq_u16(
                load_endian_u16x8::<BE>(r.as_ptr().add(px).cast()),
                keep_bits,
            );

            // Scale each channel to 8 bits and interleave as R, G, B, A.
            let packed = uint8x8x4_t(
                vqmovn_u16(vshlq_u16(red, down_shift)),
                vqmovn_u16(vshlq_u16(green, down_shift)),
                vqmovn_u16(vshlq_u16(blue, down_shift)),
                alpha,
            );
            vst4_u8(rgba_out.as_mut_ptr().add(px * 4), packed);
        }

        // Fewer than eight pixels remain: let the scalar path finish the row.
        let tail = width - simd_len;
        if tail > 0 {
            scalar::gbr_to_rgba_opaque_high_bit_row::<BITS, BE>(
                &g[simd_len..width],
                &b[simd_len..width],
                &r[simd_len..width],
                &mut rgba_out[simd_len * 4..width * 4],
                tail,
            );
        }
    }
}
/// Converts planar G/B/R/A rows of `BITS`-bit samples (held in `u16`) into
/// packed 8-bit RGBA.
///
/// Eight pixels are processed per NEON iteration: every plane, alpha included,
/// is loaded via `load_endian_u16x8::<BE>`, masked down to its low `BITS` bits,
/// shifted right by `BITS - 8`, narrowed to bytes and stored interleaved in
/// R, G, B, A order. Pixels remaining after the last full group of eight are
/// handed to `scalar::gbra_to_rgba_high_bit_row`.
///
/// # Safety
///
/// - NEON must be available on the executing CPU.
/// - `g`, `b`, `r` and `a` must contain at least `width` elements and
///   `rgba_out` at least `width * 4` bytes. This is only checked by
///   `debug_assert!`; the vector loop accesses memory through raw pointers.
/// - NOTE(review): `BITS` is presumably within `8..=16` (`BITS - 8` is computed
///   as `u32` and the mask is truncated to `u16`) — confirm against callers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gbra_to_rgba_high_bit_row<const BITS: u32, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
    a: &[u16],
    rgba_out: &mut [u8],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");

    // Whole 8-pixel groups go through NEON; `simd_len` is where the tail begins.
    let full_groups = width / 8;
    let simd_len = full_groups * 8;

    unsafe {
        // A negative per-lane count turns `vshlq_u16` into a right shift.
        let down_shift = vdupq_n_s16(-((BITS - 8) as i16));
        let keep_bits = vdupq_n_u16(((1u32 << BITS) - 1) as u16);

        for group in 0..full_groups {
            let px = group * 8;

            let green = vandq_u16(
                load_endian_u16x8::<BE>(g.as_ptr().add(px).cast()),
                keep_bits,
            );
            let blue = vandq_u16(
                load_endian_u16x8::<BE>(b.as_ptr().add(px).cast()),
                keep_bits,
            );
            let red = vandq_u16(
                load_endian_u16x8::<BE>(r.as_ptr().add(px).cast()),
                keep_bits,
            );
            let alpha = vandq_u16(
                load_endian_u16x8::<BE>(a.as_ptr().add(px).cast()),
                keep_bits,
            );

            // Scale all four channels to 8 bits and interleave as R, G, B, A.
            let packed = uint8x8x4_t(
                vqmovn_u16(vshlq_u16(red, down_shift)),
                vqmovn_u16(vshlq_u16(green, down_shift)),
                vqmovn_u16(vshlq_u16(blue, down_shift)),
                vqmovn_u16(vshlq_u16(alpha, down_shift)),
            );
            vst4_u8(rgba_out.as_mut_ptr().add(px * 4), packed);
        }

        // Fewer than eight pixels remain: let the scalar path finish the row.
        let tail = width - simd_len;
        if tail > 0 {
            scalar::gbra_to_rgba_high_bit_row::<BITS, BE>(
                &g[simd_len..width],
                &b[simd_len..width],
                &r[simd_len..width],
                &a[simd_len..width],
                &mut rgba_out[simd_len * 4..width * 4],
                tail,
            );
        }
    }
}
/// Interleaves planar G/B/R rows of `BITS`-bit samples into packed `u16` RGB,
/// keeping the samples at their original bit depth.
///
/// Eight pixels are processed per NEON iteration: every plane is loaded via
/// `load_endian_u16x8::<BE>`, masked down to its low `BITS` bits and stored
/// interleaved in R, G, B order without any shifting. Pixels remaining after
/// the last full group of eight are handed to
/// `scalar::gbr_to_rgb_u16_high_bit_row`.
///
/// # Safety
///
/// - NEON must be available on the executing CPU.
/// - `g`, `b` and `r` must contain at least `width` elements and `rgb_u16_out`
///   at least `width * 3` elements. This is only checked by `debug_assert!`;
///   the vector loop accesses memory through raw pointers.
/// - NOTE(review): `BITS` is presumably at most 16, since the mask is truncated
///   to `u16` — confirm against callers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gbr_to_rgb_u16_high_bit_row<const BITS: u32, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
    rgb_u16_out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short");

    // Whole 8-pixel groups go through NEON; `simd_len` is where the tail begins.
    let full_groups = width / 8;
    let simd_len = full_groups * 8;

    unsafe {
        let keep_bits = vdupq_n_u16(((1u32 << BITS) - 1) as u16);

        for group in 0..full_groups {
            let px = group * 8;

            let red = load_endian_u16x8::<BE>(r.as_ptr().add(px).cast());
            let green = load_endian_u16x8::<BE>(g.as_ptr().add(px).cast());
            let blue = load_endian_u16x8::<BE>(b.as_ptr().add(px).cast());

            // Clear everything above the low `BITS` bits, then interleave as R, G, B.
            let packed = uint16x8x3_t(
                vandq_u16(red, keep_bits),
                vandq_u16(green, keep_bits),
                vandq_u16(blue, keep_bits),
            );
            vst3q_u16(rgb_u16_out.as_mut_ptr().add(px * 3), packed);
        }

        // Fewer than eight pixels remain: let the scalar path finish the row.
        let tail = width - simd_len;
        if tail > 0 {
            scalar::gbr_to_rgb_u16_high_bit_row::<BITS, BE>(
                &g[simd_len..width],
                &b[simd_len..width],
                &r[simd_len..width],
                &mut rgb_u16_out[simd_len * 3..width * 3],
                tail,
            );
        }
    }
}
/// Interleaves planar G/B/R rows of `BITS`-bit samples into packed `u16` RGBA,
/// keeping the samples at their original bit depth and filling alpha with the
/// largest `BITS`-bit value.
///
/// Eight pixels are processed per NEON iteration: every plane is loaded via
/// `load_endian_u16x8::<BE>`, masked down to its low `BITS` bits and stored
/// interleaved in R, G, B, A order, where A is `(1 << BITS) - 1`. Pixels
/// remaining after the last full group of eight are handed to
/// `scalar::gbr_to_rgba_opaque_u16_high_bit_row`.
///
/// # Safety
///
/// - NEON must be available on the executing CPU.
/// - `g`, `b` and `r` must contain at least `width` elements and
///   `rgba_u16_out` at least `width * 4` elements. This is only checked by
///   `debug_assert!`; the vector loop accesses memory through raw pointers.
/// - NOTE(review): `BITS` is presumably at most 16, since the mask is truncated
///   to `u16` — confirm against callers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gbr_to_rgba_opaque_u16_high_bit_row<const BITS: u32, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
    rgba_u16_out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(
        rgba_u16_out.len() >= width * 4,
        "rgba_u16_out row too short"
    );

    // Whole 8-pixel groups go through NEON; `simd_len` is where the tail begins.
    let full_groups = width / 8;
    let simd_len = full_groups * 8;

    unsafe {
        let keep_bits = vdupq_n_u16(((1u32 << BITS) - 1) as u16);
        // The sample mask is also the maximum sample value, i.e. opaque alpha.
        let alpha = keep_bits;

        for group in 0..full_groups {
            let px = group * 8;

            let red = load_endian_u16x8::<BE>(r.as_ptr().add(px).cast());
            let green = load_endian_u16x8::<BE>(g.as_ptr().add(px).cast());
            let blue = load_endian_u16x8::<BE>(b.as_ptr().add(px).cast());

            // Clear everything above the low `BITS` bits, then interleave as R, G, B, A.
            let packed = uint16x8x4_t(
                vandq_u16(red, keep_bits),
                vandq_u16(green, keep_bits),
                vandq_u16(blue, keep_bits),
                alpha,
            );
            vst4q_u16(rgba_u16_out.as_mut_ptr().add(px * 4), packed);
        }

        // Fewer than eight pixels remain: let the scalar path finish the row.
        let tail = width - simd_len;
        if tail > 0 {
            scalar::gbr_to_rgba_opaque_u16_high_bit_row::<BITS, BE>(
                &g[simd_len..width],
                &b[simd_len..width],
                &r[simd_len..width],
                &mut rgba_u16_out[simd_len * 4..width * 4],
                tail,
            );
        }
    }
}
/// Interleaves planar G/B/R/A rows of `BITS`-bit samples into packed `u16`
/// RGBA, keeping the samples at their original bit depth.
///
/// Eight pixels are processed per NEON iteration: every plane, alpha included,
/// is loaded via `load_endian_u16x8::<BE>`, masked down to its low `BITS` bits
/// and stored interleaved in R, G, B, A order without any shifting. Pixels
/// remaining after the last full group of eight are handed to
/// `scalar::gbra_to_rgba_u16_high_bit_row`.
///
/// # Safety
///
/// - NEON must be available on the executing CPU.
/// - `g`, `b`, `r` and `a` must contain at least `width` elements and
///   `rgba_u16_out` at least `width * 4` elements. This is only checked by
///   `debug_assert!`; the vector loop accesses memory through raw pointers.
/// - NOTE(review): `BITS` is presumably at most 16, since the mask is truncated
///   to `u16` — confirm against callers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gbra_to_rgba_u16_high_bit_row<const BITS: u32, const BE: bool>(
    g: &[u16],
    b: &[u16],
    r: &[u16],
    a: &[u16],
    rgba_u16_out: &mut [u16],
    width: usize,
) {
    debug_assert!(g.len() >= width, "g row too short");
    debug_assert!(b.len() >= width, "b row too short");
    debug_assert!(r.len() >= width, "r row too short");
    debug_assert!(a.len() >= width, "a row too short");
    debug_assert!(
        rgba_u16_out.len() >= width * 4,
        "rgba_u16_out row too short"
    );

    // Whole 8-pixel groups go through NEON; `simd_len` is where the tail begins.
    let full_groups = width / 8;
    let simd_len = full_groups * 8;

    unsafe {
        let keep_bits = vdupq_n_u16(((1u32 << BITS) - 1) as u16);

        for group in 0..full_groups {
            let px = group * 8;

            let red = load_endian_u16x8::<BE>(r.as_ptr().add(px).cast());
            let green = load_endian_u16x8::<BE>(g.as_ptr().add(px).cast());
            let blue = load_endian_u16x8::<BE>(b.as_ptr().add(px).cast());
            let alpha = load_endian_u16x8::<BE>(a.as_ptr().add(px).cast());

            // Clear everything above the low `BITS` bits, then interleave as R, G, B, A.
            let packed = uint16x8x4_t(
                vandq_u16(red, keep_bits),
                vandq_u16(green, keep_bits),
                vandq_u16(blue, keep_bits),
                vandq_u16(alpha, keep_bits),
            );
            vst4q_u16(rgba_u16_out.as_mut_ptr().add(px * 4), packed);
        }

        // Fewer than eight pixels remain: let the scalar path finish the row.
        let tail = width - simd_len;
        if tail > 0 {
            scalar::gbra_to_rgba_u16_high_bit_row::<BITS, BE>(
                &g[simd_len..width],
                &b[simd_len..width],
                &r[simd_len..width],
                &a[simd_len..width],
                &mut rgba_u16_out[simd_len * 4..width * 4],
                tail,
            );
        }
    }
}