#![cfg_attr(not(feature = "std"), allow(dead_code))]
use core::arch::aarch64::*;
use crate::row::{
arch::neon::endian::{load_endian_u16x8, load_endian_u32x4},
scalar::{bits_mask, gray as scalar},
};
/// Writes 16 gray pixels as packed RGB, starting at pixel index `x` of `out`.
///
/// The interleaving `vst3q_u8` store replicates every lane of `v` into three consecutive
/// bytes, writing 48 bytes beginning at byte offset `x * 3`.
///
/// # Safety
///
/// NEON must be available and `out` must hold at least `(x + 16) * 3` bytes; the store goes
/// through a raw pointer and is not bounds-checked.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn store_rgb_16x(v: uint8x16_t, out: &mut [u8], x: usize) {
    // SAFETY: the caller guarantees that 48 bytes starting at `x * 3` lie inside `out`.
    unsafe {
        let dst = out.as_mut_ptr().add(x * 3);
        vst3q_u8(dst, uint8x16x3_t(v, v, v));
    }
}
/// Writes 16 gray pixels as packed RGBA, starting at pixel index `x` of `out`.
///
/// Every lane of `v` is replicated into R, G and B and the alpha byte is set to `0xFF`; the
/// interleaving `vst4q_u8` store writes 64 bytes beginning at byte offset `x * 4`.
///
/// # Safety
///
/// NEON must be available and `out` must hold at least `(x + 16) * 4` bytes; the store goes
/// through a raw pointer and is not bounds-checked.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn store_rgba_16x(v: uint8x16_t, out: &mut [u8], x: usize) {
    // SAFETY: the caller guarantees that 64 bytes starting at `x * 4` lie inside `out`.
    unsafe {
        let opaque = vdupq_n_u8(0xFF);
        let dst = out.as_mut_ptr().add(x * 4);
        vst4q_u8(dst, uint8x16x4_t(v, v, v, opaque));
    }
}
/// Expands a row of 8-bit gray samples to packed RGB by replicating each sample into all
/// three channels.
///
/// When `full_range` is `false` the whole row is handed to the scalar implementation.
/// Otherwise 16 pixels are handled per NEON iteration and the remaining pixels go through
/// the scalar routine with `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 3` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray8_to_rgb_row(
    y_plane: &[u8],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    if !full_range {
        scalar::gray8_to_rgb_row(y_plane, out, width, full_range);
        return;
    }
    // Largest multiple of 16 pixels that fits in the row.
    let simd_end = width - width % 16;
    for px in (0..simd_end).step_by(16) {
        // SAFETY: `px + 16 <= width`, so the 16-byte load and the 48-byte store stay in
        // bounds under the documented length requirements.
        unsafe {
            let luma = vld1q_u8(y_plane.as_ptr().add(px));
            store_rgb_16x(luma, out, px);
        }
    }
    if simd_end < width {
        scalar::gray8_to_rgb_row(
            &y_plane[simd_end..width],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
            true,
        );
    }
}
/// Expands a row of 8-bit gray samples to packed RGBA; each sample is replicated into R, G
/// and B, and the NEON path writes `0xFF` as alpha (see `store_rgba_16x`).
///
/// When `full_range` is `false` the whole row is handed to the scalar implementation.
/// Otherwise 16 pixels are handled per NEON iteration and the remaining pixels go through
/// the scalar routine with `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 4` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray8_to_rgba_row(
    y_plane: &[u8],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    if !full_range {
        scalar::gray8_to_rgba_row(y_plane, out, width, full_range);
        return;
    }
    // Largest multiple of 16 pixels that fits in the row.
    let simd_end = width - width % 16;
    for px in (0..simd_end).step_by(16) {
        // SAFETY: `px + 16 <= width`, so the 16-byte load and the 64-byte store stay in
        // bounds under the documented length requirements.
        unsafe {
            let luma = vld1q_u8(y_plane.as_ptr().add(px));
            store_rgba_16x(luma, out, px);
        }
    }
    if simd_end < width {
        scalar::gray8_to_rgba_row(
            &y_plane[simd_end..width],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
            true,
        );
    }
}
/// Converts a row of 8-bit gray samples to planar HSV: hue and saturation are written as
/// zero and the value plane receives the gray sample unchanged.
///
/// When `full_range` is `false` the whole row is handed to the scalar implementation.
/// Otherwise 16 pixels are handled per NEON iteration and the remaining pixels go through
/// the scalar routine with `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane`, `h_out`, `s_out` and `v_out` must each hold at least
/// `width` elements; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray8_to_hsv_row(
    y_plane: &[u8],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        scalar::gray8_to_hsv_row(y_plane, h_out, s_out, v_out, width, full_range);
        return;
    }
    // Largest multiple of 16 pixels that fits in the row.
    let simd_end = width - width % 16;
    // SAFETY: every iteration has `px + 16 <= width`, so all 16-byte loads and stores stay
    // in bounds under the documented length requirements.
    unsafe {
        let zeros = vdupq_n_u8(0);
        let src = y_plane.as_ptr();
        let hue = h_out.as_mut_ptr();
        let sat = s_out.as_mut_ptr();
        let val = v_out.as_mut_ptr();
        for px in (0..simd_end).step_by(16) {
            let luma = vld1q_u8(src.add(px));
            vst1q_u8(hue.add(px), zeros);
            vst1q_u8(sat.add(px), zeros);
            vst1q_u8(val.add(px), luma);
        }
    }
    if simd_end < width {
        scalar::gray8_to_hsv_row(
            &y_plane[simd_end..width],
            &mut h_out[simd_end..width],
            &mut s_out[simd_end..width],
            &mut v_out[simd_end..width],
            width - simd_end,
            true,
        );
    }
}
/// Converts a row of `BITS`-deep gray samples stored in `u16` to packed 8-bit RGB.
///
/// Each sample is loaded through `load_endian_u16x8::<BE>`, masked with
/// `bits_mask::<BITS>()`, shifted right by `BITS - 8`, narrowed to a byte and replicated
/// into all three channels. When `full_range` is `false` the whole row is handed to the
/// scalar implementation; otherwise 8 pixels are handled per NEON iteration and the
/// remainder goes through the scalar routine with `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 3` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray_n_to_rgb_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    if !full_range {
        scalar::gray_n_to_rgb_row::<BITS, BE>(y_plane, out, width, full_range);
        return;
    }
    let shift = (BITS - 8) as i16;
    let mask = bits_mask::<BITS>();
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 24-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let depth_mask = vdupq_n_u16(mask);
        // A negative per-lane shift count makes `vshlq_u16` shift right.
        let downshift = vdupq_n_s16(-shift);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let valid = vandq_u16(samples, depth_mask);
            let gray = vmovn_u16(vshlq_u16(valid, downshift));
            vst3_u8(dst.add(px * 3), uint8x8x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::gray_n_to_rgb_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
            true,
        );
    }
}
/// Converts a row of `BITS`-deep gray samples stored in `u16` to packed 8-bit RGBA.
///
/// Each sample is loaded through `load_endian_u16x8::<BE>`, masked with
/// `bits_mask::<BITS>()`, shifted right by `BITS - 8`, narrowed to a byte and replicated
/// into R, G and B; the NEON path writes `0xFF` as alpha. When `full_range` is `false` the
/// whole row is handed to the scalar implementation; otherwise 8 pixels are handled per
/// NEON iteration and the remainder goes through the scalar routine with
/// `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 4` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray_n_to_rgba_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    if !full_range {
        scalar::gray_n_to_rgba_row::<BITS, BE>(y_plane, out, width, full_range);
        return;
    }
    let shift = (BITS - 8) as i16;
    let mask = bits_mask::<BITS>();
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 32-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let depth_mask = vdupq_n_u16(mask);
        let opaque = vdup_n_u8(0xFF);
        // A negative per-lane shift count makes `vshlq_u16` shift right.
        let downshift = vdupq_n_s16(-shift);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let valid = vandq_u16(samples, depth_mask);
            let gray = vmovn_u16(vshlq_u16(valid, downshift));
            vst4_u8(dst.add(px * 4), uint8x8x4_t(gray, gray, gray, opaque));
        }
    }
    if simd_end < width {
        scalar::gray_n_to_rgba_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
            true,
        );
    }
}
/// Converts a row of `BITS`-deep gray samples stored in `u16` to packed 16-bit RGB without
/// changing the bit depth.
///
/// Each sample is loaded through `load_endian_u16x8::<BE>`, masked with
/// `bits_mask::<BITS>()` and replicated into all three channels. When `full_range` is
/// `false` the whole row is handed to the scalar implementation; otherwise 8 pixels are
/// handled per NEON iteration and the remainder goes through the scalar routine with
/// `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 3` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray_n_to_rgb_u16_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    if !full_range {
        scalar::gray_n_to_rgb_u16_row::<BITS, BE>(y_plane, out, width, full_range);
        return;
    }
    let mask = bits_mask::<BITS>();
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 8-sample load and the
    // 24-element store stay in bounds under the documented length requirements.
    unsafe {
        let depth_mask = vdupq_n_u16(mask);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let gray = vandq_u16(samples, depth_mask);
            vst3q_u16(dst.add(px * 3), uint16x8x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::gray_n_to_rgb_u16_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
            true,
        );
    }
}
/// Converts a row of `BITS`-deep gray samples stored in `u16` to packed 16-bit RGBA without
/// changing the bit depth.
///
/// Each sample is loaded through `load_endian_u16x8::<BE>`, masked with
/// `bits_mask::<BITS>()` and replicated into R, G and B; the NEON path fills alpha with the
/// same `bits_mask::<BITS>()` value. When `full_range` is `false` the whole row is handed to
/// the scalar implementation; otherwise 8 pixels are handled per NEON iteration and the
/// remainder goes through the scalar routine with `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 4` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray_n_to_rgba_u16_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    if !full_range {
        scalar::gray_n_to_rgba_u16_row::<BITS, BE>(y_plane, out, width, full_range);
        return;
    }
    let mask = bits_mask::<BITS>();
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 8-sample load and the
    // 32-element store stay in bounds under the documented length requirements.
    unsafe {
        let depth_mask = vdupq_n_u16(mask);
        // Alpha uses the same constant as the sample mask.
        let opaque = vdupq_n_u16(mask);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let gray = vandq_u16(samples, depth_mask);
            vst4q_u16(dst.add(px * 4), uint16x8x4_t(gray, gray, gray, opaque));
        }
    }
    if simd_end < width {
        scalar::gray_n_to_rgba_u16_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
            true,
        );
    }
}
/// Reduces a row of `BITS`-deep gray samples stored in `u16` to 8-bit luma.
///
/// Each sample is loaded through `load_endian_u16x8::<BE>`, masked with
/// `bits_mask::<BITS>()`, shifted right by `BITS - 8` and narrowed to a byte. 8 pixels are
/// handled per NEON iteration; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` and `out` must each hold at least `width` elements;
/// these lengths are only verified by `debug_assert!` while the vector loop accesses memory
/// through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray_n_to_luma_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);
    let shift = (BITS - 8) as i16;
    let mask = bits_mask::<BITS>();
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 8-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let depth_mask = vdupq_n_u16(mask);
        // A negative per-lane shift count makes `vshlq_u16` shift right.
        let downshift = vdupq_n_s16(-shift);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let valid = vandq_u16(samples, depth_mask);
            vst1_u8(dst.add(px), vmovn_u16(vshlq_u16(valid, downshift)));
        }
    }
    if simd_end < width {
        scalar::gray_n_to_luma_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Copies a row of `BITS`-deep gray samples stored in `u16` to a 16-bit luma row, masking
/// each sample with `bits_mask::<BITS>()`.
///
/// Samples are loaded through `load_endian_u16x8::<BE>`. 8 pixels are handled per NEON
/// iteration; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` and `out` must each hold at least `width` elements;
/// these lengths are only verified by `debug_assert!` while the vector loop accesses memory
/// through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);
    let mask = bits_mask::<BITS>();
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 8-sample load and store stay in
    // bounds under the documented length requirements.
    unsafe {
        let depth_mask = vdupq_n_u16(mask);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            vst1q_u16(dst.add(px), vandq_u16(samples, depth_mask));
        }
    }
    if simd_end < width {
        scalar::gray_n_to_luma_u16_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Converts a row of `BITS`-deep gray samples stored in `u16` to planar 8-bit HSV: hue and
/// saturation are written as zero and the value plane receives the sample reduced to
/// 8 bits.
///
/// Each sample is loaded through `load_endian_u16x8::<BE>`, masked with
/// `bits_mask::<BITS>()`, shifted right by `BITS - 8` and narrowed to a byte. When
/// `full_range` is `false` the whole row is handed to the scalar implementation; otherwise
/// 8 pixels are handled per NEON iteration and the remainder goes through the scalar
/// routine with `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane`, `h_out`, `s_out` and `v_out` must each hold at least
/// `width` elements; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray_n_to_hsv_row<const BITS: u32, const BE: bool>(
    y_plane: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        scalar::gray_n_to_hsv_row::<BITS, BE>(y_plane, h_out, s_out, v_out, width, full_range);
        return;
    }
    let shift = (BITS - 8) as i16;
    let mask = bits_mask::<BITS>();
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 8-byte
    // stores stay in bounds under the documented length requirements.
    unsafe {
        let depth_mask = vdupq_n_u16(mask);
        let zeros = vdup_n_u8(0);
        // A negative per-lane shift count makes `vshlq_u16` shift right.
        let downshift = vdupq_n_s16(-shift);
        let src = y_plane.as_ptr().cast::<u8>();
        let hue = h_out.as_mut_ptr();
        let sat = s_out.as_mut_ptr();
        let val = v_out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let valid = vandq_u16(samples, depth_mask);
            let gray = vmovn_u16(vshlq_u16(valid, downshift));
            vst1_u8(hue.add(px), zeros);
            vst1_u8(sat.add(px), zeros);
            vst1_u8(val.add(px), gray);
        }
    }
    if simd_end < width {
        scalar::gray_n_to_hsv_row::<BITS, BE>(
            &y_plane[simd_end..width],
            &mut h_out[simd_end..width],
            &mut s_out[simd_end..width],
            &mut v_out[simd_end..width],
            width - simd_end,
            true,
        );
    }
}
/// Converts a row of 16-bit gray samples to packed 8-bit RGB, keeping the high byte of each
/// sample (`>> 8`) and replicating it into all three channels.
///
/// Samples are loaded through `load_endian_u16x8::<BE>`. When `full_range` is `false` the
/// whole row is handed to the scalar implementation; otherwise 8 pixels are handled per
/// NEON iteration and the remainder goes through the scalar routine with
/// `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 3` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray16_to_rgb_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    if !full_range {
        scalar::gray16_to_rgb_row::<BE>(y_plane, out, width, full_range);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 24-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let gray = vshrn_n_u16::<8>(samples);
            vst3_u8(dst.add(px * 3), uint8x8x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::gray16_to_rgb_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
            true,
        );
    }
}
/// Converts a row of 16-bit gray samples to packed 8-bit RGBA, keeping the high byte of
/// each sample (`>> 8`) for R, G and B; the NEON path writes `0xFF` as alpha.
///
/// Samples are loaded through `load_endian_u16x8::<BE>`. When `full_range` is `false` the
/// whole row is handed to the scalar implementation; otherwise 8 pixels are handled per
/// NEON iteration and the remainder goes through the scalar routine with
/// `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 4` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray16_to_rgba_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    if !full_range {
        scalar::gray16_to_rgba_row::<BE>(y_plane, out, width, full_range);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 32-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let opaque = vdup_n_u8(0xFF);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let gray = vshrn_n_u16::<8>(samples);
            vst4_u8(dst.add(px * 4), uint8x8x4_t(gray, gray, gray, opaque));
        }
    }
    if simd_end < width {
        scalar::gray16_to_rgba_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
            true,
        );
    }
}
/// Expands a row of 16-bit gray samples to packed 16-bit RGB by replicating each sample
/// into all three channels.
///
/// Samples are loaded through `load_endian_u16x8::<BE>`. When `full_range` is `false` the
/// whole row is handed to the scalar implementation; otherwise 8 pixels are handled per
/// NEON iteration and the remainder goes through the scalar routine with
/// `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 3` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray16_to_rgb_u16_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    if !full_range {
        scalar::gray16_to_rgb_u16_row::<BE>(y_plane, out, width, full_range);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 8-sample load and the
    // 24-element store stay in bounds under the documented length requirements.
    unsafe {
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let gray = load_endian_u16x8::<BE>(src.add(px * 2));
            vst3q_u16(dst.add(px * 3), uint16x8x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::gray16_to_rgb_u16_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
            true,
        );
    }
}
/// Expands a row of 16-bit gray samples to packed 16-bit RGBA; each sample is replicated
/// into R, G and B and the NEON path writes `0xFFFF` as alpha.
///
/// Samples are loaded through `load_endian_u16x8::<BE>`. When `full_range` is `false` the
/// whole row is handed to the scalar implementation; otherwise 8 pixels are handled per
/// NEON iteration and the remainder goes through the scalar routine with
/// `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 4` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray16_to_rgba_u16_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    if !full_range {
        scalar::gray16_to_rgba_u16_row::<BE>(y_plane, out, width, full_range);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 8-sample load and the
    // 32-element store stay in bounds under the documented length requirements.
    unsafe {
        let opaque = vdupq_n_u16(0xFFFF);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let gray = load_endian_u16x8::<BE>(src.add(px * 2));
            vst4q_u16(dst.add(px * 4), uint16x8x4_t(gray, gray, gray, opaque));
        }
    }
    if simd_end < width {
        scalar::gray16_to_rgba_u16_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
            true,
        );
    }
}
/// Reduces a row of 16-bit gray samples to 8-bit luma by keeping the high byte of each
/// sample (`>> 8`).
///
/// Samples are loaded through `load_endian_u16x8::<BE>`. 8 pixels are handled per NEON
/// iteration; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` and `out` must each hold at least `width` elements;
/// these lengths are only verified by `debug_assert!` while the vector loop accesses memory
/// through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray16_to_luma_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u8],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 8-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            vst1_u8(dst.add(px), vshrn_n_u16::<8>(samples));
        }
    }
    if simd_end < width {
        scalar::gray16_to_luma_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Copies a row of 16-bit gray samples to a 16-bit luma row, loading through
/// `load_endian_u16x8::<BE>` and storing the lanes as-is.
///
/// 8 pixels are handled per NEON iteration; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` and `out` must each hold at least `width` elements;
/// these lengths are only verified by `debug_assert!` while the vector loop accesses memory
/// through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray16_to_luma_u16_row<const BE: bool>(
    y_plane: &[u16],
    out: &mut [u16],
    width: usize,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 8-sample load and store stay in
    // bounds under the documented length requirements.
    unsafe {
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            vst1q_u16(dst.add(px), samples);
        }
    }
    if simd_end < width {
        scalar::gray16_to_luma_u16_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Converts a row of 16-bit gray samples to planar 8-bit HSV: hue and saturation are
/// written as zero and the value plane receives the high byte of each sample (`>> 8`).
///
/// Samples are loaded through `load_endian_u16x8::<BE>`. When `full_range` is `false` the
/// whole row is handed to the scalar implementation; otherwise 8 pixels are handled per
/// NEON iteration and the remainder goes through the scalar routine with
/// `full_range = true`.
///
/// # Safety
///
/// NEON must be available. `y_plane`, `h_out`, `s_out` and `v_out` must each hold at least
/// `width` elements; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn gray16_to_hsv_row<const BE: bool>(
    y_plane: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
    full_range: bool,
) {
    debug_assert!(y_plane.len() >= width);
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if !full_range {
        scalar::gray16_to_hsv_row::<BE>(y_plane, h_out, s_out, v_out, width, full_range);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 8-byte
    // stores stay in bounds under the documented length requirements.
    unsafe {
        let zeros = vdup_n_u8(0);
        let src = y_plane.as_ptr().cast::<u8>();
        let hue = h_out.as_mut_ptr();
        let sat = s_out.as_mut_ptr();
        let val = v_out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let samples = load_endian_u16x8::<BE>(src.add(px * 2));
            let gray = vshrn_n_u16::<8>(samples);
            vst1_u8(hue.add(px), zeros);
            vst1_u8(sat.add(px), zeros);
            vst1_u8(val.add(px), gray);
        }
    }
    if simd_end < width {
        scalar::gray16_to_hsv_row::<BE>(
            &y_plane[simd_end..width],
            &mut h_out[simd_end..width],
            &mut s_out[simd_end..width],
            &mut v_out[simd_end..width],
            width - simd_end,
            true,
        );
    }
}
/// Converts a row of `f32` gray samples to packed 8-bit RGB.
///
/// Each sample is loaded through `load_endian_u32x4::<BE>`, limited to `[0.0, 1.0]` with
/// `vminq_f32`/`vmaxq_f32`, multiplied by 255, converted to an integer with
/// `vcvtaq_u32_f32`, narrowed to a byte and replicated into all three channels. 8 pixels
/// are handled per NEON iteration; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 3` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_rgb_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so both 16-byte loads and the 24-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let gain = vdupq_n_f32(255.0);
        let floor = vdupq_n_f32(0.0);
        let ceil = vdupq_n_f32(1.0);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let first = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            let second = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add((px + 4) * 4)));
            let first = vmulq_f32(vmaxq_f32(vminq_f32(first, ceil), floor), gain);
            let second = vmulq_f32(vmaxq_f32(vminq_f32(second, ceil), floor), gain);
            let first = vmovn_u32(vcvtaq_u32_f32(first));
            let second = vmovn_u32(vcvtaq_u32_f32(second));
            let gray = vmovn_u16(vcombine_u16(first, second));
            vst3_u8(dst.add(px * 3), uint8x8x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::grayf32_to_rgb_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
        );
    }
}
/// Converts a row of `f32` gray samples to packed 8-bit RGBA.
///
/// Each sample is loaded through `load_endian_u32x4::<BE>`, limited to `[0.0, 1.0]` with
/// `vminq_f32`/`vmaxq_f32`, multiplied by 255, converted to an integer with
/// `vcvtaq_u32_f32`, narrowed to a byte and replicated into R, G and B; the NEON path
/// writes `0xFF` as alpha. 8 pixels are handled per NEON iteration; the remainder goes
/// through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 4` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_rgba_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so both 16-byte loads and the 32-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let gain = vdupq_n_f32(255.0);
        let floor = vdupq_n_f32(0.0);
        let ceil = vdupq_n_f32(1.0);
        let opaque = vdup_n_u8(0xFF);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let first = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            let second = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add((px + 4) * 4)));
            let first = vmulq_f32(vmaxq_f32(vminq_f32(first, ceil), floor), gain);
            let second = vmulq_f32(vmaxq_f32(vminq_f32(second, ceil), floor), gain);
            let first = vmovn_u32(vcvtaq_u32_f32(first));
            let second = vmovn_u32(vcvtaq_u32_f32(second));
            let gray = vmovn_u16(vcombine_u16(first, second));
            vst4_u8(dst.add(px * 4), uint8x8x4_t(gray, gray, gray, opaque));
        }
    }
    if simd_end < width {
        scalar::grayf32_to_rgba_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
        );
    }
}
/// Converts a row of `f32` gray samples to packed 16-bit RGB.
///
/// Each sample is loaded through `load_endian_u32x4::<BE>`, limited to `[0.0, 1.0]` with
/// `vminq_f32`/`vmaxq_f32`, multiplied by 65535, converted to an integer with
/// `vcvtaq_u32_f32`, narrowed with the saturating `vqmovn_u32` and replicated into all
/// three channels. 4 pixels are handled per NEON iteration; the remainder goes through the
/// scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 3` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_rgb_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    // Largest multiple of 4 pixels that fits in the row.
    let simd_end = width - width % 4;
    // SAFETY: every iteration has `px + 4 <= width`, so the 16-byte load and the
    // 12-element store stay in bounds under the documented length requirements.
    unsafe {
        let gain = vdupq_n_f32(65535.0);
        let floor = vdupq_n_f32(0.0);
        let ceil = vdupq_n_f32(1.0);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(4) {
            let samples = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            let scaled = vmulq_f32(vmaxq_f32(vminq_f32(samples, ceil), floor), gain);
            let gray = vqmovn_u32(vcvtaq_u32_f32(scaled));
            vst3_u16(dst.add(px * 3), uint16x4x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::grayf32_to_rgb_u16_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
        );
    }
}
/// Converts a row of `f32` gray samples to packed 16-bit RGBA.
///
/// Each sample is loaded through `load_endian_u32x4::<BE>`, limited to `[0.0, 1.0]` with
/// `vminq_f32`/`vmaxq_f32`, multiplied by 65535, converted to an integer with
/// `vcvtaq_u32_f32`, narrowed with the saturating `vqmovn_u32` and replicated into R, G
/// and B; the NEON path writes `0xFFFF` as alpha. 4 pixels are handled per NEON iteration;
/// the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 4` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_rgba_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 4);
    // Largest multiple of 4 pixels that fits in the row.
    let simd_end = width - width % 4;
    // SAFETY: every iteration has `px + 4 <= width`, so the 16-byte load and the
    // 16-element store stay in bounds under the documented length requirements.
    unsafe {
        let gain = vdupq_n_f32(65535.0);
        let floor = vdupq_n_f32(0.0);
        let ceil = vdupq_n_f32(1.0);
        let opaque = vdup_n_u16(0xFFFF);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(4) {
            let samples = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            let scaled = vmulq_f32(vmaxq_f32(vminq_f32(samples, ceil), floor), gain);
            let gray = vqmovn_u32(vcvtaq_u32_f32(scaled));
            vst4_u16(dst.add(px * 4), uint16x4x4_t(gray, gray, gray, opaque));
        }
    }
    if simd_end < width {
        scalar::grayf32_to_rgba_u16_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
        );
    }
}
/// Expands a row of `f32` gray samples to packed `f32` RGB by replicating each sample into
/// all three channels; no clamping or scaling is applied.
///
/// Samples are loaded through `load_endian_u32x4::<BE>`. 4 pixels are handled per NEON
/// iteration; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` must hold at least `width` samples and `out` at least
/// `width * 3` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
// NOTE(review): `dead_code` is allowed here presumably because no caller is wired up in
// every build configuration — confirm.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_rgb_f32_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [f32],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width * 3);
    // Largest multiple of 4 pixels that fits in the row.
    let simd_end = width - width % 4;
    // SAFETY: every iteration has `px + 4 <= width`, so the 16-byte load and the
    // 12-element store stay in bounds under the documented length requirements.
    unsafe {
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(4) {
            let gray = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            vst3q_f32(dst.add(px * 3), float32x4x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::grayf32_to_rgb_f32_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
        );
    }
}
/// Converts a row of `f32` gray samples to 8-bit luma.
///
/// Each sample is loaded through `load_endian_u32x4::<BE>`, limited to `[0.0, 1.0]` with
/// `vminq_f32`/`vmaxq_f32`, multiplied by 255, converted to an integer with
/// `vcvtaq_u32_f32` and narrowed to a byte. 8 pixels are handled per NEON iteration; the
/// remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` and `out` must each hold at least `width` elements;
/// these lengths are only verified by `debug_assert!` while the vector loop accesses memory
/// through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_luma_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so both 16-byte loads and the 8-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let gain = vdupq_n_f32(255.0);
        let floor = vdupq_n_f32(0.0);
        let ceil = vdupq_n_f32(1.0);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let first = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            let second = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add((px + 4) * 4)));
            let first = vmulq_f32(vmaxq_f32(vminq_f32(first, ceil), floor), gain);
            let second = vmulq_f32(vmaxq_f32(vminq_f32(second, ceil), floor), gain);
            let first = vmovn_u32(vcvtaq_u32_f32(first));
            let second = vmovn_u32(vcvtaq_u32_f32(second));
            vst1_u8(dst.add(px), vmovn_u16(vcombine_u16(first, second)));
        }
    }
    if simd_end < width {
        scalar::grayf32_to_luma_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Converts a row of `f32` gray samples to 16-bit luma.
///
/// Each sample is loaded through `load_endian_u32x4::<BE>`, limited to `[0.0, 1.0]` with
/// `vminq_f32`/`vmaxq_f32`, multiplied by 65535, converted to an integer with
/// `vcvtaq_u32_f32` and narrowed with the saturating `vqmovn_u32`. 4 pixels are handled per
/// NEON iteration; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` and `out` must each hold at least `width` elements;
/// these lengths are only verified by `debug_assert!` while the vector loop accesses memory
/// through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_luma_u16_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);
    // Largest multiple of 4 pixels that fits in the row.
    let simd_end = width - width % 4;
    // SAFETY: every iteration has `px + 4 <= width`, so the 16-byte load and the 4-element
    // store stay in bounds under the documented length requirements.
    unsafe {
        let gain = vdupq_n_f32(65535.0);
        let floor = vdupq_n_f32(0.0);
        let ceil = vdupq_n_f32(1.0);
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(4) {
            let samples = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            let scaled = vmulq_f32(vmaxq_f32(vminq_f32(samples, ceil), floor), gain);
            vst1_u16(dst.add(px), vqmovn_u32(vcvtaq_u32_f32(scaled)));
        }
    }
    if simd_end < width {
        scalar::grayf32_to_luma_u16_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Copies a row of `f32` gray samples to an `f32` luma row, loading through
/// `load_endian_u32x4::<BE>` and storing the lanes as-is (no clamping or scaling).
///
/// 4 pixels are handled per NEON iteration; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane` and `out` must each hold at least `width` elements;
/// these lengths are only verified by `debug_assert!` while the vector loop accesses memory
/// through raw pointers.
// NOTE(review): `dead_code` is allowed here presumably because no caller is wired up in
// every build configuration — confirm.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_luma_f32_row<const BE: bool>(
    y_plane: &[f32],
    out: &mut [f32],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    debug_assert!(out.len() >= width);
    // Largest multiple of 4 pixels that fits in the row.
    let simd_end = width - width % 4;
    // SAFETY: every iteration has `px + 4 <= width`, so the 16-byte load and store stay in
    // bounds under the documented length requirements.
    unsafe {
        let src = y_plane.as_ptr().cast::<u8>();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(4) {
            let samples = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            vst1q_f32(dst.add(px), samples);
        }
    }
    if simd_end < width {
        scalar::grayf32_to_luma_f32_row::<BE>(
            &y_plane[simd_end..width],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Converts a row of `f32` gray samples to planar 8-bit HSV: hue and saturation are written
/// as zero and the value plane receives the sample converted to 8 bits.
///
/// Each sample is loaded through `load_endian_u32x4::<BE>`, limited to `[0.0, 1.0]` with
/// `vminq_f32`/`vmaxq_f32`, multiplied by 255, converted to an integer with
/// `vcvtaq_u32_f32` and narrowed to a byte. 8 pixels are handled per NEON iteration; the
/// remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `y_plane`, `h_out`, `s_out` and `v_out` must each hold at least
/// `width` elements; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn grayf32_to_hsv_row<const BE: bool>(
    y_plane: &[f32],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::grayf32 as scalar;
    debug_assert!(y_plane.len() >= width);
    // The output planes are written through raw pointers below, so check them in debug
    // builds just like the other HSV kernels in this file do.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so both 16-byte loads and the 8-byte
    // stores stay in bounds under the documented length requirements.
    unsafe {
        let gain = vdupq_n_f32(255.0);
        let floor = vdupq_n_f32(0.0);
        let ceil = vdupq_n_f32(1.0);
        let zeros = vdup_n_u8(0);
        let src = y_plane.as_ptr().cast::<u8>();
        let hue = h_out.as_mut_ptr();
        let sat = s_out.as_mut_ptr();
        let val = v_out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let first = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add(px * 4)));
            let second = vreinterpretq_f32_u32(load_endian_u32x4::<BE>(src.add((px + 4) * 4)));
            let first = vmulq_f32(vmaxq_f32(vminq_f32(first, ceil), floor), gain);
            let second = vmulq_f32(vmaxq_f32(vminq_f32(second, ceil), floor), gain);
            let first = vmovn_u32(vcvtaq_u32_f32(first));
            let second = vmovn_u32(vcvtaq_u32_f32(second));
            let gray = vmovn_u16(vcombine_u16(first, second));
            vst1_u8(hue.add(px), zeros);
            vst1_u8(sat.add(px), zeros);
            vst1_u8(val.add(px), gray);
        }
    }
    if simd_end < width {
        scalar::grayf32_to_hsv_row::<BE>(
            &y_plane[simd_end..width],
            &mut h_out[simd_end..width],
            &mut s_out[simd_end..width],
            &mut v_out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 8-bit gray+alpha pixels to packed 8-bit RGB, replicating
/// the gray byte into all three channels and dropping alpha.
///
/// `vld2_u8` de-interleaves 8 pixels per NEON iteration; the remainder goes through the
/// scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` bytes and `out` at least
/// `width * 3` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya8_to_rgb_row(packed: &[u8], out: &mut [u8], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 24-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2_u8(src.add(px * 2));
            let gray = pair.0;
            vst3_u8(dst.add(px * 3), uint8x8x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::ya8_to_rgb_row(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 8-bit gray+alpha pixels to packed 8-bit RGBA, replicating
/// the gray byte into R, G and B and passing the source alpha through.
///
/// `vld2_u8` de-interleaves 8 pixels per NEON iteration; the remainder goes through the
/// scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` bytes and `out` at least
/// `width * 4` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya8_to_rgba_row(packed: &[u8], out: &mut [u8], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 32-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let uint8x8x2_t(gray, alpha) = vld2_u8(src.add(px * 2));
            vst4_u8(dst.add(px * 4), uint8x8x4_t(gray, gray, gray, alpha));
        }
    }
    if simd_end < width {
        scalar::ya8_to_rgba_row(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 8-bit gray+alpha pixels to packed 16-bit RGB.
///
/// The gray byte is zero-extended to `u16` with `vmovl_u8` (no rescaling in the NEON path)
/// and replicated into all three channels; alpha is dropped. 8 pixels are handled per NEON
/// iteration, stored as two groups of four; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` bytes and `out` at least
/// `width * 3` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya8_to_rgb_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and both
    // 12-element stores stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2_u8(src.add(px * 2));
            let wide = vmovl_u8(pair.0);
            let front = vget_low_u16(wide);
            let back = vget_high_u16(wide);
            vst3_u16(dst.add(px * 3), uint16x4x3_t(front, front, front));
            vst3_u16(dst.add((px + 4) * 3), uint16x4x3_t(back, back, back));
        }
    }
    if simd_end < width {
        scalar::ya8_to_rgb_u16_row(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 8-bit gray+alpha pixels to packed 16-bit RGBA.
///
/// Gray and alpha are both zero-extended to `u16` with `vmovl_u8` (no rescaling in the
/// NEON path); gray is replicated into R, G and B. 8 pixels are handled per NEON iteration,
/// stored as two groups of four; the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` bytes and `out` at least
/// `width * 4` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya8_to_rgba_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and both
    // 16-element stores stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2_u8(src.add(px * 2));
            let gray = vmovl_u8(pair.0);
            let alpha = vmovl_u8(pair.1);
            let (gray_front, gray_back) = (vget_low_u16(gray), vget_high_u16(gray));
            let (alpha_front, alpha_back) = (vget_low_u16(alpha), vget_high_u16(alpha));
            vst4_u16(
                dst.add(px * 4),
                uint16x4x4_t(gray_front, gray_front, gray_front, alpha_front),
            );
            vst4_u16(
                dst.add((px + 4) * 4),
                uint16x4x4_t(gray_back, gray_back, gray_back, alpha_back),
            );
        }
    }
    if simd_end < width {
        scalar::ya8_to_rgba_u16_row(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
        );
    }
}
/// Extracts the gray channel from a row of interleaved 8-bit gray+alpha pixels into an
/// 8-bit luma row.
///
/// `vld2_u8` de-interleaves 8 pixels per NEON iteration; the remainder goes through the
/// scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` bytes and `out` at least
/// `width` bytes; these lengths are only verified by `debug_assert!` while the vector loop
/// accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya8_to_luma_row(packed: &[u8], out: &mut [u8], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 8-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2_u8(src.add(px * 2));
            vst1_u8(dst.add(px), pair.0);
        }
    }
    if simd_end < width {
        scalar::ya8_to_luma_row(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Extracts the gray channel from a row of interleaved 8-bit gray+alpha pixels into a
/// 16-bit luma row, zero-extending each byte with `vmovl_u8` (no rescaling in the NEON
/// path).
///
/// `vld2_u8` de-interleaves 8 pixels per NEON iteration; the remainder goes through the
/// scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` bytes and `out` at least
/// `width` elements; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya8_to_luma_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 8-element
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2_u8(src.add(px * 2));
            vst1q_u16(dst.add(px), vmovl_u8(pair.0));
        }
    }
    if simd_end < width {
        scalar::ya8_to_luma_u16_row(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 8-bit gray+alpha pixels to planar 8-bit HSV: hue and
/// saturation are written as zero, the value plane receives the gray byte, and alpha is
/// dropped.
///
/// `vld2_u8` de-interleaves 8 pixels per NEON iteration; the remainder goes through the
/// scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` bytes and `h_out`,
/// `s_out` and `v_out` at least `width` bytes each; these lengths are only verified by
/// `debug_assert!` while the vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya8_to_hsv_row(
    packed: &[u8],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya8 as scalar;
    debug_assert!(packed.len() >= width * 2);
    // The output planes are written through raw pointers below, so check them in debug
    // builds just like the other HSV kernels in this file do.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-byte load and the 8-byte
    // stores stay in bounds under the documented length requirements.
    unsafe {
        let zeros = vdup_n_u8(0);
        let src = packed.as_ptr();
        let hue = h_out.as_mut_ptr();
        let sat = s_out.as_mut_ptr();
        let val = v_out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2_u8(src.add(px * 2));
            vst1_u8(hue.add(px), zeros);
            vst1_u8(sat.add(px), zeros);
            vst1_u8(val.add(px), pair.0);
        }
    }
    if simd_end < width {
        scalar::ya8_to_hsv_row(
            &packed[simd_end * 2..width * 2],
            &mut h_out[simd_end..width],
            &mut s_out[simd_end..width],
            &mut v_out[simd_end..width],
            width - simd_end,
        );
    }
}
/// `true` when the compilation target is big-endian. The `ya16_*` kernels compare their
/// `BE` parameter against this to decide whether the NEON path may be used.
const HOST_NATIVE_BE: bool = cfg!(not(target_endian = "little"));
/// Converts a row of interleaved 16-bit gray+alpha pixels to packed 8-bit RGB, keeping the
/// high byte of each gray sample (`>> 8`) and dropping alpha.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to the scalar
/// implementation, because the NEON path loads the samples with `vld2q_u16` without any
/// byte swapping. Otherwise 8 pixels are handled per NEON iteration and the remainder goes
/// through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` elements and `out` at
/// least `width * 3` bytes; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya16_to_rgb_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);
    if BE != HOST_NATIVE_BE {
        scalar::ya16_to_rgb_row::<BE>(packed, out, width);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-element load and the 24-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2q_u16(src.add(px * 2));
            let gray = vshrn_n_u16::<8>(pair.0);
            vst3_u8(dst.add(px * 3), uint8x8x3_t(gray, gray, gray));
        }
    }
    if simd_end < width {
        scalar::ya16_to_rgb_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 16-bit gray+alpha pixels to packed 8-bit RGBA, keeping the
/// high byte (`>> 8`) of both the gray and the alpha sample.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to the scalar
/// implementation, because the NEON path loads the samples with `vld2q_u16` without any
/// byte swapping. Otherwise 8 pixels are handled per NEON iteration and the remainder goes
/// through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` elements and `out` at
/// least `width * 4` bytes; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya16_to_rgba_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);
    if BE != HOST_NATIVE_BE {
        scalar::ya16_to_rgba_row::<BE>(packed, out, width);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-element load and the 32-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2q_u16(src.add(px * 2));
            let gray = vshrn_n_u16::<8>(pair.0);
            let alpha = vshrn_n_u16::<8>(pair.1);
            vst4_u8(dst.add(px * 4), uint8x8x4_t(gray, gray, gray, alpha));
        }
    }
    if simd_end < width {
        scalar::ya16_to_rgba_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 16-bit gray+alpha pixels to packed 16-bit RGB, replicating
/// the gray sample into all three channels and dropping alpha.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to the scalar
/// implementation, because the NEON path loads the samples with `vld2q_u16` without any
/// byte swapping. Otherwise 8 pixels are handled per NEON iteration, stored as two groups
/// of four, and the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` elements and `out` at
/// least `width * 3` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya16_to_rgb_u16_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 3);
    if BE != HOST_NATIVE_BE {
        scalar::ya16_to_rgb_u16_row::<BE>(packed, out, width);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-element load and both
    // 12-element stores stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2q_u16(src.add(px * 2));
            let front = vget_low_u16(pair.0);
            let back = vget_high_u16(pair.0);
            vst3_u16(dst.add(px * 3), uint16x4x3_t(front, front, front));
            vst3_u16(dst.add((px + 4) * 3), uint16x4x3_t(back, back, back));
        }
    }
    if simd_end < width {
        scalar::ya16_to_rgb_u16_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end * 3..width * 3],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 16-bit gray+alpha pixels to packed 16-bit RGBA,
/// replicating the gray sample into R, G and B and passing the source alpha through.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to the scalar
/// implementation, because the NEON path loads the samples with `vld2q_u16` without any
/// byte swapping. Otherwise 8 pixels are handled per NEON iteration, stored as two groups
/// of four, and the remainder goes through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` elements and `out` at
/// least `width * 4` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya16_to_rgba_u16_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width * 4);
    if BE != HOST_NATIVE_BE {
        scalar::ya16_to_rgba_u16_row::<BE>(packed, out, width);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-element load and both
    // 16-element stores stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2q_u16(src.add(px * 2));
            let (gray_front, gray_back) = (vget_low_u16(pair.0), vget_high_u16(pair.0));
            let (alpha_front, alpha_back) = (vget_low_u16(pair.1), vget_high_u16(pair.1));
            vst4_u16(
                dst.add(px * 4),
                uint16x4x4_t(gray_front, gray_front, gray_front, alpha_front),
            );
            vst4_u16(
                dst.add((px + 4) * 4),
                uint16x4x4_t(gray_back, gray_back, gray_back, alpha_back),
            );
        }
    }
    if simd_end < width {
        scalar::ya16_to_rgba_u16_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end * 4..width * 4],
            width - simd_end,
        );
    }
}
/// Extracts the gray channel from a row of interleaved 16-bit gray+alpha pixels into an
/// 8-bit luma row, keeping the high byte of each sample (`>> 8`).
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to the scalar
/// implementation, because the NEON path loads the samples with `vld2q_u16` without any
/// byte swapping. Otherwise 8 pixels are handled per NEON iteration and the remainder goes
/// through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` elements and `out` at
/// least `width` bytes; these lengths are only verified by `debug_assert!` while the vector
/// loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya16_to_luma_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);
    if BE != HOST_NATIVE_BE {
        scalar::ya16_to_luma_row::<BE>(packed, out, width);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-element load and the 8-byte
    // store stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2q_u16(src.add(px * 2));
            vst1_u8(dst.add(px), vshrn_n_u16::<8>(pair.0));
        }
    }
    if simd_end < width {
        scalar::ya16_to_luma_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Extracts the gray channel from a row of interleaved 16-bit gray+alpha pixels into a
/// 16-bit luma row, copying each gray sample unchanged.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to the scalar
/// implementation, because the NEON path loads the samples with `vld2q_u16` without any
/// byte swapping. Otherwise 8 pixels are handled per NEON iteration and the remainder goes
/// through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` elements and `out` at
/// least `width` elements; these lengths are only verified by `debug_assert!` while the
/// vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya16_to_luma_u16_row<const BE: bool>(
    packed: &[u16],
    out: &mut [u16],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    debug_assert!(out.len() >= width);
    if BE != HOST_NATIVE_BE {
        scalar::ya16_to_luma_u16_row::<BE>(packed, out, width);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-element load and the
    // 8-element store stay in bounds under the documented length requirements.
    unsafe {
        let src = packed.as_ptr();
        let dst = out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2q_u16(src.add(px * 2));
            vst1q_u16(dst.add(px), pair.0);
        }
    }
    if simd_end < width {
        scalar::ya16_to_luma_u16_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut out[simd_end..width],
            width - simd_end,
        );
    }
}
/// Converts a row of interleaved 16-bit gray+alpha pixels to planar 8-bit HSV: hue and
/// saturation are written as zero, the value plane receives the high byte of each gray
/// sample (`>> 8`), and alpha is dropped.
///
/// When `BE` differs from `HOST_NATIVE_BE` the whole row is handed to the scalar
/// implementation, because the NEON path loads the samples with `vld2q_u16` without any
/// byte swapping. Otherwise 8 pixels are handled per NEON iteration and the remainder goes
/// through the scalar routine.
///
/// # Safety
///
/// NEON must be available. `packed` must hold at least `width * 2` elements and `h_out`,
/// `s_out` and `v_out` at least `width` bytes each; these lengths are only verified by
/// `debug_assert!` while the vector loop accesses memory through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn ya16_to_hsv_row<const BE: bool>(
    packed: &[u16],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    use crate::row::scalar::ya16 as scalar;
    debug_assert!(packed.len() >= width * 2);
    // The output planes are written through raw pointers below, so check them in debug
    // builds just like the other HSV kernels in this file do.
    debug_assert!(h_out.len() >= width);
    debug_assert!(s_out.len() >= width);
    debug_assert!(v_out.len() >= width);
    if BE != HOST_NATIVE_BE {
        scalar::ya16_to_hsv_row::<BE>(packed, h_out, s_out, v_out, width);
        return;
    }
    // Largest multiple of 8 pixels that fits in the row.
    let simd_end = width - width % 8;
    // SAFETY: every iteration has `px + 8 <= width`, so the 16-element load and the 8-byte
    // stores stay in bounds under the documented length requirements.
    unsafe {
        let zeros = vdup_n_u8(0);
        let src = packed.as_ptr();
        let hue = h_out.as_mut_ptr();
        let sat = s_out.as_mut_ptr();
        let val = v_out.as_mut_ptr();
        for px in (0..simd_end).step_by(8) {
            let pair = vld2q_u16(src.add(px * 2));
            let gray = vshrn_n_u16::<8>(pair.0);
            vst1_u8(hue.add(px), zeros);
            vst1_u8(sat.add(px), zeros);
            vst1_u8(val.add(px), gray);
        }
    }
    if simd_end < width {
        scalar::ya16_to_hsv_row::<BE>(
            &packed[simd_end * 2..width * 2],
            &mut h_out[simd_end..width],
            &mut s_out[simd_end..width],
            &mut v_out[simd_end..width],
            width - simd_end,
        );
    }
}