use core::arch::aarch64::*;
use crate::{ColorMatrix, row::scalar};
/// Converts one row of packed 8-bit RGB pixels into planar H, S and V rows.
///
/// Whole groups of 16 pixels take the NEON path: the row is de-interleaved
/// with `vld3q_u8`, each channel is widened to four `f32x4` vectors,
/// `hsv_group` is evaluated on every vector, and the results are narrowed
/// back to bytes and stored. Whatever is left over (fewer than 16 pixels) is
/// handed to `scalar::rgb_to_hsv_row`.
///
/// # Safety
///
/// The caller must ensure that NEON is available, that `rgb` holds at least
/// `width * 3` bytes and that `h_out`, `s_out` and `v_out` each hold at least
/// `width` bytes. The lengths are only verified by `debug_assert!`, and the
/// vector loop reads and writes through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgb_to_hsv_row(
    rgb: &[u8],
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    width: usize,
) {
    debug_assert!(rgb.len() >= width * 3, "rgb row too short");
    debug_assert!(h_out.len() >= width, "H row too short");
    debug_assert!(s_out.len() >= width, "S row too short");
    debug_assert!(v_out.len() >= width, "V row too short");

    /// Pixels consumed per vector iteration.
    const LANES: usize = 16;
    // First pixel index that no longer fits a full 16-pixel group.
    let simd_end = width - width % LANES;

    // SAFETY: every raw access below touches pixels `px..px + 16` with
    // `px + 16 <= width`, which is in bounds under the documented contract.
    unsafe {
        let mut px = 0usize;
        while px < simd_end {
            // De-interleave 48 bytes into separate R, G and B vectors.
            let planes = vld3q_u8(rgb.as_ptr().add(px * 3));
            let red = u8x16_to_f32x4_quad(planes.0);
            let green = u8x16_to_f32x4_quad(planes.1);
            let blue = u8x16_to_f32x4_quad(planes.2);

            // Four pixels per call; `hsv_group` takes its inputs in B, G, R order.
            let (h0, s0, v0) = hsv_group(blue.0, green.0, red.0);
            let (h1, s1, v1) = hsv_group(blue.1, green.1, red.1);
            let (h2, s2, v2) = hsv_group(blue.2, green.2, red.2);
            let (h3, s3, v3) = hsv_group(blue.3, green.3, red.3);

            vst1q_u8(
                h_out.as_mut_ptr().add(px),
                f32x4_quad_to_u8x16(h0, h1, h2, h3),
            );
            vst1q_u8(
                s_out.as_mut_ptr().add(px),
                f32x4_quad_to_u8x16(s0, s1, s2, s3),
            );
            vst1q_u8(
                v_out.as_mut_ptr().add(px),
                f32x4_quad_to_u8x16(v0, v1, v2, v3),
            );

            px += LANES;
        }

        // Scalar fallback for the trailing `width % 16` pixels.
        if px < width {
            let remaining = width - px;
            scalar::rgb_to_hsv_row(
                &rgb[px * 3..width * 3],
                &mut h_out[px..width],
                &mut s_out[px..width],
                &mut v_out[px..width],
                remaining,
            );
        }
    }
}
/// Widens the 16 unsigned bytes of `v` into four `f32x4` vectors.
///
/// The returned tuple keeps lane order: element 0 holds lanes 0..=3 of `v`,
/// element 1 lanes 4..=7, element 2 lanes 8..=11 and element 3 lanes 12..=15.
#[inline(always)]
fn u8x16_to_f32x4_quad(v: uint8x16_t) -> (float32x4_t, float32x4_t, float32x4_t, float32x4_t) {
    // SAFETY: only register-to-register NEON intrinsics are used; no memory
    // is accessed.
    unsafe {
        // u8 -> u16, one half of the input vector at a time.
        let lower_half = vmovl_u8(vget_low_u8(v));
        let upper_half = vmovl_u8(vget_high_u8(v));

        // u16 -> u32 -> f32 for each group of four lanes.
        let lanes_0_3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lower_half)));
        let lanes_4_7 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(lower_half)));
        let lanes_8_11 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(upper_half)));
        let lanes_12_15 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(upper_half)));

        (lanes_0_3, lanes_4_7, lanes_8_11, lanes_12_15)
    }
}
/// Computes hue, saturation and value for four pixels given as float B, G, R
/// vectors.
///
/// Per lane:
/// * value is `max(b, g, r)`;
/// * saturation is `255 * (max - min) / max`, or 0 where the maximum is 0;
/// * hue is in degrees, picked by which channel equals the maximum (red is
///   tested first, then green, otherwise blue), with negative red-sector
///   results wrapped by +360, and 0 where `max == min`.
///
/// The returned `(h, s, v)` vectors are still floats: hue is halved, every
/// component has 0.5 added, and the results are clamped to `[0, 179]` for hue
/// and `[0, 255]` for saturation and value. No integer conversion happens in
/// this function.
#[inline(always)]
fn hsv_group(
    b: float32x4_t,
    g: float32x4_t,
    r: float32x4_t,
) -> (float32x4_t, float32x4_t, float32x4_t) {
    // SAFETY: only register-to-register NEON intrinsics are used; no memory
    // is accessed.
    unsafe {
        let zero = vdupq_n_f32(0.0);
        let half = vdupq_n_f32(0.5);
        let sixty = vdupq_n_f32(60.0);
        let byte_max = vdupq_n_f32(255.0);

        // Value is the channel maximum; chroma is the max-min spread.
        let value = vmaxq_f32(vmaxq_f32(b, g), r);
        let chroma = vsubq_f32(value, vminq_f32(vminq_f32(b, g), r));

        // Saturation. The division yields NaN for value == 0, so those lanes
        // are replaced by 0 through the select.
        let value_is_zero = vceqq_f32(value, zero);
        let saturation_raw = vdivq_f32(vmulq_f32(byte_max, chroma), value);
        let saturation = vbslq_f32(value_is_zero, zero, saturation_raw);

        // Candidate hue for each possible dominant channel.
        let red_sector = vdivq_f32(vmulq_f32(sixty, vsubq_f32(g, b)), chroma);
        let red_hue = vbslq_f32(
            vcltq_f32(red_sector, zero),
            vaddq_f32(red_sector, vdupq_n_f32(360.0)),
            red_sector,
        );
        let green_hue = vaddq_f32(
            vdivq_f32(vmulq_f32(sixty, vsubq_f32(b, r)), chroma),
            vdupq_n_f32(120.0),
        );
        let blue_hue = vaddq_f32(
            vdivq_f32(vmulq_f32(sixty, vsubq_f32(r, g)), chroma),
            vdupq_n_f32(240.0),
        );

        // Select with priority red > green > blue, then zero out lanes with
        // no chroma (which also discards the NaN/inf from dividing by 0).
        let non_red_hue = vbslq_f32(vceqq_f32(value, g), green_hue, blue_hue);
        let chromatic_hue = vbslq_f32(vceqq_f32(value, r), red_hue, non_red_hue);
        let hue = vbslq_f32(vceqq_f32(chroma, zero), zero, chromatic_hue);

        // Bias by 0.5 and clamp each component to its output range.
        let h_biased = vaddq_f32(vmulq_f32(hue, half), half);
        let s_biased = vaddq_f32(saturation, half);
        let v_biased = vaddq_f32(value, half);

        (
            vminq_f32(vmaxq_f32(h_biased, zero), vdupq_n_f32(179.0)),
            vminq_f32(vmaxq_f32(s_biased, zero), byte_max),
            vminq_f32(vmaxq_f32(v_biased, zero), byte_max),
        )
    }
}
/// Packs four `f32x4` vectors into one `u8x16`, keeping lane order
/// (`a` fills lanes 0..=3, `b` 4..=7, `c` 8..=11, `d` 12..=15).
///
/// Each float is converted with `vcvtq_u32_f32` and then narrowed twice with
/// the plain `vmovn` intrinsics, which keep only the low bits, so inputs are
/// expected to already lie within `0..=255`.
#[inline(always)]
fn f32x4_quad_to_u8x16(
    a: float32x4_t,
    b: float32x4_t,
    c: float32x4_t,
    d: float32x4_t,
) -> uint8x16_t {
    // SAFETY: only register-to-register NEON intrinsics are used; no memory
    // is accessed.
    unsafe {
        // f32 -> u32 -> u16 for each pair, then u16 -> u8 per half.
        let lanes_0_7 = vmovn_u16(vcombine_u16(
            vmovn_u32(vcvtq_u32_f32(a)),
            vmovn_u32(vcvtq_u32_f32(b)),
        ));
        let lanes_8_15 = vmovn_u16(vcombine_u16(
            vmovn_u32(vcvtq_u32_f32(c)),
            vmovn_u32(vcvtq_u32_f32(d)),
        ));
        vcombine_u8(lanes_0_7, lanes_8_15)
    }
}
/// Converts one row of packed 8-bit RGB pixels into a row of 8-bit luma.
///
/// The channel weights come from `scalar::luma_coefficients_q15(matrix)` and
/// are applied as Q15 fixed-point multiplies (`q15_luma`). With
/// `full_range == true` the clamped result is stored directly; otherwise the
/// clamped value is additionally passed through `limited_range_scale`
/// (scale 28142 in Q15, offset 16) and clamped again.
///
/// Whole groups of 16 pixels take the NEON path; the remaining pixels (fewer
/// than 16) are handed to `scalar::rgb_to_luma_row`.
///
/// # Safety
///
/// The caller must ensure that NEON is available, that `rgb` holds at least
/// `width * 3` bytes and that `luma_out` holds at least `width` bytes. The
/// lengths are only verified by `debug_assert!`, and the vector loop reads
/// and writes through raw pointers.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgb_to_luma_row(
    rgb: &[u8],
    luma_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    debug_assert!(rgb.len() >= width * 3, "rgb row too short");
    debug_assert!(luma_out.len() >= width, "luma row too short");

    /// Pixels consumed per vector iteration.
    const LANES: usize = 16;

    let (k_r, k_g, k_b) = scalar::luma_coefficients_q15(matrix);
    let weight_r = vdup_n_s16(k_r as i16);
    let weight_g = vdup_n_s16(k_g as i16);
    let weight_b = vdup_n_s16(k_b as i16);
    // 0.5 in Q15, added before the >> 15 so the shift rounds to nearest.
    let round_q15 = vdupq_n_s32(1 << 14);
    // 28142 / 32768 ~= 0.8588; with the +16 offset this maps 0..=255 onto
    // 16..=235.
    let limited_scale = vdup_n_s16(28142);
    let limited_offset = vdupq_n_s16(16);

    // Weighted sum for four pixels with the row-constant operands bound.
    let luma4 = |r: int16x4_t, g: int16x4_t, b: int16x4_t| {
        q15_luma(r, g, b, weight_r, weight_g, weight_b, round_q15)
    };

    // First pixel index that no longer fits a full 16-pixel group.
    let simd_end = width - width % LANES;

    // SAFETY: every raw access below touches pixels `px..px + 16` with
    // `px + 16 <= width`, which is in bounds under the documented contract.
    unsafe {
        let mut px = 0usize;
        while px < simd_end {
            // De-interleave 48 bytes into separate R, G and B vectors.
            let planes = vld3q_u8(rgb.as_ptr().add(px * 3));

            // Widen each channel to 16 bits; values are 0..=255, so the
            // reinterpretation as signed is lossless.
            let r_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(planes.0)));
            let r_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(planes.0)));
            let g_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(planes.1)));
            let g_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(planes.1)));
            let b_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(planes.2)));
            let b_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(planes.2)));

            // Four pixels per `luma4` call, saturating-narrowed back to i16.
            let y_lo = vcombine_s16(
                vqmovn_s32(luma4(
                    vget_low_s16(r_lo),
                    vget_low_s16(g_lo),
                    vget_low_s16(b_lo),
                )),
                vqmovn_s32(luma4(
                    vget_high_s16(r_lo),
                    vget_high_s16(g_lo),
                    vget_high_s16(b_lo),
                )),
            );
            let y_hi = vcombine_s16(
                vqmovn_s32(luma4(
                    vget_low_s16(r_hi),
                    vget_low_s16(g_hi),
                    vget_low_s16(b_hi),
                )),
                vqmovn_s32(luma4(
                    vget_high_s16(r_hi),
                    vget_high_s16(g_hi),
                    vget_high_s16(b_hi),
                )),
            );

            // Saturate to 0..=255; both output modes start from this value.
            let clamped_lo = vqmovun_s16(y_lo);
            let clamped_hi = vqmovun_s16(y_hi);

            let y_u8 = if full_range {
                vcombine_u8(clamped_lo, clamped_hi)
            } else {
                let limited_lo = limited_range_scale(
                    vreinterpretq_s16_u16(vmovl_u8(clamped_lo)),
                    limited_scale,
                    limited_offset,
                    round_q15,
                );
                let limited_hi = limited_range_scale(
                    vreinterpretq_s16_u16(vmovl_u8(clamped_hi)),
                    limited_scale,
                    limited_offset,
                    round_q15,
                );
                vcombine_u8(vqmovun_s16(limited_lo), vqmovun_s16(limited_hi))
            };

            vst1q_u8(luma_out.as_mut_ptr().add(px), y_u8);
            px += LANES;
        }

        // Scalar fallback for the trailing `width % 16` pixels.
        if px < width {
            let remaining = width - px;
            scalar::rgb_to_luma_row(
                &rgb[px * 3..width * 3],
                &mut luma_out[px..width],
                remaining,
                matrix,
                full_range,
            );
        }
    }
}
/// Computes `(r * kr + g * kg + b * kb + rnd) >> 15` for four lanes.
///
/// The 16-bit inputs are multiplied into 32-bit accumulators with widening
/// multiply / multiply-accumulate, `rnd` is added, and the sum is shifted
/// right by 15 with a signed (arithmetic) shift.
#[inline(always)]
fn q15_luma(
    r: int16x4_t,
    g: int16x4_t,
    b: int16x4_t,
    kr: int16x4_t,
    kg: int16x4_t,
    kb: int16x4_t,
    rnd: int32x4_t,
) -> int32x4_t {
    // SAFETY: only register-to-register NEON intrinsics are used; no memory
    // is accessed.
    unsafe {
        // r*kr, then accumulate g*kg and b*kb, all in 32 bits.
        let weighted_sum = vmlal_s16(vmlal_s16(vmull_s16(r, kr), g, kg), b, kb);
        vshrq_n_s32::<15>(vaddq_s32(weighted_sum, rnd))
    }
}
/// Computes `saturate_i16((yc * scale + rnd) >> 15) + off` for eight lanes.
///
/// Each half of `yc` is multiplied by `scale` into 32-bit lanes, `rnd` is
/// added, the sum is shifted right by 15 (arithmetic shift) and
/// saturating-narrowed back to 16 bits. `off` is then added with a plain,
/// non-saturating 16-bit add.
#[inline(always)]
fn limited_range_scale(
    yc: int16x8_t,
    scale: int16x4_t,
    off: int16x8_t,
    rnd: int32x4_t,
) -> int16x8_t {
    // SAFETY: only register-to-register NEON intrinsics are used; no memory
    // is accessed.
    unsafe {
        let scaled_lo = vqmovn_s32(vshrq_n_s32::<15>(vaddq_s32(
            vmull_s16(vget_low_s16(yc), scale),
            rnd,
        )));
        let scaled_hi = vqmovn_s32(vshrq_n_s32::<15>(vaddq_s32(
            vmull_s16(vget_high_s16(yc), scale),
            rnd,
        )));
        vaddq_s16(vcombine_s16(scaled_lo, scaled_hi), off)
    }
}