use core::arch::wasm32::*;
use super::*;
#[inline]
#[target_feature(enable = "simd128")]
unsafe fn deinterleave_rgb_16px(ptr: *const u8) -> (v128, v128, v128) {
unsafe {
let in0 = v128_load(ptr.cast());
let in1 = v128_load(ptr.add(16).cast());
let in2 = v128_load(ptr.add(32).cast());
let mr0 = i8x16(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
let mr1 = i8x16(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1);
let mr2 = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13);
let r_u8 = v128_or(
v128_or(u8x16_swizzle(in0, mr0), u8x16_swizzle(in1, mr1)),
u8x16_swizzle(in2, mr2),
);
let mg0 = i8x16(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
let mg1 = i8x16(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1);
let mg2 = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14);
let g_u8 = v128_or(
v128_or(u8x16_swizzle(in0, mg0), u8x16_swizzle(in1, mg1)),
u8x16_swizzle(in2, mg2),
);
let mb0 = i8x16(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
let mb1 = i8x16(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1);
let mb2 = i8x16(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15);
let b_u8 = v128_or(
v128_or(u8x16_swizzle(in0, mb0), u8x16_swizzle(in1, mb1)),
u8x16_swizzle(in2, mb2),
);
(r_u8, g_u8, b_u8)
}
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgb_to_hsv_row(
rgb: &[u8],
h_out: &mut [u8],
s_out: &mut [u8],
v_out: &mut [u8],
width: usize,
) {
debug_assert!(rgb.len() >= width * 3);
debug_assert!(h_out.len() >= width);
debug_assert!(s_out.len() >= width);
debug_assert!(v_out.len() >= width);
unsafe {
let mut x = 0usize;
while x + 16 <= width {
let (r_u8, g_u8, b_u8) = deinterleave_rgb_16px(rgb.as_ptr().add(x * 3));
let (r0, r1, r2, r3) = u8x16_to_f32x4_quad(r_u8);
let (g0, g1, g2, g3) = u8x16_to_f32x4_quad(g_u8);
let (b0, b1, b2, b3) = u8x16_to_f32x4_quad(b_u8);
let (h0, s0, v0) = hsv_group(r0, g0, b0);
let (h1, s1, v1) = hsv_group(r1, g1, b1);
let (h2, s2, v2) = hsv_group(r2, g2, b2);
let (h3, s3, v3) = hsv_group(r3, g3, b3);
v128_store(
h_out.as_mut_ptr().add(x).cast(),
f32x4_quad_to_u8x16(h0, h1, h2, h3),
);
v128_store(
s_out.as_mut_ptr().add(x).cast(),
f32x4_quad_to_u8x16(s0, s1, s2, s3),
);
v128_store(
v_out.as_mut_ptr().add(x).cast(),
f32x4_quad_to_u8x16(v0, v1, v2, v3),
);
x += 16;
}
if x < width {
scalar::rgb_to_hsv_row(
&rgb[x * 3..width * 3],
&mut h_out[x..width],
&mut s_out[x..width],
&mut v_out[x..width],
width - x,
);
}
}
}
#[inline(always)]
fn u8x16_to_f32x4_quad(v: v128) -> (v128, v128, v128, v128) {
let u16_lo = u16x8_extend_low_u8x16(v);
let u16_hi = u16x8_extend_high_u8x16(v);
let u32_0 = u32x4_extend_low_u16x8(u16_lo);
let u32_1 = u32x4_extend_high_u16x8(u16_lo);
let u32_2 = u32x4_extend_low_u16x8(u16_hi);
let u32_3 = u32x4_extend_high_u16x8(u16_hi);
(
f32x4_convert_i32x4(u32_0),
f32x4_convert_i32x4(u32_1),
f32x4_convert_i32x4(u32_2),
f32x4_convert_i32x4(u32_3),
)
}
#[inline(always)]
fn f32x4_quad_to_u8x16(a: v128, b: v128, c: v128, d: v128) -> v128 {
let ai = i32x4_trunc_sat_f32x4(a);
let bi = i32x4_trunc_sat_f32x4(b);
let ci = i32x4_trunc_sat_f32x4(c);
let di = i32x4_trunc_sat_f32x4(d);
let ab = i16x8_narrow_i32x4(ai, bi);
let cd = i16x8_narrow_i32x4(ci, di);
u8x16_narrow_i16x8(ab, cd)
}
#[inline(always)]
fn hsv_group(r: v128, g: v128, b: v128) -> (v128, v128, v128) {
let zero = f32x4_splat(0.0);
let half = f32x4_splat(0.5);
let sixty = f32x4_splat(60.0);
let one_twenty = f32x4_splat(120.0);
let two_forty = f32x4_splat(240.0);
let three_sixty = f32x4_splat(360.0);
let one_seventy_nine = f32x4_splat(179.0);
let two_fifty_five = f32x4_splat(255.0);
let v = f32x4_max(f32x4_max(r, g), b);
let min_rgb = f32x4_min(f32x4_min(r, g), b);
let delta = f32x4_sub(v, min_rgb);
let mask_v_zero = f32x4_eq(v, zero);
let s_nonzero = f32x4_div(f32x4_mul(two_fifty_five, delta), v);
let s = v128_bitselect(zero, s_nonzero, mask_v_zero);
let mask_delta_zero = f32x4_eq(delta, zero);
let mask_v_is_r = f32x4_eq(v, r);
let mask_v_is_g = f32x4_eq(v, g);
let h_r_raw = f32x4_div(f32x4_mul(sixty, f32x4_sub(g, b)), delta);
let mask_neg = f32x4_lt(h_r_raw, zero);
let h_r = v128_bitselect(f32x4_add(h_r_raw, three_sixty), h_r_raw, mask_neg);
let h_g = f32x4_add(
f32x4_div(f32x4_mul(sixty, f32x4_sub(b, r)), delta),
one_twenty,
);
let h_b = f32x4_add(
f32x4_div(f32x4_mul(sixty, f32x4_sub(r, g)), delta),
two_forty,
);
let h_g_or_b = v128_bitselect(h_g, h_b, mask_v_is_g);
let h_nonzero = v128_bitselect(h_r, h_g_or_b, mask_v_is_r);
let hue = v128_bitselect(zero, h_nonzero, mask_delta_zero);
let h_quant = f32x4_min(
f32x4_max(f32x4_add(f32x4_mul(hue, half), half), zero),
one_seventy_nine,
);
let s_quant = f32x4_min(f32x4_max(f32x4_add(s, half), zero), two_fifty_five);
let v_quant = f32x4_min(f32x4_max(f32x4_add(v, half), zero), two_fifty_five);
(h_quant, s_quant, v_quant)
}
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn rgb_to_luma_row(
rgb: &[u8],
luma_out: &mut [u8],
width: usize,
matrix: ColorMatrix,
full_range: bool,
) {
debug_assert!(rgb.len() >= width * 3);
debug_assert!(luma_out.len() >= width);
let (k_r, k_g, k_b) = scalar::luma_coefficients_q15(matrix);
let kr_v = i32x4_splat(k_r);
let kg_v = i32x4_splat(k_g);
let kb_v = i32x4_splat(k_b);
let rnd_v = i32x4_splat(1 << 14);
unsafe {
let mut x = 0usize;
while x + 16 <= width {
let (r_u8, g_u8, b_u8) = deinterleave_rgb_16px(rgb.as_ptr().add(x * 3));
let r_lo_i16 = i16x8_extend_low_u8x16(r_u8);
let r_hi_i16 = i16x8_extend_high_u8x16(r_u8);
let g_lo_i16 = i16x8_extend_low_u8x16(g_u8);
let g_hi_i16 = i16x8_extend_high_u8x16(g_u8);
let b_lo_i16 = i16x8_extend_low_u8x16(b_u8);
let b_hi_i16 = i16x8_extend_high_u8x16(b_u8);
let y0 = q15_luma(r_lo_i16, g_lo_i16, b_lo_i16, false, kr_v, kg_v, kb_v, rnd_v);
let y1 = q15_luma(r_lo_i16, g_lo_i16, b_lo_i16, true, kr_v, kg_v, kb_v, rnd_v);
let y2 = q15_luma(r_hi_i16, g_hi_i16, b_hi_i16, false, kr_v, kg_v, kb_v, rnd_v);
let y3 = q15_luma(r_hi_i16, g_hi_i16, b_hi_i16, true, kr_v, kg_v, kb_v, rnd_v);
let y_lo_i16 = i16x8_narrow_i32x4(y0, y1);
let y_hi_i16 = i16x8_narrow_i32x4(y2, y3);
let y_u8 = if full_range {
u8x16_narrow_i16x8(y_lo_i16, y_hi_i16)
} else {
let y_clamp_u8 = u8x16_narrow_i16x8(y_lo_i16, y_hi_i16);
limited_range_scale_16(y_clamp_u8, rnd_v)
};
v128_store(luma_out.as_mut_ptr().add(x).cast(), y_u8);
x += 16;
}
if x < width {
scalar::rgb_to_luma_row(
&rgb[x * 3..width * 3],
&mut luma_out[x..width],
width - x,
matrix,
full_range,
);
}
}
}
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn q15_luma(
r_i16: v128,
g_i16: v128,
b_i16: v128,
high: bool,
kr_v: v128,
kg_v: v128,
kb_v: v128,
rnd_v: v128,
) -> v128 {
let (r, g, b) = if high {
(
i32x4_extend_high_i16x8(r_i16),
i32x4_extend_high_i16x8(g_i16),
i32x4_extend_high_i16x8(b_i16),
)
} else {
(
i32x4_extend_low_i16x8(r_i16),
i32x4_extend_low_i16x8(g_i16),
i32x4_extend_low_i16x8(b_i16),
)
};
let acc = i32x4_mul(r, kr_v);
let acc = i32x4_add(acc, i32x4_mul(g, kg_v));
let acc = i32x4_add(acc, i32x4_mul(b, kb_v));
let acc = i32x4_add(acc, rnd_v);
i32x4_shr(acc, 15)
}
#[inline(always)]
fn limited_range_scale_16(y_clamp_u8: v128, rnd_v: v128) -> v128 {
let scale = i32x4_splat(28142);
let off = i16x8_splat(16);
let y_lo_i16 = i16x8_extend_low_u8x16(y_clamp_u8);
let y_hi_i16 = i16x8_extend_high_u8x16(y_clamp_u8);
let y_lim_lo = limited_range_quarter(y_lo_i16, scale, rnd_v);
let y_lim_hi = limited_range_quarter(y_hi_i16, scale, rnd_v);
let y_lim_lo = i16x8_add(y_lim_lo, off);
let y_lim_hi = i16x8_add(y_lim_hi, off);
u8x16_narrow_i16x8(y_lim_lo, y_lim_hi)
}
#[inline(always)]
fn limited_range_quarter(y_i16: v128, scale_i32: v128, rnd_v: v128) -> v128 {
let lo_i32 = i32x4_extend_low_i16x8(y_i16);
let hi_i32 = i32x4_extend_high_i16x8(y_i16);
let lo = i32x4_shr(i32x4_add(i32x4_mul(lo_i32, scale_i32), rnd_v), 15);
let hi = i32x4_shr(i32x4_add(i32x4_mul(hi_i32, scale_i32), rnd_v), 15);
i16x8_narrow_i32x4(lo, hi)
}