#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
// `pshufb` masks that de-interleave 16 BGR pixels (48 bytes, loaded as three
// consecutive 16-byte blocks) into separate B, G and R byte vectors.
// Each mask pulls one channel's bytes out of one source block into that
// channel's destination lanes; a -1 index makes `_mm_shuffle_epi8` write
// zero, so the three shuffled results for a channel occupy disjoint lanes
// and can simply be OR-ed together. The channel phase shifts by one byte
// per block because 16 is not a multiple of the 3-byte pixel stride.
const BLK0_B: [i8; 16] = [0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];
const BLK0_G: [i8; 16] = [1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];
const BLK0_R: [i8; 16] = [2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];
const BLK1_B: [i8; 16] = [-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1];
const BLK1_G: [i8; 16] = [-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1];
const BLK1_R: [i8; 16] = [-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1];
const BLK2_B: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13];
const BLK2_G: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14];
const BLK2_R: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15];
/// De-interleaves a packed BGR image and converts it to planar HSV,
/// writing the three planes into `h_out`, `s_out` and `v_out`.
///
/// Processes 16 pixels per SIMD iteration (SSSE3 shuffles to split the
/// channels, AVX2 float math for the conversion) and falls back to the
/// scalar per-pixel routine for the `width % 16` remainder of each row.
/// Output convention: H is halved into `0..=179`, S and V are `0..=255`;
/// all three are rounded half-up and clamped to those ranges.
///
/// # Safety
/// The caller must guarantee that:
/// - AVX2 and SSSE3 are available on the running CPU (this function is
///   compiled with `#[target_feature]` for both).
/// - `src` holds at least `(height - 1) * stride + width * 3` bytes, so
///   every row provides `width * 3` readable bytes starting at
///   `y * stride` (the vector loop reads 48 bytes per 16-pixel step, which
///   stays within that bound).
/// - `h_out`, `s_out` and `v_out` each hold at least `width * height`
///   bytes; the planes are written densely, `width` bytes per row.
#[target_feature(enable = "avx2", enable = "ssse3")]
#[allow(unused_unsafe)]
pub(super) unsafe fn bgr_to_hsv_planes(
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    src: &[u8],
    width: u32,
    height: u32,
    stride: u32,
) {
    const LANES: usize = 16;
    let w = width as usize;
    let h = height as usize;
    let s = stride as usize;
    // Pixels per row handled by the vector loop; the tail is scalar.
    let whole = w / LANES * LANES;
    // Channel-gather shuffle masks for the three 16-byte blocks of a
    // 48-byte (16-pixel) BGR group.
    let m_b0 = unsafe { _mm_loadu_si128(BLK0_B.as_ptr() as *const __m128i) };
    let m_g0 = unsafe { _mm_loadu_si128(BLK0_G.as_ptr() as *const __m128i) };
    let m_r0 = unsafe { _mm_loadu_si128(BLK0_R.as_ptr() as *const __m128i) };
    let m_b1 = unsafe { _mm_loadu_si128(BLK1_B.as_ptr() as *const __m128i) };
    let m_g1 = unsafe { _mm_loadu_si128(BLK1_G.as_ptr() as *const __m128i) };
    let m_r1 = unsafe { _mm_loadu_si128(BLK1_R.as_ptr() as *const __m128i) };
    let m_b2 = unsafe { _mm_loadu_si128(BLK2_B.as_ptr() as *const __m128i) };
    let m_g2 = unsafe { _mm_loadu_si128(BLK2_G.as_ptr() as *const __m128i) };
    let m_r2 = unsafe { _mm_loadu_si128(BLK2_R.as_ptr() as *const __m128i) };
    for y in 0..h {
        let row_base = y * s;
        let dst_off = y * w;
        let mut x = 0;
        while x < whole {
            // 48 bytes = 16 interleaved BGR pixels, read as three blocks.
            let p = unsafe { src.as_ptr().add(row_base + x * 3) };
            let blk0 = unsafe { _mm_loadu_si128(p as *const __m128i) };
            let blk1 = unsafe { _mm_loadu_si128(p.add(16) as *const __m128i) };
            let blk2 = unsafe { _mm_loadu_si128(p.add(32) as *const __m128i) };
            // The per-block shuffles write disjoint lanes (zero elsewhere),
            // so OR merges the three partial results into one full channel.
            let b = unsafe {
                _mm_or_si128(
                    _mm_or_si128(_mm_shuffle_epi8(blk0, m_b0), _mm_shuffle_epi8(blk1, m_b1)),
                    _mm_shuffle_epi8(blk2, m_b2),
                )
            };
            let g = unsafe {
                _mm_or_si128(
                    _mm_or_si128(_mm_shuffle_epi8(blk0, m_g0), _mm_shuffle_epi8(blk1, m_g1)),
                    _mm_shuffle_epi8(blk2, m_g2),
                )
            };
            let r = unsafe {
                _mm_or_si128(
                    _mm_or_si128(_mm_shuffle_epi8(blk0, m_r0), _mm_shuffle_epi8(blk1, m_r1)),
                    _mm_shuffle_epi8(blk2, m_r2),
                )
            };
            // Widen u8 -> i32 -> f32, low and high 8 lanes separately.
            let b_lo32 = unsafe { _mm256_cvtepu8_epi32(b) };
            let b_hi32 = unsafe { _mm256_cvtepu8_epi32(_mm_unpackhi_epi64(b, b)) };
            let g_lo32 = unsafe { _mm256_cvtepu8_epi32(g) };
            let g_hi32 = unsafe { _mm256_cvtepu8_epi32(_mm_unpackhi_epi64(g, g)) };
            let r_lo32 = unsafe { _mm256_cvtepu8_epi32(r) };
            let r_hi32 = unsafe { _mm256_cvtepu8_epi32(_mm_unpackhi_epi64(r, r)) };
            let b_lo = unsafe { _mm256_cvtepi32_ps(b_lo32) };
            let b_hi = unsafe { _mm256_cvtepi32_ps(b_hi32) };
            let g_lo = unsafe { _mm256_cvtepi32_ps(g_lo32) };
            let g_hi = unsafe { _mm256_cvtepi32_ps(g_hi32) };
            let r_lo = unsafe { _mm256_cvtepi32_ps(r_lo32) };
            let r_hi = unsafe { _mm256_cvtepi32_ps(r_hi32) };
            let (hue_lo, sat_lo, val_lo) = unsafe { bgr_to_hsv_f32x8(b_lo, g_lo, r_lo) };
            let (hue_hi, sat_hi, val_hi) = unsafe { bgr_to_hsv_f32x8(b_hi, g_hi, r_hi) };
            // Round half-up via truncation after +0.5 (all values are >= 0
            // here); hue is additionally halved to fit the 0..=179 range.
            let half = unsafe { _mm256_set1_ps(0.5) };
            let hh_lo_i =
                unsafe { _mm256_cvttps_epi32(_mm256_add_ps(_mm256_mul_ps(hue_lo, half), half)) };
            let hh_hi_i =
                unsafe { _mm256_cvttps_epi32(_mm256_add_ps(_mm256_mul_ps(hue_hi, half), half)) };
            let ss_lo_i = unsafe { _mm256_cvttps_epi32(_mm256_add_ps(sat_lo, half)) };
            let ss_hi_i = unsafe { _mm256_cvttps_epi32(_mm256_add_ps(sat_hi, half)) };
            let vv_lo_i = unsafe { _mm256_cvttps_epi32(_mm256_add_ps(val_lo, half)) };
            let vv_hi_i = unsafe { _mm256_cvttps_epi32(_mm256_add_ps(val_hi, half)) };
            // Clamp to the byte ranges (rounding can overshoot by one).
            let h_lo = unsafe { _mm256_min_epi32(hh_lo_i, _mm256_set1_epi32(179)) };
            let h_hi = unsafe { _mm256_min_epi32(hh_hi_i, _mm256_set1_epi32(179)) };
            let s_lo = unsafe { _mm256_min_epi32(ss_lo_i, _mm256_set1_epi32(255)) };
            let s_hi = unsafe { _mm256_min_epi32(ss_hi_i, _mm256_set1_epi32(255)) };
            let v_lo = unsafe { _mm256_min_epi32(vv_lo_i, _mm256_set1_epi32(255)) };
            let v_hi = unsafe { _mm256_min_epi32(vv_hi_i, _mm256_set1_epi32(255)) };
            // Narrow i32 -> u8 and store 16 results per plane.
            let h_vec = unsafe { pack_avx2(h_lo, h_hi) };
            let s_vec = unsafe { pack_avx2(s_lo, s_hi) };
            let v_vec = unsafe { pack_avx2(v_lo, v_hi) };
            unsafe {
                _mm_storeu_si128(h_out.as_mut_ptr().add(dst_off + x) as *mut __m128i, h_vec);
                _mm_storeu_si128(s_out.as_mut_ptr().add(dst_off + x) as *mut __m128i, s_vec);
                _mm_storeu_si128(v_out.as_mut_ptr().add(dst_off + x) as *mut __m128i, v_vec);
            }
            x += LANES;
        }
        // Scalar tail for the last `w - whole` pixels of the row.
        let row = &src[row_base..row_base + w * 3];
        while x < w {
            let b = row[x * 3] as f32;
            let g = row[x * 3 + 1] as f32;
            let r = row[x * 3 + 2] as f32;
            let (hue, sat, val) = super::scalar::Scalar::bgr_to_hsv_pixel(b, g, r);
            h_out[dst_off + x] = hue;
            s_out[dst_off + x] = sat;
            v_out[dst_off + x] = val;
            x += 1;
        }
    }
}
/// Narrows two vectors of eight `i32` lanes into one vector of sixteen
/// `u8` lanes, in order `[lo0..lo7, hi0..hi7]`.
///
/// Values are saturated: first signed-saturated to `i16`, then
/// unsigned-saturated to `u8` (negatives become 0, values above 255
/// become 255) — the callers pre-clamp, so saturation never triggers.
#[target_feature(enable = "avx2")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn pack_avx2(lo: __m256i, hi: __m256i) -> __m128i {
    unsafe {
        // Work on 128-bit halves so no cross-lane reshuffling is needed.
        let lo_a = _mm256_castsi256_si128(lo);
        let lo_b = _mm256_extracti128_si256::<1>(lo);
        let hi_a = _mm256_castsi256_si128(hi);
        let hi_b = _mm256_extracti128_si256::<1>(hi);
        // i32 -> i16 with signed saturation, keeping element order.
        let words_lo = _mm_packs_epi32(lo_a, lo_b);
        let words_hi = _mm_packs_epi32(hi_a, hi_b);
        // i16 -> u8 with unsigned saturation: [lo bytes | hi bytes].
        _mm_packus_epi16(words_lo, words_hi)
    }
}
/// Converts eight BGR pixels (byte-valued `f32` channels, 0.0..=255.0)
/// to HSV, returning `(hue, sat, val)` per lane.
///
/// Hue is in degrees `[0, 360)`, saturation is scaled to `0..=255`, and
/// value is the per-lane channel maximum. Degenerate lanes are handled
/// explicitly: grey pixels (zero chroma) get hue 0, and black pixels
/// (zero value) get saturation 0; both use a substituted divisor of 1.0
/// so no lane ever divides by zero.
#[target_feature(enable = "avx2")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn bgr_to_hsv_f32x8(b: __m256, g: __m256, r: __m256) -> (__m256, __m256, __m256) {
    unsafe {
        let zeros = _mm256_setzero_ps();
        let ones = _mm256_set1_ps(1.0);
        let deg60 = _mm256_set1_ps(60.0);
        let deg120 = _mm256_set1_ps(120.0);
        let deg240 = _mm256_set1_ps(240.0);
        let deg360 = _mm256_set1_ps(360.0);
        let scale = _mm256_set1_ps(255.0);

        // val = max(b, g, r); chroma = val - min(b, g, r).
        let val = _mm256_max_ps(_mm256_max_ps(b, g), r);
        let floor = _mm256_min_ps(_mm256_min_ps(b, g), r);
        let chroma = _mm256_sub_ps(val, floor);

        // Degenerate-lane masks; swap in 1.0 as a safe divisor.
        let grey = _mm256_cmp_ps::<_CMP_EQ_OQ>(chroma, zeros);
        let black = _mm256_cmp_ps::<_CMP_EQ_OQ>(val, zeros);
        let chroma_div = _mm256_blendv_ps(chroma, ones, grey);

        // Candidate hue for each possible dominant channel.
        let hue_r = _mm256_div_ps(_mm256_mul_ps(deg60, _mm256_sub_ps(g, b)), chroma_div);
        let hue_g = _mm256_add_ps(
            _mm256_div_ps(_mm256_mul_ps(deg60, _mm256_sub_ps(b, r)), chroma_div),
            deg120,
        );
        let hue_b = _mm256_add_ps(
            _mm256_div_ps(_mm256_mul_ps(deg60, _mm256_sub_ps(r, g)), chroma_div),
            deg240,
        );

        // Pick by dominant channel; ties resolve r first, then g, then b.
        let r_max = _mm256_cmp_ps::<_CMP_EQ_OQ>(val, r);
        let g_max = _mm256_cmp_ps::<_CMP_EQ_OQ>(val, g);
        let g_only = _mm256_andnot_ps(r_max, g_max);
        let picked = _mm256_blendv_ps(_mm256_blendv_ps(hue_b, hue_r, r_max), hue_g, g_only);

        // Wrap negative hues into [0, 360) and force grey lanes to 0.
        let negative = _mm256_cmp_ps::<_CMP_LT_OQ>(picked, zeros);
        let wrapped = _mm256_blendv_ps(picked, _mm256_add_ps(picked, deg360), negative);
        let hue = _mm256_blendv_ps(wrapped, zeros, grey);

        // sat = 255 * chroma / val, forced to 0 for black lanes.
        let val_div = _mm256_blendv_ps(val, ones, black);
        let sat_raw = _mm256_div_ps(_mm256_mul_ps(scale, chroma), val_div);
        let sat = _mm256_blendv_ps(sat_raw, zeros, black);

        (hue, sat, val)
    }
}