#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
// Byte-shuffle control tables consumed by `bgr_to_hsv_planes` via `_mm_shuffle_epi8`.
//
// Sixteen interleaved BGR pixels occupy 48 bytes, loaded as three 16-byte blocks
// (BLK0 = bytes 0..16, BLK1 = bytes 16..32, BLK2 = bytes 32..48). For each channel,
// one table per block picks that channel's bytes out of the block and drops them
// into the output lanes they belong to; a `-1` entry makes the shuffle emit zero
// for that lane, so OR-ing the three shuffled blocks yields 16 contiguous samples
// of a single channel.

/// Block 0 -> blue samples of pixels 0..=5 (output lanes 0..=5).
#[allow(dead_code)]
const BLK0_B: [i8; 16] = [0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];
/// Block 0 -> green samples of pixels 0..=4 (output lanes 0..=4).
#[allow(dead_code)]
const BLK0_G: [i8; 16] = [1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];
/// Block 0 -> red samples of pixels 0..=4 (output lanes 0..=4).
#[allow(dead_code)]
const BLK0_R: [i8; 16] = [2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];
/// Block 1 -> blue samples of pixels 6..=10 (output lanes 6..=10).
#[allow(dead_code)]
const BLK1_B: [i8; 16] = [-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1];
/// Block 1 -> green samples of pixels 5..=10 (output lanes 5..=10).
#[allow(dead_code)]
const BLK1_G: [i8; 16] = [-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1];
/// Block 1 -> red samples of pixels 5..=9 (output lanes 5..=9).
#[allow(dead_code)]
const BLK1_R: [i8; 16] = [-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1];
/// Block 2 -> blue samples of pixels 11..=15 (output lanes 11..=15).
#[allow(dead_code)]
const BLK2_B: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13];
/// Block 2 -> green samples of pixels 11..=15 (output lanes 11..=15).
#[allow(dead_code)]
const BLK2_G: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14];
/// Block 2 -> red samples of pixels 10..=15 (output lanes 10..=15).
#[allow(dead_code)]
const BLK2_R: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15];
/// Converts an interleaved 8-bit BGR image into three separate H, S and V planes.
///
/// * `h_out`, `s_out`, `v_out` – destination planes, written densely with
///   `width` bytes per row (row `y` starts at `y * width`).
/// * `src` – interleaved B, G, R bytes; row `y` starts at byte `y * stride`
///   and supplies `width * 3` bytes.
/// * `width`, `height` – image size in pixels.
/// * `stride` – distance in bytes between the starts of consecutive source rows.
///
/// Sixteen pixels are processed per vector iteration: the channels are
/// de-interleaved with the `BLK*` shuffle tables, widened to `f32`, converted by
/// `bgr_to_hsv_f32x4`, and rounded back to bytes. Hue (degrees) is halved and
/// capped at 179; saturation and value are capped at 255. The remaining
/// `width % 16` pixels of each row go through
/// `super::scalar::Scalar::bgr_to_hsv_pixel`.
///
/// # Panics
///
/// Panics if `src` or any output plane is too short for the row being
/// processed. The check happens once per row, before any pixel of that row is
/// read or written.
///
/// # Safety
///
/// The executing CPU must support SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
pub(super) unsafe fn bgr_to_hsv_planes(
    h_out: &mut [u8],
    s_out: &mut [u8],
    v_out: &mut [u8],
    src: &[u8],
    width: u32,
    height: u32,
    stride: u32,
) {
    const LANES: usize = 16;
    let w = width as usize;
    let h = height as usize;
    let s = stride as usize;
    // Number of pixels per row that the 16-wide vector loop can cover.
    let whole = w / LANES * LANES;

    let m_b0 = unsafe { _mm_loadu_si128(BLK0_B.as_ptr() as *const __m128i) };
    let m_g0 = unsafe { _mm_loadu_si128(BLK0_G.as_ptr() as *const __m128i) };
    let m_r0 = unsafe { _mm_loadu_si128(BLK0_R.as_ptr() as *const __m128i) };
    let m_b1 = unsafe { _mm_loadu_si128(BLK1_B.as_ptr() as *const __m128i) };
    let m_g1 = unsafe { _mm_loadu_si128(BLK1_G.as_ptr() as *const __m128i) };
    let m_r1 = unsafe { _mm_loadu_si128(BLK1_R.as_ptr() as *const __m128i) };
    let m_b2 = unsafe { _mm_loadu_si128(BLK2_B.as_ptr() as *const __m128i) };
    let m_g2 = unsafe { _mm_loadu_si128(BLK2_G.as_ptr() as *const __m128i) };
    let m_r2 = unsafe { _mm_loadu_si128(BLK2_R.as_ptr() as *const __m128i) };
    let zero_i = unsafe { _mm_setzero_si128() };
    // Serves both as the hue scale factor (degrees / 2) and as the rounding bias
    // added before truncation.
    let half = unsafe { _mm_set1_ps(0.5) };

    // Converts four pixels: widens one half of the 16-bit channel vectors to
    // 32-bit lanes with `$widen`, runs the float HSV conversion and returns the
    // rounded, range-capped H/S/V values as 32-bit integer lanes.
    macro_rules! group {
        ($b16:expr, $g16:expr, $r16:expr, $widen:ident) => {{
            let bf = unsafe { _mm_cvtepi32_ps($widen($b16, zero_i)) };
            let gf = unsafe { _mm_cvtepi32_ps($widen($g16, zero_i)) };
            let rf = unsafe { _mm_cvtepi32_ps($widen($r16, zero_i)) };
            let (hue, sat, val) = unsafe { bgr_to_hsv_f32x4(bf, gf, rf) };
            let hh = unsafe { _mm_mul_ps(hue, half) };
            let h_u32 = unsafe { clamp_i32_max(_mm_cvttps_epi32(_mm_add_ps(hh, half)), 179) };
            let s_u32 = unsafe { clamp_i32_max(_mm_cvttps_epi32(_mm_add_ps(sat, half)), 255) };
            let v_u32 = unsafe { clamp_i32_max(_mm_cvttps_epi32(_mm_add_ps(val, half)), 255) };
            (h_u32, s_u32, v_u32)
        }};
    }

    for y in 0..h {
        let row_base = y * s;
        let dst_off = y * w;
        // Bounds-checked views of this row, taken BEFORE any raw-pointer access so
        // that undersized buffers panic instead of being read/written out of range.
        let row = &src[row_base..row_base + w * 3];
        let h_row = &mut h_out[dst_off..dst_off + w];
        let s_row = &mut s_out[dst_off..dst_off + w];
        let v_row = &mut v_out[dst_off..dst_off + w];

        let mut x = 0;
        while x < whole {
            // SAFETY: `x + 16 <= whole <= w`, so the 48 bytes starting at `3 * x`
            // lie inside `row`, whose length is `3 * w`.
            let p = unsafe { row.as_ptr().add(x * 3) };
            let blk0 = unsafe { _mm_loadu_si128(p as *const __m128i) };
            let blk1 = unsafe { _mm_loadu_si128(p.add(16) as *const __m128i) };
            let blk2 = unsafe { _mm_loadu_si128(p.add(32) as *const __m128i) };

            // De-interleave: each channel is the OR of its three shuffled blocks.
            let b = unsafe {
                _mm_or_si128(
                    _mm_or_si128(_mm_shuffle_epi8(blk0, m_b0), _mm_shuffle_epi8(blk1, m_b1)),
                    _mm_shuffle_epi8(blk2, m_b2),
                )
            };
            let g = unsafe {
                _mm_or_si128(
                    _mm_or_si128(_mm_shuffle_epi8(blk0, m_g0), _mm_shuffle_epi8(blk1, m_g1)),
                    _mm_shuffle_epi8(blk2, m_g2),
                )
            };
            let r = unsafe {
                _mm_or_si128(
                    _mm_or_si128(_mm_shuffle_epi8(blk0, m_r0), _mm_shuffle_epi8(blk1, m_r1)),
                    _mm_shuffle_epi8(blk2, m_r2),
                )
            };

            // Zero-extend bytes to 16-bit lanes (pixels 0..8 and 8..16).
            let b_lo16 = unsafe { _mm_unpacklo_epi8(b, zero_i) };
            let b_hi16 = unsafe { _mm_unpackhi_epi8(b, zero_i) };
            let g_lo16 = unsafe { _mm_unpacklo_epi8(g, zero_i) };
            let g_hi16 = unsafe { _mm_unpackhi_epi8(g, zero_i) };
            let r_lo16 = unsafe { _mm_unpacklo_epi8(r, zero_i) };
            let r_hi16 = unsafe { _mm_unpackhi_epi8(r, zero_i) };

            // Four groups of four pixels, in pixel order.
            let (h0, s0, v0) = group!(b_lo16, g_lo16, r_lo16, _mm_unpacklo_epi16);
            let (h1, s1, v1) = group!(b_lo16, g_lo16, r_lo16, _mm_unpackhi_epi16);
            let (h2, s2, v2) = group!(b_hi16, g_hi16, r_hi16, _mm_unpacklo_epi16);
            let (h3, s3, v3) = group!(b_hi16, g_hi16, r_hi16, _mm_unpackhi_epi16);

            let h_vec = unsafe { pack_quad(h0, h1, h2, h3) };
            let s_vec = unsafe { pack_quad(s0, s1, s2, s3) };
            let v_vec = unsafe { pack_quad(v0, v1, v2, v3) };

            // SAFETY: `x + 16 <= w` and every output row view has length `w`.
            unsafe {
                _mm_storeu_si128(h_row.as_mut_ptr().add(x) as *mut __m128i, h_vec);
                _mm_storeu_si128(s_row.as_mut_ptr().add(x) as *mut __m128i, s_vec);
                _mm_storeu_si128(v_row.as_mut_ptr().add(x) as *mut __m128i, v_vec);
            }
            x += LANES;
        }

        // Scalar tail for the last `w % 16` pixels of the row.
        while x < w {
            let b = row[x * 3] as f32;
            let g = row[x * 3 + 1] as f32;
            let r = row[x * 3 + 2] as f32;
            let (hue, sat, val) = super::scalar::Scalar::bgr_to_hsv_pixel(b, g, r);
            h_row[x] = hue;
            s_row[x] = sat;
            v_row[x] = val;
            x += 1;
        }
    }
}
/// Caps every signed 32-bit lane of `v` at `max`; lanes that are already
/// `<= max` (including negative ones) pass through unchanged.
///
/// # Safety
///
/// The executing CPU must support SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn clamp_i32_max(v: __m128i, max: i32) -> __m128i {
    let limit = unsafe { _mm_set1_epi32(max) };
    // All-ones in the lanes that exceed the limit, zero elsewhere.
    let over = unsafe { _mm_cmpgt_epi32(v, limit) };
    // Branch-free select: `v ^ ((v ^ limit) & over)` is `limit` where `over`
    // is set and `v` everywhere else.
    let flip = unsafe { _mm_and_si128(_mm_xor_si128(v, limit), over) };
    unsafe { _mm_xor_si128(v, flip) }
}
/// Narrows four vectors of 32-bit integers into one vector of sixteen bytes,
/// keeping lane order (`a` fills bytes 0..4, `b` 4..8, `c` 8..12, `d` 12..16).
///
/// Narrowing saturates twice: first to the signed 16-bit range, then to the
/// unsigned 8-bit range, so negative inputs become 0 and inputs above 255
/// become 255.
///
/// # Safety
///
/// The executing CPU must support SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn pack_quad(a: __m128i, b: __m128i, c: __m128i, d: __m128i) -> __m128i {
    unsafe {
        let first_eight = _mm_packs_epi32(a, b);
        let last_eight = _mm_packs_epi32(c, d);
        _mm_packus_epi16(first_eight, last_eight)
    }
}
/// Converts four pixels, given as per-channel `f32` lanes, from BGR to HSV.
///
/// Returns `(hue, sat, val)`:
/// * `hue` – angle in degrees; negative intermediate results are wrapped by
///   adding 360, and lanes where all three channels are equal get 0.
/// * `sat` – `255 * (max - min) / max`, or 0 where the maximum channel is 0.
/// * `val` – the maximum of the three channels, in the same scale as the input.
///
/// When several channels tie for the maximum, red takes precedence, then green.
///
/// # Safety
///
/// The executing CPU must support SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn bgr_to_hsv_f32x4(b: __m128, g: __m128, r: __m128) -> (__m128, __m128, __m128) {
    let zero = unsafe { _mm_setzero_ps() };
    let one = unsafe { _mm_set1_ps(1.0) };
    let deg60 = unsafe { _mm_set1_ps(60.0) };
    let deg120 = unsafe { _mm_set1_ps(120.0) };
    let deg240 = unsafe { _mm_set1_ps(240.0) };
    let deg360 = unsafe { _mm_set1_ps(360.0) };
    let sat_scale = unsafe { _mm_set1_ps(255.0) };

    let max_c = unsafe { _mm_max_ps(_mm_max_ps(b, g), r) };
    let min_c = unsafe { _mm_min_ps(_mm_min_ps(b, g), r) };
    let chroma = unsafe { _mm_sub_ps(max_c, min_c) };

    // Lane masks for the two degenerate cases.
    let is_gray = unsafe { _mm_cmpeq_ps(chroma, zero) };
    let is_black = unsafe { _mm_cmpeq_ps(max_c, zero) };

    // Replace zero divisors by 1 so the divisions below stay finite; the
    // affected lanes are overwritten with 0 afterwards.
    let chroma_div = unsafe { blend(is_gray, one, chroma) };
    let max_div = unsafe { blend(is_black, one, max_c) };

    // 60 * (hi - lo) / chroma, the per-sector hue term before its offset.
    macro_rules! sector {
        ($hi:expr, $lo:expr) => {
            unsafe { _mm_div_ps(_mm_mul_ps(deg60, _mm_sub_ps($hi, $lo)), chroma_div) }
        };
    }
    let hue_if_red = sector!(g, b);
    let hue_if_green = unsafe { _mm_add_ps(sector!(b, r), deg120) };
    let hue_if_blue = unsafe { _mm_add_ps(sector!(r, g), deg240) };

    let red_is_max = unsafe { _mm_cmpeq_ps(max_c, r) };
    let green_is_max = unsafe { _mm_cmpeq_ps(max_c, g) };
    // Green only wins where red is not also the maximum.
    let green_wins = unsafe { _mm_andnot_ps(red_is_max, green_is_max) };

    let red_or_blue = unsafe { blend(red_is_max, hue_if_red, hue_if_blue) };
    let raw_hue = unsafe { blend(green_wins, hue_if_green, red_or_blue) };

    // Wrap negative angles into the positive range, then zero out gray lanes.
    let is_negative = unsafe { _mm_cmplt_ps(raw_hue, zero) };
    let wrapped = unsafe { blend(is_negative, _mm_add_ps(raw_hue, deg360), raw_hue) };
    let hue = unsafe { blend(is_gray, zero, wrapped) };

    let raw_sat = unsafe { _mm_div_ps(_mm_mul_ps(sat_scale, chroma), max_div) };
    let sat = unsafe { blend(is_black, zero, raw_sat) };

    (hue, sat, max_c)
}
/// Bitwise lane select: yields the bits of `t` wherever `mask` has a 1 bit and
/// the bits of `f` wherever it has a 0 bit. With comparison masks (all-ones or
/// all-zeros per lane) this picks whole lanes from `t` or `f`.
///
/// # Safety
///
/// The executing CPU must support SSSE3.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn blend(mask: __m128, t: __m128, f: __m128) -> __m128 {
    unsafe {
        // `f ^ ((t ^ f) & mask)`: where the mask bit is set the two `f` terms
        // cancel and leave `t`; elsewhere `f` is untouched.
        let differing = _mm_xor_ps(t, f);
        _mm_xor_ps(f, _mm_and_ps(differing, mask))
    }
}
/// Returns the mean absolute difference between the first `n` bytes of `a`
/// and the first `n` bytes of `b`.
///
/// Full 16-byte blocks are summed with `_mm_sad_epu8`; the remaining
/// `n % 16` bytes are handled in scalar code. The result is
/// `sum(|a[i] - b[i]|) / n` as `f64`; for `n == 0` that is `0.0 / 0.0`, i.e. NaN.
///
/// # Panics
///
/// Panics if `n` exceeds the length of `a` or `b`. The check is made before
/// any data is read, so the vector loads can never run past either slice.
///
/// # Safety
///
/// The executing CPU must support SSSE3.
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
pub(super) unsafe fn mean_abs_diff(a: &[u8], b: &[u8], n: usize) -> f64 {
    const LANES: usize = 16;

    // Bounds-check once up front; everything below works on exactly `n` bytes.
    let a = &a[..n];
    let b = &b[..n];
    let whole = n / LANES * LANES;
    let (a_blocks, a_tail) = a.split_at(whole);
    let (b_blocks, b_tail) = b.split_at(whole);

    // Two 64-bit partial sums, one per 8-byte half of each block.
    let mut acc = unsafe { _mm_setzero_si128() };
    for (ca, cb) in a_blocks.chunks_exact(LANES).zip(b_blocks.chunks_exact(LANES)) {
        // SAFETY: `chunks_exact` guarantees both chunks are exactly 16 bytes long.
        let va = unsafe { _mm_loadu_si128(ca.as_ptr() as *const __m128i) };
        let vb = unsafe { _mm_loadu_si128(cb.as_ptr() as *const __m128i) };
        let sad = unsafe { _mm_sad_epu8(va, vb) };
        acc = unsafe { _mm_add_epi64(acc, sad) };
    }

    // Fold the upper partial sum onto the lower one and extract it.
    let hi = unsafe { _mm_srli_si128::<8>(acc) };
    let total = unsafe { _mm_add_epi64(acc, hi) };
    #[cfg(target_arch = "x86_64")]
    let simd_sum: u64 = unsafe { _mm_cvtsi128_si64(total) as u64 };
    // 32-bit x86 has no 64-bit register move, so go through memory instead.
    #[cfg(target_arch = "x86")]
    let simd_sum: u64 = {
        let mut tmp = 0u64;
        unsafe { _mm_storel_epi64(&mut tmp as *mut u64 as *mut __m128i, total) };
        tmp
    };

    let tail_sum: u64 = a_tail
        .iter()
        .zip(b_tail)
        .map(|(&x, &y)| u64::from(x.abs_diff(y)))
        .sum();

    (simd_sum + tail_sum) as f64 / n as f64
}
/// Computes a 3x3 Sobel gradient over a `w` x `h` single-channel 8-bit image.
///
/// * `input` – row-major pixels, `w` bytes per row.
/// * `mag` – receives `|gx| + |gy|` for every interior pixel.
/// * `dir` – receives a direction bucket for every interior pixel:
///   `0` when `|gy| * 1000 < |gx| * 414`, `2` when `|gy| * 1000 > |gx| * 2414`,
///   otherwise `1` if `gx` and `gy` have the same sign and `3` if they differ.
/// * `w`, `h` – image width and height in pixels.
///
/// `mag` and `dir` are zero-filled in full first, so the one-pixel border (and
/// everything, when the image has no interior) stays 0. Interior pixels are
/// processed eight at a time with SSE, with a scalar loop finishing each row.
///
/// # Panics
///
/// If the image has interior pixels (`w >= 3 && h >= 3`), panics when `w * h`
/// overflows or when `input`, `mag` or `dir` holds fewer than `w * h` elements.
///
/// # Safety
///
/// The executing CPU must support SSSE3.
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
pub(super) unsafe fn sobel(input: &[u8], mag: &mut [i32], dir: &mut [u8], w: usize, h: usize) {
    const LANES: usize = 8;

    /// Buckets a gradient vector into one of four directions.
    fn direction(gx: i32, gy: i32) -> u8 {
        let ax = gx.unsigned_abs();
        let ay = gy.unsigned_abs();
        if ay * 1000 < ax * 414 {
            0
        } else if ay * 1000 > ax * 2414 {
            2
        } else if gx.signum() == gy.signum() {
            1
        } else {
            3
        }
    }

    mag.fill(0);
    dir.fill(0);

    // No interior pixels: nothing beyond the zero fill. This also keeps the
    // `w - 1` / `h - 1` arithmetic below from underflowing.
    if w < 3 || h < 3 {
        return;
    }

    // Validate every buffer once so the raw-pointer loads and stores in the
    // vector loop are guaranteed to stay in bounds.
    let pixels = w.checked_mul(h).expect("sobel: w * h overflows usize");
    assert!(input.len() >= pixels, "sobel: input holds fewer than w * h pixels");
    assert!(mag.len() >= pixels, "sobel: mag holds fewer than w * h elements");
    assert!(dir.len() >= pixels, "sobel: dir holds fewer than w * h elements");

    let zero_i = unsafe { _mm_setzero_si128() };

    // Loads 8 consecutive bytes of `$row` starting at `$o` and zero-extends
    // them to eight 16-bit lanes.
    macro_rules! ld {
        ($row:expr, $o:expr) => {{
            let v = unsafe { _mm_loadl_epi64($row.as_ptr().add($o) as *const __m128i) };
            unsafe { _mm_unpacklo_epi8(v, zero_i) }
        }};
    }

    for y in 1..h - 1 {
        let off = y * w;
        let prev = &input[off - w..off];
        let curr = &input[off..off + w];
        let next = &input[off + w..off + 2 * w];
        let mag_row = &mut mag[off..off + w];
        let dir_row = &mut dir[off..off + w];

        let mut x = 1usize;
        // `x + LANES < w` keeps the right-neighbour load (`x + 1 .. x + 9`)
        // inside the row and leaves the last column untouched.
        while x + LANES < w {
            // SAFETY (all `ld!` uses): the furthest load covers bytes
            // `x + 1 .. x + 9` and `x + 9 <= w`, the length of each row slice.
            let pl = ld!(prev, x - 1);
            let pm = ld!(prev, x);
            let pr = ld!(prev, x + 1);
            let cl = ld!(curr, x - 1);
            let cr = ld!(curr, x + 1);
            let nl = ld!(next, x - 1);
            let nm = ld!(next, x);
            let nr = ld!(next, x + 1);

            // gx = right column - left column, middle row weighted by 2.
            let gx = unsafe {
                let pos = _mm_add_epi16(_mm_add_epi16(pr, _mm_slli_epi16::<1>(cr)), nr);
                let neg = _mm_add_epi16(_mm_add_epi16(pl, _mm_slli_epi16::<1>(cl)), nl);
                _mm_sub_epi16(pos, neg)
            };
            // gy = bottom row - top row, middle column weighted by 2.
            let gy = unsafe {
                let pos = _mm_add_epi16(_mm_add_epi16(nl, _mm_slli_epi16::<1>(nm)), nr);
                let neg = _mm_add_epi16(_mm_add_epi16(pl, _mm_slli_epi16::<1>(pm)), pr);
                _mm_sub_epi16(pos, neg)
            };

            // |gx| + |gy| is at most 2 * 4 * 255 = 2040, so it is non-negative
            // in i16 and widening with zeros is an exact conversion to i32.
            let mag_i16 = unsafe { _mm_add_epi16(_mm_abs_epi16(gx), _mm_abs_epi16(gy)) };
            let lo = unsafe { _mm_unpacklo_epi16(mag_i16, zero_i) };
            let hi = unsafe { _mm_unpackhi_epi16(mag_i16, zero_i) };
            // SAFETY: the stores cover `mag_row[x .. x + 8]` and `x + 8 < w`.
            unsafe {
                _mm_storeu_si128(mag_row.as_mut_ptr().add(x) as *mut __m128i, lo);
                _mm_storeu_si128(mag_row.as_mut_ptr().add(x + 4) as *mut __m128i, hi);
            }

            // The direction test needs wider multiplications; do it per lane.
            let gx_arr: [i16; 8] = unsafe { core::mem::transmute(gx) };
            let gy_arr: [i16; 8] = unsafe { core::mem::transmute(gy) };
            for (d, (&lx, &ly)) in dir_row[x..x + LANES]
                .iter_mut()
                .zip(gx_arr.iter().zip(gy_arr.iter()))
            {
                *d = direction(i32::from(lx), i32::from(ly));
            }
            x += LANES;
        }

        // Scalar tail for the interior pixels the vector loop could not reach.
        while x < w - 1 {
            let px = |row: &[u8], xx: usize| i32::from(row[xx]);
            let gx = px(prev, x + 1) + 2 * px(curr, x + 1) + px(next, x + 1)
                - px(prev, x - 1)
                - 2 * px(curr, x - 1)
                - px(next, x - 1);
            let gy = px(next, x - 1) + 2 * px(next, x) + px(next, x + 1)
                - px(prev, x - 1)
                - 2 * px(prev, x)
                - px(prev, x + 1);
            mag_row[x] = gx.abs() + gy.abs();
            dir_row[x] = direction(gx, gy);
            x += 1;
        }
    }
}