#![cfg(all(
feature = "simd-per-arch",
feature = "opt-simd-body-comparison",
any(target_arch = "x86", target_arch = "x86_64"),
any(feature = "detect-features", target_feature = "avx2")
))]
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
static_assertions::const_assert_eq!(super::BODY_OUTLIER_VALUE, 6);
#[allow(unsafe_code)]
#[cfg_attr(not(feature = "detect-features"), inline(always))]
#[cfg_attr(feature = "detect-features", target_feature(enable = "avx2"), inline)]
unsafe fn packed_distance_as_u32x8(x: __m256i, y: __m256i) -> __m256i {
let mask_dibit_01 = _mm256_set1_epi8(0b01_01_01_01i8);
let mask_dibit_10 = _mm256_set1_epi8(0b10_10_10_10u8 as i8);
let mask_nibble_0011 = _mm256_set1_epi8(0b0011_0011);
let mask_byte_00001111 = _mm256_set1_epi8(0b00001111);
let value_dword_0x01010101 = _mm256_set1_epi32(0x01010101);
let z = _mm256_xor_si256(x, y);
let ta = _mm256_and_si256(y, mask_dibit_01);
let tb = _mm256_and_si256(x, mask_dibit_01);
let ta = _mm256_or_si256(ta, _mm256_slli_epi32::<1>(ta)); let tb = _mm256_sub_epi32(mask_dibit_10, tb);
let ta = _mm256_xor_si256(ta, x);
let tb = _mm256_xor_si256(tb, x);
let sa = _mm256_and_si256(ta, z); let tb = _mm256_and_si256(tb, z);
let ta = _mm256_srli_epi32::<2>(sa);
let sa = _mm256_and_si256(sa, mask_nibble_0011);
let tb = _mm256_srli_epi32::<1>(tb);
let ta = _mm256_and_si256(ta, mask_nibble_0011);
let tb = _mm256_or_si256(tb, _mm256_slli_epi32::<1>(tb)); let sa = _mm256_add_epi32(sa, ta); let sb = _mm256_and_si256(tb, z); let tb = _mm256_srli_epi32::<2>(sb);
let sb = _mm256_and_si256(sb, mask_nibble_0011);
let tb = _mm256_and_si256(tb, mask_nibble_0011);
let sb = _mm256_add_epi32(sb, tb);
let s = _mm256_add_epi32(sb, sa); let t = _mm256_srli_epi32::<4>(s);
let s = _mm256_and_si256(s, mask_byte_00001111);
let t = _mm256_and_si256(t, mask_byte_00001111);
let s = _mm256_add_epi32(s, t); let s = _mm256_mullo_epi32(s, value_dword_0x01010101);
_mm256_srli_epi32::<24>(s) }
#[allow(unsafe_code)]
#[cfg_attr(not(feature = "detect-features"), inline(always))]
#[cfg_attr(feature = "detect-features", target_feature(enable = "avx2"), inline)]
pub unsafe fn distance_32(body1: &[u8; 32], body2: &[u8; 32]) -> u32 {
let x = _mm256_loadu_si256(body1 as *const u8 as *const __m256i);
let y = _mm256_loadu_si256(body2 as *const u8 as *const __m256i);
let s = packed_distance_as_u32x8(x, y);
let t = _mm256_shuffle_epi32::<0b11_10_11_10>(s);
let s = _mm256_add_epi32(s, t); let t = _mm256_shuffle_epi32::<0b01_01_01_01>(s);
let s = _mm256_add_epi32(s, t); let s0 = _mm256_extract_epi32::<0>(s) as u32;
let s1 = _mm256_extract_epi32::<4>(s) as u32;
s0 + s1
}
#[allow(unsafe_code)]
#[cfg_attr(not(feature = "detect-features"), inline(always))]
#[cfg_attr(feature = "detect-features", target_feature(enable = "avx2"), inline)]
pub unsafe fn distance_64(body1: &[u8; 64], body2: &[u8; 64]) -> u32 {
let px = body1 as *const u8 as *const __m256i;
let py = body2 as *const u8 as *const __m256i;
let x = _mm256_loadu_si256(px);
let y = _mm256_loadu_si256(py);
let s = packed_distance_as_u32x8(x, y);
let t = _mm256_shuffle_epi32::<0b11_10_11_10>(s);
let s = _mm256_add_epi32(s, t); let t = _mm256_shuffle_epi32::<0b01_01_01_01>(s);
let s = _mm256_add_epi32(s, t); let s0 = _mm256_extract_epi32::<0>(s) as u32;
let s1 = _mm256_extract_epi32::<4>(s) as u32;
let v0 = s0 + s1;
let x = _mm256_loadu_si256(px.add(1));
let y = _mm256_loadu_si256(py.add(1));
let s = packed_distance_as_u32x8(x, y);
let t = _mm256_shuffle_epi32::<0b11_10_11_10>(s);
let s = _mm256_add_epi32(s, t); let t = _mm256_shuffle_epi32::<0b01_01_01_01>(s);
let s = _mm256_add_epi32(s, t); let s0 = _mm256_extract_epi32::<0>(s) as u32;
let s1 = _mm256_extract_epi32::<4>(s) as u32;
let v1 = s0 + s1;
v0 + v1
}