#![cfg(all(
feature = "simd-per-arch",
feature = "opt-simd-body-comparison",
any(
all(target_arch = "aarch64", any(doc, target_feature = "neon")),
all(
target_arch = "arm",
feature = "unstable",
any(
doc,
all(
target_feature = "v7",
any(feature = "detect-features", target_feature = "neon")
)
)
)
)
))]
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
#[cfg(all(target_arch = "arm", feature = "unstable"))]
use core::arch::arm::*;
static_assertions::const_assert_eq!(super::BODY_OUTLIER_VALUE, 6);
#[allow(unsafe_code)]
#[cfg_attr(
not(all(
target_arch = "arm",
feature = "detect-features",
feature = "unstable",
target_feature = "v7"
)),
inline(always)
)]
#[cfg_attr(
all(
target_arch = "arm",
feature = "detect-features",
feature = "unstable",
target_feature = "v7"
),
target_feature(enable = "neon"),
inline
)]
unsafe fn packed_distance_as_u16x8(x: uint8x16_t, y: uint8x16_t) -> uint16x8_t {
let mask_dibit_01 = vreinterpretq_u32_u8(vdupq_n_u8(0b01_01_01_01));
let mask_dibit_10 = vreinterpretq_u32_u8(vdupq_n_u8(0b10_10_10_10));
let mask_nibble_0011 = vreinterpretq_u32_u8(vdupq_n_u8(0b0011_0011));
let mask_byte_00001111 = vreinterpretq_u32_u8(vdupq_n_u8(0b00001111));
let x = vreinterpretq_u32_u8(x);
let y = vreinterpretq_u32_u8(y);
let z = veorq_u32(x, y);
let ta = vandq_u32(y, mask_dibit_01);
let tb = vandq_u32(x, mask_dibit_01);
let ta = vorrq_u32(ta, vshlq_n_u32::<1>(ta)); let tb = vsubq_u32(mask_dibit_10, tb);
let ta = veorq_u32(ta, x);
let tb = veorq_u32(tb, x);
let sa = vandq_u32(ta, z); let tb = vandq_u32(tb, z);
let ta = vshrq_n_u32::<2>(sa);
let sa = vandq_u32(sa, mask_nibble_0011);
let tb = vshrq_n_u32::<1>(tb);
let ta = vandq_u32(ta, mask_nibble_0011);
let tb = vorrq_u32(tb, vshlq_n_u32::<1>(tb)); let sa = vaddq_u32(sa, ta); let sb = vandq_u32(tb, z); let tb = vshrq_n_u32::<2>(sb);
let sb = vandq_u32(sb, mask_nibble_0011);
let tb = vandq_u32(tb, mask_nibble_0011);
let sb = vaddq_u32(sb, tb);
let s = vaddq_u32(sb, sa); let t = vshrq_n_u32::<4>(s);
let s = vandq_u32(s, mask_byte_00001111);
let t = vandq_u32(t, mask_byte_00001111);
let s = vaddq_u32(s, t); vpaddlq_u8(vreinterpretq_u8_u32(s))
}
#[allow(unsafe_code)]
#[cfg_attr(
not(all(
target_arch = "arm",
feature = "detect-features",
feature = "unstable",
target_feature = "v7"
)),
inline(always)
)]
#[cfg_attr(
all(
target_arch = "arm",
feature = "detect-features",
feature = "unstable",
target_feature = "v7"
),
target_feature(enable = "neon"),
inline
)]
pub unsafe fn distance_32(body1: &[u8; 32], body2: &[u8; 32]) -> u32 {
let px = body1 as *const u8;
let py = body2 as *const u8;
let (x, y) = (vld1q_u8(px), vld1q_u8(py));
let s1 = packed_distance_as_u16x8(x, y); let (x, y) = (vld1q_u8(px.add(16)), vld1q_u8(py.add(16)));
let s2 = packed_distance_as_u16x8(x, y);
let s = vaddq_u16(s1, s2); let t = vpaddlq_u16(s); let s = vget_high_u32(t);
let t = vget_low_u32(t);
let s = vadd_u32(s, t); vget_lane_u32::<0>(s).wrapping_add(vget_lane_u32::<1>(s))
}
#[allow(unsafe_code)]
#[cfg_attr(
not(all(
target_arch = "arm",
feature = "detect-features",
feature = "unstable",
target_feature = "v7"
)),
inline(always)
)]
#[cfg_attr(
all(
target_arch = "arm",
feature = "detect-features",
feature = "unstable",
target_feature = "v7"
),
target_feature(enable = "neon"),
inline
)]
pub unsafe fn distance_64(body1: &[u8; 64], body2: &[u8; 64]) -> u32 {
let mut px = body1 as *const u8;
let mut py = body2 as *const u8;
let mut s = vdupq_n_u16(0);
for _ in 0..4 {
let (x, y) = (vld1q_u8(px), vld1q_u8(py));
s = vaddq_u16(s, packed_distance_as_u16x8(x, y));
(px, py) = (px.add(16), py.add(16));
}
let s = vpaddlq_u16(s); let t = vget_high_u32(s);
let s = vget_low_u32(s);
let s = vadd_u32(s, t); vget_lane_u32::<0>(s).wrapping_add(vget_lane_u32::<1>(s))
}