#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::{
uint8x16_t, vandq_u8, vceqq_u8, vdupq_n_u8, vget_lane_u64, vld1q_u8, vreinterpret_u64_u8,
vreinterpretq_u16_u8, vshrn_n_u16,
};
#[cfg(target_arch = "x86_64")]
use {
core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch},
std::arch::x86_64::{
__m128i, __m256i, _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8,
_mm_set1_epi8, _mm_setzero_si128, _mm256_cmpeq_epi8, _mm256_loadu_si256,
_mm256_movemask_epi8, _mm256_set1_epi8,
},
std::sync::OnceLock,
};
use super::bitmask::BitMask;
pub(crate) const CONTROL_GROUP_SIZE: usize = 16;
pub(crate) const CTRL_EMPTY: u8 = 0;
pub(crate) const CTRL_TOMBSTONE: u8 = 0x80;
pub(crate) const FINGERPRINT_MASK: u8 = 0x7F;
const FINGERPRINT_SHIFT: u32 = 57;
pub(crate) trait ControlByte {
fn is_occupied(&self) -> bool;
fn is_free(&self) -> bool;
}
impl ControlByte for u8 {
#[inline]
fn is_occupied(&self) -> bool {
(*self & FINGERPRINT_MASK) != 0
}
#[inline]
fn is_free(&self) -> bool {
(*self & FINGERPRINT_MASK) == 0
}
}
pub(crate) struct ControlOps;
impl ControlOps {
#[inline]
#[must_use]
pub(crate) fn control_fingerprint(hash: u64) -> u8 {
#[allow(clippy::cast_possible_truncation)]
let high = ((hash >> FINGERPRINT_SHIFT) & u64::from(FINGERPRINT_MASK)) as u8;
high.max(1)
}
#[inline]
#[must_use]
pub(crate) fn fingerprint_bit(fingerprint: u8) -> u128 {
1u128 << u32::from(fingerprint.saturating_sub(1))
}
#[inline]
#[must_use]
pub(crate) fn find_next_fingerprint_in_controls(
controls: &[u8],
fingerprint: u8,
start: usize,
) -> Option<usize> {
if start >= controls.len() {
return None;
}
if controls.len() - start < CONTROL_GROUP_SIZE {
return controls[start..]
.iter()
.position(|&control| control == fingerprint)
.map(|offset| start + offset);
}
let wide = Self::preferred_group_width();
let mut index = start;
while wide > CONTROL_GROUP_SIZE && index + wide <= controls.len() {
let mask =
Self::control_match_fingerprint_group(&controls[index..index + wide], fingerprint);
if mask != 0 {
return Some(index + mask.trailing_zeros() as usize);
}
index += wide;
}
while index + CONTROL_GROUP_SIZE <= controls.len() {
let mask = Self::control_match_fingerprint_group(
&controls[index..index + CONTROL_GROUP_SIZE],
fingerprint,
);
if mask != 0 {
return Some(index + mask.trailing_zeros() as usize);
}
index += CONTROL_GROUP_SIZE;
}
controls[index..]
.iter()
.position(|&control| control == fingerprint)
.map(|offset| index + offset)
}
#[inline]
#[must_use]
fn preferred_group_width() -> usize {
#[cfg(target_arch = "x86_64")]
{
static WIDTH: OnceLock<usize> = OnceLock::new();
*WIDTH.get_or_init(|| {
if std::is_x86_feature_detected!("avx2") {
32
} else {
CONTROL_GROUP_SIZE
}
})
}
#[cfg(not(target_arch = "x86_64"))]
{
CONTROL_GROUP_SIZE
}
}
#[inline]
#[must_use]
pub(crate) fn control_match_fingerprint_group(chunk: &[u8], target: u8) -> u32 {
match chunk.len() {
CONTROL_GROUP_SIZE => match_fingerprint_group_u32(chunk.as_ptr(), target),
32 => unsafe { eq_mask_32(chunk.as_ptr(), target) },
_ => panic!("group matching requires 16 or 32 byte chunks"),
}
}
}
#[inline]
fn match_fingerprint_group_u32(ptr: *const u8, target: u8) -> u32 {
#[cfg(target_arch = "x86_64")]
#[allow(clippy::cast_ptr_alignment)]
unsafe {
let data = _mm_loadu_si128(ptr.cast::<__m128i>());
#[allow(clippy::cast_possible_wrap)]
let cmp = _mm_cmpeq_epi8(data, _mm_set1_epi8(target as i8));
#[allow(clippy::cast_sign_loss)]
{
(_mm_movemask_epi8(cmp) as u32) & 0xFFFF
}
}
#[cfg(not(target_arch = "x86_64"))]
{
let mut m = 0u32;
for i in 0..CONTROL_GROUP_SIZE {
if unsafe { *ptr.add(i) } == target {
m |= 1 << i;
}
}
m
}
}
pub(crate) struct ProbeOps;
impl ProbeOps {
#[allow(
clippy::cast_precision_loss,
clippy::cast_possible_truncation,
clippy::cast_sign_loss
)]
#[inline]
#[must_use]
pub(crate) fn log_log_probe_limit(capacity: usize) -> usize {
let n = capacity.max(4) as f64;
n.log2().max(2.0).log2().ceil().max(1.0) as usize
}
#[allow(clippy::cast_possible_truncation)]
#[inline]
#[must_use]
pub(crate) fn hash_to_usize(hash: u64) -> usize {
hash as usize
}
}
#[must_use]
pub(crate) unsafe fn eq_mask_16(ptr: *const u8, target: u8) -> BitMask {
#[cfg(target_arch = "aarch64")]
unsafe {
eq_mask_16_neon(ptr, target)
}
#[cfg(target_arch = "x86_64")]
unsafe {
eq_mask_16_sse2(ptr, target)
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
let mut m: u16 = 0;
for i in 0..CONTROL_GROUP_SIZE {
if unsafe { *ptr.add(i) } == target {
m |= 1u16 << i;
}
}
BitMask(m)
}
}
#[must_use]
pub(crate) unsafe fn free_mask_16(ptr: *const u8) -> BitMask {
#[cfg(target_arch = "aarch64")]
unsafe {
free_mask_16_neon(ptr)
}
#[cfg(target_arch = "x86_64")]
unsafe {
free_mask_16_sse2(ptr)
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
let mut m: u16 = 0;
for i in 0..CONTROL_GROUP_SIZE {
let b = unsafe { *ptr.add(i) };
if b == CTRL_EMPTY || b == CTRL_TOMBSTONE {
m |= 1u16 << i;
}
}
BitMask(m)
}
}
#[must_use]
pub(crate) unsafe fn eq_mask_32(ptr: *const u8, target: u8) -> u32 {
#[cfg(target_arch = "x86_64")]
{
if std::is_x86_feature_detected!("avx2") {
unsafe { return eq_mask_32_avx2(ptr, target) };
}
}
let lo = match_fingerprint_group_u32(ptr, target);
let hi = match_fingerprint_group_u32(unsafe { ptr.add(CONTROL_GROUP_SIZE) }, target);
lo | (hi << CONTROL_GROUP_SIZE)
}
#[inline]
pub(crate) unsafe fn prefetch_read(ptr: *const u8) {
#[cfg(target_arch = "aarch64")]
unsafe {
core::arch::asm!("prfm pldl1keep, [{}]", in(reg) ptr, options(nostack, preserves_flags));
}
#[cfg(target_arch = "x86_64")]
unsafe {
_mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0);
}
}
#[cfg(target_arch = "aarch64")]
#[inline]
unsafe fn nibble_mask_from_cmp(cmp: uint8x16_t) -> BitMask {
unsafe {
let narrowed = vshrn_n_u16(vreinterpretq_u16_u8(cmp), 4);
BitMask(vget_lane_u64(vreinterpret_u64_u8(narrowed), 0))
}
}
#[cfg(target_arch = "aarch64")]
#[inline]
unsafe fn eq_mask_16_neon(ptr: *const u8, target: u8) -> BitMask {
unsafe {
let bytes = vld1q_u8(ptr);
let cmp = vceqq_u8(bytes, vdupq_n_u8(target));
nibble_mask_from_cmp(cmp)
}
}
#[cfg(target_arch = "aarch64")]
#[inline]
unsafe fn free_mask_16_neon(ptr: *const u8) -> BitMask {
unsafe {
let bytes = vld1q_u8(ptr);
let masked = vandq_u8(bytes, vdupq_n_u8(FINGERPRINT_MASK));
let free_cmp = vceqq_u8(masked, vdupq_n_u8(0));
nibble_mask_from_cmp(free_cmp)
}
}
#[allow(
clippy::cast_possible_wrap,
clippy::cast_sign_loss,
clippy::cast_ptr_alignment
)]
#[cfg(target_arch = "x86_64")]
#[inline]
unsafe fn eq_mask_16_sse2(ptr: *const u8, target: u8) -> BitMask {
unsafe {
let data = _mm_loadu_si128(ptr.cast::<__m128i>());
let target_vec = _mm_set1_epi8(target as i8);
let cmp = _mm_cmpeq_epi8(data, target_vec);
#[allow(clippy::cast_possible_truncation)]
{
BitMask(_mm_movemask_epi8(cmp) as u16)
}
}
}
#[allow(
clippy::cast_possible_wrap,
clippy::cast_sign_loss,
clippy::cast_ptr_alignment
)]
#[cfg(target_arch = "x86_64")]
#[inline]
unsafe fn free_mask_16_sse2(ptr: *const u8) -> BitMask {
unsafe {
let data = _mm_loadu_si128(ptr.cast::<__m128i>());
let masked = _mm_and_si128(data, _mm_set1_epi8(FINGERPRINT_MASK as i8));
let free = _mm_cmpeq_epi8(masked, _mm_setzero_si128());
#[allow(clippy::cast_possible_truncation)]
{
BitMask(_mm_movemask_epi8(free) as u16)
}
}
}
#[allow(clippy::cast_possible_wrap, clippy::cast_ptr_alignment)]
#[cfg(target_arch = "x86_64")]
#[inline]
unsafe fn eq_mask_32_avx2(ptr: *const u8, target: u8) -> u32 {
unsafe {
let data = _mm256_loadu_si256(ptr.cast::<__m256i>());
let target_vec = _mm256_set1_epi8(target as i8);
let cmp = _mm256_cmpeq_epi8(data, target_vec);
#[allow(clippy::cast_sign_loss)]
{
_mm256_movemask_epi8(cmp) as u32
}
}
}