use rustc_hash::FxHashSet;
use vyre_primitives::hash::fnv1a::{fnv1a64_initial_state, fnv1a64_update_byte};
const ALPHA: f64 = 1.23;
const BUCKET_DIVISOR: usize = 4;
const MAX_DISPLACEMENT_TRIES: u32 = 1_000_000;
const MAX_SALT_RETRIES: u32 = 16;
#[derive(Debug, Clone, Default)]
pub struct PerfectHash {
seed1: u64,
seed2: u64,
displacement: Vec<u32>,
key_hashes: Vec<u64>,
values: Vec<u32>,
len: usize,
}
impl PerfectHash {
pub fn lookup(&self, key: &str) -> Option<u32> {
if self.displacement.is_empty() {
return None;
}
let bytes = key.as_bytes();
let h1 = hash_with_seed(bytes, self.seed1) as usize;
let bucket = h1 % self.displacement.len();
let disp = self.displacement[bucket];
let h2 = hash_with_seed(bytes, self.seed2.wrapping_add(disp as u64));
let slot = (h2 as usize) % self.key_hashes.len();
if self.key_hashes[slot] == hash_verify(bytes) {
Some(self.values[slot])
} else {
None
}
}
pub fn len(&self) -> usize {
self.len
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
pub fn slots(&self) -> usize {
self.key_hashes.len()
}
pub fn displacement(&self) -> &[u32] {
&self.displacement
}
pub fn key_hashes(&self) -> &[u64] {
&self.key_hashes
}
pub fn values(&self) -> &[u32] {
&self.values
}
pub fn seeds(&self) -> (u64, u64) {
(self.seed1, self.seed2)
}
}
pub fn build_chd<I, S>(entries: I) -> PerfectHash
where
I: IntoIterator<Item = (S, u32)>,
S: AsRef<str>,
{
try_build_chd(entries)
.unwrap_or_else(|error| panic!("vyre-libs CHD perfect-hash construction failed: {error}"))
}
pub fn try_build_chd<I, S>(entries: I) -> Result<PerfectHash, BuildError>
where
I: IntoIterator<Item = (S, u32)>,
S: AsRef<str>,
{
let pairs: Vec<(String, u32)> = entries
.into_iter()
.map(|(k, v)| (k.as_ref().to_owned(), v))
.collect();
if pairs.is_empty() {
return Ok(PerfectHash::default());
}
let mut seen = FxHashSet::default();
seen.reserve(pairs.len());
for (k, _) in &pairs {
if !seen.insert(k.as_str()) {
return Err(BuildError::DuplicateKey(k.clone()));
}
}
for salt in 0..MAX_SALT_RETRIES {
if let Some(ph) = try_build_with_salt(&pairs, salt as u64) {
return Ok(ph);
}
}
Err(BuildError::ConstructionFailed(pairs.len()))
}
fn try_build_with_salt(pairs: &[(String, u32)], salt: u64) -> Option<PerfectHash> {
let n = pairs.len();
let table_size = (((n as f64) * ALPHA).ceil() as usize) | 1;
let n_buckets = ((n / BUCKET_DIVISOR).max(1)) | 1;
let seed1 = salt.wrapping_mul(0x9E37_79B9_7F4A_7C15).wrapping_add(1);
let seed2 = salt
.wrapping_mul(0xBF58_476D_1CE4_E5B9)
.wrapping_add(0xDEAD_BEEF_CAFE_BABE);
let mut bucket_offsets = vec![0usize; n_buckets + 1];
for (k, _) in pairs {
let h = hash_with_seed(k.as_bytes(), seed1) as usize;
bucket_offsets[h % n_buckets + 1] += 1;
}
for i in 1..bucket_offsets.len() {
bucket_offsets[i] += bucket_offsets[i - 1];
}
let mut bucket_cursor = bucket_offsets[..n_buckets].to_vec();
let mut bucket_items = vec![0usize; n];
for (i, (k, _)) in pairs.iter().enumerate() {
let h = hash_with_seed(k.as_bytes(), seed1) as usize;
let bucket = h % n_buckets;
let slot = bucket_cursor[bucket];
bucket_items[slot] = i;
bucket_cursor[bucket] += 1;
}
let mut bucket_order: Vec<usize> = (0..n_buckets).collect();
bucket_order.sort_by_key(|&b| std::cmp::Reverse(bucket_offsets[b + 1] - bucket_offsets[b]));
let mut displacement = vec![0_u32; n_buckets];
let mut key_hashes = vec![0_u64; table_size];
let mut values = vec![0_u32; table_size];
let mut occupied = vec![false; table_size];
let mut candidate_scratch = vec![false; table_size];
let mut candidate_slots = Vec::new();
'bucket: for b in bucket_order {
let bucket = &bucket_items[bucket_offsets[b]..bucket_offsets[b + 1]];
if bucket.is_empty() {
continue;
}
for disp in 0..MAX_DISPLACEMENT_TRIES {
candidate_slots.clear();
candidate_slots.reserve(bucket.len());
let mut ok = true;
for &ki in bucket {
let key = pairs[ki].0.as_bytes();
let h2 = hash_with_seed(key, seed2.wrapping_add(disp as u64));
let slot = (h2 as usize) % table_size;
if occupied[slot] || candidate_scratch[slot] {
ok = false;
break;
}
candidate_scratch[slot] = true;
candidate_slots.push(slot);
}
for slot in &candidate_slots {
candidate_scratch[*slot] = false;
}
if ok {
displacement[b] = disp;
for (ki, slot) in bucket.iter().zip(candidate_slots.iter()) {
let key = pairs[*ki].0.as_bytes();
key_hashes[*slot] = hash_verify(key);
values[*slot] = pairs[*ki].1;
occupied[*slot] = true;
}
continue 'bucket;
}
}
return None;
}
Some(PerfectHash {
seed1,
seed2,
displacement,
key_hashes,
values,
len: n,
})
}
#[derive(Debug, thiserror::Error)]
pub enum BuildError {
#[error("duplicate key: {0:?}")]
DuplicateKey(String),
#[error("CHD construction failed for {0} keys after all salt retries")]
ConstructionFailed(usize),
}
#[inline]
fn hash_with_seed(data: &[u8], seed: u64) -> u64 {
let mut h = seed ^ fnv1a64_initial_state();
for &b in data {
h = fnv1a64_update_byte(h, b);
}
h
}
#[inline]
fn hash_verify(data: &[u8]) -> u64 {
let mut h: u64 = 0x517c_c1b7_2722_0a95;
for &b in data {
h = h.rotate_left(5) ^ (b as u64);
h = h.wrapping_mul(0x9e37_79b9_7f4a_7c15);
}
h ^= h >> 33;
h = h.wrapping_mul(0xff51_afd7_ed55_8ccd);
h ^= h >> 33;
h = h.wrapping_mul(0xc4ce_b9fe_1a85_ec53);
h ^= h >> 33;
h
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn empty_input_roundtrips() {
let ph = build_chd(Vec::<(&str, u32)>::new());
assert_eq!(ph.len(), 0);
assert!(ph.is_empty());
assert_eq!(ph.lookup("any"), None);
}
#[test]
fn single_entry() {
let ph = build_chd([("hello", 42_u32)]);
assert_eq!(ph.len(), 1);
assert_eq!(ph.lookup("hello"), Some(42));
assert_eq!(ph.lookup("world"), None);
}
#[test]
fn ten_keys_roundtrip() {
let entries: Vec<(String, u32)> = (0..10).map(|i| (format!("key_{i}"), i as u32)).collect();
let ph = build_chd(entries.clone());
assert_eq!(ph.len(), 10);
for (k, v) in &entries {
assert_eq!(ph.lookup(k), Some(*v), "key={k:?}");
}
assert_eq!(ph.lookup("unknown"), None);
}
#[test]
fn thousand_keys_roundtrip() {
let entries: Vec<(String, u32)> = (0..1000)
.map(|i| (format!("func_{i:04}"), i as u32))
.collect();
let ph = build_chd(entries.clone());
assert_eq!(ph.len(), 1000);
for (k, v) in &entries {
assert_eq!(ph.lookup(k), Some(*v), "key={k:?}");
}
for i in 1000..1100 {
assert_eq!(ph.lookup(&format!("func_{i:04}")), None);
}
}
#[test]
fn duplicate_keys_rejected() {
let err = try_build_chd([("dup", 1_u32), ("dup", 2_u32)]).unwrap_err();
assert!(matches!(err, BuildError::DuplicateKey(k) if k == "dup"));
}
#[test]
#[should_panic(expected = "CHD perfect-hash construction failed")]
fn infallible_builder_panics_on_duplicates() {
let _ = build_chd([("dup", 1_u32), ("dup", 2_u32)]);
}
#[test]
fn value_preserved_bitwise() {
let entries: Vec<(String, u32)> = (0..100)
.map(|i| (format!("k_{i}"), (i as u32).wrapping_mul(0xDEAD_BEEF)))
.collect();
let ph = build_chd(entries.clone());
for (k, v) in entries {
assert_eq!(ph.lookup(&k), Some(v));
}
}
#[test]
fn unicode_keys_work() {
let entries = vec![
("naïve".to_string(), 1_u32),
("咖啡".to_string(), 2),
("über".to_string(), 3),
("🎉".to_string(), 4),
("test".to_string(), 5),
];
let ph = build_chd(entries.clone());
for (k, v) in entries {
assert_eq!(ph.lookup(&k), Some(v));
}
}
#[test]
fn space_overhead_under_30_percent() {
let entries: Vec<(String, u32)> = (0..500).map(|i| (format!("k_{i}"), i as u32)).collect();
let n = entries.len();
let ph = build_chd(entries);
let ratio = ph.slots() as f64 / n as f64;
assert!(ratio < 1.30, "slots/len ratio {ratio} > 1.30 budget");
}
#[test]
fn seeds_and_tables_are_non_trivial_after_build() {
let entries: Vec<(String, u32)> = (0..50).map(|i| (format!("k_{i}"), i as u32)).collect();
let ph = build_chd(entries);
let (s1, s2) = ph.seeds();
assert_ne!(s1, 0);
assert_ne!(s2, 0);
assert!(!ph.displacement().is_empty());
assert!(!ph.key_hashes().is_empty());
assert!(!ph.values().is_empty());
}
#[test]
fn negative_lookups_are_rejected_by_verify_hash() {
let entries: Vec<(String, u32)> = (0..200).map(|i| (format!("k_{i}"), i as u32)).collect();
let ph = build_chd(entries);
for i in 1000..1500 {
assert_eq!(ph.lookup(&format!("q_{i}")), None, "false hit on q_{i}");
}
}
#[test]
fn hash_with_seed_is_deterministic() {
assert_eq!(hash_with_seed(b"hello", 42), hash_with_seed(b"hello", 42));
assert_ne!(hash_with_seed(b"hello", 42), hash_with_seed(b"hello", 43));
assert_ne!(hash_with_seed(b"hello", 42), hash_with_seed(b"world", 42));
}
#[test]
fn hash_verify_differs_from_seeded_hash() {
let key = b"hello";
assert_ne!(hash_with_seed(key, 0), hash_verify(key));
}
#[test]
fn real_label_family_names_build_and_lookup() {
let funcs = [
"fopen",
"open",
"openat",
"CreateFileA",
"CreateFileW",
"std::fs::OpenOptions::open",
"std::fs::File::open",
"std::fs::File::create",
"tokio::fs::File::open",
"tokio::fs::File::create",
"rocket::response::NamedFile::open",
];
let entries: Vec<(String, u32)> = funcs
.iter()
.enumerate()
.map(|(i, f)| (f.to_string(), i as u32))
.collect();
let ph = build_chd(entries.clone());
for (k, v) in entries {
assert_eq!(ph.lookup(&k), Some(v));
}
assert_eq!(ph.lookup("not_in_family"), None);
assert_eq!(ph.lookup("malloc"), None);
}
use proptest::prelude::*;
proptest! {
#[test]
fn proptest_roundtrip_random_keys(
entries in prop::collection::hash_map(
"[a-zA-Z0-9_]{1,32}",
0u32..10000u32,
1..256usize,
),
) {
let vec: Vec<(String, u32)> = entries.into_iter().collect();
let ph = build_chd(vec.clone());
for (k, v) in &vec {
prop_assert_eq!(ph.lookup(k), Some(*v), "key={}", k);
}
}
#[test]
fn proptest_negative_lookups_miss(
entries in prop::collection::vec(("[a-z]{1,16}", 0u32..100u32), 1..100usize),
queries in prop::collection::vec("[a-z]{1,16}", 1..50usize),
) {
let deduped: std::collections::HashMap<String, u32> = entries.into_iter().collect();
prop_assume!(!deduped.is_empty());
let vec: Vec<(String, u32)> = deduped.clone().into_iter().collect();
let ph = build_chd(vec);
let key_set: std::collections::HashSet<&str> = deduped.keys().map(|k| k.as_str()).collect();
for q in &queries {
if key_set.contains(q.as_str()) {
continue;
}
prop_assert_eq!(ph.lookup(q), None, "false hit on {}", q);
}
}
#[test]
fn proptest_space_overhead_under_35_percent(
entries in prop::collection::vec(("[a-zA-Z0-9_]{1,32}", 0u32..10000u32), 10..500usize),
) {
let deduped: std::collections::HashMap<String, u32> = entries.into_iter().collect();
prop_assume!(deduped.len() >= 10);
let vec: Vec<(String, u32)> = deduped.into_iter().collect();
let ph = build_chd(vec.clone());
let n = vec.len();
let ratio = ph.slots() as f64 / n as f64;
let budget = if n < 20 { 1.5 } else { 1.35 };
prop_assert!(ratio < budget, "slots/len ratio {ratio} > {budget} budget for n={n}");
}
}
}