Skip to main content

ph_temp/phast/
perfect.rs

1use dyn_size_of::GetSize;
2use seedable_hash::{BuildDefaultSeededHasher, BuildSeededHasher};
3use voracious_radix_sort::RadixSort;
4use std::hash::Hash;
5use rayon::prelude::*;
6
7use crate::{phast::{bits_per_seed_to_100_bucket_size, builder::{build_mt, build_st}, function::{Level, SeedEx}, Params, SeedChooser, SeedOnly, SeedOnlyK, WINDOW_SIZE}, seeds::{Bits8, SeedSize}};
8
9/// PHast (Perfect Hashing made fast) - (K-)Perfect (not necessary minimal) Hash Function
10/// with very fast evaluation developed by Piotr Beling and Peter Sanders.
11/// Experimental.
12/// 
13/// Can be used with the following seed choosers (which specify a particular PHast variant):
14/// [`ShiftOnlyWrapped`], [`ShiftSeedWrapped`], [`SeedOnly`], [`SeedOnlyK`].
15/// 
16/// See:
17/// Piotr Beling, Peter Sanders, *PHast - Perfect Hashing made fast*, 2025, <https://arxiv.org/abs/2504.17918>
18pub struct Perfect<SS: SeedSize, SC = SeedOnly, S = BuildDefaultSeededHasher>
19{
20    level0: SeedEx<SS::VecElement>,
21    levels: Box<[Level<SS::VecElement>]>,
22    hasher: S,
23    seed_chooser: SC,
24    seed_size: SS
25}
26
27impl<SC, SS: SeedSize, S> GetSize for Perfect<SS, SC, S> {
28    fn size_bytes_dyn(&self) -> usize {
29        self.level0.size_bytes_dyn() + self.levels.size_bytes_dyn()
30    }
31    fn size_bytes_content_dyn(&self) -> usize {
32        self.level0.size_bytes_content_dyn() + self.levels.size_bytes_content_dyn()
33    }
34    const USES_DYN_MEM: bool = true;
35}
36
37impl<SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher> Perfect<SS, SC, S> {
38    
39    /// Returns value assigned to the given `key`.
40    /// 
41    /// The returned value is in the range from `0` (inclusive) to the number of elements in the input key collection (exclusive).
42    /// `key` must come from the input key collection given during construction.
43    #[inline(always)]   //inline(always) is important here
44    pub fn get<K>(&self, key: &K) -> usize where K: Hash + ?Sized {
45        let key_hash = self.hasher.hash_one(key, 0);
46        let seed = unsafe { self.level0.seed_for(self.seed_size, key_hash) };
47        if seed != 0 { return self.seed_chooser.f(key_hash, seed, &self.level0.conf); }
48
49        for level_nr in 0..self.levels.len() {
50            let l = &self.levels[level_nr];
51            let key_hash = self.hasher.hash_one(key, level_nr as u64 + 1);
52            let seed = unsafe { l.seeds.seed_for(self.seed_size, key_hash) };
53            if seed != 0 {
54                return self.seed_chooser.f(key_hash, seed, &l.seeds.conf) + l.shift
55            }
56        }
57
58        unreachable!("phast::Perfect::get called for key not included in the input set")
59    }
60
61    /// Constructs [`Perfect`] function for given `keys`, using a single thread and given parameters:
62    /// number of bits per seed, average bucket size (equals `bucket_size100/100.0`) and `hasher`.
63    /// 
64    /// `bits_per_seed_to_100_bucket_size` can be used to calculate good `bucket_size100`.
65    /// `keys` cannot contain duplicates.
66    pub fn with_vec_p_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash {
67        Self::_new(|h| {
68            let level0 = Self::build_level_st(&mut keys, params, h, seed_chooser, 0);
69            (keys, level0)
70        }, |keys, level_nr, h| {
71            Self::build_level_st(keys, params, h, seed_chooser, level_nr)
72        }, hasher, seed_chooser, params.seed_size)
73    }
74
75    /// Constructs [`Perfect`] function for given `keys`, using multiple (given number of) threads and given parameters:
76    /// number of bits per seed, average bucket size (equals `bucket_size100/100.0`) and `hasher`.
77    /// 
78    /// `bits_per_seed_to_100_bucket_size` can be used to calculate good `bucket_size100`.
79    /// `keys` cannot contain duplicates.
80    pub fn with_vec_p_threads_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
81        where K: Hash+Sync+Send, S: Sync, SC: Sync {
82        if threads_num == 1 { return Self::with_vec_p_hash_sc(keys, params, hasher, seed_chooser); }
83        Self::_new(|h| {
84            let level0 = Self::build_level_mt(&mut keys, params, threads_num, &h, seed_chooser, 0);
85            (keys, level0)
86        }, |keys, level_nr, h| {
87            Self::build_level_mt(keys, params, threads_num, &h, seed_chooser, level_nr)
88        }, hasher, seed_chooser, params.seed_size)
89    }
90
91
92    /// Constructs [`Perfect`] function for given `keys`, using a single thread and given parameters:
93    /// number of bits per seed, average bucket size (equals `bucket_size100/100.0`) and `hasher`.
94    /// 
95    /// `bits_per_seed_to_100_bucket_size` can be used to calculate good `bucket_size100`.
96    /// `keys` cannot contain duplicates.
97    pub fn with_slice_p_hash_sc<K>(keys: &[K], params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash+Clone {
98        Self::_new(|h| {
99            Self::build_level_from_slice_st(keys, params, h, seed_chooser, 0)
100        }, |keys, level_nr, h| {
101            Self::build_level_st(keys, params, &h, seed_chooser, level_nr)
102        }, hasher, seed_chooser, params.seed_size)
103    }
104
105
106    /// Constructs [`Perfect`] function for given `keys`, using multiple (given number of) threads and given parameters:
107    /// number of bits per seed, average bucket size (equals `bucket_size100/100.0`) and `hasher`.
108    /// 
109    /// `bits_per_seed_to_100_bucket_size` can be used to calculate good `bucket_size100`.
110    /// `keys` cannot contain duplicates.
111    pub fn with_slice_p_threads_hash_sc<K>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
112        where K: Hash+Sync+Send+Clone, S: Sync, SC: Sync {
113        if threads_num == 1 { return Self::with_slice_p_hash_sc(keys, params, hasher, seed_chooser); }
114        Self::_new(|h| {
115            Self::build_level_from_slice_mt(keys, params, threads_num, h, seed_chooser, 0)
116        }, |keys, level_nr, h| {
117            Self::build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
118        }, hasher, seed_chooser, params.seed_size)
119    }
120
121    #[inline]
122    fn _new<K, BF, BL>(build_first: BF, build_level: BL, hasher: S, seed_chooser: SC, seed_size: SS) -> Self
123        where BF: FnOnce(&S) -> (Vec::<K>, SeedEx<SS::VecElement>),
124            BL: Fn(&mut Vec::<K>, u64, &S) -> SeedEx<SS::VecElement>,
125            K: Hash
126        {
127        let (mut keys, level0) = build_first(&hasher);
128        let mut shift = level0.conf.output_range(seed_chooser, seed_size.into());
129        let mut levels = Vec::with_capacity(16);
130        while !keys.is_empty() {
131            let seeds = build_level(&mut keys, levels.len() as u64+1, &hasher);
132            let out_range = seeds.conf.output_range(seed_chooser, seed_size.into());
133            levels.push(Level { seeds, shift });
134            shift += out_range;
135        }
136        Self {
137            level0,
138            levels: levels.into_boxed_slice(),
139            hasher,
140            seed_chooser,
141            seed_size,
142        }
143    }
144
145    #[inline]
146    fn build_level_from_slice_st<K>(keys: &[K], params: &Params<SS>, hasher: &S, seed_chooser: SC, level_nr: u64)
147        -> (Vec<K>, SeedEx<SS::VecElement>)
148        where K: Hash+Clone
149    {
150        let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect();
151        //radsort::unopt::sort(&mut hashes);
152        hashes.voracious_sort();
153        let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
154        let (seeds, builder) =
155            build_st(&hashes, conf, params.seed_size, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser);
156        let mut keys_vec = Vec::with_capacity(builder.unassigned_len(&seeds));
157        drop(builder);
158        keys_vec.extend(keys.into_iter().filter(|key| {
159            unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
160        }).cloned());
161        (keys_vec, SeedEx{ seeds, conf })
162    }
163
164    #[inline]
165    fn build_level_from_slice_mt<K>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: &S, seed_chooser: SC, level_nr: u64)
166        -> (Vec<K>, SeedEx<SS::VecElement>)
167        where K: Hash+Sync+Send+Clone, S: Sync, SC: Sync
168    {
169        let mut hashes: Box<[_]> = if keys.len() > 4*2048 {    //maybe better for string keys
170            keys.par_iter().with_min_len(256).map(|k| hasher.hash_one(k, level_nr)).collect()
171        } else {
172            keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect()
173        };
174        //radsort::unopt::sort(&mut hashes);
175        hashes.voracious_mt_sort(threads_num);
176        let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
177        let (seeds, builder) =
178            build_mt(&hashes, conf, params.seed_size, WINDOW_SIZE, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser, threads_num);
179        let mut keys_vec = Vec::with_capacity(builder.unassigned_len(&seeds));
180        drop(builder);
181        keys_vec.par_extend(keys.into_par_iter().filter(|key| {
182            unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
183        }).cloned());
184        (keys_vec, SeedEx{ seeds, conf })
185    }
186
187    #[inline(always)]
188    fn build_level_st<K>(keys: &mut Vec::<K>, params: &Params<SS>, hasher: &S, seed_chooser: SC, level_nr: u64) -> SeedEx<SS::VecElement>
189        where K: Hash
190    {
191        let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect();
192        hashes.voracious_sort();
193        let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
194        let (seeds, _) =
195            build_st(&hashes, conf, params.seed_size, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser);
196        keys.retain(|key| {
197            unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
198        });
199        SeedEx{ seeds, conf }
200    }
201
202    #[inline]
203    fn build_level_mt<K>(keys: &mut Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: &S, seed_chooser: SC, level_nr: u64)
204        -> SeedEx<SS::VecElement>
205        where K: Hash+Sync+Send, S: Sync, SC: Sync
206    {
207        let mut hashes: Box<[_]> = if keys.len() > 4*2048 {    //maybe better for string keys
208            //let mut k = Vec::with_capacity(keys.len());
209            //k.par_extend(keys.par_iter().with_min_len(10000).map(|k| hasher.hash_one_s64(k, level_nr)));
210            //k.into_boxed_slice()
211            keys.par_iter().with_min_len(256).map(|k| hasher.hash_one(k, level_nr)).collect()
212        } else {
213            keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect()
214        };
215        //radsort::unopt::sort(&mut hashes);
216        hashes.voracious_mt_sort(threads_num);
217        let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
218        let (seeds, builder) =
219            build_mt(&hashes, conf, params.seed_size, WINDOW_SIZE, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser, threads_num);
220        let mut result = Vec::with_capacity(builder.unassigned_len(&seeds));
221        drop(builder);
222        std::mem::swap(keys, &mut result);
223        keys.par_extend(result.into_par_iter().filter(|key| {
224            unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
225        }));
226        SeedEx{ seeds, conf }
227    }
228
229    /// Returns maximum number of keys which can be mapped to the same value by `k`-[`Perfect`] function `self`.
230    #[inline(always)] pub fn k(&self) -> u8 { self.seed_chooser.k() }
231
232    /// Returns output range of minimal (perfect or k-perfect) function for given number of keys,
233    /// i.e. 1 + maximum value that minimal function can return.
234    #[inline(always)] pub fn minimal_output_range(&self, num_of_keys: usize) -> usize { self.seed_chooser.minimal_output_range(num_of_keys) }
235
236    /// Returns output range of `self`, i.e. 1 + maximum value that `self` can return.
237    pub fn output_range(&self) -> usize {
238        if let Some(last_level) = self.levels.last() {
239            last_level.shift + last_level.seeds.conf.output_range(self.seed_chooser, self.seed_size.into())
240        } else {
241            self.level0.conf.output_range(self.seed_chooser, self.seed_size.into())
242        }
243    }
244}
245
246impl Perfect<Bits8, SeedOnly, BuildDefaultSeededHasher> {
247    /// Constructs [`Perfect`] function for given `keys`, using a single thread.
248    /// 
249    /// `keys` cannot contain duplicates.
250    pub fn from_vec_st<K>(keys: Vec::<K>) -> Self where K: Hash {
251        Self::with_vec_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
252        BuildDefaultSeededHasher::default(), SeedOnly)
253    }
254
255    /// Constructs [`Perfect`] function for given `keys`, using multiple threads.
256    /// 
257    /// `keys` cannot contain duplicates.
258    pub fn from_vec_mt<K>(keys: Vec::<K>) -> Self where K: Hash+Send+Sync {
259        Self::with_vec_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
260        std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnly)
261    }
262
263    /// Constructs [`Perfect`] function for given `keys`, using a single thread.
264    /// 
265    /// `keys` cannot contain duplicates.
266    pub fn from_slice_st<K>(keys: &[K]) -> Self where K: Hash+Clone {
267        Self::with_slice_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
268        BuildDefaultSeededHasher::default(), SeedOnly)
269    }
270
271    /// Constructs [`Perfect`] function for given `keys`, using multiple threads.
272    /// 
273    /// `keys` cannot contain duplicates.
274    pub fn from_slice_mt<K>(keys: &[K]) -> Self where K: Hash+Clone+Send+Sync {
275        Self::with_slice_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
276        std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnly)
277    }
278}
279
280impl Perfect<Bits8, SeedOnlyK, BuildDefaultSeededHasher> {
281    /// Constructs `k`-[`Perfect`] function for given `keys`, using a single thread.
282    /// `k`-[`Perfect`] function maps `k` or less different keys to each value.
283    /// 
284    /// `keys` cannot contain duplicates.
285    pub fn k_from_vec_st<K>(k: u8, keys: Vec::<K>) -> Self where K: Hash {
286        Self::with_vec_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
287        BuildDefaultSeededHasher::default(), SeedOnlyK(k))
288    }
289
290    /// Constructs `k`-[`Perfect`] function for given `keys`, using multiple threads.
291    /// `k`-[`Perfect`] function maps `k` or less different keys to each value.
292    /// 
293    /// `keys` cannot contain duplicates.
294    pub fn k_from_vec_mt<K>(k: u8, keys: Vec::<K>) -> Self where K: Hash+Send+Sync {
295        Self::with_vec_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
296        std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnlyK(k))
297    }
298
299    /// Constructs `k`-[`Perfect`] function for given `keys`, using a single thread.
300    /// `k`-[`Perfect`] function maps `k` or less different keys to each value.
301    /// 
302    /// `keys` cannot contain duplicates.
303    pub fn k_from_slice_st<K>(k: u8, keys: &[K]) -> Self where K: Hash+Clone {
304        Self::with_slice_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
305        BuildDefaultSeededHasher::default(), SeedOnlyK(k))
306    }
307
308    /// Constructs `k`-[`Perfect`] function for given `keys`, using multiple threads.
309    /// `k`-[`Perfect`] function maps `k` or less different keys to each value.
310    /// 
311    /// `keys` cannot contain duplicates.
312    pub fn k_from_slice_mt<K>(k: u8, keys: &[K]) -> Self where K: Hash+Clone+Send+Sync {
313        Self::with_slice_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
314        std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnlyK(k))
315    }
316}
317
318
319#[cfg(test)]
320pub(crate) mod tests {
321    use std::fmt::Display;
322
323    use crate::utils::{verify_partial_kphf, verify_partial_phf};
324
325    use super::*;
326
327    fn phf_test<SC, K, SS, S>(f: &Perfect<SS, SC, S>, keys: &[K])
328        where K: Display+Hash, SC: SeedChooser, SS: SeedSize, S: BuildSeededHasher
329    {
330        verify_partial_phf(f.output_range(), keys, |key| Some(f.get(key)));
331    }
332
333    fn kphf_test<K: Display+Hash, SS: SeedSize, S: BuildSeededHasher>(f: &Perfect<SS, SeedOnlyK, S>, keys: &[K]) {
334        verify_partial_kphf(f.k(), f.output_range(), keys, |key| Some(f.get(key)));
335    }
336    
337    #[test]
338    fn test_small() {
339        let input = [1, 2, 3, 4, 5];
340        let f = Perfect::from_slice_st(&input);
341        phf_test(&f, &input);
342    }
343
344    #[test]
345    fn test_medium() {
346        let input: Box<[u16]> = (0..1000).collect();
347        let f = Perfect::from_slice_st(&input);
348        phf_test(&f, &input);
349    }
350
351    #[test]
352    fn test_small_k() {
353        let input = [1, 2, 3, 4, 5];
354        let f = Perfect::k_from_slice_st(3, &input);
355        kphf_test(&f, &input);
356    }
357
358    #[test]
359    fn test_medium_k() {
360        let input: Box<[u16]> = (0..1000).collect();
361        let f = Perfect::k_from_slice_st(3, &input);
362        kphf_test(&f, &input);
363    }
364}