Skip to main content

ph_temp/phast/
function2.rs

1use std::{hash::Hash, io, usize};
2
3use crate::{phast::{function::{build_level_from_slice_mt, build_level_from_slice_st, build_level_mt, build_level_st, Level, SeedEx}, seed_chooser::SeedOnlyNoBump, Params, ShiftOnlyWrapped}, seeds::{Bits8, SeedSize}};
4use super::{bits_per_seed_to_100_bucket_size, builder::build_last_level, conf::Conf, seed_chooser::{SeedChooser, SeedOnly}, CompressedArray, DefaultCompressedArray};
5use binout::{Serializer as _, VByte};
6use bitm::BitAccess;
7use dyn_size_of::GetSize;
8use seedable_hash::{BuildDefaultSeededHasher, BuildSeededHasher};
9use voracious_radix_sort::RadixSort;
10
11/// PHast (Perfect Hashing made fast) - Minimal Perfect Hash Function
12/// with very fast evaluation and size below 2 bits/key
13/// developed by Piotr Beling and Peter Sanders.
14/// 
15/// The last layer (when the number of keys is small) is constructed using regular PHast.
16/// This makes `Function2` compatible with almost all [`SeedChooser`]s (including non-wrapping `ShiftOnly`).
17/// 
18/// It can be used with the following [`SeedChooser`] (which specify a particular PHast variant):
19/// [`ShiftOnly`] (PHast+ without wrapping),
20/// [`ShiftOnlyWrapped`] (PHast+ with wrapping),
21/// [`ShiftSeedWrapped`] (PHast/PHast+ hybrid),
22/// [`SeedOnly`] (regular PHast).
23/// 
24/// Note that some [`SeedChooser`]s can also be used with [`Function`](crate::phast::Function).
25/// 
26/// See:
27/// Piotr Beling, Peter Sanders, *PHast - Perfect Hashing made fast*, 2025, <https://arxiv.org/abs/2504.17918>
28pub struct Function2<SS, SC = ShiftOnlyWrapped, CA = DefaultCompressedArray, S = BuildDefaultSeededHasher>
29    where SS: SeedSize
30{
31    level0: SeedEx<SS::VecElement>,
32    unassigned: CA,
33    levels: Box<[Level<SS::VecElement>]>,
34    hasher: S,
35    last_level: Level<<Bits8 as SeedSize>::VecElement>,
36    last_level_seed: u64,
37    seed_chooser: SC,
38    seed_size: SS,
39}
40
41impl<SC, SS: SeedSize, CA, S> GetSize for Function2<SS, SC, CA, S> where CA: GetSize {
42    fn size_bytes_dyn(&self) -> usize {
43        self.level0.size_bytes_dyn() +
44            self.unassigned.size_bytes_dyn() +
45            self.levels.size_bytes_dyn() +
46            self.last_level.size_bytes_dyn()
47    }
48    fn size_bytes_content_dyn(&self) -> usize {
49        self.level0.size_bytes_content_dyn() +
50            self.unassigned.size_bytes_content_dyn() +
51            self.levels.size_bytes_content_dyn() +
52            self.last_level.size_bytes_content_dyn()
53    }
54    const USES_DYN_MEM: bool = true;
55}
56
57impl<SS: SeedSize, SC: SeedChooser, CA: CompressedArray, S: BuildSeededHasher> Function2<SS, SC, CA, S> {
58    
59    /// Returns value assigned to the given `key`.
60    /// 
61    /// The returned value is in the range from `0` (inclusive) to the number of elements in the input key collection (exclusive).
62    /// `key` must come from the input key collection given during construction.
63    #[inline(always)]   //inline(always) is important here
64    pub fn get<K>(&self, key: &K) -> usize where K: Hash + ?Sized {
65        let key_hash = self.hasher.hash_one(key, 0);
66        let seed = unsafe { self.level0.seed_for(self.seed_size, key_hash) };
67        if seed != 0 { return self.seed_chooser.f(key_hash, seed, &self.level0.conf); }
68
69        for level_nr in 0..self.levels.len() {
70            let l = &self.levels[level_nr];
71            let key_hash = self.hasher.hash_one(key, level_nr as u64 + 1);
72            let seed = unsafe { l.seeds.seed_for(self.seed_size, key_hash) };
73            if seed != 0 {
74                return self.unassigned.get(self.seed_chooser.f(key_hash, seed, &l.seeds.conf) + l.shift)
75            }
76        }
77
78        let key_hash = self.hasher.hash_one(key, self.last_level_seed);
79        let seed = unsafe { self.last_level.seeds.seed_for(Bits8, key_hash) };
80        return self.unassigned.get(SeedOnlyNoBump.f(key_hash, seed, &self.last_level.seeds.conf) + self.last_level.shift)
81    }
82
83    /// Constructs [`Function`] for given `keys`, using a single thread and given parameters:
84    /// number of bits per seed, average bucket size (equals `bucket_size100/100.0`) and `hasher`.
85    /// 
86    /// `bits_per_seed_to_100_bucket_size` can be used to calculate good `bucket_size100`.
87    /// `keys` cannot contain duplicates.
88    pub fn with_vec_p_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash {
89        let number_of_keys = keys.len();
90        Self::_new(|h| {
91            let (level0, unassigned_values, unassigned_len) =
92                build_level_st(&mut keys, params, h, seed_chooser, 0);
93            (keys, level0, unassigned_values, unassigned_len)
94        }, |keys, level_nr, h| {
95            build_level_st(keys, params, h, seed_chooser, level_nr)
96        }, hasher, seed_chooser, params.seed_size, number_of_keys)
97    }
98
99    /// Constructs [`Function`] for given `keys`, using multiple (given number of) threads and given parameters:
100    /// number of bits per seed, average bucket size (equals `bucket_size100/100.0`) and `hasher`.
101    /// 
102    /// `bits_per_seed_to_100_bucket_size` can be used to calculate good `bucket_size100`.
103    /// `keys` cannot contain duplicates.
104    pub fn with_vec_p_threads_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
105        where K: Hash+Sync+Send, S: Sync, SC: Sync {
106        if threads_num == 1 { return Self::with_vec_p_hash_sc(keys, params, hasher, seed_chooser); }
107        let number_of_keys = keys.len();
108        Self::_new(|h| {
109            let (level0, unassigned_values, unassigned_len) =
110                build_level_mt(&mut keys, params, threads_num, h, seed_chooser, 0);
111            (keys, level0, unassigned_values, unassigned_len)
112        }, |keys, level_nr, h| {
113            build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
114        }, hasher, seed_chooser, params.seed_size, number_of_keys)
115    }
116
117
118    /// Constructs [`Function`] for given `keys`, using a single thread and given parameters:
119    /// number of bits per seed, average bucket size (equals `bucket_size100/100.0`) and `hasher`.
120    /// 
121    /// `bits_per_seed_to_100_bucket_size` can be used to calculate good `bucket_size100`.
122    /// `keys` cannot contain duplicates.
123    pub fn with_slice_p_hash_sc<K>(keys: &[K], params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash+Clone {
124        Self::_new(|h| {
125            build_level_from_slice_st(keys, params, h, seed_chooser, 0)
126        }, |keys, level_nr, h| {
127            build_level_st(keys, params, h, seed_chooser, level_nr)
128        }, hasher, seed_chooser, params.seed_size, keys.len())
129    }
130
131
132    /// Constructs [`Function`] for given `keys`, using multiple (given number of) threads and given parameters:
133    /// number of bits per seed, average bucket size (equals `bucket_size100/100.0`) and `hasher`.
134    /// 
135    /// `bits_per_seed_to_100_bucket_size` can be used to calculate good `bucket_size100`.
136    /// `keys` cannot contain duplicates.
137    pub fn with_slice_p_threads_hash_sc<K>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
138        where K: Hash+Sync+Send+Clone, S: Sync, SC: Sync {
139        if threads_num == 1 { return Self::with_slice_p_hash_sc(keys, params, hasher, seed_chooser); }
140        Self::_new(|h| {
141            build_level_from_slice_mt(keys, params, threads_num, h, seed_chooser, 0)
142        }, |keys, level_nr, h| {
143            build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
144        }, hasher, seed_chooser, params.seed_size, keys.len())
145    }
146
147    #[inline]
148    fn _new<K, BF, BL>(build_first: BF, build_level: BL, hasher: S, seed_chooser: SC, seed_size: SS, number_of_keys: usize) -> Self
149        where BF: FnOnce(&S) -> (Vec::<K>, SeedEx<SS::VecElement>, Box<[u64]>, usize),
150            BL: Fn(&mut Vec::<K>, u64, &S) -> (SeedEx<SS::VecElement>, Box<[u64]>, usize),
151            K: Hash
152        {
153        let (mut keys, level0, unassigned_values, unassigned_len) = build_first(&hasher);
154        //dbg!(keys.len(), unassigned_len, unassigned_values.bit_ones().count());
155        debug_assert_eq!(unassigned_len, unassigned_values.bit_ones().count());
156        //Self::finish_building(keys, bits_per_seed, bucket_size100, threads_num, hasher, level0, unassigned_values, unassigned_len)
157        let mut level0_unassigned = unassigned_values.bit_ones();
158        let mut unassigned = Vec::with_capacity(unassigned_len * 3 / 2);
159
160        let mut levels = Vec::with_capacity(16);
161        let mut last = 0;
162        //let max_keys = 2048.max(SC::extra_shift(bits_p))
163        while keys.len() > SC::FUNCTION2_THRESHOLD /*2048*2*/ {
164            let keys_len = keys.len();
165            //println!("{keys_len} {:.2}% keys bumped, {} {}% in {} self-collided buckets",
166            //    keys_len as f64 / 100000.0,
167                //crate::phast::seed_chooser::SELF_COLLISION_KEYS.load(std::sync::atomic::Ordering::SeqCst),
168                //crate::phast::seed_chooser::SELF_COLLISION_KEYS.load(std::sync::atomic::Ordering::SeqCst) * 100 / keys_len as u64,
169                //crate::phast::seed_chooser::SELF_COLLISION_BUCKETS.load(std::sync::atomic::Ordering::SeqCst));
170            let (seeds, unassigned_values, _unassigned_len) =
171                build_level(&mut keys, levels.len() as u64+1, &hasher);
172            let shift = unassigned.len();
173            for i in 0..keys_len {
174                if CA::MAX_FOR_UNUSED {
175                    if !unsafe{unassigned_values.get_bit_unchecked(i)} {
176                        last = level0_unassigned.next().unwrap();
177                        unassigned.push(last);
178                    } else {
179                        unassigned.push(usize::MAX);
180                    }
181                } else {
182                    if !unsafe{unassigned_values.get_bit_unchecked(i)} {
183                        last = level0_unassigned.next().unwrap();
184                    }
185                    unassigned.push(last);
186                }
187            }
188            levels.push(Level { seeds, shift });
189        }
190        //dbg!(keys.len());   // TODO keys.len()==0
191        let mut last_seed = levels.len() as u64+1;
192        let last_shift;
193        let last_seeds =
194        if keys.is_empty() {
195            last_shift = 0;
196            SeedEx::<<Bits8 as SeedSize>::VecElement>{ seeds: Box::default(), conf: Conf { buckets_num: 0, slice_len_minus_one: 0, num_of_slices: 0 } }
197        } else {
198            let (last_seeds, unassigned_values, _unassigned_len) =
199                Self::build_last_level(keys, &hasher, &mut last_seed);
200            last_shift = unassigned.len();
201            for i in 0..last_seeds.conf.output_range(SeedOnlyNoBump, Bits8.into()) {
202                if CA::MAX_FOR_UNUSED {
203                    if !unsafe{unassigned_values.get_bit_unchecked(i)} {
204                        last = level0_unassigned.next().unwrap();
205                        unassigned.push(last);
206                    } else {
207                        unassigned.push(usize::MAX);
208                    }
209                } else {
210                    if !unsafe{unassigned_values.get_bit_unchecked(i)} {
211                        last = level0_unassigned.next().unwrap();
212                    }
213                    unassigned.push(last);
214                }
215            }
216            //drop(unassigned_values);
217            last_seeds
218        };
219        debug_assert!(level0_unassigned.next().is_none());  // TODO
220        drop(level0_unassigned);
221        Self {
222            level0,
223            unassigned: CA::new(unassigned, last, number_of_keys),
224            levels: levels.into_boxed_slice(),
225            hasher,
226            seed_chooser,
227            last_level: Level { seeds: last_seeds, shift: last_shift },
228            last_level_seed: last_seed,
229            seed_size,
230        }
231    }
232
233    #[inline(always)]
234    fn build_last_level<K>(keys: Vec::<K>, hasher: &S, seed: &mut u64)
235        -> (SeedEx<<Bits8 as SeedSize>::VecElement>, Box<[u64]>, usize)
236        where K: Hash
237    {
238        let bits_per_seed = Bits8;
239        let len100 = (keys.len()+10)*120;
240        let conf = SeedOnly.conf_for_minimal((len100+50)/100,
241            bits_per_seed.into(), 400, 0);  // TODO use turbo variant here
242        let evaluator = SeedOnly.bucket_evaluator(bits_per_seed.into(), conf.slice_len());
243        loop {
244            let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, *seed)).collect();
245            hashes.voracious_sort();    // maybe standard sort here?
246            if let Some((seeds, unassigned_values, unassigned_len)) =
247                build_last_level(&hashes, conf, bits_per_seed, evaluator.clone())
248            {
249                return (SeedEx{ seeds, conf }, unassigned_values, unassigned_len);
250            }
251            *seed += 1;
252            //dbg!(*seed);
253        }
254    }
255
256    /// Returns output range of minimal (perfect or k-perfect) function for given number of keys,
257    /// i.e. 1 + maximum value that minimal function can return.
258    #[inline(always)] pub fn minimal_output_range(&self, num_of_keys: usize) -> usize { self.seed_chooser.minimal_output_range(num_of_keys) }
259
260    /// Returns output range of `self`, i.e. 1 + maximum value that `self` can return.
261    pub fn output_range(&self) -> usize {
262        self.level0.conf.output_range(self.seed_chooser, self.seed_size.into())
263    }
264
265    /*#[inline(always)]
266    fn finish_building<K>(mut keys: Vec::<K>, bits_per_seed: SS, bucket_size100: u16, threads_num: usize, hasher: S, level0: SeedEx<SS>, unassigned_values: Box<[u64]>, unassigned_len: usize) -> Self where K: Hash+Sync+Send, S: Sync {
267        let mut level0_unassigned = unassigned_values.bit_ones();
268        let mut unassigned = Vec::with_capacity(unassigned_len * 3 / 2);
269
270        let mut levels = Vec::with_capacity(16);
271        let mut last = 0;
272        while !keys.is_empty() {
273            let keys_len = keys.len();
274            let (seeds, unassigned_values, _unassigned_len) =
275                Self::build_level(&mut keys, bits_per_seed, bucket_size100, threads_num, &hasher, levels.len() as u32+1);
276            let shift = unassigned.len();
277            for i in 0..keys_len {
278                if !unsafe{unassigned_values.get_bit_unchecked(i)} {
279                    last = level0_unassigned.next().unwrap();                    
280                }
281                unassigned.push(last);
282            }
283            levels.push(Level { seeds, shift });
284        }
285        debug_assert!(level0_unassigned.next().is_none());
286        drop(level0_unassigned);
287
288        let mut builder = CA::Builder::new(unassigned.len(), last);
289        builder.push_all(unassigned);
290
291        Self {
292            level0,
293            unassigned: CA::finish(builder),
294            levels: levels.into_boxed_slice(),
295            hasher,
296        }
297    }*/
298
299    /*pub fn new2<K>(mut keys: Vec::<K>, bits_per_seed: SS, bucket_size100: u16, threads_num: usize, hasher: S) -> Self where K: Hash+Sync+Send, S: Sync {
300        let keys_len = keys.len();
301        let (level0, unassigned_values, _unassigned_len) =
302            Self::build_level(&mut keys, bits_per_seed, bucket_size100, threads_num, &hasher, 0);
303        let largest_unassigned = bitmap_largest(&unassigned_values, keys_len);
304
305        let mut levels_data = Vec::with_capacity(16);
306        let mut total_len = 0;
307        while !keys.is_empty() {
308            let keys_len = keys.len();
309            let (seeds, unassigned_values, _unassigned_len) =
310                Self::build_level(&mut keys, bits_per_seed, bucket_size100, threads_num, &hasher, levels_data.len() as u32+1);
311            levels_data.push((seeds, unassigned_values, keys_len, total_len));
312            total_len += keys_len;
313        }
314        let mut levels = Vec::with_capacity(levels_data.len());
315        let mut builder = CA::Builder::new(total_len, largest_unassigned);
316        let mut level0_unassigned = unassigned_values.bit_ones();
317        let mut last = 0;
318        for (seeds, unassigned_values, keys_len, shift) in levels_data {
319            for i in 0..keys_len {
320                if !unsafe{unassigned_values.get_bit_unchecked(i)} {
321                    last = level0_unassigned.next().unwrap();                    
322                }
323                builder.push(last);
324            }
325            levels.push(Level { seeds, shift });
326        }
327        debug_assert!(level0_unassigned.next().is_none());
328        drop(level0_unassigned);
329
330        Self {
331            level0,
332            unassigned: CA::finish(builder),
333            levels: levels.into_boxed_slice(),
334            hasher,
335        }
336    }*/
337
338    /// Writes `self` to the `output`.
339    pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
340    {
341        self.level0.write(output, self.seed_size)?;
342        self.unassigned.write(output)?;
343        VByte::write(output, self.levels.len())?;
344        for level in &self.levels {
345            level.write(output, self.seed_size)?;
346        }
347        VByte::write(output, self.last_level_seed)?;
348        self.last_level.write(output, Bits8)?;
349        Ok(())
350    }
351
352    /// Returns number of bytes which `write` will write.
353    pub fn write_bytes(&self) -> usize {
354        self.level0.write_bytes() +
355        self.unassigned.write_bytes() +
356        VByte::size(self.levels.len()) +
357        self.levels.iter().map(|l| l.size_bytes()).sum::<usize>() +
358        VByte::size(self.last_level_seed) +
359        self.last_level.write_bytes()
360    }
361
362    /// Read `Self` from the `input`. `hasher` and `seed_chooser` must be the same as used by the structure written.
363    pub fn read_with_hasher_sc(input: &mut dyn io::Read, hasher: S, seed_chooser: SC) -> io::Result<Self> {
364        let (seed_size, level0) = SeedEx::read(input)?;
365        let unassigned = CA::read(input)?;
366        let levels_num: usize = VByte::read(input)?;
367        let mut levels = Vec::with_capacity(levels_num);
368        for _ in 0..levels_num {
369            levels.push(Level::read::<SS>(input)?.1);
370        }
371        let last_level_seed = VByte::read(input)?;
372        let last_level = Level::read::<Bits8>(input)?.1;
373        Ok(Self { level0, unassigned, levels: levels.into_boxed_slice(), hasher, seed_chooser, seed_size, last_level, last_level_seed })
374    }
375}
376
377impl<SS: SeedSize> Function2<SS, ShiftOnlyWrapped, DefaultCompressedArray, BuildDefaultSeededHasher> {
378
379    /// Read `Self` from the `input`. Uses default hasher and seed chooser.
380    pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
381        Self::read_with_hasher_sc(input, BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
382    }
383}
384
385impl Function2<Bits8, ShiftOnlyWrapped, DefaultCompressedArray, BuildDefaultSeededHasher> {
386    /// Constructs [`Function`] for given `keys`, using a single thread.
387    /// 
388    /// `keys` cannot contain duplicates.
389    pub fn from_vec_st<K>(keys: Vec::<K>) -> Self where K: Hash {
390        Self::with_vec_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
391        BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
392    }
393
394    /// Constructs [`Function`] for given `keys`, using multiple threads.
395    /// 
396    /// `keys` cannot contain duplicates.
397    pub fn from_vec_mt<K>(keys: Vec::<K>) -> Self where K: Hash+Send+Sync {
398        Self::with_vec_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
399        std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
400    }
401
402    /// Constructs [`Function`] for given `keys`, using a single thread.
403    /// 
404    /// `keys` cannot contain duplicates.
405    pub fn from_slice_st<K>(keys: &[K]) -> Self where K: Hash+Clone {
406        Self::with_slice_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
407        BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
408    }
409
410    /// Constructs [`Function`] for given `keys`, using multiple threads.
411    /// 
412    /// `keys` cannot contain duplicates.
413    pub fn from_slice_mt<K>(keys: &[K]) -> Self where K: Hash+Clone+Send+Sync {
414        Self::with_slice_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
415        std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
416    }
417}
418
419#[cfg(test)]
420pub(crate) mod tests {
421    use super::*;
422    use crate::utils::tests::test_mphf;
423
424    fn test_read_write<SS: SeedSize>(h: &Function2::<SS>) where SS::VecElement: std::fmt::Debug+PartialEq {
425        let mut buff = Vec::new();
426        h.write(&mut buff).unwrap();
427        //assert_eq!(buff.len(), h.write_bytes());
428        let read = Function2::<SS>::read(&mut &buff[..]).unwrap();
429        assert_eq!(h.level0.conf, read.level0.conf);
430        assert_eq!(h.levels.len(), read.levels.len());
431        for (hl, rl) in h.levels.iter().zip(&read.levels) {
432            assert_eq!(hl.shift, rl.shift);
433            assert_eq!(hl.seeds.conf, rl.seeds.conf);
434            assert_eq!(hl.seeds.seeds, rl.seeds.seeds);
435        }
436    }
437
438    #[test]
439    fn test_small() {
440        let input = [1, 2, 3, 4, 5];
441        let f = Function2::from_slice_st(&input);
442        test_mphf(&input, |key| Some(f.get(key)));
443        test_read_write(&f);
444    }
445}