// ph_temp/fmph/function.rs
1use std::hash::Hash;
2use binout::{AsIs, Serializer, VByte};
3use bitm::{BitAccess, Rank, ceiling_div};
4
5use crate::utils::ArrayWithRank;
6use crate::{BuildDefaultSeededHasher, BuildSeededHasher, stats, utils};
7
8use std::io;
9use std::sync::atomic::AtomicU64;
10use std::sync::atomic::Ordering::Relaxed;
11use rayon::prelude::*;
12use dyn_size_of::GetSize;
13
14use crate::fmph::keyset::{KeySet, SliceMutSource, SliceSourceWithRefs};
15
/// Build configuration that is accepted by [`Function`] constructors.
/// 
/// See field descriptions for details.
#[derive(Clone)]
pub struct BuildConf<S = BuildDefaultSeededHasher> {
    /// The family of hash functions used by the constructed FMPH. (default: [`BuildDefaultSeededHasher`])
    pub hash_builder: S,

    /// The threshold for the number of keys below which their hashes will be cached during level construction.
    /// (default: [`BuildConf::DEFAULT_CACHE_THRESHOLD`])
    /// 
    /// Caching speeds up level construction at the expense of memory consumption during construction
    /// (caching a single key requires 8 bytes of memory).
    /// Caching is particularly recommended for keys with complex types whose hashing is slow.
    /// It is possible to use a value of `0` to disable caching completely, or [`usize::MAX`] to use it on all levels.
    pub cache_threshold: usize,

    /// Size of each level given as a percentage of the number of level input keys. (default: `100`)
    /// 
    /// A value of 100 minimizes the size of the constructed minimum perfect hash function.
    /// Larger values speed up evaluation at the expense of increased size.
    /// For example, the values 100 and 200 lead to the sizes of approximately 2.8 and 3.4 bits per input key, respectively.
    /// It does not make sense to use values below 100.
    pub relative_level_size: u16,

    /// Whether to use multiple threads during construction. (default: `true`)
    /// 
    /// If `true`, the construction will be performed using the default [rayon] thread pool.
    /// Note that multiple threads are actually used only when the rayon pool has more than one thread.
    pub use_multiple_threads: bool
}
46
47impl Default for BuildConf {
48    fn default() -> Self {
49        Self {
50            hash_builder: Default::default(),
51            cache_threshold: Self::DEFAULT_CACHE_THRESHOLD,
52            relative_level_size: 100,
53            use_multiple_threads: true
54        }
55    }
56}
57
58impl BuildConf {
59    /// Returns configuration that potentially uses [multiple threads](BuildConf::use_multiple_threads) to build [Function].
60    pub fn mt(use_multiple_threads: bool) -> Self {
61        Self { use_multiple_threads, ..Default::default() }
62    }
63
64    /// Returns configuration that uses custom [`cache_threshold`](BuildConf::cache_threshold) to build [Function].
65    pub fn ct(cache_threshold: usize) -> Self {
66        Self { cache_threshold, ..Default::default() }
67    }
68
69    /// Returns configuration that uses custom [`cache_threshold`](BuildConf::cache_threshold) and
70    /// potentially uses [multiple threads](BuildConf::use_multiple_threads) to build [Function].
71    pub fn ct_mt(cache_threshold: usize, use_multiple_threads: bool) -> Self {
72        Self { use_multiple_threads, cache_threshold, ..Default::default() }
73    }
74
75    /// Returns configuration that uses at each level a bit-array
76    /// of size [`relative_level_size`](BuildConf::relative_level_size)
77    /// given as a percent of number of level input keys.
78    pub fn lsize(relative_level_size: u16) -> Self {
79        Self { relative_level_size, ..Default::default() }
80    }
81
82    /// Returns configuration that uses custom [`relative_level_size`](BuildConf::relative_level_size)
83    /// and [`cache_threshold`](BuildConf::cache_threshold) to build [Function].
84    pub fn lsize_ct(relative_level_size: u16, cache_threshold: usize) -> Self {
85        Self { relative_level_size, cache_threshold, ..Default::default() }
86    }
87
88    /// Returns configuration that potentially uses [multiple threads](BuildConf::use_multiple_threads) and
89    /// at each level a bit-array of size [`relative_level_size`](BuildConf::relative_level_size)
90    /// given as a percent of number of level input keys.
91    pub fn lsize_mt(relative_level_size: u16, use_multiple_threads: bool) -> Self {
92        Self { relative_level_size, use_multiple_threads, ..Default::default() }
93    }
94}
95
96impl<S> BuildConf<S> {
97    /// The default value for [`relative_level_size`](BuildConf::relative_level_size),
98    /// which results in building the cache with a maximum size of 1GB.
99    pub const DEFAULT_CACHE_THRESHOLD: usize = 1024*1024*128; // *8 bytes = 1GB
100
101    /// Returns configuration that uses custom [`hash_builder`](BuildConf::hash_builder).
102    pub fn hash(hash_builder: S) -> Self {
103        Self { hash_builder, cache_threshold: Self::DEFAULT_CACHE_THRESHOLD, relative_level_size: 100, use_multiple_threads: true }
104    }
105
106    /// Returns configuration that uses custom [`hash_builder`](BuildConf::hash_builder) and [`relative_level_size`](BuildConf::relative_level_size).
107    pub fn hash_lsize(hash_builder: S, relative_level_size: u16) -> Self {
108        Self { relative_level_size, ..Self::hash(hash_builder) }
109    }
110
111    /// Returns configuration that uses custom [`hash_builder`](BuildConf::hash_builder), [`relative_level_size`](BuildConf::relative_level_size)
112    /// and potentially uses [multiple threads](BuildConf::use_multiple_threads) to build [Function].
113    pub fn hash_lsize_mt(hash_builder: S, relative_level_size: u16, use_multiple_threads: bool) -> Self {
114        Self { relative_level_size, hash_builder, use_multiple_threads, cache_threshold: Self::DEFAULT_CACHE_THRESHOLD }
115    }
116
117    /// Returns configuration that uses custom [`hash_builder`](BuildConf::hash_builder),
118    /// [`relative_level_size`](BuildConf::relative_level_size) and [`cache_threshold`](BuildConf::cache_threshold)
119    /// to build [Function].
120    pub fn hash_lsize_ct(hash_builder: S, relative_level_size: u16, cache_threshold: usize) -> Self {
121        Self { relative_level_size, hash_builder, use_multiple_threads: true, cache_threshold }
122    }
123
124    /// Returns configuration that uses custom [`hash_builder`](BuildConf::hash_builder),
125    /// [`relative_level_size`](BuildConf::relative_level_size), [`cache_threshold`](BuildConf::cache_threshold)
126    /// and potentially uses [multiple threads](BuildConf::use_multiple_threads) to build [Function].
127    pub fn hash_lsize_ct_mt(hash_builder: S, relative_level_size: u16, cache_threshold: usize, use_multiple_threads: bool) -> Self {
128        Self { relative_level_size, hash_builder, use_multiple_threads, cache_threshold }
129    }
130}
131
/// Sets bit number `bit_index` in `result`; if that bit was already set, additionally sets it in `collision`.
#[inline]
pub(crate) fn fphash_add_bit(result: &mut [u64], collision: &mut [u64], bit_index: usize) {
    let segment = bit_index / 64;           // which 64-bit word holds the bit
    let mask = 1u64 << (bit_index % 64);    // bit position within that word
    if result[segment] & mask != 0 {
        collision[segment] |= mask;         // second (or later) key landed here
    }
    result[segment] |= mask;
}
141
/// Atomically sets bit number `bit_index` in `result`; if it was already set, additionally sets it in `collision`.
///
/// Thread-safe counterpart of `fphash_add_bit`; uses relaxed atomics, as only the final
/// contents of the arrays matter.
#[inline]
pub(crate) fn fphash_sync_add_bit(result: &[AtomicU64], collision: &[AtomicU64], bit_index: usize) {
    let segment = bit_index / 64;
    let mask = 1u64 << (bit_index % 64);
    // fetch_or returns the previous word; a set bit there means another key already claimed it
    if result[segment].fetch_or(mask, Relaxed) & mask != 0 {
        collision[segment].fetch_or(mask, Relaxed);
    }
}
152
153/// Remove from bit-array `result` all bits that are set in `collision`.
154pub(crate) fn fphash_remove_collided(result: &mut LevelArray, collision: &[u64]) {
155    for (r, c) in result.iter_mut().zip(collision.iter()) {
156        *r &= !c;
157    }
158}
159
/// Bit array representing a single level. On most targets it is a plain boxed slice of `u64`;
/// on 32-bit targets with the `aligned-vec` feature it is over-aligned to `AtomicU64` alignment
/// so that `from_mut_slice` can reinterpret it as `[AtomicU64]`.
#[cfg(not(all(feature = "aligned-vec", target_pointer_width = "32")))] pub (crate) type LevelArray = Box<[u64]>;
#[cfg(all(feature = "aligned-vec", target_pointer_width = "32"))] pub (crate) type LevelArray = aligned_vec::ABox<[u64], aligned_vec::ConstAlign<{std::mem::align_of::<AtomicU64>()}>>;
/// Returns a zero-initialized [`LevelArray`] of `size_segments` 64-bit segments.
pub(crate) fn level_array_for(size_segments: usize) -> LevelArray {
    #[cfg(not(all(feature = "aligned-vec", target_pointer_width = "32")))] { vec![0u64; size_segments].into_boxed_slice() }
    #[cfg(all(feature = "aligned-vec", target_pointer_width = "32"))] { aligned_vec::avec![[{std::mem::align_of::<AtomicU64>()}] | 0u64; size_segments].into_boxed_slice() }
}
166
167pub(crate) fn concat_level_arrays(levels: Vec<LevelArray>) -> Box<[u64]> {
168    let result_len = levels.iter().map(|l| l.len()).sum();
169    let mut result = Vec::with_capacity(result_len);
170    for a in levels { result.extend_from_slice(&a) }
171    result.into_boxed_slice()
172}
173
/// Cast `v` to slice of `AtomicU64`.
#[inline]
pub(crate) fn from_mut_slice(v: &mut LevelArray) -> &mut [AtomicU64] {
    // Compile-time alignment check: the empty pattern `[]` only matches an array of length 0,
    // so this line compiles only when `align_of::<AtomicU64>() == align_of::<u64>()`
    // (a larger `u64` alignment would make the const subtraction fail to evaluate).
    // On 32-bit targets with `aligned-vec` the check is skipped; there the over-aligned
    // LevelArray provides the required alignment instead.
    #[cfg(not(all(feature = "aligned-vec", target_pointer_width = "32")))]
    let [] = [(); core::mem::align_of::<AtomicU64>() - core::mem::align_of::<u64>()];
    // SAFETY: `u64` and `AtomicU64` have the same size and (per the check above or the
    // over-aligned LevelArray) compatible alignment; the exclusive borrow of `v` guarantees
    // unique access for the lifetime of the returned slice.
    unsafe { &mut *(v.as_mut() as *mut [u64] as *mut [AtomicU64]) }
}   // copied from unstable rust, from_mut_slice, commit #94816, issue #76314
181
/// Cast `v` to slice of `u64`.
#[inline]
pub(crate) fn get_mut_slice(v: &mut [AtomicU64]) -> &mut [u64] {
    // SAFETY: `AtomicU64` has the same in-memory representation as `u64`, and the mutable
    // reference guarantees unique ownership, so no other thread can access the data.
    unsafe { &mut *(v as *mut [AtomicU64] as *mut [u64]) }
}   // copied from unstable rust, get_mut_slice, commit #94816, issue #76314
188
189// Remove from bit-array `result` all bits that are set in `collision`. Uses multiple threads.
190/*pub(crate) fn fphash_par_remove_collided(result: &mut Box<[u64]>, collision: &[u64]) {
191    result.par_iter_mut().zip(collision.par_iter()).for_each(|(r, c)| {
192        *r &= !c;
193    });
}*/ // works, but the difference is negligible
195
196/// Returns the index of `key` at level with given `seed` and size (`level_size`), using given (seeded) `hash` method.
197#[inline(always)] fn index(key: &impl Hash, hash: &impl BuildSeededHasher, seed: u64, level_size: usize) -> usize {
198    utils::map64_to_64(hash.hash_one(key, seed), level_size as u64) as usize
199}
200
/// Helper structure for building fingerprinting-based minimal perfect hash function (FMPH).
struct Builder<S> {
    arrays: Vec::<LevelArray>,  // bit arrays of the levels built so far
    input_size: usize,          // number of keys not yet resolved by the levels in `arrays`
    conf: BuildConf<S>          // build configuration
}
207
208impl<S: BuildSeededHasher + Sync> Builder<S> {
209    pub fn new<K>(mut conf: BuildConf<S>, keys: &impl KeySet<K>) -> Self {
210        if conf.use_multiple_threads { conf.use_multiple_threads = rayon::current_num_threads() > 1; }
211        Self {
212            arrays: Vec::<LevelArray>::new(),
213            input_size: keys.keys_len(),
214            conf
215        }
216    }
217
218    /// Returns whether `key` is retained (`false` if it is already hashed at the levels built so far).
219    pub fn retained<K>(&self, key: &K) -> bool where K: Hash {
220        self.arrays.iter().enumerate()
221            .all(|(seed, a)| !a.get_bit(index(key, &self.conf.hash_builder, seed as u64, a.len() << 6)))
222    }
223
224    /// Build level for given sequence of key indices using a single thread
225    fn build_array_for_indices_st(&self, bit_indices: &[usize], level_size_segments: usize) -> LevelArray
226    {
227        let mut result = level_array_for(level_size_segments);
228        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
229        for bit_index in bit_indices {
230            fphash_add_bit(&mut result, &mut collision, *bit_index);
231        };
232        fphash_remove_collided(&mut result, &collision);
233        result
234    }
235
236    /// Build level for given sequence of key indices possibly using multiple threads.
237    fn build_array_for_indices(&self, bit_indices: &[usize], level_size_segments: usize) -> LevelArray
238    {
239        if !self.conf.use_multiple_threads {
240            return self.build_array_for_indices_st(bit_indices, level_size_segments)
241        }
242        let mut result = level_array_for(level_size_segments);
243        let result_atom = from_mut_slice(&mut result);
244        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
245        bit_indices.par_iter().for_each(
246            |bit_index| fphash_sync_add_bit(&result_atom, &collision, *bit_index)
247        );
248        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
249        result
250    }
251
252    /// Builds level using a single thread.
253    fn build_level_st<K>(&self, keys: &impl KeySet<K>, level_size_segments: usize, seed: u64) -> LevelArray
254        where K: Hash
255    {
256        let mut result = level_array_for(level_size_segments);
257        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
258        let level_size = level_size_segments * 64;
259        keys.for_each_key(
260            |key| fphash_add_bit(&mut result, &mut collision, index(key, &self.conf.hash_builder, seed, level_size)),
261            |key| self.retained(key)
262        );
263        fphash_remove_collided(&mut result, &collision);
264        result
265    }
266
267    /// Builds level possibly (if `keys` can be iterated in parallel) using multiple threads
268    fn build_level<K>(&self, keys: &impl KeySet<K>, level_size_segments: usize, seed: u64) -> LevelArray
269        where K: Hash + Sync
270    {
271        if !(self.conf.use_multiple_threads && keys.has_par_for_each_key()) {
272            return self.build_level_st(keys, level_size_segments, seed);
273        }
274        let mut result = level_array_for(level_size_segments);
275        let result_atom = from_mut_slice(&mut result);
276        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
277        let level_size = level_size_segments * 64;
278        keys.par_for_each_key(
279            |key| fphash_sync_add_bit(&&result_atom, &collision, index(key, &self.conf.hash_builder, seed, level_size)),
280            |key| self.retained(key)
281        );
282        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
283        result
284    }
285
286    /// Returns number of the level about to build (number of levels built so far).
287    #[inline(always)] fn level_nr(&self) -> u64 { self.arrays.len() as u64 }
288
289    fn build_levels<K, BS>(&mut self, keys: &mut impl KeySet<K>, stats: &mut BS)
290        where K: Hash + Sync, BS: stats::BuildStatsCollector
291    {
292        let mut levels_without_reduction = 0;   // number of levels without any reduction in number of the keys
293        while self.input_size != 0 {
294            let level_size_segments = ceiling_div(self.input_size * self.conf.relative_level_size as usize, 64*100);
295            let level_size = level_size_segments * 64;
296            stats.level(self.input_size, level_size);
297            let seed = self.level_nr();
298            let array = if self.input_size < self.conf.cache_threshold {
299                // build level with hash caching:
300                let bit_indices = keys.maybe_par_map_each_key(
301                    |key| index(key, &self.conf.hash_builder, seed, level_size),
302                    |key| self.retained(key),
303                    self.conf.use_multiple_threads
304                );
305                let array = self.build_array_for_indices(&bit_indices, level_size_segments);
306                keys.maybe_par_retain_keys_with_indices(
307                    |i| !array.get_bit(bit_indices[i]),
308                    |key| !array.get_bit(index(key, &self.conf.hash_builder, seed, level_size)),
309                    |key| self.retained(key),
310                    || array.count_bit_ones(),
311                    self.conf.use_multiple_threads
312                );
313                array
314            } else {
315                // build level without hash caching:
316                let current_array = self.build_level(keys, level_size_segments, seed);
317                keys.maybe_par_retain_keys(
318                    |key| !current_array.get_bit(index(key, &self.conf.hash_builder, seed, level_size)),
319                    |key| self.retained(key),
320                    || current_array.count_bit_ones(),
321                    self.conf.use_multiple_threads               
322                );
323                current_array
324            };
325            self.arrays.push(array);
326            let prev_input_size = self.input_size;
327            self.input_size = keys.keys_len();
328            if self.input_size == prev_input_size {
329                levels_without_reduction += 1;
330                if levels_without_reduction == 10 {
331                    self.arrays.truncate(self.arrays.len()-levels_without_reduction);
332                    stats.end(self.input_size);
333                    break;
334                }
335            } else {
336                levels_without_reduction = 0;
337            }
338        }
339        stats.end(0)
340    }
341
342    pub fn finish(self) -> Function<S> {
343        let level_sizes = self.arrays.iter().map(|l| l.len()).collect();
344        //let (array, _)  = ArrayWithRank::build(self.arrays.concat().into_boxed_slice());
345        let (array, _)  = ArrayWithRank::build(concat_level_arrays(self.arrays));
346        Function::<S> {
347            array,
348            level_sizes,
349            hash_builder: self.conf.hash_builder
350        }
351    }
352
353    /*fn fphash_build_level_MT2<K, S>(hash: &S, keys: &[K], level_size_segments: u32, seed: u32, thread_pool: &Option<ThreadPool>) -> Box<[u64]>
354        where S: BuildSeededHasher + Sync, K: Hash + Sync
355    {
356        if let Some(thread_pool) = thread_pool {
357            let level_size_segments = level_size_segments as usize;
358            let level_size = level_size_segments * 64;
359            let (mut result, collisions) = thread_pool.install(|| {
360                keys.into_par_iter()
361                //keys.par_chunks(100)
362                    .fold(|| (vec![0u64; level_size_segments].into_boxed_slice(),
363                         vec![0u64; level_size_segments].into_boxed_slice()),
364                    |(mut a, mut c), k| {
365                        //for k in keys { // for par_chunks
366                            let bit = utils::map64_to_64(hash.hash_one(k, seed), level_size as u64) as usize;
367                            fphash_add_bit(&mut a, &mut c, bit);
368                        //}   //  for par_chunks
369                        (a, c)
370                    }
371                ).reduce_with(|(mut arr1, mut col1), (arr2, col2)| {
372                    for (((a1, c1), a2), c2) in arr1.iter_mut().zip(col1.iter_mut()).zip(arr2.into_iter()).zip(col2.into_iter()) {
373                        *c1 |= *c2 | (*a1 & a2);
374                        *a1 |= a2;
375                    };
376                    (arr1, col1)
377                }).unwrap()
378            });
379            fphash_remove_collided(&mut result, &collisions);
380            result
381        } else {
382            fphash_build_level(hash, keys, level_size_segments, seed)
383        }
384    }*/
385}
386
387
/// Fingerprinting-based minimal perfect hash function (FMPH).
///
/// See:
/// - P. Beling, *Fingerprinting-based minimal perfect hashing revisited*, ACM Journal of Experimental Algorithmics, 2023, <https://doi.org/10.1145/3596453>
/// - A. Limasset, G. Rizk, R. Chikhi, P. Peterlongo, *Fast and Scalable Minimal Perfect Hashing for Massive Key Sets*, SEA 2017
#[derive(Clone)]
pub struct Function<S = BuildDefaultSeededHasher> {
    array: ArrayWithRank,       // concatenated bit arrays of all levels, with rank support
    level_sizes: Box<[usize]>,  // size of each successive level, in 64-bit segments
    hash_builder: S             // family of seeded hash functions used by the levels
}
399
impl<S: BuildSeededHasher> GetSize for Function<S> {
    // Note: only `array` and `level_sizes` are counted; any dynamic memory
    // owned by `hash_builder` is not included.
    fn size_bytes_dyn(&self) -> usize { self.array.size_bytes_dyn() + self.level_sizes.size_bytes_dyn() }
    fn size_bytes_content_dyn(&self) -> usize { self.array.size_bytes_content_dyn() + self.level_sizes.size_bytes_content_dyn() }
    const USES_DYN_MEM: bool = true;
}
405
impl<S: BuildSeededHasher> Function<S> {

    /// Returns index of the key `k` at the level of the given number (`level_nr`) and `size` (in bits).
    #[inline(always)] fn index<K: Hash + ?Sized>(&self, k: &K, level_nr: u64, size: usize) -> usize {
        // The level number doubles as the seed of the hash function.
        //utils::map64_to_32(self.hash_builder.hash_one(k, level_nr), size as u32) as usize
        utils::map64_to_64(self.hash_builder.hash_one(k, level_nr), size as u64) as usize
    }

    /// Gets the value associated with the given `key` and reports statistics to `access_stats`.
    /// 
    /// The returned value is in the range from `0` (inclusive) to the number of elements in the input key collection (exclusive).
    /// If the `key` was not in the input key collection given during construction,
    /// either [`None`] or an undetermined value from the specified range is returned.
    #[inline(always)]
    pub fn get_stats<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> Option<u64> {
        let mut array_begin_index = 0usize; // bit offset of the current level within `array`
        let mut level_nr = 0usize;
        loop {
            // level sizes are stored in 64-bit segments, hence `<< 6`;
            // running out of levels (`get` returns `None`) means the key is unknown
            let level_size = (*self.level_sizes.get(level_nr)?) << 6;
            let i = array_begin_index + self.index(key, level_nr as u64, level_size);
            if self.array.content.get_bit(i) {
                access_stats.found_on_level(level_nr);
                // SAFETY: `get_bit(i)` above already accessed bit `i`, so `i` is in bounds.
                return Some(unsafe{self.array.rank_unchecked(i)} as u64);
            }
            array_begin_index += level_size;
            level_nr += 1;
        }
    }

    /// Gets the value associated with the given `key`.
    /// 
    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
    /// If the `key` was not in the input key collection given during construction,
    /// either [`None`] or an undetermined value from the specified range is returned.
    #[inline(always)] pub fn get<K: Hash + ?Sized>(&self, key: &K) -> Option<u64> {
        self.get_stats(key, &mut ())
    }

    /// Gets the value associated with the given `key` and reports statistics to `access_stats`.
    /// 
    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
    /// If the `key` was not in the input key collection given during construction,
    /// it either panics or returns an undetermined value from the specified range.
    #[inline(always)] pub fn get_stats_or_panic<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> u64 {
        self.get_stats(key, access_stats).expect("Invalid access to an item outside the set given during construction.")
    }

    /// Gets the value associated with the given `key`.
    /// 
    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
    /// If the `key` was not in the input key collection given during construction,
    /// it either panics or returns an undetermined value from the specified range.
    #[inline(always)] pub fn get_or_panic<K: Hash + ?Sized>(&self, key: &K) -> u64 {
        self.get_stats_or_panic(key, &mut ())
    }

    /// Returns number of bytes which `write` will write.
    pub fn write_bytes(&self) -> usize {
        VByte::array_size(&self.level_sizes) + AsIs::array_content_size(&self.array.content)
    }

    /// Writes `self` to the `output`.
    pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
    {
        // level sizes first (variable-byte encoded), then the raw bit array content
        VByte::write_array(output, &self.level_sizes)?;
        AsIs::write_all(output, self.array.content.iter())
    }

    /// Reads `Self` from the `input`. Hasher must be the same as the one used to write.
    pub fn read_with_hasher(input: &mut dyn io::Read, hasher: S) -> io::Result<Self>
    {
        let level_sizes = VByte::read_array(input)?;
        // total number of 64-bit segments over all levels determines the content length
        let array_content_len = level_sizes.iter().map(|v|*v as usize).sum::<usize>();
        let array_content = AsIs::read_n(input, array_content_len)?;
        // the rank structure is not serialized; rebuild it from the content
        let (array_with_rank, _) = ArrayWithRank::build(array_content);
        Ok(Self { array: array_with_rank, level_sizes, hash_builder: hasher })
    }

    /// Returns sizes of the successive levels (in 64-bit segments).
    pub fn level_sizes(&self) -> &[usize] {
        &self.level_sizes
    }
}
489
490impl<S: BuildSeededHasher + Sync> Function<S> {
491    /// Constructs [`Function`] for given input `keys`,
492    /// using the build configuration `conf` and reporting statistics with `stats`.
493    /// 
494    /// If the construction fails, it returns `Err` with a triple *(f, k, s)*, where:
495    /// - *f* is a [`Function`] handling only part of the keys
496    ///   (that returns numbers in the interval *[0, s-k.keys_len())*);
497    /// - *k* is a set of the remaining keys,
498    /// - *s* is the initial number of keys.
499    /// If needed, the keys from *k* can be placed in another data structure to handle all the keys.
500    /// 
501    /// If the construction fails, it is almost certain that the input contains either duplicate keys
502    /// or keys indistinguishable by any hash function from the family used.
503    /// The duplicate keys will be included in the *k* set.
504    pub fn try_with_conf_stats_or_partial<K, BS, KS>(mut keys: KS, conf: BuildConf<S>, stats: &mut BS) -> Result<Self, (Self, KS, usize)>
505        where K: Hash + Sync, KS: KeySet<K>, BS: stats::BuildStatsCollector
506    {
507        let mut builder = Builder::new(conf, &keys);
508        let initial_size = builder.input_size;
509        builder.build_levels(&mut keys, stats);
510        if builder.input_size == 0 {
511            drop(keys);
512            Ok(builder.finish())
513        } else {
514            Err((builder.finish(), keys, initial_size))
515        }
516    }
517
518    /// Constructs [`Function`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
519    /// 
520    /// [`None`] is returned if the construction fails.
521    /// Then it is almost certain that the input contains either duplicate keys
522    /// or keys indistinguishable by any hash function from the family used.
523    pub fn try_with_conf_stats<K, BS, KS>(mut keys: KS, conf: BuildConf<S>, stats: &mut BS) -> Option<Self>
524        where K: Hash + Sync, KS: KeySet<K>, BS: stats::BuildStatsCollector
525    {
526        let mut builder = Builder::new(conf, &keys);
527        builder.build_levels(&mut keys, stats);
528        drop(keys);
529        (builder.input_size == 0).then(|| builder.finish())
530    }
531
532    /// Constructs [`Function`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
533    /// 
534    /// Panics if the construction fails.
535    /// Then it is almost certain that the input contains either duplicate keys
536    /// or keys indistinguishable by any hash function from the family used.
537    pub fn with_conf_stats<K, BS>(keys: impl KeySet<K>, conf: BuildConf<S>, stats: &mut BS) -> Self
538    where K: Hash + Sync, BS: stats::BuildStatsCollector
539    {
540        Self::try_with_conf_stats(keys, conf, stats).expect("Constructing fmph::Function failed. Probably the input contains duplicate keys.")
541    }
542
543    /// Constructs [Function] for given `keys`, using the build configuration `conf`.
544    /// 
545    /// Panics if the construction fails.
546    /// Then it is almost certain that the input contains either duplicate keys
547    /// or keys indistinguishable by any hash function from the family used.
548    #[inline] pub fn with_conf<K>(keys: impl KeySet<K>, conf: BuildConf<S>) -> Self
549        where K: Hash + Sync
550    {
551        Self::with_conf_stats(keys, conf, &mut ())
552    }
553
554    /// Constructs [`Function`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
555    /// 
556    /// Panics if the construction fails.
557    /// Then it is almost certain that the input contains either duplicate keys
558    /// or keys indistinguishable by any hash function from the family used.
559    #[inline] pub fn from_slice_with_conf_stats<K, BS>(keys: &[K], conf: BuildConf<S>, stats: &mut BS) -> Self
560        where K: Hash + Sync, BS: stats::BuildStatsCollector
561    {
562        Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, stats)
563    }
564
565    /// Constructs [`Function`] for given `keys`, using the build configuration `conf`.
566    /// 
567    /// Panics if the construction fails.
568    /// Then it is almost certain that the input contains either duplicate keys
569    /// or keys indistinguishable by any hash function from the family used.
570    #[inline] pub fn from_slice_with_conf<K>(keys: &[K], conf: BuildConf<S>) -> Self
571        where K: Hash + Sync
572    {
573        Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, &mut ())
574    }
575
576    /// Constructs [`Function`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
577    /// Note that `keys` can be reordered during construction.
578    /// 
579    /// Panics if the construction fails.
580    /// Then it is almost certain that the input contains either duplicate keys
581    /// or keys indistinguishable by any hash function from the family used.
582
583    #[inline] pub fn from_slice_mut_with_conf_stats<K, BS>(keys: &mut [K], conf: BuildConf<S>, stats: &mut BS) -> Self
584        where K: Hash + Sync, BS: stats::BuildStatsCollector
585    {
586        Self::with_conf_stats(SliceMutSource::new(keys), conf, stats)
587    }
588
589    /// Constructs [`Function`] for given `keys`, using the build configuration `conf`.
590    /// Note that `keys` can be reordered during construction.
591    /// 
592    /// Panics if the construction fails.
593    /// Then it is almost certain that the input contains either duplicate keys
594    /// or keys indistinguishable by any hash function from the family used.
595    #[inline] pub fn from_slice_mut_with_conf<K>(keys: &mut [K], conf: BuildConf<S>) -> Self
596        where K: Hash + Sync
597    {
598        Self::with_conf_stats(SliceMutSource::new(keys), conf, &mut ())
599    }
600}
601
602impl Function {
603    /// Reads `Self` from the `input`.
604    /// Only [Function]s that use default hasher can be read by this method.
605    pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
606        Self::read_with_hasher(input, Default::default())
607    }
608
609    /// Builds [`Function`] for given `keys`, reporting statistics with `stats`.
610    /// 
611    /// Panics if constructing [`Function`] fails.
612    /// Then it is almost certain that the input contains either duplicate keys
613    /// or keys indistinguishable by any hash function from the family used.
614    pub fn with_stats<K, BS>(keys: impl KeySet<K>, stats: &mut BS) -> Self
615        where K: Hash + Sync, BS: stats::BuildStatsCollector
616    {
617        Self::with_conf_stats(keys, Default::default(), stats)
618    }
619
620    /// Builds [`Function`] for given `keys`.
621    /// 
622    /// Panics if constructing [`Function`] fails.
623    /// Then it is almost certain that the input contains either duplicate keys
624    /// or keys indistinguishable by any hash function from the family used.
625    pub fn new<K: Hash + Sync>(keys: impl KeySet<K>) -> Self {
626        Self::with_conf_stats(keys, Default::default(), &mut ())
627    }
628}
629
630impl<S> Function<S> {
631    /// Returns the number of keys in the input collection given during construction.
632    /// 
633    /// The time complexity is proportional to the number returned.
634    pub fn len(&self) -> usize {
635        self.array.content.count_bit_ones()
636    }
637}
638
639impl<K: Hash + Sync> From<&[K]> for Function {
640    fn from(keys: &[K]) -> Self {
641        Self::new(SliceSourceWithRefs::<_, u8>::new(keys))
642    }
643}
644
645impl<K: Hash + Sync + Send> From<Vec<K>> for Function {
646    fn from(keys: Vec<K>) -> Self {
647        Self::new(keys)
648    }
649}
650
#[cfg(test)]
pub(crate) mod tests {
    use super::*;
    use std::fmt::Display;
    use crate::utils::{tests::test_mphf_u64, verify_phf};

    /// Serializes `h`, checks the serialized size against `write_bytes`,
    /// then deserializes and compares the result with the original.
    fn test_read_write(h: &Function) {
        let mut buff = Vec::new();
        h.write(&mut buff).unwrap();
        assert_eq!(buff.len(), h.write_bytes());
        let read = Function::read(&mut &buff[..]).unwrap();
        assert_eq!(h.level_sizes.len(), read.level_sizes.len());
        assert_eq!(h.array.content, read.array.content);
    }

    /// Builds a single-threaded [`Function`] over `to_hash` and checks that it is
    /// a valid MPHF, that it round-trips through (de)serialization, and its length.
    fn test_with_input<K: Hash + Clone + Display + Sync>(to_hash: &[K]) {
        let h = Function::from_slice_with_conf(to_hash, BuildConf::mt(false));
        test_mphf_u64(to_hash, |key| h.get(key));
        test_read_write(&h);
        assert_eq!(h.len(), to_hash.len());
    }

    #[test]
    fn test_small() {
        test_with_input(&[1, 2, 5]);
        test_with_input(&(-50..150).collect::<Vec<_>>());
        test_with_input(&['a', 'b', 'c', 'd']);
    }

    #[test]
    fn test_large_size() {
        let keys = (-20000..20000).collect::<Vec<_>>();
        // expected sizes (bits/key): ~2.8 for level size 100%, ~3.4 for 200%
        assert!(Function::from(&keys[..]).size_bytes() as f64 * (8.0/40000.0) < 2.9);
        assert!(Function::from_slice_with_conf(&keys[..], BuildConf::lsize(200)).size_bytes() as f64 * (8.0/40000.0) < 3.5);
    }

    #[test]
    fn test_dynamic() {
        const LEN: u64 = 50_000;
        let f = Function::new(
            crate::fmph::keyset::CachedKeySet::dynamic(|| 0..LEN, 10_000));
        verify_phf(LEN as usize, 0..LEN, |key| f.get(key).map(|v| v as usize));
        assert!(f.size_bytes() as f64 * (8.0/LEN as f64) < 2.9);
    }

    #[test]
    fn test_dynamic_par() {
        const LEN: u64 = 50_000;
        let f = Function::new(
            crate::fmph::keyset::CachedKeySet::dynamic((|| 0..LEN, || (0..LEN).into_par_iter()), 10_000));
        verify_phf(LEN as usize, 0..LEN, |key| f.get(key).map(|v| v as usize));
        assert!(f.size_bytes() as f64 * (8.0/LEN as f64) < 2.9);
    }

    #[test]
    #[ignore = "uses much memory and time"]
    fn test_fmph_for_over_2to32_keys() {
        const LEN: u64 = 5_000_000_000;
        let f = Function::with_stats(
            crate::fmph::keyset::CachedKeySet::dynamic(|| 0..LEN, usize::MAX/*1_000_000_000*/),
            &mut crate::stats::BuildStatsPrinter::stdout());
        verify_phf(LEN as usize, 0..LEN, |key| f.get(key).map(|v| v as usize));
        assert!(f.size_bytes() as f64 * (8.0/LEN as f64) < 2.9);
    }

    #[test]
    fn test_duplicates() {
        assert!(Function::try_with_conf_stats(vec![1, 1], Default::default(), &mut ()).is_none());
        assert!(Function::try_with_conf_stats(vec![1, 2, 3, 1, 4], Default::default(), &mut ()).is_none());
    }

    #[test]
    fn test_duplicates_partial() {
        let keys = vec![1, 2, 3, 1, 4];
        let expected_initial_len = keys.len();
        let r = Function::try_with_conf_stats_or_partial(keys, Default::default(), &mut ());
        if let Err((mphf, mut remaining, initial_len)) = r {
            assert_eq!(initial_len, expected_initial_len);
            remaining.sort();
            assert_eq!(remaining, vec![1, 1]);
            // the partial function must still be a valid MPHF for the distinguishable keys
            test_mphf_u64(&[2, 3, 4], |key| mphf.get(key));
        } else {
            // `assert!(false)` replaced: a panic with a message explains the failure
            panic!("expected Err with a partial function and the remaining duplicate keys");
        }
    }
}