ph/fmph/
gofunction.rs

1use std::hash::Hash;
2use binout::{VByte, Serializer, AsIs};
3use bitm::{BitAccess, Rank, ceiling_div};
4
5use crate::seeds::{Bits8, SeedSize, TwoToPowerBitsStatic};
6use crate::utils::{ArrayWithRank, read_bits};
7use crate::{BuildDefaultSeededHasher, BuildSeededHasher, stats};
8
9use super::function::{concat_level_arrays, from_mut_slice, get_mut_slice, level_array_for, LevelArray};
10use super::goindexing::GroupSize;
11use std::io;
12use std::sync::atomic::AtomicU64;
13use dyn_size_of::GetSize;
14use crate::fmph::function::{fphash_add_bit, fphash_remove_collided, fphash_sync_add_bit};
15use crate::fmph::goindexing::group_nr;
16
17use rayon::prelude::*;
18use crate::fmph::keyset::{KeySet, SliceMutSource, SliceSourceWithRefs};
19
20/// Configuration of family of (group-optimized) hash functions used by [`GOFunction`] and accepted by [`GOBuildConf`] constructors.
21/// 
22/// Good configurations can be obtained by calling one of the following functions:
23/// [default_biggest](GOConf::default_biggest), [default_bigger](GOConf::default_bigger),
24/// [default](GOConf::default), [default_smallest](GOConf::default_smallest).
25/// These functions are listed in order of increasing performance (in terms of size and evaluation speed)
26/// and time to construct the minimum perfect hash function.
27/// More details are included in their documentation and the paper:
28/// P. Beling, *Fingerprinting-based minimal perfect hashing revisited*, ACM Journal of Experimental Algorithmics, 2023, <https://doi.org/10.1145/3596453>
29#[derive(Clone)]
30pub struct GOConf<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
31    /// The family of hash functions used by the constructed [`GOFunction`]. (default: [`BuildDefaultSeededHasher`])
32    pub hash_builder: S,
33    /// Size of seeds (in bits). (default: 4)
34    pub bits_per_seed: SS,
35    /// Size of groups (in bits). (default: 16)
36    pub bits_per_group: GS
37}
38
39impl GOConf<TwoToPowerBitsStatic::<3>, TwoToPowerBitsStatic::<0>, BuildDefaultSeededHasher> {
40    /// Creates a configuration in which the seed and group sizes are 1 and 8 bits respectively,
41    /// which (when relative level size is 100) leads to a minimum perfect hash function whose:
42    /// - size is about 2.52 bits per input key,
43    /// - the expected number of levels visited during the evaluation is about 2.18,
44    /// - construction takes about 4 times less time compared to the [default](GOConf::default) configuration.
45    pub fn default_biggest() -> Self {
46        Self {
47            hash_builder: Default::default(),
48            bits_per_seed: Default::default(),
49            bits_per_group: Default::default()
50        }
51    }
52}
53
54impl GOConf<TwoToPowerBitsStatic::<4>, TwoToPowerBitsStatic::<1>, BuildDefaultSeededHasher> {
55    /// Creates a configuration in which the seed and group sizes are 2 and 16 bits respectively,
56    /// which (when relative level size is 100) leads to a minimum perfect hash function whose:
57    /// - size is about 2.36 bits per input key,
58    /// - the expected number of levels visited during the evaluation is about 2.04,
59    /// - construction takes about 3 times less time compared to the [default](GOConf::default) configuration.
60    pub fn default_bigger() -> Self {
61        Self {
62            hash_builder: Default::default(),
63            bits_per_seed: Default::default(),
64            bits_per_group: Default::default()
65        }
66    }
67}
68
69impl Default for GOConf {
70    /// Creates a configuration in which the seed and group sizes are 4 and 16 bits respectively,
71    /// which (when relative level size is 100) leads to a minimum perfect hash function whose:
72    /// - size is about 2.21 bits per input key,
73    /// - the expected number of levels visited during the evaluation is about 1.73.
74    fn default() -> Self {
75        Self {
76            hash_builder: Default::default(),
77            bits_per_seed: Default::default(),
78            bits_per_group: Default::default()
79        }
80    }
81}
82
83impl GOConf<TwoToPowerBitsStatic::<5>, Bits8, BuildDefaultSeededHasher> {
84    /// Creates a configuration in which the seed and group sizes are 8 and 32 bits respectively,
85    /// which (when relative level size is 100) leads to a minimum perfect hash function whose:
86    /// - size is about 2.10 bits per input key,
87    /// - the expected number of levels visited during the evaluation is about 1.64,
88    /// - construction takes about 13 times longer compared to the [default](GOConf::default) configuration.
89    pub fn default_smallest() -> Self {
90        Self {
91            hash_builder: Default::default(),
92            bits_per_seed: Default::default(),
93            bits_per_group: Default::default()
94        }
95    }
96}
97
98impl<GS: GroupSize, SS: SeedSize> GOConf<GS, SS> {
99    /// Returns a configuration that uses seeds and groups of the sizes given in bits.
100    pub fn bps_bpg(bits_per_seed: SS, bits_per_group: GS) -> Self {
101        Self {
102            hash_builder: Default::default(),
103            bits_per_seed,
104            bits_per_group,
105        }
106    }
107}
108
109impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOConf<GS, SS, S> {
110    /// Panics if the configuration is incorrect.
111    pub fn validate(&self) {
112        self.bits_per_seed.validate().unwrap();
113        self.bits_per_group.validate().unwrap();
114    }
115
116    /// Returns a configuration that uses given family of hash functions and seeds and groups of the sizes given in bits.
117    pub fn hash_bps_bpg(hash_builder: S, bits_per_seed: SS, bits_per_group: GS) -> Self {
118        Self { hash_builder, bits_per_seed, bits_per_group }  // 1<<6=64
119    }
120
121    /// Returns array index for given `hash` of key, size of level in groups, and group seed provided by `group_seed`.
122    #[inline(always)] pub fn hash_index<GetGroupSeed>(&self, hash: u64, level_size_groups: u64, group_seed: GetGroupSeed) -> usize
123        where GetGroupSeed: FnOnce(u64) -> u16  // returns group seed for group with given index
124    {
125        let group = group_nr(hash, level_size_groups);
126        self.bits_per_group.bit_index_for_seed(hash, group_seed(group), group)
127    }
128
129    /// Returns array index for given `key`, seed and size (in groups) of level, and group seed provided by `group_seed`.
130    #[inline(always)] pub fn key_index<GetGroupSeed, K>(&self, key: &K, level_seed: u32, level_size_groups: u64, group_seed: GetGroupSeed) -> usize
131        where GetGroupSeed: FnOnce(u64) -> u16, K: Hash
132    {
133        self.hash_index(self.hash_builder.hash_one(key, level_seed), level_size_groups, group_seed)
134    }
135
136    /// Returns fingerprint array for given hashes of keys, level size, and group seeds (given as a function that returns seeds for provided group indices).
137    fn build_array_for_hashes(&self, key_hashes: &[u64], level_size_segments: usize, level_size_groups: u64, group_seed: u16) -> LevelArray
138    {
139        let mut result = level_array_for(level_size_segments);
140        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
141        for hash in key_hashes {
142            fphash_add_bit(&mut result, &mut collision, self.hash_index(*hash, level_size_groups, |_| group_seed));
143        };
144        fphash_remove_collided(&mut result, &collision);
145        result
146    }
147}
148
149impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOConf<GS, SS, S> {
150    /// Returns fingerprint array for given hashes of keys, level size, and group seeds (given as a function that returns seeds for provided group indices).
151    fn build_array_for_hashes_mt(&self, key_hashes: &[u64], level_size_segments: usize, level_size_groups: u64, group_seed: u16) -> LevelArray
152    {
153        let mut result = level_array_for(level_size_segments);
154        let result_atom = from_mut_slice(&mut result);
155        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
156        key_hashes.par_iter().for_each(
157            |hash| fphash_sync_add_bit(&result_atom, &collision, self.hash_index(*hash, level_size_groups, |_| group_seed))
158        );
159        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
160        result
161    }
162}
163
164/// Build configuration that is accepted by [`GOFunction`] constructors.
165///
166/// See field descriptions for details.
167#[derive(Clone)]
168pub struct GOBuildConf<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
169    /// The threshold for the number of keys below which their hashes will be cached during level construction.
170    /// (default: [`GOBuildConf::DEFAULT_CACHE_THRESHOLD`])
171    /// 
172    /// Caching speeds up level construction at the expense of memory consumption during construction
173    /// (caching a single key requires 8 bytes of memory).
174    /// Caching is particularly recommended for keys with complex types whose hashing is slow.
175    /// It is possible to use a value of `0` to disable caching completely, or [`usize::MAX`] to use it on all levels.
176    pub cache_threshold: usize,
177
178    /// Size of each level given as a percentage of the number of level input keys. (default: `100`)
179    /// 
180    /// A value of 100 minimizes the size of the constructed minimum perfect hash function.
181    /// Larger values speed up evaluation at the expense of increased size.
182    /// It does not make sense to use values below 100.
183    pub relative_level_size: u16,
184
185    /// Whether to use multiple threads during construction. (default: `true`)
186    /// 
187    /// If `true`, the construction will be performed using the default [rayon] thread pool.
188    pub use_multiple_threads: bool,
189
190    /// Configuration of family of (group-optimized) hash functions (default: [`GOConf::default`]).
191    pub goconf: GOConf<GS, SS, S>,
192}   // TODO introduce trait to make other builders possible
193
194impl Default for GOBuildConf {
195    fn default() -> Self { Self::new(Default::default()) }
196}
197
198impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> From<GOConf<GS, SS, S>> for GOBuildConf<GS, SS, S> {
199    #[inline] fn from(value: GOConf<GS, SS, S>) -> Self { Self::new(value) }
200}
201
202impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOBuildConf<GS, SS, S>
203{
204    /// The default value for [`relative_level_size`](GOBuildConf::relative_level_size),
205    /// which results in building the cache with a maximum size of 1GB.
206    pub const DEFAULT_CACHE_THRESHOLD: usize = 1024*1024*128; // *8 bytes = max 1GB for pre-hashing
207
208    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf),
209    /// [relative level size](GOBuildConf::relative_level_size), [cache threshold](GOBuildConf::cache_threshold)
210    /// and potentially enabled [multiple threads](GOBuildConf::use_multiple_threads).
211    pub fn with_lsize_ct_mt(goconf: GOConf<GS, SS, S>, relative_level_size: u16, cache_threshold: usize, use_multiple_threads: bool) -> Self {
212        Self {
213            cache_threshold,
214            relative_level_size,
215            use_multiple_threads,
216            goconf
217        }
218    }
219
220    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf).
221    pub fn new(goconf: GOConf<GS, SS, S>) -> Self {
222        Self::with_lsize_ct_mt(goconf, 100, Self::DEFAULT_CACHE_THRESHOLD, true)
223    }
224
225    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf)
226    /// and [cache threshold](GOBuildConf::cache_threshold).
227    pub fn with_ct(goconf: GOConf<GS, SS, S>, cache_threshold: usize) -> Self {
228        Self::with_lsize_ct_mt(goconf, 100, cache_threshold, true)
229    }
230
231    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf),
232    /// [relative level size](GOBuildConf::relative_level_size) and [cache threshold](GOBuildConf::cache_threshold).
233    pub fn with_lsize_ct(goconf: GOConf<GS, SS, S>, relative_level_size: u16, cache_threshold: usize) -> Self {
234        Self::with_lsize_ct_mt(goconf, relative_level_size, cache_threshold, true)
235    }
236
237    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf)
238    /// and [relative level size](GOBuildConf::relative_level_size).
239    pub fn with_lsize(goconf: GOConf<GS, SS, S>, relative_level_size: u16) -> Self {
240        Self::with_lsize_ct_mt(goconf, relative_level_size, Self::DEFAULT_CACHE_THRESHOLD, true)
241    }
242
243    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf),
244    /// [relative level size](GOBuildConf::relative_level_size)
245    /// and potentially enabled [multiple threads](GOBuildConf::use_multiple_threads).
246    pub fn with_lsize_mt(goconf: GOConf<GS, SS, S>, relative_level_size: u16, use_multiple_threads: bool) -> Self {
247        Self::with_lsize_ct_mt(goconf, relative_level_size, Self::DEFAULT_CACHE_THRESHOLD, use_multiple_threads)
248    }
249
250    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf),
251    /// and potentially enabled [multiple threads](GOBuildConf::use_multiple_threads).
252    pub fn with_mt(goconf: GOConf<GS, SS, S>, use_multiple_threads: bool) -> Self {
253        Self::with_lsize_ct_mt(goconf, 100, Self::DEFAULT_CACHE_THRESHOLD, use_multiple_threads)
254    }
255
256    #[inline(always)] fn last_seed(&self) -> u16 { ((1u32 << self.goconf.bits_per_seed.into())-1) as u16 }
257
258    /// Update `best_array` and `best_seeds` copying groups that are better (have more ones in `array`) from `array` and `array_seed`.
259    fn update_best<GetGroupSeed>(&self, level_size_groups: u64, best_array: &mut [u64], best_seeds: &mut [SS::VecElement], array: &[u64], array_seed: GetGroupSeed)
260        where GetGroupSeed: Fn(u64) -> u16
261    {
262        for group_index in 0..level_size_groups {
263            self.goconf.bits_per_group.copy_group_if_better(best_array, array, group_index as usize,
264                || self.goconf.bits_per_seed.set_seed(best_seeds, group_index as usize, array_seed(group_index))
265            )
266        }
267    }
268
269    /// Build (by calling `build_for_group`) arrays for all group seeds sequentially and select best groups and seeds (which are returned).
270    /// `build_for_group` can use multiple threads internally to build each array.
271    #[inline(always)]
272    fn best_array<AB>(&self, build_for_group: AB, level_size_groups: u64) -> (LevelArray, Box<[SS::VecElement]>)
273        where AB: Fn(u16) -> LevelArray // build array for given group nr
274    {
275        let mut best_array = build_for_group(0);
276        let mut best_seeds = self.goconf.bits_per_seed.new_zeroed_seed_vec(level_size_groups as usize);
277        for group_seed in 1..=self.last_seed() {
278            let with_new_seed = build_for_group(group_seed);
279            self.update_best(level_size_groups, &mut best_array, &mut best_seeds, &with_new_seed, |_| group_seed);
280        }
281        (best_array, best_seeds)
282    }
283}
284
285/// Helper structure for building fingerprinting-based minimal perfect hash function with group optimization (FMPHGO, [`GOFunction`]).
286pub struct GOBuilder<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
287    level_sizes: Vec::<u64>,
288    arrays: Vec::<LevelArray>,
289    group_seeds: Vec::<Box<[SS::VecElement]>>,
290    conf: GOBuildConf<GS, SS, S>
291}   // TODO introduce trait to make other builders possible
292
293impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOBuilder<GS, SS, S>
294{
295    pub fn new(mut conf: GOBuildConf<GS, SS, S>) -> Self {
296        conf.goconf.validate();
297        if conf.use_multiple_threads { conf.use_multiple_threads = rayon::current_num_threads() > 1; }
298        Self {
299            level_sizes: Vec::<u64>::new(),
300            arrays: Vec::<LevelArray>::new(),
301            group_seeds: Vec::<Box<[SS::VecElement]>>::new(),
302            conf
303        }
304    }
305
306    fn push(&mut self, array: LevelArray, seeds: Box<[SS::VecElement]>, size_groups: u64) {
307        self.arrays.push(array);
308        self.group_seeds.push(seeds);
309        self.level_sizes.push(size_groups);
310    }
311
312    /// Returns number the level about to build (number of levels built so far).
313    #[inline(always)] fn level_nr(&self) -> u32 { self.level_sizes.len() as u32 }
314
315    /// Returns whether `key` is retained (`false` if it is already hashed at the levels built so far).
316    fn retained<K>(&self, key: &K) -> bool where K: Hash {
317        self.arrays.iter().zip(self.group_seeds.iter()).zip(self.level_sizes.iter()).enumerate()
318            .all(|(level_seed, ((a, seeds), level_size_groups))| {
319                !a.get_bit(self.conf.goconf.key_index(key, level_seed as u32, *level_size_groups,
320                |group| self.conf.goconf.bits_per_seed.get_seed(seeds, group as usize)))
321            })
322    }
323
324    /// Returns fingerprint array for given keys, level size, and group seeds (given as a function that returns seeds for provided group indices).
325    #[inline(always)]
326    fn build_array<KS, K>(&self, keys: &KS, level_size_segments: usize, level_size_groups: u64, group_seed: u16) -> LevelArray
327        where   // returns group seed for group with given index
328            K: Hash, KS: KeySet<K>
329    {
330        let mut result = level_array_for(level_size_segments);
331        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
332        let level_seed = self.level_nr();
333        keys.for_each_key(|key| fphash_add_bit(&mut result, &mut collision, self.conf.goconf.key_index(key, level_seed, level_size_groups, |_| group_seed)),
334                          |key| self.retained(key));
335        fphash_remove_collided(&mut result, &collision);
336        result
337    }
338
339    /// Returns fingerprint array for given hashes of keys, level size, and group seeds (given as a function that returns seeds for provided group indices).
340    #[inline(always)]
341    fn build_array_mt<KS, K>(&self, keys: &KS, level_size_segments: usize, level_size_groups: u64, group_seed: u16) -> LevelArray
342        where K: Hash, KS: KeySet<K>  // returns group seed for group with given index
343    {
344        if !keys.has_par_for_each_key() {
345            return self.build_array(keys, level_size_segments, level_size_groups, group_seed);
346        }
347        let mut result = level_array_for(level_size_segments);
348        let result_atom = from_mut_slice(&mut result);
349        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
350        let level_seed = self.level_nr();
351        keys.par_for_each_key(
352            |key| fphash_sync_add_bit(&result_atom, &collision, self.conf.goconf.key_index(key, level_seed, level_size_groups, |_| group_seed)),
353            |key| self.retained(key)
354        );
355        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
356        result
357    }
358
359    fn build_next_level_with_cache<KS, K>(&mut self, keys: &mut KS, level_size_groups: u64, level_size_segments: usize)
360        where K: Hash + Sync, KS: KeySet<K> + Sync
361    {
362        let level_seed = self.level_nr();
363        let key_hashes = keys.maybe_par_map_each_key(
364            |k| self.conf.goconf.hash_builder.hash_one(k, level_seed),
365            |key| self.retained(key),
366            self.conf.use_multiple_threads
367        );
368        let (array, seeds) = if self.conf.use_multiple_threads {
369            self.conf.best_array(|g| self.conf.goconf.build_array_for_hashes_mt(&key_hashes, level_size_segments, level_size_groups, g), level_size_groups)
370        } else {
371            self.conf.best_array(|g| self.conf.goconf.build_array_for_hashes(&key_hashes, level_size_segments, level_size_groups, g), level_size_groups)
372        };
373        keys.maybe_par_retain_keys_with_indices(
374            |i| !array.get_bit(
375                self.conf.goconf.hash_index(key_hashes[i], level_size_groups,
376                                     |group| self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize))
377            ),
378            |key| !array.get_bit(
379                self.conf.goconf.key_index(key, level_seed, level_size_groups,
380                                    |group| self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize))
381            ),
382            |key| self.retained(key),
383            || array.count_bit_ones(),
384            self.conf.use_multiple_threads
385        );
386        self.push(array, seeds, level_size_groups);
387    }
388
389    /// Returns true after successful building.
390    fn build_levels<KS, K, BS>(&mut self, keys: &mut KS, stats: &mut BS) -> bool
391    where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
392    {
393        let mut levels_without_reduction = 0;   // number of levels without any reduction in number of the keys
394        let mut input_size = keys.keys_len();
395        while input_size != 0 {
396            let (level_size_groups, level_size_segments) = self.conf.goconf.bits_per_group.level_size_groups_segments(
397                ceiling_div(input_size * self.conf.relative_level_size as usize, 100));
398            //let seed = level_nr;
399            stats.level(input_size, level_size_segments * 64);
400            let level_size_groups = level_size_groups as u64;
401            if input_size < self.conf.cache_threshold {
402                self.build_next_level_with_cache(keys, level_size_groups, level_size_segments);
403            } else {
404                // build level without hash caching:
405                let (array, seeds) = if self.conf.use_multiple_threads {
406                    self.conf.best_array(|g| self.build_array_mt(keys, level_size_segments, level_size_groups, g), level_size_groups)
407                } else {
408                    self.conf.best_array(|g| self.build_array(keys, level_size_segments, level_size_groups, g), level_size_groups)
409                };
410                let level_nr = self.level_nr();
411                keys.maybe_par_retain_keys(
412                    |key| {
413                        let hash = self.conf.goconf.hash_builder.hash_one(key, level_nr);
414                        let group = group_nr(hash, level_size_groups);
415                        let bit_index = self.conf.goconf.bits_per_group.bit_index_for_seed(
416                            hash,
417                            //current_seeds.get_fragment(group as usize, conf.bits_per_group_seed) as u16,
418                            self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize),
419                            group);
420                        !array.get_bit(bit_index)
421                    },
422                    |key| self.retained(key),
423                    || array.count_bit_ones(),
424                    self.conf.use_multiple_threads
425                );
426                self.push(array, seeds, level_size_groups);
427            }
428            let prev_input_size = input_size;
429            input_size = keys.keys_len();
430            if input_size == prev_input_size {
431                levels_without_reduction += 1;
432                if levels_without_reduction == 10 {
433                    let len = self.arrays.len()-levels_without_reduction;
434                    self.arrays.truncate(len);
435                    self.group_seeds.truncate(len);
436                    self.level_sizes.truncate(len);
437                    stats.end(input_size);
438                    return false;
439                }
440            } else {
441                levels_without_reduction = 0;
442            }
443        }
444        stats.end(0);
445        return true;
446    }
447
448    pub fn finish(self) -> GOFunction<GS, SS, S> {
449        let (array, _)  = ArrayWithRank::build(concat_level_arrays(self.arrays));
450        let group_seeds_concatenated = self.conf.goconf.bits_per_seed.concatenate_seed_vecs(&self.level_sizes, self.group_seeds);
451        GOFunction::<GS, SS, S> {
452            array,
453            group_seeds: group_seeds_concatenated,
454            conf: self.conf.goconf,
455            level_sizes: self.level_sizes.into_boxed_slice(),
456        }
457    }
458}
459
460/// Fingerprinting-based minimal perfect hash function with group optimization (FMPHGO).
461///
462/// See:
463/// - P. Beling, *Fingerprinting-based minimal perfect hashing revisited*, ACM Journal of Experimental Algorithmics, 2023, <https://doi.org/10.1145/3596453>
464pub struct GOFunction<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
465    array: ArrayWithRank,
466    group_seeds: Box<[SS::VecElement]>,   //  Box<[u8]>,
467    level_sizes: Box<[u64]>, // number of groups
468    conf: GOConf<GS, SS, S>
469    // 0..01..1 mask with number of ones = group size (in bits)
470    //group_size_mask: u8,
471}
472
473impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GetSize for GOFunction<GS, SS, S> {
474    fn size_bytes_dyn(&self) -> usize {
475        self.array.size_bytes_dyn()
476            //+ self.seeds.len() * std::mem::size_of::<u8>()
477            + self.group_seeds.size_bytes_dyn()
478            + self.level_sizes.size_bytes_dyn()
479    }
480
481    const USES_DYN_MEM: bool = true;
482}
483
484impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOFunction<GS, SS, S> {
485
486    /// Gets the value associated with the given `key` and reports statistics to `access_stats`.
487    /// 
488    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
489    /// If the `key` was not in the input key collection given during construction,
490    /// either [`None`] or an undetermined value from the specified range is returned.
491    pub fn get_stats<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> Option<u64> {
492        let mut groups_before = 0u64;
493        let mut level_nr = 0u32;
494        loop {
495            let level_size_groups = *self.level_sizes.get(level_nr as usize)?;
496            /*let bit_index = self.conf.key_index(key, level_nr, level_size_groups,
497                                                |g| self.conf.bits_per_seed.get_seed(&self.group_seeds, (groups_before + g) as usize)
498            ); // wrong as bit_index_for_seed is called with wrong group */
499            let hash = self.conf.hash_builder.hash_one(key, level_nr);
500            let group = groups_before + group_nr(hash, level_size_groups);
501            let seed = self.conf.bits_per_seed.get_seed(&self.group_seeds, group as usize);
502            let bit_index = self.conf.bits_per_group.bit_index_for_seed(hash, seed, group);
503            if self.array.content.get_bit(bit_index) {
504                access_stats.found_on_level(level_nr);
505                return Some(unsafe{self.array.rank_unchecked(bit_index)} as u64);
506            }
507            groups_before += level_size_groups;
508            level_nr += 1;
509        }
510    }
511
512    /// Gets the value associated with the given `key`.
513    /// 
514    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
515    /// If the `key` was not in the input key collection given during construction,
516    /// either [`None`] or an undetermined value from the specified range is returned.
517    #[inline] pub fn get<K: Hash + ?Sized>(&self, key: &K) -> Option<u64> {
518        self.get_stats(key, &mut ())
519    }
520
521    /// Gets the value associated with the given `key` and reports statistics to `access_stats`.
522    /// 
523    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
524    /// If the `key` was not in the input key collection given during construction,
525    /// it either panics or returns an undetermined value from the specified range.
526    #[inline] pub fn get_stats_or_panic<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> u64 {
527        self.get_stats(key, access_stats).expect("Invalid access to an item outside the set given during construction.")
528    }
529
530    /// Gets the value associated with the given `key` and reports statistics to `access_stats`.
531    /// 
532    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
533    /// If the `key` was not in the input key collection given during construction,
534    /// it either panics or returns an undetermined value from the specified range.
535    #[inline] pub fn get_or_panic<K: Hash + ?Sized>(&self, key: &K) -> u64 {
536        self.get_stats_or_panic(key, &mut ())
537    }
538
539    /// Returns number of bytes which `write` will write.
540    pub fn write_bytes(&self) -> usize {
541        self.conf.bits_per_group.write_size_bytes()
542            + VByte::array_size(&self.level_sizes)
543            + AsIs::array_content_size(&self.array.content)
544            + std::mem::size_of::<u8>() + self.group_seeds.size_bytes_content_dyn()
545    }
546
547    /// Writes `self` to the `output`.
548    pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
549    {
550        self.conf.bits_per_group.write(output)?;
551        VByte::write_array(output, &self.level_sizes)?;
552        AsIs::write_all(output, self.array.content.iter())?;
553        self.conf.bits_per_seed.write_seed_vec(output, &self.group_seeds)
554    }
555
556    /// Reads `Self` from the `input`. Hash builder must be the same as the one used to write.
557    pub fn read_with_hasher(input: &mut dyn io::Read, hash_builder: S) -> io::Result<Self>
558    {
559        let bits_per_group = GS::read(input)?;
560        let level_size = VByte::read_array(input)?;
561        let number_of_groups = level_size.iter().map(|v|*v as usize).sum::<usize>();
562
563        let array_content = read_bits(input, bits_per_group * number_of_groups)?;
564        let (array_with_rank, _) = ArrayWithRank::build(array_content);
565
566        let (bits_per_group_seed, group_seeds) = SS::read_seed_vec(input, number_of_groups)?;
567
568        Ok(Self {
569            array: array_with_rank,
570            group_seeds,
571            level_sizes: level_size,
572            conf: GOConf {
573                bits_per_seed: bits_per_group_seed,
574                bits_per_group,
575                hash_builder
576            },
577        })
578    }
579
580    /// Returns sizes of the successive levels.
581    pub fn level_sizes(&self) -> &[u64] {
582        &self.level_sizes
583    }
584}
585
586impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOFunction<GS, SS, S> {
587    /// Constructs [`GOFunction`] for given input `keys`,
588    /// using the build configuration `conf` and reporting statistics with `stats`.
589    /// 
590    /// If the construction fails, it returns `Err` with a triple *(f, k, s)*, where:
591    /// - *f* is a [`GOFunction`] handling only part of the keys
592    ///   (that returns numbers in the interval *[0, s-k.keys_len())*);
593    /// - *k* is a set of the remaining keys,
594    /// - *s* is the initial number of keys.
595    /// If needed, the keys from *k* can be placed in another data structure to handle all the keys.
596    /// 
597    /// If the construction fails, it is almost certain that the input contains either duplicate keys
598    /// or keys indistinguishable by any hash function from the family used.
599    /// The duplicate keys will be included in the *k* set.
600    pub fn try_with_conf_stats_or_partial<K, KS, BS>(mut keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Result<Self, (Self, KS, usize)>
601        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
602    {
603        let mut builder = GOBuilder::new(conf);
604        let initial_size = keys.keys_len();
605        if builder.build_levels(&mut keys, stats) {
606            drop(keys);
607            Ok(builder.finish())
608        } else {
609            Err((builder.finish(), keys, initial_size))
610        }
611    }
612
613    /// Constructs [`GOFunction`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
614    /// 
615    /// [`None`] is returned if the construction fails.
616    /// Then it is almost certain that the input contains either duplicate keys
617    /// or keys indistinguishable by any hash function from the family used.
618    pub fn try_with_conf_stats<K, KS, BS>(mut keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Option<Self>
619        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
620    {
621        let mut builder = GOBuilder::new(conf);
622        builder.build_levels(&mut keys, stats).then(|| {
623            drop(keys);
624            builder.finish()
625        })
626    }
627
628    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
629    ///
630    /// Panics if the construction fails.
631    pub fn with_conf_stats<K, KS, BS>(keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
632        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
633    {
634        Self::try_with_conf_stats(keys, conf, stats).expect("Constructing fmph::GOFunction failed. Probably the input contains duplicate keys.")
635    }
636
637    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf`.
638    ///
639    /// Panics if the construction fails.
640    /// Then it is almost certain that the input contains either duplicate keys
641    /// or keys indistinguishable by any hash function from the family used.
642    #[inline] pub fn with_conf<K, KS>(keys: KS, conf: GOBuildConf<GS, SS, S>) -> Self
643        where K: Hash + Sync, KS: KeySet<K> + Sync
644    {
645        Self::with_conf_stats(keys, conf, &mut ())
646    }
647
648    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
649    ///
650    /// Panics if the construction fails.
651    /// Then it is almost certain that the input contains either duplicate keys
652    /// or keys indistinguishable by any hash function from the family used.
653    #[inline] pub fn from_slice_with_conf_stats<K, BS>(keys: &[K], conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
654        where K: Hash + Sync, BS: stats::BuildStatsCollector
655    {
656        Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, stats)
657    }
658
659    /// Builds [`GOFunction`] for given `keys`, using the configuration `conf`.
660    ///
661    /// Panics if the construction fails.
662    /// Then it is almost certain that the input contains either duplicate keys
663    /// or keys indistinguishable by any hash function from the family used.
664    #[inline] pub fn from_slice_with_conf<K>(keys: &[K], conf: GOBuildConf<GS, SS, S>) -> Self
665        where K: Hash + Sync
666    {
667        Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, &mut ())
668    }
669
670    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
671    /// Note that `keys` can be reordered during construction.
672    /// 
673    /// Panics if the construction fails.
674    /// Then it is almost certain that the input contains either duplicate keys
675    /// or keys indistinguishable by any hash function from the family used.
676    #[inline] pub fn from_slice_mut_with_conf_stats<K, BS>(keys: &mut [K], conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
677        where K: Hash + Sync, BS: stats::BuildStatsCollector
678    {
679        Self::with_conf_stats(SliceMutSource::new(keys), conf, stats)
680    }
681
682    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf`.
683    /// Note that `keys` can be reordered during construction.
684    /// 
685    /// Panics if the construction fails.
686    /// Then it is almost certain that the input contains either duplicate keys
687    /// or keys indistinguishable by any hash function from the family used.
688    #[inline] pub fn from_slice_mut_with_conf<K>(keys: &mut [K], conf: GOBuildConf<GS, SS, S>) -> Self
689        where K: Hash + Sync
690    {
691        Self::with_conf_stats(SliceMutSource::new(keys), conf, &mut ())
692    }
693}
694
695impl<GS: GroupSize + Sync, SS: SeedSize> GOFunction<GS, SS> {
696    /// Reads `Self` from the `input`.
697    /// Only [`GOFunction`]s that use default hasher can be read by this method.
698    pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
699        Self::read_with_hasher(input, Default::default())
700    }
701}
702
703impl GOFunction {
704    /// Builds [`GOFunction`] for given `keys`, reporting statistics with `stats`.
705    /// 
706    /// Panics if the construction fails.
707    /// Then it is almost certain that the input contains either duplicate keys
708    /// or keys indistinguishable by any hash function from the family used.
709    pub fn from_slice_with_stats<K, BS>(keys: &[K], stats: &mut BS) -> Self
710        where K: Hash + Sync, BS: stats::BuildStatsCollector
711    {
712        Self::from_slice_with_conf_stats(keys, Default::default(), stats)
713    }
714
715    /// Builds [`GOFunction`] for given `keys`.
716    /// 
717    /// Panics if the construction fails.
718    /// Then it is almost certain that the input contains either duplicate keys
719    /// or keys indistinguishable by any hash function from the family used.
720    pub fn from_slice<K: Hash + Sync>(keys: &[K]) -> Self {
721        Self::from_slice_with_conf_stats(keys, Default::default(), &mut ())
722    }
723
724    /// Builds [`GOFunction`] for given `keys`.
725    /// 
726    /// Panics if the construction fails.
727    /// Then it is almost certain that the input contains either duplicate keys
728    /// or keys indistinguishable by any hash function from the family used.
729    pub fn new<K: Hash + Sync, KS: KeySet<K> + Sync>(keys: KS) -> Self {
730        Self::with_conf_stats(keys, Default::default(), &mut ())
731    }
732}
733
734impl<GS: GroupSize, SS: SeedSize, S> GOFunction<GS, SS, S>  {
735    /// Returns the number of keys in the input collection given during construction.
736    /// 
737    /// The time complexity is proportional to the number returned.
738    pub fn len(&self) -> usize {
739        self.array.content.count_bit_ones()
740    }
741}
742
743impl<K: Hash + Sync> From<&[K]> for GOFunction {
744    fn from(keys: &[K]) -> Self {
745        Self::from_slice(keys)
746    }
747}
748
749impl<K: Hash + Sync + Send> From<Vec<K>> for GOFunction {
750    fn from(keys: Vec<K>) -> Self {
751        Self::new(keys)
752    }
753}
754
755#[cfg(test)]
756mod tests {
757    use super::*;
758    use crate::fmph::function::tests::{test_mphf, test_mphf_iter};
759    use crate::fmph::TwoToPowerBits;
760    use std::fmt::{Debug, Display};
761    use crate::seeds::Bits;
762
763    fn test_read_write<GS: GroupSize + Sync, SS: SeedSize>(h: &GOFunction<GS, SS>)
764        where SS::VecElement: std::cmp::PartialEq + Debug
765    {
766        let mut buff = Vec::new();
767        h.write(&mut buff).unwrap();
768        assert_eq!(buff.len(), h.write_bytes());
769        let read = GOFunction::<GS, SS>::read(&mut &buff[..]).unwrap();
770        assert_eq!(h.level_sizes, read.level_sizes);
771        assert_eq!(h.array.content, read.array.content);
772        assert_eq!(h.group_seeds, read.group_seeds);
773    }
774
775    fn test_hash2_invariants<GS: GroupSize, SS: SeedSize>(h: &GOFunction<GS, SS>) {
776        let number_of_groups = h.level_sizes.iter().map(|v| *v as usize).sum::<usize>();
777        assert_eq!(h.conf.bits_per_group * number_of_groups, h.array.content.len() * 64);
778        assert_eq!(ceiling_div(number_of_groups * h.conf.bits_per_seed.into() as usize, 64), h.group_seeds.len());
779    }
780
781    fn test_with_input<K: Hash + Clone + Display + Sync>(to_hash: &[K], bits_per_group: impl GroupSize + Sync) {
782        let goconf = GOConf::bps_bpg(Bits(3), bits_per_group);
783        let h = GOFunction::from_slice_with_conf(to_hash, GOBuildConf::with_mt(goconf, false));
784        //dbg!(h.size_bytes() as f64 * 8.0/to_hash.len() as f64);
785        test_mphf(to_hash, |key| h.get(key));
786        test_hash2_invariants(&h);
787        test_read_write(&h);
788        assert_eq!(h.len(), to_hash.len());
789    }
790
791    #[test]
792    fn test_small_powers_of_two() {
793        //test_with_input(&[1, 2, 5], TwoToPowerBits::new(7));     // not supported for now, upto 63 bit / group
794        //test_with_input(&[1, 2, 5], TwoToPowerBits::new(6));     // not supported for now, upto 63 bit / group
795        test_with_input(&[1, 2, 5], TwoToPowerBits::new(5));
796        test_with_input(&[1, 2, 5], TwoToPowerBits::new(4));
797        test_with_input(&[1, 2, 5], TwoToPowerBits::new(3));
798        test_with_input(&[1, 2, 5], TwoToPowerBits::new(2));
799        test_with_input(&[1, 2, 5], TwoToPowerBits::new(1));
800        test_with_input(&[1, 2, 5], TwoToPowerBits::new(0));
801        //test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(7)); // not supported for now, upto 63 bit / group
802        //test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(6)); // not supported for now, upto 63 bit / group
803        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(5));
804        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(4));
805        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(3));
806        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(2));
807        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(1));
808        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(0));
809        //test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(7)); // not supported for now, upto 63 bit / group
810        //test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(6)); // not supported for now, upto 63 bit / group
811        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(5));
812        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(4));
813        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(3));
814        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(2));
815        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(1));
816        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(0));
817    }
818
819    #[test]
820    fn test_small_bits() {
821        test_with_input(&[1, 2, 5], Bits(3));
822        test_with_input(&[1, 2, 5], Bits(5));
823        test_with_input(&[1, 2, 5], Bits(20));
824        test_with_input(&[1, 2, 5], Bits(60));
825        test_with_input(&[1, 2, 5], Bits(63));
826        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(3));
827        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(5));
828        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(20));
829        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(60));
830        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(63));
831        test_with_input(&['a', 'b', 'c', 'd'], Bits(3));
832        test_with_input(&['a', 'b', 'c', 'd'], Bits(5));
833        test_with_input(&['a', 'b', 'c', 'd'], Bits(20));
834        test_with_input(&['a', 'b', 'c', 'd'], Bits(60));
835        test_with_input(&['a', 'b', 'c', 'd'], Bits(63));
836    }
837
838    #[test]
839    fn test_medium() {
840        let keys: Vec<_> = (-2000..2000).map(|v| 3*v).collect();
841        //test_with_input(&keys, TwoToPowerBits::new(7));   // not supported for now, upto 63 bit / group
842        //test_with_input(&keys, TwoToPowerBits::new(6));   // not supported for now, upto 63 bit / group
843        test_with_input(&keys, TwoToPowerBits::new(5));
844        test_with_input(&keys, TwoToPowerBits::new(4));
845        test_with_input(&keys, TwoToPowerBits::new(3));
846        test_with_input(&keys, TwoToPowerBits::new(2));
847        test_with_input(&keys, TwoToPowerBits::new(1));
848        test_with_input(&keys, TwoToPowerBits::new(0));
849        test_with_input(&keys, Bits(3));
850        test_with_input(&keys, Bits(5));
851        test_with_input(&keys, Bits(10));
852        test_with_input(&keys, Bits(13));
853        test_with_input(&keys, Bits(20));
854        test_with_input(&keys, Bits(27));
855        test_with_input(&keys, Bits(33));
856        test_with_input(&keys, Bits(45));
857        test_with_input(&keys, Bits(50));
858        test_with_input(&keys, Bits(55));
859        test_with_input(&keys, Bits(60));
860        test_with_input(&keys, Bits(63));
861    }
862
863    #[test]
864    fn test_large_size() {
865        let keys = (-20000..20000).collect::<Vec<_>>();
866        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_biggest().into()).size_bytes() as f64 * (8.0/40000.0) < 2.57);
867        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_bigger().into()).size_bytes() as f64 * (8.0/40000.0) < 2.4);
868        assert!(GOFunction::from(&keys[..]).size_bytes() as f64 * (8.0/40000.0) < 2.26);
869        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_smallest().into()).size_bytes() as f64 * (8.0/40000.0) < 2.15);
870    }
871
872    #[test]
873    #[ignore = "uses much memory and time"]
874    fn test_fmphgo_for_over_2to32_keys() {
875        const LEN: u64 = 5_000_000_000;
876        let f = GOFunction::with_conf_stats(
877            crate::fmph::keyset::CachedKeySet::dynamic(|| 0..LEN, usize::MAX),
878            GOConf::default_biggest().into(),
879            &mut crate::stats::BuildStatsPrinter::stdout());
880        test_mphf_iter(LEN as usize, 0..LEN, |key| f.get(key));
881        assert!(f.size_bytes() as f64 * (8.0/LEN as f64) < 2.57);
882    }
883
884    #[test]
885    fn test_duplicates() {
886        assert!(GOFunction::try_with_conf_stats(vec![1, 1], Default::default(), &mut ()).is_none());
887        assert!(GOFunction::try_with_conf_stats(vec![1, 2, 3, 1, 4], Default::default(), &mut ()).is_none());
888    }
889
890    #[test]
891    fn test_duplicates_partial() {
892        let keys = vec![1, 2, 3, 1, 4];
893        let expected_initial_len = keys.len();
894        let r = GOFunction::try_with_conf_stats_or_partial(keys, Default::default(), &mut ());
895        if let Err((mphf, mut remaining, initial_len)) = r {
896            assert_eq!(initial_len, expected_initial_len);
897            remaining.sort();
898            assert_eq!(remaining, vec![1, 1]);
899            test_mphf(&[2, 3, 4], |key| mphf.get(key));
900        } else {
901            assert!(false)
902        }
903    }
904}