Skip to main content

ph_temp/fmph/
gofunction.rs

1use std::hash::Hash;
2use binout::{VByte, Serializer, AsIs};
3use bitm::{BitAccess, Rank, ceiling_div};
4
5use crate::seeds::{Bits8, SeedSize, TwoToPowerBitsStatic};
6use crate::utils::{ArrayWithRank, read_bits};
7use crate::{BuildDefaultSeededHasher, BuildSeededHasher, stats};
8
9use super::function::{concat_level_arrays, from_mut_slice, get_mut_slice, level_array_for, LevelArray};
10use super::goindexing::GroupSize;
11use std::io;
12use std::sync::atomic::AtomicU64;
13use dyn_size_of::GetSize;
14use crate::fmph::function::{fphash_add_bit, fphash_remove_collided, fphash_sync_add_bit};
15use crate::fmph::goindexing::group_nr;
16
17use rayon::prelude::*;
18use crate::fmph::keyset::{KeySet, SliceMutSource, SliceSourceWithRefs};
19
/// Configuration of family of (group-optimized) hash functions used by [`GOFunction`] and accepted by [`GOBuildConf`] constructors.
/// 
/// Good configurations can be obtained by calling one of the following functions:
/// [default_biggest](GOConf::default_biggest), [default_bigger](GOConf::default_bigger),
/// [default](GOConf::default), [default_smallest](GOConf::default_smallest).
/// These functions are listed in order of increasing performance (in terms of size and evaluation speed)
/// and time to construct the minimum perfect hash function.
/// More details are included in their documentation and the paper:
/// P. Beling, *Fingerprinting-based minimal perfect hashing revisited*, ACM Journal of Experimental Algorithmics, 2023, <https://doi.org/10.1145/3596453>
#[derive(Clone)]
pub struct GOConf<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    /// The family of hash functions used by the constructed [`GOFunction`]. (default: [`BuildDefaultSeededHasher`])
    pub hash_builder: S,
    /// Size of seeds (in bits) stored per group of the fingerprint array. (default: 4)
    pub bits_per_seed: SS,
    /// Size of groups (in bits) of the fingerprint array. (default: 16)
    pub bits_per_group: GS
}
38
39impl GOConf<TwoToPowerBitsStatic::<3>, TwoToPowerBitsStatic::<0>, BuildDefaultSeededHasher> {
40    /// Creates a configuration in which the seed and group sizes are 1 and 8 bits respectively,
41    /// which (when relative level size is 100) leads to a minimum perfect hash function whose:
42    /// - size is about 2.52 bits per input key,
43    /// - the expected number of levels visited during the evaluation is about 2.18,
44    /// - construction takes about 4 times less time compared to the [default](GOConf::default) configuration.
45    pub fn default_biggest() -> Self {
46        Self {
47            hash_builder: Default::default(),
48            bits_per_seed: Default::default(),
49            bits_per_group: Default::default()
50        }
51    }
52}
53
54impl GOConf<TwoToPowerBitsStatic::<4>, TwoToPowerBitsStatic::<1>, BuildDefaultSeededHasher> {
55    /// Creates a configuration in which the seed and group sizes are 2 and 16 bits respectively,
56    /// which (when relative level size is 100) leads to a minimum perfect hash function whose:
57    /// - size is about 2.36 bits per input key,
58    /// - the expected number of levels visited during the evaluation is about 2.04,
59    /// - construction takes about 3 times less time compared to the [default](GOConf::default) configuration.
60    pub fn default_bigger() -> Self {
61        Self {
62            hash_builder: Default::default(),
63            bits_per_seed: Default::default(),
64            bits_per_group: Default::default()
65        }
66    }
67}
68
69impl Default for GOConf {
70    /// Creates a configuration in which the seed and group sizes are 4 and 16 bits respectively,
71    /// which (when relative level size is 100) leads to a minimum perfect hash function whose:
72    /// - size is about 2.21 bits per input key,
73    /// - the expected number of levels visited during the evaluation is about 1.73.
74    fn default() -> Self {
75        Self {
76            hash_builder: Default::default(),
77            bits_per_seed: Default::default(),
78            bits_per_group: Default::default()
79        }
80    }
81}
82
83impl GOConf<TwoToPowerBitsStatic::<5>, Bits8, BuildDefaultSeededHasher> {
84    /// Creates a configuration in which the seed and group sizes are 8 and 32 bits respectively,
85    /// which (when relative level size is 100) leads to a minimum perfect hash function whose:
86    /// - size is about 2.10 bits per input key,
87    /// - the expected number of levels visited during the evaluation is about 1.64,
88    /// - construction takes about 13 times longer compared to the [default](GOConf::default) configuration.
89    pub fn default_smallest() -> Self {
90        Self {
91            hash_builder: Default::default(),
92            bits_per_seed: Default::default(),
93            bits_per_group: Default::default()
94        }
95    }
96}
97
98impl<GS: GroupSize, SS: SeedSize> GOConf<GS, SS> {
99    /// Returns a configuration that uses seeds and groups of the sizes given in bits.
100    pub fn bps_bpg(bits_per_seed: SS, bits_per_group: GS) -> Self {
101        Self {
102            hash_builder: Default::default(),
103            bits_per_seed,
104            bits_per_group,
105        }
106    }
107}
108
109impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOConf<GS, SS, S> {
110    /// Panics if the configuration is incorrect.
111    pub fn validate(&self) {
112        self.bits_per_seed.validate().unwrap();
113        self.bits_per_group.validate().unwrap();
114    }
115
116    /// Returns a configuration that uses given family of hash functions and seeds and groups of the sizes given in bits.
117    pub fn hash_bps_bpg(hash_builder: S, bits_per_seed: SS, bits_per_group: GS) -> Self {
118        Self { hash_builder, bits_per_seed, bits_per_group }  // 1<<6=64
119    }
120
121    /// Returns array index for given `hash` of key, size of level in groups, and group seed provided by `group_seed`.
122    #[inline(always)] pub fn hash_index<GetGroupSeed>(&self, hash: u64, level_size_groups: usize, group_seed: GetGroupSeed) -> usize
123        where GetGroupSeed: FnOnce(usize) -> u16  // returns group seed for group with given index
124    {
125        let group = group_nr(hash, level_size_groups);
126        self.bits_per_group.bit_index_for_seed(hash, group_seed(group), group)
127    }
128
129    /// Returns array index for given `key`, seed and size (in groups) of level, and group seed provided by `group_seed`.
130    #[inline(always)] pub fn key_index<GetGroupSeed, K>(&self, key: &K, level_seed: u64, level_size_groups: usize, group_seed: GetGroupSeed) -> usize
131        where GetGroupSeed: FnOnce(usize) -> u16, K: Hash
132    {
133        self.hash_index(self.hash_builder.hash_one(key, level_seed), level_size_groups, group_seed)
134    }
135
136    /// Returns fingerprint array for given hashes of keys, level size, and group seeds (given as a function that returns seeds for provided group indices).
137    fn build_array_for_hashes(&self, key_hashes: &[u64], level_size_segments: usize, level_size_groups: usize, group_seed: u16) -> LevelArray
138    {
139        let mut result = level_array_for(level_size_segments);
140        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
141        for hash in key_hashes {
142            fphash_add_bit(&mut result, &mut collision, self.hash_index(*hash, level_size_groups, |_| group_seed));
143        };
144        fphash_remove_collided(&mut result, &collision);
145        result
146    }
147}
148
149impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOConf<GS, SS, S> {
150    /// Returns fingerprint array for given hashes of keys, level size, and group seeds (given as a function that returns seeds for provided group indices).
151    fn build_array_for_hashes_mt(&self, key_hashes: &[u64], level_size_segments: usize, level_size_groups: usize, group_seed: u16) -> LevelArray
152    {
153        let mut result = level_array_for(level_size_segments);
154        let result_atom = from_mut_slice(&mut result);
155        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
156        key_hashes.par_iter().for_each(
157            |hash| fphash_sync_add_bit(&result_atom, &collision, self.hash_index(*hash, level_size_groups, |_| group_seed))
158        );
159        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
160        result
161    }
162}
163
/// Build configuration that is accepted by [`GOFunction`] constructors.
///
/// See field descriptions for details.
#[derive(Clone)]
pub struct GOBuildConf<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    /// The threshold for the number of keys below which their hashes will be cached during level construction.
    /// (default: [`GOBuildConf::DEFAULT_CACHE_THRESHOLD`])
    /// 
    /// Caching speeds up level construction at the expense of memory consumption during construction
    /// (caching a single key requires 8 bytes of memory).
    /// Caching is particularly recommended for keys with complex types whose hashing is slow.
    /// It is possible to use a value of `0` to disable caching completely, or [`usize::MAX`] to use it on all levels.
    pub cache_threshold: usize,

    /// Size of each level given as a percentage of the number of level input keys. (default: `100`)
    /// 
    /// A value of 100 minimizes the size of the constructed minimum perfect hash function.
    /// Larger values speed up evaluation at the expense of increased size.
    /// It does not make sense to use values below 100.
    pub relative_level_size: u16,

    /// Whether to use multiple threads during construction. (default: `true`)
    /// 
    /// If `true`, the construction will be performed using the default [rayon] thread pool.
    pub use_multiple_threads: bool,

    /// Configuration of family of (group-optimized) hash functions (default: [`GOConf::default`]).
    pub goconf: GOConf<GS, SS, S>,
}   // TODO introduce trait to make other builders possible
193
194impl Default for GOBuildConf {
195    fn default() -> Self { Self::new(Default::default()) }
196}
197
198impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> From<GOConf<GS, SS, S>> for GOBuildConf<GS, SS, S> {
199    #[inline] fn from(value: GOConf<GS, SS, S>) -> Self { Self::new(value) }
200}
201
202impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOBuildConf<GS, SS, S>
203{
204    /// The default value for [`relative_level_size`](GOBuildConf::relative_level_size),
205    /// which results in building the cache with a maximum size of 1GB.
206    pub const DEFAULT_CACHE_THRESHOLD: usize = 1024*1024*128; // *8 bytes = max 1GB for pre-hashing
207
208    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf),
209    /// [relative level size](GOBuildConf::relative_level_size), [cache threshold](GOBuildConf::cache_threshold)
210    /// and potentially enabled [multiple threads](GOBuildConf::use_multiple_threads).
211    pub fn with_lsize_ct_mt(goconf: GOConf<GS, SS, S>, relative_level_size: u16, cache_threshold: usize, use_multiple_threads: bool) -> Self {
212        Self {
213            cache_threshold,
214            relative_level_size,
215            use_multiple_threads,
216            goconf
217        }
218    }
219
220    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf).
221    pub fn new(goconf: GOConf<GS, SS, S>) -> Self {
222        Self::with_lsize_ct_mt(goconf, 100, Self::DEFAULT_CACHE_THRESHOLD, true)
223    }
224
225    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf)
226    /// and [cache threshold](GOBuildConf::cache_threshold).
227    pub fn with_ct(goconf: GOConf<GS, SS, S>, cache_threshold: usize) -> Self {
228        Self::with_lsize_ct_mt(goconf, 100, cache_threshold, true)
229    }
230
231    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf),
232    /// [relative level size](GOBuildConf::relative_level_size) and [cache threshold](GOBuildConf::cache_threshold).
233    pub fn with_lsize_ct(goconf: GOConf<GS, SS, S>, relative_level_size: u16, cache_threshold: usize) -> Self {
234        Self::with_lsize_ct_mt(goconf, relative_level_size, cache_threshold, true)
235    }
236
237    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf)
238    /// and [relative level size](GOBuildConf::relative_level_size).
239    pub fn with_lsize(goconf: GOConf<GS, SS, S>, relative_level_size: u16) -> Self {
240        Self::with_lsize_ct_mt(goconf, relative_level_size, Self::DEFAULT_CACHE_THRESHOLD, true)
241    }
242
243    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf),
244    /// [relative level size](GOBuildConf::relative_level_size)
245    /// and potentially enabled [multiple threads](GOBuildConf::use_multiple_threads).
246    pub fn with_lsize_mt(goconf: GOConf<GS, SS, S>, relative_level_size: u16, use_multiple_threads: bool) -> Self {
247        Self::with_lsize_ct_mt(goconf, relative_level_size, Self::DEFAULT_CACHE_THRESHOLD, use_multiple_threads)
248    }
249
250    /// Returns configuration with custom [group-optimized family of hash functions](GOBuildConf::goconf),
251    /// and potentially enabled [multiple threads](GOBuildConf::use_multiple_threads).
252    pub fn with_mt(goconf: GOConf<GS, SS, S>, use_multiple_threads: bool) -> Self {
253        Self::with_lsize_ct_mt(goconf, 100, Self::DEFAULT_CACHE_THRESHOLD, use_multiple_threads)
254    }
255
256    #[inline(always)] fn last_seed(&self) -> u16 { ((1u32 << self.goconf.bits_per_seed.into())-1) as u16 }
257
258    /// Update `best_array` and `best_seeds` copying groups that are better (have more ones in `array`) from `array` and `array_seed`.
259    fn update_best<GetGroupSeed>(&self, level_size_groups: usize, best_array: &mut [u64], best_seeds: &mut [SS::VecElement], array: &[u64], array_seed: GetGroupSeed)
260        where GetGroupSeed: Fn(usize) -> u16
261    {
262        for group_index in 0..level_size_groups {
263            self.goconf.bits_per_group.copy_group_if_better(best_array, array, group_index,
264                || unsafe{ self.goconf.bits_per_seed.set_seed(best_seeds, group_index, array_seed(group_index)) }
265            )
266        }
267    }
268
269    /// Build (by calling `build_for_group`) arrays for all group seeds sequentially and select best groups and seeds (which are returned).
270    /// `build_for_group` can use multiple threads internally to build each array.
271    #[inline(always)]
272    fn best_array<AB>(&self, build_for_group: AB, level_size_groups: usize) -> (LevelArray, Box<[SS::VecElement]>)
273        where AB: Fn(u16) -> LevelArray // build array for given group nr
274    {
275        let mut best_array = build_for_group(0);
276        let mut best_seeds = self.goconf.bits_per_seed.new_zeroed_seed_vec(level_size_groups);
277        for group_seed in 1..=self.last_seed() {
278            let with_new_seed = build_for_group(group_seed);
279            self.update_best(level_size_groups, &mut best_array, &mut best_seeds, &with_new_seed, |_| group_seed);
280        }
281        (best_array, best_seeds)
282    }
283}
284
/// Helper structure for building fingerprinting-based minimal perfect hash function with group optimization (FMPHGO, [`GOFunction`]).
pub struct GOBuilder<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    /// Size (in groups) of each level built so far.
    level_sizes: Vec::<usize>,
    /// Fingerprint array of each level built so far.
    arrays: Vec::<LevelArray>,
    /// Packed group seeds of each level built so far (one vector per level).
    group_seeds: Vec::<Box<[SS::VecElement]>>,
    /// Build configuration.
    conf: GOBuildConf<GS, SS, S>
}   // TODO introduce trait to make other builders possible
292
impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOBuilder<GS, SS, S>
{
    /// Creates an empty builder for the (validated) configuration `conf`.
    pub fn new(mut conf: GOBuildConf<GS, SS, S>) -> Self {
        conf.goconf.validate();
        // Multi-threading is only kept enabled when rayon actually provides more than one thread.
        if conf.use_multiple_threads { conf.use_multiple_threads = rayon::current_num_threads() > 1; }
        Self {
            level_sizes: Vec::<usize>::new(),
            arrays: Vec::<LevelArray>::new(),
            group_seeds: Vec::<Box<[SS::VecElement]>>::new(),
            conf
        }
    }

    /// Appends a finished level: its fingerprint `array`, group `seeds`, and size in groups.
    fn push(&mut self, array: LevelArray, seeds: Box<[SS::VecElement]>, size_groups: usize) {
        self.arrays.push(array);
        self.group_seeds.push(seeds);
        self.level_sizes.push(size_groups);
    }

    /// Returns number the level about to build (number of levels built so far).
    #[inline(always)] fn level_nr(&self) -> usize { self.level_sizes.len() }

    /// Returns whether `key` is retained (`false` if it is already hashed at the levels built so far).
    fn retained<K>(&self, key: &K) -> bool where K: Hash {
        // The enumeration index doubles as the level seed, matching `level_nr()` used during construction.
        self.arrays.iter().zip(self.group_seeds.iter()).zip(self.level_sizes.iter()).enumerate()
            .all(|(level_seed, ((a, seeds), level_size_groups))| {
                !a.get_bit(self.conf.goconf.key_index(key, level_seed as u64, *level_size_groups,
                |group| unsafe{ self.conf.goconf.bits_per_seed.get_seed(seeds, group as usize)}))
            })
    }

    /// Returns fingerprint array for given keys and level size, seeding every group with `group_seed`
    /// (single-threaded; bits hit by more than one key are cleared as collisions).
    #[inline(always)]
    fn build_array<KS, K>(&self, keys: &KS, level_size_segments: usize, level_size_groups: usize, group_seed: u16) -> LevelArray
        where
            K: Hash, KS: KeySet<K>
    {
        let mut result = level_array_for(level_size_segments);
        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
        let level_seed = self.level_nr() as u64;
        // Only keys not yet hashed at earlier levels (`retained`) contribute bits.
        keys.for_each_key(|key| fphash_add_bit(&mut result, &mut collision, self.conf.goconf.key_index(key, level_seed, level_size_groups, |_| group_seed)),
                          |key| self.retained(key));
        fphash_remove_collided(&mut result, &collision);
        result
    }

    /// Returns fingerprint array for given keys and level size, seeding every group with `group_seed`.
    /// Uses rayon if the key set supports parallel iteration; otherwise falls back to [`Self::build_array`].
    #[inline(always)]
    fn build_array_mt<KS, K>(&self, keys: &KS, level_size_segments: usize, level_size_groups: usize, group_seed: u16) -> LevelArray
        where K: Hash, KS: KeySet<K>
    {
        if !keys.has_par_for_each_key() {
            return self.build_array(keys, level_size_segments, level_size_groups, group_seed);
        }
        let mut result = level_array_for(level_size_segments);
        // Atomic view of `result`, used only during the parallel bit-setting phase.
        let result_atom = from_mut_slice(&mut result);
        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
        let level_seed = self.level_nr() as u64;
        keys.par_for_each_key(
            |key| fphash_sync_add_bit(&result_atom, &collision, self.conf.goconf.key_index(key, level_seed, level_size_groups, |_| group_seed)),
            |key| self.retained(key)
        );
        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
        result
    }

    /// Builds one level after pre-hashing all (retained) keys, so each seed trial
    /// reuses the cached hashes instead of re-hashing the keys.
    fn build_next_level_with_cache<KS, K>(&mut self, keys: &mut KS, level_size_groups: usize, level_size_segments: usize)
        where K: Hash + Sync, KS: KeySet<K> + Sync
    {
        let level_seed = self.level_nr() as u64;
        // Cache one 64-bit hash per retained key (8 bytes each — see `cache_threshold` docs).
        let key_hashes = keys.maybe_par_map_each_key(
            |k| self.conf.goconf.hash_builder.hash_one(k, level_seed),
            |key| self.retained(key),
            self.conf.use_multiple_threads
        );
        // Try every group seed and keep the best groups/seeds combination.
        let (array, seeds) = if self.conf.use_multiple_threads {
            self.conf.best_array(|g| self.conf.goconf.build_array_for_hashes_mt(&key_hashes, level_size_segments, level_size_groups, g), level_size_groups)
        } else {
            self.conf.best_array(|g| self.conf.goconf.build_array_for_hashes(&key_hashes, level_size_segments, level_size_groups, g), level_size_groups)
        };
        // Drop the keys that got a bit in `array`; the first closure works from cached
        // hashes (by index), the second re-hashes keys (for key sets without index access).
        keys.maybe_par_retain_keys_with_indices(
            |i| !array.get_bit(
                self.conf.goconf.hash_index(key_hashes[i], level_size_groups,
                                     |group| unsafe{ self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize) })
            ),
            |key| !array.get_bit(
                self.conf.goconf.key_index(key, level_seed, level_size_groups,
                                    |group| unsafe{ self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize) })
            ),
            |key| self.retained(key),
            || array.count_bit_ones(),
            self.conf.use_multiple_threads
        );
        self.push(array, seeds, level_size_groups);
    }

    /// Returns true after successful building.
    fn build_levels<KS, K, BS>(&mut self, keys: &mut KS, stats: &mut BS) -> bool
    where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
    {
        let mut levels_without_reduction = 0;   // number of levels without any reduction in number of the keys
        let mut input_size = keys.keys_len();
        while input_size != 0 {
            let (level_size_groups, level_size_segments) = self.conf.goconf.bits_per_group.level_size_groups_segments(
                ceiling_div(input_size * self.conf.relative_level_size as usize, 100));
            stats.level(input_size, level_size_segments * 64);
            if input_size < self.conf.cache_threshold {
                self.build_next_level_with_cache(keys, level_size_groups, level_size_segments);
            } else {
                // build level without hash caching:
                let (array, seeds) = if self.conf.use_multiple_threads {
                    self.conf.best_array(|g| self.build_array_mt(keys, level_size_segments, level_size_groups, g), level_size_groups)
                } else {
                    self.conf.best_array(|g| self.build_array(keys, level_size_segments, level_size_groups, g), level_size_groups)
                };
                let level_nr = self.level_nr();
                // Drop the keys that got a bit in `array` (they are handled by this level).
                keys.maybe_par_retain_keys(
                    |key| {
                        let hash = self.conf.goconf.hash_builder.hash_one(key, level_nr as u64);
                        let group = group_nr(hash, level_size_groups);
                        let bit_index = self.conf.goconf.bits_per_group.bit_index_for_seed(
                            hash,
                            unsafe{ self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize) },
                            group);
                        !array.get_bit(bit_index)
                    },
                    |key| self.retained(key),
                    || array.count_bit_ones(),
                    self.conf.use_multiple_threads
                );
                self.push(array, seeds, level_size_groups);
            }
            let prev_input_size = input_size;
            input_size = keys.keys_len();
            if input_size == prev_input_size {
                levels_without_reduction += 1;
                // Give up after 10 consecutive unproductive levels (almost certainly duplicate
                // or indistinguishable keys) and roll those levels back before reporting failure.
                if levels_without_reduction == 10 {
                    let len = self.arrays.len()-levels_without_reduction;
                    self.arrays.truncate(len);
                    self.group_seeds.truncate(len);
                    self.level_sizes.truncate(len);
                    stats.end(input_size);
                    return false;
                }
            } else {
                levels_without_reduction = 0;
            }
        }
        stats.end(0);
        return true;
    }

    /// Consumes the builder and assembles the final [`GOFunction`]:
    /// concatenates the per-level arrays (with a rank structure) and the per-level seed vectors.
    pub fn finish(self) -> GOFunction<GS, SS, S> {
        let (array, _)  = ArrayWithRank::build(concat_level_arrays(self.arrays));
        let group_seeds_concatenated = self.conf.goconf.bits_per_seed.concatenate_seed_vecs(|| self.level_sizes.iter().copied(), self.group_seeds);
        GOFunction::<GS, SS, S> {
            array,
            group_seeds: group_seeds_concatenated,
            conf: self.conf.goconf,
            level_sizes: self.level_sizes.into_boxed_slice(),
        }
    }
}
458
/// Fingerprinting-based minimal perfect hash function with group optimization (FMPHGO).
///
/// See:
/// - P. Beling, *Fingerprinting-based minimal perfect hashing revisited*, ACM Journal of Experimental Algorithmics, 2023, <https://doi.org/10.1145/3596453>
pub struct GOFunction<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    /// Concatenated fingerprint arrays of all levels, with a rank structure for value lookup.
    array: ArrayWithRank,
    /// Concatenated, packed group seeds of all levels.
    group_seeds: Box<[SS::VecElement]>,   //  Box<[u8]>,
    /// Size (number of groups) of each successive level.
    level_sizes: Box<[usize]>,
    /// Configuration (hash family, seed size, group size) used during construction.
    conf: GOConf<GS, SS, S>
    // 0..01..1 mask with number of ones = group size (in bits)
    //group_size_mask: u8,
}
471
472impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GetSize for GOFunction<GS, SS, S> {
473    fn size_bytes_dyn(&self) -> usize {
474        self.array.size_bytes_dyn()
475            //+ self.seeds.len() * std::mem::size_of::<u8>()
476            + self.group_seeds.size_bytes_dyn()
477            + self.level_sizes.size_bytes_dyn()
478    }
479
480    const USES_DYN_MEM: bool = true;
481}
482
impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOFunction<GS, SS, S> {

    /// Gets the value associated with the given `key` and reports statistics to `access_stats`.
    /// 
    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
    /// If the `key` was not in the input key collection given during construction,
    /// either [`None`] or an undetermined value from the specified range is returned.
    #[inline(always)] pub fn get_stats<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> Option<u64> {
        let mut groups_before = 0usize; // number of groups in all levels preceding the current one
        let mut level_nr = 0usize;      // also used as the level seed for hashing
        loop {
            // Running out of levels means the key was not in the input collection.
            let level_size_groups = *self.level_sizes.get(level_nr)?;
            /*let bit_index = self.conf.key_index(key, level_nr, level_size_groups,
                                                |g| self.conf.bits_per_seed.get_seed(&self.group_seeds, (groups_before + g) as usize)
            ); // wrong as bit_index_for_seed is called with wrong group */
            let hash = self.conf.hash_builder.hash_one(key, level_nr as u64);
            // Group index is global (offset by the groups of all previous levels),
            // so both the seed lookup and the bit index address the concatenated arrays.
            let group = groups_before + group_nr(hash, level_size_groups);
            let seed = unsafe{ self.conf.bits_per_seed.get_seed(&self.group_seeds, group) };
            let bit_index = self.conf.bits_per_group.bit_index_for_seed(hash, seed, group);
            if self.array.content.get_bit(bit_index) {
                access_stats.found_on_level(level_nr);
                // The rank (number of ones before `bit_index`) is the assigned value.
                return Some(unsafe{self.array.rank_unchecked(bit_index)} as u64);
            }
            groups_before += level_size_groups;
            level_nr += 1;
        }
    }

    /// Gets the value associated with the given `key`.
    /// 
    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
    /// If the `key` was not in the input key collection given during construction,
    /// either [`None`] or an undetermined value from the specified range is returned.
    #[inline(always)] pub fn get<K: Hash + ?Sized>(&self, key: &K) -> Option<u64> {
        self.get_stats(key, &mut ())
    }

    /// Gets the value associated with the given `key` and reports statistics to `access_stats`.
    /// 
    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
    /// If the `key` was not in the input key collection given during construction,
    /// it either panics or returns an undetermined value from the specified range.
    #[inline(always)] pub fn get_stats_or_panic<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> u64 {
        self.get_stats(key, access_stats).expect("Invalid access to an item outside the set given during construction.")
    }

    /// Gets the value associated with the given `key`.
    /// 
    /// The returned value is in the range: `0` (inclusive), the number of elements in the input key collection (exclusive).
    /// If the `key` was not in the input key collection given during construction,
    /// it either panics or returns an undetermined value from the specified range.
    #[inline(always)] pub fn get_or_panic<K: Hash + ?Sized>(&self, key: &K) -> u64 {
        self.get_stats_or_panic(key, &mut ())
    }

    /// Returns number of bytes which `write` will write.
    pub fn write_bytes(&self) -> usize {
        // Must mirror the layout written by `write`:
        // group size, level sizes (VByte), array content, seed size byte + seed vector.
        self.conf.bits_per_group.write_size_bytes()
            + VByte::array_size(&self.level_sizes)
            + AsIs::array_content_size(&self.array.content)
            + std::mem::size_of::<u8>() + self.group_seeds.size_bytes_content_dyn()
    }

    /// Writes `self` to the `output`.
    pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
    {
        self.conf.bits_per_group.write(output)?;
        VByte::write_array(output, &self.level_sizes)?;
        AsIs::write_all(output, self.array.content.iter())?;
        self.conf.bits_per_seed.write_seed_vec(output, &self.group_seeds)
    }

    /// Reads `Self` from the `input`. Hash builder must be the same as the one used to write.
    pub fn read_with_hasher(input: &mut dyn io::Read, hash_builder: S) -> io::Result<Self>
    {
        // Read in the same order as `write`: group size, level sizes, array bits, seeds.
        let bits_per_group = GS::read(input)?;
        let level_size = VByte::read_array(input)?;
        let number_of_groups = level_size.iter().map(|v|*v as usize).sum::<usize>();

        let array_content = read_bits(input, bits_per_group * number_of_groups)?;
        // The rank structure is rebuilt here; it is not part of the serialized form.
        let (array_with_rank, _) = ArrayWithRank::build(array_content);

        let (bits_per_group_seed, group_seeds) = SS::read_seed_vec(input, number_of_groups)?;

        Ok(Self {
            array: array_with_rank,
            group_seeds,
            level_sizes: level_size,
            conf: GOConf {
                bits_per_seed: bits_per_group_seed,
                bits_per_group,
                hash_builder
            },
        })
    }

    /// Returns sizes of the successive levels.
    pub fn level_sizes(&self) -> &[usize] {
        &self.level_sizes
    }
}
584
585impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOFunction<GS, SS, S> {
586    /// Constructs [`GOFunction`] for given input `keys`,
587    /// using the build configuration `conf` and reporting statistics with `stats`.
588    /// 
589    /// If the construction fails, it returns `Err` with a triple *(f, k, s)*, where:
590    /// - *f* is a [`GOFunction`] handling only part of the keys
591    ///   (that returns numbers in the interval *[0, s-k.keys_len())*);
592    /// - *k* is a set of the remaining keys,
593    /// - *s* is the initial number of keys.
594    /// If needed, the keys from *k* can be placed in another data structure to handle all the keys.
595    /// 
596    /// If the construction fails, it is almost certain that the input contains either duplicate keys
597    /// or keys indistinguishable by any hash function from the family used.
598    /// The duplicate keys will be included in the *k* set.
599    pub fn try_with_conf_stats_or_partial<K, KS, BS>(mut keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Result<Self, (Self, KS, usize)>
600        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
601    {
602        let mut builder = GOBuilder::new(conf);
603        let initial_size = keys.keys_len();
604        if builder.build_levels(&mut keys, stats) {
605            drop(keys);
606            Ok(builder.finish())
607        } else {
608            Err((builder.finish(), keys, initial_size))
609        }
610    }
611
612    /// Constructs [`GOFunction`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
613    /// 
614    /// [`None`] is returned if the construction fails.
615    /// Then it is almost certain that the input contains either duplicate keys
616    /// or keys indistinguishable by any hash function from the family used.
617    pub fn try_with_conf_stats<K, KS, BS>(mut keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Option<Self>
618        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
619    {
620        let mut builder = GOBuilder::new(conf);
621        builder.build_levels(&mut keys, stats).then(|| {
622            drop(keys);
623            builder.finish()
624        })
625    }
626
627    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
628    ///
629    /// Panics if the construction fails.
630    pub fn with_conf_stats<K, KS, BS>(keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
631        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
632    {
633        Self::try_with_conf_stats(keys, conf, stats).expect("Constructing fmph::GOFunction failed. Probably the input contains duplicate keys.")
634    }
635
636    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf`.
637    ///
638    /// Panics if the construction fails.
639    /// Then it is almost certain that the input contains either duplicate keys
640    /// or keys indistinguishable by any hash function from the family used.
641    #[inline] pub fn with_conf<K, KS>(keys: KS, conf: GOBuildConf<GS, SS, S>) -> Self
642        where K: Hash + Sync, KS: KeySet<K> + Sync
643    {
644        Self::with_conf_stats(keys, conf, &mut ())
645    }
646
647    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
648    ///
649    /// Panics if the construction fails.
650    /// Then it is almost certain that the input contains either duplicate keys
651    /// or keys indistinguishable by any hash function from the family used.
652    #[inline] pub fn from_slice_with_conf_stats<K, BS>(keys: &[K], conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
653        where K: Hash + Sync, BS: stats::BuildStatsCollector
654    {
655        Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, stats)
656    }
657
658    /// Builds [`GOFunction`] for given `keys`, using the configuration `conf`.
659    ///
660    /// Panics if the construction fails.
661    /// Then it is almost certain that the input contains either duplicate keys
662    /// or keys indistinguishable by any hash function from the family used.
663    #[inline] pub fn from_slice_with_conf<K>(keys: &[K], conf: GOBuildConf<GS, SS, S>) -> Self
664        where K: Hash + Sync
665    {
666        Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, &mut ())
667    }
668
669    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf` and reporting statistics with `stats`.
670    /// Note that `keys` can be reordered during construction.
671    /// 
672    /// Panics if the construction fails.
673    /// Then it is almost certain that the input contains either duplicate keys
674    /// or keys indistinguishable by any hash function from the family used.
675    #[inline] pub fn from_slice_mut_with_conf_stats<K, BS>(keys: &mut [K], conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
676        where K: Hash + Sync, BS: stats::BuildStatsCollector
677    {
678        Self::with_conf_stats(SliceMutSource::new(keys), conf, stats)
679    }
680
681    /// Builds [`GOFunction`] for given `keys`, using the build configuration `conf`.
682    /// Note that `keys` can be reordered during construction.
683    /// 
684    /// Panics if the construction fails.
685    /// Then it is almost certain that the input contains either duplicate keys
686    /// or keys indistinguishable by any hash function from the family used.
687    #[inline] pub fn from_slice_mut_with_conf<K>(keys: &mut [K], conf: GOBuildConf<GS, SS, S>) -> Self
688        where K: Hash + Sync
689    {
690        Self::with_conf_stats(SliceMutSource::new(keys), conf, &mut ())
691    }
692}
693
694impl<GS: GroupSize + Sync, SS: SeedSize> GOFunction<GS, SS> {
695    /// Reads `Self` from the `input`.
696    /// Only [`GOFunction`]s that use default hasher can be read by this method.
697    pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
698        Self::read_with_hasher(input, Default::default())
699    }
700}
701
702impl GOFunction {
703    /// Builds [`GOFunction`] for given `keys`, reporting statistics with `stats`.
704    /// 
705    /// Panics if the construction fails.
706    /// Then it is almost certain that the input contains either duplicate keys
707    /// or keys indistinguishable by any hash function from the family used.
708    pub fn from_slice_with_stats<K, BS>(keys: &[K], stats: &mut BS) -> Self
709        where K: Hash + Sync, BS: stats::BuildStatsCollector
710    {
711        Self::from_slice_with_conf_stats(keys, Default::default(), stats)
712    }
713
714    /// Builds [`GOFunction`] for given `keys`.
715    /// 
716    /// Panics if the construction fails.
717    /// Then it is almost certain that the input contains either duplicate keys
718    /// or keys indistinguishable by any hash function from the family used.
719    pub fn from_slice<K: Hash + Sync>(keys: &[K]) -> Self {
720        Self::from_slice_with_conf_stats(keys, Default::default(), &mut ())
721    }
722
723    /// Builds [`GOFunction`] for given `keys`.
724    /// 
725    /// Panics if the construction fails.
726    /// Then it is almost certain that the input contains either duplicate keys
727    /// or keys indistinguishable by any hash function from the family used.
728    pub fn new<K: Hash + Sync, KS: KeySet<K> + Sync>(keys: KS) -> Self {
729        Self::with_conf_stats(keys, Default::default(), &mut ())
730    }
731}
732
733impl<GS: GroupSize, SS: SeedSize, S> GOFunction<GS, SS, S>  {
734    /// Returns the number of keys in the input collection given during construction.
735    /// 
736    /// The time complexity is proportional to the number returned.
737    pub fn len(&self) -> usize {
738        self.array.content.count_bit_ones()
739    }
740}
741
742impl<K: Hash + Sync> From<&[K]> for GOFunction {
743    fn from(keys: &[K]) -> Self {
744        Self::from_slice(keys)
745    }
746}
747
748impl<K: Hash + Sync + Send> From<Vec<K>> for GOFunction {
749    fn from(keys: Vec<K>) -> Self {
750        Self::new(keys)
751    }
752}
753
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::{tests::test_mphf_u64, verify_phf};
    use crate::fmph::TwoToPowerBits;
    use std::fmt::{Debug, Display};
    use crate::seeds::Bits;

    /// Serializes `h`, checks the reported size, then deserializes and
    /// verifies that all fields round-trip unchanged.
    fn test_read_write<GS: GroupSize + Sync, SS: SeedSize>(h: &GOFunction<GS, SS>)
        where SS::VecElement: std::cmp::PartialEq + Debug
    {
        let mut serialized = Vec::new();
        h.write(&mut serialized).unwrap();
        assert_eq!(serialized.len(), h.write_bytes());
        let restored = GOFunction::<GS, SS>::read(&mut &serialized[..]).unwrap();
        assert_eq!(h.level_sizes, restored.level_sizes);
        assert_eq!(h.array.content, restored.array.content);
        assert_eq!(h.group_seeds, restored.group_seeds);
    }

    /// Checks that the fingerprint array and the seed vector have lengths
    /// consistent with the total number of groups over all levels.
    fn test_hash2_invariants<GS: GroupSize, SS: SeedSize>(h: &GOFunction<GS, SS>) {
        let groups: usize = h.level_sizes.iter().map(|v| *v as usize).sum();
        assert_eq!(h.conf.bits_per_group * groups, h.array.content.len() * 64);
        assert_eq!(ceiling_div(groups * h.conf.bits_per_seed.into() as usize, 64), h.group_seeds.len());
    }

    /// Builds a single-threaded function over `to_hash` with 3-bit seeds and the
    /// given group size, then runs all generic checks against it.
    fn test_with_input<K: Hash + Clone + Display + Sync>(to_hash: &[K], bits_per_group: impl GroupSize + Sync) {
        let build_conf = GOBuildConf::with_mt(GOConf::bps_bpg(Bits(3), bits_per_group), false);
        let f = GOFunction::from_slice_with_conf(to_hash, build_conf);
        test_mphf_u64(to_hash, |key| f.get(key));
        test_hash2_invariants(&f);
        test_read_write(&f);
        assert_eq!(f.len(), to_hash.len());
    }

    #[test]
    fn test_small_powers_of_two() {
        // 2^6 and 2^7 bits per group are not supported for now (up to 63 bits/group).
        for log2_bits_per_group in (0..=5).rev() {
            test_with_input(&[1, 2, 5], TwoToPowerBits::new(log2_bits_per_group));
            test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(log2_bits_per_group));
            test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(log2_bits_per_group));
        }
    }

    #[test]
    fn test_small_bits() {
        for bits in [3, 5, 20, 60, 63] {
            test_with_input(&[1, 2, 5], Bits(bits));
            test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(bits));
            test_with_input(&['a', 'b', 'c', 'd'], Bits(bits));
        }
    }

    #[test]
    fn test_medium() {
        let keys: Vec<_> = (-2000..2000).map(|v| 3*v).collect();
        // 2^6 and 2^7 bits per group are not supported for now (up to 63 bits/group).
        for log2_bits_per_group in (0..=5).rev() {
            test_with_input(&keys, TwoToPowerBits::new(log2_bits_per_group));
        }
        for bits in [3, 5, 10, 13, 20, 27, 33, 45, 50, 55, 60, 63] {
            test_with_input(&keys, Bits(bits));
        }
    }

    #[test]
    fn test_large_size() {
        let keys: Vec<_> = (-20000..20000).collect();
        // Bits-per-key bounds for each predefined configuration (40000 keys).
        assert!(GOFunction::from_slice_with_conf(&keys, GOConf::default_biggest().into()).size_bytes() as f64 * (8.0/40000.0) < 2.57);
        assert!(GOFunction::from_slice_with_conf(&keys, GOConf::default_bigger().into()).size_bytes() as f64 * (8.0/40000.0) < 2.4);
        assert!(GOFunction::from(&keys[..]).size_bytes() as f64 * (8.0/40000.0) < 2.26);
        assert!(GOFunction::from_slice_with_conf(&keys, GOConf::default_smallest().into()).size_bytes() as f64 * (8.0/40000.0) < 2.15);
    }

    #[test]
    #[ignore = "uses much memory and time"]
    fn test_fmphgo_for_over_2to32_keys() {
        const LEN: u64 = 5_000_000_000;
        let mphf = GOFunction::with_conf_stats(
            crate::fmph::keyset::CachedKeySet::dynamic(|| 0..LEN, usize::MAX),
            GOConf::default_biggest().into(),
            &mut crate::stats::BuildStatsPrinter::stdout());
        verify_phf(LEN as usize, 0..LEN, |key| mphf.get(key).map(|v| v as usize));
        assert!(mphf.size_bytes() as f64 * (8.0/LEN as f64) < 2.57);
    }

    #[test]
    fn test_duplicates() {
        // Construction must fail whenever the input contains a repeated key.
        for input in [vec![1, 1], vec![1, 2, 3, 1, 4]] {
            assert!(GOFunction::try_with_conf_stats(input, Default::default(), &mut ()).is_none());
        }
    }

    #[test]
    fn test_duplicates_partial() {
        let keys = vec![1, 2, 3, 1, 4];
        let expected_initial_len = keys.len();
        match GOFunction::try_with_conf_stats_or_partial(keys, Default::default(), &mut ()) {
            Err((mphf, mut remaining, initial_len)) => {
                assert_eq!(initial_len, expected_initial_len);
                remaining.sort();
                // Both copies of the duplicate key must be reported back.
                assert_eq!(remaining, vec![1, 1]);
                // The partial function must be a valid MPHF over the unique keys.
                test_mphf_u64(&[2, 3, 4], |key| mphf.get(key));
            }
            Ok(_) => panic!("construction should fail for input with duplicate keys"),
        }
    }
}
903}