1use std::hash::Hash;
2use binout::{VByte, Serializer, AsIs};
3use bitm::{BitAccess, Rank, ceiling_div};
4
5use crate::seeds::{Bits8, SeedSize, TwoToPowerBitsStatic};
6use crate::utils::{ArrayWithRank, read_bits};
7use crate::{BuildDefaultSeededHasher, BuildSeededHasher, stats};
8
9use super::function::{concat_level_arrays, from_mut_slice, get_mut_slice, level_array_for, LevelArray};
10use super::goindexing::GroupSize;
11use std::io;
12use std::sync::atomic::AtomicU64;
13use dyn_size_of::GetSize;
14use crate::fmph::function::{fphash_add_bit, fphash_remove_collided, fphash_sync_add_bit};
15use crate::fmph::goindexing::group_nr;
16
17use rayon::prelude::*;
18use crate::fmph::keyset::{KeySet, SliceMutSource, SliceSourceWithRefs};
19
/// Configuration of a group-optimized fingerprinting function ([`GOFunction`]),
/// shared by construction and lookup: the seeded hasher plus the sizes
/// (in bits) of the per-group seeds and of the groups themselves.
#[derive(Clone)]
pub struct GOConf<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    /// Family of hash functions used to hash keys; seeded with the level number.
    pub hash_builder: S,
    /// Number of bits used to store the seed of each group.
    pub bits_per_seed: SS,
    /// Number of bits of the level array that form one group.
    pub bits_per_group: GS
}
38
39impl GOConf<TwoToPowerBitsStatic::<3>, TwoToPowerBitsStatic::<0>, BuildDefaultSeededHasher> {
40 pub fn default_biggest() -> Self {
46 Self {
47 hash_builder: Default::default(),
48 bits_per_seed: Default::default(),
49 bits_per_group: Default::default()
50 }
51 }
52}
53
54impl GOConf<TwoToPowerBitsStatic::<4>, TwoToPowerBitsStatic::<1>, BuildDefaultSeededHasher> {
55 pub fn default_bigger() -> Self {
61 Self {
62 hash_builder: Default::default(),
63 bits_per_seed: Default::default(),
64 bits_per_group: Default::default()
65 }
66 }
67}
68
69impl Default for GOConf {
70 fn default() -> Self {
75 Self {
76 hash_builder: Default::default(),
77 bits_per_seed: Default::default(),
78 bits_per_group: Default::default()
79 }
80 }
81}
82
83impl GOConf<TwoToPowerBitsStatic::<5>, Bits8, BuildDefaultSeededHasher> {
84 pub fn default_smallest() -> Self {
90 Self {
91 hash_builder: Default::default(),
92 bits_per_seed: Default::default(),
93 bits_per_group: Default::default()
94 }
95 }
96}
97
98impl<GS: GroupSize, SS: SeedSize> GOConf<GS, SS> {
99 pub fn bps_bpg(bits_per_seed: SS, bits_per_group: GS) -> Self {
101 Self {
102 hash_builder: Default::default(),
103 bits_per_seed,
104 bits_per_group,
105 }
106 }
107}
108
109impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOConf<GS, SS, S> {
110 pub fn validate(&self) {
112 self.bits_per_seed.validate().unwrap();
113 self.bits_per_group.validate().unwrap();
114 }
115
116 pub fn hash_bps_bpg(hash_builder: S, bits_per_seed: SS, bits_per_group: GS) -> Self {
118 Self { hash_builder, bits_per_seed, bits_per_group } }
120
121 #[inline(always)] pub fn hash_index<GetGroupSeed>(&self, hash: u64, level_size_groups: u64, group_seed: GetGroupSeed) -> usize
123 where GetGroupSeed: FnOnce(u64) -> u16 {
125 let group = group_nr(hash, level_size_groups);
126 self.bits_per_group.bit_index_for_seed(hash, group_seed(group), group)
127 }
128
129 #[inline(always)] pub fn key_index<GetGroupSeed, K>(&self, key: &K, level_seed: u32, level_size_groups: u64, group_seed: GetGroupSeed) -> usize
131 where GetGroupSeed: FnOnce(u64) -> u16, K: Hash
132 {
133 self.hash_index(self.hash_builder.hash_one(key, level_seed), level_size_groups, group_seed)
134 }
135
136 fn build_array_for_hashes(&self, key_hashes: &[u64], level_size_segments: usize, level_size_groups: u64, group_seed: u16) -> LevelArray
138 {
139 let mut result = level_array_for(level_size_segments);
140 let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
141 for hash in key_hashes {
142 fphash_add_bit(&mut result, &mut collision, self.hash_index(*hash, level_size_groups, |_| group_seed));
143 };
144 fphash_remove_collided(&mut result, &collision);
145 result
146 }
147}
148
149impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOConf<GS, SS, S> {
150 fn build_array_for_hashes_mt(&self, key_hashes: &[u64], level_size_segments: usize, level_size_groups: u64, group_seed: u16) -> LevelArray
152 {
153 let mut result = level_array_for(level_size_segments);
154 let result_atom = from_mut_slice(&mut result);
155 let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
156 key_hashes.par_iter().for_each(
157 |hash| fphash_sync_add_bit(&result_atom, &collision, self.hash_index(*hash, level_size_groups, |_| group_seed))
158 );
159 fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
160 result
161 }
162}
163
164#[derive(Clone)]
168pub struct GOBuildConf<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
169 pub cache_threshold: usize,
177
178 pub relative_level_size: u16,
184
185 pub use_multiple_threads: bool,
189
190 pub goconf: GOConf<GS, SS, S>,
192} impl Default for GOBuildConf {
195 fn default() -> Self { Self::new(Default::default()) }
196}
197
198impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> From<GOConf<GS, SS, S>> for GOBuildConf<GS, SS, S> {
199 #[inline] fn from(value: GOConf<GS, SS, S>) -> Self { Self::new(value) }
200}
201
202impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOBuildConf<GS, SS, S>
203{
204 pub const DEFAULT_CACHE_THRESHOLD: usize = 1024*1024*128; pub fn with_lsize_ct_mt(goconf: GOConf<GS, SS, S>, relative_level_size: u16, cache_threshold: usize, use_multiple_threads: bool) -> Self {
212 Self {
213 cache_threshold,
214 relative_level_size,
215 use_multiple_threads,
216 goconf
217 }
218 }
219
220 pub fn new(goconf: GOConf<GS, SS, S>) -> Self {
222 Self::with_lsize_ct_mt(goconf, 100, Self::DEFAULT_CACHE_THRESHOLD, true)
223 }
224
225 pub fn with_ct(goconf: GOConf<GS, SS, S>, cache_threshold: usize) -> Self {
228 Self::with_lsize_ct_mt(goconf, 100, cache_threshold, true)
229 }
230
231 pub fn with_lsize_ct(goconf: GOConf<GS, SS, S>, relative_level_size: u16, cache_threshold: usize) -> Self {
234 Self::with_lsize_ct_mt(goconf, relative_level_size, cache_threshold, true)
235 }
236
237 pub fn with_lsize(goconf: GOConf<GS, SS, S>, relative_level_size: u16) -> Self {
240 Self::with_lsize_ct_mt(goconf, relative_level_size, Self::DEFAULT_CACHE_THRESHOLD, true)
241 }
242
243 pub fn with_lsize_mt(goconf: GOConf<GS, SS, S>, relative_level_size: u16, use_multiple_threads: bool) -> Self {
247 Self::with_lsize_ct_mt(goconf, relative_level_size, Self::DEFAULT_CACHE_THRESHOLD, use_multiple_threads)
248 }
249
250 pub fn with_mt(goconf: GOConf<GS, SS, S>, use_multiple_threads: bool) -> Self {
253 Self::with_lsize_ct_mt(goconf, 100, Self::DEFAULT_CACHE_THRESHOLD, use_multiple_threads)
254 }
255
256 #[inline(always)] fn last_seed(&self) -> u16 { ((1u32 << self.goconf.bits_per_seed.into())-1) as u16 }
257
258 fn update_best<GetGroupSeed>(&self, level_size_groups: u64, best_array: &mut [u64], best_seeds: &mut [SS::VecElement], array: &[u64], array_seed: GetGroupSeed)
260 where GetGroupSeed: Fn(u64) -> u16
261 {
262 for group_index in 0..level_size_groups {
263 self.goconf.bits_per_group.copy_group_if_better(best_array, array, group_index as usize,
264 || self.goconf.bits_per_seed.set_seed(best_seeds, group_index as usize, array_seed(group_index))
265 )
266 }
267 }
268
269 #[inline(always)]
272 fn best_array<AB>(&self, build_for_group: AB, level_size_groups: u64) -> (LevelArray, Box<[SS::VecElement]>)
273 where AB: Fn(u16) -> LevelArray {
275 let mut best_array = build_for_group(0);
276 let mut best_seeds = self.goconf.bits_per_seed.new_zeroed_seed_vec(level_size_groups as usize);
277 for group_seed in 1..=self.last_seed() {
278 let with_new_seed = build_for_group(group_seed);
279 self.update_best(level_size_groups, &mut best_array, &mut best_seeds, &with_new_seed, |_| group_seed);
280 }
281 (best_array, best_seeds)
282 }
283}
284
/// Helper that constructs a [`GOFunction`] level by level, accumulating the
/// bit array, the packed group seeds, and the size (in groups) of each level.
pub struct GOBuilder<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    level_sizes: Vec::<u64>,    // number of groups at each built level
    arrays: Vec::<LevelArray>,  // bit array of each built level
    group_seeds: Vec::<Box<[SS::VecElement]>>,  // packed group seeds of each built level
    conf: GOBuildConf<GS, SS, S>    // build configuration
} impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOBuilder<GS, SS, S>
{
    /// Constructs an empty builder with the given configuration.
    /// Panics if the configuration is invalid; multi-threading is
    /// disabled when rayon reports only one available thread.
    pub fn new(mut conf: GOBuildConf<GS, SS, S>) -> Self {
        conf.goconf.validate();
        if conf.use_multiple_threads { conf.use_multiple_threads = rayon::current_num_threads() > 1; }
        Self {
            level_sizes: Vec::<u64>::new(),
            arrays: Vec::<LevelArray>::new(),
            group_seeds: Vec::<Box<[SS::VecElement]>>::new(),
            conf
        }
    }

    /// Appends a fully built level (its array, group seeds and size in groups).
    fn push(&mut self, array: LevelArray, seeds: Box<[SS::VecElement]>, size_groups: u64) {
        self.arrays.push(array);
        self.group_seeds.push(seeds);
        self.level_sizes.push(size_groups);
    }

    /// Number of levels built so far; also serves as the hash seed of the next level.
    #[inline(always)] fn level_nr(&self) -> u32 { self.level_sizes.len() as u32 }

    /// Returns `true` if `key` has not been assigned a value by any level built
    /// so far (its bit is unset at every level), i.e. it must enter the next level.
    fn retained<K>(&self, key: &K) -> bool where K: Hash {
        self.arrays.iter().zip(self.group_seeds.iter()).zip(self.level_sizes.iter()).enumerate()
            .all(|(level_seed, ((a, seeds), level_size_groups))| {
                !a.get_bit(self.conf.goconf.key_index(key, level_seed as u32, *level_size_groups,
                    |group| self.conf.goconf.bits_per_seed.get_seed(seeds, group as usize)))
            })
    }

    /// Builds the bit array of the next level for the retained keys of `keys`,
    /// using the same `group_seed` for every group; collided bits are cleared.
    #[inline(always)]
    fn build_array<KS, K>(&self, keys: &KS, level_size_segments: usize, level_size_groups: u64, group_seed: u16) -> LevelArray
        where K: Hash, KS: KeySet<K>
    {
        let mut result = level_array_for(level_size_segments);
        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
        let level_seed = self.level_nr();
        keys.for_each_key(|key| fphash_add_bit(&mut result, &mut collision, self.conf.goconf.key_index(key, level_seed, level_size_groups, |_| group_seed)),
            |key| self.retained(key));
        fphash_remove_collided(&mut result, &collision);
        result
    }

    /// Multi-threaded version of `build_array`; falls back to the
    /// single-threaded version when `keys` does not support parallel iteration.
    #[inline(always)]
    fn build_array_mt<KS, K>(&self, keys: &KS, level_size_segments: usize, level_size_groups: u64, group_seed: u16) -> LevelArray
        where K: Hash, KS: KeySet<K> {
        if !keys.has_par_for_each_key() {
            return self.build_array(keys, level_size_segments, level_size_groups, group_seed);
        }
        let mut result = level_array_for(level_size_segments);
        let result_atom = from_mut_slice(&mut result);  // atomic view for concurrent bit setting
        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
        let level_seed = self.level_nr();
        keys.par_for_each_key(
            |key| fphash_sync_add_bit(&result_atom, &collision, self.conf.goconf.key_index(key, level_seed, level_size_groups, |_| group_seed)),
            |key| self.retained(key)
        );
        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
        result
    }

    /// Builds and pushes the next level, caching the hash of each retained key
    /// up front so that seed optimization does not rehash the keys; finally
    /// removes from `keys` the keys assigned by the new level.
    fn build_next_level_with_cache<KS, K>(&mut self, keys: &mut KS, level_size_groups: u64, level_size_segments: usize)
        where K: Hash + Sync, KS: KeySet<K> + Sync
    {
        let level_seed = self.level_nr();
        // Pre-compute the hashes of all still-retained keys.
        let key_hashes = keys.maybe_par_map_each_key(
            |k| self.conf.goconf.hash_builder.hash_one(k, level_seed),
            |key| self.retained(key),
            self.conf.use_multiple_threads
        );
        // Find the best array/seed combination over all group seeds.
        let (array, seeds) = if self.conf.use_multiple_threads {
            self.conf.best_array(|g| self.conf.goconf.build_array_for_hashes_mt(&key_hashes, level_size_segments, level_size_groups, g), level_size_groups)
        } else {
            self.conf.best_array(|g| self.conf.goconf.build_array_for_hashes(&key_hashes, level_size_segments, level_size_groups, g), level_size_groups)
        };
        // Retain only keys not assigned by the new level; the key set may use
        // either the index-based predicate (over cached hashes) or the key-based one.
        keys.maybe_par_retain_keys_with_indices(
            |i| !array.get_bit(
                self.conf.goconf.hash_index(key_hashes[i], level_size_groups,
                    |group| self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize))
            ),
            |key| !array.get_bit(
                self.conf.goconf.key_index(key, level_seed, level_size_groups,
                    |group| self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize))
            ),
            |key| self.retained(key),
            || array.count_bit_ones(),
            self.conf.use_multiple_threads
        );
        self.push(array, seeds, level_size_groups);
    }

    /// Builds levels until no keys remain, reporting progress to `stats`.
    ///
    /// Returns `true` on success. Returns `false` when 10 consecutive levels
    /// fail to reduce the number of remaining keys (which happens e.g. when
    /// `keys` contains duplicates); the 10 unproductive levels are discarded.
    fn build_levels<KS, K, BS>(&mut self, keys: &mut KS, stats: &mut BS) -> bool
        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
    {
        let mut levels_without_reduction = 0; let mut input_size = keys.keys_len();
        while input_size != 0 {
            // Level size derived from the number of remaining keys and relative_level_size (a percentage).
            let (level_size_groups, level_size_segments) = self.conf.goconf.bits_per_group.level_size_groups_segments(
                ceiling_div(input_size * self.conf.relative_level_size as usize, 100));
            stats.level(input_size, level_size_segments * 64);  // 64 bits per segment
            let level_size_groups = level_size_groups as u64;
            if input_size < self.conf.cache_threshold {
                // Small input: cache key hashes for the whole seed search.
                self.build_next_level_with_cache(keys, level_size_groups, level_size_segments);
            } else {
                // Large input: rehash the keys for each candidate group seed.
                let (array, seeds) = if self.conf.use_multiple_threads {
                    self.conf.best_array(|g| self.build_array_mt(keys, level_size_segments, level_size_groups, g), level_size_groups)
                } else {
                    self.conf.best_array(|g| self.build_array(keys, level_size_segments, level_size_groups, g), level_size_groups)
                };
                let level_nr = self.level_nr();
                // Drop the keys assigned by the new level.
                keys.maybe_par_retain_keys(
                    |key| {
                        let hash = self.conf.goconf.hash_builder.hash_one(key, level_nr);
                        let group = group_nr(hash, level_size_groups);
                        let bit_index = self.conf.goconf.bits_per_group.bit_index_for_seed(
                            hash,
                            self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize),
                            group);
                        !array.get_bit(bit_index)
                    },
                    |key| self.retained(key),
                    || array.count_bit_ones(),
                    self.conf.use_multiple_threads
                );
                self.push(array, seeds, level_size_groups);
            }
            let prev_input_size = input_size;
            input_size = keys.keys_len();
            if input_size == prev_input_size {
                levels_without_reduction += 1;
                if levels_without_reduction == 10 {
                    // Give up: remove the unproductive levels and report failure.
                    let len = self.arrays.len()-levels_without_reduction;
                    self.arrays.truncate(len);
                    self.group_seeds.truncate(len);
                    self.level_sizes.truncate(len);
                    stats.end(input_size);
                    return false;
                }
            } else {
                levels_without_reduction = 0;
            }
        }
        stats.end(0);
        return true;
    }

    /// Consumes the builder: concatenates the level arrays (adding rank support)
    /// and the per-level seed vectors into the final function.
    pub fn finish(self) -> GOFunction<GS, SS, S> {
        let (array, _) = ArrayWithRank::build(concat_level_arrays(self.arrays));
        let group_seeds_concatenated = self.conf.goconf.bits_per_seed.concatenate_seed_vecs(&self.level_sizes, self.group_seeds);
        GOFunction::<GS, SS, S> {
            array,
            group_seeds: group_seeds_concatenated,
            conf: self.conf.goconf,
            level_sizes: self.level_sizes.into_boxed_slice(),
        }
    }
}
459
/// Group-optimized fingerprinting perfect hash function (fmph GO):
/// maps each key of the set given during construction to a distinct
/// value in `0..len()`. Results for other keys are unspecified.
pub struct GOFunction<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    array: ArrayWithRank,   // concatenated bit arrays of all levels, with rank support
    // packed seeds of all groups; sizes (in groups) of the levels; shared configuration
    group_seeds: Box<[SS::VecElement]>, level_sizes: Box<[u64]>, conf: GOConf<GS, SS, S>
    }
472
473impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GetSize for GOFunction<GS, SS, S> {
474 fn size_bytes_dyn(&self) -> usize {
475 self.array.size_bytes_dyn()
476 + self.group_seeds.size_bytes_dyn()
478 + self.level_sizes.size_bytes_dyn()
479 }
480
481 const USES_DYN_MEM: bool = true;
482}
483
impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOFunction<GS, SS, S> {

    /// Returns the value assigned to `key`, collecting lookup statistics in `access_stats`.
    ///
    /// Walks the levels: at each level the key's hash selects a group, the group's
    /// seed selects a bit; the first set bit found yields the result (the bit's rank).
    /// Returns `None` when the levels are exhausted, which can only happen for a key
    /// outside the set used during construction (such a key may also yield an
    /// arbitrary `Some` result).
    pub fn get_stats<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> Option<u64> {
        let mut groups_before = 0u64;   // total number of groups at all earlier levels
        let mut level_nr = 0u32;        // current level; also the hash seed
        loop {
            let level_size_groups = *self.level_sizes.get(level_nr as usize)?;  // None => levels exhausted
            let hash = self.conf.hash_builder.hash_one(key, level_nr);
            let group = groups_before + group_nr(hash, level_size_groups);
            let seed = self.conf.bits_per_seed.get_seed(&self.group_seeds, group as usize);
            let bit_index = self.conf.bits_per_group.bit_index_for_seed(hash, seed, group);
            if self.array.content.get_bit(bit_index) {
                access_stats.found_on_level(level_nr);
                // SAFETY: get_bit succeeded at bit_index, so it is within the array.
                return Some(unsafe{self.array.rank_unchecked(bit_index)} as u64);
            }
            groups_before += level_size_groups;
            level_nr += 1;
        }
    }

    /// Returns the value assigned to `key`, or `None` for keys outside the
    /// construction set (see [`Self::get_stats`]).
    #[inline] pub fn get<K: Hash + ?Sized>(&self, key: &K) -> Option<u64> {
        self.get_stats(key, &mut ())
    }

    /// Like [`Self::get_stats`], but panics instead of returning `None`.
    #[inline] pub fn get_stats_or_panic<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> u64 {
        self.get_stats(key, access_stats).expect("Invalid access to an item outside the set given during construction.")
    }

    /// Like [`Self::get`], but panics instead of returning `None`.
    #[inline] pub fn get_or_panic<K: Hash + ?Sized>(&self, key: &K) -> u64 {
        self.get_stats_or_panic(key, &mut ())
    }

    /// Returns the number of bytes that [`Self::write`] writes.
    pub fn write_bytes(&self) -> usize {
        self.conf.bits_per_group.write_size_bytes()
        + VByte::array_size(&self.level_sizes)
        + AsIs::array_content_size(&self.array.content)
        // one extra byte — presumably the seed-size header emitted by
        // write_seed_vec (matches read_seed_vec) — TODO confirm
        + std::mem::size_of::<u8>() + self.group_seeds.size_bytes_content_dyn()
    }

    /// Writes `self` to `output`, in this order: group size, level sizes
    /// (VByte-encoded), raw array content, and the group-seed vector.
    pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
    {
        self.conf.bits_per_group.write(output)?;
        VByte::write_array(output, &self.level_sizes)?;
        AsIs::write_all(output, self.array.content.iter())?;
        self.conf.bits_per_seed.write_seed_vec(output, &self.group_seeds)
    }

    /// Reads a function (in the format produced by [`Self::write`]) from `input`,
    /// using the given `hash_builder` (which presumably must match the one used
    /// before writing — confirm with callers).
    pub fn read_with_hasher(input: &mut dyn io::Read, hash_builder: S) -> io::Result<Self>
    {
        let bits_per_group = GS::read(input)?;
        let level_size = VByte::read_array(input)?;
        // Total group count determines both the array length and the seed-vector length.
        let number_of_groups = level_size.iter().map(|v|*v as usize).sum::<usize>();

        let array_content = read_bits(input, bits_per_group * number_of_groups)?;
        let (array_with_rank, _) = ArrayWithRank::build(array_content);

        let (bits_per_group_seed, group_seeds) = SS::read_seed_vec(input, number_of_groups)?;

        Ok(Self {
            array: array_with_rank,
            group_seeds,
            level_sizes: level_size,
            conf: GOConf {
                bits_per_seed: bits_per_group_seed,
                bits_per_group,
                hash_builder
            },
        })
    }

    /// Returns the sizes (numbers of groups) of the successive levels.
    pub fn level_sizes(&self) -> &[u64] {
        &self.level_sizes
    }
}
585
586impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOFunction<GS, SS, S> {
587 pub fn try_with_conf_stats_or_partial<K, KS, BS>(mut keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Result<Self, (Self, KS, usize)>
601 where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
602 {
603 let mut builder = GOBuilder::new(conf);
604 let initial_size = keys.keys_len();
605 if builder.build_levels(&mut keys, stats) {
606 drop(keys);
607 Ok(builder.finish())
608 } else {
609 Err((builder.finish(), keys, initial_size))
610 }
611 }
612
613 pub fn try_with_conf_stats<K, KS, BS>(mut keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Option<Self>
619 where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
620 {
621 let mut builder = GOBuilder::new(conf);
622 builder.build_levels(&mut keys, stats).then(|| {
623 drop(keys);
624 builder.finish()
625 })
626 }
627
628 pub fn with_conf_stats<K, KS, BS>(keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
632 where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
633 {
634 Self::try_with_conf_stats(keys, conf, stats).expect("Constructing fmph::GOFunction failed. Probably the input contains duplicate keys.")
635 }
636
637 #[inline] pub fn with_conf<K, KS>(keys: KS, conf: GOBuildConf<GS, SS, S>) -> Self
643 where K: Hash + Sync, KS: KeySet<K> + Sync
644 {
645 Self::with_conf_stats(keys, conf, &mut ())
646 }
647
648 #[inline] pub fn from_slice_with_conf_stats<K, BS>(keys: &[K], conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
654 where K: Hash + Sync, BS: stats::BuildStatsCollector
655 {
656 Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, stats)
657 }
658
659 #[inline] pub fn from_slice_with_conf<K>(keys: &[K], conf: GOBuildConf<GS, SS, S>) -> Self
665 where K: Hash + Sync
666 {
667 Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, &mut ())
668 }
669
670 #[inline] pub fn from_slice_mut_with_conf_stats<K, BS>(keys: &mut [K], conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
677 where K: Hash + Sync, BS: stats::BuildStatsCollector
678 {
679 Self::with_conf_stats(SliceMutSource::new(keys), conf, stats)
680 }
681
682 #[inline] pub fn from_slice_mut_with_conf<K>(keys: &mut [K], conf: GOBuildConf<GS, SS, S>) -> Self
689 where K: Hash + Sync
690 {
691 Self::with_conf_stats(SliceMutSource::new(keys), conf, &mut ())
692 }
693}
694
695impl<GS: GroupSize + Sync, SS: SeedSize> GOFunction<GS, SS> {
696 pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
699 Self::read_with_hasher(input, Default::default())
700 }
701}
702
703impl GOFunction {
704 pub fn from_slice_with_stats<K, BS>(keys: &[K], stats: &mut BS) -> Self
710 where K: Hash + Sync, BS: stats::BuildStatsCollector
711 {
712 Self::from_slice_with_conf_stats(keys, Default::default(), stats)
713 }
714
715 pub fn from_slice<K: Hash + Sync>(keys: &[K]) -> Self {
721 Self::from_slice_with_conf_stats(keys, Default::default(), &mut ())
722 }
723
724 pub fn new<K: Hash + Sync, KS: KeySet<K> + Sync>(keys: KS) -> Self {
730 Self::with_conf_stats(keys, Default::default(), &mut ())
731 }
732}
733
734impl<GS: GroupSize, SS: SeedSize, S> GOFunction<GS, SS, S> {
735 pub fn len(&self) -> usize {
739 self.array.content.count_bit_ones()
740 }
741}
742
743impl<K: Hash + Sync> From<&[K]> for GOFunction {
744 fn from(keys: &[K]) -> Self {
745 Self::from_slice(keys)
746 }
747}
748
749impl<K: Hash + Sync + Send> From<Vec<K>> for GOFunction {
750 fn from(keys: Vec<K>) -> Self {
751 Self::new(keys)
752 }
753}
754
#[cfg(test)]
mod tests {
    use super::*;
    use crate::fmph::function::tests::{test_mphf, test_mphf_iter};
    use crate::fmph::TwoToPowerBits;
    use std::fmt::{Debug, Display};
    use crate::seeds::Bits;

    /// Serializes `h`, checks that `write_bytes` predicts the serialized size
    /// exactly, reads the function back and compares all fields.
    fn test_read_write<GS: GroupSize + Sync, SS: SeedSize>(h: &GOFunction<GS, SS>)
        where SS::VecElement: std::cmp::PartialEq + Debug
    {
        let mut buff = Vec::new();
        h.write(&mut buff).unwrap();
        assert_eq!(buff.len(), h.write_bytes());
        let read = GOFunction::<GS, SS>::read(&mut &buff[..]).unwrap();
        assert_eq!(h.level_sizes, read.level_sizes);
        assert_eq!(h.array.content, read.array.content);
        assert_eq!(h.group_seeds, read.group_seeds);
    }

    /// Checks structural invariants: the bit array holds exactly `bits_per_group`
    /// bits per group, and the seed vector has the expected packed length.
    fn test_hash2_invariants<GS: GroupSize, SS: SeedSize>(h: &GOFunction<GS, SS>) {
        let number_of_groups = h.level_sizes.iter().map(|v| *v as usize).sum::<usize>();
        assert_eq!(h.conf.bits_per_group * number_of_groups, h.array.content.len() * 64);
        assert_eq!(ceiling_div(number_of_groups * h.conf.bits_per_seed.into() as usize, 64), h.group_seeds.len());
    }

    /// Builds a single-threaded function over `to_hash` (3-bit seeds, given group
    /// size) and verifies perfect hashing, invariants, serialization and length.
    fn test_with_input<K: Hash + Clone + Display + Sync>(to_hash: &[K], bits_per_group: impl GroupSize + Sync) {
        let goconf = GOConf::bps_bpg(Bits(3), bits_per_group);
        let h = GOFunction::from_slice_with_conf(to_hash, GOBuildConf::with_mt(goconf, false));
        test_mphf(to_hash, |key| h.get(key));
        test_hash2_invariants(&h);
        test_read_write(&h);
        assert_eq!(h.len(), to_hash.len());
    }

    // Small inputs with every power-of-two group size from 2^0 to 2^5 bits.
    #[test]
    fn test_small_powers_of_two() {
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(5));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(4));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(3));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(2));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(1));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(0));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(5));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(4));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(3));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(2));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(1));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(0));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(5));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(4));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(3));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(2));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(1));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(0));
    }

    // Small inputs with assorted non-power-of-two group sizes.
    #[test]
    fn test_small_bits() {
        test_with_input(&[1, 2, 5], Bits(3));
        test_with_input(&[1, 2, 5], Bits(5));
        test_with_input(&[1, 2, 5], Bits(20));
        test_with_input(&[1, 2, 5], Bits(60));
        test_with_input(&[1, 2, 5], Bits(63));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(3));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(5));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(20));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(60));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(63));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(3));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(5));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(20));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(60));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(63));
    }

    // A 4000-key input over many group sizes, both kinds.
    #[test]
    fn test_medium() {
        let keys: Vec<_> = (-2000..2000).map(|v| 3*v).collect();
        test_with_input(&keys, TwoToPowerBits::new(5));
        test_with_input(&keys, TwoToPowerBits::new(4));
        test_with_input(&keys, TwoToPowerBits::new(3));
        test_with_input(&keys, TwoToPowerBits::new(2));
        test_with_input(&keys, TwoToPowerBits::new(1));
        test_with_input(&keys, TwoToPowerBits::new(0));
        test_with_input(&keys, Bits(3));
        test_with_input(&keys, Bits(5));
        test_with_input(&keys, Bits(10));
        test_with_input(&keys, Bits(13));
        test_with_input(&keys, Bits(20));
        test_with_input(&keys, Bits(27));
        test_with_input(&keys, Bits(33));
        test_with_input(&keys, Bits(45));
        test_with_input(&keys, Bits(50));
        test_with_input(&keys, Bits(55));
        test_with_input(&keys, Bits(60));
        test_with_input(&keys, Bits(63));
    }

    // Checks that the predefined configurations stay within their expected
    // bits-per-key budgets for a 40000-key input.
    #[test]
    fn test_large_size() {
        let keys = (-20000..20000).collect::<Vec<_>>();
        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_biggest().into()).size_bytes() as f64 * (8.0/40000.0) < 2.57);
        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_bigger().into()).size_bytes() as f64 * (8.0/40000.0) < 2.4);
        assert!(GOFunction::from(&keys[..]).size_bytes() as f64 * (8.0/40000.0) < 2.26);
        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_smallest().into()).size_bytes() as f64 * (8.0/40000.0) < 2.15);
    }

    // Builds for 5 billion keys to exercise indices beyond u32 range;
    // run manually (ignored by default because of memory/time cost).
    #[test]
    #[ignore = "uses much memory and time"]
    fn test_fmphgo_for_over_2to32_keys() {
        const LEN: u64 = 5_000_000_000;
        let f = GOFunction::with_conf_stats(
            crate::fmph::keyset::CachedKeySet::dynamic(|| 0..LEN, usize::MAX),
            GOConf::default_biggest().into(),
            &mut crate::stats::BuildStatsPrinter::stdout());
        test_mphf_iter(LEN as usize, 0..LEN, |key| f.get(key));
        assert!(f.size_bytes() as f64 * (8.0/LEN as f64) < 2.57);
    }

    // Inputs with duplicate keys must make construction fail (return None).
    #[test]
    fn test_duplicates() {
        assert!(GOFunction::try_with_conf_stats(vec![1, 1], Default::default(), &mut ()).is_none());
        assert!(GOFunction::try_with_conf_stats(vec![1, 2, 3, 1, 4], Default::default(), &mut ()).is_none());
    }

    // Partial construction must expose exactly the unassigned (duplicated) keys
    // and yield a valid function over the remaining ones.
    #[test]
    fn test_duplicates_partial() {
        let keys = vec![1, 2, 3, 1, 4];
        let expected_initial_len = keys.len();
        let r = GOFunction::try_with_conf_stats_or_partial(keys, Default::default(), &mut ());
        if let Err((mphf, mut remaining, initial_len)) = r {
            assert_eq!(initial_len, expected_initial_len);
            remaining.sort();
            assert_eq!(remaining, vec![1, 1]);
            test_mphf(&[2, 3, 4], |key| mphf.get(key));
        } else {
            assert!(false)
        }
    }
}