use std::hash::Hash;
use binout::{VByte, Serializer, AsIs};
use bitm::{BitAccess, Rank, ceiling_div};

use crate::seeds::{Bits8, SeedSize, TwoToPowerBitsStatic};
use crate::utils::{ArrayWithRank, read_bits};
use crate::{BuildDefaultSeededHasher, BuildSeededHasher, stats};

use super::function::{concat_level_arrays, from_mut_slice, get_mut_slice, level_array_for, LevelArray};
use super::goindexing::GroupSize;
use std::io;
use std::sync::atomic::AtomicU64;
use dyn_size_of::GetSize;
use crate::fmph::function::{fphash_add_bit, fphash_remove_collided, fphash_sync_add_bit};
use crate::fmph::goindexing::group_nr;

use rayon::prelude::*;
use crate::fmph::keyset::{KeySet, SliceMutSource, SliceSourceWithRefs};

/// Configuration accepted by group-optimized hash function ([`GOFunction`]) constructors.
#[derive(Clone)]
pub struct GOConf<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    /// Family of (seeded) hash functions used to hash the keys.
    pub hash_builder: S,
    /// Number of bits used to store the seed of each group.
    pub bits_per_seed: SS,
    /// Number of bits in each group of the fingerprint array.
    pub bits_per_group: GS
}

impl GOConf<TwoToPowerBitsStatic::<3>, TwoToPowerBitsStatic::<0>, BuildDefaultSeededHasher> {
    /// Returns the configuration with 8-bit groups and 1-bit group seeds;
    /// yields the biggest functions of the provided defaults.
    pub fn default_biggest() -> Self {
        Self {
            hash_builder: Default::default(),
            bits_per_seed: Default::default(),
            bits_per_group: Default::default()
        }
    }
}

impl GOConf<TwoToPowerBitsStatic::<4>, TwoToPowerBitsStatic::<1>, BuildDefaultSeededHasher> {
    /// Returns the configuration with 16-bit groups and 2-bit group seeds.
    pub fn default_bigger() -> Self {
        Self {
            hash_builder: Default::default(),
            bits_per_seed: Default::default(),
            bits_per_group: Default::default()
        }
    }
}

impl Default for GOConf {
    /// Returns the default configuration: 16-bit groups and 4-bit group seeds.
    fn default() -> Self {
        Self {
            hash_builder: Default::default(),
            bits_per_seed: Default::default(),
            bits_per_group: Default::default()
        }
    }
}

impl GOConf<TwoToPowerBitsStatic::<5>, Bits8, BuildDefaultSeededHasher> {
    /// Returns the configuration with 32-bit groups and 8-bit group seeds;
    /// yields the smallest functions of the provided defaults.
    pub fn default_smallest() -> Self {
        Self {
            hash_builder: Default::default(),
            bits_per_seed: Default::default(),
            bits_per_group: Default::default()
        }
    }
}

impl<GS: GroupSize, SS: SeedSize> GOConf<GS, SS> {
    /// Returns a configuration with the given `bits_per_seed` and `bits_per_group`,
    /// and the default hasher.
    pub fn bps_bpg(bits_per_seed: SS, bits_per_group: GS) -> Self {
        Self {
            hash_builder: Default::default(),
            bits_per_seed,
            bits_per_group,
        }
    }
}

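// A construction sketch under stated assumptions: `Bits` and `TwoToPowerBits`
// are the runtime-sized seed/group sizes also used by the tests at the end of
// this file; the sizes chosen here (4-bit seeds, 16-bit groups) are arbitrary,
// for illustration only.
#[cfg(test)]
mod conf_usage_sketch {
    use super::*;
    use crate::seeds::Bits;
    use crate::fmph::TwoToPowerBits;

    #[test]
    fn build_with_custom_conf() {
        // 4-bit seeds per group, 16-bit groups, default hasher:
        let conf = GOConf::bps_bpg(Bits(4), TwoToPowerBits::new(4));
        let f = GOFunction::from_slice_with_conf(&[10u32, 20, 30], GOBuildConf::from(conf));
        assert_eq!(f.len(), 3); // all three keys were assigned values in 0..3
    }
}
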
impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOConf<GS, SS, S> {
    /// Panics if `bits_per_seed` or `bits_per_group` is invalid.
    pub fn validate(&self) {
        self.bits_per_seed.validate().unwrap();
        self.bits_per_group.validate().unwrap();
    }

    /// Returns a configuration with the given `hash_builder`, `bits_per_seed` and `bits_per_group`.
    pub fn hash_bps_bpg(hash_builder: S, bits_per_seed: SS, bits_per_group: GS) -> Self {
        Self { hash_builder, bits_per_seed, bits_per_group }
    }

    /// Returns the index of the bit assigned to the key with the given `hash`,
    /// in a level with `level_size_groups` groups whose seeds are exposed by `group_seed`.
    #[inline(always)] pub fn hash_index<GetGroupSeed>(&self, hash: u64, level_size_groups: usize, group_seed: GetGroupSeed) -> usize
        where GetGroupSeed: FnOnce(usize) -> u16
    {
        let group = group_nr(hash, level_size_groups);
        self.bits_per_group.bit_index_for_seed(hash, group_seed(group), group)
    }

    /// Returns the index of the bit assigned to `key`, hashed with `level_seed`,
    /// in a level with `level_size_groups` groups whose seeds are exposed by `group_seed`.
    #[inline(always)] pub fn key_index<GetGroupSeed, K>(&self, key: &K, level_seed: u64, level_size_groups: usize, group_seed: GetGroupSeed) -> usize
        where GetGroupSeed: FnOnce(usize) -> u16, K: Hash
    {
        self.hash_index(self.hash_builder.hash_one(key, level_seed), level_size_groups, group_seed)
    }

    /// Builds the fingerprint array of a level for the given (pre-computed) `key_hashes`,
    /// using the same seed, `group_seed`, for all groups; bits hit by multiple keys are cleared.
    fn build_array_for_hashes(&self, key_hashes: &[u64], level_size_segments: usize, level_size_groups: usize, group_seed: u16) -> LevelArray
    {
        let mut result = level_array_for(level_size_segments);
        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
        for hash in key_hashes {
            fphash_add_bit(&mut result, &mut collision, self.hash_index(*hash, level_size_groups, |_| group_seed));
        }
        fphash_remove_collided(&mut result, &collision);
        result
    }
}

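// A minimal sketch (hypothetical, single 64-bit word) of the collision-marking
// scheme used by `build_array_for_hashes` above: the first key to hit a bit
// sets it, a later hit marks the position as collided, and collided positions
// are cleared at the end, so a surviving one means "exactly one key maps here".
#[cfg(test)]
mod collision_scheme_sketch {
    fn add_bit(result: &mut u64, collision: &mut u64, bit: u32) {
        let mask = 1u64 << bit;
        *collision |= *result & mask; // a second (or later) hit marks a collision
        *result |= mask;
    }

    #[test]
    fn one_word_example() {
        let (mut result, mut collision) = (0u64, 0u64);
        add_bit(&mut result, &mut collision, 3); // first key -> bit 3
        add_bit(&mut result, &mut collision, 7); // second key -> bit 7
        add_bit(&mut result, &mut collision, 3); // third key collides at bit 3
        result &= !collision; // the counterpart of fphash_remove_collided
        assert_eq!(result, 1 << 7); // only the uncollided position survives
    }
}
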
impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOConf<GS, SS, S> {
    /// Multi-threaded version of `build_array_for_hashes`.
    fn build_array_for_hashes_mt(&self, key_hashes: &[u64], level_size_segments: usize, level_size_groups: usize, group_seed: u16) -> LevelArray
    {
        let mut result = level_array_for(level_size_segments);
        let result_atom = from_mut_slice(&mut result);
        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
        key_hashes.par_iter().for_each(
            |hash| fphash_sync_add_bit(&result_atom, &collision, self.hash_index(*hash, level_size_groups, |_| group_seed))
        );
        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
        result
    }
}

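// Note: the multi-threaded variant above realizes the same add-bit/collision
// scheme with atomic operations (`fphash_sync_add_bit` on `AtomicU64` words),
// so the hashes of a level can be processed by many rayon threads at once.
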
/// Build configuration for group-optimized hash functions ([`GOFunction`]).
#[derive(Clone)]
pub struct GOBuildConf<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    /// Levels built for fewer keys than this threshold pre-compute and cache the key hashes.
    pub cache_threshold: usize,

    /// Size of each level, given as a percentage of the number of keys the level is built for
    /// (100 gives about one bit per key).
    pub relative_level_size: u16,

    /// Whether to use multiple (rayon) threads during construction.
    pub use_multiple_threads: bool,

    /// Configuration of the function being built.
    pub goconf: GOConf<GS, SS, S>,
}

impl Default for GOBuildConf {
    fn default() -> Self { Self::new(Default::default()) }
}

impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> From<GOConf<GS, SS, S>> for GOBuildConf<GS, SS, S> {
    #[inline] fn from(value: GOConf<GS, SS, S>) -> Self { Self::new(value) }
}

impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOBuildConf<GS, SS, S>
{
    /// Default value of the [`cache_threshold`](Self::cache_threshold) field.
    pub const DEFAULT_CACHE_THRESHOLD: usize = 1024*1024*128;

    /// Returns a configuration with the given `goconf`, `relative_level_size`,
    /// `cache_threshold` and `use_multiple_threads`.
    pub fn with_lsize_ct_mt(goconf: GOConf<GS, SS, S>, relative_level_size: u16, cache_threshold: usize, use_multiple_threads: bool) -> Self {
        Self {
            cache_threshold,
            relative_level_size,
            use_multiple_threads,
            goconf
        }
    }

    /// Returns a configuration with the given `goconf` and the default values of the remaining fields.
    pub fn new(goconf: GOConf<GS, SS, S>) -> Self {
        Self::with_lsize_ct_mt(goconf, 100, Self::DEFAULT_CACHE_THRESHOLD, true)
    }

    /// Returns a configuration with the given `goconf` and `cache_threshold`.
    pub fn with_ct(goconf: GOConf<GS, SS, S>, cache_threshold: usize) -> Self {
        Self::with_lsize_ct_mt(goconf, 100, cache_threshold, true)
    }

    /// Returns a configuration with the given `goconf`, `relative_level_size` and `cache_threshold`.
    pub fn with_lsize_ct(goconf: GOConf<GS, SS, S>, relative_level_size: u16, cache_threshold: usize) -> Self {
        Self::with_lsize_ct_mt(goconf, relative_level_size, cache_threshold, true)
    }

    /// Returns a configuration with the given `goconf` and `relative_level_size`.
    pub fn with_lsize(goconf: GOConf<GS, SS, S>, relative_level_size: u16) -> Self {
        Self::with_lsize_ct_mt(goconf, relative_level_size, Self::DEFAULT_CACHE_THRESHOLD, true)
    }

    /// Returns a configuration with the given `goconf`, `relative_level_size` and `use_multiple_threads`.
    pub fn with_lsize_mt(goconf: GOConf<GS, SS, S>, relative_level_size: u16, use_multiple_threads: bool) -> Self {
        Self::with_lsize_ct_mt(goconf, relative_level_size, Self::DEFAULT_CACHE_THRESHOLD, use_multiple_threads)
    }

    /// Returns a configuration with the given `goconf` and `use_multiple_threads`.
    pub fn with_mt(goconf: GOConf<GS, SS, S>, use_multiple_threads: bool) -> Self {
        Self::with_lsize_ct_mt(goconf, 100, Self::DEFAULT_CACHE_THRESHOLD, use_multiple_threads)
    }

    /// Returns the largest group seed value representable in `bits_per_seed` bits.
    #[inline(always)] fn last_seed(&self) -> u16 { ((1u32 << self.goconf.bits_per_seed.into())-1) as u16 }

    /// For each group, copies its fragment of `array` into `best_array` (recording
    /// the seed exposed by `array_seed`) whenever the fragment of `array` is better.
    fn update_best<GetGroupSeed>(&self, level_size_groups: usize, best_array: &mut [u64], best_seeds: &mut [SS::VecElement], array: &[u64], array_seed: GetGroupSeed)
        where GetGroupSeed: Fn(usize) -> u16
    {
        for group_index in 0..level_size_groups {
            self.goconf.bits_per_group.copy_group_if_better(best_array, array, group_index,
                || unsafe{ self.goconf.bits_per_seed.set_seed(best_seeds, group_index, array_seed(group_index)) }
            )
        }
    }

    /// Builds the level array with `build_for_group` once per possible group seed and
    /// returns, group by group, the best fragments together with the winning seeds.
    #[inline(always)]
    fn best_array<AB>(&self, build_for_group: AB, level_size_groups: usize) -> (LevelArray, Box<[SS::VecElement]>)
        where AB: Fn(u16) -> LevelArray
    {
        let mut best_array = build_for_group(0);
        let mut best_seeds = self.goconf.bits_per_seed.new_zeroed_seed_vec(level_size_groups);
        for group_seed in 1..=self.last_seed() {
            let with_new_seed = build_for_group(group_seed);
            self.update_best(level_size_groups, &mut best_array, &mut best_seeds, &with_new_seed, |_| group_seed);
        }
        (best_array, best_seeds)
    }
}

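// A simplified sketch (hypothetical, one-byte groups) of the per-group seed
// selection done by `best_array`/`update_best` above: among the arrays built
// with each candidate seed, every group keeps the variant that placed the most
// keys, i.e. the fragment with the most surviving (uncollided) ones.
#[cfg(test)]
mod best_seed_sketch {
    #[test]
    fn keeps_the_denser_group_fragment() {
        // group fragments built with seeds 0, 1 and 2 (a single group here):
        let candidates: [u8; 3] = [0b0010_0000, 0b0101_0100, 0b0001_1000];
        let (mut best, mut best_seed) = (candidates[0], 0u16);
        for (seed, &fragment) in candidates.iter().enumerate().skip(1) {
            if fragment.count_ones() > best.count_ones() { // this seed places more keys
                best = fragment;
                best_seed = seed as u16;
            }
        }
        assert_eq!((best, best_seed), (0b0101_0100, 1));
    }
}
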
/// Builds the levels (fingerprint arrays and group seeds) of a [`GOFunction`].
pub struct GOBuilder<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    level_sizes: Vec::<usize>,
    arrays: Vec::<LevelArray>,
    group_seeds: Vec::<Box<[SS::VecElement]>>,
    conf: GOBuildConf<GS, SS, S>
}

impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOBuilder<GS, SS, S>
{
    pub fn new(mut conf: GOBuildConf<GS, SS, S>) -> Self {
        conf.goconf.validate();
        if conf.use_multiple_threads { conf.use_multiple_threads = rayon::current_num_threads() > 1; }
        Self {
            level_sizes: Vec::<usize>::new(),
            arrays: Vec::<LevelArray>::new(),
            group_seeds: Vec::<Box<[SS::VecElement]>>::new(),
            conf
        }
    }

    /// Appends a built level.
    fn push(&mut self, array: LevelArray, seeds: Box<[SS::VecElement]>, size_groups: usize) {
        self.arrays.push(array);
        self.group_seeds.push(seeds);
        self.level_sizes.push(size_groups);
    }

    /// Returns the number of levels built so far.
    #[inline(always)] fn level_nr(&self) -> usize { self.level_sizes.len() }

    /// Returns whether `key` is still unassigned by the levels built so far.
    fn retained<K>(&self, key: &K) -> bool where K: Hash {
        self.arrays.iter().zip(self.group_seeds.iter()).zip(self.level_sizes.iter()).enumerate()
            .all(|(level_seed, ((a, seeds), level_size_groups))| {
                !a.get_bit(self.conf.goconf.key_index(key, level_seed as u64, *level_size_groups,
                    |group| unsafe{ self.conf.goconf.bits_per_seed.get_seed(seeds, group as usize) }))
            })
    }

    /// Builds the fingerprint array of the next level for the retained `keys`,
    /// using the seed `group_seed` for all groups.
    #[inline(always)]
    fn build_array<KS, K>(&self, keys: &KS, level_size_segments: usize, level_size_groups: usize, group_seed: u16) -> LevelArray
        where K: Hash, KS: KeySet<K>
    {
        let mut result = level_array_for(level_size_segments);
        let mut collision = vec![0u64; level_size_segments].into_boxed_slice();
        let level_seed = self.level_nr() as u64;
        keys.for_each_key(|key| fphash_add_bit(&mut result, &mut collision, self.conf.goconf.key_index(key, level_seed, level_size_groups, |_| group_seed)),
            |key| self.retained(key));
        fphash_remove_collided(&mut result, &collision);
        result
    }

    /// Multi-threaded version of [`Self::build_array`]; falls back to the
    /// single-threaded version if `keys` does not support parallel iteration.
    #[inline(always)]
    fn build_array_mt<KS, K>(&self, keys: &KS, level_size_segments: usize, level_size_groups: usize, group_seed: u16) -> LevelArray
        where K: Hash, KS: KeySet<K>
    {
        if !keys.has_par_for_each_key() {
            return self.build_array(keys, level_size_segments, level_size_groups, group_seed);
        }
        let mut result = level_array_for(level_size_segments);
        let result_atom = from_mut_slice(&mut result);
        let mut collision: Box<[AtomicU64]> = (0..level_size_segments).map(|_| AtomicU64::default()).collect();
        let level_seed = self.level_nr() as u64;
        keys.par_for_each_key(
            |key| fphash_sync_add_bit(&result_atom, &collision, self.conf.goconf.key_index(key, level_seed, level_size_groups, |_| group_seed)),
            |key| self.retained(key)
        );
        fphash_remove_collided(&mut result, get_mut_slice(&mut collision));
        result
    }

    /// Builds the next level for `keys`, pre-computing and caching their hashes;
    /// used when few enough keys remain (see [`GOBuildConf::cache_threshold`]).
    fn build_next_level_with_cache<KS, K>(&mut self, keys: &mut KS, level_size_groups: usize, level_size_segments: usize)
        where K: Hash + Sync, KS: KeySet<K> + Sync
    {
        let level_seed = self.level_nr() as u64;
        let key_hashes = keys.maybe_par_map_each_key(
            |k| self.conf.goconf.hash_builder.hash_one(k, level_seed),
            |key| self.retained(key),
            self.conf.use_multiple_threads
        );
        let (array, seeds) = if self.conf.use_multiple_threads {
            self.conf.best_array(|g| self.conf.goconf.build_array_for_hashes_mt(&key_hashes, level_size_segments, level_size_groups, g), level_size_groups)
        } else {
            self.conf.best_array(|g| self.conf.goconf.build_array_for_hashes(&key_hashes, level_size_segments, level_size_groups, g), level_size_groups)
        };
        keys.maybe_par_retain_keys_with_indices(
            |i| !array.get_bit(
                self.conf.goconf.hash_index(key_hashes[i], level_size_groups,
                    |group| unsafe{ self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize) })
            ),
            |key| !array.get_bit(
                self.conf.goconf.key_index(key, level_seed, level_size_groups,
                    |group| unsafe{ self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize) })
            ),
            |key| self.retained(key),
            || array.count_bit_ones(),
            self.conf.use_multiple_threads
        );
        self.push(array, seeds, level_size_groups);
    }

    /// Builds all levels for `keys`, reporting statistics to `stats`.
    /// Returns `false` (discarding the unproductive levels) if 10 consecutive
    /// levels fail to assign any key, which indicates duplicate keys.
    fn build_levels<KS, K, BS>(&mut self, keys: &mut KS, stats: &mut BS) -> bool
        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
    {
        let mut levels_without_reduction = 0;
        let mut input_size = keys.keys_len();
        while input_size != 0 {
            let (level_size_groups, level_size_segments) = self.conf.goconf.bits_per_group.level_size_groups_segments(
                ceiling_div(input_size * self.conf.relative_level_size as usize, 100));
            stats.level(input_size, level_size_segments * 64);
            if input_size < self.conf.cache_threshold {
                self.build_next_level_with_cache(keys, level_size_groups, level_size_segments);
            } else {
                let (array, seeds) = if self.conf.use_multiple_threads {
                    self.conf.best_array(|g| self.build_array_mt(keys, level_size_segments, level_size_groups, g), level_size_groups)
                } else {
                    self.conf.best_array(|g| self.build_array(keys, level_size_segments, level_size_groups, g), level_size_groups)
                };
                let level_nr = self.level_nr();
                keys.maybe_par_retain_keys(
                    |key| {
                        let hash = self.conf.goconf.hash_builder.hash_one(key, level_nr as u64);
                        let group = group_nr(hash, level_size_groups);
                        let bit_index = self.conf.goconf.bits_per_group.bit_index_for_seed(
                            hash,
                            unsafe{ self.conf.goconf.bits_per_seed.get_seed(&seeds, group as usize) },
                            group);
                        !array.get_bit(bit_index)
                    },
                    |key| self.retained(key),
                    || array.count_bit_ones(),
                    self.conf.use_multiple_threads
                );
                self.push(array, seeds, level_size_groups);
            }
            let prev_input_size = input_size;
            input_size = keys.keys_len();
            if input_size == prev_input_size {
                levels_without_reduction += 1;
                if levels_without_reduction == 10 {
                    // drop the 10 unproductive levels and give up
                    let len = self.arrays.len() - levels_without_reduction;
                    self.arrays.truncate(len);
                    self.group_seeds.truncate(len);
                    self.level_sizes.truncate(len);
                    stats.end(input_size);
                    return false;
                }
            } else {
                levels_without_reduction = 0;
            }
        }
        stats.end(0);
        true
    }

    /// Consumes the builder and returns the constructed [`GOFunction`].
    pub fn finish(self) -> GOFunction<GS, SS, S> {
        let (array, _) = ArrayWithRank::build(concat_level_arrays(self.arrays));
        let group_seeds_concatenated = self.conf.goconf.bits_per_seed.concatenate_seed_vecs(|| self.level_sizes.iter().copied(), self.group_seeds);
        GOFunction::<GS, SS, S> {
            array,
            group_seeds: group_seeds_concatenated,
            conf: self.conf.goconf,
            level_sizes: self.level_sizes.into_boxed_slice(),
        }
    }
}

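// A worked sketch of the level-sizing arithmetic in `build_levels` above:
// `relative_level_size` is a percentage, so a level built for n keys gets about
// n * relative_level_size / 100 bits (rounded up, and then rounded further to
// whole groups and 64-bit segments by `level_size_groups_segments`).
#[cfg(test)]
mod level_size_sketch {
    use bitm::ceiling_div;

    #[test]
    fn percent_of_remaining_keys() {
        let input_size = 1000usize;
        assert_eq!(ceiling_div(input_size * 100, 100), 1000); // default: ~1 bit per key
        assert_eq!(ceiling_div(input_size * 150, 100), 1500); // 150%: ~1.5 bits per key
    }
}
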
/// Fingerprinting-based minimal perfect hash function with group optimization (FMPHGO).
pub struct GOFunction<GS: GroupSize = TwoToPowerBitsStatic::<4>, SS: SeedSize = TwoToPowerBitsStatic<2>, S = BuildDefaultSeededHasher> {
    /// Concatenated fingerprint arrays of all levels, with rank support.
    array: ArrayWithRank,
    /// Concatenated group seeds of all levels.
    group_seeds: Box<[SS::VecElement]>,
    /// Number of groups in each level.
    level_sizes: Box<[usize]>,
    /// Configuration: the hasher and the bit sizes.
    conf: GOConf<GS, SS, S>
}

impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GetSize for GOFunction<GS, SS, S> {
    fn size_bytes_dyn(&self) -> usize {
        self.array.size_bytes_dyn()
            + self.group_seeds.size_bytes_dyn()
            + self.level_sizes.size_bytes_dyn()
    }

    const USES_DYN_MEM: bool = true;
}

impl<GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOFunction<GS, SS, S> {

    /// Gets the value associated with `key`, reporting level accesses to `access_stats`.
    /// The returned value is smaller than the number of input keys. Returns [`None`]
    /// only if `key` was not in the input collection given during construction;
    /// for some such keys an undetermined value can be returned instead.
    #[inline(always)] pub fn get_stats<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> Option<u64> {
        let mut groups_before = 0usize;
        let mut level_nr = 0usize;
        loop {
            let level_size_groups = *self.level_sizes.get(level_nr)?;
            let hash = self.conf.hash_builder.hash_one(key, level_nr as u64);
            let group = groups_before + group_nr(hash, level_size_groups);
            let seed = unsafe{ self.conf.bits_per_seed.get_seed(&self.group_seeds, group) };
            let bit_index = self.conf.bits_per_group.bit_index_for_seed(hash, seed, group);
            if self.array.content.get_bit(bit_index) {
                access_stats.found_on_level(level_nr);
                return Some(unsafe{ self.array.rank_unchecked(bit_index) } as u64);
            }
            groups_before += level_size_groups;
            level_nr += 1;
        }
    }

    /// Gets the value associated with `key`; see [`Self::get_stats`].
    #[inline(always)] pub fn get<K: Hash + ?Sized>(&self, key: &K) -> Option<u64> {
        self.get_stats(key, &mut ())
    }

    /// Version of [`Self::get_stats`] that panics instead of returning [`None`].
    #[inline(always)] pub fn get_stats_or_panic<K: Hash + ?Sized, A: stats::AccessStatsCollector>(&self, key: &K, access_stats: &mut A) -> u64 {
        self.get_stats(key, access_stats).expect("Invalid access to an item outside the set given during construction.")
    }

    /// Version of [`Self::get`] that panics instead of returning [`None`].
    #[inline(always)] pub fn get_or_panic<K: Hash + ?Sized>(&self, key: &K) -> u64 {
        self.get_stats_or_panic(key, &mut ())
    }

    /// Returns the number of bytes that [`Self::write`] writes.
    pub fn write_bytes(&self) -> usize {
        self.conf.bits_per_group.write_size_bytes()
            + VByte::array_size(&self.level_sizes)
            + AsIs::array_content_size(&self.array.content)
            + std::mem::size_of::<u8>() + self.group_seeds.size_bytes_content_dyn()
    }

    /// Writes `self` to `output`; the hasher is not serialized.
    pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
    {
        self.conf.bits_per_group.write(output)?;
        VByte::write_array(output, &self.level_sizes)?;
        AsIs::write_all(output, self.array.content.iter())?;
        self.conf.bits_per_seed.write_seed_vec(output, &self.group_seeds)
    }

    /// Reads a function (previously written by [`Self::write`]) from `input`,
    /// pairing it with the given `hash_builder` (which must equal the one used
    /// during construction).
    pub fn read_with_hasher(input: &mut dyn io::Read, hash_builder: S) -> io::Result<Self>
    {
        let bits_per_group = GS::read(input)?;
        let level_size = VByte::read_array(input)?;
        let number_of_groups = level_size.iter().map(|v| *v as usize).sum::<usize>();

        let array_content = read_bits(input, bits_per_group * number_of_groups)?;
        let (array_with_rank, _) = ArrayWithRank::build(array_content);

        let (bits_per_group_seed, group_seeds) = SS::read_seed_vec(input, number_of_groups)?;

        Ok(Self {
            array: array_with_rank,
            group_seeds,
            level_sizes: level_size,
            conf: GOConf {
                bits_per_seed: bits_per_group_seed,
                bits_per_group,
                hash_builder
            },
        })
    }

    /// Returns the number of groups in each level.
    pub fn level_sizes(&self) -> &[usize] {
        &self.level_sizes
    }
}

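// An evaluation sketch (hypothetical, stand-alone model of `get_stats` above):
// the key is hashed per level until a level whose fingerprint bit is set is
// found; the value is then the rank (number of preceding ones) of that bit in
// the concatenation of all level arrays, which is what makes the assigned
// values distinct and contiguous over the input keys.
#[cfg(test)]
mod retrieval_sketch {
    #[test]
    fn value_is_rank_of_the_selected_bit() {
        // concatenated fingerprint array of all levels, modeled as booleans
        let array = [true, false, true, true, false];
        let bit_index = 3; // the first level hashed this key to a set bit here
        let value = array[..bit_index].iter().filter(|&&b| b).count();
        assert_eq!(value, 2); // distinct set bits get distinct ranks 0, 1, 2, ...
    }
}
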
impl<GS: GroupSize + Sync, SS: SeedSize, S: BuildSeededHasher + Sync> GOFunction<GS, SS, S> {
    /// Builds the function for the given `keys`, with the configuration `conf`,
    /// reporting statistics to `stats`. On failure (which has a non-negligible
    /// probability only if `keys` contains duplicates), returns the partial
    /// function, the key set with the unassigned keys retained, and the initial
    /// number of keys.
    pub fn try_with_conf_stats_or_partial<K, KS, BS>(mut keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Result<Self, (Self, KS, usize)>
        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
    {
        let mut builder = GOBuilder::new(conf);
        let initial_size = keys.keys_len();
        if builder.build_levels(&mut keys, stats) {
            drop(keys);
            Ok(builder.finish())
        } else {
            Err((builder.finish(), keys, initial_size))
        }
    }

    /// Builds the function for the given `keys`, with the configuration `conf`,
    /// reporting statistics to `stats`. Returns [`None`] on failure.
    pub fn try_with_conf_stats<K, KS, BS>(mut keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Option<Self>
        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
    {
        let mut builder = GOBuilder::new(conf);
        builder.build_levels(&mut keys, stats).then(|| {
            drop(keys);
            builder.finish()
        })
    }

    /// Builds the function for the given `keys`, with the configuration `conf`,
    /// reporting statistics to `stats`. Panics on failure.
    pub fn with_conf_stats<K, KS, BS>(keys: KS, conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
        where K: Hash + Sync, KS: KeySet<K> + Sync, BS: stats::BuildStatsCollector
    {
        Self::try_with_conf_stats(keys, conf, stats).expect("Constructing fmph::GOFunction failed. Probably the input contains duplicate keys.")
    }

    /// Builds the function for the given `keys`, with the configuration `conf`.
    #[inline] pub fn with_conf<K, KS>(keys: KS, conf: GOBuildConf<GS, SS, S>) -> Self
        where K: Hash + Sync, KS: KeySet<K> + Sync
    {
        Self::with_conf_stats(keys, conf, &mut ())
    }

    /// Builds the function for the keys in the given slice, with the
    /// configuration `conf`, reporting statistics to `stats`.
    #[inline] pub fn from_slice_with_conf_stats<K, BS>(keys: &[K], conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
        where K: Hash + Sync, BS: stats::BuildStatsCollector
    {
        Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, stats)
    }

    /// Builds the function for the keys in the given slice, with the configuration `conf`.
    #[inline] pub fn from_slice_with_conf<K>(keys: &[K], conf: GOBuildConf<GS, SS, S>) -> Self
        where K: Hash + Sync
    {
        Self::with_conf_stats(SliceSourceWithRefs::<_, u8>::new(keys), conf, &mut ())
    }

    /// Builds the function for the keys in the given mutable slice, with the
    /// configuration `conf`, reporting statistics to `stats`.
    #[inline] pub fn from_slice_mut_with_conf_stats<K, BS>(keys: &mut [K], conf: GOBuildConf<GS, SS, S>, stats: &mut BS) -> Self
        where K: Hash + Sync, BS: stats::BuildStatsCollector
    {
        Self::with_conf_stats(SliceMutSource::new(keys), conf, stats)
    }

    /// Builds the function for the keys in the given mutable slice, with the configuration `conf`.
    #[inline] pub fn from_slice_mut_with_conf<K>(keys: &mut [K], conf: GOBuildConf<GS, SS, S>) -> Self
        where K: Hash + Sync
    {
        Self::with_conf_stats(SliceMutSource::new(keys), conf, &mut ())
    }
}

impl<GS: GroupSize + Sync, SS: SeedSize> GOFunction<GS, SS> {
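    /// Reads a function (previously written by [`Self::write`]) from `input`,
    /// using the default hasher. Note that `write`/`read` round-trip only the
    /// arrays, level sizes and bit-size parameters; the hasher itself is never
    /// serialized.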
    pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
        Self::read_with_hasher(input, Default::default())
    }
}

impl GOFunction {
    /// Builds the function for the keys in the given slice, with the default
    /// configuration, reporting statistics to `stats`.
    pub fn from_slice_with_stats<K, BS>(keys: &[K], stats: &mut BS) -> Self
        where K: Hash + Sync, BS: stats::BuildStatsCollector
    {
        Self::from_slice_with_conf_stats(keys, Default::default(), stats)
    }

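    /// Builds the function for the keys in the given slice, with the default configuration.
    ///
    /// A minimal usage sketch (the assertion relies only on the minimal-perfect
    /// property: the three keys get the three distinct values 0, 1, 2):
    /// ```
    /// use ph::fmph::GOFunction;
    ///
    /// let keys = ["alpha", "beta", "gamma"];
    /// let f = GOFunction::from_slice(&keys);
    /// let mut values: Vec<_> = keys.iter().map(|k| f.get(k).unwrap()).collect();
    /// values.sort();
    /// assert_eq!(values, vec![0, 1, 2]);
    /// ```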
    pub fn from_slice<K: Hash + Sync>(keys: &[K]) -> Self {
        Self::from_slice_with_conf_stats(keys, Default::default(), &mut ())
    }

    /// Builds the function for the given `keys`, with the default configuration.
    pub fn new<K: Hash + Sync, KS: KeySet<K> + Sync>(keys: KS) -> Self {
        Self::with_conf_stats(keys, Default::default(), &mut ())
    }
}

impl<GS: GroupSize, SS: SeedSize, S> GOFunction<GS, SS, S> {
    /// Returns the number of keys in the input collection given during construction.
    pub fn len(&self) -> usize {
        self.array.content.count_bit_ones()
    }
}

impl<K: Hash + Sync> From<&[K]> for GOFunction {
    fn from(keys: &[K]) -> Self {
        Self::from_slice(keys)
    }
}

impl<K: Hash + Sync + Send> From<Vec<K>> for GOFunction {
    fn from(keys: Vec<K>) -> Self {
        Self::new(keys)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::{tests::test_mphf_u64, verify_phf};
    use crate::fmph::TwoToPowerBits;
    use std::fmt::{Debug, Display};
    use crate::seeds::Bits;

    fn test_read_write<GS: GroupSize + Sync, SS: SeedSize>(h: &GOFunction<GS, SS>)
        where SS::VecElement: std::cmp::PartialEq + Debug
    {
        let mut buff = Vec::new();
        h.write(&mut buff).unwrap();
        assert_eq!(buff.len(), h.write_bytes());
        let read = GOFunction::<GS, SS>::read(&mut &buff[..]).unwrap();
        assert_eq!(h.level_sizes, read.level_sizes);
        assert_eq!(h.array.content, read.array.content);
        assert_eq!(h.group_seeds, read.group_seeds);
    }

    fn test_hash2_invariants<GS: GroupSize, SS: SeedSize>(h: &GOFunction<GS, SS>) {
        let number_of_groups = h.level_sizes.iter().map(|v| *v as usize).sum::<usize>();
        assert_eq!(h.conf.bits_per_group * number_of_groups, h.array.content.len() * 64);
        assert_eq!(ceiling_div(number_of_groups * h.conf.bits_per_seed.into() as usize, 64), h.group_seeds.len());
    }

    fn test_with_input<K: Hash + Clone + Display + Sync>(to_hash: &[K], bits_per_group: impl GroupSize + Sync) {
        let goconf = GOConf::bps_bpg(Bits(3), bits_per_group);
        let h = GOFunction::from_slice_with_conf(to_hash, GOBuildConf::with_mt(goconf, false));
        test_mphf_u64(to_hash, |key| h.get(key));
        test_hash2_invariants(&h);
        test_read_write(&h);
        assert_eq!(h.len(), to_hash.len());
    }

    #[test]
    fn test_small_powers_of_two() {
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(5));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(4));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(3));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(2));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(1));
        test_with_input(&[1, 2, 5], TwoToPowerBits::new(0));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(5));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(4));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(3));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(2));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(1));
        test_with_input(&(-50..150).collect::<Vec<_>>(), TwoToPowerBits::new(0));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(5));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(4));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(3));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(2));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(1));
        test_with_input(&['a', 'b', 'c', 'd'], TwoToPowerBits::new(0));
    }

    #[test]
    fn test_small_bits() {
        test_with_input(&[1, 2, 5], Bits(3));
        test_with_input(&[1, 2, 5], Bits(5));
        test_with_input(&[1, 2, 5], Bits(20));
        test_with_input(&[1, 2, 5], Bits(60));
        test_with_input(&[1, 2, 5], Bits(63));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(3));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(5));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(20));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(60));
        test_with_input(&(-50..150).collect::<Vec<_>>(), Bits(63));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(3));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(5));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(20));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(60));
        test_with_input(&['a', 'b', 'c', 'd'], Bits(63));
    }

    #[test]
    fn test_medium() {
        let keys: Vec<_> = (-2000..2000).map(|v| 3*v).collect();
        test_with_input(&keys, TwoToPowerBits::new(5));
        test_with_input(&keys, TwoToPowerBits::new(4));
        test_with_input(&keys, TwoToPowerBits::new(3));
        test_with_input(&keys, TwoToPowerBits::new(2));
        test_with_input(&keys, TwoToPowerBits::new(1));
        test_with_input(&keys, TwoToPowerBits::new(0));
        test_with_input(&keys, Bits(3));
        test_with_input(&keys, Bits(5));
        test_with_input(&keys, Bits(10));
        test_with_input(&keys, Bits(13));
        test_with_input(&keys, Bits(20));
        test_with_input(&keys, Bits(27));
        test_with_input(&keys, Bits(33));
        test_with_input(&keys, Bits(45));
        test_with_input(&keys, Bits(50));
        test_with_input(&keys, Bits(55));
        test_with_input(&keys, Bits(60));
        test_with_input(&keys, Bits(63));
    }

    #[test]
    fn test_large_size() {
        let keys = (-20000..20000).collect::<Vec<_>>();
        // the factor 8.0/40000.0 converts bytes to bits per key:
        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_biggest().into()).size_bytes() as f64 * (8.0/40000.0) < 2.57);
        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_bigger().into()).size_bytes() as f64 * (8.0/40000.0) < 2.4);
        assert!(GOFunction::from(&keys[..]).size_bytes() as f64 * (8.0/40000.0) < 2.26);
        assert!(GOFunction::from_slice_with_conf(&keys[..], GOConf::default_smallest().into()).size_bytes() as f64 * (8.0/40000.0) < 2.15);
    }

    #[test]
    #[ignore = "uses much memory and time"]
    fn test_fmphgo_for_over_2to32_keys() {
        const LEN: u64 = 5_000_000_000;
        let f = GOFunction::with_conf_stats(
            crate::fmph::keyset::CachedKeySet::dynamic(|| 0..LEN, usize::MAX),
            GOConf::default_biggest().into(),
            &mut crate::stats::BuildStatsPrinter::stdout());
        verify_phf(LEN as usize, 0..LEN, |key| f.get(key).map(|v| v as usize));
        assert!(f.size_bytes() as f64 * (8.0/LEN as f64) < 2.57);
    }

    #[test]
    fn test_duplicates() {
        assert!(GOFunction::try_with_conf_stats(vec![1, 1], Default::default(), &mut ()).is_none());
        assert!(GOFunction::try_with_conf_stats(vec![1, 2, 3, 1, 4], Default::default(), &mut ()).is_none());
    }

    #[test]
    fn test_duplicates_partial() {
        let keys = vec![1, 2, 3, 1, 4];
        let expected_initial_len = keys.len();
        let r = GOFunction::try_with_conf_stats_or_partial(keys, Default::default(), &mut ());
        if let Err((mphf, mut remaining, initial_len)) = r {
            assert_eq!(initial_len, expected_initial_len);
            remaining.sort();
            assert_eq!(remaining, vec![1, 1]);
            test_mphf_u64(&[2, 3, 4], |key| mphf.get(key));
        } else {
            panic!("construction should have failed on duplicate keys")
        }
    }
}
903}