1use std::{hash::Hash, io, usize};
2
3use crate::{phast::{function::{build_level_from_slice_mt, build_level_from_slice_st, build_level_mt, build_level_st, Level, SeedEx}, seed_chooser::SeedOnlyNoBump, Params, ShiftOnlyWrapped}, seeds::{Bits8, SeedSize}};
4use super::{bits_per_seed_to_100_bucket_size, builder::build_last_level, conf::Conf, seed_chooser::{SeedChooser, SeedOnly}, CompressedArray, DefaultCompressedArray};
5use binout::{Serializer as _, VByte};
6use bitm::BitAccess;
7use dyn_size_of::GetSize;
8use seedable_hash::{BuildDefaultSeededHasher, BuildSeededHasher};
9use voracious_radix_sort::RadixSort;
10
/// A multi-level PHast perfect hash function.
///
/// Keys are first tried against level 0; keys bumped from level 0 are handled
/// by a sequence of fallback `levels` and finally by a dedicated `last_level`
/// that uses 8-bit seeds and a bump-free seed chooser. Positions produced by
/// the fallback levels are mapped back into free slots of the level-0 output
/// range through the compressed array `unassigned` (see `get` and `_new`).
pub struct Function2<SS, SC = ShiftOnlyWrapped, CA = DefaultCompressedArray, S = BuildDefaultSeededHasher>
    where SS: SeedSize
{
    /// Seeds and configuration of level 0.
    level0: SeedEx<SS::VecElement>,
    /// Compressed mapping from (shifted) positions of the fallback/last levels
    /// to free slots of the level-0 output range.
    unassigned: CA,
    /// Fallback levels; each stores its seeds and its `shift` into `unassigned`.
    levels: Box<[Level<SS::VecElement>]>,
    /// Seeded hasher used by all levels.
    hasher: S,
    /// Final level, always built with 8-bit seeds.
    last_level: Level<<Bits8 as SeedSize>::VecElement>,
    /// Hash seed used for the last level (found by retrying in `build_last_level`).
    last_level_seed: u64,
    /// Strategy that maps (hash, seed, conf) to a position; also fixes level sizes.
    seed_chooser: SC,
    /// Number of bits per seed used by level 0 and the fallback levels.
    seed_size: SS,
}
40
41impl<SC, SS: SeedSize, CA, S> GetSize for Function2<SS, SC, CA, S> where CA: GetSize {
42 fn size_bytes_dyn(&self) -> usize {
43 self.level0.size_bytes_dyn() +
44 self.unassigned.size_bytes_dyn() +
45 self.levels.size_bytes_dyn() +
46 self.last_level.size_bytes_dyn()
47 }
48 fn size_bytes_content_dyn(&self) -> usize {
49 self.level0.size_bytes_content_dyn() +
50 self.unassigned.size_bytes_content_dyn() +
51 self.levels.size_bytes_content_dyn() +
52 self.last_level.size_bytes_content_dyn()
53 }
54 const USES_DYN_MEM: bool = true;
55}
56
57impl<SS: SeedSize, SC: SeedChooser, CA: CompressedArray, S: BuildSeededHasher> Function2<SS, SC, CA, S> {
58
    /// Returns the value assigned to the given `key`.
    ///
    /// The `key` should come from the set used to build `self`;
    /// for other keys an arbitrary in-range value is returned.
    #[inline(always)] pub fn get<K>(&self, key: &K) -> usize where K: Hash + ?Sized {
        // Level 0: a non-zero seed means the key is resolved at this level,
        // directly yielding a position in the level-0 output range.
        let key_hash = self.hasher.hash_one(key, 0);
        let seed = unsafe { self.level0.seed_for(self.seed_size, key_hash) };
        if seed != 0 { return self.seed_chooser.f(key_hash, seed, &self.level0.conf); }

        // The key was bumped from level 0; try the fallback levels in order.
        // Level `level_nr` hashes with seed `level_nr + 1`, matching `_new`.
        // The level-local position, offset by the level's `shift`, is mapped
        // through `unassigned` back to a free slot of the level-0 range.
        for level_nr in 0..self.levels.len() {
            let l = &self.levels[level_nr];
            let key_hash = self.hasher.hash_one(key, level_nr as u64 + 1);
            let seed = unsafe { l.seeds.seed_for(self.seed_size, key_hash) };
            if seed != 0 {
                return self.unassigned.get(self.seed_chooser.f(key_hash, seed, &l.seeds.conf) + l.shift)
            }
        }

        // Any remaining key is handled by the last level, which uses its own
        // hash seed and the bump-free SeedOnlyNoBump strategy with 8-bit seeds.
        let key_hash = self.hasher.hash_one(key, self.last_level_seed);
        let seed = unsafe { self.last_level.seeds.seed_for(Bits8, key_hash) };
        return self.unassigned.get(SeedOnlyNoBump.f(key_hash, seed, &self.last_level.seeds.conf) + self.last_level.shift)
    }
82
    /// Builds the function for the given `keys` (consumed), using the given
    /// parameters, hasher and seed chooser. Single-threaded.
    pub fn with_vec_p_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash {
        let number_of_keys = keys.len();
        Self::_new(|h| {
            // Build level 0 in place; `keys` afterwards holds the keys that
            // still need to be placed by subsequent levels.
            let (level0, unassigned_values, unassigned_len) =
                build_level_st(&mut keys, params, h, seed_chooser, 0);
            (keys, level0, unassigned_values, unassigned_len)
        }, |keys, level_nr, h| {
            // Build each further level the same way, single-threaded.
            build_level_st(keys, params, h, seed_chooser, level_nr)
        }, hasher, seed_chooser, params.seed_size, number_of_keys)
    }
98
    /// Builds the function for the given `keys` (consumed), using the given
    /// parameters, hasher and seed chooser, with up to `threads_num` threads.
    ///
    /// Falls back to the single-threaded builder when `threads_num == 1`.
    pub fn with_vec_p_threads_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
        where K: Hash+Sync+Send, S: Sync, SC: Sync {
        if threads_num == 1 { return Self::with_vec_p_hash_sc(keys, params, hasher, seed_chooser); }
        let number_of_keys = keys.len();
        Self::_new(|h| {
            // Build level 0 in place (multi-threaded); `keys` afterwards holds
            // the keys that still need to be placed by subsequent levels.
            let (level0, unassigned_values, unassigned_len) =
                build_level_mt(&mut keys, params, threads_num, h, seed_chooser, 0);
            (keys, level0, unassigned_values, unassigned_len)
        }, |keys, level_nr, h| {
            build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
        }, hasher, seed_chooser, params.seed_size, number_of_keys)
    }
116
117
    /// Builds the function for the keys in the given slice, using the given
    /// parameters, hasher and seed chooser. Single-threaded.
    ///
    /// Level 0 is built directly from the slice; the vector it returns
    /// (the keys remaining after level 0) feeds the subsequent levels.
    pub fn with_slice_p_hash_sc<K>(keys: &[K], params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash+Clone {
        Self::_new(|h| {
            build_level_from_slice_st(keys, params, h, seed_chooser, 0)
        }, |keys, level_nr, h| {
            build_level_st(keys, params, h, seed_chooser, level_nr)
        }, hasher, seed_chooser, params.seed_size, keys.len())
    }
130
131
    /// Builds the function for the keys in the given slice, using the given
    /// parameters, hasher and seed chooser, with up to `threads_num` threads.
    ///
    /// Falls back to the single-threaded builder when `threads_num == 1`.
    pub fn with_slice_p_threads_hash_sc<K>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
        where K: Hash+Sync+Send+Clone, S: Sync, SC: Sync {
        if threads_num == 1 { return Self::with_slice_p_hash_sc(keys, params, hasher, seed_chooser); }
        Self::_new(|h| {
            build_level_from_slice_mt(keys, params, threads_num, h, seed_chooser, 0)
        }, |keys, level_nr, h| {
            build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
        }, hasher, seed_chooser, params.seed_size, keys.len())
    }
146
    /// Common construction back-end for all public builders.
    ///
    /// `build_first` builds level 0 and returns the keys it bumped;
    /// `build_level` builds one further level in place, given the key vector,
    /// the level number (used as the hash seed) and the hasher.
    ///
    /// Free slots of the level-0 output range (the zero bits of its
    /// `unassigned_values` bitmap) are handed out, in increasing order, to the
    /// positions used by the further levels; this mapping is accumulated in
    /// `unassigned` and finally compressed into a `CA`.
    #[inline]
    fn _new<K, BF, BL>(build_first: BF, build_level: BL, hasher: S, seed_chooser: SC, seed_size: SS, number_of_keys: usize) -> Self
        where BF: FnOnce(&S) -> (Vec::<K>, SeedEx<SS::VecElement>, Box<[u64]>, usize),
            BL: Fn(&mut Vec::<K>, u64, &S) -> (SeedEx<SS::VecElement>, Box<[u64]>, usize),
            K: Hash
    {
        let (mut keys, level0, unassigned_values, unassigned_len) = build_first(&hasher);
        debug_assert_eq!(unassigned_len, unassigned_values.bit_ones().count());
        // Iterator over the free (unassigned) slots of the level-0 output range,
        // consumed lazily as later levels claim them.
        let mut level0_unassigned = unassigned_values.bit_ones();
        let mut unassigned = Vec::with_capacity(unassigned_len * 3 / 2);

        let mut levels = Vec::with_capacity(16);
        let mut last = 0;
        // Keep building fallback levels until few enough keys remain for the last level.
        while keys.len() > SC::FUNCTION2_THRESHOLD {
            let keys_len = keys.len();
            // Hash seed is level_nr+1, matching the lookup in `get`.
            let (seeds, unassigned_values, _unassigned_len) =
                build_level(&mut keys, levels.len() as u64+1, &hasher);
            let shift = unassigned.len();
            for i in 0..keys_len {
                if CA::MAX_FOR_UNUSED {
                    // CA wants unused entries marked with usize::MAX.
                    if !unsafe{unassigned_values.get_bit_unchecked(i)} {
                        last = level0_unassigned.next().unwrap();
                        unassigned.push(last);
                    } else {
                        unassigned.push(usize::MAX);
                    }
                } else {
                    // CA wants a non-decreasing sequence: repeat the last
                    // assigned value for unused entries.
                    if !unsafe{unassigned_values.get_bit_unchecked(i)} {
                        last = level0_unassigned.next().unwrap();
                    }
                    unassigned.push(last);
                }
            }
            levels.push(Level { seeds, shift });
        }
        // The last level starts with hash seed levels.len()+1; `build_last_level`
        // may advance it further until construction succeeds.
        let mut last_seed = levels.len() as u64+1;
        let last_shift;
        let last_seeds =
            if keys.is_empty() {
                // No keys left: store an empty last level.
                last_shift = 0;
                SeedEx::<<Bits8 as SeedSize>::VecElement>{ seeds: Box::default(), conf: Conf { buckets_num: 0, slice_len_minus_one: 0, num_of_slices: 0 } }
            } else {
                let (last_seeds, unassigned_values, _unassigned_len) =
                    Self::build_last_level(keys, &hasher, &mut last_seed);
                last_shift = unassigned.len();
                // Same slot-assignment scheme as above, but over the whole
                // output range of the last level.
                for i in 0..last_seeds.conf.output_range(SeedOnlyNoBump, Bits8.into()) {
                    if CA::MAX_FOR_UNUSED {
                        if !unsafe{unassigned_values.get_bit_unchecked(i)} {
                            last = level0_unassigned.next().unwrap();
                            unassigned.push(last);
                        } else {
                            unassigned.push(usize::MAX);
                        }
                    } else {
                        if !unsafe{unassigned_values.get_bit_unchecked(i)} {
                            last = level0_unassigned.next().unwrap();
                        }
                        unassigned.push(last);
                    }
                }
                last_seeds
            };
        // Every free level-0 slot must have been claimed by now.
        debug_assert!(level0_unassigned.next().is_none()); drop(level0_unassigned);
        Self {
            level0,
            unassigned: CA::new(unassigned, last, number_of_keys),
            levels: levels.into_boxed_slice(),
            hasher,
            seed_chooser,
            last_level: Level { seeds: last_seeds, shift: last_shift },
            last_level_seed: last_seed,
            seed_size,
        }
    }
232
    /// Builds the last level over all remaining `keys`, with 8-bit seeds and
    /// the bump-free `SeedOnly` strategy.
    ///
    /// Starting from `*seed`, hash seeds are tried one after another until
    /// the level can be built; the successful seed is left in `*seed`.
    #[inline(always)]
    fn build_last_level<K>(keys: Vec::<K>, hasher: &S, seed: &mut u64)
        -> (SeedEx<<Bits8 as SeedSize>::VecElement>, Box<[u64]>, usize)
        where K: Hash
    {
        let bits_per_seed = Bits8;
        // Target roughly 120% of the key count as output range;
        // len100 is in hundredths, the +10 guards very small inputs,
        // and the +50 below rounds to the nearest integer.
        let len100 = (keys.len()+10)*120;
        let conf = SeedOnly.conf_for_minimal((len100+50)/100,
            bits_per_seed.into(), 400, 0);
        let evaluator = SeedOnly.bucket_evaluator(bits_per_seed.into(), conf.slice_len());
        loop {
            // Hash all keys with the current seed and sort the hashes.
            let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, *seed)).collect();
            hashes.voracious_sort();
            if let Some((seeds, unassigned_values, unassigned_len)) =
                build_last_level(&hashes, conf, bits_per_seed, evaluator.clone())
            {
                return (SeedEx{ seeds, conf }, unassigned_values, unassigned_len);
            }
            // Construction failed with this hash seed; retry with the next one.
            *seed += 1;
        }
    }
255
256 #[inline(always)] pub fn minimal_output_range(&self, num_of_keys: usize) -> usize { self.seed_chooser.minimal_output_range(num_of_keys) }
259
260 pub fn output_range(&self) -> usize {
262 self.level0.conf.output_range(self.seed_chooser, self.seed_size.into())
263 }
264
265 pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
340 {
341 self.level0.write(output, self.seed_size)?;
342 self.unassigned.write(output)?;
343 VByte::write(output, self.levels.len())?;
344 for level in &self.levels {
345 level.write(output, self.seed_size)?;
346 }
347 VByte::write(output, self.last_level_seed)?;
348 self.last_level.write(output, Bits8)?;
349 Ok(())
350 }
351
352 pub fn write_bytes(&self) -> usize {
354 self.level0.write_bytes() +
355 self.unassigned.write_bytes() +
356 VByte::size(self.levels.len()) +
357 self.levels.iter().map(|l| l.size_bytes()).sum::<usize>() +
358 VByte::size(self.last_level_seed) +
359 self.last_level.write_bytes()
360 }
361
362 pub fn read_with_hasher_sc(input: &mut dyn io::Read, hasher: S, seed_chooser: SC) -> io::Result<Self> {
364 let (seed_size, level0) = SeedEx::read(input)?;
365 let unassigned = CA::read(input)?;
366 let levels_num: usize = VByte::read(input)?;
367 let mut levels = Vec::with_capacity(levels_num);
368 for _ in 0..levels_num {
369 levels.push(Level::read::<SS>(input)?.1);
370 }
371 let last_level_seed = VByte::read(input)?;
372 let last_level = Level::read::<Bits8>(input)?.1;
373 Ok(Self { level0, unassigned, levels: levels.into_boxed_slice(), hasher, seed_chooser, seed_size, last_level, last_level_seed })
374 }
375}
376
377impl<SS: SeedSize> Function2<SS, ShiftOnlyWrapped, DefaultCompressedArray, BuildDefaultSeededHasher> {
378
379 pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
381 Self::read_with_hasher_sc(input, BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
382 }
383}
384
385impl Function2<Bits8, ShiftOnlyWrapped, DefaultCompressedArray, BuildDefaultSeededHasher> {
386 pub fn from_vec_st<K>(keys: Vec::<K>) -> Self where K: Hash {
390 Self::with_vec_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
391 BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
392 }
393
394 pub fn from_vec_mt<K>(keys: Vec::<K>) -> Self where K: Hash+Send+Sync {
398 Self::with_vec_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
399 std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
400 }
401
402 pub fn from_slice_st<K>(keys: &[K]) -> Self where K: Hash+Clone {
406 Self::with_slice_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
407 BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
408 }
409
410 pub fn from_slice_mt<K>(keys: &[K]) -> Self where K: Hash+Clone+Send+Sync {
414 Self::with_slice_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
415 std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), ShiftOnlyWrapped)
416 }
417}
418
#[cfg(test)]
pub(crate) mod tests {
    use super::*;
    use crate::utils::tests::test_mphf;

    /// Serializes `h`, reads it back, and checks that both copies agree
    /// on the level-0 configuration and on every fallback level.
    fn test_read_write<SS: SeedSize>(h: &Function2::<SS>) where SS::VecElement: std::fmt::Debug+PartialEq {
        let mut serialized = Vec::new();
        h.write(&mut serialized).unwrap();
        let restored = Function2::<SS>::read(&mut &serialized[..]).unwrap();
        assert_eq!(h.level0.conf, restored.level0.conf);
        assert_eq!(h.levels.len(), restored.levels.len());
        for (expected, actual) in h.levels.iter().zip(&restored.levels) {
            assert_eq!(expected.shift, actual.shift);
            assert_eq!(expected.seeds.conf, actual.seeds.conf);
            assert_eq!(expected.seeds.seeds, actual.seeds.seeds);
        }
    }

    #[test]
    fn test_small() {
        let keys = [1, 2, 3, 4, 5];
        let f = Function2::from_slice_st(&keys);
        test_mphf(&keys, |key| Some(f.get(key)));
        test_read_write(&f);
    }
}
445}