1use std::{hash::Hash, io, usize};
2
3use crate::{phast::Params, seeds::{Bits8, SeedSize}};
4use super::{bits_per_seed_to_100_bucket_size, builder::{build_mt, build_st}, conf::Conf, seed_chooser::{SeedChooser, SeedOnly}, CompressedArray, DefaultCompressedArray, WINDOW_SIZE};
5use binout::{Serializer, VByte};
6use bitm::BitAccess;
7use dyn_size_of::GetSize;
8use seedable_hash::{BuildDefaultSeededHasher, BuildSeededHasher};
9use voracious_radix_sort::RadixSort;
10use rayon::prelude::*;
11
12pub(crate) struct SeedEx<SSVecElement> {
14 pub(crate) seeds: Box<[SSVecElement]>,
15 pub(crate) conf: Conf,
16}
17
18impl<SSVecElement> SeedEx<SSVecElement> {
19 #[inline(always)]
20 pub(crate) fn bucket_for(&self, key: u64) -> usize { self.conf.bucket_for(key) }
21
22 #[inline(always)]
23 pub(crate) unsafe fn seed_for<SS>(&self, seed_size: SS, key: u64) -> u16 where SS: SeedSize<VecElement=SSVecElement> {
24 seed_size.get_seed(&self.seeds, self.bucket_for(key))
26 }
27
28 pub fn write<SS: SeedSize<VecElement = SSVecElement>>(&self, output: &mut dyn io::Write, seed_size: SS) -> io::Result<()>
30 {
31 self.conf.write(output)?;
32 seed_size.write_seed_vec(output, &self.seeds)
33 }
34
35 pub fn read<SS: SeedSize<VecElement = SSVecElement>>(input: &mut dyn io::Read) -> io::Result<(SS, Self)> {
37 let conf = Conf::read(input)?;
38 let (seed_size, seeds) = SS::read_seed_vec(input, conf.buckets_num)?;
39 Ok((seed_size, Self{ seeds, conf }))
40 }
41}
42
43impl<SSVecElement: GetSize> SeedEx<SSVecElement> {
44 pub fn write_bytes(&self) -> usize {
46 self.conf.write_bytes() + GetSize::size_bytes_dyn(&self.seeds)
47 }
48}
49
50impl<SSVecElement: GetSize> GetSize for SeedEx<SSVecElement> {
51 fn size_bytes_dyn(&self) -> usize { self.seeds.size_bytes_dyn() }
52 fn size_bytes_content_dyn(&self) -> usize { self.seeds.size_bytes_content_dyn() }
53 const USES_DYN_MEM: bool = true;
54}
55
56
57pub(crate) struct Level<SSVecElement> {
58 pub(crate) seeds: SeedEx<SSVecElement>,
59 pub(crate) shift: usize
60}
61
62impl<SSVecElement: GetSize> GetSize for Level<SSVecElement> {
63 fn size_bytes_dyn(&self) -> usize { self.seeds.size_bytes_dyn() }
64 fn size_bytes_content_dyn(&self) -> usize { self.seeds.size_bytes_content_dyn() }
65 const USES_DYN_MEM: bool = true;
66}
67
68impl<SSVecElement: GetSize> Level<SSVecElement> {
69 pub fn write_bytes(&self) -> usize {
70 VByte::size(self.shift) + self.seeds.write_bytes()
71 }
72}
73
74impl<SSVecElement> Level<SSVecElement> {
75 pub fn write<SS: SeedSize<VecElement = SSVecElement>>(&self, output: &mut dyn io::Write, seed_size: SS) -> io::Result<()>
77 {
78 VByte::write(output, self.shift)?;
79 self.seeds.write(output, seed_size)
80 }
81
82 pub fn read<SS: SeedSize<VecElement = SSVecElement>>(input: &mut dyn io::Read) -> io::Result<(SS, Self)> {
84 let shift = VByte::read(input)?;
85 SeedEx::<SSVecElement>::read::<SS>(input).map(|(ss, seeds)| (ss, Self{ seeds, shift }))
86 }
87}
88
89#[inline]
90pub(crate) fn build_level_from_slice_st<K, SS, SC, S>(keys: &[K], params: &Params<SS>, hasher: &S, seed_chooser: SC, level_nr: u64)
91 -> (Vec<K>, SeedEx<SS::VecElement>, Box<[u64]>, usize)
92 where K: Hash+Clone, SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher
93{
94 let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect();
95 hashes.voracious_sort();
97 let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
98 let (seeds, builder) =
99 build_st(&hashes, conf, params.seed_size, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser);
100 let (unassigned_values, unassigned_len) = builder.unassigned_values(&seeds);
101 drop(builder);
102 let mut keys_vec = Vec::with_capacity(unassigned_len);
103 keys_vec.extend(keys.into_iter().filter(|key| {
104 unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
105 }).cloned());
106 (keys_vec, SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
107}
108
109#[inline]
110pub(crate) fn build_level_from_slice_mt<K, SS, SC, S>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: &S, seed_chooser: SC, level_nr: u64)
111 -> (Vec<K>, SeedEx<SS::VecElement>, Box<[u64]>, usize)
112 where K: Hash+Sync+Send+Clone, SS: SeedSize, SC: SeedChooser+Sync, S: BuildSeededHasher+Sync
113{
114 let mut hashes: Box<[_]> = if keys.len() > 4*2048 { keys.par_iter().with_min_len(256).map(|k| hasher.hash_one(k, level_nr)).collect()
119 } else {
120 keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect()
121 };
122 hashes.voracious_mt_sort(threads_num);
124 let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
125 let (seeds, builder) =
126 build_mt(&hashes, conf, params.seed_size, WINDOW_SIZE, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser, threads_num);
127 let (unassigned_values, unassigned_len) = builder.unassigned_values(&seeds);
128 drop(builder);
129 let mut keys_vec = Vec::with_capacity(unassigned_len);
130 keys_vec.par_extend(keys.into_par_iter().filter(|key| {
131 unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
132 }).cloned());
133 (keys_vec, SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
134}
135
136#[inline(always)]
137pub(crate) fn build_level_st<K, SS, SC, S>(keys: &mut Vec::<K>, params: &Params<SS>, hasher: &S, seed_chooser: SC, level_nr: u64)
138 -> (SeedEx<SS::VecElement>, Box<[u64]>, usize)
139 where K: Hash, SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher
140{
141 let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect();
142 hashes.voracious_sort();
143 let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
144 let (seeds, builder) =
145 build_st(&hashes, conf, params.seed_size, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser);
146 let (unassigned_values, unassigned_len) = builder.unassigned_values(&seeds);
147 drop(builder);
148 keys.retain(|key| {
149 unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
150 });
151 (SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
152}
153
154#[inline]
155pub(crate) fn build_level_mt<K, SS, SC, S>(keys: &mut Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: &S, seed_chooser: SC, level_nr: u64)
156 -> (SeedEx<SS::VecElement>, Box<[u64]>, usize)
157 where K: Hash+Sync+Send, SS: SeedSize, SC: SeedChooser+Sync, S: BuildSeededHasher+Sync
158{
159 let mut hashes: Box<[_]> = if keys.len() > 4*2048 { keys.par_iter().with_min_len(256).map(|k| hasher.hash_one(k, level_nr)).collect()
164 } else {
165 keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect()
166 };
167 hashes.voracious_mt_sort(threads_num);
169 let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
170 let (seeds, builder) =
171 build_mt(&hashes, conf, params.seed_size, WINDOW_SIZE, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser, threads_num);
172 let (unassigned_values, unassigned_len) = builder.unassigned_values(&seeds);
173 drop(builder);
174 let mut result = Vec::with_capacity(unassigned_len);
175 std::mem::swap(keys, &mut result);
176 keys.par_extend(result.into_par_iter().filter(|key| {
177 unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
178 }));
179 (SeedEx::<SS::VecElement>{ seeds, conf }, unassigned_values, unassigned_len)
180}
181
182pub struct Function<SS, SC = SeedOnly, CA = DefaultCompressedArray, S = BuildDefaultSeededHasher>
196 where SS: SeedSize
197{
198 level0: SeedEx<SS::VecElement>,
199 unassigned: CA,
200 levels: Box<[Level<SS::VecElement>]>,
201 hasher: S,
202 seed_chooser: SC,
203 seed_size: SS, }
205
206impl<SC, SS: SeedSize, CA, S> GetSize for Function<SS, SC, CA, S> where CA: GetSize {
207 fn size_bytes_dyn(&self) -> usize {
208 self.level0.size_bytes_dyn() +
209 self.unassigned.size_bytes_dyn() +
210 self.levels.size_bytes_dyn()
211 }
212 fn size_bytes_content_dyn(&self) -> usize {
213 self.level0.size_bytes_content_dyn() +
214 self.unassigned.size_bytes_content_dyn() +
215 self.levels.size_bytes_content_dyn()
216 }
217 const USES_DYN_MEM: bool = true;
218}
219
220impl<SS: SeedSize, SC: SeedChooser, CA: CompressedArray, S: BuildSeededHasher> Function<SS, SC, CA, S> {
221
222 #[inline(always)] pub fn get<K>(&self, key: &K) -> usize where K: Hash + ?Sized {
229 let key_hash = self.hasher.hash_one(key, 0);
234 let seed = unsafe{ self.level0.seed_for(self.seed_size, key_hash) };
235 if seed != 0 { return self.seed_chooser.f(key_hash, seed, &self.level0.conf); }
236
237 for level_nr in 0..self.levels.len() {
238 let l = &self.levels[level_nr];
239 let key_hash = self.hasher.hash_one(key, level_nr as u64 + 1);
240 let seed = unsafe { l.seeds.seed_for(self.seed_size, key_hash) };
241 if seed != 0 {
242 return self.unassigned.get(self.seed_chooser.f(key_hash, seed, &l.seeds.conf) + l.shift)
243 }
244 }
245 usize::MAX
246 }
247
248 pub fn with_vec_p_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash {
254 let number_of_keys = keys.len();
255 Self::_new(|h| {
256 let (level0, unassigned_values, unassigned_len) =
257 build_level_st(&mut keys, params, h, seed_chooser, 0);
258 (keys, level0, unassigned_values, unassigned_len)
259 }, |keys, level_nr, h| {
260 build_level_st(keys, params, h, seed_chooser, level_nr)
261 }, hasher, seed_chooser, params.seed_size, number_of_keys)
262 }
263
264 pub fn with_vec_p_threads_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
270 where K: Hash+Sync+Send, S: Sync, SC: Sync
271 {
272 if threads_num == 1 { return Self::with_vec_p_hash_sc(keys, params, hasher, seed_chooser); }
273 let number_of_keys = keys.len();
274 Self::_new(|h| {
275 let (level0, unassigned_values, unassigned_len) =
276 build_level_mt(&mut keys, params, threads_num, h, seed_chooser, 0);
277 (keys, level0, unassigned_values, unassigned_len)
278 }, |keys, level_nr, h| {
279 build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
280 }, hasher, seed_chooser, params.seed_size, number_of_keys)
281 }
282
283
284 pub fn with_slice_p_hash_sc<K>(keys: &[K], params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash+Clone {
290 Self::_new(|h| {
291 build_level_from_slice_st(keys, params, h, seed_chooser, 0)
292 }, |keys, level_nr, h| {
293 build_level_st(keys, params, h, seed_chooser, level_nr)
294 }, hasher, seed_chooser, params.seed_size, keys.len())
295 }
296
297
298 pub fn with_slice_p_threads_hash_sc<K>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
304 where K: Hash+Sync+Send+Clone, S: Sync, SC: Sync {
305 if threads_num == 1 { return Self::with_slice_p_hash_sc(keys, params, hasher, seed_chooser); }
306 Self::_new(|h| {
307 build_level_from_slice_mt(keys, params, threads_num, h, seed_chooser, 0)
308 }, |keys, level_nr, h| {
309 build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
310 }, hasher, seed_chooser, params.seed_size, keys.len())
311 }
312
313 #[inline]
314 fn _new<K, BF, BL>(build_first: BF, build_level: BL, hasher: S, seed_chooser: SC, seed_size: SS, number_of_keys: usize) -> Self
315 where BF: FnOnce(&S) -> (Vec::<K>, SeedEx<SS::VecElement>, Box<[u64]>, usize),
316 BL: Fn(&mut Vec::<K>, u64, &S) -> (SeedEx<SS::VecElement>, Box<[u64]>, usize),
317 K: Hash
318 {
319 let (mut keys, level0, unassigned_values, unassigned_len) = build_first(&hasher);
320 debug_assert_eq!(unassigned_len, unassigned_values.bit_ones().count());
321 let mut level0_unassigned = unassigned_values.bit_ones();
324 let mut unassigned = Vec::with_capacity(unassigned_len * 3 / 2);
325
326 let mut levels = Vec::with_capacity(16);
327 let mut last = 0;
328 while !keys.is_empty() {
329 let keys_len = keys.len();
330
331 let (seeds, unassigned_values, _unassigned_len) =
337 build_level(&mut keys, levels.len() as u64+1, &hasher);
338 let shift = unassigned.len();
339 for i in 0..keys_len {
340 if CA::MAX_FOR_UNUSED {
341 if !unsafe{unassigned_values.get_bit_unchecked(i)} {
342 last = level0_unassigned.next().unwrap();
343 unassigned.push(last);
344 } else {
345 unassigned.push(usize::MAX);
346 }
347 } else {
348 if !unsafe{unassigned_values.get_bit_unchecked(i)} {
349 last = level0_unassigned.next().unwrap();
350 }
351 unassigned.push(last);
352 }
353 }
354 levels.push(Level { seeds, shift });
355 }
356 debug_assert!(level0_unassigned.next().is_none());
357 drop(level0_unassigned);
358 Self {
359 level0,
360 unassigned: CA::new(unassigned, last, number_of_keys),
361 levels: levels.into_boxed_slice(),
362 hasher,
363 seed_chooser,
364 seed_size
365 }
366 }
367
368 #[inline(always)] pub fn minimal_output_range(&self, num_of_keys: usize) -> usize { self.seed_chooser.minimal_output_range(num_of_keys) }
371
372 pub fn output_range(&self) -> usize {
374 self.level0.conf.output_range(self.seed_chooser, self.seed_size.into())
375 }
376
377 pub fn write_bytes(&self) -> usize {
452 self.level0.write_bytes() +
453 self.unassigned.write_bytes() +
454 VByte::size(self.levels.len()) +
455 self.levels.iter().map(|l| l.size_bytes()).sum::<usize>()
456 }
457
458 pub fn write(&self, output: &mut dyn io::Write) -> io::Result<()>
460 {
461 self.level0.write(output, self.seed_size)?;
462 self.unassigned.write(output)?;
463 VByte::write(output, self.levels.len())?;
464 for level in &self.levels {
465 level.write(output, self.seed_size)?;
466 }
467 Ok(())
468 }
469
470 pub fn read_with_hasher_sc(input: &mut dyn io::Read, hasher: S, seed_chooser: SC) -> io::Result<Self> {
472 let (seed_size, level0) = SeedEx::read(input)?;
473 let unassigned = CA::read(input)?;
474 let levels_num: usize = VByte::read(input)?;
475 let mut levels = Vec::with_capacity(levels_num);
476 for _ in 0..levels_num {
477 levels.push(Level::read::<SS>(input)?.1);
478 }
479 Ok(Self { level0, unassigned, levels: levels.into_boxed_slice(), hasher, seed_chooser, seed_size })
480 }
481}
482
483impl<SS: SeedSize> Function<SS, SeedOnly, DefaultCompressedArray, BuildDefaultSeededHasher> {
484
485 pub fn read(input: &mut dyn io::Read) -> io::Result<Self> {
487 Self::read_with_hasher_sc(input, BuildDefaultSeededHasher::default(), SeedOnly)
488 }
489}
490
491impl Function<Bits8, SeedOnly, DefaultCompressedArray, BuildDefaultSeededHasher> {
492 pub fn from_vec_st<K>(keys: Vec::<K>) -> Self where K: Hash {
496 Self::with_vec_p_hash_sc(keys, &Params::new(Bits8::default(), bits_per_seed_to_100_bucket_size(8)),
497 BuildDefaultSeededHasher::default(), SeedOnly)
498 }
499
500 pub fn from_vec_mt<K>(keys: Vec::<K>) -> Self where K: Hash+Send+Sync {
504 Self::with_vec_p_threads_hash_sc(keys, &Params::new(Bits8::default(), bits_per_seed_to_100_bucket_size(8)),
505 std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnly)
506 }
507
508 pub fn from_slice_st<K>(keys: &[K]) -> Self where K: Hash+Clone {
512 Self::with_slice_p_hash_sc(keys, &Params::new(Bits8::default(), bits_per_seed_to_100_bucket_size(8)),
513 BuildDefaultSeededHasher::default(), SeedOnly)
514 }
515
516 pub fn from_slice_mt<K>(keys: &[K]) -> Self where K: Hash+Clone+Send+Sync {
520 Self::with_slice_p_threads_hash_sc(keys, &Params::new(Bits8::default(), bits_per_seed_to_100_bucket_size(8)),
521 std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnly)
522 }
523
524}
525
526#[cfg(test)]
527pub(crate) mod tests {
528 use super::*;
529 use crate::utils::tests::test_mphf;
530
531 fn test_read_write<SS: SeedSize>(h: &Function::<SS>) where SS::VecElement: std::fmt::Debug+PartialEq {
532 let mut buff = Vec::new();
533 h.write(&mut buff).unwrap();
534 let read = Function::<SS>::read(&mut &buff[..]).unwrap();
536 assert_eq!(h.level0.conf, read.level0.conf);
537 assert_eq!(h.levels.len(), read.levels.len());
538 for (hl, rl) in h.levels.iter().zip(&read.levels) {
539 assert_eq!(hl.shift, rl.shift);
540 assert_eq!(hl.seeds.conf, rl.seeds.conf);
541 assert_eq!(hl.seeds.seeds, rl.seeds.seeds);
542 }
543 }
544
545 #[test]
546 fn test_small() {
547 let input = [1, 2, 3, 4, 5];
548 let f = Function::from_slice_st(&input);
549 test_mphf(&input, |key| Some(f.get(key)));
550 test_read_write(&f);
551 }
552}