1use dyn_size_of::GetSize;
2use seedable_hash::{BuildDefaultSeededHasher, BuildSeededHasher};
3use voracious_radix_sort::RadixSort;
4use std::hash::Hash;
5use rayon::prelude::*;
6
7use crate::{phast::{bits_per_seed_to_100_bucket_size, builder::{build_mt, build_st}, function::{Level, SeedEx}, Params, SeedChooser, SeedOnly, SeedOnlyK, WINDOW_SIZE}, seeds::{Bits8, SeedSize}};
8
/// Perfect hash function built as a sequence of seed levels.
///
/// Type parameters: `SS` — seed size (bits per seed), `SC` — seed chooser
/// strategy (defaults to [`SeedOnly`]), `S` — seeded hasher factory.
pub struct Perfect<SS: SeedSize, SC = SeedOnly, S = BuildDefaultSeededHasher>
{
    // Seeds and configuration of the first level (hashed with seed 0 in `get`).
    level0: SeedEx<SS::VecElement>,
    // Subsequent levels; each handles keys left unassigned (seed 0) by earlier levels.
    levels: Box<[Level<SS::VecElement>]>,
    // Hasher factory; the level number is used as the hash seed.
    hasher: S,
    // Strategy mapping (key hash, seed, level conf) to an output index.
    seed_chooser: SC,
    // Bits used per seed entry; passed to every seed read/write.
    seed_size: SS
}
26
27impl<SC, SS: SeedSize, S> GetSize for Perfect<SS, SC, S> {
28 fn size_bytes_dyn(&self) -> usize {
29 self.level0.size_bytes_dyn() + self.levels.size_bytes_dyn()
30 }
31 fn size_bytes_content_dyn(&self) -> usize {
32 self.level0.size_bytes_content_dyn() + self.levels.size_bytes_content_dyn()
33 }
34 const USES_DYN_MEM: bool = true;
35}
36
impl<SS: SeedSize, SC: SeedChooser, S: BuildSeededHasher> Perfect<SS, SC, S> {

    /// Returns the value assigned to `key`.
    ///
    /// `key` must belong to the set used to build `self`; for a foreign key
    /// the result is an arbitrary index or a panic (the final `unreachable!`).
    #[inline(always)] pub fn get<K>(&self, key: &K) -> usize where K: Hash + ?Sized {
        // Level 0 hashes with seed 0; level `i` of `levels` hashes with `i + 1`.
        let key_hash = self.hasher.hash_one(key, 0);
        // NOTE(review): `seed_for` is unsafe — presumably it requires `key_hash`
        // to map into the level's seed array, which this level's `conf` ensures;
        // confirm against `SeedEx::seed_for`'s safety contract.
        let seed = unsafe { self.level0.seed_for(self.seed_size, key_hash) };
        // Seed 0 marks "unassigned at this level": fall through to later levels.
        if seed != 0 { return self.seed_chooser.f(key_hash, seed, &self.level0.conf); }

        for level_nr in 0..self.levels.len() {
            let l = &self.levels[level_nr];
            let key_hash = self.hasher.hash_one(key, level_nr as u64 + 1);
            let seed = unsafe { l.seeds.seed_for(self.seed_size, key_hash) };
            if seed != 0 {
                // `shift` offsets this level's output past all earlier levels' ranges.
                return self.seed_chooser.f(key_hash, seed, &l.seeds.conf) + l.shift
            }
        }

        unreachable!("phast::Perfect::get called for key not included in the input set")
    }

    /// Builds the function single-threaded from an owned key vector,
    /// with the given parameters, hasher and seed chooser.
    pub fn with_vec_p_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash {
        Self::_new(|h| {
            // Level 0 consumes `keys`; unassigned keys stay in the vector.
            let level0 = Self::build_level_st(&mut keys, params, h, seed_chooser, 0);
            (keys, level0)
        }, |keys, level_nr, h| {
            Self::build_level_st(keys, params, h, seed_chooser, level_nr)
        }, hasher, seed_chooser, params.seed_size)
    }

    /// Builds the function from an owned key vector using `threads_num` threads.
    /// Delegates to the single-threaded path when `threads_num == 1`.
    pub fn with_vec_p_threads_hash_sc<K>(mut keys: Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
        where K: Hash+Sync+Send, S: Sync, SC: Sync {
        if threads_num == 1 { return Self::with_vec_p_hash_sc(keys, params, hasher, seed_chooser); }
        Self::_new(|h| {
            let level0 = Self::build_level_mt(&mut keys, params, threads_num, &h, seed_chooser, 0);
            (keys, level0)
        }, |keys, level_nr, h| {
            Self::build_level_mt(keys, params, threads_num, &h, seed_chooser, level_nr)
        }, hasher, seed_chooser, params.seed_size)
    }


    /// Builds the function single-threaded from a key slice; level 0 clones
    /// only the keys it leaves unassigned, later levels work on that vector.
    pub fn with_slice_p_hash_sc<K>(keys: &[K], params: &Params<SS>, hasher: S, seed_chooser: SC) -> Self where K: Hash+Clone {
        Self::_new(|h| {
            Self::build_level_from_slice_st(keys, params, h, seed_chooser, 0)
        }, |keys, level_nr, h| {
            Self::build_level_st(keys, params, &h, seed_chooser, level_nr)
        }, hasher, seed_chooser, params.seed_size)
    }


    /// Builds the function from a key slice using `threads_num` threads.
    /// Delegates to the single-threaded path when `threads_num == 1`.
    pub fn with_slice_p_threads_hash_sc<K>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: S, seed_chooser: SC) -> Self
        where K: Hash+Sync+Send+Clone, S: Sync, SC: Sync {
        if threads_num == 1 { return Self::with_slice_p_hash_sc(keys, params, hasher, seed_chooser); }
        Self::_new(|h| {
            Self::build_level_from_slice_mt(keys, params, threads_num, h, seed_chooser, 0)
        }, |keys, level_nr, h| {
            Self::build_level_mt(keys, params, threads_num, h, seed_chooser, level_nr)
        }, hasher, seed_chooser, params.seed_size)
    }

    /// Shared construction driver.
    ///
    /// `build_first` builds level 0 and returns the still-unassigned keys;
    /// `build_level` builds each further level in place (shrinking the key
    /// vector) until no keys remain.
    #[inline]
    fn _new<K, BF, BL>(build_first: BF, build_level: BL, hasher: S, seed_chooser: SC, seed_size: SS) -> Self
        where BF: FnOnce(&S) -> (Vec::<K>, SeedEx<SS::VecElement>),
              BL: Fn(&mut Vec::<K>, u64, &S) -> SeedEx<SS::VecElement>,
              K: Hash
    {
        let (mut keys, level0) = build_first(&hasher);
        // Each level's outputs start right after the ranges of all earlier levels.
        let mut shift = level0.conf.output_range(seed_chooser, seed_size.into());
        let mut levels = Vec::with_capacity(16);
        while !keys.is_empty() {
            // Level numbers passed to the hasher start at 1; 0 is `level0`.
            let seeds = build_level(&mut keys, levels.len() as u64+1, &hasher);
            let out_range = seeds.conf.output_range(seed_chooser, seed_size.into());
            levels.push(Level { seeds, shift });
            shift += out_range;
        }
        Self {
            level0,
            levels: levels.into_boxed_slice(),
            hasher,
            seed_chooser,
            seed_size,
        }
    }

    /// Builds one level from a borrowed key slice (single-threaded) and
    /// returns the unassigned keys (cloned) plus the level's seeds.
    #[inline]
    fn build_level_from_slice_st<K>(keys: &[K], params: &Params<SS>, hasher: &S, seed_chooser: SC, level_nr: u64)
        -> (Vec<K>, SeedEx<SS::VecElement>)
        where K: Hash+Clone
    {
        let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect();
        // Hashes are sorted before being handed to the level builder.
        hashes.voracious_sort();
        let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
        let (seeds, builder) =
            build_st(&hashes, conf, params.seed_size, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser);
        let mut keys_vec = Vec::with_capacity(builder.unassigned_len(&seeds));
        // Release the builder's auxiliary memory before collecting keys.
        drop(builder);
        keys_vec.extend(keys.into_iter().filter(|key| {
            // A bucket seed of 0 means the key was not assigned at this level.
            unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
        }).cloned());
        (keys_vec, SeedEx{ seeds, conf })
    }

    /// Multi-threaded counterpart of `build_level_from_slice_st`
    /// (rayon for hashing/filtering, `voracious_mt_sort` for sorting).
    #[inline]
    fn build_level_from_slice_mt<K>(keys: &[K], params: &Params<SS>, threads_num: usize, hasher: &S, seed_chooser: SC, level_nr: u64)
        -> (Vec<K>, SeedEx<SS::VecElement>)
        where K: Hash+Sync+Send+Clone, S: Sync, SC: Sync
    {
        // Parallel hashing pays off only for larger inputs (threshold 4*2048 keys).
        let mut hashes: Box<[_]> = if keys.len() > 4*2048 { keys.par_iter().with_min_len(256).map(|k| hasher.hash_one(k, level_nr)).collect()
        } else {
            keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect()
        };
        hashes.voracious_mt_sort(threads_num);
        let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
        let (seeds, builder) =
            build_mt(&hashes, conf, params.seed_size, WINDOW_SIZE, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser, threads_num);
        let mut keys_vec = Vec::with_capacity(builder.unassigned_len(&seeds));
        // Release the builder's auxiliary memory before collecting keys.
        drop(builder);
        keys_vec.par_extend(keys.into_par_iter().filter(|key| {
            unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
        }).cloned());
        (keys_vec, SeedEx{ seeds, conf })
    }

    /// Builds one level in place (single-threaded): assigned keys are removed
    /// from `keys`, unassigned ones (bucket seed 0) are retained for the next level.
    #[inline(always)]
    fn build_level_st<K>(keys: &mut Vec::<K>, params: &Params<SS>, hasher: &S, seed_chooser: SC, level_nr: u64) -> SeedEx<SS::VecElement>
        where K: Hash
    {
        let mut hashes: Box<[_]> = keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect();
        hashes.voracious_sort();
        let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
        let (seeds, _) =
            build_st(&hashes, conf, params.seed_size, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser);
        keys.retain(|key| {
            unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
        });
        SeedEx{ seeds, conf }
    }

    /// Multi-threaded counterpart of `build_level_st`.
    #[inline]
    fn build_level_mt<K>(keys: &mut Vec::<K>, params: &Params<SS>, threads_num: usize, hasher: &S, seed_chooser: SC, level_nr: u64)
        -> SeedEx<SS::VecElement>
        where K: Hash+Sync+Send, S: Sync, SC: Sync
    {
        // Parallel hashing pays off only for larger inputs (threshold 4*2048 keys).
        let mut hashes: Box<[_]> = if keys.len() > 4*2048 { keys.par_iter().with_min_len(256).map(|k| hasher.hash_one(k, level_nr)).collect()
        } else {
            keys.iter().map(|k| hasher.hash_one(k, level_nr)).collect()
        };
        hashes.voracious_mt_sort(threads_num);
        let conf = seed_chooser.conf_for_minimal_p(hashes.len(), params);
        let (seeds, builder) =
            build_mt(&hashes, conf, params.seed_size, WINDOW_SIZE, seed_chooser.bucket_evaluator(params.bits_per_seed(), conf.slice_len()), seed_chooser, threads_num);
        let mut result = Vec::with_capacity(builder.unassigned_len(&seeds));
        drop(builder);
        // Move the old keys out, then refill `keys` with only the unassigned ones.
        std::mem::swap(keys, &mut result);
        keys.par_extend(result.into_par_iter().filter(|key| {
            unsafe { params.seed_size.get_seed(&seeds, conf.bucket_for(hasher.hash_one(key, level_nr))) == 0 }
        }));
        SeedEx{ seeds, conf }
    }

    /// Delegates to the seed chooser's `k`.
    #[inline(always)] pub fn k(&self) -> u8 { self.seed_chooser.k() }

    /// Delegates to the seed chooser's `minimal_output_range` for `num_of_keys`.
    #[inline(always)] pub fn minimal_output_range(&self, num_of_keys: usize) -> usize { self.seed_chooser.minimal_output_range(num_of_keys) }

    /// Size of the output range of `self`: the last level's shift plus its
    /// own range, or level 0's range when there are no further levels.
    pub fn output_range(&self) -> usize {
        if let Some(last_level) = self.levels.last() {
            last_level.shift + last_level.seeds.conf.output_range(self.seed_chooser, self.seed_size.into())
        } else {
            self.level0.conf.output_range(self.seed_chooser, self.seed_size.into())
        }
    }
}
245
246impl Perfect<Bits8, SeedOnly, BuildDefaultSeededHasher> {
247 pub fn from_vec_st<K>(keys: Vec::<K>) -> Self where K: Hash {
251 Self::with_vec_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
252 BuildDefaultSeededHasher::default(), SeedOnly)
253 }
254
255 pub fn from_vec_mt<K>(keys: Vec::<K>) -> Self where K: Hash+Send+Sync {
259 Self::with_vec_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
260 std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnly)
261 }
262
263 pub fn from_slice_st<K>(keys: &[K]) -> Self where K: Hash+Clone {
267 Self::with_slice_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
268 BuildDefaultSeededHasher::default(), SeedOnly)
269 }
270
271 pub fn from_slice_mt<K>(keys: &[K]) -> Self where K: Hash+Clone+Send+Sync {
275 Self::with_slice_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
276 std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnly)
277 }
278}
279
280impl Perfect<Bits8, SeedOnlyK, BuildDefaultSeededHasher> {
281 pub fn k_from_vec_st<K>(k: u8, keys: Vec::<K>) -> Self where K: Hash {
286 Self::with_vec_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
287 BuildDefaultSeededHasher::default(), SeedOnlyK(k))
288 }
289
290 pub fn k_from_vec_mt<K>(k: u8, keys: Vec::<K>) -> Self where K: Hash+Send+Sync {
295 Self::with_vec_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
296 std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnlyK(k))
297 }
298
299 pub fn k_from_slice_st<K>(k: u8, keys: &[K]) -> Self where K: Hash+Clone {
304 Self::with_slice_p_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
305 BuildDefaultSeededHasher::default(), SeedOnlyK(k))
306 }
307
308 pub fn k_from_slice_mt<K>(k: u8, keys: &[K]) -> Self where K: Hash+Clone+Send+Sync {
313 Self::with_slice_p_threads_hash_sc(keys, &Params::new(Bits8, bits_per_seed_to_100_bucket_size(8)),
314 std::thread::available_parallelism().map_or(1, |v| v.into()), BuildDefaultSeededHasher::default(), SeedOnlyK(k))
315 }
316}
317
318
#[cfg(test)]
pub(crate) mod tests {
    use std::fmt::Display;

    use crate::utils::{verify_partial_kphf, verify_partial_phf};

    use super::*;

    /// Checks that `f` is a valid (partial) perfect hash function over `keys`.
    fn phf_test<SC, K, SS, S>(f: &Perfect<SS, SC, S>, keys: &[K])
        where K: Display+Hash, SC: SeedChooser, SS: SeedSize, S: BuildSeededHasher
    {
        verify_partial_phf(f.output_range(), keys, |key| Some(f.get(key)));
    }

    /// Checks that `f` is a valid (partial) k-perfect hash function over `keys`.
    fn kphf_test<K: Display+Hash, SS: SeedSize, S: BuildSeededHasher>(f: &Perfect<SS, SeedOnlyK, S>, keys: &[K]) {
        verify_partial_kphf(f.k(), f.output_range(), keys, |key| Some(f.get(key)));
    }

    #[test]
    fn test_small() {
        let keys = [1, 2, 3, 4, 5];
        phf_test(&Perfect::from_slice_st(&keys), &keys);
    }

    #[test]
    fn test_medium() {
        let keys: Box<[u16]> = (0..1000).collect();
        phf_test(&Perfect::from_slice_st(&keys), &keys);
    }

    #[test]
    fn test_small_k() {
        let keys = [1, 2, 3, 4, 5];
        kphf_test(&Perfect::k_from_slice_st(3, &keys), &keys);
    }

    #[test]
    fn test_medium_k() {
        let keys: Box<[u16]> = (0..1000).collect();
        kphf_test(&Perfect::k_from_slice_st(3, &keys), &keys);
    }
}