xorfilter/
fuse8.rs

1#[allow(unused_imports)]
2use std::collections::hash_map::{DefaultHasher, RandomState};
3use std::hash::{BuildHasher, Hash, Hasher};
4
5use crate::{BuildHasherDefault, Error, Result};
6
7// probabillity of success should always be > 0.5 so 100 iterations is highly unlikely.
8const XOR_MAX_ITERATIONS: usize = 100;
9
10#[inline]
11pub(crate) fn binary_fuse_murmur64(mut h: u64) -> u64 {
12    h ^= h >> 33;
13    h = h.wrapping_mul(0xff51afd7ed558ccd_u64);
14    h ^= h >> 33;
15    h = h.wrapping_mul(0xc4ceb9fe1a85ec53_u64);
16    h ^= h >> 33;
17    h
18}
19
20#[inline]
21pub(crate) fn binary_fuse_mix_split(key: u64, seed: u64) -> u64 {
22    binary_fuse_murmur64(key.wrapping_add(seed))
23}
24
25#[allow(dead_code)]
26#[inline]
27fn binary_fuse_rotl64(n: u64, c: u32) -> u64 {
28    n.rotate_left(c)
29}
30
31#[allow(dead_code)]
32#[inline]
33fn binary_fuse_reduce(hash: u32, n: u32) -> u32 {
34    // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
35    (((hash as u64) * (n as u64)) >> 32) as u32
36}
37
38#[inline]
39fn binary_fuse8_fingerprint(hash: u64) -> u64 {
40    hash ^ (hash >> 32)
41}
42
43// returns random number, modifies the seed
44pub(crate) fn binary_fuse_rng_splitmix64(seed: &mut u64) -> u64 {
45    *seed = seed.wrapping_add(0x9E3779B97F4A7C15_u64);
46    let mut z = *seed;
47    z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9_u64);
48    z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB_u64);
49    z ^ (z >> 31)
50}
51
52#[inline]
53pub(crate) fn binary_fuse_mulhi(a: u64, b: u64) -> u64 {
54    (((a as u128) * (b as u128)) >> 64) as u64
55}
56
57#[inline]
58pub(crate) fn binary_fuse_calculate_segment_length(arity: u32, size: u32) -> u32 {
59    let ln_size = (size as f64).ln();
60
61    // These parameters are very sensitive. Replacing 'floor' by 'round' can
62    // substantially affect the construction time.
63    match arity {
64        3 => 1_u32 << ((ln_size / 3.33_f64.ln() + 2.25).floor() as u32),
65        4 => 1_u32 << ((ln_size / 2.91_f64.ln() - 0.50).floor() as u32),
66        _ => 65536,
67    }
68}
69
70#[inline]
71fn binary_fuse8_max(a: f64, b: f64) -> f64 {
72    if a < b {
73        b
74    } else {
75        a
76    }
77}
78
79#[inline]
80pub(crate) fn binary_fuse_calculate_size_factor(arity: u32, size: u32) -> f64 {
81    let ln_size = (size as f64).ln();
82    match arity {
83        3 => binary_fuse8_max(1.125, 0.875 + 0.250 * 1000000.0_f64.ln() / ln_size),
84        4 => binary_fuse8_max(1.075, 0.770 + 0.305 * 0600000.0_f64.ln() / ln_size),
85        _ => 2.0,
86    }
87}
88
89#[inline]
90pub(crate) fn binary_fuse_mod3(x: u8) -> u8 {
91    if x > 2 {
92        x - 3
93    } else {
94        x
95    }
96}
97
98/// Type Fuse8 is probabilistic data-structure to test membership of an element in a set.
99///
100/// Fuse8 is parametrized over type `H` which is expected to implement [BuildHasher]
101/// trait, like types [RandomState] and [BuildHasherDefault]. When not supplied,
102/// [BuildHasherDefault] is used as the default hash-builder.
103///
104/// If `RandomState` is used as BuildHasher, `std` has got this to say
105/// > _A particular instance RandomState will create the same instances
106/// > of Hasher, but the hashers created by two different RandomState
107/// > instances are unlikely to produce the same result for the same values._
108///
109/// If [DefaultHasher] is used as BuildHasher, `std` has got this to say,
110/// > _The internal algorithm is not specified, and so its hashes
111/// > should not be relied upon over releases._
112///
113/// The default type for parameter `H` might change when a reliable and commonly used
114/// BuildHasher type available.
115///
116/// IMPORTANT: Fuse8 filter can only tolerate few duplicates in a given data-set.
117/// So make sure to supply a hasher that is capable of generating unique digests,
118/// _(with allowed tolerance of duplicates)_ and while supplying the digests directly
119/// via `populate_keys()` and `build_keys()` make sure they don't have more than few
120/// duplicates.
121pub struct Fuse8<H = BuildHasherDefault>
122where
123    H: BuildHasher,
124{
125    keys: Option<Vec<u64>>,
126    pub hash_builder: H,
127    pub seed: u64,
128    pub segment_length: u32,
129    pub segment_length_mask: u32,
130    pub segment_count: u32,
131    pub segment_count_length: u32,
132    pub finger_prints: Vec<u8>,
133}
134
135#[derive(Default)]
136pub(crate) struct BinaryHashes {
137    pub(crate) h0: u32,
138    pub(crate) h1: u32,
139    pub(crate) h2: u32,
140}
141
142impl<H> Fuse8<H>
143where
144    H: BuildHasher,
145{
146    #[inline]
147    fn binary_fuse8_hash_batch(&self, hash: u64) -> BinaryHashes {
148        let mut ans = BinaryHashes::default();
149
150        ans.h0 = binary_fuse_mulhi(hash, self.segment_count_length.into()) as u32;
151        ans.h1 = ans.h0 + self.segment_length;
152        ans.h2 = ans.h1 + self.segment_length;
153        ans.h1 ^= ((hash >> 18) as u32) & self.segment_length_mask;
154        ans.h2 ^= (hash as u32) & self.segment_length_mask;
155        ans
156    }
157
158    #[inline]
159    fn binary_fuse8_hash(&self, index: u32, hash: u64) -> u32 {
160        let mut h = binary_fuse_mulhi(hash, self.segment_count_length.into());
161        h += (index * self.segment_length) as u64;
162        // keep the lower 36 bits
163        let hh = hash & ((1_u64 << 36) - 1);
164        // index 0: right shift by 36; index 1: right shift by 18; index 2: no shift
165        h ^= (hh >> (36 - 18 * index)) & (self.segment_length_mask as u64);
166
167        h as u32
168    }
169}
170
171impl<H> Fuse8<H>
172where
173    H: BuildHasher,
174{
175    /// New Fuse8 instance that can index size number of keys. Internal data-structures
176    /// are pre-allocated for `size`.  `size` should be at least 2.
177    pub fn new(size: u32) -> Fuse8<H>
178    where
179        H: Default,
180    {
181        Self::with_hasher(size, H::default())
182    }
183
184    /// New Fuse8 instance initialized with supplied hasher.
185    pub fn with_hasher(size: u32, hash_builder: H) -> Fuse8<H> {
186        use std::cmp;
187
188        let arity = 3_u32;
189
190        let segment_length = match size {
191            0 => 4,
192            size => cmp::min(binary_fuse_calculate_segment_length(arity, size), 262144),
193        };
194
195        let segment_length_mask = segment_length - 1;
196        let mut array_length = {
197            let size_factor = binary_fuse_calculate_size_factor(arity, size);
198            let cap = match size {
199                0 | 1 => 0,
200                size => ((size as f64) * size_factor).round() as u32,
201            };
202            let n = ((cap + segment_length - 1) / segment_length).wrapping_sub(arity - 1);
203            (n.wrapping_add(arity) - 1) * segment_length
204        };
205
206        let mut segment_count = (array_length + segment_length - 1) / segment_length;
207        segment_count = if segment_count <= (arity - 1) {
208            1
209        } else {
210            segment_count - (arity - 1)
211        };
212
213        array_length = (segment_count + arity - 1) * segment_length;
214        let segment_count_length = segment_count * segment_length;
215
216        Fuse8 {
217            keys: Some(Vec::default()),
218            hash_builder,
219            seed: u64::default(),
220            segment_length,
221            segment_length_mask,
222            segment_count,
223            segment_count_length,
224            finger_prints: vec![0; array_length as usize],
225        }
226    }
227}
228
229impl<H> Fuse8<H>
230where
231    H: BuildHasher,
232{
233    /// Return the size of index.
234    #[inline]
235    pub fn size_of(&self) -> usize {
236        std::mem::size_of::<Self>() + self.finger_prints.len()
237    }
238
239    /// Insert 64-bit digest of a single key. Digest for the key shall be generated
240    /// using the default-hasher or via hasher supplied via [Fuse8::with_hasher] method.
241    pub fn insert<K: ?Sized + Hash>(&mut self, key: &K) {
242        let digest = {
243            let mut hasher = self.hash_builder.build_hasher();
244            key.hash(&mut hasher);
245            hasher.finish()
246        };
247        self.keys.as_mut().unwrap().push(digest);
248    }
249
250    /// Populate with 64-bit digests for a collection of keys of type `K`. Digest for
251    /// key shall be generated using the default-hasher or via hasher supplied
252    /// via [Fuse8::with_hasher] method.
253    pub fn populate<K: Hash>(&mut self, keys: &[K]) {
254        keys.iter().for_each(|key| {
255            let mut hasher = self.hash_builder.build_hasher();
256            key.hash(&mut hasher);
257            self.keys.as_mut().unwrap().push(hasher.finish());
258        })
259    }
260
261    /// Populate with pre-compute collection of 64-bit digests.
262    pub fn populate_keys(&mut self, digests: &[u64]) {
263        self.keys.as_mut().unwrap().extend_from_slice(&digests);
264    }
265
266    // construct the filter, returns true on success, false on failure.
267    // most likely, a failure is due to too high a memory usage
268    // size is the number of keys
269    // The caller is responsable for calling binary_fuse8_allocate(size,filter)
270    // before. The caller is responsible to ensure that there are no duplicated
271    // keys. The inner loop will run up to XOR_MAX_ITERATIONS times (default on
272    // 100), it should never fail, except if there are duplicated keys. If it fails,
273    // a return value of false is provided.
274    /// Build bitmap for keys that where previously inserted using [Fuse8::insert],
275    /// [Fuse8::populate] and [Fuse8::populate_keys] method.
276    pub fn build(&mut self) -> Result<()> {
277        match self.keys.take() {
278            Some(keys) => self.build_keys(&keys),
279            None => Ok(()),
280        }
281    }
282
283    /// Build a bitmap for pre-computed 64-bit digests for keys. If keys where
284    /// previously inserted using [Fuse8::insert] or [Fuse8::populate] or
285    /// [Fuse8::populate_keys] methods, they shall be ignored.
286    ///
287    /// It is upto the caller to ensure that digests are unique, that there no
288    /// duplicates.
289    pub fn build_keys(&mut self, digests: &[u64]) -> Result<()> {
290        let mut rng_counter = 0x726b2b9d438b9d4d_u64;
291        let capacity = self.finger_prints.len();
292        let size = digests.len();
293
294        self.seed = binary_fuse_rng_splitmix64(&mut rng_counter);
295
296        let mut reverse_order: Vec<u64> = vec![0; size + 1];
297        let mut reverse_h: Vec<u8> = vec![0; size];
298        let mut alone: Vec<u32> = vec![0; capacity as usize];
299        let mut t2count: Vec<u8> = vec![0; capacity as usize];
300        let mut t2hash: Vec<u64> = vec![0; capacity as usize];
301
302        let mut block_bits: u32 = 1;
303        while (1_u32 << block_bits) < self.segment_count {
304            block_bits += 1;
305        }
306
307        let block = 1_u32 << block_bits;
308
309        let mut start_pos: Vec<u32> = vec![0; 1 << block_bits];
310
311        let mut h012 = [0_u32; 5];
312
313        reverse_order[size] = 1; // sentinel
314        let mut iter = 0..=XOR_MAX_ITERATIONS;
315        loop {
316            if iter.next().is_none() {
317                err_at!(Fatal, msg: "Too many iterations. Are all your keys unique?")?;
318            }
319
320            for i in 0_u32..block {
321                // important : i * size would overflow as a 32-bit number in some
322                // cases.
323                start_pos[i as usize] =
324                    (((i as u64) * (size as u64)) >> block_bits) as u32;
325            }
326
327            let mask_block = (block - 1) as u64;
328            for (_, digest) in digests.iter().enumerate().take(size) {
329                let hash: u64 = binary_fuse_murmur64(digest.wrapping_add(self.seed));
330                let mut segment_index: u64 = hash >> (64 - block_bits);
331                while reverse_order[start_pos[segment_index as usize] as usize] != 0 {
332                    segment_index += 1;
333                    segment_index &= mask_block;
334                }
335                reverse_order[start_pos[segment_index as usize] as usize] = hash;
336                start_pos[segment_index as usize] += 1;
337            }
338
339            let mut error: isize = 0;
340            let mut duplicates = 0;
341            for (_, rev_order) in reverse_order.iter().enumerate().take(size) {
342                let hash: u64 = *rev_order;
343
344                let h0: usize = self.binary_fuse8_hash(0, hash) as usize;
345                t2count[h0] = t2count[h0].wrapping_add(4);
346                t2hash[h0] ^= hash;
347
348                let h1: usize = self.binary_fuse8_hash(1, hash) as usize;
349                t2count[h1] = t2count[h1].wrapping_add(4);
350                t2count[h1] ^= 1;
351                t2hash[h1] ^= hash;
352
353                let h2: usize = self.binary_fuse8_hash(2, hash) as usize;
354                t2count[h2] = t2count[h2].wrapping_add(4);
355                t2hash[h2] ^= hash;
356                t2count[h2] ^= 2;
357
358                // If we have duplicated hash values, then it is likely that
359                // the next comparison is true
360                if (t2hash[h0] & t2hash[h1] & t2hash[h2]) == 0 {
361                    // next we do the actual test
362                    if ((t2hash[h0] == 0) && (t2count[h0] == 8))
363                        || ((t2hash[h1] == 0) && (t2count[h1] == 8))
364                        || ((t2hash[h2] == 0) && (t2count[h2] == 8))
365                    {
366                        duplicates += 1;
367                        t2count[h0] = t2count[h0].wrapping_sub(4);
368                        t2hash[h0] ^= hash;
369                        t2count[h1] = t2count[h1].wrapping_sub(4);
370                        t2count[h1] ^= 1;
371                        t2hash[h1] ^= hash;
372                        t2count[h2] = t2count[h2].wrapping_sub(4);
373                        t2hash[h2] ^= hash;
374                        t2count[h2] ^= 2;
375                    }
376                }
377
378                error = if t2count[h0] < 4 { 1 } else { error };
379                error = if t2count[h1] < 4 { 1 } else { error };
380                error = if t2count[h2] < 4 { 1 } else { error };
381            }
382
383            if error > 0 {
384                continue;
385            }
386
387            let mut q_size = 0_usize; // End of key addition
388
389            // Add sets with one key to the queue.
390            for (i, x) in t2count.iter().enumerate().take(capacity) {
391                alone[q_size] = i as u32;
392                q_size += if (x >> 2) == 1 { 1 } else { 0 };
393            }
394
395            let mut stack_size = 0_usize;
396
397            while q_size > 0 {
398                q_size -= 1;
399                let index = alone[q_size] as usize;
400                if (t2count[index] >> 2) == 1 {
401                    let hash: u64 = t2hash[index];
402
403                    //h012[0] = self.binary_fuse8_hash(0, hash);
404                    h012[1] = self.binary_fuse8_hash(1, hash);
405                    h012[2] = self.binary_fuse8_hash(2, hash);
406                    h012[3] = self.binary_fuse8_hash(0, hash); // == h012[0];
407                    h012[4] = h012[1];
408
409                    let found: u8 = t2count[index] & 3;
410                    reverse_h[stack_size] = found;
411                    reverse_order[stack_size] = hash;
412                    stack_size += 1;
413
414                    let other_index1: u32 = h012[(found + 1) as usize];
415                    alone[q_size] = other_index1;
416                    q_size += if (t2count[other_index1 as usize] >> 2) == 2 {
417                        1
418                    } else {
419                        0
420                    };
421
422                    t2count[other_index1 as usize] -= 4;
423                    t2count[other_index1 as usize] ^= binary_fuse_mod3(found + 1);
424                    t2hash[other_index1 as usize] ^= hash;
425
426                    let other_index2: u32 = h012[(found + 2) as usize];
427                    alone[q_size] = other_index2;
428                    q_size += if (t2count[other_index2 as usize] >> 2) == 2 {
429                        1
430                    } else {
431                        0
432                    };
433                    t2count[other_index2 as usize] -= 4;
434                    t2count[other_index2 as usize] ^= binary_fuse_mod3(found + 2);
435                    t2hash[other_index2 as usize] ^= hash;
436                }
437            }
438
439            if (stack_size + duplicates) == size {
440                break; // success
441            }
442
443            reverse_order.fill(0);
444            reverse_order[size] = 1; // sentinel
445            t2count.fill(0);
446            t2hash.fill(0);
447
448            self.seed = binary_fuse_rng_splitmix64(&mut rng_counter);
449        }
450
451        if size == 0 {
452            return Ok(());
453        }
454
455        for i in (0_usize..size).rev() {
456            // the hash of the key we insert next
457            let hash: u64 = reverse_order[i];
458            let xor2: u8 = binary_fuse8_fingerprint(hash) as u8;
459            let found: usize = reverse_h[i] as usize;
460            h012[0] = self.binary_fuse8_hash(0, hash);
461            h012[1] = self.binary_fuse8_hash(1, hash);
462            h012[2] = self.binary_fuse8_hash(2, hash);
463            h012[3] = h012[0];
464            h012[4] = h012[1];
465            self.finger_prints[h012[found] as usize] = xor2
466                ^ self.finger_prints[h012[found + 1] as usize]
467                ^ self.finger_prints[h012[found + 2] as usize];
468        }
469
470        Ok(())
471    }
472}
473
474impl<H> Fuse8<H>
475where
476    H: BuildHasher,
477{
478    /// Contains tell you whether the key is likely part of the set, with false
479    /// positive rate.
480    pub fn contains<K: ?Sized + Hash>(&self, key: &K) -> bool {
481        let digest = {
482            let mut hasher = self.hash_builder.build_hasher();
483            key.hash(&mut hasher);
484            hasher.finish()
485        };
486        self.contains_key(digest)
487    }
488
489    /// Contains tell you whether the key, as pre-computed digest form, is likely
490    /// part of the set, with false positive rate.
491    pub fn contains_key(&self, digest: u64) -> bool {
492        let hash = binary_fuse_mix_split(digest, self.seed);
493        let mut f = binary_fuse8_fingerprint(hash) as u8;
494        let BinaryHashes { h0, h1, h2 } = self.binary_fuse8_hash_batch(hash);
495        f ^= self.finger_prints[h0 as usize]
496            ^ self.finger_prints[h1 as usize]
497            ^ self.finger_prints[h2 as usize];
498        f == 0
499    }
500
501    #[allow(dead_code)]
502    fn get_hasher(&self) -> H::Hasher {
503        self.hash_builder.build_hasher()
504    }
505}
506
507#[cfg(test)]
508#[path = "fuse8_test.rs"]
509mod fuse8_test;