// xorfilter/fuse16.rs

1#[allow(unused_imports)]
2use std::collections::hash_map::{DefaultHasher, RandomState};
3use std::{
4    collections::BTreeMap,
5    hash::{BuildHasher, Hash, Hasher},
6};
7
8use crate::{
9    fuse8::{
10        binary_fuse_calculate_segment_length, binary_fuse_calculate_size_factor,
11        binary_fuse_mix_split, binary_fuse_mod3, binary_fuse_mulhi, binary_fuse_murmur64,
12        binary_fuse_rng_splitmix64, BinaryHashes,
13    },
14    BuildHasherDefault, Error, Result,
15};
16
// Probability of success should always be > 0.5, so 100 iterations is highly
// unlikely to be exhausted unless the digest set contains duplicates.
const XOR_MAX_ITERATIONS: usize = 100;
19
/// Fold the upper 32 bits of `hash` into the lower half. Callers truncate the
/// result to 16 bits when storing it in the fingerprint table.
#[inline]
pub fn binary_fuse16_fingerprint(hash: u64) -> u64 {
    let upper = hash >> 32;
    hash ^ upper
}
24
25/// Type Fuse16 is probabilistic data-structure to test membership of an element in a set.
26///
27/// Fuse16 is parametrized over type `H` which is expected to implement [BuildHasher]
28/// trait, like types [RandomState] and [BuildHasherDefault]. When not supplied,
29/// [BuildHasherDefault] is used as the default hash-builder.
30///
31/// If `RandomState` is used as BuildHasher, `std` has got this to say
32/// > _A particular instance RandomState will create the same instances
33/// > of Hasher, but the hashers created by two different RandomState
34/// > instances are unlikely to produce the same result for the same values._
35///
36/// If [DefaultHasher] is used as BuildHasher, `std` has got this to say,
37/// > _The internal algorithm is not specified, and so its hashes
38/// > should not be relied upon over releases._
39///
40/// The default type for parameter `H` might change when a reliable and commonly used
41/// BuildHasher type available.
pub struct Fuse16<H = BuildHasherDefault>
where
    H: BuildHasher,
{
    // Staging area for key digests collected via insert/populate; taken
    // (becomes None) when `build` consumes them.
    keys: Option<BTreeMap<u64, ()>>,
    /// Hash-builder used to digest keys in insert/populate/contains.
    pub hash_builder: H,
    /// Seed mixed into every digest; re-drawn by `build_keys` on each
    /// construction attempt.
    pub seed: u64,
    /// Length of one segment; a power of two (see `with_hasher`).
    pub segment_length: u32,
    /// `segment_length - 1`, used to mask hash bits into a segment.
    pub segment_length_mask: u32,
    /// Number of data segments in the filter.
    pub segment_count: u32,
    /// `segment_count * segment_length`, the addressable range for slot 0.
    pub segment_count_length: u32,
    /// 16-bit fingerprint table; its length is the filter's array length.
    pub finger_prints: Vec<u16>,
}
55
56impl<H> Fuse16<H>
57where
58    H: BuildHasher,
59{
60    #[inline]
61    fn binary_fuse16_hash_batch(&self, hash: u64) -> BinaryHashes {
62        let mut ans = BinaryHashes::default();
63
64        ans.h0 = binary_fuse_mulhi(hash, self.segment_count_length.into()) as u32;
65        ans.h1 = ans.h0 + self.segment_length;
66        ans.h2 = ans.h1 + self.segment_length;
67        ans.h1 ^= ((hash >> 18) as u32) & self.segment_length_mask;
68        ans.h2 ^= (hash as u32) & self.segment_length_mask;
69        ans
70    }
71
72    #[inline]
73    fn binary_fuse16_hash(&self, index: u32, hash: u64) -> u32 {
74        let mut h = binary_fuse_mulhi(hash, self.segment_count_length.into());
75        h += (index * self.segment_length) as u64;
76        // keep the lower 36 bits
77        let hh = hash & ((1_u64 << 36) - 1);
78        // index 0: right shift by 36; index 1: right shift by 18; index 2: no shift
79        h ^= (hh >> (36 - 18 * index)) & (self.segment_length_mask as u64);
80
81        h as u32
82    }
83}
84
85impl<H> Fuse16<H>
86where
87    H: BuildHasher,
88{
89    /// New Fuse16 instance that can index size number of keys. Internal data-structures
90    /// are pre-allocated for `size`.  `size` should be at least 2.
91    pub fn new(size: u32) -> Fuse16<H>
92    where
93        H: Default,
94    {
95        Self::with_hasher(size, H::default())
96    }
97
98    /// New Fuse16 instance initialized with supplied hasher.
99    pub fn with_hasher(size: u32, hash_builder: H) -> Fuse16<H> {
100        use std::cmp;
101
102        let arity = 3_u32;
103
104        let segment_length = match size {
105            0 => 4,
106            size => cmp::min(binary_fuse_calculate_segment_length(arity, size), 262144),
107        };
108
109        let segment_length_mask = segment_length - 1;
110        let mut array_length = {
111            let size_factor = binary_fuse_calculate_size_factor(arity, size);
112            let cap = match size {
113                0 | 1 => 0,
114                size => ((size as f64) * size_factor).round() as u32,
115            };
116            let n = ((cap + segment_length - 1) / segment_length).wrapping_sub(arity - 1);
117            (n.wrapping_add(arity) - 1) * segment_length
118        };
119
120        let mut segment_count = (array_length + segment_length - 1) / segment_length;
121        segment_count = if segment_count <= (arity - 1) {
122            1
123        } else {
124            segment_count - (arity - 1)
125        };
126
127        array_length = (segment_count + arity - 1) * segment_length;
128        let segment_count_length = segment_count * segment_length;
129
130        Fuse16 {
131            keys: Some(BTreeMap::new()),
132            hash_builder,
133            seed: u64::default(),
134            segment_length,
135            segment_length_mask,
136            segment_count,
137            segment_count_length,
138            finger_prints: vec![0; array_length as usize],
139        }
140    }
141}
142
impl<H> Fuse16<H>
where
    H: BuildHasher,
{
    /// Return the size of index, in bytes: the struct itself plus two bytes
    /// per 16-bit fingerprint slot.
    #[inline]
    pub fn size_of(&self) -> usize {
        std::mem::size_of::<Self>() + (self.finger_prints.len() * 2)
    }

    /// Insert 64-bit digest of a single key. Digest for the key shall be generated
    /// using the default-hasher or via hasher supplied via [Fuse16::with_hasher] method.
    ///
    /// Panics if called after [Fuse16::build], which consumes the staged keys.
    pub fn insert<K: ?Sized + Hash>(&mut self, key: &K) {
        let digest = {
            let mut hasher = self.hash_builder.build_hasher();
            key.hash(&mut hasher);
            hasher.finish()
        };
        // BTreeMap keyed on the digest de-duplicates repeated inserts.
        self.keys.as_mut().unwrap().insert(digest, ());
    }

    /// Populate with 64-bit digests for a collection of keys of type `K`. Digest for
    /// key shall be generated using the default-hasher or via hasher supplied
    /// via [Fuse16::with_hasher] method.
    ///
    /// Panics if called after [Fuse16::build], which consumes the staged keys.
    pub fn populate<K: Hash>(&mut self, keys: &[K]) {
        keys.iter().for_each(|key| {
            let mut hasher = self.hash_builder.build_hasher();
            key.hash(&mut hasher);
            self.keys.as_mut().unwrap().insert(hasher.finish(), ());
        })
    }

    /// Populate with pre-computed collection of 64-bit digests.
    ///
    /// Panics if called after [Fuse16::build], which consumes the staged keys.
    pub fn populate_keys(&mut self, digests: &[u64]) {
        for digest in digests.iter() {
            self.keys.as_mut().unwrap().insert(*digest, ());
        }
    }
    // Construct the filter; returns Ok on success, error on failure.
    // Most likely, a failure is due to too high a memory usage.
    // `size` is the number of keys. The caller is responsible for calling
    // the allocation step (with_hasher) before, and for ensuring that there
    // are no duplicated keys. The inner loop will run up to
    // XOR_MAX_ITERATIONS times (default 100); it should never fail, except
    // if there are duplicated keys, in which case an error is returned.
    /// Build bitmap for keys that were previously inserted using [Fuse16::insert],
    /// [Fuse16::populate] and [Fuse16::populate_keys] method.
    pub fn build(&mut self) -> Result<()> {
        // `take` leaves None behind, so build is effectively run-once;
        // a second call falls through to Ok(()).
        match self.keys.take() {
            Some(keys) => {
                let digests = keys.iter().map(|(k, _)| *k).collect::<Vec<u64>>();
                self.build_keys(&digests)
            }
            None => Ok(()),
        }
    }

    /// Build a bitmap for pre-computed 64-bit digests for keys. If keys were
    /// previously inserted using [Fuse16::insert] or [Fuse16::populate] or
    /// [Fuse16::populate_keys] methods, they shall be ignored.
    ///
    /// It is up to the caller to ensure that digests are unique, that there
    /// are no duplicates.
    pub fn build_keys(&mut self, digests: &[u64]) -> Result<()> {
        let mut rng_counter = 0x726b2b9d438b9d4d_u64;
        let capacity = self.finger_prints.len();
        let size = digests.len();

        self.seed = binary_fuse_rng_splitmix64(&mut rng_counter);
        // reverse_order holds hashes in peeling order; the extra slot at
        // index `size` is a sentinel for the counting-sort scan below.
        let mut reverse_order: Vec<u64> = vec![0; size + 1];
        // For each peeled hash, which of its three slots (0/1/2) was free.
        let mut reverse_h: Vec<u8> = vec![0; size];
        // Work queue of slots that currently host exactly one hash.
        let mut alone: Vec<u32> = vec![0; capacity];
        // t2count packs, per slot, the occupancy count (bits 2.., stepped in
        // multiples of 4) and the XOR of the slot-index parities (bits 0..2).
        let mut t2count: Vec<u8> = vec![0; capacity];
        // XOR of all hashes mapping to each slot.
        let mut t2hash: Vec<u64> = vec![0; capacity];

        // Smallest power-of-two block count covering segment_count.
        let mut block_bits: u32 = 1;
        while (1_u32 << block_bits) < self.segment_count {
            block_bits += 1;
        }
        let block = 1_u32 << block_bits;

        let mut start_pos: Vec<u32> = vec![0; 1 << block_bits];

        // Scratch for the three candidate slots; entries 3 and 4 duplicate 0
        // and 1 so that h012[found + 1/2] wraps without a modulo.
        let mut h012 = [0_u32; 5];

        reverse_order[size] = 1; // sentinel
        let mut iter = 0..=XOR_MAX_ITERATIONS;
        loop {
            // Retry with a fresh seed until success or too many attempts
            // (which indicates duplicate digests).
            if iter.next().is_none() {
                err_at!(Fatal, msg: "Too many iterations. Are all your keys unique?")?;
            }

            for i in 0_u32..block {
                // important : i * size would overflow as a 32-bit number in some
                // cases.
                start_pos[i as usize] =
                    (((i as u64) * (size as u64)) >> block_bits) as u32;
            }

            // Counting-sort style placement of the seeded hashes into
            // reverse_order, bucketed by their top block_bits bits; linear
            // probing (advance and wrap) resolves bucket collisions.
            let mask_block = (block - 1) as u64;
            for (_, digest) in digests.iter().enumerate().take(size) {
                let hash: u64 = binary_fuse_murmur64(digest.wrapping_add(self.seed));
                let mut segment_index: u64 = hash >> (64 - block_bits);
                while reverse_order[start_pos[segment_index as usize] as usize] != 0 {
                    segment_index += 1;
                    segment_index &= mask_block;
                }
                reverse_order[start_pos[segment_index as usize] as usize] = hash;
                start_pos[segment_index as usize] += 1;
            }

            // Accumulate per-slot occupancy counts and hash XORs.
            let mut error: isize = 0;
            for (_, rev_order) in reverse_order.iter().enumerate().take(size) {
                let hash: u64 = *rev_order;

                let h0: usize = self.binary_fuse16_hash(0, hash) as usize;
                t2count[h0] = t2count[h0].wrapping_add(4);
                t2hash[h0] ^= hash;

                let h1: usize = self.binary_fuse16_hash(1, hash) as usize;
                t2count[h1] = t2count[h1].wrapping_add(4);
                t2count[h1] ^= 1;
                t2hash[h1] ^= hash;

                let h2: usize = self.binary_fuse16_hash(2, hash) as usize;
                t2count[h2] = t2count[h2].wrapping_add(4);
                t2hash[h2] ^= hash;
                t2count[h2] ^= 2;

                // A count below 4 means the u8 counter wrapped: more than 63
                // hashes landed on one slot, so restart with a new seed.
                error = if t2count[h0] < 4 { 1 } else { error };
                error = if t2count[h1] < 4 { 1 } else { error };
                error = if t2count[h2] < 4 { 1 } else { error };
            }

            if error > 0 {
                continue;
            }

            let mut q_size = 0_usize; // End of key addition

            // Add sets with one key to the queue.
            for (i, x) in t2count.iter().enumerate().take(capacity) {
                alone[q_size] = i as u32;
                q_size += if (x >> 2) == 1 { 1 } else { 0 };
            }

            let mut stack_size = 0_usize;

            // Peeling: repeatedly take a slot hosting exactly one hash,
            // record it on the stack, and detach it from its two other
            // slots; slots that drop to one occupant are re-queued.
            while q_size > 0 {
                q_size -= 1;
                let index = alone[q_size] as usize;
                if (t2count[index] >> 2) == 1 {
                    let hash: u64 = t2hash[index];

                    //h012[0] = binary_fuse16_hash(0, hash, self);
                    h012[1] = self.binary_fuse16_hash(1, hash);
                    h012[2] = self.binary_fuse16_hash(2, hash);
                    h012[3] = self.binary_fuse16_hash(0, hash); // == h012[0];
                    h012[4] = h012[1];

                    // `found` is the parity residue: which of the three slots
                    // (0/1/2) this lone hash occupies here.
                    let found: u8 = t2count[index] & 3;
                    reverse_h[stack_size] = found;
                    reverse_order[stack_size] = hash;
                    stack_size += 1;

                    let other_index1: u32 = h012[(found + 1) as usize];
                    alone[q_size] = other_index1;
                    // Queue the neighbour only if it is about to become lone
                    // (currently holds exactly two hashes).
                    q_size += if (t2count[other_index1 as usize] >> 2) == 2 {
                        1
                    } else {
                        0
                    };

                    t2count[other_index1 as usize] -= 4;
                    t2count[other_index1 as usize] ^= binary_fuse_mod3(found + 1);
                    t2hash[other_index1 as usize] ^= hash;

                    let other_index2: u32 = h012[(found + 2) as usize];
                    alone[q_size] = other_index2;
                    q_size += if (t2count[other_index2 as usize] >> 2) == 2 {
                        1
                    } else {
                        0
                    };
                    t2count[other_index2 as usize] -= 4;
                    t2count[other_index2 as usize] ^= binary_fuse_mod3(found + 2);
                    t2hash[other_index2 as usize] ^= hash;
                }
            }

            if stack_size == size {
                break; // success
            }

            // Peeling stalled: reset scratch state and retry with a new seed.
            reverse_order.fill(0);
            reverse_order[size] = 1; // sentinel
            t2count.fill(0);
            t2hash.fill(0);

            self.seed = binary_fuse_rng_splitmix64(&mut rng_counter);
        }

        // Assign fingerprints in reverse peeling order, so each hash's free
        // slot is fixed after its two other slots already have final values.
        for i in (0_usize..size).rev() {
            // the hash of the key we insert next
            let hash: u64 = reverse_order[i];
            let xor2: u16 = binary_fuse16_fingerprint(hash) as u16;
            let found: usize = reverse_h[i] as usize;
            h012[0] = self.binary_fuse16_hash(0, hash);
            h012[1] = self.binary_fuse16_hash(1, hash);
            h012[2] = self.binary_fuse16_hash(2, hash);
            h012[3] = h012[0];
            h012[4] = h012[1];

            self.finger_prints[h012[found] as usize] = xor2
                ^ self.finger_prints[h012[found + 1] as usize]
                ^ self.finger_prints[h012[found + 2] as usize];
        }

        Ok(())
    }
}
365
366impl<H> Fuse16<H>
367where
368    H: BuildHasher,
369{
370    /// Contains tell you whether the key is likely part of the set, with false
371    /// positive rate.
372    pub fn contains<K: ?Sized + Hash>(&self, key: &K) -> bool {
373        let digest = {
374            let mut hasher = self.hash_builder.build_hasher();
375            key.hash(&mut hasher);
376            hasher.finish()
377        };
378        self.contains_key(digest)
379    }
380
381    /// Contains tell you whether the key, as pre-computed digest form, is likely
382    /// part of the set, with false positive rate.
383    pub fn contains_key(&self, digest: u64) -> bool {
384        let hash = binary_fuse_mix_split(digest, self.seed);
385        let mut f = binary_fuse16_fingerprint(hash) as u16;
386        let BinaryHashes { h0, h1, h2 } = self.binary_fuse16_hash_batch(hash);
387        f ^= self.finger_prints[h0 as usize]
388            ^ self.finger_prints[h1 as usize]
389            ^ self.finger_prints[h2 as usize];
390        f == 0
391    }
392
393    #[allow(dead_code)]
394    fn get_hasher(&self) -> H::Hasher {
395        self.hash_builder.build_hasher()
396    }
397}
398
399#[cfg(test)]
400#[path = "fuse16_test.rs"]
401mod fuse16_test;