Skip to main content

spider_util/
bloom.rs

1//! Bloom filter used by the scheduler for cheap duplicate checks.
2
3/// Bloom filter implementation backed by a bitset.
4pub struct BloomFilter {
5    bit_set: Vec<u64>,
6    num_bits: u64,
7    hash_functions: usize,
8}
9
10impl BloomFilter {
11    /// Creates a new BloomFilter with the specified capacity and number of hash functions.
12    pub fn new(num_bits: u64, hash_functions: usize) -> Self {
13        let size = ((num_bits as f64 / 64.0).ceil() as usize).max(1);
14        Self {
15            bit_set: vec![0; size],
16            num_bits,
17            hash_functions,
18        }
19    }
20
21    /// Adds an item to the BloomFilter.
22    pub fn add(&mut self, item: &str) {
23        // Pre-compute hash of the item once
24        let item_bytes = item.as_bytes();
25        let hash1 = seahash::hash(item_bytes);
26
27        for i in 0..self.hash_functions {
28            // Use double hashing: h(i) = hash1 + i * hash2
29            // Optimized: use XOR and rotation instead of hash(&i.to_ne_bytes()) to avoid allocation
30            let hash2 = hash1 ^ (i as u64).rotate_left(13);
31            let combined_hash = hash1.wrapping_add((i as u64).wrapping_mul(hash2));
32            let index = combined_hash % self.num_bits;
33
34            let bucket_idx = (index / 64) as usize;
35            let bit_idx = (index % 64) as usize;
36
37            if bucket_idx < self.bit_set.len() {
38                self.bit_set[bucket_idx] |= 1u64 << bit_idx;
39            }
40        }
41    }
42
43    /// Checks if an item might be in the BloomFilter.
44    /// Returns true if the item might be in the set, false if it definitely isn't.
45    pub fn might_contain(&self, item: &str) -> bool {
46        let item_bytes = item.as_bytes();
47        let hash1 = seahash::hash(item_bytes);
48
49        for i in 0..self.hash_functions {
50            // Optimized: use XOR and rotation instead of hash(&i.to_ne_bytes())
51            let hash2 = hash1 ^ (i as u64).rotate_left(13);
52            let combined_hash = hash1.wrapping_add((i as u64).wrapping_mul(hash2));
53            let index = combined_hash % self.num_bits;
54
55            let bucket_idx = (index / 64) as usize;
56            let bit_idx = (index % 64) as usize;
57
58            if bucket_idx >= self.bit_set.len() {
59                return false;
60            }
61
62            if (self.bit_set[bucket_idx] & (1u64 << bit_idx)) == 0 {
63                return false;
64            }
65        }
66        true
67    }
68}