Skip to main content

spider_util/
bloom_filter.rs

1use seahash::hash;
2use std::collections::hash_map::DefaultHasher;
3use std::hash::{Hash, Hasher};
4
5/// A proper Bloom Filter implementation using a bit vector for memory efficiency.
6/// This is used for efficiently checking if a URL has potentially been visited before,
7/// reducing the need for expensive lookups in the main visited URLs cache.
8pub struct BloomFilter {
9    bit_set: Vec<u64>,
10    num_bits: u64,
11    hash_functions: usize,
12}
13
14impl BloomFilter {
15    /// Creates a new BloomFilter with the specified capacity and number of hash functions.
16    pub fn new(num_bits: u64, hash_functions: usize) -> Self {
17        let size = ((num_bits as f64 / 64.0).ceil() as usize).max(1);
18        Self {
19            bit_set: vec![0; size],
20            num_bits,
21            hash_functions,
22        }
23    }
24
25    /// Adds an item to the BloomFilter.
26    pub fn add(&mut self, item: &str) {
27        for i in 0..self.hash_functions {
28            let index = self.get_bit_index(item, i);
29            let bucket_idx = (index / 64) as usize;
30            let bit_idx = (index % 64) as usize;
31
32            if bucket_idx < self.bit_set.len() {
33                self.bit_set[bucket_idx] |= 1u64 << bit_idx;
34            }
35        }
36    }
37
38    /// Checks if an item might be in the BloomFilter.
39    /// Returns true if the item might be in the set, false if it definitely isn't.
40    pub fn might_contain(&self, item: &str) -> bool {
41        for i in 0..self.hash_functions {
42            let index = self.get_bit_index(item, i);
43            let bucket_idx = (index / 64) as usize;
44            let bit_idx = (index % 64) as usize;
45
46            if bucket_idx >= self.bit_set.len() {
47                return false;
48            }
49
50            if (self.bit_set[bucket_idx] & (1u64 << bit_idx)) == 0 {
51                return false;
52            }
53        }
54        true
55    }
56
57    /// Calculates the bit index for an item using double hashing technique.
58    fn get_bit_index(&self, item: &str, i: usize) -> u64 {
59        let mut hasher = DefaultHasher::new();
60        item.hash(&mut hasher);
61        let hash1 = hasher.finish();
62
63        let combined = format!("{}{}", item, i);
64        let hash2 = hash(combined.as_bytes());
65
66        let combined_hash = hash1.wrapping_add((i as u64).wrapping_mul(hash2));
67        combined_hash % self.num_bits
68    }
69}
70