Skip to main content

spider_lib/utils/
bloom_filter.rs

1//! Bloom Filter implementation for efficient URL duplicate detection.
2//!
3//! This module provides a Bloom Filter implementation that can be used to efficiently
4//! check whether a URL has been visited before, reducing memory usage compared to
5//! storing all URLs in a hash set.
6
7use std::collections::hash_map::DefaultHasher;
8use std::hash::{Hash, Hasher};
9use std::sync::RwLock;
10
11/// A thread-safe Bloom Filter implementation
12pub struct BloomFilter {
13    bit_array: RwLock<Vec<bool>>,
14    bit_array_size: usize,
15    hash_functions: usize,
16}
17
18impl BloomFilter {
19    /// Creates a new Bloom Filter with the specified size and number of hash functions
20    pub fn new(size: usize, hash_functions: usize) -> Self {
21        Self {
22            bit_array: RwLock::new(vec![false; size]),
23            bit_array_size: size,
24            hash_functions,
25        }
26    }
27
28    /// Adds an item to the Bloom Filter
29    pub fn add<T: Hash>(&self, item: &T) {
30        let mut bit_array = self.bit_array.write().unwrap();
31        for i in 0..self.hash_functions {
32            let hash = self.hash_with_seed(item, i);
33            let index = hash % self.bit_array_size;
34            bit_array[index] = true;
35        }
36    }
37
38    /// Checks if an item might be in the Bloom Filter
39    /// Returns true if the item might be in the set, false if it definitely isn't
40    pub fn might_contain<T: Hash>(&self, item: &T) -> bool {
41        let bit_array = self.bit_array.read().unwrap();
42        for i in 0..self.hash_functions {
43            let hash = self.hash_with_seed(item, i);
44            let index = hash % self.bit_array_size;
45            if !bit_array[index] {
46                return false;
47            }
48        }
49        true
50    }
51
52    /// Computes a hash with a seed value
53    fn hash_with_seed<T: Hash>(&self, item: &T, seed: usize) -> usize {
54        let mut hasher = DefaultHasher::new();
55        item.hash(&mut hasher);
56        (hasher.finish() as usize) ^ seed
57    }
58}
59