Skip to main content

spider_util/
bloom.rs

1//! # Bloom Filter Module
2//!
3//! Implements a memory-efficient Bloom Filter for duplicate URL detection.
4//!
5//! ## Overview
6//!
7//! The Bloom Filter module provides an efficient probabilistic data structure
8//! for testing whether an element is a member of a set. In the context of web
9//! crawling, it's used to quickly check if a URL has potentially been visited
10//! before, significantly reducing the need for expensive lookups in the main
11//! visited URLs cache. The filter trades a small probability of false positives
12//! for substantial memory savings and performance gains.
13//!
14//! ## Key Components
15//!
16//! - **BloomFilter**: Main struct implementing the Bloom Filter algorithm
17//! - **Bit Vector**: Memory-efficient storage using a vector of 64-bit integers
18//! - **Hash Functions**: Multiple hash functions using double hashing technique
19//! - **Might Contain**: Probabilistic membership testing method
20//!
21//! ## Algorithm Details
22//!
23//! The implementation uses a bit vector for memory efficiency and applies double
24//! hashing to generate multiple hash values from two initial hash functions.
25//! This approach reduces the computational overhead of calculating multiple
26//! independent hash functions while maintaining good distribution properties.
27//! The filter supports configurable size and number of hash functions.
28//!
29//! ## Example
30//!
31//! ```rust,ignore
32//! use spider_util::bloom_filter::BloomFilter;
33//!
34//! // Create a Bloom Filter with capacity for ~1M items and 5 hash functions
35//! let mut bloom_filter = BloomFilter::new(5_000_000, 5);
36//!
37//! // Add items to the filter
38//! bloom_filter.add("https://example.com/page1");
39//! bloom_filter.add("https://example.com/page2");
40//!
41//! // Check if items might be in the set (with possibility of false positives)
42//! assert_eq!(bloom_filter.might_contain("https://example.com/page1"), true);
43//! assert_eq!(bloom_filter.might_contain("https://example.com/nonexistent"), false); // Likely, but not guaranteed
44//! ```
45
46/// A proper Bloom Filter implementation using a bit vector for memory efficiency.
47/// This is used for efficiently checking if a URL has potentially been visited before,
48/// reducing the need for expensive lookups in the main visited URLs cache.
49pub struct BloomFilter {
50    bit_set: Vec<u64>,
51    num_bits: u64,
52    hash_functions: usize,
53}
54
55impl BloomFilter {
56    /// Creates a new BloomFilter with the specified capacity and number of hash functions.
57    pub fn new(num_bits: u64, hash_functions: usize) -> Self {
58        let size = ((num_bits as f64 / 64.0).ceil() as usize).max(1);
59        Self {
60            bit_set: vec![0; size],
61            num_bits,
62            hash_functions,
63        }
64    }
65
66    /// Adds an item to the BloomFilter.
67    pub fn add(&mut self, item: &str) {
68        // Pre-compute hash of the item once
69        let item_bytes = item.as_bytes();
70        let hash1 = seahash::hash(item_bytes);
71
72        for i in 0..self.hash_functions {
73            // Use double hashing: h(i) = hash1 + i * hash2
74            // Optimized: use XOR and rotation instead of hash(&i.to_ne_bytes()) to avoid allocation
75            let hash2 = hash1 ^ (i as u64).rotate_left(13);
76            let combined_hash = hash1.wrapping_add((i as u64).wrapping_mul(hash2));
77            let index = combined_hash % self.num_bits;
78
79            let bucket_idx = (index / 64) as usize;
80            let bit_idx = (index % 64) as usize;
81
82            if bucket_idx < self.bit_set.len() {
83                self.bit_set[bucket_idx] |= 1u64 << bit_idx;
84            }
85        }
86    }
87
88    /// Checks if an item might be in the BloomFilter.
89    /// Returns true if the item might be in the set, false if it definitely isn't.
90    pub fn might_contain(&self, item: &str) -> bool {
91        let item_bytes = item.as_bytes();
92        let hash1 = seahash::hash(item_bytes);
93
94        for i in 0..self.hash_functions {
95            // Optimized: use XOR and rotation instead of hash(&i.to_ne_bytes())
96            let hash2 = hash1 ^ (i as u64).rotate_left(13);
97            let combined_hash = hash1.wrapping_add((i as u64).wrapping_mul(hash2));
98            let index = combined_hash % self.num_bits;
99
100            let bucket_idx = (index / 64) as usize;
101            let bit_idx = (index % 64) as usize;
102
103            if bucket_idx >= self.bit_set.len() {
104                return false;
105            }
106
107            if (self.bit_set[bucket_idx] & (1u64 << bit_idx)) == 0 {
108                return false;
109            }
110        }
111        true
112    }
113}