rustywallet_bloom/
bloom.rs

1//! Bloom Filter implementation
2
3use crate::hash::{double_hash, fnv1a_64, fnv1a_64_seeded};
4
5/// A space-efficient probabilistic data structure for set membership testing.
6///
7/// Bloom filters can have false positives but never false negatives.
8/// If `contains()` returns `false`, the item is definitely not in the set.
9/// If `contains()` returns `true`, the item is probably in the set.
10///
11/// # Memory Usage
12///
13/// For a given number of items `n` and false positive rate `p`:
14/// - Optimal bits: `m = -n * ln(p) / (ln(2)^2)` ≈ `n * 9.6` for 1% FPR
15/// - Optimal hashes: `k = (m/n) * ln(2)` ≈ 7 for 1% FPR
16///
17/// # Example
18///
19/// ```rust
20/// use rustywallet_bloom::BloomFilter;
21///
22/// let mut filter = BloomFilter::new(100_000, 0.01);
23/// filter.insert("address1");
24/// assert!(filter.contains("address1"));
25/// ```
26pub struct BloomFilter {
27    bits: Vec<u64>,
28    num_bits: usize,
29    num_hashes: usize,
30    count: usize,
31}
32
33impl BloomFilter {
34    /// Creates a new Bloom filter optimized for the expected number of items
35    /// and desired false positive rate.
36    ///
37    /// # Arguments
38    ///
39    /// * `expected_items` - Expected number of items to insert
40    /// * `false_positive_rate` - Desired false positive rate (e.g., 0.01 for 1%)
41    ///
42    /// # Example
43    ///
44    /// ```rust
45    /// use rustywallet_bloom::BloomFilter;
46    ///
47    /// // 1 million items with 1% false positive rate
48    /// let filter = BloomFilter::new(1_000_000, 0.01);
49    /// ```
50    pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
51        let fpr = false_positive_rate.clamp(1e-15, 0.5); // Allow ultra-low FPR
52        let n = expected_items.max(1) as f64;
53        
54        // Optimal number of bits: m = -n * ln(p) / (ln(2)^2)
55        let ln2_sq = std::f64::consts::LN_2 * std::f64::consts::LN_2;
56        let num_bits = ((-n * fpr.ln()) / ln2_sq).ceil() as usize;
57        let num_bits = num_bits.max(64);
58        
59        // Optimal number of hash functions: k = (m/n) * ln(2)
60        // Allow up to 50 hashes for ultra-low FPR
61        let num_hashes = ((num_bits as f64 / n) * std::f64::consts::LN_2).ceil() as usize;
62        let num_hashes = num_hashes.clamp(3, 50);
63        
64        // Allocate bit array (using u64 chunks)
65        let num_words = num_bits.div_ceil(64);
66        
67        Self {
68            bits: vec![0u64; num_words],
69            num_bits,
70            num_hashes,
71            count: 0,
72        }
73    }
74
75    /// Creates a Bloom filter with explicit parameters.
76    ///
77    /// Use this if you need precise control over the filter size.
78    ///
79    /// # Arguments
80    ///
81    /// * `num_bits` - Total number of bits in the filter
82    /// * `num_hashes` - Number of hash functions to use
83    pub fn with_params(num_bits: usize, num_hashes: usize) -> Self {
84        let num_bits = num_bits.max(64);
85        let num_hashes = num_hashes.clamp(1, 16);
86        let num_words = num_bits.div_ceil(64);
87        
88        Self {
89            bits: vec![0u64; num_words],
90            num_bits,
91            num_hashes,
92            count: 0,
93        }
94    }
95
96    /// Inserts an item into the filter.
97    ///
98    /// After insertion, `contains()` will always return `true` for this item.
99    #[inline]
100    pub fn insert<T: AsRef<[u8]>>(&mut self, item: T) {
101        let data = item.as_ref();
102        let h1 = fnv1a_64(data);
103        let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15); // Golden ratio
104        
105        for i in 0..self.num_hashes {
106            let pos = double_hash(h1, h2, i, self.num_bits);
107            self.set_bit(pos);
108        }
109        
110        self.count += 1;
111    }
112
113    /// Checks if an item might be in the filter.
114    ///
115    /// Returns `false` if the item is definitely not in the set.
116    /// Returns `true` if the item is probably in the set (may be false positive).
117    #[inline]
118    pub fn contains<T: AsRef<[u8]>>(&self, item: T) -> bool {
119        let data = item.as_ref();
120        let h1 = fnv1a_64(data);
121        let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15);
122        
123        for i in 0..self.num_hashes {
124            let pos = double_hash(h1, h2, i, self.num_bits);
125            if !self.get_bit(pos) {
126                return false;
127            }
128        }
129        
130        true
131    }
132
133    /// Returns the number of items inserted.
134    pub fn len(&self) -> usize {
135        self.count
136    }
137
138    /// Returns true if no items have been inserted.
139    pub fn is_empty(&self) -> bool {
140        self.count == 0
141    }
142
143    /// Returns the memory usage in bytes.
144    pub fn memory_usage(&self) -> usize {
145        self.bits.len() * 8
146    }
147
148    /// Returns the number of bits in the filter.
149    pub fn num_bits(&self) -> usize {
150        self.num_bits
151    }
152
153    /// Returns the number of hash functions used.
154    pub fn num_hashes(&self) -> usize {
155        self.num_hashes
156    }
157
158    /// Returns the estimated false positive rate based on current fill.
159    pub fn estimated_fpr(&self) -> f64 {
160        let bits_set = self.bits.iter().map(|w| w.count_ones() as usize).sum::<usize>();
161        let fill_ratio = bits_set as f64 / self.num_bits as f64;
162        fill_ratio.powi(self.num_hashes as i32)
163    }
164
165    /// Clears all items from the filter.
166    pub fn clear(&mut self) {
167        self.bits.fill(0);
168        self.count = 0;
169    }
170
171    #[inline]
172    fn set_bit(&mut self, pos: usize) {
173        let word = pos / 64;
174        let bit = pos % 64;
175        self.bits[word] |= 1u64 << bit;
176    }
177
178    #[inline]
179    fn get_bit(&self, pos: usize) -> bool {
180        let word = pos / 64;
181        let bit = pos % 64;
182        (self.bits[word] & (1u64 << bit)) != 0
183    }
184}
185
186impl std::fmt::Debug for BloomFilter {
187    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
188        f.debug_struct("BloomFilter")
189            .field("items", &self.count)
190            .field("bits", &self.num_bits)
191            .field("hashes", &self.num_hashes)
192            .field("memory_mb", &(self.memory_usage() / 1_000_000))
193            .finish()
194    }
195}
196
197#[cfg(test)]
198mod tests {
199    use super::*;
200
201    #[test]
202    fn test_insert_and_contains() {
203        let mut bloom = BloomFilter::new(100, 0.01);
204        
205        bloom.insert("test1");
206        bloom.insert("test2");
207        bloom.insert(b"binary_data");
208        
209        assert!(bloom.contains("test1"));
210        assert!(bloom.contains("test2"));
211        assert!(bloom.contains(b"binary_data"));
212        assert_eq!(bloom.len(), 3);
213    }
214
215    #[test]
216    fn test_clear() {
217        let mut bloom = BloomFilter::new(100, 0.01);
218        bloom.insert("test");
219        assert!(bloom.contains("test"));
220        
221        bloom.clear();
222        assert!(!bloom.contains("test"));
223        assert_eq!(bloom.len(), 0);
224    }
225
226    #[test]
227    fn test_with_params() {
228        let bloom = BloomFilter::with_params(1_000_000, 5);
229        assert_eq!(bloom.num_bits(), 1_000_000);
230        assert_eq!(bloom.num_hashes(), 5);
231    }
232
233    #[test]
234    fn test_large_dataset() {
235        let n = 100_000;
236        let mut bloom = BloomFilter::new(n, 0.01);
237        
238        for i in 0..n {
239            bloom.insert(format!("addr_{}", i));
240        }
241        
242        // All inserted items should be found
243        for i in 0..n {
244            assert!(bloom.contains(format!("addr_{}", i)));
245        }
246        
247        assert_eq!(bloom.len(), n);
248    }
249}