rustywallet_bloom/
counting.rs

1//! Counting Bloom Filter implementation with removal support.
2//!
3//! Unlike standard Bloom filters, counting Bloom filters use counters instead
4//! of single bits, allowing items to be removed from the filter.
5
6use crate::hash::{double_hash, fnv1a_64, fnv1a_64_seeded};
7
8/// Error type for counting bloom filter operations.
9#[derive(Debug, Clone, PartialEq, Eq)]
10pub enum CountingBloomError {
11    /// Counter would underflow (item not in filter or already removed)
12    CounterUnderflow,
13    /// Counter would overflow (too many insertions of same item)
14    CounterOverflow,
15}
16
17impl std::fmt::Display for CountingBloomError {
18    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
19        match self {
20            CountingBloomError::CounterUnderflow => {
21                write!(f, "Counter underflow: item not in filter or already removed")
22            }
23            CountingBloomError::CounterOverflow => {
24                write!(f, "Counter overflow: too many insertions of same item")
25            }
26        }
27    }
28}
29
30impl std::error::Error for CountingBloomError {}
31
32/// A counting Bloom filter that supports removal of items.
33///
34/// Uses 4-bit counters (0-15) for each position, allowing items to be
35/// inserted multiple times and removed. This uses 4x more memory than
36/// a standard Bloom filter but enables deletion.
37///
38/// # Example
39///
40/// ```rust
41/// use rustywallet_bloom::CountingBloomFilter;
42///
43/// let mut filter = CountingBloomFilter::new(10_000, 0.01);
44///
45/// filter.insert("address1");
46/// filter.insert("address2");
47/// assert!(filter.contains("address1"));
48///
49/// filter.remove("address1").unwrap();
50/// assert!(!filter.contains("address1"));
51/// assert!(filter.contains("address2"));
52/// ```
53pub struct CountingBloomFilter {
54    /// Counters stored as nibbles (4 bits each) in u8 array
55    counters: Vec<u8>,
56    /// Number of counter positions
57    num_counters: usize,
58    /// Number of hash functions
59    num_hashes: usize,
60    /// Number of items currently in filter
61    count: usize,
62}
63
64impl CountingBloomFilter {
65    /// Creates a new counting Bloom filter optimized for the expected number
66    /// of items and desired false positive rate.
67    ///
68    /// # Arguments
69    ///
70    /// * `expected_items` - Expected number of items to insert
71    /// * `false_positive_rate` - Desired false positive rate (e.g., 0.01 for 1%)
72    ///
73    /// # Example
74    ///
75    /// ```rust
76    /// use rustywallet_bloom::CountingBloomFilter;
77    ///
78    /// // 100,000 items with 1% false positive rate
79    /// let filter = CountingBloomFilter::new(100_000, 0.01);
80    /// ```
81    pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
82        let fpr = false_positive_rate.clamp(1e-10, 0.5);
83        let n = expected_items.max(1) as f64;
84
85        // Optimal number of bits: m = -n * ln(p) / (ln(2)^2)
86        let ln2_sq = std::f64::consts::LN_2 * std::f64::consts::LN_2;
87        let num_counters = ((-n * fpr.ln()) / ln2_sq).ceil() as usize;
88        let num_counters = num_counters.max(64);
89
90        // Optimal number of hash functions: k = (m/n) * ln(2)
91        let num_hashes = ((num_counters as f64 / n) * std::f64::consts::LN_2).ceil() as usize;
92        let num_hashes = num_hashes.clamp(3, 20);
93
94        // Allocate counter array (2 counters per byte using nibbles)
95        let num_bytes = num_counters.div_ceil(2);
96
97        Self {
98            counters: vec![0u8; num_bytes],
99            num_counters,
100            num_hashes,
101            count: 0,
102        }
103    }
104
105    /// Creates a counting Bloom filter with explicit parameters.
106    ///
107    /// # Arguments
108    ///
109    /// * `num_counters` - Total number of counter positions
110    /// * `num_hashes` - Number of hash functions to use
111    pub fn with_params(num_counters: usize, num_hashes: usize) -> Self {
112        let num_counters = num_counters.max(64);
113        let num_hashes = num_hashes.clamp(1, 20);
114        let num_bytes = num_counters.div_ceil(2);
115
116        Self {
117            counters: vec![0u8; num_bytes],
118            num_counters,
119            num_hashes,
120            count: 0,
121        }
122    }
123
124    /// Inserts an item into the filter by incrementing relevant counters.
125    ///
126    /// Returns `Ok(())` on success, or `Err(CountingBloomError::CounterOverflow)`
127    /// if any counter would exceed the maximum value (15).
128    ///
129    /// # Example
130    ///
131    /// ```rust
132    /// use rustywallet_bloom::CountingBloomFilter;
133    ///
134    /// let mut filter = CountingBloomFilter::new(1000, 0.01);
135    /// filter.insert("item1");
136    /// assert!(filter.contains("item1"));
137    /// ```
138    pub fn insert<T: AsRef<[u8]>>(&mut self, item: T) {
139        let data = item.as_ref();
140        let h1 = fnv1a_64(data);
141        let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15);
142
143        for i in 0..self.num_hashes {
144            let pos = double_hash(h1, h2, i, self.num_counters);
145            self.increment_counter(pos);
146        }
147
148        self.count += 1;
149    }
150
151    /// Inserts an item with overflow checking.
152    ///
153    /// Returns `Err(CountingBloomError::CounterOverflow)` if any counter
154    /// would exceed the maximum value.
155    pub fn try_insert<T: AsRef<[u8]>>(&mut self, item: T) -> Result<(), CountingBloomError> {
156        let data = item.as_ref();
157        let h1 = fnv1a_64(data);
158        let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15);
159
160        // Check all counters first
161        for i in 0..self.num_hashes {
162            let pos = double_hash(h1, h2, i, self.num_counters);
163            if self.get_counter(pos) >= 15 {
164                return Err(CountingBloomError::CounterOverflow);
165            }
166        }
167
168        // All checks passed, increment counters
169        for i in 0..self.num_hashes {
170            let pos = double_hash(h1, h2, i, self.num_counters);
171            self.increment_counter(pos);
172        }
173
174        self.count += 1;
175        Ok(())
176    }
177
178    /// Removes an item from the filter by decrementing relevant counters.
179    ///
180    /// Returns `Ok(())` on success, or `Err(CountingBloomError::CounterUnderflow)`
181    /// if any counter would go below zero (item not in filter).
182    ///
183    /// # Warning
184    ///
185    /// Removing an item that was never inserted can corrupt the filter,
186    /// causing false negatives for other items.
187    ///
188    /// # Example
189    ///
190    /// ```rust
191    /// use rustywallet_bloom::CountingBloomFilter;
192    ///
193    /// let mut filter = CountingBloomFilter::new(1000, 0.01);
194    /// filter.insert("item1");
195    /// assert!(filter.contains("item1"));
196    ///
197    /// filter.remove("item1").unwrap();
198    /// assert!(!filter.contains("item1"));
199    /// ```
200    pub fn remove<T: AsRef<[u8]>>(&mut self, item: T) -> Result<(), CountingBloomError> {
201        let data = item.as_ref();
202        let h1 = fnv1a_64(data);
203        let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15);
204
205        // Check all counters first to ensure we can decrement
206        for i in 0..self.num_hashes {
207            let pos = double_hash(h1, h2, i, self.num_counters);
208            if self.get_counter(pos) == 0 {
209                return Err(CountingBloomError::CounterUnderflow);
210            }
211        }
212
213        // All checks passed, decrement counters
214        for i in 0..self.num_hashes {
215            let pos = double_hash(h1, h2, i, self.num_counters);
216            self.decrement_counter(pos);
217        }
218
219        self.count = self.count.saturating_sub(1);
220        Ok(())
221    }
222
223    /// Checks if an item might be in the filter.
224    ///
225    /// Returns `false` if the item is definitely not in the set.
226    /// Returns `true` if the item is probably in the set (may be false positive).
227    #[inline]
228    pub fn contains<T: AsRef<[u8]>>(&self, item: T) -> bool {
229        let data = item.as_ref();
230        let h1 = fnv1a_64(data);
231        let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15);
232
233        for i in 0..self.num_hashes {
234            let pos = double_hash(h1, h2, i, self.num_counters);
235            if self.get_counter(pos) == 0 {
236                return false;
237            }
238        }
239
240        true
241    }
242
243    /// Returns the approximate count for an item.
244    ///
245    /// This returns the minimum counter value across all hash positions,
246    /// which approximates how many times the item was inserted.
247    pub fn count_estimate<T: AsRef<[u8]>>(&self, item: T) -> u8 {
248        let data = item.as_ref();
249        let h1 = fnv1a_64(data);
250        let h2 = fnv1a_64_seeded(data, 0x9e3779b97f4a7c15);
251
252        let mut min_count = u8::MAX;
253        for i in 0..self.num_hashes {
254            let pos = double_hash(h1, h2, i, self.num_counters);
255            min_count = min_count.min(self.get_counter(pos));
256        }
257
258        min_count
259    }
260
261    /// Returns the number of items inserted (minus removed).
262    pub fn len(&self) -> usize {
263        self.count
264    }
265
266    /// Returns true if no items are in the filter.
267    pub fn is_empty(&self) -> bool {
268        self.count == 0
269    }
270
271    /// Returns the memory usage in bytes.
272    pub fn memory_usage(&self) -> usize {
273        self.counters.len()
274    }
275
276    /// Returns the number of counter positions.
277    pub fn num_counters(&self) -> usize {
278        self.num_counters
279    }
280
281    /// Returns the number of hash functions used.
282    pub fn num_hashes(&self) -> usize {
283        self.num_hashes
284    }
285
286    /// Clears all items from the filter.
287    pub fn clear(&mut self) {
288        self.counters.fill(0);
289        self.count = 0;
290    }
291
292    /// Gets the counter value at a position (0-15).
293    #[inline]
294    fn get_counter(&self, pos: usize) -> u8 {
295        let byte_idx = pos / 2;
296        let nibble = pos % 2;
297
298        if nibble == 0 {
299            self.counters[byte_idx] & 0x0F
300        } else {
301            (self.counters[byte_idx] >> 4) & 0x0F
302        }
303    }
304
305    /// Increments the counter at a position (saturates at 15).
306    #[inline]
307    fn increment_counter(&mut self, pos: usize) {
308        let byte_idx = pos / 2;
309        let nibble = pos % 2;
310
311        let current = self.get_counter(pos);
312        if current < 15 {
313            if nibble == 0 {
314                self.counters[byte_idx] = (self.counters[byte_idx] & 0xF0) | (current + 1);
315            } else {
316                self.counters[byte_idx] = (self.counters[byte_idx] & 0x0F) | ((current + 1) << 4);
317            }
318        }
319    }
320
321    /// Decrements the counter at a position (does not go below 0).
322    #[inline]
323    fn decrement_counter(&mut self, pos: usize) {
324        let byte_idx = pos / 2;
325        let nibble = pos % 2;
326
327        let current = self.get_counter(pos);
328        if current > 0 {
329            if nibble == 0 {
330                self.counters[byte_idx] = (self.counters[byte_idx] & 0xF0) | (current - 1);
331            } else {
332                self.counters[byte_idx] = (self.counters[byte_idx] & 0x0F) | ((current - 1) << 4);
333            }
334        }
335    }
336}
337
338impl std::fmt::Debug for CountingBloomFilter {
339    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
340        f.debug_struct("CountingBloomFilter")
341            .field("items", &self.count)
342            .field("counters", &self.num_counters)
343            .field("hashes", &self.num_hashes)
344            .field("memory_kb", &(self.memory_usage() / 1024))
345            .finish()
346    }
347}
348
349#[cfg(test)]
350mod tests {
351    use super::*;
352
353    #[test]
354    fn test_insert_and_contains() {
355        let mut filter = CountingBloomFilter::new(100, 0.01);
356
357        filter.insert("test1");
358        filter.insert("test2");
359
360        assert!(filter.contains("test1"));
361        assert!(filter.contains("test2"));
362        assert!(!filter.contains("test3"));
363        assert_eq!(filter.len(), 2);
364    }
365
366    #[test]
367    fn test_remove() {
368        let mut filter = CountingBloomFilter::new(100, 0.01);
369
370        filter.insert("test1");
371        filter.insert("test2");
372        assert!(filter.contains("test1"));
373
374        filter.remove("test1").unwrap();
375        assert!(!filter.contains("test1"));
376        assert!(filter.contains("test2"));
377        assert_eq!(filter.len(), 1);
378    }
379
380    #[test]
381    fn test_remove_underflow() {
382        let mut filter = CountingBloomFilter::new(100, 0.01);
383
384        // Try to remove item that was never inserted
385        let result = filter.remove("nonexistent");
386        assert_eq!(result, Err(CountingBloomError::CounterUnderflow));
387    }
388
389    #[test]
390    fn test_multiple_insert_remove() {
391        let mut filter = CountingBloomFilter::new(100, 0.01);
392
393        // Insert same item multiple times
394        filter.insert("test");
395        filter.insert("test");
396        filter.insert("test");
397
398        assert!(filter.contains("test"));
399        assert!(filter.count_estimate("test") >= 3);
400
401        // Remove once - should still be present
402        filter.remove("test").unwrap();
403        assert!(filter.contains("test"));
404
405        // Remove again
406        filter.remove("test").unwrap();
407        assert!(filter.contains("test"));
408
409        // Remove third time - should be gone
410        filter.remove("test").unwrap();
411        assert!(!filter.contains("test"));
412    }
413
414    #[test]
415    fn test_clear() {
416        let mut filter = CountingBloomFilter::new(100, 0.01);
417
418        filter.insert("test1");
419        filter.insert("test2");
420        assert!(filter.contains("test1"));
421
422        filter.clear();
423        assert!(!filter.contains("test1"));
424        assert!(!filter.contains("test2"));
425        assert_eq!(filter.len(), 0);
426    }
427
428    #[test]
429    fn test_with_params() {
430        let filter = CountingBloomFilter::with_params(10_000, 5);
431        assert_eq!(filter.num_counters(), 10_000);
432        assert_eq!(filter.num_hashes(), 5);
433    }
434
435    #[test]
436    fn test_memory_usage() {
437        let filter = CountingBloomFilter::new(10_000, 0.01);
438        // Should use about 4x more memory than standard bloom filter
439        // due to 4-bit counters vs 1-bit
440        let bytes = filter.memory_usage();
441        assert!(bytes > 0);
442    }
443
444    #[test]
445    fn test_counter_nibbles() {
446        let mut filter = CountingBloomFilter::with_params(100, 1);
447
448        // Test that nibble storage works correctly
449        for i in 0..16 {
450            filter.increment_counter(0);
451        }
452        // Counter should saturate at 15
453        assert_eq!(filter.get_counter(0), 15);
454
455        // Test second nibble
456        for i in 0..10 {
457            filter.increment_counter(1);
458        }
459        assert_eq!(filter.get_counter(1), 10);
460        assert_eq!(filter.get_counter(0), 15); // First nibble unchanged
461    }
462}