Skip to main content

aura_core/effects/
bloom.rs

1//! Bloom Filter Data Structures
2//!
3//! Provides space-efficient probabilistic data structures for membership testing
4//! and set reconciliation in synchronization protocols.
5//!
6//! # Contents
7//!
8//! - `BloomFilter`: Probabilistic set membership data structure
9//! - `BloomConfig`: Configuration for false positive rate and capacity
10//! - `BloomError`: Error type alias for bloom operations
11//!
12//! # Usage
13//!
14//! Used by `IndexedJournalEffects::get_bloom_filter()` for efficient sync.
15//! Operations are performed synchronously via inline implementations.
16//!
17//! # Security Properties
18//!
19//! - No false negatives (if element is in set, filter will detect it)
20//! - Controlled false positive rate (configurable)
21//! - No direct access to original data through filter
22
23use crate::AuraError;
24use serde::{Deserialize, Serialize};
25
26pub const MAX_BLOOM_BITS_BYTES: usize = 1_048_576;
27
28/// Bloom filter operation error
29pub type BloomError = AuraError;
30
31/// Configuration for Bloom filter parameters
32#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
33pub struct BloomConfig {
34    /// Expected number of elements to be inserted
35    pub expected_elements: u64,
36    /// Desired false positive probability (0.0 to 1.0)
37    pub false_positive_rate: f64,
38    /// Number of hash functions to use
39    pub num_hash_functions: u32,
40    /// Size of the bit vector in bits
41    pub bit_vector_size: u64,
42}
43
44impl BloomConfig {
45    /// Create optimal configuration for given parameters
46    ///
47    /// Automatically calculates optimal bit vector size and hash function count
48    /// based on expected elements and desired false positive rate.
49    pub fn optimal(expected_elements: u64, false_positive_rate: f64) -> Self {
50        // Calculate optimal bit vector size: m = -n * ln(p) / (ln(2)^2)
51        let n = expected_elements as f64;
52        let p = false_positive_rate.clamp(0.00001, 0.99999); // Clamp to reasonable range
53        let m = (-n * p.ln() / (2.0_f64.ln().powi(2))).ceil() as u64;
54
55        // Calculate optimal number of hash functions: k = (m/n) * ln(2)
56        let k = ((m as f64 / n) * 2.0_f64.ln()).round() as u32;
57
58        Self {
59            expected_elements,
60            false_positive_rate,
61            num_hash_functions: k.clamp(1, 32), // Reasonable bounds
62            bit_vector_size: m.max(64),         // Minimum size
63        }
64    }
65
66    /// Create a configuration for sync operations
67    pub fn for_sync(expected_ops: u64) -> Self {
68        // Use 1% false positive rate for sync operations
69        Self::optimal(expected_ops, 0.01)
70    }
71
72    /// Create a configuration for small sets
73    pub fn for_small_set(expected_elements: u64) -> Self {
74        // Use 0.1% false positive rate for small sets
75        Self::optimal(expected_elements, 0.001)
76    }
77
78    /// Validate configuration parameters
79    pub fn validate(&self) -> Result<(), BloomError> {
80        if self.expected_elements == 0 {
81            return Err(BloomError::invalid("Expected elements must be > 0"));
82        }
83        if self.false_positive_rate <= 0.0 || self.false_positive_rate >= 1.0 {
84            return Err(BloomError::invalid(
85                "False positive rate must be between 0.0 and 1.0",
86            ));
87        }
88        if self.num_hash_functions == 0 || self.num_hash_functions > 32 {
89            return Err(BloomError::invalid(
90                "Number of hash functions must be between 1 and 32",
91            ));
92        }
93        if self.bit_vector_size < 64 {
94            return Err(BloomError::invalid("Bit vector size must be at least 64"));
95        }
96        Ok(())
97    }
98}
99
100/// Bloom filter data structure
101#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
102pub struct BloomFilter {
103    /// Bit vector for the filter
104    pub bits: Vec<u8>,
105    /// Configuration parameters
106    pub config: BloomConfig,
107    /// Number of elements currently in the filter
108    pub element_count: u64,
109}
110
111impl BloomFilter {
112    /// Create a new empty bloom filter with the given configuration
113    pub fn new(config: BloomConfig) -> Result<Self, BloomError> {
114        config.validate()?;
115
116        let byte_size = config.bit_vector_size.div_ceil(8); // Round up to byte boundary
117        Ok(Self {
118            bits: vec![0u8; byte_size as usize],
119            config,
120            element_count: 0,
121        })
122    }
123
124    /// Get the size of the bit vector in bits
125    pub fn bit_size(&self) -> u64 {
126        self.config.bit_vector_size
127    }
128
129    /// Get the size of the bit vector in bytes
130    pub fn byte_size(&self) -> u64 {
131        self.bits.len() as u64
132    }
133
134    /// Check if the filter is empty
135    pub fn is_empty(&self) -> bool {
136        self.element_count == 0
137    }
138
139    /// Get the current false positive probability estimate
140    pub fn current_false_positive_rate(&self) -> f64 {
141        if self.element_count == 0 {
142            return 0.0;
143        }
144
145        // Estimate: p ≈ (1 - e^(-kn/m))^k
146        let k = self.config.num_hash_functions as f64;
147        let n = self.element_count as f64;
148        let m = self.config.bit_vector_size as f64;
149
150        (1.0 - (-k * n / m).exp()).powf(k)
151    }
152
153    /// Check if the filter should be rebuilt (too many elements)
154    pub fn should_rebuild(&self) -> bool {
155        self.element_count > self.config.expected_elements * 2
156            || self.current_false_positive_rate() > self.config.false_positive_rate * 2.0
157    }
158}
159
160/// Helper functions for common Bloom filter operations
161impl BloomConfig {
162    /// Standard configuration for OpLog sync operations
163    pub fn oplog_sync() -> Self {
164        Self::optimal(1000, 0.01) // 1000 ops, 1% false positive rate
165    }
166
167    /// Configuration for peer discovery operations
168    pub fn peer_discovery() -> Self {
169        Self::optimal(100, 0.001) // 100 peers, 0.1% false positive rate
170    }
171
172    /// Configuration for chunk deduplication
173    pub fn chunk_dedup() -> Self {
174        Self::optimal(10000, 0.005) // 10k chunks, 0.5% false positive rate
175    }
176}
177
178impl BloomFilter {
179    /// Create a filter optimized for OpLog sync
180    pub fn for_oplog_sync() -> Result<Self, BloomError> {
181        Self::new(BloomConfig::oplog_sync())
182    }
183
184    /// Create a filter for peer discovery
185    pub fn for_peer_discovery() -> Result<Self, BloomError> {
186        Self::new(BloomConfig::peer_discovery())
187    }
188
189    /// Create a filter for chunk deduplication
190    pub fn for_chunk_dedup() -> Result<Self, BloomError> {
191        Self::new(BloomConfig::chunk_dedup())
192    }
193}
194
195#[cfg(test)]
196mod tests {
197    use super::*;
198
199    #[test]
200    fn test_bloom_config_optimal() {
201        let config = BloomConfig::optimal(1000, 0.01);
202        assert!(config.validate().is_ok());
203        assert_eq!(config.expected_elements, 1000);
204        assert!((config.false_positive_rate - 0.01).abs() < f64::EPSILON);
205        assert!(config.num_hash_functions > 0);
206        assert!(config.bit_vector_size >= 64);
207    }
208
209    #[test]
210    fn test_bloom_config_validation() {
211        // Valid config
212        let valid_config = BloomConfig {
213            expected_elements: 100,
214            false_positive_rate: 0.01,
215            num_hash_functions: 7,
216            bit_vector_size: 1000,
217        };
218        assert!(valid_config.validate().is_ok());
219
220        // Invalid expected_elements
221        let invalid_config = BloomConfig {
222            expected_elements: 0,
223            ..valid_config
224        };
225        assert!(invalid_config.validate().is_err());
226
227        // Invalid false_positive_rate
228        let invalid_config = BloomConfig {
229            false_positive_rate: 0.0,
230            ..valid_config
231        };
232        assert!(invalid_config.validate().is_err());
233
234        let invalid_config = BloomConfig {
235            false_positive_rate: 1.0,
236            ..valid_config
237        };
238        assert!(invalid_config.validate().is_err());
239    }
240
241    #[test]
242    fn test_bloom_filter_creation() {
243        let config = BloomConfig::optimal(100, 0.01);
244        let filter = BloomFilter::new(config.clone()).unwrap();
245
246        assert_eq!(filter.config, config);
247        assert_eq!(filter.element_count, 0);
248        assert!(filter.is_empty());
249        assert!(filter.bit_size() >= 64);
250    }
251
252    #[test]
253    fn test_bloom_config_presets() {
254        let sync_config = BloomConfig::oplog_sync();
255        assert!(sync_config.validate().is_ok());
256        assert_eq!(sync_config.expected_elements, 1000);
257
258        let peer_config = BloomConfig::peer_discovery();
259        assert!(peer_config.validate().is_ok());
260        assert_eq!(peer_config.expected_elements, 100);
261
262        let chunk_config = BloomConfig::chunk_dedup();
263        assert!(chunk_config.validate().is_ok());
264        assert_eq!(chunk_config.expected_elements, 10000);
265    }
266
267    #[test]
268    fn test_filter_presets() {
269        let oplog_filter = BloomFilter::for_oplog_sync().unwrap();
270        assert!(oplog_filter.is_empty());
271
272        let peer_filter = BloomFilter::for_peer_discovery().unwrap();
273        assert!(peer_filter.is_empty());
274
275        let chunk_filter = BloomFilter::for_chunk_dedup().unwrap();
276        assert!(chunk_filter.is_empty());
277    }
278
279    #[test]
280    fn test_false_positive_rate_estimation() {
281        let config = BloomConfig::optimal(100, 0.01);
282        let mut filter = BloomFilter::new(config).unwrap();
283
284        // Empty filter should have 0% false positive rate
285        assert_eq!(filter.current_false_positive_rate(), 0.0);
286
287        // Add some elements and check rate increases
288        filter.element_count = 50;
289        let rate_50 = filter.current_false_positive_rate();
290
291        filter.element_count = 100;
292        let rate_100 = filter.current_false_positive_rate();
293
294        assert!(rate_100 > rate_50);
295    }
296
297    #[test]
298    fn test_should_rebuild() {
299        let config = BloomConfig::optimal(100, 0.01);
300        let mut filter = BloomFilter::new(config).unwrap();
301
302        // Empty filter shouldn't need rebuild
303        assert!(!filter.should_rebuild());
304
305        // Normal load shouldn't need rebuild
306        filter.element_count = 100;
307        assert!(!filter.should_rebuild());
308
309        // Excessive load should trigger rebuild
310        filter.element_count = 300;
311        assert!(filter.should_rebuild());
312    }
313}