oximedia_cache/
bloom_filter.rs

1//! Bloom filter for probabilistic cache membership testing.
2//!
3//! Provides two filter variants:
4//!
5//! - [`BloomFilter`] — classic bit-array Bloom filter with optimal `m` and `k`
6//!   computed from expected item count and desired false-positive rate.
7//! - [`CountingBloomFilter`] — extends the bit filter with 4-bit saturating
8//!   counters so that individual items can be removed.
9//!
10//! Both use a pure-Rust FNV-1a double-hashing scheme; no external crates are
11//! required.
12
13// ── FNV-1a constants ──────────────────────────────────────────────────────────
14
15const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325u64;
16const FNV_PRIME: u64 = 0x00000100000001b3u64;
17
18/// Compute FNV-1a 64-bit hash of `data` with the given seed (offset basis).
19///
20/// Seeding with a value other than `FNV_OFFSET_BASIS` gives an independent
21/// hash family useful for double hashing.
22fn fnv1a_64_seeded(data: &[u8], seed: u64) -> u64 {
23    let mut hash = seed;
24    for &byte in data {
25        hash ^= u64::from(byte);
26        hash = hash.wrapping_mul(FNV_PRIME);
27    }
28    hash
29}
30
31/// Primary hash h1(x) using the standard FNV-1a offset basis.
32#[inline]
33fn h1(data: &[u8]) -> u64 {
34    fnv1a_64_seeded(data, FNV_OFFSET_BASIS)
35}
36
37/// Secondary hash h2(x) using a perturbed seed to form an independent family.
38///
39/// The seed is chosen to be odd (ensuring it is coprime with any power-of-2
40/// modulus), derived by XOR-folding the FNV prime with its complement.
41#[inline]
42fn h2(data: &[u8]) -> u64 {
43    // A different seed that still produces a good avalanche for the same input.
44    let seed = FNV_OFFSET_BASIS ^ 0xdeadbeefcafe1337u64;
45    // Ensure h2 is always odd so that the double-hashing series covers all
46    // positions (Kirsch–Mitzenmacher construction).
47    fnv1a_64_seeded(data, seed) | 1
48}
49
50/// Compute the `i`-th hash position for a given item via double hashing:
51///
52/// `pos(i, x) = (h1(x) + i * h2(x)) % num_bits`
53#[inline]
54fn double_hash_position(h1_val: u64, h2_val: u64, i: u64, num_bits: usize) -> usize {
55    let nb = num_bits as u64;
56    // Use wrapping arithmetic to avoid overflow on large i.
57    (h1_val.wrapping_add(i.wrapping_mul(h2_val)) % nb) as usize
58}
59
60// ── Optimal parameter helpers ─────────────────────────────────────────────────
61
62/// Compute the optimal bit-array length `m` for a Bloom filter.
63///
64/// Formula: `m = ceil(-n * ln(p) / (ln(2))^2)`
65fn optimal_num_bits(expected_items: usize, false_positive_rate: f64) -> usize {
66    let n = expected_items as f64;
67    let p = false_positive_rate.clamp(1e-15, 1.0 - f64::EPSILON);
68    let ln2_sq = std::f64::consts::LN_2 * std::f64::consts::LN_2;
69    let m = (-n * p.ln() / ln2_sq).ceil() as usize;
70    // Ensure at least one bit and round up to a byte boundary.
71    m.max(8)
72}
73
74/// Compute the optimal number of hash functions `k`.
75///
76/// Formula: `k = round((m / n) * ln(2))`
77fn optimal_num_hash_functions(num_bits: usize, expected_items: usize) -> u8 {
78    if expected_items == 0 {
79        return 1;
80    }
81    let m = num_bits as f64;
82    let n = expected_items as f64;
83    let k = ((m / n) * std::f64::consts::LN_2).round() as u64;
84    // Clamp to [1, 255].
85    k.clamp(1, 255) as u8
86}
87
88// ── BloomFilter ───────────────────────────────────────────────────────────────
89
90/// Space-efficient probabilistic membership filter.
91///
92/// False negatives are impossible; false positives occur with probability ≤
93/// the configured `false_positive_rate` when the number of inserted items does
94/// not exceed `expected_items`.
95#[derive(Debug, Clone)]
96pub struct BloomFilter {
97    /// Backing bit-array, stored as bytes (each byte holds 8 bits).
98    bit_array: Vec<u8>,
99    /// Total number of addressable bits (`bit_array.len() * 8`, rounded during
100    /// construction to the next multiple of 8).
101    num_bits: usize,
102    /// Number of independent hash positions set/checked per item.
103    num_hash_functions: u8,
104    /// Running count of items inserted (not decremented on false removes).
105    num_items: u64,
106}
107
108impl BloomFilter {
109    /// Construct a new `BloomFilter` optimised for `expected_items` items and
110    /// the target `false_positive_rate` (between 0 and 1 exclusive).
111    ///
112    /// Panics if `expected_items == 0` or `false_positive_rate` is outside
113    /// `(0, 1)`.
114    pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
115        assert!(expected_items > 0, "expected_items must be > 0");
116        assert!(
117            false_positive_rate > 0.0 && false_positive_rate < 1.0,
118            "false_positive_rate must be in (0, 1)"
119        );
120        let num_bits = optimal_num_bits(expected_items, false_positive_rate);
121        let num_hash_functions = optimal_num_hash_functions(num_bits, expected_items);
122        let byte_count = (num_bits + 7) / 8;
123        Self {
124            bit_array: vec![0u8; byte_count],
125            num_bits,
126            num_hash_functions,
127            num_items: 0,
128        }
129    }
130
131    // ── bit helpers ──────────────────────────────────────────────────────────
132
133    /// Set the bit at position `pos`.
134    fn set_bit(&mut self, pos: usize) {
135        let byte_idx = pos / 8;
136        let bit_idx = pos % 8;
137        if let Some(byte) = self.bit_array.get_mut(byte_idx) {
138            *byte |= 1u8 << bit_idx;
139        }
140    }
141
142    /// Test the bit at position `pos`.
143    fn get_bit(&self, pos: usize) -> bool {
144        let byte_idx = pos / 8;
145        let bit_idx = pos % 8;
146        self.bit_array
147            .get(byte_idx)
148            .map(|byte| (byte >> bit_idx) & 1 == 1)
149            .unwrap_or(false)
150    }
151
152    // ── Public API ────────────────────────────────────────────────────────────
153
154    /// Insert `item` into the filter.  After this call `contains(item)` is
155    /// guaranteed to return `true`.
156    pub fn insert(&mut self, item: &[u8]) {
157        let h1_val = h1(item);
158        let h2_val = h2(item);
159        for i in 0..self.num_hash_functions as u64 {
160            let pos = double_hash_position(h1_val, h2_val, i, self.num_bits);
161            self.set_bit(pos);
162        }
163        self.num_items += 1;
164    }
165
166    /// Return `true` if `item` *may* be in the set; `false` means it definitely
167    /// is not.
168    pub fn contains(&self, item: &[u8]) -> bool {
169        let h1_val = h1(item);
170        let h2_val = h2(item);
171        for i in 0..self.num_hash_functions as u64 {
172            let pos = double_hash_position(h1_val, h2_val, i, self.num_bits);
173            if !self.get_bit(pos) {
174                return false;
175            }
176        }
177        true
178    }
179
180    /// Estimate the current false-positive probability given the number of
181    /// items inserted so far.
182    ///
183    /// Formula: `(1 - e^(-k * n / m))^k`
184    pub fn estimate_false_positive_rate(&self) -> f64 {
185        let k = self.num_hash_functions as f64;
186        let n = self.num_items as f64;
187        let m = self.num_bits as f64;
188        if m == 0.0 {
189            return 1.0;
190        }
191        (1.0_f64 - (-k * n / m).exp()).powf(k)
192    }
193
194    /// Return the number of items inserted so far.
195    pub fn item_count(&self) -> u64 {
196        self.num_items
197    }
198
199    /// Return the number of addressable bits in the underlying array.
200    pub fn num_bits(&self) -> usize {
201        self.num_bits
202    }
203
204    /// Return the number of hash functions used per operation.
205    pub fn num_hash_functions(&self) -> u8 {
206        self.num_hash_functions
207    }
208}
209
210// ── CountingBloomFilter ───────────────────────────────────────────────────────
211
212/// Bloom filter with 4-bit saturating counters that supports deletion.
213///
214/// Each logical bit-position in the standard filter is replaced by a 4-bit
215/// counter stored in nibbles (two counters per byte).  A counter saturates at
216/// 15 to prevent overflow; decrement is a no-op on saturated counters (a
217/// conservative choice that avoids spurious false-negatives).
218#[derive(Debug, Clone)]
219pub struct CountingBloomFilter {
220    /// Nibble storage: `counts[byte] = (counter[2*byte+1] << 4) | counter[2*byte]`.
221    counts: Vec<u8>,
222    /// Number of logical counters (`counts.len() * 2`).
223    num_counters: usize,
224    /// Number of hash functions.
225    num_hash_functions: u8,
226    /// Number of items currently represented (net of removes).
227    num_items: u64,
228}
229
230impl CountingBloomFilter {
231    /// Construct a new `CountingBloomFilter` optimised for the given parameters.
232    pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
233        assert!(expected_items > 0, "expected_items must be > 0");
234        assert!(
235            false_positive_rate > 0.0 && false_positive_rate < 1.0,
236            "false_positive_rate must be in (0, 1)"
237        );
238        let num_bits = optimal_num_bits(expected_items, false_positive_rate);
239        let num_hash_functions = optimal_num_hash_functions(num_bits, expected_items);
240        // One nibble per counter position; round up to bytes.
241        let byte_count = (num_bits + 1) / 2;
242        Self {
243            counts: vec![0u8; byte_count],
244            num_counters: num_bits,
245            num_hash_functions,
246            num_items: 0,
247        }
248    }
249
250    // ── nibble helpers ───────────────────────────────────────────────────────
251
252    fn get_nibble(&self, pos: usize) -> u8 {
253        let byte_idx = pos / 2;
254        let nibble_shift = (pos % 2) * 4;
255        self.counts
256            .get(byte_idx)
257            .map(|b| (b >> nibble_shift) & 0x0F)
258            .unwrap_or(0)
259    }
260
261    fn increment_nibble(&mut self, pos: usize) {
262        let byte_idx = pos / 2;
263        let nibble_shift = (pos % 2) * 4;
264        if let Some(byte) = self.counts.get_mut(byte_idx) {
265            let nibble = (*byte >> nibble_shift) & 0x0F;
266            if nibble < 0x0F {
267                // Not yet saturated; increment.
268                *byte += 1u8 << nibble_shift;
269            }
270            // Saturated (nibble == 15): leave it — conservative approach.
271        }
272    }
273
274    fn decrement_nibble(&mut self, pos: usize) -> bool {
275        let byte_idx = pos / 2;
276        let nibble_shift = (pos % 2) * 4;
277        if let Some(byte) = self.counts.get_mut(byte_idx) {
278            let nibble = (*byte >> nibble_shift) & 0x0F;
279            if nibble == 0x0F {
280                // Saturated: we cannot safely decrement, item may still be present.
281                return false;
282            }
283            if nibble > 0 {
284                *byte -= 1u8 << nibble_shift;
285                return true;
286            }
287        }
288        false
289    }
290
291    // ── Public API ────────────────────────────────────────────────────────────
292
293    /// Insert `item` into the filter, incrementing all associated counters.
294    pub fn insert(&mut self, item: &[u8]) {
295        let h1_val = h1(item);
296        let h2_val = h2(item);
297        for i in 0..self.num_hash_functions as u64 {
298            let pos = double_hash_position(h1_val, h2_val, i, self.num_counters);
299            self.increment_nibble(pos);
300        }
301        self.num_items += 1;
302    }
303
304    /// Return `true` if `item` *may* be in the set.
305    pub fn contains(&self, item: &[u8]) -> bool {
306        let h1_val = h1(item);
307        let h2_val = h2(item);
308        for i in 0..self.num_hash_functions as u64 {
309            let pos = double_hash_position(h1_val, h2_val, i, self.num_counters);
310            if self.get_nibble(pos) == 0 {
311                return false;
312            }
313        }
314        true
315    }
316
317    /// Attempt to remove `item` from the filter by decrementing all associated
318    /// counters.
319    ///
320    /// Returns `true` if the item was (probably) present and all counters could
321    /// be safely decremented.  Returns `false` if any counter was already zero
322    /// (item was never inserted, or already removed) or if any counter is
323    /// saturated (the decrement is withheld).
324    pub fn remove(&mut self, item: &[u8]) -> bool {
325        // First check: does the item appear to be present?
326        if !self.contains(item) {
327            return false;
328        }
329        let h1_val = h1(item);
330        let h2_val = h2(item);
331        // Collect positions so we can roll back on failure.
332        let positions: Vec<usize> = (0..self.num_hash_functions as u64)
333            .map(|i| double_hash_position(h1_val, h2_val, i, self.num_counters))
334            .collect();
335        // Check no position is zero or saturated.
336        for &pos in &positions {
337            let nibble = self.get_nibble(pos);
338            if nibble == 0 || nibble == 0x0F {
339                return false;
340            }
341        }
342        // Safe to decrement all.
343        for &pos in &positions {
344            self.decrement_nibble(pos);
345        }
346        if self.num_items > 0 {
347            self.num_items -= 1;
348        }
349        true
350    }
351
352    /// Return the number of items currently represented in the filter.
353    pub fn item_count(&self) -> u64 {
354        self.num_items
355    }
356
357    /// Estimate false-positive rate (same formula as [`BloomFilter`]).
358    pub fn estimate_false_positive_rate(&self) -> f64 {
359        let k = self.num_hash_functions as f64;
360        let n = self.num_items as f64;
361        let m = self.num_counters as f64;
362        if m == 0.0 {
363            return 1.0;
364        }
365        (1.0_f64 - (-k * n / m).exp()).powf(k)
366    }
367}
368
369// ── ScalableBloomFilter ────────────────────────────────────────────────────────
370
371/// Auto-growing Bloom filter that adds new layers when the estimated
372/// false-positive rate of the current layer exceeds a threshold.
373///
374/// Each layer is an independent [`BloomFilter`] with geometrically increasing
375/// capacity.  A `contains` query checks all layers; an `insert` goes into the
376/// active (most recent) layer.  When the active layer's estimated FPR exceeds
377/// `max_fpr_per_layer`, a new layer is allocated with capacity scaled by
378/// `growth_factor`.
379///
380/// The overall false-positive rate is bounded by the geometric series
381/// `p₀ + p₁ + p₂ + …` where each `pᵢ = max_fpr_per_layer * tightening_ratioⁱ`.
382/// With a tightening ratio < 1 (default 0.8) the series converges.
383#[derive(Debug, Clone)]
384pub struct ScalableBloomFilter {
385    /// Stack of layers; the last element is the active layer.
386    layers: Vec<BloomFilter>,
387    /// Initial capacity of the first layer.
388    initial_capacity: usize,
389    /// Target maximum FPR per individual layer.
390    max_fpr_per_layer: f64,
391    /// Multiplicative growth factor for successive layer capacities (>1.0).
392    growth_factor: f64,
393    /// Tightening ratio: each new layer uses `fpr * tightening_ratio` to keep
394    /// the aggregate FPR bounded.
395    tightening_ratio: f64,
396    /// Total number of items across all layers.
397    total_items: u64,
398}
399
400impl ScalableBloomFilter {
401    /// Create a new `ScalableBloomFilter`.
402    ///
403    /// # Parameters
404    /// * `initial_capacity` — expected items for the first layer (must be > 0).
405    /// * `target_fpr` — target false-positive rate per layer in `(0, 1)`.
406    /// * `growth_factor` — how much each successive layer grows (>1.0, e.g. 2.0).
407    pub fn new(initial_capacity: usize, target_fpr: f64, growth_factor: f64) -> Self {
408        assert!(initial_capacity > 0, "initial_capacity must be > 0");
409        assert!(
410            target_fpr > 0.0 && target_fpr < 1.0,
411            "target_fpr must be in (0, 1)"
412        );
413        let gf = if growth_factor > 1.0 {
414            growth_factor
415        } else {
416            2.0
417        };
418        let first_layer = BloomFilter::new(initial_capacity, target_fpr);
419        Self {
420            layers: vec![first_layer],
421            initial_capacity,
422            max_fpr_per_layer: target_fpr,
423            growth_factor: gf,
424            tightening_ratio: 0.8,
425            total_items: 0,
426        }
427    }
428
429    /// Insert `item` into the active (most recent) layer.
430    ///
431    /// If the active layer's estimated FPR exceeds `max_fpr_per_layer` after
432    /// insertion, a new layer is allocated.
433    pub fn insert(&mut self, item: &[u8]) {
434        // Check if we need a new layer *before* inserting.
435        if let Some(active) = self.layers.last() {
436            if active.estimate_false_positive_rate() > self.max_fpr_per_layer {
437                self.add_layer();
438            }
439        }
440        if let Some(active) = self.layers.last_mut() {
441            active.insert(item);
442        }
443        self.total_items += 1;
444    }
445
446    /// Return `true` if `item` *may* be in any layer; `false` means it
447    /// definitely was never inserted.
448    pub fn contains(&self, item: &[u8]) -> bool {
449        self.layers.iter().any(|layer| layer.contains(item))
450    }
451
452    /// Estimate the aggregate false-positive rate across all layers.
453    ///
454    /// The overall FPR is `1 - Π(1 - fprᵢ)` (probability of at least one
455    /// layer reporting a false positive).
456    pub fn estimate_false_positive_rate(&self) -> f64 {
457        let product: f64 = self
458            .layers
459            .iter()
460            .map(|l| 1.0 - l.estimate_false_positive_rate())
461            .product();
462        1.0 - product
463    }
464
465    /// Return the total number of items inserted across all layers.
466    pub fn total_item_count(&self) -> u64 {
467        self.total_items
468    }
469
470    /// Return the number of layers currently allocated.
471    pub fn layer_count(&self) -> usize {
472        self.layers.len()
473    }
474
475    /// Allocate a new layer with geometrically larger capacity and tighter FPR.
476    fn add_layer(&mut self) {
477        let layer_idx = self.layers.len();
478        let capacity =
479            (self.initial_capacity as f64 * self.growth_factor.powi(layer_idx as i32)) as usize;
480        let capacity = capacity.max(1);
481        let fpr = self.max_fpr_per_layer * self.tightening_ratio.powi(layer_idx as i32);
482        let fpr = fpr.clamp(1e-15, 1.0 - f64::EPSILON);
483        self.layers.push(BloomFilter::new(capacity, fpr));
484    }
485
486    /// Set the tightening ratio (must be in `(0, 1)`).
487    ///
488    /// Each successive layer uses `fpr * tightening_ratio^i` to keep the
489    /// aggregate FPR bounded.  A lower ratio means tighter per-layer FPR
490    /// targets (more bits per layer).
491    pub fn set_tightening_ratio(&mut self, ratio: f64) {
492        if ratio > 0.0 && ratio < 1.0 {
493            self.tightening_ratio = ratio;
494        }
495    }
496
497    /// Return the tightening ratio.
498    pub fn tightening_ratio(&self) -> f64 {
499        self.tightening_ratio
500    }
501
502    /// Return the growth factor.
503    pub fn growth_factor(&self) -> f64 {
504        self.growth_factor
505    }
506
507    /// Return per-layer statistics: `(item_count, num_bits, estimated_fpr)`.
508    pub fn layer_stats(&self) -> Vec<(u64, usize, f64)> {
509        self.layers
510            .iter()
511            .map(|l| {
512                (
513                    l.item_count(),
514                    l.num_bits(),
515                    l.estimate_false_positive_rate(),
516                )
517            })
518            .collect()
519    }
520
521    /// Return the estimated remaining capacity of the active (most recent)
522    /// layer before it triggers a new layer allocation.
523    ///
524    /// This is approximate: it counts how many more items can be inserted
525    /// before the active layer's estimated FPR exceeds `max_fpr_per_layer`.
526    /// Returns `0` if the active layer has already exceeded its FPR target.
527    pub fn estimated_capacity_remaining(&self) -> usize {
528        let active = match self.layers.last() {
529            Some(l) => l,
530            None => return 0,
531        };
532        let current_fpr = active.estimate_false_positive_rate();
533        if current_fpr >= self.max_fpr_per_layer {
534            return 0;
535        }
536        // Estimate: solve (1 - e^(-k * (n+x) / m))^k = max_fpr for x.
537        // Approximate by counting items until the estimated FPR crosses.
538        // Simple approach: use the formula m/k * ln(2) - n as rough estimate.
539        let k = active.num_hash_functions() as f64;
540        let m = active.num_bits() as f64;
541        let n = active.item_count() as f64;
542        let theoretical_max = (m / k) * std::f64::consts::LN_2;
543        let remaining = (theoretical_max - n).max(0.0);
544        remaining as usize
545    }
546
547    /// Clear all layers and reset to a single fresh layer.
548    pub fn clear(&mut self) {
549        self.layers.clear();
550        self.total_items = 0;
551        let first_layer = BloomFilter::new(self.initial_capacity, self.max_fpr_per_layer);
552        self.layers.push(first_layer);
553    }
554
555    /// Return the total number of bits across all layers.
556    pub fn total_bits(&self) -> usize {
557        self.layers.iter().map(|l| l.num_bits()).sum()
558    }
559}
560
561// ── Tests ─────────────────────────────────────────────────────────────────────
562
563#[cfg(test)]
564mod tests {
565    use super::*;
566
567    // ── FNV helpers ───────────────────────────────────────────────────────────
568
569    #[test]
570    fn test_fnv1a_deterministic() {
571        let a = h1(b"hello");
572        let b = h1(b"hello");
573        assert_eq!(a, b);
574    }
575
576    #[test]
577    fn test_h1_h2_differ() {
578        let v = h1(b"test");
579        let v2 = h2(b"test");
580        assert_ne!(v, v2, "h1 and h2 should produce different hashes");
581    }
582
583    #[test]
584    fn test_h2_always_odd() {
585        for seed in [b"a".as_ref(), b"hello", b"oximedia", b"\x00\xff"] {
586            assert_eq!(h2(seed) & 1, 1, "h2 must be odd for {seed:?}");
587        }
588    }
589
590    // ── BloomFilter construction ──────────────────────────────────────────────
591
592    #[test]
593    fn test_new_bloom_filter() {
594        let bf = BloomFilter::new(1000, 0.01);
595        assert!(bf.num_bits() > 0);
596        assert!(bf.num_hash_functions() > 0);
597        assert_eq!(bf.item_count(), 0);
598    }
599
600    #[test]
601    fn test_optimal_num_bits_reasonable() {
602        // For n=10000, p=0.01 the classic formula gives ~95851 bits ≈ 11.4 KiB.
603        let m = optimal_num_bits(10_000, 0.01);
604        assert!(m > 90_000 && m < 110_000, "unexpected m={m}");
605    }
606
607    #[test]
608    fn test_optimal_k_reasonable() {
609        let m = optimal_num_bits(10_000, 0.01);
610        let k = optimal_num_hash_functions(m, 10_000);
611        // Theoretical k ≈ 6.64 → should round to 7.
612        assert!(k >= 6 && k <= 8, "unexpected k={k}");
613    }
614
615    // ── BloomFilter insert / contains ─────────────────────────────────────────
616
617    #[test]
618    fn test_insert_then_contains() {
619        let mut bf = BloomFilter::new(100, 0.01);
620        bf.insert(b"key1");
621        assert!(bf.contains(b"key1"));
622    }
623
624    #[test]
625    fn test_contains_absent_item() {
626        let bf = BloomFilter::new(100, 0.01);
627        // No false negatives; absent items should not be reported present
628        // unless by chance. With p=0.01 and 0 items the rate is 0.
629        assert!(!bf.contains(b"ghost"));
630    }
631
632    #[test]
633    fn test_no_false_negatives() {
634        let mut bf = BloomFilter::new(500, 0.01);
635        let items: Vec<Vec<u8>> = (0u32..200).map(|i| i.to_le_bytes().to_vec()).collect();
636        for item in &items {
637            bf.insert(item);
638        }
639        for item in &items {
640            assert!(bf.contains(item), "false negative detected for {:?}", item);
641        }
642    }
643
644    #[test]
645    fn test_item_count() {
646        let mut bf = BloomFilter::new(100, 0.05);
647        bf.insert(b"a");
648        bf.insert(b"b");
649        bf.insert(b"c");
650        assert_eq!(bf.item_count(), 3);
651    }
652
653    // ── BloomFilter false-positive rate ───────────────────────────────────────
654
655    #[test]
656    fn test_estimate_fpr_empty() {
657        let bf = BloomFilter::new(1000, 0.01);
658        assert_eq!(bf.estimate_false_positive_rate(), 0.0);
659    }
660
661    #[test]
662    fn test_estimate_fpr_increases_with_fill() {
663        let mut bf = BloomFilter::new(100, 0.01);
664        let fpr_empty = bf.estimate_false_positive_rate();
665        for i in 0u32..50 {
666            bf.insert(&i.to_le_bytes());
667        }
668        let fpr_half = bf.estimate_false_positive_rate();
669        assert!(fpr_half > fpr_empty, "FPR should increase as filter fills");
670    }
671
672    /// Empirical false-positive rate at n=10000, p=0.01.
673    ///
674    /// We insert 10 000 distinct items then probe 10 000 distinct non-inserted
675    /// items and assert that < 2% report `contains == true`.
676    #[test]
677    fn test_empirical_fpr_at_n10000_p001() {
678        let n = 10_000usize;
679        let p = 0.01_f64;
680        let mut bf = BloomFilter::new(n, p);
681
682        for i in 0u32..n as u32 {
683            let key = format!("inserted_{i}");
684            bf.insert(key.as_bytes());
685        }
686
687        let mut false_positives = 0usize;
688        let probes = 10_000usize;
689        for i in 0u32..probes as u32 {
690            let key = format!("absent_{i}");
691            if bf.contains(key.as_bytes()) {
692                false_positives += 1;
693            }
694        }
695
696        let observed_fpr = false_positives as f64 / probes as f64;
697        // Allow 3× the target rate as headroom for test flakiness.
698        assert!(
699            observed_fpr <= p * 3.0,
700            "observed FPR {observed_fpr:.4} exceeded 3× target ({:.4})",
701            p * 3.0
702        );
703    }
704
705    // ── CountingBloomFilter ───────────────────────────────────────────────────
706
707    #[test]
708    fn test_counting_bf_insert_contains() {
709        let mut cbf = CountingBloomFilter::new(200, 0.01);
710        cbf.insert(b"alpha");
711        assert!(cbf.contains(b"alpha"));
712    }
713
714    #[test]
715    fn test_counting_bf_remove() {
716        let mut cbf = CountingBloomFilter::new(200, 0.01);
717        cbf.insert(b"remove_me");
718        assert!(cbf.contains(b"remove_me"));
719        let removed = cbf.remove(b"remove_me");
720        assert!(removed, "remove should succeed");
721        assert!(!cbf.contains(b"remove_me"));
722    }
723
724    #[test]
725    fn test_counting_bf_remove_absent() {
726        let mut cbf = CountingBloomFilter::new(200, 0.01);
727        let removed = cbf.remove(b"never_inserted");
728        assert!(!removed, "cannot remove item that was never inserted");
729    }
730
731    #[test]
732    fn test_counting_bf_item_count() {
733        let mut cbf = CountingBloomFilter::new(100, 0.05);
734        cbf.insert(b"x");
735        cbf.insert(b"y");
736        assert_eq!(cbf.item_count(), 2);
737        cbf.remove(b"x");
738        assert_eq!(cbf.item_count(), 1);
739    }
740
741    #[test]
742    fn test_counting_bf_no_false_negatives() {
743        let mut cbf = CountingBloomFilter::new(300, 0.01);
744        let items: Vec<Vec<u8>> = (0u32..100).map(|i| i.to_le_bytes().to_vec()).collect();
745        for item in &items {
746            cbf.insert(item);
747        }
748        for item in &items {
749            assert!(cbf.contains(item), "false negative for {item:?}");
750        }
751    }
752
753    #[test]
754    fn test_counting_bf_multiple_inserts_then_single_remove() {
755        let mut cbf = CountingBloomFilter::new(200, 0.01);
756        // Insert the same item twice: one remove should not clear it.
757        cbf.insert(b"double");
758        cbf.insert(b"double");
759        cbf.remove(b"double");
760        // Should still be present (counter was 2, now 1).
761        assert!(cbf.contains(b"double"));
762    }
763
764    #[test]
765    fn test_double_hash_position_range() {
766        let item = b"test_item";
767        let h1_val = h1(item);
768        let h2_val = h2(item);
769        let num_bits = 1024;
770        for i in 0..10u64 {
771            let pos = double_hash_position(h1_val, h2_val, i, num_bits);
772            assert!(pos < num_bits, "position {pos} out of range");
773        }
774    }
775
776    #[test]
777    fn test_bloom_filter_clone() {
778        let mut bf = BloomFilter::new(100, 0.01);
779        bf.insert(b"cloned");
780        let bf2 = bf.clone();
781        assert!(bf2.contains(b"cloned"));
782        assert_eq!(bf2.item_count(), bf.item_count());
783    }
784
785    // ── ScalableBloomFilter ─────────────────────────────────────────────────
786
787    #[test]
788    fn test_scalable_bf_insert_contains() {
789        let mut sbf = ScalableBloomFilter::new(50, 0.01, 2.0);
790        sbf.insert(b"hello");
791        assert!(sbf.contains(b"hello"));
792    }
793
794    #[test]
795    fn test_scalable_bf_absent_item() {
796        let sbf = ScalableBloomFilter::new(50, 0.01, 2.0);
797        assert!(!sbf.contains(b"missing"));
798    }
799
800    #[test]
801    fn test_scalable_bf_no_false_negatives() {
802        let mut sbf = ScalableBloomFilter::new(50, 0.05, 2.0);
803        let items: Vec<Vec<u8>> = (0u32..200).map(|i| i.to_le_bytes().to_vec()).collect();
804        for item in &items {
805            sbf.insert(item);
806        }
807        for item in &items {
808            assert!(sbf.contains(item), "false negative for {item:?}");
809        }
810    }
811
812    #[test]
813    fn test_scalable_bf_grows_layers() {
814        // Small initial capacity → should create additional layers quickly.
815        let mut sbf = ScalableBloomFilter::new(10, 0.1, 2.0);
816        for i in 0u32..500 {
817            sbf.insert(&i.to_le_bytes());
818        }
819        assert!(
820            sbf.layer_count() > 1,
821            "should have grown beyond 1 layer, got {}",
822            sbf.layer_count()
823        );
824    }
825
826    #[test]
827    fn test_scalable_bf_total_item_count() {
828        let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
829        sbf.insert(b"a");
830        sbf.insert(b"b");
831        sbf.insert(b"c");
832        assert_eq!(sbf.total_item_count(), 3);
833    }
834
835    #[test]
836    fn test_scalable_bf_fpr_bounded() {
837        let mut sbf = ScalableBloomFilter::new(1000, 0.01, 2.0);
838        for i in 0u32..1000 {
839            sbf.insert(&i.to_le_bytes());
840        }
841        let fpr = sbf.estimate_false_positive_rate();
842        // Aggregate FPR should be reasonable (below 10% for 1000 items at 1% target).
843        assert!(fpr < 0.10, "aggregate FPR {fpr:.4} is too high");
844    }
845
846    #[test]
847    fn test_scalable_bf_clone() {
848        let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
849        sbf.insert(b"test");
850        let sbf2 = sbf.clone();
851        assert!(sbf2.contains(b"test"));
852        assert_eq!(sbf2.total_item_count(), 1);
853        assert_eq!(sbf2.layer_count(), sbf.layer_count());
854    }
855
856    #[test]
857    fn test_scalable_bf_empty_fpr() {
858        let sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
859        assert_eq!(sbf.estimate_false_positive_rate(), 0.0);
860    }
861
862    // ── Scalable Bloom filter enhanced tests ────────────────────────────────
863
864    #[test]
865    fn test_scalable_bf_set_tightening_ratio() {
866        let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
867        sbf.set_tightening_ratio(0.5);
868        assert!((sbf.tightening_ratio() - 0.5).abs() < f64::EPSILON);
869    }
870
871    #[test]
872    fn test_scalable_bf_invalid_tightening_ratio_ignored() {
873        let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
874        let original = sbf.tightening_ratio();
875        sbf.set_tightening_ratio(0.0); // invalid
876        assert!((sbf.tightening_ratio() - original).abs() < f64::EPSILON);
877        sbf.set_tightening_ratio(1.0); // invalid
878        assert!((sbf.tightening_ratio() - original).abs() < f64::EPSILON);
879        sbf.set_tightening_ratio(-0.5); // invalid
880        assert!((sbf.tightening_ratio() - original).abs() < f64::EPSILON);
881    }
882
883    #[test]
884    fn test_scalable_bf_growth_factor() {
885        let sbf = ScalableBloomFilter::new(100, 0.01, 3.0);
886        assert!((sbf.growth_factor() - 3.0).abs() < f64::EPSILON);
887    }
888
889    #[test]
890    fn test_scalable_bf_growth_factor_default_when_invalid() {
891        // growth_factor <= 1.0 should default to 2.0
892        let sbf = ScalableBloomFilter::new(100, 0.01, 0.5);
893        assert!((sbf.growth_factor() - 2.0).abs() < f64::EPSILON);
894    }
895
896    #[test]
897    fn test_scalable_bf_layer_stats() {
898        let mut sbf = ScalableBloomFilter::new(10, 0.1, 2.0);
899        for i in 0u32..200 {
900            sbf.insert(&i.to_le_bytes());
901        }
902        let stats = sbf.layer_stats();
903        assert!(!stats.is_empty());
904        // First layer should have items
905        assert!(stats[0].0 > 0, "first layer should have items");
906        // All layers should have bits
907        for (_, bits, _) in &stats {
908            assert!(*bits > 0);
909        }
910    }
911
912    #[test]
913    fn test_scalable_bf_estimated_capacity_remaining() {
914        let sbf = ScalableBloomFilter::new(1000, 0.01, 2.0);
915        let remaining = sbf.estimated_capacity_remaining();
916        // Fresh filter should have significant remaining capacity
917        assert!(remaining > 0, "fresh filter should have remaining capacity");
918    }
919
920    #[test]
921    fn test_scalable_bf_estimated_capacity_decreases() {
922        let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
923        let before = sbf.estimated_capacity_remaining();
924        for i in 0u32..50 {
925            sbf.insert(&i.to_le_bytes());
926        }
927        let after = sbf.estimated_capacity_remaining();
928        assert!(
929            after < before,
930            "remaining capacity should decrease after inserts"
931        );
932    }
933
934    #[test]
935    fn test_scalable_bf_clear() {
936        let mut sbf = ScalableBloomFilter::new(100, 0.01, 2.0);
937        for i in 0u32..50 {
938            sbf.insert(&i.to_le_bytes());
939        }
940        sbf.clear();
941        assert_eq!(sbf.total_item_count(), 0);
942        assert_eq!(sbf.layer_count(), 1);
943        // Previously inserted items should no longer be found
944        assert!(!sbf.contains(&0u32.to_le_bytes()));
945    }
946
947    #[test]
948    fn test_scalable_bf_total_bits() {
949        let mut sbf = ScalableBloomFilter::new(10, 0.1, 2.0);
950        let bits_single = sbf.total_bits();
951        for i in 0u32..500 {
952            sbf.insert(&i.to_le_bytes());
953        }
954        let bits_multi = sbf.total_bits();
955        assert!(
956            bits_multi > bits_single,
957            "total bits should increase with layers"
958        );
959    }
960
961    #[test]
962    fn test_scalable_bf_tighter_ratio_more_layers() {
963        // With tighter ratio (smaller), each layer has tighter FPR target
964        // meaning more bits per layer, potentially fewer layers needed
965        let mut sbf_tight = ScalableBloomFilter::new(10, 0.1, 2.0);
966        sbf_tight.set_tightening_ratio(0.5);
967        let mut sbf_loose = ScalableBloomFilter::new(10, 0.1, 2.0);
968        sbf_loose.set_tightening_ratio(0.9);
969
970        for i in 0u32..200 {
971            sbf_tight.insert(&i.to_le_bytes());
972            sbf_loose.insert(&i.to_le_bytes());
973        }
974        // Both should contain all items (no false negatives)
975        for i in 0u32..200 {
976            assert!(sbf_tight.contains(&i.to_le_bytes()));
977            assert!(sbf_loose.contains(&i.to_le_bytes()));
978        }
979    }
980
981    #[test]
982    fn test_scalable_bf_empirical_fpr() {
983        let mut sbf = ScalableBloomFilter::new(1000, 0.05, 2.0);
984        for i in 0u32..1000 {
985            sbf.insert(&i.to_le_bytes());
986        }
987        // Test FPR against 10000 absent items
988        let mut fps = 0usize;
989        for i in 10000u32..20000 {
990            if sbf.contains(&i.to_le_bytes()) {
991                fps += 1;
992            }
993        }
994        let observed_fpr = fps as f64 / 10000.0;
995        // Aggregate FPR should be reasonable (under 20%)
996        assert!(
997            observed_fpr < 0.20,
998            "observed FPR {observed_fpr:.4} is too high"
999        );
1000    }
1001
1002    #[test]
1003    fn test_scalable_bf_stress_many_inserts() {
1004        let mut sbf = ScalableBloomFilter::new(50, 0.01, 2.0);
1005        for i in 0u32..10000 {
1006            sbf.insert(&i.to_le_bytes());
1007        }
1008        assert_eq!(sbf.total_item_count(), 10000);
1009        // Verify no false negatives on a sample
1010        for i in [0u32, 999, 5000, 9999] {
1011            assert!(sbf.contains(&i.to_le_bytes()), "false negative for {i}");
1012        }
1013    }
1014}
oximedia_cache/bloom_filter.rs

oximedia_cache/
bloom_filter.rs