Skip to main content

oximedia_dedup/
bloom_prescreen.rs

1//! Bloom filter pre-screening for deduplication pipelines.
2//!
3//! [`BloomPrescreen`] provides an optimal Bloom filter implementation using
4//! FNV-1a with varied seeds for k independent hash functions.  It is intended
5//! as a fast first-pass gate before expensive perceptual or SSIM comparisons.
6//!
7//! # Design
8//!
9//! - Bit array backed by `Vec<u64>` (cache-friendly word-at-a-time access).
10//! - Hash family: FNV-1a with per-hash seed diversification (Kirsch-Mitzenmacher).
11//! - Optimal `m` and `k` computed from capacity `n` and target FPR `p`:
12//!   - `m = ceil(-n * ln(p) / (ln(2))^2)`
13//!   - `k = round((m / n) * ln(2))`
14//! - False-positive rate estimate: `(1 - e^(-k*inserted/m))^k`
15
16#![allow(dead_code)]
17#![allow(clippy::cast_precision_loss)]
18#![allow(clippy::cast_possible_truncation)]
19
20/// Bloom filter optimised for deduplication pre-screening.
21///
22/// # Example
23///
24/// ```
25/// use oximedia_dedup::bloom_prescreen::BloomPrescreen;
26///
27/// let mut bp = BloomPrescreen::new(1000, 0.01);
28/// bp.insert(b"hello");
29/// assert!(bp.might_contain(b"hello"));
30/// assert!(!bp.might_contain(b"world")); // (with high probability)
31/// ```
32#[derive(Debug, Clone)]
33pub struct BloomPrescreen {
34    /// Underlying bit storage (each `u64` holds 64 bits).
35    pub bits: Vec<u64>,
36    /// Number of independent hash functions.
37    pub k_hashes: u32,
38    /// Total number of addressable bits.
39    pub num_bits: usize,
40    /// Number of items inserted (used for FPR estimation).
41    items_inserted: u64,
42}
43
44impl BloomPrescreen {
45    /// Create a new `BloomPrescreen` sized for `capacity` expected items at
46    /// the given `false_positive_rate`.
47    ///
48    /// Uses the standard formulas:
49    /// - `m = ceil(-n * ln(p) / (ln 2)^2)`  (number of bits)
50    /// - `k = round((m / n) * ln 2)`         (number of hash functions)
51    ///
52    /// Both values are clamped to sensible minimums.
53    #[must_use]
54    pub fn new(capacity: usize, false_positive_rate: f64) -> Self {
55        let n = capacity.max(1) as f64;
56        let p = false_positive_rate.clamp(1e-15, 1.0 - f64::EPSILON);
57        let ln2 = std::f64::consts::LN_2;
58
59        // Optimal bit count
60        let m_float = -n * p.ln() / (ln2 * ln2);
61        let m = (m_float.ceil() as usize).max(64);
62
63        // Optimal number of hash functions
64        let k_float = (m as f64 / n) * ln2;
65        let k = (k_float.round() as u32).clamp(1, 32);
66
67        // Allocate words
68        let num_words = (m + 63) / 64;
69
70        Self {
71            bits: vec![0u64; num_words],
72            k_hashes: k,
73            num_bits: m,
74            items_inserted: 0,
75        }
76    }
77
78    /// Insert `key` into the filter.
79    ///
80    /// After calling this, [`might_contain`](Self::might_contain) will always
81    /// return `true` for the same `key` (no false negatives).
82    pub fn insert(&mut self, key: &[u8]) {
83        for i in 0..self.k_hashes {
84            let bit_idx = self.bit_index(key, i);
85            let word = bit_idx / 64;
86            let bit = bit_idx % 64;
87            if word < self.bits.len() {
88                self.bits[word] |= 1u64 << bit;
89            }
90        }
91        self.items_inserted += 1;
92    }
93
94    /// Returns `true` if `key` *might* be in the set (possible false positive).
95    /// Returns `false` if `key` is *definitely not* in the set.
96    #[must_use]
97    pub fn might_contain(&self, key: &[u8]) -> bool {
98        for i in 0..self.k_hashes {
99            let bit_idx = self.bit_index(key, i);
100            let word = bit_idx / 64;
101            let bit = bit_idx % 64;
102            match self.bits.get(word) {
103                Some(w) if w & (1u64 << bit) != 0 => continue,
104                _ => return false,
105            }
106        }
107        true
108    }
109
110    /// Estimate the current false-positive rate given the number of inserted items.
111    ///
112    /// Formula: `(1 - e^(-k * n / m))^k`
113    ///
114    /// Returns `1.0` when the filter is full.
115    #[must_use]
116    pub fn false_positive_rate_estimate(&self) -> f64 {
117        if self.num_bits == 0 {
118            return 1.0;
119        }
120        let k = self.k_hashes as f64;
121        let n = self.items_inserted as f64;
122        let m = self.num_bits as f64;
123        let inner = (-k * n / m).exp();
124        (1.0 - inner).powf(k)
125    }
126
127    /// Number of items inserted so far.
128    #[must_use]
129    pub fn items_inserted(&self) -> u64 {
130        self.items_inserted
131    }
132
133    /// Reset the filter to its initial empty state.
134    pub fn clear(&mut self) {
135        for w in &mut self.bits {
136            *w = 0;
137        }
138        self.items_inserted = 0;
139    }
140
141    /// Number of set bits (useful for diagnostics).
142    #[must_use]
143    pub fn set_bit_count(&self) -> u64 {
144        self.bits.iter().map(|w| u64::from(w.count_ones())).sum()
145    }
146
147    // ── Hash family ──────────────────────────────────────────────────────────
148
149    /// Compute the bit index for the `i`-th hash of `key`.
150    ///
151    /// Uses FNV-1a as the base hash, then applies the Kirsch-Mitzenmacher
152    /// technique: `h_i(x) = (h1(x) + i * h2(x)) mod m`.
153    ///
154    /// `h2` is FNV-1a seeded with a distinct per-function constant derived
155    /// from the index, ensuring independence across hash functions.
156    fn bit_index(&self, key: &[u8], i: u32) -> usize {
157        let h1 = fnv1a_64(key);
158        // Seed h2 with a unique per-index constant (large prime mix)
159        let seed = u64::from(i)
160            .wrapping_mul(0x9e37_79b9_7f4a_7c15)
161            .wrapping_add(0x6c62_272e_07bb_0142);
162        let h2 = fnv1a_64_seeded(key, seed);
163        let combined = h1.wrapping_add(u64::from(i).wrapping_mul(h2));
164        (combined % self.num_bits as u64) as usize
165    }
166}
167
168// ── FNV-1a hash primitives ────────────────────────────────────────────────────
169
170/// FNV-1a 64-bit hash (standard offset basis).
171fn fnv1a_64(data: &[u8]) -> u64 {
172    let mut h: u64 = 0xcbf2_9ce4_8422_2325;
173    for &b in data {
174        h ^= u64::from(b);
175        h = h.wrapping_mul(0x0000_0100_0000_01b3);
176    }
177    h
178}
179
180/// FNV-1a 64-bit hash with a custom seed mixed into the offset basis.
181fn fnv1a_64_seeded(data: &[u8], seed: u64) -> u64 {
182    let mut h: u64 = 0xcbf2_9ce4_8422_2325 ^ seed.wrapping_mul(0x0000_0100_0000_01b3);
183    for &b in data {
184        h ^= u64::from(b);
185        h = h.wrapping_mul(0x0000_0100_0000_01b3);
186    }
187    h
188}
189
190// ── Tests ─────────────────────────────────────────────────────────────────────
191
192#[cfg(test)]
193mod tests {
194    use super::*;
195
196    // ── Construction ──────────────────────────────────────────────────────────
197
198    #[test]
199    fn test_new_allocates_bits() {
200        let bp = BloomPrescreen::new(1000, 0.01);
201        assert!(!bp.bits.is_empty());
202        assert!(bp.k_hashes >= 1);
203        assert!(bp.num_bits >= 64);
204    }
205
206    #[test]
207    fn test_new_k_clamp_lower() {
208        // Even at extreme FPR the k must be at least 1.
209        let bp = BloomPrescreen::new(1, 0.99);
210        assert!(bp.k_hashes >= 1);
211    }
212
213    #[test]
214    fn test_new_k_clamp_upper() {
215        // Very tight FPR should not produce k > 32.
216        let bp = BloomPrescreen::new(1_000_000, 1e-15);
217        assert!(bp.k_hashes <= 32);
218    }
219
220    #[test]
221    fn test_larger_capacity_more_bits() {
222        let small = BloomPrescreen::new(100, 0.01);
223        let large = BloomPrescreen::new(100_000, 0.01);
224        assert!(large.num_bits > small.num_bits);
225    }
226
227    #[test]
228    fn test_stricter_fpr_more_hashes() {
229        let loose = BloomPrescreen::new(1000, 0.1);
230        let strict = BloomPrescreen::new(1000, 0.0001);
231        assert!(strict.k_hashes >= loose.k_hashes);
232    }
233
234    // ── No false negatives ────────────────────────────────────────────────────
235
236    #[test]
237    fn test_insert_then_might_contain() {
238        let mut bp = BloomPrescreen::new(100, 0.01);
239        let keys: &[&[u8]] = &[b"alpha", b"beta", b"gamma", b"delta", b"epsilon"];
240        for k in keys {
241            bp.insert(k);
242        }
243        for k in keys {
244            assert!(bp.might_contain(k), "no false negative for {:?}", k);
245        }
246    }
247
248    #[test]
249    fn test_empty_filter_returns_false() {
250        let bp = BloomPrescreen::new(500, 0.01);
251        assert!(!bp.might_contain(b"not_inserted"));
252        assert!(!bp.might_contain(b"oximedia"));
253    }
254
255    #[test]
256    fn test_single_insert() {
257        let mut bp = BloomPrescreen::new(50, 0.01);
258        bp.insert(b"only_one");
259        assert!(bp.might_contain(b"only_one"));
260    }
261
262    #[test]
263    fn test_bulk_no_false_negatives() {
264        let mut bp = BloomPrescreen::new(2000, 0.01);
265        let items: Vec<Vec<u8>> = (0u64..500).map(|i| i.to_le_bytes().to_vec()).collect();
266        for item in &items {
267            bp.insert(item);
268        }
269        for item in &items {
270            assert!(bp.might_contain(item), "false negative detected");
271        }
272    }
273
274    // ── FPR estimation ────────────────────────────────────────────────────────
275
276    #[test]
277    fn test_fpr_estimate_zero_when_empty() {
278        let bp = BloomPrescreen::new(1000, 0.01);
279        // No items inserted → exponent → 0 → (1 - 1)^k = 0
280        assert_eq!(bp.false_positive_rate_estimate(), 0.0);
281    }
282
283    #[test]
284    fn test_fpr_estimate_increases_with_items() {
285        let mut bp = BloomPrescreen::new(100, 0.01);
286        let fpr_before = bp.false_positive_rate_estimate();
287        for i in 0u64..50 {
288            bp.insert(&i.to_le_bytes());
289        }
290        let fpr_after = bp.false_positive_rate_estimate();
291        assert!(fpr_after > fpr_before);
292    }
293
294    #[test]
295    fn test_fpr_estimate_at_design_capacity_near_design_rate() {
296        // When n == capacity, FPR should be ≈ design FPR (within an order of magnitude).
297        let n = 1000usize;
298        let design_fpr = 0.01f64;
299        let mut bp = BloomPrescreen::new(n, design_fpr);
300        for i in 0u64..n as u64 {
301            bp.insert(&i.to_le_bytes());
302        }
303        let est = bp.false_positive_rate_estimate();
304        // Should be in [0.001, 0.1] — within 10× of design.
305        assert!(est > 0.0, "FPR estimate must be positive");
306        assert!(est < 0.5, "FPR estimate must be reasonable");
307    }
308
309    // ── items_inserted counter ────────────────────────────────────────────────
310
311    #[test]
312    fn test_items_inserted_counter() {
313        let mut bp = BloomPrescreen::new(100, 0.01);
314        assert_eq!(bp.items_inserted(), 0);
315        bp.insert(b"one");
316        assert_eq!(bp.items_inserted(), 1);
317        bp.insert(b"two");
318        assert_eq!(bp.items_inserted(), 2);
319    }
320
321    // ── clear ─────────────────────────────────────────────────────────────────
322
323    #[test]
324    fn test_clear_resets_filter() {
325        let mut bp = BloomPrescreen::new(100, 0.01);
326        bp.insert(b"persistent");
327        assert!(bp.might_contain(b"persistent"));
328        bp.clear();
329        assert!(!bp.might_contain(b"persistent"));
330        assert_eq!(bp.items_inserted(), 0);
331    }
332
333    #[test]
334    fn test_clear_resets_set_bit_count() {
335        let mut bp = BloomPrescreen::new(200, 0.01);
336        for i in 0u64..50 {
337            bp.insert(&i.to_le_bytes());
338        }
339        assert!(bp.set_bit_count() > 0);
340        bp.clear();
341        assert_eq!(bp.set_bit_count(), 0);
342    }
343
344    // ── set_bit_count ─────────────────────────────────────────────────────────
345
346    #[test]
347    fn test_set_bit_count_zero_initially() {
348        let bp = BloomPrescreen::new(500, 0.01);
349        assert_eq!(bp.set_bit_count(), 0);
350    }
351
352    #[test]
353    fn test_set_bit_count_increases_with_inserts() {
354        let mut bp = BloomPrescreen::new(500, 0.01);
355        let before = bp.set_bit_count();
356        bp.insert(b"new_item");
357        let after = bp.set_bit_count();
358        assert!(after >= before);
359    }
360
361    // ── Optimal sizing formulas ───────────────────────────────────────────────
362
363    #[test]
364    fn test_num_bits_at_least_64() {
365        // Even tiny capacity must allocate at least 64 bits (1 word).
366        let bp = BloomPrescreen::new(1, 0.5);
367        assert!(bp.num_bits >= 64);
368        assert_eq!(bp.bits.len(), bp.num_bits / 64);
369    }
370
371    #[test]
372    fn test_k_hashes_sensible_range() {
373        // For standard parameters k should be between 4 and 20.
374        let bp = BloomPrescreen::new(10_000, 0.01);
375        assert!(bp.k_hashes >= 4, "k should be at least 4 for 1% FPR");
376        assert!(bp.k_hashes <= 20, "k should be at most 20 for 1% FPR");
377    }
378
379    // ── Byte array keys ───────────────────────────────────────────────────────
380
381    #[test]
382    fn test_empty_key_insert_and_query() {
383        let mut bp = BloomPrescreen::new(100, 0.01);
384        bp.insert(b"");
385        assert!(bp.might_contain(b""));
386    }
387
388    #[test]
389    fn test_long_key_insert_and_query() {
390        let mut bp = BloomPrescreen::new(100, 0.01);
391        let long_key = vec![0xABu8; 4096];
392        bp.insert(&long_key);
393        assert!(bp.might_contain(&long_key));
394    }
395
396    #[test]
397    fn test_binary_keys() {
398        let mut bp = BloomPrescreen::new(200, 0.01);
399        // Insert u64 values as raw bytes
400        let values: Vec<u64> = vec![0, 1, u64::MAX, 0xDEAD_BEEF, 0x0102_0304_0506_0708];
401        for v in &values {
402            bp.insert(&v.to_le_bytes());
403        }
404        for v in &values {
405            assert!(bp.might_contain(&v.to_le_bytes()), "no false neg for {v}");
406        }
407    }
408}