oximedia_dedup/
bloom_filter.rs

1//! Near-duplicate detection using a Bloom filter.
2//!
3//! Provides:
4//! - [`BloomFilter`]: space-efficient probabilistic set membership structure
5//! - [`NearDuplicateDetector`]: wraps a Bloom filter for streaming deduplication
6
7#![allow(dead_code)]
8#![allow(clippy::cast_precision_loss)]
9
10// ---------------------------------------------------------------------------
11// Bloom filter
12// ---------------------------------------------------------------------------
13
14/// A counting-free Bloom filter backed by a `Vec<u64>` bit array.
15///
16/// Insert items and test membership with a configurable false-positive rate.
17/// Deletions are not supported (standard Bloom filter design).
18pub struct BloomFilter {
19    /// Bit storage (each `u64` holds 64 bits).
20    pub bits: Vec<u64>,
21    /// Number of independent hash functions.
22    pub num_hashes: u32,
23    /// Total number of bits (length of the logical bit array).
24    pub bit_count: u64,
25}
26
27impl BloomFilter {
28    /// Create a new Bloom filter sized for `capacity` items at a given `false_positive_rate`.
29    ///
30    /// Uses the standard formulas:
31    /// - `m = -n * ln(p) / (ln(2))^2`  (number of bits)
32    /// - `k = (m / n) * ln(2)`          (number of hash functions)
33    #[must_use]
34    pub fn new(capacity: usize, false_positive_rate: f32) -> Self {
35        let capacity = capacity.max(1);
36        let fpr = false_positive_rate.clamp(1e-9, 1.0 - f32::EPSILON);
37
38        let ln2 = std::f64::consts::LN_2;
39        let n = capacity as f64;
40        let p = fpr as f64;
41
42        let m = (-n * p.ln() / (ln2 * ln2)).ceil() as u64;
43        let m = m.max(64); // at least 64 bits
44        let k = ((m as f64 / n) * ln2).round() as u32;
45        let k = k.clamp(1, 30);
46
47        let words = ((m + 63) / 64) as usize;
48
49        Self {
50            bits: vec![0u64; words],
51            num_hashes: k,
52            bit_count: m,
53        }
54    }
55
56    /// Insert `item` into the filter.
57    pub fn insert(&mut self, item: &[u8]) {
58        for i in 0..self.num_hashes {
59            let idx = self.hash_index(item, i);
60            let word = (idx / 64) as usize;
61            let bit = idx % 64;
62            self.bits[word] |= 1u64 << bit;
63        }
64    }
65
66    /// Returns `true` if `item` *may* be in the set (possible false positive).
67    /// Returns `false` if `item` is *definitely not* in the set.
68    #[must_use]
69    pub fn contains(&self, item: &[u8]) -> bool {
70        for i in 0..self.num_hashes {
71            let idx = self.hash_index(item, i);
72            let word = (idx / 64) as usize;
73            let bit = idx % 64;
74            if self.bits[word] & (1u64 << bit) == 0 {
75                return false;
76            }
77        }
78        true
79    }
80
81    /// Estimate the number of items inserted using the formula:
82    /// `n̂ = -m / k * ln(1 - X / m)`
83    /// where `X` is the number of set bits.
84    #[must_use]
85    pub fn estimated_count(&self) -> usize {
86        let set_bits: u64 = self.bits.iter().map(|w| w.count_ones() as u64).sum();
87        if set_bits == 0 {
88            return 0;
89        }
90        if set_bits >= self.bit_count {
91            // Filter saturated
92            return usize::MAX;
93        }
94        let m = self.bit_count as f64;
95        let k = self.num_hashes as f64;
96        let x = set_bits as f64;
97        let estimate = -(m / k) * (1.0 - x / m).ln();
98        estimate.round() as usize
99    }
100
101    /// Clear all bits (reset the filter).
102    pub fn clear(&mut self) {
103        for w in &mut self.bits {
104            *w = 0;
105        }
106    }
107
108    /// Compute bit index for the i-th hash of `item` using FNV-like mixing.
109    fn hash_index(&self, item: &[u8], i: u32) -> u64 {
110        // Two independent hashes via FNV-1a, then Kirsch-Mitzenmacher combination
111        let h1 = fnv1a_64(item);
112        let h2 = fnv1a_64_seeded(item, i as u64 ^ 0x9e37_79b9_7f4a_7c15);
113        h1.wrapping_add((i as u64).wrapping_mul(h2)) % self.bit_count
114    }
115}
116
117/// FNV-1a 64-bit hash.
118fn fnv1a_64(data: &[u8]) -> u64 {
119    let mut h: u64 = 0xcbf2_9ce4_8422_2325;
120    for &b in data {
121        h ^= b as u64;
122        h = h.wrapping_mul(0x0000_0100_0000_01b3);
123    }
124    h
125}
126
127/// FNV-1a 64-bit hash with an additional seed mixed in.
128fn fnv1a_64_seeded(data: &[u8], seed: u64) -> u64 {
129    let mut h: u64 = 0xcbf2_9ce4_8422_2325 ^ seed.wrapping_mul(0x0000_0100_0000_01b3);
130    for &b in data {
131        h ^= b as u64;
132        h = h.wrapping_mul(0x0000_0100_0000_01b3);
133    }
134    h
135}
136
137// ---------------------------------------------------------------------------
138// NearDuplicateDetector
139// ---------------------------------------------------------------------------
140
141/// Streaming near-duplicate detector backed by a Bloom filter.
142///
143/// Items (fingerprints) are inserted; if the Bloom filter reports a hit the
144/// item is considered a near-duplicate.  Because the underlying structure is a
145/// Bloom filter the detector never produces false negatives but may produce
146/// false positives at a configurable rate.
147pub struct NearDuplicateDetector {
148    /// Similarity/membership threshold concept – stored for reference only.
149    pub threshold: f32,
150    /// The underlying Bloom filter.
151    pub seen: BloomFilter,
152}
153
154impl NearDuplicateDetector {
155    /// Create a detector with the given `capacity` (expected number of unique items).
156    ///
157    /// Uses a false-positive rate of 1%.
158    #[must_use]
159    pub fn new(capacity: usize) -> Self {
160        Self {
161            threshold: 0.99,
162            seen: BloomFilter::new(capacity, 0.01),
163        }
164    }
165
166    /// Create with a custom false-positive rate.
167    #[must_use]
168    pub fn with_fpr(capacity: usize, false_positive_rate: f32) -> Self {
169        Self {
170            threshold: 1.0 - false_positive_rate,
171            seen: BloomFilter::new(capacity, false_positive_rate),
172        }
173    }
174
175    /// Add `fingerprint` and return `true` if it appears to be a near-duplicate
176    /// (i.e., it was already seen before).
177    pub fn add_and_check(&mut self, fingerprint: &[u8]) -> bool {
178        let duplicate = self.seen.contains(fingerprint);
179        self.seen.insert(fingerprint);
180        duplicate
181    }
182
183    /// Reset the detector (clear all seen fingerprints).
184    pub fn reset(&mut self) {
185        self.seen.clear();
186    }
187
188    /// Estimated number of unique items seen so far.
189    #[must_use]
190    pub fn estimated_unique_count(&self) -> usize {
191        self.seen.estimated_count()
192    }
193}
194
195// ---------------------------------------------------------------------------
196// Bloom filter pre-screening for deduplication
197// ---------------------------------------------------------------------------
198
199/// Pre-screening results from the bloom filter pass.
200#[derive(Debug, Clone)]
201pub struct PreScreenResult {
202    /// Indices of items that are *potential* duplicates (bloom filter hit).
203    pub candidates: Vec<usize>,
204    /// Indices of items that are *definitely* unique (bloom filter miss).
205    pub unique: Vec<usize>,
206    /// Total items processed.
207    pub total: usize,
208}
209
210impl PreScreenResult {
211    /// Fraction of items that passed as candidates (potential duplicates).
212    #[must_use]
213    pub fn candidate_rate(&self) -> f64 {
214        if self.total == 0 {
215            return 0.0;
216        }
217        self.candidates.len() as f64 / self.total as f64
218    }
219
220    /// Fraction of items rejected as unique.
221    #[must_use]
222    pub fn rejection_rate(&self) -> f64 {
223        if self.total == 0 {
224            return 0.0;
225        }
226        self.unique.len() as f64 / self.total as f64
227    }
228}
229
230/// Pre-screen a set of fingerprints (byte slices) using a bloom filter.
231///
232/// Each fingerprint is inserted into a bloom filter.  If the filter reports
233/// that the fingerprint was already seen, the item is flagged as a candidate
234/// for expensive pairwise comparison.  Items whose fingerprints are not yet
235/// in the filter are classified as definitely unique (subject to the bloom
236/// filter's false-positive rate).
237///
238/// This dramatically reduces the number of expensive perceptual-hash or
239/// SSIM comparisons needed.
240#[must_use]
241pub fn prescreen_fingerprints(
242    fingerprints: &[Vec<u8>],
243    capacity: usize,
244    fpr: f32,
245) -> PreScreenResult {
246    let mut bloom = BloomFilter::new(capacity, fpr);
247    let mut candidates = Vec::new();
248    let mut unique = Vec::new();
249
250    for (i, fp) in fingerprints.iter().enumerate() {
251        if bloom.contains(fp) {
252            candidates.push(i);
253        } else {
254            unique.push(i);
255        }
256        bloom.insert(fp);
257    }
258
259    PreScreenResult {
260        candidates,
261        unique,
262        total: fingerprints.len(),
263    }
264}
265
266/// Pre-screen using quantized perceptual hashes.
267///
268/// Quantizes each 64-bit hash down to `quantize_bits` (default: 16) so that
269/// similar hashes map to the same bloom key.  This means near-duplicates
270/// (not just exact duplicates) will collide in the bloom filter.
271#[must_use]
272pub fn prescreen_perceptual_hashes(
273    hashes: &[u64],
274    quantize_bits: u32,
275    capacity: usize,
276    fpr: f32,
277) -> PreScreenResult {
278    let quantize_bits = quantize_bits.clamp(4, 64);
279    let shift = 64 - quantize_bits;
280
281    let mut bloom = BloomFilter::new(capacity, fpr);
282    let mut candidates = Vec::new();
283    let mut unique = Vec::new();
284
285    for (i, &hash) in hashes.iter().enumerate() {
286        let quantized = hash >> shift;
287        let bytes = quantized.to_le_bytes();
288        if bloom.contains(&bytes) {
289            candidates.push(i);
290        } else {
291            unique.push(i);
292        }
293        bloom.insert(&bytes);
294    }
295
296    PreScreenResult {
297        candidates,
298        unique,
299        total: hashes.len(),
300    }
301}
302
303// ---------------------------------------------------------------------------
304// Integrated dedup pipeline with bloom + LSH
305// ---------------------------------------------------------------------------
306
307/// Configuration for the integrated bloom + LSH dedup pipeline.
308#[derive(Debug, Clone)]
309pub struct DedupPipelineConfig {
310    /// Bloom filter capacity.
311    pub bloom_capacity: usize,
312    /// Bloom filter false positive rate.
313    pub bloom_fpr: f32,
314    /// Number of bits for quantizing perceptual hashes before bloom insertion.
315    pub quantize_bits: u32,
316    /// Number of LSH hash tables.
317    pub lsh_tables: usize,
318    /// Bits sampled per LSH table.
319    pub lsh_bits_per_table: usize,
320    /// Maximum Hamming distance for near-duplicate pairing.
321    pub max_hamming_distance: u32,
322    /// PRNG seed for LSH.
323    pub seed: u64,
324}
325
326impl Default for DedupPipelineConfig {
327    fn default() -> Self {
328        Self {
329            bloom_capacity: 10_000,
330            bloom_fpr: 0.01,
331            quantize_bits: 16,
332            lsh_tables: 8,
333            lsh_bits_per_table: 8,
334            max_hamming_distance: 10,
335            seed: 42,
336        }
337    }
338}
339
340/// Result of the integrated dedup pipeline.
341#[derive(Debug, Clone)]
342pub struct DedupPipelineResult {
343    /// Items that passed bloom pre-screening (potential duplicates).
344    pub bloom_candidates: Vec<usize>,
345    /// Items rejected by bloom (definitely unique).
346    pub bloom_unique: Vec<usize>,
347    /// Duplicate pairs found by LSH among the bloom candidates.
348    pub lsh_pairs: Vec<(u64, u64, u32)>,
349    /// Total items processed.
350    pub total: usize,
351}
352
353impl DedupPipelineResult {
354    /// Fraction of items rejected by bloom (savings from skipping expensive comparison).
355    #[must_use]
356    pub fn bloom_rejection_rate(&self) -> f64 {
357        if self.total == 0 {
358            return 0.0;
359        }
360        self.bloom_unique.len() as f64 / self.total as f64
361    }
362
363    /// Number of duplicate pairs found.
364    #[must_use]
365    pub fn num_pairs(&self) -> usize {
366        self.lsh_pairs.len()
367    }
368}
369
370/// Run the integrated bloom + LSH dedup pipeline.
371///
372/// Phase 1: Bloom filter pre-screens quantized hashes to reject definitely-unique items.
373/// Phase 2: LSH finds near-duplicate pairs among remaining candidates.
374///
375/// This dramatically reduces the search space for large libraries.
376#[must_use]
377pub fn run_dedup_pipeline(
378    hashes: &[(u64, u64)], // (id, perceptual_hash)
379    config: &DedupPipelineConfig,
380) -> DedupPipelineResult {
381    use crate::lsh_index::lsh_dedup_pass;
382
383    if hashes.is_empty() {
384        return DedupPipelineResult {
385            bloom_candidates: Vec::new(),
386            bloom_unique: Vec::new(),
387            lsh_pairs: Vec::new(),
388            total: 0,
389        };
390    }
391
392    // Phase 1: Bloom pre-screening with quantized hashes
393    let raw_hashes: Vec<u64> = hashes.iter().map(|&(_, h)| h).collect();
394    let prescreen = prescreen_perceptual_hashes(
395        &raw_hashes,
396        config.quantize_bits,
397        config.bloom_capacity,
398        config.bloom_fpr,
399    );
400
401    // Build candidate set for LSH (bloom hits + their original IDs)
402    let candidate_hashes: Vec<(u64, u64)> = prescreen
403        .candidates
404        .iter()
405        .map(|&idx| hashes[idx])
406        .collect();
407
408    // Phase 2: LSH on candidates only
409    let lsh_result = lsh_dedup_pass(
410        &candidate_hashes,
411        config.max_hamming_distance,
412        config.lsh_tables,
413        config.lsh_bits_per_table,
414        config.seed,
415    );
416
417    DedupPipelineResult {
418        bloom_candidates: prescreen.candidates,
419        bloom_unique: prescreen.unique,
420        lsh_pairs: lsh_result.pairs,
421        total: hashes.len(),
422    }
423}
424
425// ---------------------------------------------------------------------------
426// Unit tests
427// ---------------------------------------------------------------------------
428
429#[cfg(test)]
430mod tests {
431    use super::*;
432
433    // ---- BloomFilter construction ----
434
435    #[test]
436    fn test_bloom_new_non_empty() {
437        let bf = BloomFilter::new(1000, 0.01);
438        assert!(!bf.bits.is_empty());
439        assert!(bf.num_hashes >= 1);
440        assert!(bf.bit_count >= 64);
441    }
442
443    #[test]
444    fn test_bloom_larger_capacity_more_bits() {
445        let bf_small = BloomFilter::new(100, 0.01);
446        let bf_large = BloomFilter::new(10_000, 0.01);
447        assert!(bf_large.bit_count > bf_small.bit_count);
448    }
449
450    #[test]
451    fn test_bloom_initially_empty() {
452        let bf = BloomFilter::new(100, 0.01);
453        assert!(!bf.contains(b"hello"));
454        assert!(!bf.contains(b"world"));
455    }
456
457    #[test]
458    fn test_bloom_insert_and_contains() {
459        let mut bf = BloomFilter::new(100, 0.01);
460        bf.insert(b"oximedia");
461        assert!(bf.contains(b"oximedia"));
462    }
463
464    #[test]
465    fn test_bloom_multiple_inserts() {
466        let mut bf = BloomFilter::new(200, 0.01);
467        let items: Vec<&[u8]> = vec![b"alpha", b"beta", b"gamma", b"delta", b"epsilon"];
468        for item in &items {
469            bf.insert(item);
470        }
471        for item in &items {
472            assert!(bf.contains(item), "Item should be present after insert");
473        }
474    }
475
476    #[test]
477    fn test_bloom_clear() {
478        let mut bf = BloomFilter::new(100, 0.01);
479        bf.insert(b"test_item");
480        assert!(bf.contains(b"test_item"));
481        bf.clear();
482        // After clear, the filter should no longer report the item as present
483        assert!(!bf.contains(b"test_item"));
484    }
485
486    #[test]
487    fn test_bloom_estimated_count_zero_initially() {
488        let bf = BloomFilter::new(100, 0.01);
489        assert_eq!(bf.estimated_count(), 0);
490    }
491
492    #[test]
493    fn test_bloom_estimated_count_after_inserts() {
494        let mut bf = BloomFilter::new(1000, 0.01);
495        for i in 0..100u64 {
496            bf.insert(&i.to_le_bytes());
497        }
498        // Estimate should be in a reasonable range (not exact due to probabilistic nature)
499        let est = bf.estimated_count();
500        assert!(est > 0, "Estimated count should be positive");
501    }
502
503    #[test]
504    fn test_bloom_different_fpr_different_hashes() {
505        let bf_strict = BloomFilter::new(100, 0.0001);
506        let bf_loose = BloomFilter::new(100, 0.1);
507        // Stricter FPR → more hash functions
508        assert!(bf_strict.num_hashes >= bf_loose.num_hashes);
509    }
510
511    // ---- NearDuplicateDetector tests ----
512
513    #[test]
514    fn test_near_dup_new() {
515        let det = NearDuplicateDetector::new(500);
516        assert!(det.seen.bit_count >= 64);
517    }
518
519    #[test]
520    fn test_near_dup_first_occurrence_not_duplicate() {
521        let mut det = NearDuplicateDetector::new(100);
522        let result = det.add_and_check(b"unique_fingerprint");
523        assert!(!result, "First occurrence should not be a duplicate");
524    }
525
526    #[test]
527    fn test_near_dup_second_occurrence_is_duplicate() {
528        let mut det = NearDuplicateDetector::new(100);
529        det.add_and_check(b"duplicate_fingerprint");
530        let result = det.add_and_check(b"duplicate_fingerprint");
531        assert!(result, "Second occurrence should be detected as duplicate");
532    }
533
534    #[test]
535    fn test_near_dup_reset_clears_state() {
536        let mut det = NearDuplicateDetector::new(100);
537        det.add_and_check(b"item_to_reset");
538        det.reset();
539        // After reset, the same item should not appear as a duplicate
540        let result = det.add_and_check(b"item_to_reset");
541        assert!(!result, "After reset, item should not be a duplicate");
542    }
543
544    #[test]
545    fn test_near_dup_multiple_unique_items() {
546        let mut det = NearDuplicateDetector::new(1000);
547        let items: Vec<Vec<u8>> = (0u64..50).map(|i| i.to_le_bytes().to_vec()).collect();
548        for item in &items {
549            // First occurrence of each unique item should not be flagged
550            let is_dup = det.add_and_check(item);
551            // Could theoretically be a false positive, but with 1000 capacity it should not
552            assert!(!is_dup, "Unique item should not be flagged as duplicate");
553        }
554    }
555
556    #[test]
557    fn test_near_dup_estimated_unique_count() {
558        let mut det = NearDuplicateDetector::new(1000);
559        for i in 0u64..20 {
560            det.add_and_check(&i.to_le_bytes());
561        }
562        let est = det.estimated_unique_count();
563        assert!(est > 0);
564    }
565
566    // ---- Pre-screening tests ----
567
568    #[test]
569    fn test_prescreen_all_unique() {
570        let fingerprints: Vec<Vec<u8>> = (0u64..50).map(|i| i.to_le_bytes().to_vec()).collect();
571        let result = prescreen_fingerprints(&fingerprints, 1000, 0.01);
572        assert_eq!(result.total, 50);
573        // All unique items should be in the unique set
574        assert_eq!(result.unique.len(), 50);
575        assert!(result.candidates.is_empty());
576    }
577
578    #[test]
579    fn test_prescreen_with_duplicates() {
580        let mut fingerprints: Vec<Vec<u8>> = Vec::new();
581        // Add 10 unique items
582        for i in 0u64..10 {
583            fingerprints.push(i.to_le_bytes().to_vec());
584        }
585        // Add 10 duplicates of item 0
586        for _ in 0..10 {
587            fingerprints.push(0u64.to_le_bytes().to_vec());
588        }
589        let result = prescreen_fingerprints(&fingerprints, 1000, 0.01);
590        assert_eq!(result.total, 20);
591        // The first occurrence of item 0 is unique; the 10 re-inserts are candidates
592        assert_eq!(result.candidates.len(), 10);
593    }
594
595    #[test]
596    fn test_prescreen_candidate_rate() {
597        let fingerprints = vec![
598            vec![1u8, 2, 3],
599            vec![1u8, 2, 3], // duplicate
600            vec![4u8, 5, 6],
601            vec![4u8, 5, 6], // duplicate
602            vec![7u8, 8, 9],
603        ];
604        let result = prescreen_fingerprints(&fingerprints, 100, 0.01);
605        assert_eq!(result.total, 5);
606        assert_eq!(result.candidates.len(), 2);
607        assert!((result.candidate_rate() - 0.4).abs() < f64::EPSILON);
608        assert!((result.rejection_rate() - 0.6).abs() < f64::EPSILON);
609    }
610
611    #[test]
612    fn test_prescreen_perceptual_hashes_identical() {
613        let hashes = vec![0xDEAD_BEEF_DEAD_BEEFu64; 5];
614        let result = prescreen_perceptual_hashes(&hashes, 16, 1000, 0.01);
615        assert_eq!(result.total, 5);
616        // First is unique, rest are candidates
617        assert_eq!(result.candidates.len(), 4);
618        assert_eq!(result.unique.len(), 1);
619    }
620
621    #[test]
622    fn test_prescreen_perceptual_hashes_all_different() {
623        // Hashes that differ significantly in high bits
624        let hashes: Vec<u64> = (0..20u64).map(|i| i << 48).collect();
625        let result = prescreen_perceptual_hashes(&hashes, 16, 1000, 0.01);
626        assert_eq!(result.total, 20);
627        // With 16-bit quantization from top bits, all should be unique
628        assert_eq!(result.unique.len(), 20);
629        assert!(result.candidates.is_empty());
630    }
631
632    #[test]
633    fn test_prescreen_empty_input() {
634        let result = prescreen_fingerprints(&[], 100, 0.01);
635        assert_eq!(result.total, 0);
636        assert!(result.candidates.is_empty());
637        assert!(result.unique.is_empty());
638        assert_eq!(result.candidate_rate(), 0.0);
639        assert_eq!(result.rejection_rate(), 0.0);
640    }
641
642    #[test]
643    fn test_prescreen_result_rates_sum_to_one() {
644        let fingerprints: Vec<Vec<u8>> =
645            vec![vec![1, 2], vec![1, 2], vec![3, 4], vec![5, 6], vec![3, 4]];
646        let result = prescreen_fingerprints(&fingerprints, 100, 0.01);
647        let sum = result.candidate_rate() + result.rejection_rate();
648        assert!((sum - 1.0).abs() < 1e-10);
649    }
650
651    // ---- DedupPipelineConfig tests ----
652
653    #[test]
654    fn test_pipeline_config_default() {
655        let cfg = DedupPipelineConfig::default();
656        assert_eq!(cfg.bloom_capacity, 10_000);
657        assert_eq!(cfg.lsh_tables, 8);
658        assert_eq!(cfg.max_hamming_distance, 10);
659    }
660
661    // ---- run_dedup_pipeline tests ----
662
663    #[test]
664    fn test_pipeline_empty() {
665        let cfg = DedupPipelineConfig::default();
666        let result = run_dedup_pipeline(&[], &cfg);
667        assert_eq!(result.total, 0);
668        assert!(result.bloom_candidates.is_empty());
669        assert!(result.lsh_pairs.is_empty());
670    }
671
672    #[test]
673    fn test_pipeline_all_identical() {
674        let hash = 0xDEAD_BEEF_CAFE_BABEu64;
675        let hashes: Vec<(u64, u64)> = (0..10).map(|i| (i, hash)).collect();
676        let cfg = DedupPipelineConfig {
677            bloom_capacity: 100,
678            bloom_fpr: 0.01,
679            quantize_bits: 16,
680            lsh_tables: 6,
681            lsh_bits_per_table: 8,
682            max_hamming_distance: 0,
683            seed: 42,
684        };
685        let result = run_dedup_pipeline(&hashes, &cfg);
686        assert_eq!(result.total, 10);
687        // First item is unique in bloom, rest are candidates
688        assert_eq!(result.bloom_candidates.len(), 9);
689        assert_eq!(result.bloom_unique.len(), 1);
690        // LSH should find many pairs among the 9 candidates
691        assert!(!result.lsh_pairs.is_empty());
692    }
693
694    #[test]
695    fn test_pipeline_all_unique() {
696        // Hashes that differ significantly in high bits
697        let hashes: Vec<(u64, u64)> = (0..20u64).map(|i| (i, i << 48)).collect();
698        let cfg = DedupPipelineConfig {
699            bloom_capacity: 1000,
700            bloom_fpr: 0.01,
701            quantize_bits: 16,
702            lsh_tables: 4,
703            lsh_bits_per_table: 12,
704            max_hamming_distance: 5,
705            seed: 42,
706        };
707        let result = run_dedup_pipeline(&hashes, &cfg);
708        assert_eq!(result.total, 20);
709        // All should be unique in bloom
710        assert_eq!(result.bloom_unique.len(), 20);
711        assert!(result.bloom_candidates.is_empty());
712        assert!(result.lsh_pairs.is_empty());
713    }
714
715    #[test]
716    fn test_pipeline_mixed() {
717        // 5 identical + 5 unique
718        let base_hash = 0xAAAA_BBBB_CCCC_DDDDu64;
719        let mut hashes: Vec<(u64, u64)> = (0..5).map(|i| (i, base_hash)).collect();
720        for i in 5..10u64 {
721            hashes.push((i, i << 48));
722        }
723        let cfg = DedupPipelineConfig::default();
724        let result = run_dedup_pipeline(&hashes, &cfg);
725        assert_eq!(result.total, 10);
726        // bloom_unique should include the unique hashes
727        assert!(!result.bloom_unique.is_empty());
728    }
729
730    #[test]
731    fn test_pipeline_result_bloom_rejection_rate() {
732        let result = DedupPipelineResult {
733            bloom_candidates: vec![0, 1],
734            bloom_unique: vec![2, 3, 4],
735            lsh_pairs: vec![(0, 1, 0)],
736            total: 5,
737        };
738        assert!((result.bloom_rejection_rate() - 0.6).abs() < f64::EPSILON);
739        assert_eq!(result.num_pairs(), 1);
740    }
741
742    #[test]
743    fn test_pipeline_result_empty_total() {
744        let result = DedupPipelineResult {
745            bloom_candidates: Vec::new(),
746            bloom_unique: Vec::new(),
747            lsh_pairs: Vec::new(),
748            total: 0,
749        };
750        assert_eq!(result.bloom_rejection_rate(), 0.0);
751        assert_eq!(result.num_pairs(), 0);
752    }
753
754    #[test]
755    fn test_pipeline_near_duplicates() {
756        let base = 0xFFFF_FFFF_FFFF_FFFFu64;
757        let similar = base ^ 0b111; // 3 bits different
758                                    // Use same high bits so bloom quantization groups them
759        let hashes = vec![(1, base), (2, similar), (3, base)];
760        let cfg = DedupPipelineConfig {
761            bloom_capacity: 100,
762            bloom_fpr: 0.01,
763            quantize_bits: 16,
764            lsh_tables: 8,
765            lsh_bits_per_table: 6,
766            max_hamming_distance: 5,
767            seed: 77,
768        };
769        let result = run_dedup_pipeline(&hashes, &cfg);
770        assert_eq!(result.total, 3);
771        // At least some pairs should be found
772        // (bloom may or may not catch all, depends on quantization)
773    }
774}
oximedia_dedup/bloom_filter.rs

oximedia_dedup/
bloom_filter.rs