1#![allow(dead_code)]
8#![allow(clippy::cast_precision_loss)]
9
10pub struct BloomFilter {
19 pub bits: Vec<u64>,
21 pub num_hashes: u32,
23 pub bit_count: u64,
25}
26
27impl BloomFilter {
28 #[must_use]
34 pub fn new(capacity: usize, false_positive_rate: f32) -> Self {
35 let capacity = capacity.max(1);
36 let fpr = false_positive_rate.clamp(1e-9, 1.0 - f32::EPSILON);
37
38 let ln2 = std::f64::consts::LN_2;
39 let n = capacity as f64;
40 let p = fpr as f64;
41
42 let m = (-n * p.ln() / (ln2 * ln2)).ceil() as u64;
43 let m = m.max(64); let k = ((m as f64 / n) * ln2).round() as u32;
45 let k = k.clamp(1, 30);
46
47 let words = ((m + 63) / 64) as usize;
48
49 Self {
50 bits: vec![0u64; words],
51 num_hashes: k,
52 bit_count: m,
53 }
54 }
55
56 pub fn insert(&mut self, item: &[u8]) {
58 for i in 0..self.num_hashes {
59 let idx = self.hash_index(item, i);
60 let word = (idx / 64) as usize;
61 let bit = idx % 64;
62 self.bits[word] |= 1u64 << bit;
63 }
64 }
65
66 #[must_use]
69 pub fn contains(&self, item: &[u8]) -> bool {
70 for i in 0..self.num_hashes {
71 let idx = self.hash_index(item, i);
72 let word = (idx / 64) as usize;
73 let bit = idx % 64;
74 if self.bits[word] & (1u64 << bit) == 0 {
75 return false;
76 }
77 }
78 true
79 }
80
81 #[must_use]
85 pub fn estimated_count(&self) -> usize {
86 let set_bits: u64 = self.bits.iter().map(|w| w.count_ones() as u64).sum();
87 if set_bits == 0 {
88 return 0;
89 }
90 if set_bits >= self.bit_count {
91 return usize::MAX;
93 }
94 let m = self.bit_count as f64;
95 let k = self.num_hashes as f64;
96 let x = set_bits as f64;
97 let estimate = -(m / k) * (1.0 - x / m).ln();
98 estimate.round() as usize
99 }
100
101 pub fn clear(&mut self) {
103 for w in &mut self.bits {
104 *w = 0;
105 }
106 }
107
108 fn hash_index(&self, item: &[u8], i: u32) -> u64 {
110 let h1 = fnv1a_64(item);
112 let h2 = fnv1a_64_seeded(item, i as u64 ^ 0x9e37_79b9_7f4a_7c15);
113 h1.wrapping_add((i as u64).wrapping_mul(h2)) % self.bit_count
114 }
115}
116
117fn fnv1a_64(data: &[u8]) -> u64 {
119 let mut h: u64 = 0xcbf2_9ce4_8422_2325;
120 for &b in data {
121 h ^= b as u64;
122 h = h.wrapping_mul(0x0000_0100_0000_01b3);
123 }
124 h
125}
126
127fn fnv1a_64_seeded(data: &[u8], seed: u64) -> u64 {
129 let mut h: u64 = 0xcbf2_9ce4_8422_2325 ^ seed.wrapping_mul(0x0000_0100_0000_01b3);
130 for &b in data {
131 h ^= b as u64;
132 h = h.wrapping_mul(0x0000_0100_0000_01b3);
133 }
134 h
135}
136
137pub struct NearDuplicateDetector {
148 pub threshold: f32,
150 pub seen: BloomFilter,
152}
153
154impl NearDuplicateDetector {
155 #[must_use]
159 pub fn new(capacity: usize) -> Self {
160 Self {
161 threshold: 0.99,
162 seen: BloomFilter::new(capacity, 0.01),
163 }
164 }
165
166 #[must_use]
168 pub fn with_fpr(capacity: usize, false_positive_rate: f32) -> Self {
169 Self {
170 threshold: 1.0 - false_positive_rate,
171 seen: BloomFilter::new(capacity, false_positive_rate),
172 }
173 }
174
175 pub fn add_and_check(&mut self, fingerprint: &[u8]) -> bool {
178 let duplicate = self.seen.contains(fingerprint);
179 self.seen.insert(fingerprint);
180 duplicate
181 }
182
183 pub fn reset(&mut self) {
185 self.seen.clear();
186 }
187
188 #[must_use]
190 pub fn estimated_unique_count(&self) -> usize {
191 self.seen.estimated_count()
192 }
193}
194
195#[derive(Debug, Clone)]
201pub struct PreScreenResult {
202 pub candidates: Vec<usize>,
204 pub unique: Vec<usize>,
206 pub total: usize,
208}
209
210impl PreScreenResult {
211 #[must_use]
213 pub fn candidate_rate(&self) -> f64 {
214 if self.total == 0 {
215 return 0.0;
216 }
217 self.candidates.len() as f64 / self.total as f64
218 }
219
220 #[must_use]
222 pub fn rejection_rate(&self) -> f64 {
223 if self.total == 0 {
224 return 0.0;
225 }
226 self.unique.len() as f64 / self.total as f64
227 }
228}
229
230#[must_use]
241pub fn prescreen_fingerprints(
242 fingerprints: &[Vec<u8>],
243 capacity: usize,
244 fpr: f32,
245) -> PreScreenResult {
246 let mut bloom = BloomFilter::new(capacity, fpr);
247 let mut candidates = Vec::new();
248 let mut unique = Vec::new();
249
250 for (i, fp) in fingerprints.iter().enumerate() {
251 if bloom.contains(fp) {
252 candidates.push(i);
253 } else {
254 unique.push(i);
255 }
256 bloom.insert(fp);
257 }
258
259 PreScreenResult {
260 candidates,
261 unique,
262 total: fingerprints.len(),
263 }
264}
265
266#[must_use]
272pub fn prescreen_perceptual_hashes(
273 hashes: &[u64],
274 quantize_bits: u32,
275 capacity: usize,
276 fpr: f32,
277) -> PreScreenResult {
278 let quantize_bits = quantize_bits.clamp(4, 64);
279 let shift = 64 - quantize_bits;
280
281 let mut bloom = BloomFilter::new(capacity, fpr);
282 let mut candidates = Vec::new();
283 let mut unique = Vec::new();
284
285 for (i, &hash) in hashes.iter().enumerate() {
286 let quantized = hash >> shift;
287 let bytes = quantized.to_le_bytes();
288 if bloom.contains(&bytes) {
289 candidates.push(i);
290 } else {
291 unique.push(i);
292 }
293 bloom.insert(&bytes);
294 }
295
296 PreScreenResult {
297 candidates,
298 unique,
299 total: hashes.len(),
300 }
301}
302
303#[derive(Debug, Clone)]
309pub struct DedupPipelineConfig {
310 pub bloom_capacity: usize,
312 pub bloom_fpr: f32,
314 pub quantize_bits: u32,
316 pub lsh_tables: usize,
318 pub lsh_bits_per_table: usize,
320 pub max_hamming_distance: u32,
322 pub seed: u64,
324}
325
326impl Default for DedupPipelineConfig {
327 fn default() -> Self {
328 Self {
329 bloom_capacity: 10_000,
330 bloom_fpr: 0.01,
331 quantize_bits: 16,
332 lsh_tables: 8,
333 lsh_bits_per_table: 8,
334 max_hamming_distance: 10,
335 seed: 42,
336 }
337 }
338}
339
340#[derive(Debug, Clone)]
342pub struct DedupPipelineResult {
343 pub bloom_candidates: Vec<usize>,
345 pub bloom_unique: Vec<usize>,
347 pub lsh_pairs: Vec<(u64, u64, u32)>,
349 pub total: usize,
351}
352
353impl DedupPipelineResult {
354 #[must_use]
356 pub fn bloom_rejection_rate(&self) -> f64 {
357 if self.total == 0 {
358 return 0.0;
359 }
360 self.bloom_unique.len() as f64 / self.total as f64
361 }
362
363 #[must_use]
365 pub fn num_pairs(&self) -> usize {
366 self.lsh_pairs.len()
367 }
368}
369
370#[must_use]
377pub fn run_dedup_pipeline(
378 hashes: &[(u64, u64)], config: &DedupPipelineConfig,
380) -> DedupPipelineResult {
381 use crate::lsh_index::lsh_dedup_pass;
382
383 if hashes.is_empty() {
384 return DedupPipelineResult {
385 bloom_candidates: Vec::new(),
386 bloom_unique: Vec::new(),
387 lsh_pairs: Vec::new(),
388 total: 0,
389 };
390 }
391
392 let raw_hashes: Vec<u64> = hashes.iter().map(|&(_, h)| h).collect();
394 let prescreen = prescreen_perceptual_hashes(
395 &raw_hashes,
396 config.quantize_bits,
397 config.bloom_capacity,
398 config.bloom_fpr,
399 );
400
401 let candidate_hashes: Vec<(u64, u64)> = prescreen
403 .candidates
404 .iter()
405 .map(|&idx| hashes[idx])
406 .collect();
407
408 let lsh_result = lsh_dedup_pass(
410 &candidate_hashes,
411 config.max_hamming_distance,
412 config.lsh_tables,
413 config.lsh_bits_per_table,
414 config.seed,
415 );
416
417 DedupPipelineResult {
418 bloom_candidates: prescreen.candidates,
419 bloom_unique: prescreen.unique,
420 lsh_pairs: lsh_result.pairs,
421 total: hashes.len(),
422 }
423}
424
425#[cfg(test)]
430mod tests {
431 use super::*;
432
433 #[test]
436 fn test_bloom_new_non_empty() {
437 let bf = BloomFilter::new(1000, 0.01);
438 assert!(!bf.bits.is_empty());
439 assert!(bf.num_hashes >= 1);
440 assert!(bf.bit_count >= 64);
441 }
442
443 #[test]
444 fn test_bloom_larger_capacity_more_bits() {
445 let bf_small = BloomFilter::new(100, 0.01);
446 let bf_large = BloomFilter::new(10_000, 0.01);
447 assert!(bf_large.bit_count > bf_small.bit_count);
448 }
449
450 #[test]
451 fn test_bloom_initially_empty() {
452 let bf = BloomFilter::new(100, 0.01);
453 assert!(!bf.contains(b"hello"));
454 assert!(!bf.contains(b"world"));
455 }
456
457 #[test]
458 fn test_bloom_insert_and_contains() {
459 let mut bf = BloomFilter::new(100, 0.01);
460 bf.insert(b"oximedia");
461 assert!(bf.contains(b"oximedia"));
462 }
463
464 #[test]
465 fn test_bloom_multiple_inserts() {
466 let mut bf = BloomFilter::new(200, 0.01);
467 let items: Vec<&[u8]> = vec![b"alpha", b"beta", b"gamma", b"delta", b"epsilon"];
468 for item in &items {
469 bf.insert(item);
470 }
471 for item in &items {
472 assert!(bf.contains(item), "Item should be present after insert");
473 }
474 }
475
476 #[test]
477 fn test_bloom_clear() {
478 let mut bf = BloomFilter::new(100, 0.01);
479 bf.insert(b"test_item");
480 assert!(bf.contains(b"test_item"));
481 bf.clear();
482 assert!(!bf.contains(b"test_item"));
484 }
485
486 #[test]
487 fn test_bloom_estimated_count_zero_initially() {
488 let bf = BloomFilter::new(100, 0.01);
489 assert_eq!(bf.estimated_count(), 0);
490 }
491
492 #[test]
493 fn test_bloom_estimated_count_after_inserts() {
494 let mut bf = BloomFilter::new(1000, 0.01);
495 for i in 0..100u64 {
496 bf.insert(&i.to_le_bytes());
497 }
498 let est = bf.estimated_count();
500 assert!(est > 0, "Estimated count should be positive");
501 }
502
503 #[test]
504 fn test_bloom_different_fpr_different_hashes() {
505 let bf_strict = BloomFilter::new(100, 0.0001);
506 let bf_loose = BloomFilter::new(100, 0.1);
507 assert!(bf_strict.num_hashes >= bf_loose.num_hashes);
509 }
510
511 #[test]
514 fn test_near_dup_new() {
515 let det = NearDuplicateDetector::new(500);
516 assert!(det.seen.bit_count >= 64);
517 }
518
519 #[test]
520 fn test_near_dup_first_occurrence_not_duplicate() {
521 let mut det = NearDuplicateDetector::new(100);
522 let result = det.add_and_check(b"unique_fingerprint");
523 assert!(!result, "First occurrence should not be a duplicate");
524 }
525
526 #[test]
527 fn test_near_dup_second_occurrence_is_duplicate() {
528 let mut det = NearDuplicateDetector::new(100);
529 det.add_and_check(b"duplicate_fingerprint");
530 let result = det.add_and_check(b"duplicate_fingerprint");
531 assert!(result, "Second occurrence should be detected as duplicate");
532 }
533
534 #[test]
535 fn test_near_dup_reset_clears_state() {
536 let mut det = NearDuplicateDetector::new(100);
537 det.add_and_check(b"item_to_reset");
538 det.reset();
539 let result = det.add_and_check(b"item_to_reset");
541 assert!(!result, "After reset, item should not be a duplicate");
542 }
543
544 #[test]
545 fn test_near_dup_multiple_unique_items() {
546 let mut det = NearDuplicateDetector::new(1000);
547 let items: Vec<Vec<u8>> = (0u64..50).map(|i| i.to_le_bytes().to_vec()).collect();
548 for item in &items {
549 let is_dup = det.add_and_check(item);
551 assert!(!is_dup, "Unique item should not be flagged as duplicate");
553 }
554 }
555
556 #[test]
557 fn test_near_dup_estimated_unique_count() {
558 let mut det = NearDuplicateDetector::new(1000);
559 for i in 0u64..20 {
560 det.add_and_check(&i.to_le_bytes());
561 }
562 let est = det.estimated_unique_count();
563 assert!(est > 0);
564 }
565
566 #[test]
569 fn test_prescreen_all_unique() {
570 let fingerprints: Vec<Vec<u8>> = (0u64..50).map(|i| i.to_le_bytes().to_vec()).collect();
571 let result = prescreen_fingerprints(&fingerprints, 1000, 0.01);
572 assert_eq!(result.total, 50);
573 assert_eq!(result.unique.len(), 50);
575 assert!(result.candidates.is_empty());
576 }
577
578 #[test]
579 fn test_prescreen_with_duplicates() {
580 let mut fingerprints: Vec<Vec<u8>> = Vec::new();
581 for i in 0u64..10 {
583 fingerprints.push(i.to_le_bytes().to_vec());
584 }
585 for _ in 0..10 {
587 fingerprints.push(0u64.to_le_bytes().to_vec());
588 }
589 let result = prescreen_fingerprints(&fingerprints, 1000, 0.01);
590 assert_eq!(result.total, 20);
591 assert_eq!(result.candidates.len(), 10);
593 }
594
595 #[test]
596 fn test_prescreen_candidate_rate() {
597 let fingerprints = vec![
598 vec![1u8, 2, 3],
599 vec![1u8, 2, 3], vec![4u8, 5, 6],
601 vec![4u8, 5, 6], vec![7u8, 8, 9],
603 ];
604 let result = prescreen_fingerprints(&fingerprints, 100, 0.01);
605 assert_eq!(result.total, 5);
606 assert_eq!(result.candidates.len(), 2);
607 assert!((result.candidate_rate() - 0.4).abs() < f64::EPSILON);
608 assert!((result.rejection_rate() - 0.6).abs() < f64::EPSILON);
609 }
610
611 #[test]
612 fn test_prescreen_perceptual_hashes_identical() {
613 let hashes = vec![0xDEAD_BEEF_DEAD_BEEFu64; 5];
614 let result = prescreen_perceptual_hashes(&hashes, 16, 1000, 0.01);
615 assert_eq!(result.total, 5);
616 assert_eq!(result.candidates.len(), 4);
618 assert_eq!(result.unique.len(), 1);
619 }
620
621 #[test]
622 fn test_prescreen_perceptual_hashes_all_different() {
623 let hashes: Vec<u64> = (0..20u64).map(|i| i << 48).collect();
625 let result = prescreen_perceptual_hashes(&hashes, 16, 1000, 0.01);
626 assert_eq!(result.total, 20);
627 assert_eq!(result.unique.len(), 20);
629 assert!(result.candidates.is_empty());
630 }
631
632 #[test]
633 fn test_prescreen_empty_input() {
634 let result = prescreen_fingerprints(&[], 100, 0.01);
635 assert_eq!(result.total, 0);
636 assert!(result.candidates.is_empty());
637 assert!(result.unique.is_empty());
638 assert_eq!(result.candidate_rate(), 0.0);
639 assert_eq!(result.rejection_rate(), 0.0);
640 }
641
642 #[test]
643 fn test_prescreen_result_rates_sum_to_one() {
644 let fingerprints: Vec<Vec<u8>> =
645 vec![vec![1, 2], vec![1, 2], vec![3, 4], vec![5, 6], vec![3, 4]];
646 let result = prescreen_fingerprints(&fingerprints, 100, 0.01);
647 let sum = result.candidate_rate() + result.rejection_rate();
648 assert!((sum - 1.0).abs() < 1e-10);
649 }
650
651 #[test]
654 fn test_pipeline_config_default() {
655 let cfg = DedupPipelineConfig::default();
656 assert_eq!(cfg.bloom_capacity, 10_000);
657 assert_eq!(cfg.lsh_tables, 8);
658 assert_eq!(cfg.max_hamming_distance, 10);
659 }
660
661 #[test]
664 fn test_pipeline_empty() {
665 let cfg = DedupPipelineConfig::default();
666 let result = run_dedup_pipeline(&[], &cfg);
667 assert_eq!(result.total, 0);
668 assert!(result.bloom_candidates.is_empty());
669 assert!(result.lsh_pairs.is_empty());
670 }
671
672 #[test]
673 fn test_pipeline_all_identical() {
674 let hash = 0xDEAD_BEEF_CAFE_BABEu64;
675 let hashes: Vec<(u64, u64)> = (0..10).map(|i| (i, hash)).collect();
676 let cfg = DedupPipelineConfig {
677 bloom_capacity: 100,
678 bloom_fpr: 0.01,
679 quantize_bits: 16,
680 lsh_tables: 6,
681 lsh_bits_per_table: 8,
682 max_hamming_distance: 0,
683 seed: 42,
684 };
685 let result = run_dedup_pipeline(&hashes, &cfg);
686 assert_eq!(result.total, 10);
687 assert_eq!(result.bloom_candidates.len(), 9);
689 assert_eq!(result.bloom_unique.len(), 1);
690 assert!(!result.lsh_pairs.is_empty());
692 }
693
694 #[test]
695 fn test_pipeline_all_unique() {
696 let hashes: Vec<(u64, u64)> = (0..20u64).map(|i| (i, i << 48)).collect();
698 let cfg = DedupPipelineConfig {
699 bloom_capacity: 1000,
700 bloom_fpr: 0.01,
701 quantize_bits: 16,
702 lsh_tables: 4,
703 lsh_bits_per_table: 12,
704 max_hamming_distance: 5,
705 seed: 42,
706 };
707 let result = run_dedup_pipeline(&hashes, &cfg);
708 assert_eq!(result.total, 20);
709 assert_eq!(result.bloom_unique.len(), 20);
711 assert!(result.bloom_candidates.is_empty());
712 assert!(result.lsh_pairs.is_empty());
713 }
714
715 #[test]
716 fn test_pipeline_mixed() {
717 let base_hash = 0xAAAA_BBBB_CCCC_DDDDu64;
719 let mut hashes: Vec<(u64, u64)> = (0..5).map(|i| (i, base_hash)).collect();
720 for i in 5..10u64 {
721 hashes.push((i, i << 48));
722 }
723 let cfg = DedupPipelineConfig::default();
724 let result = run_dedup_pipeline(&hashes, &cfg);
725 assert_eq!(result.total, 10);
726 assert!(!result.bloom_unique.is_empty());
728 }
729
730 #[test]
731 fn test_pipeline_result_bloom_rejection_rate() {
732 let result = DedupPipelineResult {
733 bloom_candidates: vec![0, 1],
734 bloom_unique: vec![2, 3, 4],
735 lsh_pairs: vec![(0, 1, 0)],
736 total: 5,
737 };
738 assert!((result.bloom_rejection_rate() - 0.6).abs() < f64::EPSILON);
739 assert_eq!(result.num_pairs(), 1);
740 }
741
742 #[test]
743 fn test_pipeline_result_empty_total() {
744 let result = DedupPipelineResult {
745 bloom_candidates: Vec::new(),
746 bloom_unique: Vec::new(),
747 lsh_pairs: Vec::new(),
748 total: 0,
749 };
750 assert_eq!(result.bloom_rejection_rate(), 0.0);
751 assert_eq!(result.num_pairs(), 0);
752 }
753
754 #[test]
755 fn test_pipeline_near_duplicates() {
756 let base = 0xFFFF_FFFF_FFFF_FFFFu64;
757 let similar = base ^ 0b111; let hashes = vec![(1, base), (2, similar), (3, base)];
760 let cfg = DedupPipelineConfig {
761 bloom_capacity: 100,
762 bloom_fpr: 0.01,
763 quantize_bits: 16,
764 lsh_tables: 8,
765 lsh_bits_per_table: 6,
766 max_hamming_distance: 5,
767 seed: 77,
768 };
769 let result = run_dedup_pipeline(&hashes, &cfg);
770 assert_eq!(result.total, 3);
771 }
774}