#![allow(dead_code)]
#![allow(clippy::cast_precision_loss)]
pub struct BloomFilter {
pub bits: Vec<u64>,
pub num_hashes: u32,
pub bit_count: u64,
}
impl BloomFilter {
#[must_use]
pub fn new(capacity: usize, false_positive_rate: f32) -> Self {
let capacity = capacity.max(1);
let fpr = false_positive_rate.clamp(1e-9, 1.0 - f32::EPSILON);
let ln2 = std::f64::consts::LN_2;
let n = capacity as f64;
let p = fpr as f64;
let m = (-n * p.ln() / (ln2 * ln2)).ceil() as u64;
let m = m.max(64); let k = ((m as f64 / n) * ln2).round() as u32;
let k = k.clamp(1, 30);
let words = ((m + 63) / 64) as usize;
Self {
bits: vec![0u64; words],
num_hashes: k,
bit_count: m,
}
}
pub fn insert(&mut self, item: &[u8]) {
for i in 0..self.num_hashes {
let idx = self.hash_index(item, i);
let word = (idx / 64) as usize;
let bit = idx % 64;
self.bits[word] |= 1u64 << bit;
}
}
#[must_use]
pub fn contains(&self, item: &[u8]) -> bool {
for i in 0..self.num_hashes {
let idx = self.hash_index(item, i);
let word = (idx / 64) as usize;
let bit = idx % 64;
if self.bits[word] & (1u64 << bit) == 0 {
return false;
}
}
true
}
#[must_use]
pub fn estimated_count(&self) -> usize {
let set_bits: u64 = self.bits.iter().map(|w| w.count_ones() as u64).sum();
if set_bits == 0 {
return 0;
}
if set_bits >= self.bit_count {
return usize::MAX;
}
let m = self.bit_count as f64;
let k = self.num_hashes as f64;
let x = set_bits as f64;
let estimate = -(m / k) * (1.0 - x / m).ln();
estimate.round() as usize
}
pub fn clear(&mut self) {
for w in &mut self.bits {
*w = 0;
}
}
fn hash_index(&self, item: &[u8], i: u32) -> u64 {
let h1 = fnv1a_64(item);
let h2 = fnv1a_64_seeded(item, i as u64 ^ 0x9e37_79b9_7f4a_7c15);
h1.wrapping_add((i as u64).wrapping_mul(h2)) % self.bit_count
}
}
fn fnv1a_64(data: &[u8]) -> u64 {
let mut h: u64 = 0xcbf2_9ce4_8422_2325;
for &b in data {
h ^= b as u64;
h = h.wrapping_mul(0x0000_0100_0000_01b3);
}
h
}
fn fnv1a_64_seeded(data: &[u8], seed: u64) -> u64 {
let mut h: u64 = 0xcbf2_9ce4_8422_2325 ^ seed.wrapping_mul(0x0000_0100_0000_01b3);
for &b in data {
h ^= b as u64;
h = h.wrapping_mul(0x0000_0100_0000_01b3);
}
h
}
pub struct NearDuplicateDetector {
pub threshold: f32,
pub seen: BloomFilter,
}
impl NearDuplicateDetector {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
threshold: 0.99,
seen: BloomFilter::new(capacity, 0.01),
}
}
#[must_use]
pub fn with_fpr(capacity: usize, false_positive_rate: f32) -> Self {
Self {
threshold: 1.0 - false_positive_rate,
seen: BloomFilter::new(capacity, false_positive_rate),
}
}
pub fn add_and_check(&mut self, fingerprint: &[u8]) -> bool {
let duplicate = self.seen.contains(fingerprint);
self.seen.insert(fingerprint);
duplicate
}
pub fn reset(&mut self) {
self.seen.clear();
}
#[must_use]
pub fn estimated_unique_count(&self) -> usize {
self.seen.estimated_count()
}
}
#[derive(Debug, Clone)]
pub struct PreScreenResult {
pub candidates: Vec<usize>,
pub unique: Vec<usize>,
pub total: usize,
}
impl PreScreenResult {
#[must_use]
pub fn candidate_rate(&self) -> f64 {
if self.total == 0 {
return 0.0;
}
self.candidates.len() as f64 / self.total as f64
}
#[must_use]
pub fn rejection_rate(&self) -> f64 {
if self.total == 0 {
return 0.0;
}
self.unique.len() as f64 / self.total as f64
}
}
#[must_use]
pub fn prescreen_fingerprints(
fingerprints: &[Vec<u8>],
capacity: usize,
fpr: f32,
) -> PreScreenResult {
let mut bloom = BloomFilter::new(capacity, fpr);
let mut candidates = Vec::new();
let mut unique = Vec::new();
for (i, fp) in fingerprints.iter().enumerate() {
if bloom.contains(fp) {
candidates.push(i);
} else {
unique.push(i);
}
bloom.insert(fp);
}
PreScreenResult {
candidates,
unique,
total: fingerprints.len(),
}
}
#[must_use]
pub fn prescreen_perceptual_hashes(
hashes: &[u64],
quantize_bits: u32,
capacity: usize,
fpr: f32,
) -> PreScreenResult {
let quantize_bits = quantize_bits.clamp(4, 64);
let shift = 64 - quantize_bits;
let mut bloom = BloomFilter::new(capacity, fpr);
let mut candidates = Vec::new();
let mut unique = Vec::new();
for (i, &hash) in hashes.iter().enumerate() {
let quantized = hash >> shift;
let bytes = quantized.to_le_bytes();
if bloom.contains(&bytes) {
candidates.push(i);
} else {
unique.push(i);
}
bloom.insert(&bytes);
}
PreScreenResult {
candidates,
unique,
total: hashes.len(),
}
}
#[derive(Debug, Clone)]
pub struct DedupPipelineConfig {
pub bloom_capacity: usize,
pub bloom_fpr: f32,
pub quantize_bits: u32,
pub lsh_tables: usize,
pub lsh_bits_per_table: usize,
pub max_hamming_distance: u32,
pub seed: u64,
}
impl Default for DedupPipelineConfig {
fn default() -> Self {
Self {
bloom_capacity: 10_000,
bloom_fpr: 0.01,
quantize_bits: 16,
lsh_tables: 8,
lsh_bits_per_table: 8,
max_hamming_distance: 10,
seed: 42,
}
}
}
#[derive(Debug, Clone)]
pub struct DedupPipelineResult {
pub bloom_candidates: Vec<usize>,
pub bloom_unique: Vec<usize>,
pub lsh_pairs: Vec<(u64, u64, u32)>,
pub total: usize,
}
impl DedupPipelineResult {
#[must_use]
pub fn bloom_rejection_rate(&self) -> f64 {
if self.total == 0 {
return 0.0;
}
self.bloom_unique.len() as f64 / self.total as f64
}
#[must_use]
pub fn num_pairs(&self) -> usize {
self.lsh_pairs.len()
}
}
#[must_use]
pub fn run_dedup_pipeline(
hashes: &[(u64, u64)], config: &DedupPipelineConfig,
) -> DedupPipelineResult {
use crate::lsh_index::lsh_dedup_pass;
if hashes.is_empty() {
return DedupPipelineResult {
bloom_candidates: Vec::new(),
bloom_unique: Vec::new(),
lsh_pairs: Vec::new(),
total: 0,
};
}
let raw_hashes: Vec<u64> = hashes.iter().map(|&(_, h)| h).collect();
let prescreen = prescreen_perceptual_hashes(
&raw_hashes,
config.quantize_bits,
config.bloom_capacity,
config.bloom_fpr,
);
let candidate_hashes: Vec<(u64, u64)> = prescreen
.candidates
.iter()
.map(|&idx| hashes[idx])
.collect();
let lsh_result = lsh_dedup_pass(
&candidate_hashes,
config.max_hamming_distance,
config.lsh_tables,
config.lsh_bits_per_table,
config.seed,
);
DedupPipelineResult {
bloom_candidates: prescreen.candidates,
bloom_unique: prescreen.unique,
lsh_pairs: lsh_result.pairs,
total: hashes.len(),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bloom_new_non_empty() {
let bf = BloomFilter::new(1000, 0.01);
assert!(!bf.bits.is_empty());
assert!(bf.num_hashes >= 1);
assert!(bf.bit_count >= 64);
}
#[test]
fn test_bloom_larger_capacity_more_bits() {
let bf_small = BloomFilter::new(100, 0.01);
let bf_large = BloomFilter::new(10_000, 0.01);
assert!(bf_large.bit_count > bf_small.bit_count);
}
#[test]
fn test_bloom_initially_empty() {
let bf = BloomFilter::new(100, 0.01);
assert!(!bf.contains(b"hello"));
assert!(!bf.contains(b"world"));
}
#[test]
fn test_bloom_insert_and_contains() {
let mut bf = BloomFilter::new(100, 0.01);
bf.insert(b"oximedia");
assert!(bf.contains(b"oximedia"));
}
#[test]
fn test_bloom_multiple_inserts() {
let mut bf = BloomFilter::new(200, 0.01);
let items: Vec<&[u8]> = vec![b"alpha", b"beta", b"gamma", b"delta", b"epsilon"];
for item in &items {
bf.insert(item);
}
for item in &items {
assert!(bf.contains(item), "Item should be present after insert");
}
}
#[test]
fn test_bloom_clear() {
let mut bf = BloomFilter::new(100, 0.01);
bf.insert(b"test_item");
assert!(bf.contains(b"test_item"));
bf.clear();
assert!(!bf.contains(b"test_item"));
}
#[test]
fn test_bloom_estimated_count_zero_initially() {
let bf = BloomFilter::new(100, 0.01);
assert_eq!(bf.estimated_count(), 0);
}
#[test]
fn test_bloom_estimated_count_after_inserts() {
let mut bf = BloomFilter::new(1000, 0.01);
for i in 0..100u64 {
bf.insert(&i.to_le_bytes());
}
let est = bf.estimated_count();
assert!(est > 0, "Estimated count should be positive");
}
#[test]
fn test_bloom_different_fpr_different_hashes() {
let bf_strict = BloomFilter::new(100, 0.0001);
let bf_loose = BloomFilter::new(100, 0.1);
assert!(bf_strict.num_hashes >= bf_loose.num_hashes);
}
#[test]
fn test_near_dup_new() {
let det = NearDuplicateDetector::new(500);
assert!(det.seen.bit_count >= 64);
}
#[test]
fn test_near_dup_first_occurrence_not_duplicate() {
let mut det = NearDuplicateDetector::new(100);
let result = det.add_and_check(b"unique_fingerprint");
assert!(!result, "First occurrence should not be a duplicate");
}
#[test]
fn test_near_dup_second_occurrence_is_duplicate() {
let mut det = NearDuplicateDetector::new(100);
det.add_and_check(b"duplicate_fingerprint");
let result = det.add_and_check(b"duplicate_fingerprint");
assert!(result, "Second occurrence should be detected as duplicate");
}
#[test]
fn test_near_dup_reset_clears_state() {
let mut det = NearDuplicateDetector::new(100);
det.add_and_check(b"item_to_reset");
det.reset();
let result = det.add_and_check(b"item_to_reset");
assert!(!result, "After reset, item should not be a duplicate");
}
#[test]
fn test_near_dup_multiple_unique_items() {
let mut det = NearDuplicateDetector::new(1000);
let items: Vec<Vec<u8>> = (0u64..50).map(|i| i.to_le_bytes().to_vec()).collect();
for item in &items {
let is_dup = det.add_and_check(item);
assert!(!is_dup, "Unique item should not be flagged as duplicate");
}
}
#[test]
fn test_near_dup_estimated_unique_count() {
let mut det = NearDuplicateDetector::new(1000);
for i in 0u64..20 {
det.add_and_check(&i.to_le_bytes());
}
let est = det.estimated_unique_count();
assert!(est > 0);
}
#[test]
fn test_prescreen_all_unique() {
let fingerprints: Vec<Vec<u8>> = (0u64..50).map(|i| i.to_le_bytes().to_vec()).collect();
let result = prescreen_fingerprints(&fingerprints, 1000, 0.01);
assert_eq!(result.total, 50);
assert_eq!(result.unique.len(), 50);
assert!(result.candidates.is_empty());
}
#[test]
fn test_prescreen_with_duplicates() {
let mut fingerprints: Vec<Vec<u8>> = Vec::new();
for i in 0u64..10 {
fingerprints.push(i.to_le_bytes().to_vec());
}
for _ in 0..10 {
fingerprints.push(0u64.to_le_bytes().to_vec());
}
let result = prescreen_fingerprints(&fingerprints, 1000, 0.01);
assert_eq!(result.total, 20);
assert_eq!(result.candidates.len(), 10);
}
#[test]
fn test_prescreen_candidate_rate() {
let fingerprints = vec![
vec![1u8, 2, 3],
vec![1u8, 2, 3], vec![4u8, 5, 6],
vec![4u8, 5, 6], vec![7u8, 8, 9],
];
let result = prescreen_fingerprints(&fingerprints, 100, 0.01);
assert_eq!(result.total, 5);
assert_eq!(result.candidates.len(), 2);
assert!((result.candidate_rate() - 0.4).abs() < f64::EPSILON);
assert!((result.rejection_rate() - 0.6).abs() < f64::EPSILON);
}
#[test]
fn test_prescreen_perceptual_hashes_identical() {
let hashes = vec![0xDEAD_BEEF_DEAD_BEEFu64; 5];
let result = prescreen_perceptual_hashes(&hashes, 16, 1000, 0.01);
assert_eq!(result.total, 5);
assert_eq!(result.candidates.len(), 4);
assert_eq!(result.unique.len(), 1);
}
#[test]
fn test_prescreen_perceptual_hashes_all_different() {
let hashes: Vec<u64> = (0..20u64).map(|i| i << 48).collect();
let result = prescreen_perceptual_hashes(&hashes, 16, 1000, 0.01);
assert_eq!(result.total, 20);
assert_eq!(result.unique.len(), 20);
assert!(result.candidates.is_empty());
}
#[test]
fn test_prescreen_empty_input() {
let result = prescreen_fingerprints(&[], 100, 0.01);
assert_eq!(result.total, 0);
assert!(result.candidates.is_empty());
assert!(result.unique.is_empty());
assert_eq!(result.candidate_rate(), 0.0);
assert_eq!(result.rejection_rate(), 0.0);
}
#[test]
fn test_prescreen_result_rates_sum_to_one() {
let fingerprints: Vec<Vec<u8>> =
vec![vec![1, 2], vec![1, 2], vec![3, 4], vec![5, 6], vec![3, 4]];
let result = prescreen_fingerprints(&fingerprints, 100, 0.01);
let sum = result.candidate_rate() + result.rejection_rate();
assert!((sum - 1.0).abs() < 1e-10);
}
#[test]
fn test_pipeline_config_default() {
let cfg = DedupPipelineConfig::default();
assert_eq!(cfg.bloom_capacity, 10_000);
assert_eq!(cfg.lsh_tables, 8);
assert_eq!(cfg.max_hamming_distance, 10);
}
#[test]
fn test_pipeline_empty() {
let cfg = DedupPipelineConfig::default();
let result = run_dedup_pipeline(&[], &cfg);
assert_eq!(result.total, 0);
assert!(result.bloom_candidates.is_empty());
assert!(result.lsh_pairs.is_empty());
}
#[test]
fn test_pipeline_all_identical() {
let hash = 0xDEAD_BEEF_CAFE_BABEu64;
let hashes: Vec<(u64, u64)> = (0..10).map(|i| (i, hash)).collect();
let cfg = DedupPipelineConfig {
bloom_capacity: 100,
bloom_fpr: 0.01,
quantize_bits: 16,
lsh_tables: 6,
lsh_bits_per_table: 8,
max_hamming_distance: 0,
seed: 42,
};
let result = run_dedup_pipeline(&hashes, &cfg);
assert_eq!(result.total, 10);
assert_eq!(result.bloom_candidates.len(), 9);
assert_eq!(result.bloom_unique.len(), 1);
assert!(!result.lsh_pairs.is_empty());
}
#[test]
fn test_pipeline_all_unique() {
let hashes: Vec<(u64, u64)> = (0..20u64).map(|i| (i, i << 48)).collect();
let cfg = DedupPipelineConfig {
bloom_capacity: 1000,
bloom_fpr: 0.01,
quantize_bits: 16,
lsh_tables: 4,
lsh_bits_per_table: 12,
max_hamming_distance: 5,
seed: 42,
};
let result = run_dedup_pipeline(&hashes, &cfg);
assert_eq!(result.total, 20);
assert_eq!(result.bloom_unique.len(), 20);
assert!(result.bloom_candidates.is_empty());
assert!(result.lsh_pairs.is_empty());
}
#[test]
fn test_pipeline_mixed() {
let base_hash = 0xAAAA_BBBB_CCCC_DDDDu64;
let mut hashes: Vec<(u64, u64)> = (0..5).map(|i| (i, base_hash)).collect();
for i in 5..10u64 {
hashes.push((i, i << 48));
}
let cfg = DedupPipelineConfig::default();
let result = run_dedup_pipeline(&hashes, &cfg);
assert_eq!(result.total, 10);
assert!(!result.bloom_unique.is_empty());
}
#[test]
fn test_pipeline_result_bloom_rejection_rate() {
let result = DedupPipelineResult {
bloom_candidates: vec![0, 1],
bloom_unique: vec![2, 3, 4],
lsh_pairs: vec![(0, 1, 0)],
total: 5,
};
assert!((result.bloom_rejection_rate() - 0.6).abs() < f64::EPSILON);
assert_eq!(result.num_pairs(), 1);
}
#[test]
fn test_pipeline_result_empty_total() {
let result = DedupPipelineResult {
bloom_candidates: Vec::new(),
bloom_unique: Vec::new(),
lsh_pairs: Vec::new(),
total: 0,
};
assert_eq!(result.bloom_rejection_rate(), 0.0);
assert_eq!(result.num_pairs(), 0);
}
#[test]
fn test_pipeline_near_duplicates() {
let base = 0xFFFF_FFFF_FFFF_FFFFu64;
let similar = base ^ 0b111; let hashes = vec![(1, base), (2, similar), (3, base)];
let cfg = DedupPipelineConfig {
bloom_capacity: 100,
bloom_fpr: 0.01,
quantize_bits: 16,
lsh_tables: 8,
lsh_bits_per_table: 6,
max_hamming_distance: 5,
seed: 77,
};
let result = run_dedup_pipeline(&hashes, &cfg);
assert_eq!(result.total, 3);
}
}