Skip to main content

oximedia_dedup/
lib.rs

1//! Media deduplication and duplicate detection for `OxiMedia`.
2//!
3//! `oximedia-dedup` provides comprehensive duplicate detection and media deduplication
4//! for the `OxiMedia` multimedia framework. This includes:
5//!
6//! - **Cryptographic hashing**: BLAKE3-based exact duplicate detection
7//! - **Visual similarity**: Perceptual hashing, SSIM, histogram, and feature matching
8//! - **Audio fingerprinting**: Audio fingerprint comparison and waveform similarity
9//! - **Metadata matching**: Fuzzy metadata comparison for near-duplicates
10//! - **Storage optimization**: Fast SQLite-based indexing for large libraries
11//! - **Reporting**: Comprehensive duplicate reports with similarity scoring
12//!
13//! # Modules
14//!
15//! - [`hash`]: Cryptographic and content-based hashing
16//! - [`visual`]: Visual similarity detection
17//! - [`audio`]: Audio fingerprint comparison
18//! - [`metadata`]: Metadata-based deduplication
19//! - `database`: SQLite-based indexing and lookup
20//! - [`report`]: Duplicate detection reports
21//!
22//! # Example
23//!
24//! ```
25//! use oximedia_dedup::{DuplicateDetector, DetectionStrategy, DedupConfig};
26//!
27//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
28//! let config = DedupConfig::default();
29//! let mut detector = DuplicateDetector::new(config).await?;
30//!
31//! // Add files to the index
32//! detector.add_file("/path/to/video1.mp4").await?;
33//! detector.add_file("/path/to/video2.mp4").await?;
34//!
35//! // Find duplicates
36//! let duplicates = detector.find_duplicates(DetectionStrategy::All).await?;
37//! # Ok(())
38//! # }
39//! ```
40
41#![warn(missing_docs)]
42#![allow(clippy::module_name_repetitions)]
43#![allow(clippy::similar_names)]
44#![allow(clippy::cast_possible_truncation)]
45#![allow(clippy::cast_sign_loss)]
46#![allow(clippy::cast_precision_loss)]
47#![allow(clippy::too_many_arguments)]
48#![allow(dead_code)]
49
// Public submodules of the crate.  `database` is the only one that is
// conditionally compiled: it exists only when the `sqlite` feature is enabled.
pub mod audio;
pub mod bloom_filter;
pub mod cluster;
pub mod content_id;
pub mod content_signature;
pub mod cross_format;
#[cfg(feature = "sqlite")]
pub mod database;
pub mod dedup_cache;
pub mod dedup_index;
pub mod dedup_policy;
pub mod dedup_report;
pub mod dedup_report_ext;
pub mod dedup_stats;
pub mod frame_hash;
pub mod fuzzy_match;
pub mod hash;
pub mod hash_store;
pub mod incremental;
pub mod lsh_index;
pub mod merge_strategy;
pub mod metadata;
pub mod near_duplicate;
pub mod perceptual_hash;
pub mod phash;
pub mod progress;
pub mod report;
pub mod rolling_hash;
pub mod segment_dedup;
pub mod similarity_index;
pub mod video_dedup;
pub mod video_segment_dedup;
pub mod visual;
83
84#[cfg(feature = "sqlite")]
85use std::path::Path;
86use std::path::PathBuf;
87use thiserror::Error;
88
89#[cfg(feature = "sqlite")]
90pub use database::DedupDatabase;
91pub use report::{DuplicateGroup, DuplicateReport, SimilarityScore};
92
93// ---------------------------------------------------------------------------
94// Internal helpers used by the stub implementations
95// ---------------------------------------------------------------------------
96
/// Decode a hexadecimal string into a byte vector.
///
/// Every pair of hex digits (upper- or lowercase) becomes one output byte; an
/// empty string decodes to an empty vector.
///
/// The input is processed as raw bytes rather than by slicing the `&str`, so
/// non-ASCII text is rejected with an error instead of panicking on a UTF-8
/// character boundary.  Only the characters `0-9`, `a-f` and `A-F` are
/// accepted; in particular a sign prefix such as `+f` is treated as invalid.
///
/// # Errors
///
/// Returns `DedupError::Hash` if the string contains non-hex characters or
/// has an odd number of characters (measured in bytes).
#[cfg(feature = "sqlite")]
fn decode_hex_bytes(hex: &str) -> DedupResult<Vec<u8>> {
    /// Map a single ASCII hex digit to its 4-bit value.
    fn nibble(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }

    let raw = hex.as_bytes();
    if raw.len() % 2 != 0 {
        return Err(DedupError::Hash(format!(
            "odd-length hex string: len={}",
            raw.len()
        )));
    }

    // The length is even, so `chunks_exact(2)` covers the whole input and
    // every chunk has exactly two bytes.
    raw.chunks_exact(2)
        .enumerate()
        .map(|(pair_idx, pair)| match (nibble(pair[0]), nibble(pair[1])) {
            (Some(hi), Some(lo)) => Ok((hi << 4) | lo),
            _ => Err(DedupError::Hash(format!(
                "invalid hex byte at {}: expected two hexadecimal digits",
                pair_idx * 2
            ))),
        })
        .collect()
}
119
/// Cosine similarity of two `f64` vectors.
///
/// The result is the dot product divided by the product of the two Euclidean
/// norms.  When either norm is smaller than `f64::EPSILON` the function
/// returns `0.0` instead of dividing.
///
/// If the slices differ in length, the dot product only covers the common
/// prefix (the iterators are zipped) while each norm is taken over the full
/// slice.
#[cfg(feature = "sqlite")]
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    /// Euclidean norm of a vector.
    fn magnitude(v: &[f64]) -> f64 {
        v.iter().map(|c| c * c).sum::<f64>().sqrt()
    }

    let (mag_a, mag_b) = (magnitude(a), magnitude(b));
    let degenerate = mag_a < f64::EPSILON || mag_b < f64::EPSILON;
    if degenerate {
        0.0
    } else {
        let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
        dot / (mag_a * mag_b)
    }
}
133
/// Greedy pairwise grouping of `(path, hash)` items.
///
/// Items are visited in order.  Each item that has not yet been claimed by an
/// earlier group acts as an anchor: every later, unclaimed item whose
/// `dist_fn` distance to the anchor is at most `max_distance` joins the
/// anchor's group and is marked as claimed.  Members are compared against the
/// anchor only, never against each other.
///
/// A group is emitted only when the anchor attracted at least one other item.
/// Its single `SimilarityScore` carries `method` as the label and the highest
/// `sim_fn` value observed between the anchor and any member.
///
/// The function never returns `Err`; the `DedupResult` wrapper matches the
/// signatures of the other duplicate finders.
#[cfg(feature = "sqlite")]
fn group_by_pairwise_similarity<H, FDist, FSim>(
    items: &[(String, H)],
    max_distance: u32,
    dist_fn: FDist,
    sim_fn: FSim,
    method: &str,
) -> DedupResult<Vec<DuplicateGroup>>
where
    FDist: Fn(&H, &H) -> u32,
    FSim: Fn(&H, &H) -> f64,
{
    let mut claimed = vec![false; items.len()];
    let mut found: Vec<DuplicateGroup> = Vec::new();

    for (anchor_idx, (anchor_path, anchor_hash)) in items.iter().enumerate() {
        if claimed[anchor_idx] {
            continue;
        }

        let mut members = vec![anchor_path.clone()];
        let mut top_score = 0.0f64;

        for (other_idx, (other_path, other_hash)) in items.iter().enumerate().skip(anchor_idx + 1)
        {
            // The distance is only evaluated for items not already in a group.
            if claimed[other_idx] || dist_fn(anchor_hash, other_hash) > max_distance {
                continue;
            }

            let similarity = sim_fn(anchor_hash, other_hash);
            members.push(other_path.clone());
            claimed[other_idx] = true;
            if similarity > top_score {
                top_score = similarity;
            }
        }

        // A lone anchor is not a duplicate group.
        if members.len() <= 1 {
            continue;
        }

        claimed[anchor_idx] = true;
        found.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: method.to_string(),
                score: top_score,
                metadata: Vec::new(),
            }],
        });
    }

    Ok(found)
}
191
/// Deduplication error type.
///
/// Variants marked `#[from]` can be produced with the `?` operator from the
/// wrapped error type.  The `Database` variant changes shape with the
/// `sqlite` feature: it wraps `sqlx::Error` when the feature is enabled and a
/// plain message otherwise.
#[derive(Error, Debug)]
pub enum DedupError {
    /// I/O error (converted automatically from `std::io::Error`).
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Database error (converted automatically from `sqlx::Error`).
    #[cfg(feature = "sqlite")]
    #[error("Database error: {0}")]
    Database(#[from] sqlx::Error),

    /// Database error (non-sqlite variant) carrying a textual description.
    #[cfg(not(feature = "sqlite"))]
    #[error("Database error: {0}")]
    Database(String),

    /// Hashing error, e.g. a malformed hex-encoded hash string.
    #[error("Hashing error: {0}")]
    Hash(String),

    /// Visual processing error
    #[error("Visual processing error: {0}")]
    Visual(String),

    /// Audio processing error
    #[error("Audio processing error: {0}")]
    Audio(String),

    /// Metadata processing error
    #[error("Metadata processing error: {0}")]
    Metadata(String),

    /// File not found; carries the path that was looked up.
    #[error("File not found: {0}")]
    FileNotFound(PathBuf),

    /// Invalid configuration
    #[error("Invalid configuration: {0}")]
    InvalidConfig(String),

    /// Core library error (converted automatically from `oximedia_core::OxiError`).
    #[error("OxiMedia core error: {0}")]
    Core(#[from] oximedia_core::OxiError),
}
237
/// Deduplication result type: a `Result` whose error is always [`DedupError`].
pub type DedupResult<T> = Result<T, DedupError>;
240
/// Detection strategy for finding duplicates.
///
/// The first seven variants each select a single detection method.  `All`,
/// `VisualAll` and `Fast` are composites; which methods they enable is
/// answered by the `includes_*` predicates on this type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Exact duplicates only (cryptographic hash)
    ExactHash,

    /// Visual similarity using perceptual hashing
    PerceptualHash,

    /// Visual similarity using SSIM
    Ssim,

    /// Visual similarity using histogram comparison
    Histogram,

    /// Visual similarity using feature matching
    FeatureMatch,

    /// Audio fingerprint comparison
    AudioFingerprint,

    /// Metadata-based matching
    Metadata,

    /// All detection methods (every `includes_*` predicate returns `true`).
    All,

    /// Combination of visual methods: perceptual hash, SSIM, histogram and
    /// feature matching.
    VisualAll,

    /// Combination of fast methods (hash + perceptual + metadata)
    Fast,
}
274
275impl DetectionStrategy {
276    /// Check if strategy includes exact hashing.
277    #[must_use]
278    pub fn includes_hash(self) -> bool {
279        matches!(self, Self::ExactHash | Self::All | Self::Fast)
280    }
281
282    /// Check if strategy includes perceptual hashing.
283    #[must_use]
284    pub fn includes_perceptual(self) -> bool {
285        matches!(
286            self,
287            Self::PerceptualHash | Self::All | Self::VisualAll | Self::Fast
288        )
289    }
290
291    /// Check if strategy includes SSIM.
292    #[must_use]
293    pub fn includes_ssim(self) -> bool {
294        matches!(self, Self::Ssim | Self::All | Self::VisualAll)
295    }
296
297    /// Check if strategy includes histogram.
298    #[must_use]
299    pub fn includes_histogram(self) -> bool {
300        matches!(self, Self::Histogram | Self::All | Self::VisualAll)
301    }
302
303    /// Check if strategy includes feature matching.
304    #[must_use]
305    pub fn includes_feature_match(self) -> bool {
306        matches!(self, Self::FeatureMatch | Self::All | Self::VisualAll)
307    }
308
309    /// Check if strategy includes audio fingerprinting.
310    #[must_use]
311    pub fn includes_audio(self) -> bool {
312        matches!(self, Self::AudioFingerprint | Self::All)
313    }
314
315    /// Check if strategy includes metadata.
316    #[must_use]
317    pub fn includes_metadata(self) -> bool {
318        matches!(self, Self::Metadata | Self::All | Self::Fast)
319    }
320}
321
/// Configuration for deduplication.
///
/// `DedupConfig::default()` supplies the default values quoted on the fields
/// below.
#[derive(Debug, Clone)]
pub struct DedupConfig {
    /// Database path.  Default: `oximedia_dedup.db`.
    pub database_path: PathBuf,

    /// Perceptual hash similarity threshold (0.0-1.0).  Default: 0.95.
    ///
    /// The perceptual duplicate finder converts this into a maximum Hamming
    /// distance over 64 bits: `(1 - threshold) * 64`, truncated to an integer.
    pub perceptual_threshold: f64,

    /// SSIM similarity threshold (0.0-1.0).  Default: 0.90.
    pub ssim_threshold: f64,

    /// Histogram similarity threshold (0.0-1.0).  Default: 0.85.
    pub histogram_threshold: f64,

    /// Feature match threshold (minimum number of matches).  Default: 50.
    pub feature_match_threshold: usize,

    /// Audio fingerprint similarity threshold (0.0-1.0).  Default: 0.90.
    pub audio_threshold: f64,

    /// Metadata similarity threshold (0.0-1.0).  Default: 0.80.
    pub metadata_threshold: f64,

    /// Enable parallel processing.  Default: `true`.
    pub parallel: bool,

    /// Number of frames to sample for video analysis.  Default: 10.
    pub sample_frames: usize,

    /// Chunk size for content-based chunking (bytes).  Default: 4096.
    pub chunk_size: usize,

    /// Thumbnail resolution for SSIM duplicate detection.
    ///
    /// Specifies both width and height of the grayscale thumbnail used for
    /// SSIM comparison.  Must be >= 4.  Default is 8 (i.e. 8x8 = 64 pixels).
    /// Higher values give more accurate SSIM at the cost of storage and CPU.
    ///
    /// The SSIM duplicate finder clamps smaller values up to 4 before use.
    pub thumbnail_resolution: usize,

    /// Enable bloom filter pre-screening before expensive perceptual comparisons.
    ///
    /// When enabled, a bloom filter is used to quickly reject items whose
    /// content hash is already known to be unique, avoiding expensive
    /// pairwise perceptual hash comparisons.  Default: `false`.
    pub bloom_prescreen: bool,

    /// Expected capacity for the bloom filter pre-screener.  Default: 10 000.
    pub bloom_capacity: usize,

    /// False positive rate for the bloom filter pre-screener.  Default: 0.01.
    pub bloom_fpr: f32,

    /// Use LSH acceleration for perceptual hash deduplication.
    ///
    /// When enabled, `find_perceptual_duplicates()` uses a `BitLshIndex`
    /// instead of O(n^2) pairwise comparison.  This provides sub-quadratic
    /// performance for large libraries at the cost of slightly reduced recall.
    /// Default: `true`.
    pub use_lsh: bool,

    /// Number of LSH hash tables (more = better recall, more memory).
    /// Default: 8.
    pub lsh_num_tables: usize,

    /// Bits sampled per LSH table (fewer = more candidates = better recall).
    /// Default: 8.
    pub lsh_bits_per_table: usize,

    /// Deterministic seed for LSH projections.  Default: 42.
    pub lsh_seed: u64,
}
391
392impl Default for DedupConfig {
393    fn default() -> Self {
394        Self {
395            database_path: PathBuf::from("oximedia_dedup.db"),
396            perceptual_threshold: 0.95,
397            ssim_threshold: 0.90,
398            histogram_threshold: 0.85,
399            feature_match_threshold: 50,
400            audio_threshold: 0.90,
401            metadata_threshold: 0.80,
402            parallel: true,
403            sample_frames: 10,
404            chunk_size: 4096,
405            thumbnail_resolution: 8,
406            bloom_prescreen: false,
407            bloom_capacity: 10_000,
408            bloom_fpr: 0.01,
409            use_lsh: true,
410            lsh_num_tables: 8,
411            lsh_bits_per_table: 8,
412            lsh_seed: 42,
413        }
414    }
415}
416
/// Main duplicate detector.
///
/// Owns the configuration, the database handle used as the persistent index
/// and, optionally, an in-memory Bloom filter.  Only available when the
/// `sqlite` feature is enabled.
#[cfg(feature = "sqlite")]
pub struct DuplicateDetector {
    /// Settings supplied to `new`; the duplicate finders read their
    /// thresholds and LSH/bloom parameters from here.
    config: DedupConfig,
    /// Database opened from `config.database_path`; stores file hashes and
    /// fingerprints.
    database: DedupDatabase,
    /// Optional Bloom filter for fast-path duplicate pre-screening.
    ///
    /// Populated when `DedupConfig::bloom_prescreen` is `true`.  Stores
    /// raw BLAKE3 hash bytes of every indexed file so that definitely-unique
    /// files can be rejected without expensive pairwise comparisons.
    bloom: Option<bloom_filter::BloomFilter>,
}
429
430#[cfg(feature = "sqlite")]
431impl DuplicateDetector {
432    /// Create a new duplicate detector.
433    ///
434    /// When `config.bloom_prescreen` is `true`, a `BloomFilter` is
435    /// created using `config.bloom_capacity` and `config.bloom_fpr`.
436    /// Every file indexed via `add_file` or `par_index_files` will
437    /// automatically populate the filter so it can be used for fast-path
438    /// rejection in subsequent duplicate-detection passes.
439    ///
440    /// # Errors
441    ///
442    /// Returns an error if the database cannot be opened or initialized.
443    pub async fn new(config: DedupConfig) -> DedupResult<Self> {
444        let database = DedupDatabase::open(&config.database_path).await?;
445        let bloom = if config.bloom_prescreen {
446            Some(bloom_filter::BloomFilter::new(
447                config.bloom_capacity,
448                config.bloom_fpr,
449            ))
450        } else {
451            None
452        };
453        Ok(Self {
454            config,
455            database,
456            bloom,
457        })
458    }
459
460    /// Add a file to the deduplication index.
461    ///
462    /// If bloom pre-screening is enabled, the file's BLAKE3 hash bytes are
463    /// also inserted into the in-memory Bloom filter so that future
464    /// `might_be_duplicate` calls can provide fast-path rejection.
465    ///
466    /// # Errors
467    ///
468    /// Returns an error if the file cannot be read or processed.
469    pub async fn add_file(&mut self, path: impl AsRef<Path>) -> DedupResult<()> {
470        let path = path.as_ref();
471        if !path.exists() {
472            return Err(DedupError::FileNotFound(path.to_path_buf()));
473        }
474
475        // Compute hash
476        let file_hash = hash::compute_file_hash(path)?;
477
478        // Populate bloom filter (fast-path pre-screener) if enabled.
479        if let Some(ref mut bloom) = self.bloom {
480            bloom.insert(file_hash.as_bytes());
481        }
482
483        // Store in database
484        self.database.insert_file(path, &file_hash.to_hex()).await?;
485
486        Ok(())
487    }
488
489    /// Add multiple files sequentially.
490    ///
491    /// # Errors
492    ///
493    /// Returns an error if any file cannot be processed.
494    pub async fn add_files(&mut self, paths: &[impl AsRef<Path>]) -> DedupResult<Vec<String>> {
495        let mut errors = Vec::new();
496
497        for path in paths {
498            if let Err(e) = self.add_file(path).await {
499                errors.push(format!("{}: {}", path.as_ref().display(), e));
500            }
501        }
502
503        Ok(errors)
504    }
505
506    /// Add multiple files to the index using parallel hash computation.
507    ///
508    /// This method computes file hashes (BLAKE3) in parallel using rayon, then
509    /// merges the results into the database sequentially.  The parallelism
510    /// benefit is greatest for large libraries where hash I/O and computation
511    /// dominate.  Database inserts are performed sequentially afterwards
512    /// because they require exclusive `&mut self` access.
513    ///
514    /// Errors from individual files are collected and returned rather than
515    /// aborting the entire batch.
516    ///
517    /// # Errors
518    ///
519    /// Returns the list of per-file error strings.  An empty `Vec` means all
520    /// files were indexed successfully.
521    pub async fn par_index_files<P>(&mut self, paths: &[P]) -> DedupResult<Vec<String>>
522    where
523        P: AsRef<Path> + Sync,
524    {
525        use rayon::prelude::*;
526
527        // Phase 1: compute hashes in parallel (CPU-intensive, embarrassingly parallel).
528        let hash_results: Vec<(PathBuf, DedupResult<hash::FileHash>)> = paths
529            .par_iter()
530            .map(|p| {
531                let path = p.as_ref().to_path_buf();
532                if !path.exists() {
533                    return (path.clone(), Err(DedupError::FileNotFound(path)));
534                }
535                let result = hash::compute_file_hash(&path);
536                (path, result)
537            })
538            .collect();
539
540        // Phase 2: merge into DB sequentially (requires exclusive &mut self).
541        let mut errors = Vec::new();
542        for (path, result) in hash_results {
543            match result {
544                Ok(file_hash) => {
545                    if let Err(e) = self.database.insert_file(&path, &file_hash.to_hex()).await {
546                        errors.push(format!("{}: {}", path.display(), e));
547                    }
548                }
549                Err(e) => {
550                    errors.push(format!("{}: {}", path.display(), e));
551                }
552            }
553        }
554
555        Ok(errors)
556    }
557
558    /// Find duplicates using the specified strategy.
559    ///
560    /// # Errors
561    ///
562    /// Returns an error if duplicate detection fails.
563    pub async fn find_duplicates(
564        &self,
565        strategy: DetectionStrategy,
566    ) -> DedupResult<DuplicateReport> {
567        self.find_duplicates_with_progress(strategy, &progress::NullReporter)
568            .await
569    }
570
571    /// Find duplicates with progress reporting.
572    ///
573    /// Like `find_duplicates` but emits progress events through the
574    /// supplied [`ProgressReporter`](progress::ProgressReporter).  This is
575    /// the primary integration point for large-library deduplication where
576    /// the caller wants to display a progress bar or support cancellation.
577    ///
578    /// # Errors
579    ///
580    /// Returns an error if duplicate detection fails.
581    pub async fn find_duplicates_with_progress(
582        &self,
583        strategy: DetectionStrategy,
584        reporter: &dyn progress::ProgressReporter,
585    ) -> DedupResult<DuplicateReport> {
586        use progress::{ProgressEvent, ProgressTracker};
587
588        let run_start = std::time::SystemTime::now()
589            .duration_since(std::time::UNIX_EPOCH)
590            .unwrap_or_default()
591            .as_millis() as u64;
592
593        let mut report = DuplicateReport::new();
594
595        // Count phases for total progress.
596        let phase_count = [
597            strategy.includes_hash(),
598            strategy.includes_perceptual(),
599            strategy.includes_ssim(),
600            strategy.includes_histogram(),
601            strategy.includes_feature_match(),
602            strategy.includes_audio(),
603            strategy.includes_metadata(),
604        ]
605        .iter()
606        .filter(|&&b| b)
607        .count();
608
609        let mut completed_phases = 0usize;
610
611        // Exact hash duplicates
612        if strategy.includes_hash() {
613            if reporter.is_cancelled() {
614                return Ok(report);
615            }
616            let mut tracker = ProgressTracker::new(reporter, "exact_hash", 0);
617            let hash_dups = self.find_hash_duplicates().await?;
618            tracker.tick_batch(1);
619            let groups_found = hash_dups.len();
620            report.add_groups(hash_dups);
621            tracker.complete(groups_found);
622            completed_phases += 1;
623        }
624
625        // Perceptual hash duplicates
626        if strategy.includes_perceptual() {
627            if reporter.is_cancelled() {
628                return Ok(report);
629            }
630            let mut tracker = ProgressTracker::new(reporter, "perceptual_hash", 0);
631            let perceptual_dups = self.find_perceptual_duplicates().await?;
632            tracker.tick_batch(1);
633            let groups_found = perceptual_dups.len();
634            report.add_groups(perceptual_dups);
635            tracker.complete(groups_found);
636            completed_phases += 1;
637        }
638
639        // SSIM duplicates
640        if strategy.includes_ssim() {
641            if reporter.is_cancelled() {
642                return Ok(report);
643            }
644            let mut tracker = ProgressTracker::new(reporter, "ssim", 0);
645            let ssim_dups = self.find_ssim_duplicates().await?;
646            tracker.tick_batch(1);
647            let groups_found = ssim_dups.len();
648            report.add_groups(ssim_dups);
649            tracker.complete(groups_found);
650            completed_phases += 1;
651        }
652
653        // Histogram duplicates
654        if strategy.includes_histogram() {
655            if reporter.is_cancelled() {
656                return Ok(report);
657            }
658            let mut tracker = ProgressTracker::new(reporter, "histogram", 0);
659            let histogram_dups = self.find_histogram_duplicates().await?;
660            tracker.tick_batch(1);
661            let groups_found = histogram_dups.len();
662            report.add_groups(histogram_dups);
663            tracker.complete(groups_found);
664            completed_phases += 1;
665        }
666
667        // Feature match duplicates
668        if strategy.includes_feature_match() {
669            if reporter.is_cancelled() {
670                return Ok(report);
671            }
672            let mut tracker = ProgressTracker::new(reporter, "feature_match", 0);
673            let feature_dups = self.find_feature_duplicates().await?;
674            tracker.tick_batch(1);
675            let groups_found = feature_dups.len();
676            report.add_groups(feature_dups);
677            tracker.complete(groups_found);
678            completed_phases += 1;
679        }
680
681        // Audio fingerprint duplicates
682        if strategy.includes_audio() {
683            if reporter.is_cancelled() {
684                return Ok(report);
685            }
686            let mut tracker = ProgressTracker::new(reporter, "audio_fingerprint", 0);
687            let audio_dups = self.find_audio_duplicates().await?;
688            tracker.tick_batch(1);
689            let groups_found = audio_dups.len();
690            report.add_groups(audio_dups);
691            tracker.complete(groups_found);
692            completed_phases += 1;
693        }
694
695        // Metadata duplicates
696        if strategy.includes_metadata() {
697            if reporter.is_cancelled() {
698                return Ok(report);
699            }
700            let mut tracker = ProgressTracker::new(reporter, "metadata", 0);
701            let metadata_dups = self.find_metadata_duplicates().await?;
702            tracker.tick_batch(1);
703            let groups_found = metadata_dups.len();
704            report.add_groups(metadata_dups);
705            tracker.complete(groups_found);
706            completed_phases += 1;
707        }
708
709        // Emit run completed event.
710        let run_end = std::time::SystemTime::now()
711            .duration_since(std::time::UNIX_EPOCH)
712            .unwrap_or_default()
713            .as_millis() as u64;
714
715        reporter.on_event(&ProgressEvent::RunCompleted {
716            total_groups: report.groups.len(),
717            total_elapsed_ms: run_end.saturating_sub(run_start),
718        });
719
720        let _ = (phase_count, completed_phases); // used for bookkeeping
721
722        Ok(report)
723    }
724
725    /// Find exact duplicates by cryptographic hash.
726    async fn find_hash_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
727        let duplicates = self.database.find_duplicate_hashes().await?;
728        let mut groups = Vec::new();
729
730        for (hash, files) in duplicates {
731            if files.len() > 1 {
732                groups.push(DuplicateGroup {
733                    files,
734                    scores: vec![SimilarityScore {
735                        method: "exact_hash".to_string(),
736                        score: 1.0,
737                        metadata: vec![("hash".to_string(), hash)],
738                    }],
739                });
740            }
741        }
742
743        Ok(groups)
744    }
745
746    /// Find perceptual hash duplicates.
747    ///
748    /// When `config.use_lsh` is enabled (the default), uses a
749    /// [`BitLshIndex`](lsh_index::BitLshIndex) for sub-quadratic performance.
750    /// Otherwise falls back to O(n^2) pairwise comparison.
751    ///
752    /// Loads perceptual hashes stored in the `fingerprints` table under the key
753    /// `"phash"`.  Pairs with a Hamming distance below the threshold derived
754    /// from `config.perceptual_threshold` are grouped together.
755    async fn find_perceptual_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
756        // Threshold: perceptual_threshold is 0.0-1.0 similarity.
757        // Hamming distance over 64 bits → distance ≤ (1 - threshold) * 64.
758        let max_hamming = ((1.0 - self.config.perceptual_threshold) * 64.0) as u32;
759
760        // Fetch all stored perceptual hash fingerprints.
761        let stored = self.database.get_all_fingerprints_by_type("phash").await?;
762
763        // Build a list of (path, PerceptualHash) from stored hex strings.
764        let mut hashes: Vec<(String, visual::PerceptualHash)> = Vec::new();
765        for (path, hex) in stored {
766            if let Ok(value) = u64::from_str_radix(&hex, 16) {
767                hashes.push((path, visual::PerceptualHash::new(value, 64)));
768            }
769        }
770
771        // If no stored hashes, nothing to compare.
772        if hashes.len() < 2 {
773            return Ok(Vec::new());
774        }
775
776        // Bloom filter pre-screening: discard definitely-unique perceptual hashes
777        // before any expensive pairwise or LSH comparison.
778        //
779        // Strategy: quantise each 64-bit pHash down to its top 16 bits and run
780        // the items through the shared `prescreen_perceptual_hashes` helper.
781        // Items whose quantised hash has never been seen before are provably
782        // unique (no false negatives in a Bloom filter) and are dropped from
783        // the candidate set.  Remaining items are forwarded to the LSH/pairwise
784        // pass as before.
785        let hashes: Vec<(String, visual::PerceptualHash)> = if self.config.bloom_prescreen {
786            let raw: Vec<u64> = hashes.iter().map(|(_, ph)| ph.hash()).collect();
787            let prescreen = bloom_filter::prescreen_perceptual_hashes(
788                &raw,
789                16, // quantize_bits: top 16 bits capture coarse visual similarity
790                self.config.bloom_capacity,
791                self.config.bloom_fpr,
792            );
793            prescreen
794                .candidates
795                .iter()
796                .filter_map(|&idx| hashes.get(idx).cloned())
797                .collect()
798        } else {
799            hashes
800        };
801
802        // After bloom pre-screening, re-check candidate count.
803        if hashes.len() < 2 {
804            return Ok(Vec::new());
805        }
806
807        if self.config.use_lsh {
808            self.find_perceptual_duplicates_lsh(&hashes, max_hamming)
809        } else {
810            group_by_pairwise_similarity(
811                &hashes,
812                max_hamming,
813                |h1, h2| h1.hamming_distance(h2),
814                |h1, h2| h1.similarity(h2),
815                "perceptual_hash",
816            )
817        }
818    }
819
820    /// LSH-accelerated perceptual hash duplicate detection.
821    ///
822    /// Replaces the O(n^2) pairwise comparison with sub-quadratic LSH
823    /// candidate generation followed by exact Hamming distance verification.
824    fn find_perceptual_duplicates_lsh(
825        &self,
826        hashes: &[(String, visual::PerceptualHash)],
827        max_hamming: u32,
828    ) -> DedupResult<Vec<DuplicateGroup>> {
829        // Build id <-> path mapping.
830        let id_hashes: Vec<(u64, u64)> = hashes
831            .iter()
832            .enumerate()
833            .map(|(i, (_, ph))| (i as u64, ph.hash()))
834            .collect();
835
836        // Run LSH dedup pass.
837        let lsh_result = lsh_index::lsh_dedup_pass(
838            &id_hashes,
839            max_hamming,
840            self.config.lsh_num_tables,
841            self.config.lsh_bits_per_table,
842            self.config.lsh_seed,
843        );
844
845        // Group by transitive closure.
846        let all_ids: Vec<u64> = (0..hashes.len() as u64).collect();
847        let groups = lsh_index::group_by_lsh_pairs(&lsh_result.pairs, &all_ids);
848
849        // Convert back to DuplicateGroup with paths.
850        let mut result = Vec::new();
851        for group_ids in &groups {
852            let files: Vec<String> = group_ids
853                .iter()
854                .filter_map(|&id| hashes.get(id as usize).map(|(p, _)| p.clone()))
855                .collect();
856
857            if files.len() < 2 {
858                continue;
859            }
860
861            // Find best pairwise similarity within the group for scoring.
862            let mut best_sim = 0.0f64;
863            for i in 0..group_ids.len() {
864                for j in (i + 1)..group_ids.len() {
865                    let ia = group_ids[i] as usize;
866                    let ib = group_ids[j] as usize;
867                    if let (Some((_, ha)), Some((_, hb))) = (hashes.get(ia), hashes.get(ib)) {
868                        let sim = ha.similarity(hb);
869                        if sim > best_sim {
870                            best_sim = sim;
871                        }
872                    }
873                }
874            }
875
876            result.push(DuplicateGroup {
877                files,
878                scores: vec![SimilarityScore {
879                    method: "perceptual_hash_lsh".to_string(),
880                    score: best_sim,
881                    metadata: vec![
882                        (
883                            "lsh_candidates".to_string(),
884                            lsh_result.candidates_checked.to_string(),
885                        ),
886                        (
887                            "comparison_ratio".to_string(),
888                            format!("{:.4}", lsh_result.comparison_ratio()),
889                        ),
890                    ],
891                }],
892            });
893        }
894
895        Ok(result)
896    }
897
898    /// Find SSIM duplicates.
899    ///
900    /// Retrieves stored thumbnail pixel data (type `"thumbnail"`) from the
901    /// fingerprints table, reconstructs grayscale `Image` objects, and
902    /// computes the Structural Similarity Index (SSIM) between every unique
903    /// pair.  Pairs with SSIM above `config.ssim_threshold` are grouped.
904    ///
905    /// Thumbnail resolution is controlled by `config.thumbnail_resolution`.
906    async fn find_ssim_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
907        let threshold = self.config.ssim_threshold;
908        let res = self.config.thumbnail_resolution.max(4);
909        let expected_bytes = res * res;
910
911        // Thumbnail images are stored hex-encoded in the fingerprints table.
912        let stored = self
913            .database
914            .get_all_fingerprints_by_type("thumbnail")
915            .await?;
916
917        // Decode hex → bytes → Image (configurable resolution, grayscale).
918        let mut images: Vec<(String, visual::Image)> = Vec::new();
919        for (path, hex) in stored {
920            let bytes = decode_hex_bytes(&hex)?;
921            // Accept thumbnails matching the configured resolution.
922            if bytes.len() == expected_bytes {
923                if let Ok(img) = visual::Image::from_data(res, res, 1, bytes) {
924                    images.push((path, img));
925                }
926            }
927        }
928
929        if images.len() < 2 {
930            return Ok(Vec::new());
931        }
932
933        let ssim_params = visual::SsimParams::default();
934        let mut groups: Vec<DuplicateGroup> = Vec::new();
935        let mut assigned = vec![false; images.len()];
936
937        for i in 0..images.len() {
938            if assigned[i] {
939                continue;
940            }
941            let mut group_files = vec![images[i].0.clone()];
942            let mut best_score = 0.0f64;
943
944            for j in (i + 1)..images.len() {
945                if assigned[j] {
946                    continue;
947                }
948                let ssim = visual::compute_ssim(&images[i].1, &images[j].1, &ssim_params);
949                if ssim >= threshold {
950                    group_files.push(images[j].0.clone());
951                    assigned[j] = true;
952                    if ssim > best_score {
953                        best_score = ssim;
954                    }
955                }
956            }
957
958            if group_files.len() > 1 {
959                assigned[i] = true;
960                groups.push(DuplicateGroup {
961                    files: group_files,
962                    scores: vec![SimilarityScore {
963                        method: "ssim".to_string(),
964                        score: best_score,
965                        metadata: Vec::new(),
966                    }],
967                });
968            }
969        }
970
971        Ok(groups)
972    }
973
974    /// Find histogram duplicates.
975    ///
976    /// Loads stored colour histogram fingerprints (type `"histogram"`) from
977    /// the database.  The data is a JSON-encoded flat array of `u32` bin
978    /// counts (three channels × 256 bins = 768 values).  Histogram
979    /// correlation is computed between every pair; pairs above
980    /// `config.histogram_threshold` are grouped.
981    async fn find_histogram_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
982        let threshold = self.config.histogram_threshold;
983
984        let stored = self
985            .database
986            .get_all_fingerprints_by_type("histogram")
987            .await?;
988
989        // Decode stored JSON histogram data → Vec<Vec<u32>>.
990        let mut histograms: Vec<(String, Vec<Vec<u32>>)> = Vec::new();
991        for (path, json_str) in stored {
992            if let Ok(flat) = serde_json::from_str::<Vec<u32>>(&json_str) {
993                // Each channel has 256 bins; infer channel count.
994                if flat.len() % 256 == 0 && !flat.is_empty() {
995                    let channels = flat.len() / 256;
996                    let hist: Vec<Vec<u32>> = (0..channels)
997                        .map(|c| flat[c * 256..(c + 1) * 256].to_vec())
998                        .collect();
999                    histograms.push((path, hist));
1000                }
1001            }
1002        }
1003
1004        if histograms.len() < 2 {
1005            return Ok(Vec::new());
1006        }
1007
1008        let mut groups: Vec<DuplicateGroup> = Vec::new();
1009        let mut assigned = vec![false; histograms.len()];
1010
1011        for i in 0..histograms.len() {
1012            if assigned[i] {
1013                continue;
1014            }
1015            let mut group_files = vec![histograms[i].0.clone()];
1016            let mut best_score = 0.0f64;
1017
1018            for j in (i + 1)..histograms.len() {
1019                if assigned[j] {
1020                    continue;
1021                }
1022                let corr = visual::compare_histograms(&histograms[i].1, &histograms[j].1);
1023                if corr >= threshold {
1024                    group_files.push(histograms[j].0.clone());
1025                    assigned[j] = true;
1026                    if corr > best_score {
1027                        best_score = corr;
1028                    }
1029                }
1030            }
1031
1032            if group_files.len() > 1 {
1033                assigned[i] = true;
1034                groups.push(DuplicateGroup {
1035                    files: group_files,
1036                    scores: vec![SimilarityScore {
1037                        method: "histogram".to_string(),
1038                        score: best_score,
1039                        metadata: Vec::new(),
1040                    }],
1041                });
1042            }
1043        }
1044
1045        Ok(groups)
1046    }
1047
1048    /// Find feature match duplicates.
1049    ///
1050    /// Loads stored feature-vector fingerprints (type `"feature_vector"`) from
1051    /// the database.  Each feature vector is a JSON-encoded `Vec<f64>`.
1052    /// Cosine similarity is computed between every pair; pairs whose cosine
1053    /// similarity exceeds `config.perceptual_threshold` (reused as a generic
1054    /// visual similarity threshold) are grouped.
1055    async fn find_feature_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1056        let threshold = self.config.perceptual_threshold;
1057
1058        let stored = self
1059            .database
1060            .get_all_fingerprints_by_type("feature_vector")
1061            .await?;
1062
1063        // Decode JSON feature vectors.
1064        let mut vectors: Vec<(String, Vec<f64>)> = Vec::new();
1065        for (path, json_str) in stored {
1066            if let Ok(vec) = serde_json::from_str::<Vec<f64>>(&json_str) {
1067                if !vec.is_empty() {
1068                    vectors.push((path, vec));
1069                }
1070            }
1071        }
1072
1073        if vectors.len() < 2 {
1074            return Ok(Vec::new());
1075        }
1076
1077        let mut groups: Vec<DuplicateGroup> = Vec::new();
1078        let mut assigned = vec![false; vectors.len()];
1079
1080        for i in 0..vectors.len() {
1081            if assigned[i] {
1082                continue;
1083            }
1084            let mut group_files = vec![vectors[i].0.clone()];
1085            let mut best_score = 0.0f64;
1086
1087            for j in (i + 1)..vectors.len() {
1088                if assigned[j] {
1089                    continue;
1090                }
1091                let sim = cosine_similarity(&vectors[i].1, &vectors[j].1);
1092                if sim >= threshold {
1093                    group_files.push(vectors[j].0.clone());
1094                    assigned[j] = true;
1095                    if sim > best_score {
1096                        best_score = sim;
1097                    }
1098                }
1099            }
1100
1101            if group_files.len() > 1 {
1102                assigned[i] = true;
1103                groups.push(DuplicateGroup {
1104                    files: group_files,
1105                    scores: vec![SimilarityScore {
1106                        method: "feature_vector".to_string(),
1107                        score: best_score,
1108                        metadata: Vec::new(),
1109                    }],
1110                });
1111            }
1112        }
1113
1114        Ok(groups)
1115    }
1116
1117    /// Find audio fingerprint duplicates.
1118    ///
1119    /// Loads stored audio fingerprint data (type `"audio_fingerprint"`) from
1120    /// the database.  Each fingerprint is stored as a hex string of bytes.
1121    /// Pairs whose bit-level Hamming distance is within the threshold derived
1122    /// from `config.audio_threshold` are grouped together.
1123    async fn find_audio_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1124        let threshold = self.config.audio_threshold;
1125
1126        let stored = self
1127            .database
1128            .get_all_fingerprints_by_type("audio_fingerprint")
1129            .await?;
1130
1131        // Decode hex fingerprints → AudioFingerprint.
1132        let mut fingerprints: Vec<(String, audio::AudioFingerprint)> = Vec::new();
1133        for (path, hex) in stored {
1134            let bytes = decode_hex_bytes(&hex)?;
1135            if !bytes.is_empty() {
1136                fingerprints.push((path, audio::AudioFingerprint::new(bytes, 11025, 0.0)));
1137            }
1138        }
1139
1140        if fingerprints.len() < 2 {
1141            return Ok(Vec::new());
1142        }
1143
1144        let mut groups: Vec<DuplicateGroup> = Vec::new();
1145        let mut assigned = vec![false; fingerprints.len()];
1146
1147        for i in 0..fingerprints.len() {
1148            if assigned[i] {
1149                continue;
1150            }
1151            let mut group_files = vec![fingerprints[i].0.clone()];
1152            let mut best_score = 0.0f64;
1153
1154            for j in (i + 1)..fingerprints.len() {
1155                if assigned[j] {
1156                    continue;
1157                }
1158                let sim = fingerprints[i].1.similarity(&fingerprints[j].1);
1159                if sim >= threshold {
1160                    group_files.push(fingerprints[j].0.clone());
1161                    assigned[j] = true;
1162                    if sim > best_score {
1163                        best_score = sim;
1164                    }
1165                }
1166            }
1167
1168            if group_files.len() > 1 {
1169                assigned[i] = true;
1170                groups.push(DuplicateGroup {
1171                    files: group_files,
1172                    scores: vec![SimilarityScore {
1173                        method: "audio_fingerprint".to_string(),
1174                        score: best_score,
1175                        metadata: Vec::new(),
1176                    }],
1177                });
1178            }
1179        }
1180
1181        Ok(groups)
1182    }
1183
1184    /// Find metadata duplicates.
1185    ///
1186    /// Fetches all files with their stored metadata from the database and
1187    /// compares every unique pair using `metadata::compare_metadata`.  The
1188    /// key signals for a "near-duplicate" are:
1189    ///
1190    /// - Duration within ±1 second of each other.
1191    /// - Same video resolution (or both without resolution data).
1192    /// - Same video and audio codec.
1193    ///
1194    /// The overall weighted metadata similarity must exceed
1195    /// `config.metadata_threshold`.
1196    async fn find_metadata_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1197        use metadata::{compare_metadata, MediaMetadata};
1198        use std::path::PathBuf;
1199
1200        let threshold = self.config.metadata_threshold;
1201
1202        let rows = self.database.get_all_files_with_metadata().await?;
1203
1204        if rows.len() < 2 {
1205            return Ok(Vec::new());
1206        }
1207
1208        // Reconstruct MediaMetadata objects from the DB rows.
1209        let media_meta: Vec<MediaMetadata> = rows
1210            .iter()
1211            .map(
1212                |(path, duration, width, height, video_codec, audio_codec, container)| {
1213                    let fs_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1214                    let mut m = MediaMetadata::new(PathBuf::from(path), fs_size);
1215                    m.duration = *duration;
1216                    m.width = width.map(|v| v as u32);
1217                    m.height = height.map(|v| v as u32);
1218                    m.video_codec = video_codec.clone();
1219                    m.audio_codec = audio_codec.clone();
1220                    m.container = container.clone();
1221                    m
1222                },
1223            )
1224            .collect();
1225
1226        let paths: Vec<String> = rows.iter().map(|(p, ..)| p.clone()).collect();
1227
1228        let mut groups: Vec<DuplicateGroup> = Vec::new();
1229        let mut assigned = vec![false; media_meta.len()];
1230
1231        for i in 0..media_meta.len() {
1232            if assigned[i] {
1233                continue;
1234            }
1235            let mut group_files = vec![paths[i].clone()];
1236            let mut best_score = 0.0f64;
1237            let mut best_duration_diff: Option<f64> = None;
1238
1239            for j in (i + 1)..media_meta.len() {
1240                if assigned[j] {
1241                    continue;
1242                }
1243
1244                // Fast pre-filter: duration must match within ±1 second
1245                // when both files have duration information stored.
1246                let duration_ok = match (media_meta[i].duration, media_meta[j].duration) {
1247                    (Some(d1), Some(d2)) => (d1 - d2).abs() <= 1.0,
1248                    _ => true, // No duration data → don't discard
1249                };
1250                if !duration_ok {
1251                    continue;
1252                }
1253
1254                let sim = compare_metadata(&media_meta[i], &media_meta[j]);
1255                let score = sim.overall_score();
1256                if score >= threshold {
1257                    group_files.push(paths[j].clone());
1258                    assigned[j] = true;
1259                    if score > best_score {
1260                        best_score = score;
1261                        best_duration_diff = match (media_meta[i].duration, media_meta[j].duration)
1262                        {
1263                            (Some(d1), Some(d2)) => Some((d1 - d2).abs()),
1264                            _ => None,
1265                        };
1266                    }
1267                }
1268            }
1269
1270            if group_files.len() > 1 {
1271                assigned[i] = true;
1272                let mut score_entry = SimilarityScore {
1273                    method: "metadata".to_string(),
1274                    score: best_score,
1275                    metadata: Vec::new(),
1276                };
1277                if let Some(diff) = best_duration_diff {
1278                    score_entry
1279                        .metadata
1280                        .push(("duration_diff_secs".to_string(), format!("{diff:.3}")));
1281                }
1282                groups.push(DuplicateGroup {
1283                    files: group_files,
1284                    scores: vec![score_entry],
1285                });
1286            }
1287        }
1288
1289        Ok(groups)
1290    }
1291
1292    /// Get database statistics.
1293    ///
1294    /// # Errors
1295    ///
1296    /// Returns an error if database query fails.
1297    pub async fn get_stats(&self) -> DedupResult<DedupStats> {
1298        let total_files = self.database.count_files().await?;
1299        let total_hashes = self.database.count_unique_hashes().await?;
1300
1301        Ok(DedupStats {
1302            total_files,
1303            total_hashes,
1304            duplicate_files: total_files.saturating_sub(total_hashes),
1305        })
1306    }
1307
1308    /// Close the database.
1309    pub async fn close(self) -> DedupResult<()> {
1310        self.database.close().await?;
1311        Ok(())
1312    }
1313
1314    /// Fast-path bloom filter check: does this hash *possibly* exist in the index?
1315    ///
1316    /// Returns `true` if the Bloom filter reports the hash *might* be a
1317    /// duplicate (i.e., the same bytes were inserted previously).  Returns
1318    /// `false` only if the hash is **definitely** not present — meaning the
1319    /// file is provably unique and expensive pairwise comparisons can be
1320    /// skipped entirely.
1321    ///
1322    /// When bloom pre-screening is disabled (`config.bloom_prescreen == false`)
1323    /// this always returns `true` so callers always fall through to the full
1324    /// comparison path.
1325    #[must_use]
1326    pub fn might_be_duplicate(&self, hash_bytes: &[u8]) -> bool {
1327        match &self.bloom {
1328            Some(bloom) => bloom.contains(hash_bytes),
1329            None => true,
1330        }
1331    }
1332
1333    /// Reset the in-memory bloom filter without touching the database.
1334    ///
1335    /// Useful after a bulk-index session to free the bloom filter's memory,
1336    /// or to rebuild it from scratch with a different capacity.  The database
1337    /// index is not affected.
1338    pub fn reset_bloom(&mut self) {
1339        if let Some(ref mut bloom) = self.bloom {
1340            bloom.clear();
1341        }
1342    }
1343}
1344
/// Deduplication statistics.
///
/// Plain counters describing the index.  `Default` yields all-zero stats and
/// the type supports equality comparison, so results can be asserted on
/// directly in tests and callers.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DedupStats {
    /// Total number of indexed files
    pub total_files: usize,

    /// Total number of unique hashes
    pub total_hashes: usize,

    /// Number of duplicate files (indexed files beyond the unique hashes)
    pub duplicate_files: usize,
}
1357
#[cfg(test)]
mod tests {
    use super::*;

    /// Sub-second nanoseconds of the current wall-clock time, used as a
    /// cheap suffix to keep scratch database file names apart.
    #[cfg(feature = "sqlite")]
    fn nanos_suffix() -> u32 {
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .subsec_nanos()
    }

    #[test]
    fn test_detection_strategy() {
        let exact = DetectionStrategy::ExactHash;
        assert!(exact.includes_hash());
        assert!(!exact.includes_perceptual());

        let all = DetectionStrategy::All;
        assert!(all.includes_hash());
        assert!(all.includes_perceptual());
        assert!(all.includes_audio());

        let fast = DetectionStrategy::Fast;
        assert!(fast.includes_hash());
        assert!(fast.includes_perceptual());
        assert!(!fast.includes_ssim());
    }

    #[test]
    fn test_config_default() {
        let defaults = DedupConfig::default();
        assert_eq!(defaults.perceptual_threshold, 0.95);
        assert_eq!(defaults.ssim_threshold, 0.90);
        assert!(defaults.parallel);
    }

    #[test]
    fn test_config_lsh_defaults() {
        let defaults = DedupConfig::default();
        assert!(defaults.use_lsh);
        assert_eq!(defaults.lsh_num_tables, 8);
        assert_eq!(defaults.lsh_bits_per_table, 8);
        assert_eq!(defaults.lsh_seed, 42);
    }

    #[test]
    fn test_config_bloom_defaults() {
        let defaults = DedupConfig::default();
        // Pre-screening is opt-in, but capacity and false-positive rate are preset.
        assert!(!defaults.bloom_prescreen);
        assert_eq!(defaults.bloom_capacity, 10_000);
        assert!((defaults.bloom_fpr - 0.01f32).abs() < f32::EPSILON);
    }

    /// Smoke test: `par_index_files` accepts an empty slice without
    /// panicking and reports no per-file errors.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_empty_slice() {
        use std::path::PathBuf;

        let db_path =
            std::env::temp_dir().join(format!("oxidedup_test_par_{}.db", nanos_suffix()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };

        // Best effort: the body is skipped when the detector cannot be created.
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let no_paths: &[PathBuf] = &[];
            let errors = detector
                .par_index_files(no_paths)
                .await
                .expect("par_index_files should succeed on empty input");
            assert!(errors.is_empty(), "No errors expected for empty input");
            let _ = detector.close().await;
        }

        let _ = std::fs::remove_file(&db_path);
    }

    /// `par_index_files` returns one error per non-existent path instead of panicking.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_nonexistent_paths() {
        use std::path::PathBuf;

        let db_path =
            std::env::temp_dir().join(format!("oxidedup_test_par_ne_{}.db", nanos_suffix()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };

        // Best effort: the body is skipped when the detector cannot be created.
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let missing = vec![
                PathBuf::from("/nonexistent/path/a.mp4"),
                PathBuf::from("/nonexistent/path/b.mp4"),
            ];
            let errors = detector
                .par_index_files(&missing)
                .await
                .expect("par_index_files should return Ok even when files are missing");
            assert_eq!(errors.len(), 2, "Should have one error per missing file");
            let _ = detector.close().await;
        }

        let _ = std::fs::remove_file(&db_path);
    }

    // ---- Bloom filter wiring tests ----

    /// With `bloom_prescreen` off (the default), `might_be_duplicate` is always true.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_no_bloom_always_true() {
        let db_path =
            std::env::temp_dir().join(format!("oxidedup_bloom_noscreen_{}.db", nanos_suffix()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: false,
            ..DedupConfig::default()
        };

        // Best effort: the body is skipped when the detector cannot be created.
        if let Ok(detector) = DuplicateDetector::new(config).await {
            // No bloom filter means every hash is a "maybe duplicate".
            assert!(
                detector.might_be_duplicate(b"some_hash_bytes"),
                "Should always return true when bloom is disabled"
            );
            assert!(
                detector.might_be_duplicate(b""),
                "Empty bytes: should return true without bloom"
            );
            let _ = detector.close().await;
        }

        let _ = std::fs::remove_file(&db_path);
    }

    /// With `bloom_prescreen` on, a never-inserted hash yields false.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_with_bloom_unknown_hash() {
        let db_path =
            std::env::temp_dir().join(format!("oxidedup_bloom_unknown_{}.db", nanos_suffix()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };

        // Best effort: the body is skipped when the detector cannot be created.
        if let Ok(detector) = DuplicateDetector::new(config).await {
            // The filter of a fresh detector is empty, so nothing can match yet.
            assert!(
                !detector.might_be_duplicate(b"never_inserted_hash"),
                "Unknown hash should return false from a fresh bloom filter"
            );
            let _ = detector.close().await;
        }

        let _ = std::fs::remove_file(&db_path);
    }

    /// `reset_bloom` empties the filter, so a previously inserted hash is no longer found.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_reset_bloom_clears_state() {
        let db_path =
            std::env::temp_dir().join(format!("oxidedup_bloom_reset_{}.db", nanos_suffix()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };

        // Best effort: the body is skipped when the detector cannot be created.
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            // Seed the filter directly with known bytes.
            if let Some(filter) = detector.bloom.as_mut() {
                filter.insert(b"known_hash");
            }
            // The seeded hash must now be reported as a potential duplicate.
            assert!(
                detector.might_be_duplicate(b"known_hash"),
                "After insert, bloom should report potential duplicate"
            );
            // Once reset, the same hash must be gone again.
            detector.reset_bloom();
            assert!(
                !detector.might_be_duplicate(b"known_hash"),
                "After reset_bloom, hash should not be found"
            );
            let _ = detector.close().await;
        }

        let _ = std::fs::remove_file(&db_path);
    }
}