Skip to main content

oximedia_dedup/
lib.rs

1//! Media deduplication and duplicate detection for `OxiMedia`.
2//!
3//! `oximedia-dedup` provides comprehensive duplicate detection and media deduplication
4//! for the `OxiMedia` multimedia framework. This includes:
5//!
6//! - **Cryptographic hashing**: BLAKE3-based exact duplicate detection
7//! - **Visual similarity**: Perceptual hashing, SSIM, histogram, and feature matching
8//! - **Audio fingerprinting**: Audio fingerprint comparison and waveform similarity
9//! - **Metadata matching**: Fuzzy metadata comparison for near-duplicates
10//! - **Storage optimization**: Fast SQLite-based indexing for large libraries
11//! - **Reporting**: Comprehensive duplicate reports with similarity scoring
12//!
13//! # Modules
14//!
15//! - [`hash`]: Cryptographic and content-based hashing
16//! - [`visual`]: Visual similarity detection
17//! - [`audio`]: Audio fingerprint comparison
18//! - [`metadata`]: Metadata-based deduplication
19//! - `database`: SQLite-based indexing and lookup
20//! - [`report`]: Duplicate detection reports
21//!
22//! # Example
23//!
24//! ```
25//! use oximedia_dedup::{DuplicateDetector, DetectionStrategy, DedupConfig};
26//!
27//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
28//! let config = DedupConfig::default();
29//! let mut detector = DuplicateDetector::new(config).await?;
30//!
31//! // Add files to the index
32//! detector.add_file("/path/to/video1.mp4").await?;
33//! detector.add_file("/path/to/video2.mp4").await?;
34//!
35//! // Find duplicates
36//! let duplicates = detector.find_duplicates(DetectionStrategy::All).await?;
37//! # Ok(())
38//! # }
39//! ```
40//!
41//! ## Strategy Selection Guide
42//!
43//! | Strategy | Speed | Precision | Use case |
44//! |---|---|---|---|
45//! | `ExactHash` | Very fast | Perfect (no false positives) | Bit-for-bit identical files — best first pass for any library |
46//! | `Fast` | Fast | High | Quick scan: hash + perceptual + metadata; good default for large libraries |
47//! | `PerceptualHash` | Fast | Good | Visually identical images/frames that were re-encoded or lightly cropped |
48//! | `Histogram` | Fast | Moderate | Color-similar frames regardless of spatial layout |
49//! | `AudioFingerprint` | Moderate | High | Same audio in different codecs or with minor edits |
50//! | `Metadata` | Fast | Low–moderate | Likely duplicates with same duration/resolution (combine with visual pass) |
51//! | `Ssim` | Slow | Very high | Near-identical video frames where pHash gives too many false positives |
52//! | `FeatureMatch` | Slow | High | Cropped, rotated, or partially occluded duplicates |
53//! | `VisualAll` | Slow | Very high | Combined visual pipeline: pHash + Histogram + SSIM + FeatureMatch |
54//! | `All` | Very slow | Maximum | Full pipeline — all methods; use only for final authoritative scan |
55//!
56//! **Recommended workflow:**
57//! 1. Run `ExactHash` first to catch perfect duplicates cheaply.
58//! 2. Run `Fast` for a broad near-duplicate sweep.
59//! 3. Run `VisualAll` or `All` for a precision clean-up pass on the remainder.
60//!
61//! ## Detection Method Trade-offs
62//!
63//! | Method | Accuracy | CPU cost | Memory | False-positive risk | Notes |
64//! |---|---|---|---|---|---|
65//! | BLAKE3 hash | 100% | Very low | O(1) | None | Misses re-encoded or edited copies |
66//! | dHash (8×8) | High | Very low | O(1) | Low | Robust to resize; sensitive to crops |
67//! | pHash (DCT) | High | Low | O(1) | Low–medium | Better than dHash for brightness shifts |
68//! | wHash (wavelet) | High | Low | O(1) | Low | Most robust to combined transforms |
69//! | SSIM | Very high | High | O(WH) | Very low | Pixel-accurate; slow for large images |
70//! | Histogram | Moderate | Low | O(256) | Medium | Colour match only; ignores structure |
71//! | FeatureMatch | High | Very high | O(N×D) | Low | Works on crops/rotations; expensive |
72//! | AudioFingerprint | High | Moderate | O(T) | Low | Spectral-peak based; codec-agnostic |
73//! | Metadata | Low–moderate | Very low | O(1) | High | Use only as a pre-filter |
74//!
75//! **Bloom-filter pre-screening** (`DedupConfig::bloom_prescreen = true`) reduces
76//! the number of pairwise comparisons by rejecting definitely-unique items before
77//! the expensive perceptual-hash phase.  Recommended for libraries with > 10 K files.
78//!
79//! **LSH acceleration** (`DedupConfig::use_lsh = true`, default) replaces O(n²)
80//! pairwise perceptual-hash comparison with sub-quadratic approximate nearest-
81//! neighbour lookup via `BitLshIndex`.  Adjust `lsh_num_tables` (more tables →
82//! better recall, more memory) and `lsh_bits_per_table` (fewer bits → more
83//! candidates → better recall at higher CPU cost).
84
85#![warn(missing_docs)]
86#![allow(clippy::module_name_repetitions)]
87#![allow(clippy::similar_names)]
88#![allow(clippy::cast_possible_truncation)]
89#![allow(clippy::cast_sign_loss)]
90#![allow(clippy::cast_precision_loss)]
91#![allow(clippy::too_many_arguments)]
92#![allow(dead_code)]
93
94pub mod audio;
95pub mod audio_fingerprint;
96pub mod bloom_filter;
97pub mod bloom_prescreen;
98pub mod chromagram;
99pub mod cluster;
100pub mod content_id;
101pub mod content_signature;
102pub mod cross_format;
103#[cfg(feature = "sqlite")]
104pub mod database;
105pub mod dedup_cache;
106pub mod dedup_index;
107pub mod dedup_policy;
108pub mod dedup_queue;
109pub mod dedup_report;
110pub mod dedup_report_detailed;
111pub mod dedup_report_ext;
112pub mod dedup_stats;
113pub mod exact_match;
114pub mod frame_hash;
115pub mod fuzzy_match;
116pub mod hash;
117pub mod hash_store;
118pub mod hierarchical;
119pub mod incremental;
120pub mod lsh_index;
121pub mod merge_strategy;
122pub mod metadata;
123pub mod minhash;
124pub mod near_duplicate;
125pub mod near_duplicate_cluster;
126pub mod network_dedup;
127pub mod parallel_indexer;
128pub mod perceptual_hash;
129pub mod persistent_cache;
130pub mod phash;
131pub mod progress;
132pub mod report;
133pub mod rolling_hash;
134pub mod segment_dedup;
135pub mod signature_store;
136pub mod similarity_index;
137pub mod space_savings;
138pub mod stream_dedup;
139pub mod video_dedup;
140pub mod video_dedup_pipeline;
141pub mod video_segment_dedup;
142pub mod visual;
143
144#[cfg(feature = "sqlite")]
145use std::path::Path;
146use std::path::PathBuf;
147use thiserror::Error;
148
149#[cfg(feature = "sqlite")]
150pub use database::DedupDatabase;
151pub use merge_strategy::{AppliedAction, MergeExecutor, MergeReport};
152pub use report::{DuplicateGroup, DuplicateReport, SimilarityScore};
153
154// ---------------------------------------------------------------------------
155// Internal helpers used by the stub implementations
156// ---------------------------------------------------------------------------
157
/// Decode a hexadecimal string into a byte vector.
///
/// Every pair of hex digits becomes one output byte (high nibble first).
/// Lowercase and uppercase digits are both accepted.
///
/// The input is decoded byte-by-byte instead of through `str` slicing and
/// `u8::from_str_radix`, for two reasons:
///
/// - slicing a `str` at a fixed two-byte stride panics when a multi-byte
///   UTF-8 character straddles a pair boundary; malformed input must yield
///   an error, never a panic;
/// - `from_str_radix` tolerates a leading `+` sign, so a pair such as `"+f"`
///   would silently decode to `0x0f`.
///
/// # Errors
///
/// Returns `DedupError::Hash` if the string contains non-hex characters or
/// has an odd length (measured in bytes).
#[cfg(feature = "sqlite")]
fn decode_hex_bytes(hex: &str) -> DedupResult<Vec<u8>> {
    /// Numeric value of one ASCII hex digit, or `None` for any other byte.
    fn nibble(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    if hex.len() % 2 != 0 {
        return Err(DedupError::Hash(format!(
            "odd-length hex string: len={}",
            hex.len()
        )));
    }

    hex.as_bytes()
        .chunks_exact(2)
        .enumerate()
        .map(|(pair_idx, pair)| match (nibble(pair[0]), nibble(pair[1])) {
            (Some(high), Some(low)) => Ok((high << 4) | low),
            _ => {
                // Report the byte offset of the offending pair, matching the
                // wording the previous `from_str_radix`-based code produced.
                let offset = pair_idx * 2;
                Err(DedupError::Hash(format!(
                    "invalid hex byte at {offset}: invalid digit found in string"
                )))
            }
        })
        .collect()
}
180
181/// Compute the cosine similarity between two f64 slices.
182///
183/// Returns a value in [−1, 1] or 0.0 when either vector is zero-magnitude.
184#[cfg(feature = "sqlite")]
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    // Euclidean (L2) length of a vector.
    let norm = |v: &[f64]| v.iter().map(|c| c * c).sum::<f64>().sqrt();

    let (norm_a, norm_b) = (norm(a), norm(b));

    // A (near-)zero-length vector has no direction; report "no similarity"
    // rather than dividing by zero.
    if norm_a < f64::EPSILON || norm_b < f64::EPSILON {
        return 0.0;
    }

    // `zip` stops at the shorter slice, so only the shared prefix
    // contributes to the inner product.
    let inner_product = a.iter().zip(b).map(|(lhs, rhs)| lhs * rhs).sum::<f64>();
    inner_product / (norm_a * norm_b)
}
194
/// Greedy pairwise grouping of `(path, hash)` items.
///
/// Items are visited in order. Each item that has not yet been absorbed into
/// a group acts as an *anchor*: every later, still-unclaimed item whose
/// `dist_fn` distance to the anchor is at most `max_distance` joins the
/// anchor's group. Comparison is always against the anchor, never against
/// other members, so groups are not transitive closures. Because a claimed
/// item is never reconsidered, the returned groups do not overlap.
///
/// Each group carries one `SimilarityScore` labelled `method`; its score is
/// the highest `sim_fn` value observed between the anchor and any member
/// (starting from `0.0`). Anchors that attract no members produce no group.
#[cfg(feature = "sqlite")]
fn group_by_pairwise_similarity<H, FDist, FSim>(
    items: &[(String, H)],
    max_distance: u32,
    dist_fn: FDist,
    sim_fn: FSim,
    method: &str,
) -> DedupResult<Vec<DuplicateGroup>>
where
    FDist: Fn(&H, &H) -> u32,
    FSim: Fn(&H, &H) -> f64,
{
    let mut claimed = vec![false; items.len()];
    let mut found: Vec<DuplicateGroup> = Vec::new();

    for (anchor_idx, (anchor_path, anchor_hash)) in items.iter().enumerate() {
        if claimed[anchor_idx] {
            continue;
        }

        let first_candidate = anchor_idx + 1;
        let mut members = vec![anchor_path.clone()];
        let mut top_score = 0.0f64;

        for (offset, (path, hash)) in items[first_candidate..].iter().enumerate() {
            let candidate_idx = first_candidate + offset;
            // Short-circuit keeps `dist_fn` from running on claimed items.
            if claimed[candidate_idx] || dist_fn(anchor_hash, hash) > max_distance {
                continue;
            }

            let similarity = sim_fn(anchor_hash, hash);
            members.push(path.clone());
            claimed[candidate_idx] = true;
            if similarity > top_score {
                top_score = similarity;
            }
        }

        // A lone anchor is not a duplicate group.
        if members.len() <= 1 {
            continue;
        }

        claimed[anchor_idx] = true;
        found.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: method.to_string(),
                score: top_score,
                metadata: Vec::new(),
            }],
        });
    }

    Ok(found)
}
252
/// Deduplication error type.
///
/// Every fallible operation in this crate reports failures through this
/// enum (see [`DedupResult`]).  `Display` messages are generated by
/// `thiserror` from the `#[error(...)]` attributes below.
#[derive(Error, Debug)]
pub enum DedupError {
    /// I/O error.
    ///
    /// Converted automatically from [`std::io::Error`] via `?`.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Database error.
    ///
    /// Only present when the `sqlite` feature is enabled; wraps the
    /// underlying `sqlx::Error` and converts from it automatically via `?`.
    #[cfg(feature = "sqlite")]
    #[error("Database error: {0}")]
    Database(#[from] sqlx::Error),

    /// Database error (non-sqlite variant).
    ///
    /// Used when the `sqlite` feature is disabled; carries a plain message
    /// so the variant name stays available in both configurations.
    #[cfg(not(feature = "sqlite"))]
    #[error("Database error: {0}")]
    Database(String),

    /// Hashing error, with a human-readable description.
    #[error("Hashing error: {0}")]
    Hash(String),

    /// Visual processing error, with a human-readable description.
    #[error("Visual processing error: {0}")]
    Visual(String),

    /// Audio processing error, with a human-readable description.
    #[error("Audio processing error: {0}")]
    Audio(String),

    /// Metadata processing error, with a human-readable description.
    #[error("Metadata processing error: {0}")]
    Metadata(String),

    /// File not found; carries the path that was looked up.
    #[error("File not found: {0}")]
    FileNotFound(PathBuf),

    /// Invalid configuration, with a human-readable description.
    #[error("Invalid configuration: {0}")]
    InvalidConfig(String),

    /// Core library error.
    ///
    /// Converted automatically from `oximedia_core::OxiError` via `?`.
    #[error("OxiMedia core error: {0}")]
    Core(#[from] oximedia_core::OxiError),
}
298
/// Deduplication result type.
///
/// Shorthand for `Result<T, DedupError>`, used as the return type of the
/// fallible functions in this crate.
pub type DedupResult<T> = Result<T, DedupError>;
301
/// Detection strategy for finding duplicates.
///
/// A strategy is either a single detection method or a named bundle of
/// methods.  The `includes_*` predicates report which individual methods a
/// given strategy enables.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Exact duplicates only (cryptographic hash)
    ExactHash,

    /// Visual similarity using perceptual hashing
    PerceptualHash,

    /// Visual similarity using SSIM
    Ssim,

    /// Visual similarity using histogram comparison
    Histogram,

    /// Visual similarity using feature matching
    FeatureMatch,

    /// Audio fingerprint comparison
    AudioFingerprint,

    /// Metadata-based matching
    Metadata,

    /// All detection methods
    All,

    /// Combination of visual methods
    /// (perceptual hash + SSIM + histogram + feature matching)
    VisualAll,

    /// Combination of fast methods (hash + perceptual + metadata)
    Fast,
}

impl DetectionStrategy {
    /// Whether this strategy runs the exact (cryptographic) hash pass.
    #[must_use]
    pub fn includes_hash(self) -> bool {
        [Self::ExactHash, Self::All, Self::Fast].contains(&self)
    }

    /// Whether this strategy runs the perceptual-hash pass.
    #[must_use]
    pub fn includes_perceptual(self) -> bool {
        [
            Self::PerceptualHash,
            Self::All,
            Self::VisualAll,
            Self::Fast,
        ]
        .contains(&self)
    }

    /// Whether this strategy runs the SSIM pass.
    #[must_use]
    pub fn includes_ssim(self) -> bool {
        [Self::Ssim, Self::All, Self::VisualAll].contains(&self)
    }

    /// Whether this strategy runs the histogram pass.
    #[must_use]
    pub fn includes_histogram(self) -> bool {
        [Self::Histogram, Self::All, Self::VisualAll].contains(&self)
    }

    /// Whether this strategy runs the feature-matching pass.
    #[must_use]
    pub fn includes_feature_match(self) -> bool {
        [Self::FeatureMatch, Self::All, Self::VisualAll].contains(&self)
    }

    /// Whether this strategy runs the audio-fingerprint pass.
    #[must_use]
    pub fn includes_audio(self) -> bool {
        [Self::AudioFingerprint, Self::All].contains(&self)
    }

    /// Whether this strategy runs the metadata pass.
    #[must_use]
    pub fn includes_metadata(self) -> bool {
        [Self::Metadata, Self::All, Self::Fast].contains(&self)
    }
}
382
/// Tunable settings for a deduplication run.
///
/// Obtain a baseline with [`DedupConfig::default`] and override individual
/// fields with struct-update syntax as needed.
#[derive(Debug, Clone)]
pub struct DedupConfig {
    /// Location of the index database file.
    pub database_path: PathBuf,

    /// Minimum perceptual-hash similarity (0.0-1.0) for two items to match.
    pub perceptual_threshold: f64,

    /// Minimum SSIM similarity (0.0-1.0) for two items to match.
    pub ssim_threshold: f64,

    /// Minimum histogram similarity (0.0-1.0) for two items to match.
    pub histogram_threshold: f64,

    /// Minimum number of matching features for a feature-match hit.
    pub feature_match_threshold: usize,

    /// Minimum audio-fingerprint similarity (0.0-1.0) for two items to match.
    pub audio_threshold: f64,

    /// Minimum metadata similarity (0.0-1.0) for two items to match.
    pub metadata_threshold: f64,

    /// Whether parallel processing is enabled.
    pub parallel: bool,

    /// How many frames are sampled from a video for analysis.
    pub sample_frames: usize,

    /// Chunk size, in bytes, for content-based chunking.
    pub chunk_size: usize,

    /// Side length of the square grayscale thumbnail used for SSIM
    /// duplicate detection.
    ///
    /// The value is used for both width and height.  Must be >= 4.  The
    /// default of 8 yields an 8x8 (64-pixel) thumbnail.  Larger values make
    /// SSIM more accurate but cost more storage and CPU.
    pub thumbnail_resolution: usize,

    /// Turn on Bloom-filter pre-screening ahead of the expensive perceptual
    /// comparisons.
    ///
    /// With this enabled, a Bloom filter quickly rejects items whose content
    /// hash is already known to be unique, so they never reach the pairwise
    /// perceptual-hash comparison.
    pub bloom_prescreen: bool,

    /// Number of items the Bloom filter pre-screener is sized for.
    pub bloom_capacity: usize,

    /// Target false-positive rate of the Bloom filter pre-screener.
    pub bloom_fpr: f32,

    /// Accelerate perceptual-hash deduplication with LSH.
    ///
    /// With this enabled, `find_perceptual_duplicates()` queries a
    /// `BitLshIndex` rather than comparing every pair (O(n^2)).  Large
    /// libraries get sub-quadratic performance in exchange for slightly
    /// reduced recall.
    pub use_lsh: bool,

    /// How many LSH hash tables to build (more = better recall, more memory).
    pub lsh_num_tables: usize,

    /// How many bits each LSH table samples (fewer = more candidates =
    /// better recall).
    pub lsh_bits_per_table: usize,

    /// Seed that makes the LSH projections deterministic.
    pub lsh_seed: u64,
}

impl Default for DedupConfig {
    /// Baseline configuration: LSH acceleration on, Bloom pre-screening off.
    fn default() -> Self {
        Self {
            // Storage
            database_path: "oximedia_dedup.db".into(),

            // Sampling and processing
            parallel: true,
            sample_frames: 10,
            chunk_size: 4096,
            thumbnail_resolution: 8,

            // Match thresholds
            perceptual_threshold: 0.95,
            ssim_threshold: 0.90,
            histogram_threshold: 0.85,
            feature_match_threshold: 50,
            audio_threshold: 0.90,
            metadata_threshold: 0.80,

            // Bloom-filter pre-screening
            bloom_prescreen: false,
            bloom_capacity: 10_000,
            bloom_fpr: 0.01,

            // LSH acceleration
            use_lsh: true,
            lsh_num_tables: 8,
            lsh_bits_per_table: 8,
            lsh_seed: 42,
        }
    }
}
477
/// Main duplicate detector.
///
/// Owns the run configuration, the index database handle, and (optionally)
/// an in-memory Bloom filter.  Only available when the `sqlite` feature is
/// enabled.
#[cfg(feature = "sqlite")]
pub struct DuplicateDetector {
    /// Settings this detector was created with.
    config: DedupConfig,
    /// Handle to the index database opened from `config.database_path`.
    database: DedupDatabase,
    /// Optional Bloom filter for fast-path duplicate pre-screening.
    ///
    /// Populated when `DedupConfig::bloom_prescreen` is `true`.  Stores
    /// raw BLAKE3 hash bytes of every indexed file so that definitely-unique
    /// files can be rejected without expensive pairwise comparisons.
    bloom: Option<bloom_filter::BloomFilter>,
}
490
491#[cfg(feature = "sqlite")]
492impl DuplicateDetector {
493    /// Create a new duplicate detector.
494    ///
495    /// When `config.bloom_prescreen` is `true`, a `BloomFilter` is
496    /// created using `config.bloom_capacity` and `config.bloom_fpr`.
497    /// Every file indexed via `add_file` or `par_index_files` will
498    /// automatically populate the filter so it can be used for fast-path
499    /// rejection in subsequent duplicate-detection passes.
500    ///
501    /// # Errors
502    ///
503    /// Returns an error if the database cannot be opened or initialized.
504    pub async fn new(config: DedupConfig) -> DedupResult<Self> {
505        let database = DedupDatabase::open(&config.database_path).await?;
506        let bloom = if config.bloom_prescreen {
507            Some(bloom_filter::BloomFilter::new(
508                config.bloom_capacity,
509                config.bloom_fpr,
510            ))
511        } else {
512            None
513        };
514        Ok(Self {
515            config,
516            database,
517            bloom,
518        })
519    }
520
521    /// Add a file to the deduplication index.
522    ///
523    /// If bloom pre-screening is enabled, the file's BLAKE3 hash bytes are
524    /// also inserted into the in-memory Bloom filter so that future
525    /// `might_be_duplicate` calls can provide fast-path rejection.
526    ///
527    /// # Errors
528    ///
529    /// Returns an error if the file cannot be read or processed.
530    pub async fn add_file(&mut self, path: impl AsRef<Path>) -> DedupResult<()> {
531        let path = path.as_ref();
532        if !path.exists() {
533            return Err(DedupError::FileNotFound(path.to_path_buf()));
534        }
535
536        // Compute hash
537        let file_hash = hash::compute_file_hash(path)?;
538
539        // Populate bloom filter (fast-path pre-screener) if enabled.
540        if let Some(ref mut bloom) = self.bloom {
541            bloom.insert(file_hash.as_bytes());
542        }
543
544        // Store in database
545        self.database.insert_file(path, &file_hash.to_hex()).await?;
546
547        Ok(())
548    }
549
550    /// Add multiple files sequentially.
551    ///
552    /// # Errors
553    ///
554    /// Returns an error if any file cannot be processed.
555    pub async fn add_files(&mut self, paths: &[impl AsRef<Path>]) -> DedupResult<Vec<String>> {
556        let mut errors = Vec::new();
557
558        for path in paths {
559            if let Err(e) = self.add_file(path).await {
560                errors.push(format!("{}: {}", path.as_ref().display(), e));
561            }
562        }
563
564        Ok(errors)
565    }
566
567    /// Add multiple files to the index using parallel hash computation.
568    ///
569    /// This method computes file hashes (BLAKE3) in parallel using rayon, then
570    /// merges the results into the database sequentially.  The parallelism
571    /// benefit is greatest for large libraries where hash I/O and computation
572    /// dominate.  Database inserts are performed sequentially afterwards
573    /// because they require exclusive `&mut self` access.
574    ///
575    /// Errors from individual files are collected and returned rather than
576    /// aborting the entire batch.
577    ///
578    /// # Errors
579    ///
580    /// Returns the list of per-file error strings.  An empty `Vec` means all
581    /// files were indexed successfully.
582    pub async fn par_index_files<P>(&mut self, paths: &[P]) -> DedupResult<Vec<String>>
583    where
584        P: AsRef<Path> + Sync,
585    {
586        use rayon::prelude::*;
587
588        // Phase 1: compute hashes in parallel (CPU-intensive, embarrassingly parallel).
589        let hash_results: Vec<(PathBuf, DedupResult<hash::FileHash>)> = paths
590            .par_iter()
591            .map(|p| {
592                let path = p.as_ref().to_path_buf();
593                if !path.exists() {
594                    return (path.clone(), Err(DedupError::FileNotFound(path)));
595                }
596                let result = hash::compute_file_hash(&path);
597                (path, result)
598            })
599            .collect();
600
601        // Phase 2: merge into DB sequentially (requires exclusive &mut self).
602        let mut errors = Vec::new();
603        for (path, result) in hash_results {
604            match result {
605                Ok(file_hash) => {
606                    if let Err(e) = self.database.insert_file(&path, &file_hash.to_hex()).await {
607                        errors.push(format!("{}: {}", path.display(), e));
608                    }
609                }
610                Err(e) => {
611                    errors.push(format!("{}: {}", path.display(), e));
612                }
613            }
614        }
615
616        Ok(errors)
617    }
618
619    /// Find duplicates using the specified strategy.
620    ///
621    /// # Errors
622    ///
623    /// Returns an error if duplicate detection fails.
624    pub async fn find_duplicates(
625        &self,
626        strategy: DetectionStrategy,
627    ) -> DedupResult<DuplicateReport> {
628        self.find_duplicates_with_progress(strategy, &progress::NullReporter)
629            .await
630    }
631
632    /// Find duplicates with progress reporting.
633    ///
634    /// Like `find_duplicates` but emits progress events through the
635    /// supplied [`ProgressReporter`](progress::ProgressReporter).  This is
636    /// the primary integration point for large-library deduplication where
637    /// the caller wants to display a progress bar or support cancellation.
638    ///
639    /// # Errors
640    ///
641    /// Returns an error if duplicate detection fails.
642    pub async fn find_duplicates_with_progress(
643        &self,
644        strategy: DetectionStrategy,
645        reporter: &dyn progress::ProgressReporter,
646    ) -> DedupResult<DuplicateReport> {
647        use progress::{ProgressEvent, ProgressTracker};
648
649        let run_start = std::time::SystemTime::now()
650            .duration_since(std::time::UNIX_EPOCH)
651            .unwrap_or_default()
652            .as_millis() as u64;
653
654        let mut report = DuplicateReport::new();
655
656        // Count phases for total progress.
657        let phase_count = [
658            strategy.includes_hash(),
659            strategy.includes_perceptual(),
660            strategy.includes_ssim(),
661            strategy.includes_histogram(),
662            strategy.includes_feature_match(),
663            strategy.includes_audio(),
664            strategy.includes_metadata(),
665        ]
666        .iter()
667        .filter(|&&b| b)
668        .count();
669
670        let mut completed_phases = 0usize;
671
672        // Exact hash duplicates
673        if strategy.includes_hash() {
674            if reporter.is_cancelled() {
675                return Ok(report);
676            }
677            let mut tracker = ProgressTracker::new(reporter, "exact_hash", 0);
678            let hash_dups = self.find_hash_duplicates().await?;
679            tracker.tick_batch(1);
680            let groups_found = hash_dups.len();
681            report.add_groups(hash_dups);
682            tracker.complete(groups_found);
683            completed_phases += 1;
684        }
685
686        // Perceptual hash duplicates
687        if strategy.includes_perceptual() {
688            if reporter.is_cancelled() {
689                return Ok(report);
690            }
691            let mut tracker = ProgressTracker::new(reporter, "perceptual_hash", 0);
692            let perceptual_dups = self.find_perceptual_duplicates().await?;
693            tracker.tick_batch(1);
694            let groups_found = perceptual_dups.len();
695            report.add_groups(perceptual_dups);
696            tracker.complete(groups_found);
697            completed_phases += 1;
698        }
699
700        // SSIM duplicates
701        if strategy.includes_ssim() {
702            if reporter.is_cancelled() {
703                return Ok(report);
704            }
705            let mut tracker = ProgressTracker::new(reporter, "ssim", 0);
706            let ssim_dups = self.find_ssim_duplicates().await?;
707            tracker.tick_batch(1);
708            let groups_found = ssim_dups.len();
709            report.add_groups(ssim_dups);
710            tracker.complete(groups_found);
711            completed_phases += 1;
712        }
713
714        // Histogram duplicates
715        if strategy.includes_histogram() {
716            if reporter.is_cancelled() {
717                return Ok(report);
718            }
719            let mut tracker = ProgressTracker::new(reporter, "histogram", 0);
720            let histogram_dups = self.find_histogram_duplicates().await?;
721            tracker.tick_batch(1);
722            let groups_found = histogram_dups.len();
723            report.add_groups(histogram_dups);
724            tracker.complete(groups_found);
725            completed_phases += 1;
726        }
727
728        // Feature match duplicates
729        if strategy.includes_feature_match() {
730            if reporter.is_cancelled() {
731                return Ok(report);
732            }
733            let mut tracker = ProgressTracker::new(reporter, "feature_match", 0);
734            let feature_dups = self.find_feature_duplicates().await?;
735            tracker.tick_batch(1);
736            let groups_found = feature_dups.len();
737            report.add_groups(feature_dups);
738            tracker.complete(groups_found);
739            completed_phases += 1;
740        }
741
742        // Audio fingerprint duplicates
743        if strategy.includes_audio() {
744            if reporter.is_cancelled() {
745                return Ok(report);
746            }
747            let mut tracker = ProgressTracker::new(reporter, "audio_fingerprint", 0);
748            let audio_dups = self.find_audio_duplicates().await?;
749            tracker.tick_batch(1);
750            let groups_found = audio_dups.len();
751            report.add_groups(audio_dups);
752            tracker.complete(groups_found);
753            completed_phases += 1;
754        }
755
756        // Metadata duplicates
757        if strategy.includes_metadata() {
758            if reporter.is_cancelled() {
759                return Ok(report);
760            }
761            let mut tracker = ProgressTracker::new(reporter, "metadata", 0);
762            let metadata_dups = self.find_metadata_duplicates().await?;
763            tracker.tick_batch(1);
764            let groups_found = metadata_dups.len();
765            report.add_groups(metadata_dups);
766            tracker.complete(groups_found);
767            completed_phases += 1;
768        }
769
770        // Emit run completed event.
771        let run_end = std::time::SystemTime::now()
772            .duration_since(std::time::UNIX_EPOCH)
773            .unwrap_or_default()
774            .as_millis() as u64;
775
776        reporter.on_event(&ProgressEvent::RunCompleted {
777            total_groups: report.groups.len(),
778            total_elapsed_ms: run_end.saturating_sub(run_start),
779        });
780
781        let _ = (phase_count, completed_phases); // used for bookkeeping
782
783        Ok(report)
784    }
785
786    /// Find exact duplicates by cryptographic hash.
787    async fn find_hash_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
788        let duplicates = self.database.find_duplicate_hashes().await?;
789        let mut groups = Vec::new();
790
791        for (hash, files) in duplicates {
792            if files.len() > 1 {
793                groups.push(DuplicateGroup {
794                    files,
795                    scores: vec![SimilarityScore {
796                        method: "exact_hash".to_string(),
797                        score: 1.0,
798                        metadata: vec![("hash".to_string(), hash)],
799                    }],
800                });
801            }
802        }
803
804        Ok(groups)
805    }
806
807    /// Find perceptual hash duplicates.
808    ///
809    /// When `config.use_lsh` is enabled (the default), uses a
810    /// [`BitLshIndex`](lsh_index::BitLshIndex) for sub-quadratic performance.
811    /// Otherwise falls back to O(n^2) pairwise comparison.
812    ///
813    /// Loads perceptual hashes stored in the `fingerprints` table under the key
814    /// `"phash"`.  Pairs with a Hamming distance below the threshold derived
815    /// from `config.perceptual_threshold` are grouped together.
816    async fn find_perceptual_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
817        // Threshold: perceptual_threshold is 0.0-1.0 similarity.
818        // Hamming distance over 64 bits → distance ≤ (1 - threshold) * 64.
819        let max_hamming = ((1.0 - self.config.perceptual_threshold) * 64.0) as u32;
820
821        // Fetch all stored perceptual hash fingerprints.
822        let stored = self.database.get_all_fingerprints_by_type("phash").await?;
823
824        // Build a list of (path, PerceptualHash) from stored hex strings.
825        let mut hashes: Vec<(String, visual::PerceptualHash)> = Vec::new();
826        for (path, hex) in stored {
827            if let Ok(value) = u64::from_str_radix(&hex, 16) {
828                hashes.push((path, visual::PerceptualHash::new(value, 64)));
829            }
830        }
831
832        // If no stored hashes, nothing to compare.
833        if hashes.len() < 2 {
834            return Ok(Vec::new());
835        }
836
837        // Bloom filter pre-screening: discard definitely-unique perceptual hashes
838        // before any expensive pairwise or LSH comparison.
839        //
840        // Strategy: quantise each 64-bit pHash down to its top 16 bits and run
841        // the items through the shared `prescreen_perceptual_hashes` helper.
842        // Items whose quantised hash has never been seen before are provably
843        // unique (no false negatives in a Bloom filter) and are dropped from
844        // the candidate set.  Remaining items are forwarded to the LSH/pairwise
845        // pass as before.
846        let hashes: Vec<(String, visual::PerceptualHash)> = if self.config.bloom_prescreen {
847            let raw: Vec<u64> = hashes.iter().map(|(_, ph)| ph.hash()).collect();
848            let prescreen = bloom_filter::prescreen_perceptual_hashes(
849                &raw,
850                16, // quantize_bits: top 16 bits capture coarse visual similarity
851                self.config.bloom_capacity,
852                self.config.bloom_fpr,
853            );
854            prescreen
855                .candidates
856                .iter()
857                .filter_map(|&idx| hashes.get(idx).cloned())
858                .collect()
859        } else {
860            hashes
861        };
862
863        // After bloom pre-screening, re-check candidate count.
864        if hashes.len() < 2 {
865            return Ok(Vec::new());
866        }
867
868        if self.config.use_lsh {
869            self.find_perceptual_duplicates_lsh(&hashes, max_hamming)
870        } else {
871            group_by_pairwise_similarity(
872                &hashes,
873                max_hamming,
874                |h1, h2| h1.hamming_distance(h2),
875                |h1, h2| h1.similarity(h2),
876                "perceptual_hash",
877            )
878        }
879    }
880
881    /// LSH-accelerated perceptual hash duplicate detection.
882    ///
883    /// Replaces the O(n^2) pairwise comparison with sub-quadratic LSH
884    /// candidate generation followed by exact Hamming distance verification.
885    fn find_perceptual_duplicates_lsh(
886        &self,
887        hashes: &[(String, visual::PerceptualHash)],
888        max_hamming: u32,
889    ) -> DedupResult<Vec<DuplicateGroup>> {
890        // Build id <-> path mapping.
891        let id_hashes: Vec<(u64, u64)> = hashes
892            .iter()
893            .enumerate()
894            .map(|(i, (_, ph))| (i as u64, ph.hash()))
895            .collect();
896
897        // Run LSH dedup pass.
898        let lsh_result = lsh_index::lsh_dedup_pass(
899            &id_hashes,
900            max_hamming,
901            self.config.lsh_num_tables,
902            self.config.lsh_bits_per_table,
903            self.config.lsh_seed,
904        );
905
906        // Group by transitive closure.
907        let all_ids: Vec<u64> = (0..hashes.len() as u64).collect();
908        let groups = lsh_index::group_by_lsh_pairs(&lsh_result.pairs, &all_ids);
909
910        // Convert back to DuplicateGroup with paths.
911        let mut result = Vec::new();
912        for group_ids in &groups {
913            let files: Vec<String> = group_ids
914                .iter()
915                .filter_map(|&id| hashes.get(id as usize).map(|(p, _)| p.clone()))
916                .collect();
917
918            if files.len() < 2 {
919                continue;
920            }
921
922            // Find best pairwise similarity within the group for scoring.
923            let mut best_sim = 0.0f64;
924            for i in 0..group_ids.len() {
925                for j in (i + 1)..group_ids.len() {
926                    let ia = group_ids[i] as usize;
927                    let ib = group_ids[j] as usize;
928                    if let (Some((_, ha)), Some((_, hb))) = (hashes.get(ia), hashes.get(ib)) {
929                        let sim = ha.similarity(hb);
930                        if sim > best_sim {
931                            best_sim = sim;
932                        }
933                    }
934                }
935            }
936
937            result.push(DuplicateGroup {
938                files,
939                scores: vec![SimilarityScore {
940                    method: "perceptual_hash_lsh".to_string(),
941                    score: best_sim,
942                    metadata: vec![
943                        (
944                            "lsh_candidates".to_string(),
945                            lsh_result.candidates_checked.to_string(),
946                        ),
947                        (
948                            "comparison_ratio".to_string(),
949                            format!("{:.4}", lsh_result.comparison_ratio()),
950                        ),
951                    ],
952                }],
953            });
954        }
955
956        Ok(result)
957    }
958
959    /// Find SSIM duplicates.
960    ///
961    /// Retrieves stored thumbnail pixel data (type `"thumbnail"`) from the
962    /// fingerprints table, reconstructs grayscale `Image` objects, and
963    /// computes the Structural Similarity Index (SSIM) between every unique
964    /// pair.  Pairs with SSIM above `config.ssim_threshold` are grouped.
965    ///
966    /// Thumbnail resolution is controlled by `config.thumbnail_resolution`.
967    async fn find_ssim_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
968        let threshold = self.config.ssim_threshold;
969        let res = self.config.thumbnail_resolution.max(4);
970        let expected_bytes = res * res;
971
972        // Thumbnail images are stored hex-encoded in the fingerprints table.
973        let stored = self
974            .database
975            .get_all_fingerprints_by_type("thumbnail")
976            .await?;
977
978        // Decode hex → bytes → Image (configurable resolution, grayscale).
979        let mut images: Vec<(String, visual::Image)> = Vec::new();
980        for (path, hex) in stored {
981            let bytes = decode_hex_bytes(&hex)?;
982            // Accept thumbnails matching the configured resolution.
983            if bytes.len() == expected_bytes {
984                if let Ok(img) = visual::Image::from_data(res, res, 1, bytes) {
985                    images.push((path, img));
986                }
987            }
988        }
989
990        if images.len() < 2 {
991            return Ok(Vec::new());
992        }
993
994        let ssim_params = visual::SsimParams::default();
995        let mut groups: Vec<DuplicateGroup> = Vec::new();
996        let mut assigned = vec![false; images.len()];
997
998        for i in 0..images.len() {
999            if assigned[i] {
1000                continue;
1001            }
1002            let mut group_files = vec![images[i].0.clone()];
1003            let mut best_score = 0.0f64;
1004
1005            for j in (i + 1)..images.len() {
1006                if assigned[j] {
1007                    continue;
1008                }
1009                let ssim = visual::compute_ssim(&images[i].1, &images[j].1, &ssim_params);
1010                if ssim >= threshold {
1011                    group_files.push(images[j].0.clone());
1012                    assigned[j] = true;
1013                    if ssim > best_score {
1014                        best_score = ssim;
1015                    }
1016                }
1017            }
1018
1019            if group_files.len() > 1 {
1020                assigned[i] = true;
1021                groups.push(DuplicateGroup {
1022                    files: group_files,
1023                    scores: vec![SimilarityScore {
1024                        method: "ssim".to_string(),
1025                        score: best_score,
1026                        metadata: Vec::new(),
1027                    }],
1028                });
1029            }
1030        }
1031
1032        Ok(groups)
1033    }
1034
1035    /// Find histogram duplicates.
1036    ///
1037    /// Loads stored colour histogram fingerprints (type `"histogram"`) from
1038    /// the database.  The data is a JSON-encoded flat array of `u32` bin
1039    /// counts (three channels × 256 bins = 768 values).  Histogram
1040    /// correlation is computed between every pair; pairs above
1041    /// `config.histogram_threshold` are grouped.
1042    async fn find_histogram_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1043        let threshold = self.config.histogram_threshold;
1044
1045        let stored = self
1046            .database
1047            .get_all_fingerprints_by_type("histogram")
1048            .await?;
1049
1050        // Decode stored JSON histogram data → Vec<Vec<u32>>.
1051        let mut histograms: Vec<(String, Vec<Vec<u32>>)> = Vec::new();
1052        for (path, json_str) in stored {
1053            if let Ok(flat) = serde_json::from_str::<Vec<u32>>(&json_str) {
1054                // Each channel has 256 bins; infer channel count.
1055                if flat.len() % 256 == 0 && !flat.is_empty() {
1056                    let channels = flat.len() / 256;
1057                    let hist: Vec<Vec<u32>> = (0..channels)
1058                        .map(|c| flat[c * 256..(c + 1) * 256].to_vec())
1059                        .collect();
1060                    histograms.push((path, hist));
1061                }
1062            }
1063        }
1064
1065        if histograms.len() < 2 {
1066            return Ok(Vec::new());
1067        }
1068
1069        let mut groups: Vec<DuplicateGroup> = Vec::new();
1070        let mut assigned = vec![false; histograms.len()];
1071
1072        for i in 0..histograms.len() {
1073            if assigned[i] {
1074                continue;
1075            }
1076            let mut group_files = vec![histograms[i].0.clone()];
1077            let mut best_score = 0.0f64;
1078
1079            for j in (i + 1)..histograms.len() {
1080                if assigned[j] {
1081                    continue;
1082                }
1083                let corr = visual::compare_histograms(&histograms[i].1, &histograms[j].1);
1084                if corr >= threshold {
1085                    group_files.push(histograms[j].0.clone());
1086                    assigned[j] = true;
1087                    if corr > best_score {
1088                        best_score = corr;
1089                    }
1090                }
1091            }
1092
1093            if group_files.len() > 1 {
1094                assigned[i] = true;
1095                groups.push(DuplicateGroup {
1096                    files: group_files,
1097                    scores: vec![SimilarityScore {
1098                        method: "histogram".to_string(),
1099                        score: best_score,
1100                        metadata: Vec::new(),
1101                    }],
1102                });
1103            }
1104        }
1105
1106        Ok(groups)
1107    }
1108
1109    /// Find feature match duplicates.
1110    ///
1111    /// Loads stored feature-vector fingerprints (type `"feature_vector"`) from
1112    /// the database.  Each feature vector is a JSON-encoded `Vec<f64>`.
1113    /// Cosine similarity is computed between every pair; pairs whose cosine
1114    /// similarity exceeds `config.perceptual_threshold` (reused as a generic
1115    /// visual similarity threshold) are grouped.
1116    async fn find_feature_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1117        let threshold = self.config.perceptual_threshold;
1118
1119        let stored = self
1120            .database
1121            .get_all_fingerprints_by_type("feature_vector")
1122            .await?;
1123
1124        // Decode JSON feature vectors.
1125        let mut vectors: Vec<(String, Vec<f64>)> = Vec::new();
1126        for (path, json_str) in stored {
1127            if let Ok(vec) = serde_json::from_str::<Vec<f64>>(&json_str) {
1128                if !vec.is_empty() {
1129                    vectors.push((path, vec));
1130                }
1131            }
1132        }
1133
1134        if vectors.len() < 2 {
1135            return Ok(Vec::new());
1136        }
1137
1138        let mut groups: Vec<DuplicateGroup> = Vec::new();
1139        let mut assigned = vec![false; vectors.len()];
1140
1141        for i in 0..vectors.len() {
1142            if assigned[i] {
1143                continue;
1144            }
1145            let mut group_files = vec![vectors[i].0.clone()];
1146            let mut best_score = 0.0f64;
1147
1148            for j in (i + 1)..vectors.len() {
1149                if assigned[j] {
1150                    continue;
1151                }
1152                let sim = cosine_similarity(&vectors[i].1, &vectors[j].1);
1153                if sim >= threshold {
1154                    group_files.push(vectors[j].0.clone());
1155                    assigned[j] = true;
1156                    if sim > best_score {
1157                        best_score = sim;
1158                    }
1159                }
1160            }
1161
1162            if group_files.len() > 1 {
1163                assigned[i] = true;
1164                groups.push(DuplicateGroup {
1165                    files: group_files,
1166                    scores: vec![SimilarityScore {
1167                        method: "feature_vector".to_string(),
1168                        score: best_score,
1169                        metadata: Vec::new(),
1170                    }],
1171                });
1172            }
1173        }
1174
1175        Ok(groups)
1176    }
1177
1178    /// Find audio fingerprint duplicates.
1179    ///
1180    /// Loads stored audio fingerprint data (type `"audio_fingerprint"`) from
1181    /// the database.  Each fingerprint is stored as a hex string of bytes.
1182    /// Pairs whose bit-level Hamming distance is within the threshold derived
1183    /// from `config.audio_threshold` are grouped together.
1184    async fn find_audio_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1185        let threshold = self.config.audio_threshold;
1186
1187        let stored = self
1188            .database
1189            .get_all_fingerprints_by_type("audio_fingerprint")
1190            .await?;
1191
1192        // Decode hex fingerprints → AudioFingerprint.
1193        let mut fingerprints: Vec<(String, audio::AudioFingerprint)> = Vec::new();
1194        for (path, hex) in stored {
1195            let bytes = decode_hex_bytes(&hex)?;
1196            if !bytes.is_empty() {
1197                fingerprints.push((path, audio::AudioFingerprint::new(bytes, 11025, 0.0)));
1198            }
1199        }
1200
1201        if fingerprints.len() < 2 {
1202            return Ok(Vec::new());
1203        }
1204
1205        let mut groups: Vec<DuplicateGroup> = Vec::new();
1206        let mut assigned = vec![false; fingerprints.len()];
1207
1208        for i in 0..fingerprints.len() {
1209            if assigned[i] {
1210                continue;
1211            }
1212            let mut group_files = vec![fingerprints[i].0.clone()];
1213            let mut best_score = 0.0f64;
1214
1215            for j in (i + 1)..fingerprints.len() {
1216                if assigned[j] {
1217                    continue;
1218                }
1219                let sim = fingerprints[i].1.similarity(&fingerprints[j].1);
1220                if sim >= threshold {
1221                    group_files.push(fingerprints[j].0.clone());
1222                    assigned[j] = true;
1223                    if sim > best_score {
1224                        best_score = sim;
1225                    }
1226                }
1227            }
1228
1229            if group_files.len() > 1 {
1230                assigned[i] = true;
1231                groups.push(DuplicateGroup {
1232                    files: group_files,
1233                    scores: vec![SimilarityScore {
1234                        method: "audio_fingerprint".to_string(),
1235                        score: best_score,
1236                        metadata: Vec::new(),
1237                    }],
1238                });
1239            }
1240        }
1241
1242        Ok(groups)
1243    }
1244
1245    /// Find metadata duplicates.
1246    ///
1247    /// Fetches all files with their stored metadata from the database and
1248    /// compares every unique pair using `metadata::compare_metadata`.  The
1249    /// key signals for a "near-duplicate" are:
1250    ///
1251    /// - Duration within ±1 second of each other.
1252    /// - Same video resolution (or both without resolution data).
1253    /// - Same video and audio codec.
1254    ///
1255    /// The overall weighted metadata similarity must exceed
1256    /// `config.metadata_threshold`.
1257    async fn find_metadata_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1258        use metadata::{compare_metadata, MediaMetadata};
1259        use std::path::PathBuf;
1260
1261        let threshold = self.config.metadata_threshold;
1262
1263        let rows = self.database.get_all_files_with_metadata().await?;
1264
1265        if rows.len() < 2 {
1266            return Ok(Vec::new());
1267        }
1268
1269        // Reconstruct MediaMetadata objects from the DB rows.
1270        let media_meta: Vec<MediaMetadata> = rows
1271            .iter()
1272            .map(
1273                |(path, duration, width, height, video_codec, audio_codec, container)| {
1274                    let fs_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1275                    let mut m = MediaMetadata::new(PathBuf::from(path), fs_size);
1276                    m.duration = *duration;
1277                    m.width = width.map(|v| v as u32);
1278                    m.height = height.map(|v| v as u32);
1279                    m.video_codec = video_codec.clone();
1280                    m.audio_codec = audio_codec.clone();
1281                    m.container = container.clone();
1282                    m
1283                },
1284            )
1285            .collect();
1286
1287        let paths: Vec<String> = rows.iter().map(|(p, ..)| p.clone()).collect();
1288
1289        let mut groups: Vec<DuplicateGroup> = Vec::new();
1290        let mut assigned = vec![false; media_meta.len()];
1291
1292        for i in 0..media_meta.len() {
1293            if assigned[i] {
1294                continue;
1295            }
1296            let mut group_files = vec![paths[i].clone()];
1297            let mut best_score = 0.0f64;
1298            let mut best_duration_diff: Option<f64> = None;
1299
1300            for j in (i + 1)..media_meta.len() {
1301                if assigned[j] {
1302                    continue;
1303                }
1304
1305                // Fast pre-filter: duration must match within ±1 second
1306                // when both files have duration information stored.
1307                let duration_ok = match (media_meta[i].duration, media_meta[j].duration) {
1308                    (Some(d1), Some(d2)) => (d1 - d2).abs() <= 1.0,
1309                    _ => true, // No duration data → don't discard
1310                };
1311                if !duration_ok {
1312                    continue;
1313                }
1314
1315                let sim = compare_metadata(&media_meta[i], &media_meta[j]);
1316                let score = sim.overall_score();
1317                if score >= threshold {
1318                    group_files.push(paths[j].clone());
1319                    assigned[j] = true;
1320                    if score > best_score {
1321                        best_score = score;
1322                        best_duration_diff = match (media_meta[i].duration, media_meta[j].duration)
1323                        {
1324                            (Some(d1), Some(d2)) => Some((d1 - d2).abs()),
1325                            _ => None,
1326                        };
1327                    }
1328                }
1329            }
1330
1331            if group_files.len() > 1 {
1332                assigned[i] = true;
1333                let mut score_entry = SimilarityScore {
1334                    method: "metadata".to_string(),
1335                    score: best_score,
1336                    metadata: Vec::new(),
1337                };
1338                if let Some(diff) = best_duration_diff {
1339                    score_entry
1340                        .metadata
1341                        .push(("duration_diff_secs".to_string(), format!("{diff:.3}")));
1342                }
1343                groups.push(DuplicateGroup {
1344                    files: group_files,
1345                    scores: vec![score_entry],
1346                });
1347            }
1348        }
1349
1350        Ok(groups)
1351    }
1352
1353    /// Get database statistics.
1354    ///
1355    /// # Errors
1356    ///
1357    /// Returns an error if database query fails.
1358    pub async fn get_stats(&self) -> DedupResult<DedupStats> {
1359        let total_files = self.database.count_files().await?;
1360        let total_hashes = self.database.count_unique_hashes().await?;
1361
1362        Ok(DedupStats {
1363            total_files,
1364            total_hashes,
1365            duplicate_files: total_files.saturating_sub(total_hashes),
1366        })
1367    }
1368
1369    /// Close the database.
1370    pub async fn close(self) -> DedupResult<()> {
1371        self.database.close().await?;
1372        Ok(())
1373    }
1374
1375    /// Fast-path bloom filter check: does this hash *possibly* exist in the index?
1376    ///
1377    /// Returns `true` if the Bloom filter reports the hash *might* be a
1378    /// duplicate (i.e., the same bytes were inserted previously).  Returns
1379    /// `false` only if the hash is **definitely** not present — meaning the
1380    /// file is provably unique and expensive pairwise comparisons can be
1381    /// skipped entirely.
1382    ///
1383    /// When bloom pre-screening is disabled (`config.bloom_prescreen == false`)
1384    /// this always returns `true` so callers always fall through to the full
1385    /// comparison path.
1386    #[must_use]
1387    pub fn might_be_duplicate(&self, hash_bytes: &[u8]) -> bool {
1388        match &self.bloom {
1389            Some(bloom) => bloom.contains(hash_bytes),
1390            None => true,
1391        }
1392    }
1393
1394    /// Reset the in-memory bloom filter without touching the database.
1395    ///
1396    /// Useful after a bulk-index session to free the bloom filter's memory,
1397    /// or to rebuild it from scratch with a different capacity.  The database
1398    /// index is not affected.
1399    pub fn reset_bloom(&mut self) {
1400        if let Some(ref mut bloom) = self.bloom {
1401            bloom.clear();
1402        }
1403    }
1404}
1405
/// Deduplication statistics.
///
/// A plain snapshot of index counters.  `Default` yields all-zero counters
/// and the equality derives allow snapshots to be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DedupStats {
    /// Total number of indexed files
    pub total_files: usize,

    /// Total number of unique hashes
    pub total_hashes: usize,

    /// Number of duplicate files
    pub duplicate_files: usize,
}
1418
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a database path in the system temp directory named
    /// `<prefix>_<nanos>.db`, where `<nanos>` is the sub-second nanosecond
    /// part of the current time.  Every test passes its own prefix, so tests
    /// running in parallel do not share a file.
    #[cfg(feature = "sqlite")]
    fn temp_db_path(prefix: &str) -> std::path::PathBuf {
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .subsec_nanos();
        std::env::temp_dir().join(format!("{prefix}_{nanos}.db"))
    }

    #[test]
    fn test_detection_strategy() {
        assert!(DetectionStrategy::ExactHash.includes_hash());
        assert!(!DetectionStrategy::ExactHash.includes_perceptual());

        assert!(DetectionStrategy::All.includes_hash());
        assert!(DetectionStrategy::All.includes_perceptual());
        assert!(DetectionStrategy::All.includes_audio());

        assert!(DetectionStrategy::Fast.includes_hash());
        assert!(DetectionStrategy::Fast.includes_perceptual());
        assert!(!DetectionStrategy::Fast.includes_ssim());
    }

    #[test]
    fn test_config_default() {
        let config = DedupConfig::default();
        assert_eq!(config.perceptual_threshold, 0.95);
        assert_eq!(config.ssim_threshold, 0.90);
        assert!(config.parallel);
    }

    #[test]
    fn test_config_lsh_defaults() {
        let config = DedupConfig::default();
        assert!(config.use_lsh);
        assert_eq!(config.lsh_num_tables, 8);
        assert_eq!(config.lsh_bits_per_table, 8);
        assert_eq!(config.lsh_seed, 42);
    }

    #[test]
    fn test_config_bloom_defaults() {
        let config = DedupConfig::default();
        // bloom_prescreen is off by default; capacity and fpr are set
        assert!(!config.bloom_prescreen);
        assert_eq!(config.bloom_capacity, 10_000);
        assert!((config.bloom_fpr - 0.01f32).abs() < f32::EPSILON);
    }

    /// `par_index_files` accepts an empty slice and reports no errors.
    ///
    /// The assertions only run when the detector can be constructed.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_empty_slice() {
        use std::path::PathBuf;
        let db_path = temp_db_path("oxidedup_test_par");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let no_paths: &[PathBuf] = &[];
            let errors = detector
                .par_index_files(no_paths)
                .await
                .expect("par_index_files should succeed on empty input");
            assert!(errors.is_empty(), "No errors expected for empty input");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// par_index_files returns per-file errors for non-existent paths (no panic).
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_nonexistent_paths() {
        // Imported locally, like the empty-slice test, so this test does not
        // depend on what the parent module happens to import.
        use std::path::PathBuf;
        let db_path = temp_db_path("oxidedup_test_par_ne");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let missing = vec![
                PathBuf::from("/nonexistent/path/a.mp4"),
                PathBuf::from("/nonexistent/path/b.mp4"),
            ];
            let errors = detector
                .par_index_files(&missing)
                .await
                .expect("par_index_files should return Ok even when files are missing");
            assert_eq!(errors.len(), 2, "Should have one error per missing file");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    // ---- Bloom filter wiring tests ----

    /// When bloom_prescreen is false (default), might_be_duplicate always returns true.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_no_bloom_always_true() {
        let db_path = temp_db_path("oxidedup_bloom_noscreen");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: false,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            // Without a bloom filter, every hash is a "maybe duplicate"
            assert!(
                detector.might_be_duplicate(b"some_hash_bytes"),
                "Should always return true when bloom is disabled"
            );
            assert!(
                detector.might_be_duplicate(b""),
                "Empty bytes: should return true without bloom"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// When bloom_prescreen is enabled, unknown hashes return false from might_be_duplicate.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_with_bloom_unknown_hash() {
        let db_path = temp_db_path("oxidedup_bloom_unknown");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            // A freshly created detector has an empty bloom filter — unknown hashes
            // must return false (definitely not a duplicate)
            assert!(
                !detector.might_be_duplicate(b"never_inserted_hash"),
                "Unknown hash should return false from a fresh bloom filter"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// reset_bloom clears the filter so previously-seen hashes return false.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_reset_bloom_clears_state() {
        let db_path = temp_db_path("oxidedup_bloom_reset");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            // Manually insert into the bloom filter by inserting known bytes
            if let Some(ref mut bloom) = detector.bloom {
                bloom.insert(b"known_hash");
            }
            // Now it should report a potential duplicate
            assert!(
                detector.might_be_duplicate(b"known_hash"),
                "After insert, bloom should report potential duplicate"
            );
            // After reset, the same hash must not be found
            detector.reset_bloom();
            assert!(
                !detector.might_be_duplicate(b"known_hash"),
                "After reset_bloom, hash should not be found"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }
}