oximedia_dedup/
lib.rs

1//! Media deduplication and duplicate detection for `OxiMedia`.
2//!
3//! `oximedia-dedup` provides comprehensive duplicate detection and media deduplication
4//! for the `OxiMedia` multimedia framework. This includes:
5//!
6//! - **Cryptographic hashing**: BLAKE3-based exact duplicate detection
7//! - **Visual similarity**: Perceptual hashing, SSIM, histogram, and feature matching
8//! - **Audio fingerprinting**: Audio fingerprint comparison and waveform similarity
9//! - **Metadata matching**: Fuzzy metadata comparison for near-duplicates
10//! - **Storage optimization**: Fast SQLite-based indexing for large libraries
11//! - **Reporting**: Comprehensive duplicate reports with similarity scoring
12//!
13//! # Modules
14//!
15//! - [`hash`]: Cryptographic and content-based hashing
16//! - [`visual`]: Visual similarity detection
17//! - [`audio`]: Audio fingerprint comparison
18//! - [`metadata`]: Metadata-based deduplication
19//! - `database`: SQLite-based indexing and lookup
20//! - [`report`]: Duplicate detection reports
21//!
22//! # Example
23//!
24//! ```
25//! use oximedia_dedup::{DuplicateDetector, DetectionStrategy, DedupConfig};
26//!
27//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
28//! let config = DedupConfig::default();
29//! let mut detector = DuplicateDetector::new(config).await?;
30//!
31//! // Add files to the index
32//! detector.add_file("/path/to/video1.mp4").await?;
33//! detector.add_file("/path/to/video2.mp4").await?;
34//!
35//! // Find duplicates
36//! let duplicates = detector.find_duplicates(DetectionStrategy::All).await?;
37//! # Ok(())
38//! # }
39//! ```
40//!
41//! ## Strategy Selection Guide
42//!
43//! | Strategy | Speed | Precision | Use case |
44//! |---|---|---|---|
45//! | `ExactHash` | Very fast | Perfect (no false positives) | Bit-for-bit identical files — best first pass for any library |
46//! | `Fast` | Fast | High | Quick scan: hash + perceptual + metadata; good default for large libraries |
47//! | `PerceptualHash` | Fast | Good | Visually identical images/frames that were re-encoded or lightly cropped |
48//! | `Histogram` | Fast | Moderate | Color-similar frames regardless of spatial layout |
49//! | `AudioFingerprint` | Moderate | High | Same audio in different codecs or with minor edits |
50//! | `Metadata` | Fast | Low–moderate | Likely duplicates with same duration/resolution (combine with visual pass) |
51//! | `Ssim` | Slow | Very high | Near-identical video frames where pHash gives too many false positives |
52//! | `FeatureMatch` | Slow | High | Cropped, rotated, or partially occluded duplicates |
53//! | `VisualAll` | Slow | Very high | Combined visual pipeline: pHash + Histogram + SSIM + FeatureMatch |
54//! | `All` | Very slow | Maximum | Full pipeline — all methods; use only for final authoritative scan |
55//!
56//! **Recommended workflow:**
57//! 1. Run `ExactHash` first to catch perfect duplicates cheaply.
58//! 2. Run `Fast` for a broad near-duplicate sweep.
59//! 3. Run `VisualAll` or `All` for a precision clean-up pass on the remainder.
60//!
61//! ## Detection Method Trade-offs
62//!
63//! | Method | Accuracy | CPU cost | Memory | False-positive risk | Notes |
64//! |---|---|---|---|---|---|
65//! | BLAKE3 hash | 100% | Very low | O(1) | None | Misses re-encoded or edited copies |
66//! | dHash (8×8) | High | Very low | O(1) | Low | Robust to resize; sensitive to crops |
67//! | pHash (DCT) | High | Low | O(1) | Low–medium | Better than dHash for brightness shifts |
68//! | wHash (wavelet) | High | Low | O(1) | Low | Most robust to combined transforms |
69//! | SSIM | Very high | High | O(WH) | Very low | Pixel-accurate; slow for large images |
70//! | Histogram | Moderate | Low | O(256) | Medium | Colour match only; ignores structure |
71//! | FeatureMatch | High | Very high | O(N×D) | Low | Works on crops/rotations; expensive |
72//! | AudioFingerprint | High | Moderate | O(T) | Low | Spectral-peak based; codec-agnostic |
73//! | Metadata | Low–moderate | Very low | O(1) | High | Use only as a pre-filter |
74//!
75//! **Bloom-filter pre-screening** (`DedupConfig::bloom_prescreen = true`) reduces
76//! the number of pairwise comparisons by rejecting definitely-unique items before
77//! the expensive perceptual-hash phase.  Recommended for libraries with > 10 K files.
78//!
79//! **LSH acceleration** (`DedupConfig::use_lsh = true`, default) replaces O(n²)
80//! pairwise perceptual-hash comparison with sub-quadratic approximate nearest-
81//! neighbour lookup via `BitLshIndex`.  Adjust `lsh_num_tables` (more tables →
82//! better recall, more memory) and `lsh_bits_per_table` (fewer bits → more
83//! candidates → better recall at higher CPU cost).
84
85#![warn(missing_docs)]
86#![allow(clippy::module_name_repetitions)]
87#![allow(clippy::similar_names)]
88#![allow(clippy::cast_possible_truncation)]
89#![allow(clippy::cast_sign_loss)]
90#![allow(clippy::cast_precision_loss)]
91#![allow(clippy::too_many_arguments)]
92#![allow(dead_code)]
93
94pub mod audio;
95pub mod bloom_filter;
96pub mod cluster;
97pub mod content_id;
98pub mod content_signature;
99pub mod cross_format;
100#[cfg(feature = "sqlite")]
101pub mod database;
102pub mod dedup_cache;
103pub mod dedup_index;
104pub mod dedup_policy;
105pub mod dedup_report;
106pub mod dedup_report_ext;
107pub mod dedup_stats;
108pub mod frame_hash;
109pub mod fuzzy_match;
110pub mod hash;
111pub mod hash_store;
112pub mod incremental;
113pub mod lsh_index;
114pub mod merge_strategy;
115pub mod metadata;
116pub mod near_duplicate;
117pub mod perceptual_hash;
118pub mod phash;
119pub mod progress;
120pub mod report;
121pub mod rolling_hash;
122pub mod segment_dedup;
123pub mod similarity_index;
124pub mod video_dedup;
125pub mod video_segment_dedup;
126pub mod visual;
127
128#[cfg(feature = "sqlite")]
129use std::path::Path;
130use std::path::PathBuf;
131use thiserror::Error;
132
133#[cfg(feature = "sqlite")]
134pub use database::DedupDatabase;
135pub use merge_strategy::{AppliedAction, MergeExecutor, MergeReport};
136pub use report::{DuplicateGroup, DuplicateReport, SimilarityScore};
137
138// ---------------------------------------------------------------------------
139// Internal helpers used by the stub implementations
140// ---------------------------------------------------------------------------
141
142/// Decode a lowercase hex string into a byte vector.
143///
144/// # Errors
145///
146/// Returns `DedupError::Hash` if the string contains non-hex characters or
147/// has an odd number of characters.
148#[cfg(feature = "sqlite")]
149fn decode_hex_bytes(hex: &str) -> DedupResult<Vec<u8>> {
150    if hex.len() % 2 != 0 {
151        return Err(DedupError::Hash(format!(
152            "odd-length hex string: len={}",
153            hex.len()
154        )));
155    }
156    (0..hex.len())
157        .step_by(2)
158        .map(|i| {
159            u8::from_str_radix(&hex[i..i + 2], 16)
160                .map_err(|e| DedupError::Hash(format!("invalid hex byte at {i}: {e}")))
161        })
162        .collect()
163}
164
165/// Compute the cosine similarity between two f64 slices.
166///
167/// Returns a value in [−1, 1] or 0.0 when either vector is zero-magnitude.
168#[cfg(feature = "sqlite")]
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    // `zip` stops at the shorter slice, so for unequal lengths the dot product
    // covers the common prefix only while each norm uses its full slice.
    let dot: f64 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let norm_a = a.iter().map(|x| x * x).sum::<f64>().sqrt();
    let norm_b = b.iter().map(|x| x * x).sum::<f64>().sqrt();

    // A (near-)zero vector has no direction; report "no similarity".
    if norm_a < f64::EPSILON || norm_b < f64::EPSILON {
        return 0.0;
    }

    // Rounding in `sqrt` and the product can push the quotient a few ULPs
    // outside the mathematical range (e.g. [1, 1, 1] against itself yields
    // 1.0000000000000002), so clamp to the documented [-1, 1] interval.
    (dot / (norm_a * norm_b)).clamp(-1.0, 1.0)
}
178
179/// Generic pairwise grouping helper for perceptual hash comparison.
180///
181/// Takes a slice of `(path, hash)` pairs, a maximum Hamming distance
182/// threshold, a distance function, a similarity function (0.0‒1.0), and a
183/// method label.  Returns non-overlapping duplicate groups.
184#[cfg(feature = "sqlite")]
185fn group_by_pairwise_similarity<H, FDist, FSim>(
186    items: &[(String, H)],
187    max_distance: u32,
188    dist_fn: FDist,
189    sim_fn: FSim,
190    method: &str,
191) -> DedupResult<Vec<DuplicateGroup>>
192where
193    FDist: Fn(&H, &H) -> u32,
194    FSim: Fn(&H, &H) -> f64,
195{
196    let mut groups: Vec<DuplicateGroup> = Vec::new();
197    let mut assigned = vec![false; items.len()];
198
199    for i in 0..items.len() {
200        if assigned[i] {
201            continue;
202        }
203        let mut group_files = vec![items[i].0.clone()];
204        let mut best_score = 0.0f64;
205
206        for j in (i + 1)..items.len() {
207            if assigned[j] {
208                continue;
209            }
210            let dist = dist_fn(&items[i].1, &items[j].1);
211            if dist <= max_distance {
212                let sim = sim_fn(&items[i].1, &items[j].1);
213                group_files.push(items[j].0.clone());
214                assigned[j] = true;
215                if sim > best_score {
216                    best_score = sim;
217                }
218            }
219        }
220
221        if group_files.len() > 1 {
222            assigned[i] = true;
223            groups.push(DuplicateGroup {
224                files: group_files,
225                scores: vec![SimilarityScore {
226                    method: method.to_string(),
227                    score: best_score,
228                    metadata: Vec::new(),
229                }],
230            });
231        }
232    }
233
234    Ok(groups)
235}
236
/// Deduplication error type.
///
/// Every fallible operation in this crate reports failures through this enum
/// (see [`DedupResult`]).  Variants marked `#[from]` get a `From` conversion
/// generated by `thiserror`, so the wrapped error types can be propagated
/// with `?`.
#[derive(Error, Debug)]
pub enum DedupError {
    /// An underlying I/O operation failed.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Database failure reported by `sqlx` (only with the `sqlite` feature).
    #[cfg(feature = "sqlite")]
    #[error("Database error: {0}")]
    Database(#[from] sqlx::Error),

    /// Database failure carried as a plain message when the `sqlite` feature
    /// is disabled, so the variant name exists in both build configurations.
    #[cfg(not(feature = "sqlite"))]
    #[error("Database error: {0}")]
    Database(String),

    /// Hashing failed or a stored hash could not be decoded.
    #[error("Hashing error: {0}")]
    Hash(String),

    /// Visual (image/frame) processing failed.
    #[error("Visual processing error: {0}")]
    Visual(String),

    /// Audio processing failed.
    #[error("Audio processing error: {0}")]
    Audio(String),

    /// Metadata processing failed.
    #[error("Metadata processing error: {0}")]
    Metadata(String),

    /// The given path does not exist.
    #[error("File not found: {0}")]
    FileNotFound(PathBuf),

    /// A configuration value was rejected.
    #[error("Invalid configuration: {0}")]
    InvalidConfig(String),

    /// Error propagated from the `oximedia_core` crate.
    #[error("OxiMedia core error: {0}")]
    Core(#[from] oximedia_core::OxiError),
}

/// Deduplication result type: shorthand for `Result<T, DedupError>`.
pub type DedupResult<T> = Result<T, DedupError>;
285
/// Selects which detection methods a duplicate search runs.
///
/// The single-method variants enable exactly one detector; `All`, `VisualAll`
/// and `Fast` are bundles whose contents are reported by the `includes_*`
/// predicates below.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Exact duplicates only (cryptographic hash)
    ExactHash,

    /// Visual similarity using perceptual hashing
    PerceptualHash,

    /// Visual similarity using SSIM
    Ssim,

    /// Visual similarity using histogram comparison
    Histogram,

    /// Visual similarity using feature matching
    FeatureMatch,

    /// Audio fingerprint comparison
    AudioFingerprint,

    /// Metadata-based matching
    Metadata,

    /// Every detection method
    All,

    /// Every visual method (perceptual hash, SSIM, histogram, feature match)
    VisualAll,

    /// The fast methods (exact hash + perceptual hash + metadata)
    Fast,
}

impl DetectionStrategy {
    /// True for the bundles that run every visual detector.
    fn covers_all_visual(self) -> bool {
        self == Self::All || self == Self::VisualAll
    }

    /// True for the bundles that run the cheap hash/metadata detectors.
    fn covers_fast_path(self) -> bool {
        self == Self::All || self == Self::Fast
    }

    /// Whether this strategy runs exact (cryptographic) hash matching.
    #[must_use]
    pub fn includes_hash(self) -> bool {
        self == Self::ExactHash || self.covers_fast_path()
    }

    /// Whether this strategy runs perceptual-hash matching.
    #[must_use]
    pub fn includes_perceptual(self) -> bool {
        self == Self::PerceptualHash || self == Self::Fast || self.covers_all_visual()
    }

    /// Whether this strategy runs SSIM comparison.
    #[must_use]
    pub fn includes_ssim(self) -> bool {
        self == Self::Ssim || self.covers_all_visual()
    }

    /// Whether this strategy runs histogram comparison.
    #[must_use]
    pub fn includes_histogram(self) -> bool {
        self == Self::Histogram || self.covers_all_visual()
    }

    /// Whether this strategy runs feature matching.
    #[must_use]
    pub fn includes_feature_match(self) -> bool {
        self == Self::FeatureMatch || self.covers_all_visual()
    }

    /// Whether this strategy runs audio fingerprint comparison.
    #[must_use]
    pub fn includes_audio(self) -> bool {
        self == Self::AudioFingerprint || self == Self::All
    }

    /// Whether this strategy runs metadata matching.
    #[must_use]
    pub fn includes_metadata(self) -> bool {
        self == Self::Metadata || self.covers_fast_path()
    }
}
366
/// Tuning knobs for duplicate detection.
///
/// All fields are public; start from [`DedupConfig::default`] and override
/// what you need with struct-update syntax.
#[derive(Debug, Clone)]
pub struct DedupConfig {
    /// Location of the index database file.
    pub database_path: PathBuf,

    /// Minimum perceptual-hash similarity (0.0-1.0) for two items to match.
    pub perceptual_threshold: f64,

    /// Minimum SSIM similarity (0.0-1.0) for two items to match.
    pub ssim_threshold: f64,

    /// Minimum histogram similarity (0.0-1.0) for two items to match.
    pub histogram_threshold: f64,

    /// Minimum number of matched features for a feature-match hit.
    pub feature_match_threshold: usize,

    /// Minimum audio fingerprint similarity (0.0-1.0) for two items to match.
    pub audio_threshold: f64,

    /// Minimum metadata similarity (0.0-1.0) for two items to match.
    pub metadata_threshold: f64,

    /// Enable parallel processing.
    pub parallel: bool,

    /// Number of frames to sample for video analysis.
    pub sample_frames: usize,

    /// Chunk size, in bytes, for content-based chunking.
    pub chunk_size: usize,

    /// Side length of the square grayscale thumbnail used for SSIM
    /// duplicate detection.
    ///
    /// The value is used for both width and height and must be >= 4.  The
    /// default of 8 yields an 8x8 (64 pixel) thumbnail; larger values make
    /// SSIM more accurate at the cost of storage and CPU.
    pub thumbnail_resolution: usize,

    /// Turn on Bloom-filter pre-screening ahead of the expensive perceptual
    /// comparisons.
    ///
    /// With this set, items the filter classifies as unique are dropped
    /// before any pairwise perceptual-hash comparison takes place.
    pub bloom_prescreen: bool,

    /// Expected number of entries the Bloom pre-screener is sized for.
    pub bloom_capacity: usize,

    /// Target false-positive rate of the Bloom pre-screener.
    pub bloom_fpr: f32,

    /// Use LSH acceleration for perceptual-hash deduplication.
    ///
    /// When set, `find_perceptual_duplicates()` goes through a `BitLshIndex`
    /// rather than the O(n^2) pairwise scan, trading a little recall for
    /// sub-quadratic behaviour on large libraries.
    pub use_lsh: bool,

    /// Number of LSH hash tables (more = better recall, more memory).
    pub lsh_num_tables: usize,

    /// Bits sampled per LSH table (fewer = more candidates = better recall).
    pub lsh_bits_per_table: usize,

    /// Seed that makes the LSH projections deterministic.
    pub lsh_seed: u64,
}

impl Default for DedupConfig {
    /// Builds the stock configuration: LSH enabled, Bloom pre-screening off.
    fn default() -> Self {
        Self {
            // Storage
            database_path: "oximedia_dedup.db".into(),

            // Similarity thresholds
            perceptual_threshold: 0.95,
            ssim_threshold: 0.90,
            histogram_threshold: 0.85,
            feature_match_threshold: 50,
            audio_threshold: 0.90,
            metadata_threshold: 0.80,

            // Processing
            parallel: true,
            sample_frames: 10,
            chunk_size: 4096,
            thumbnail_resolution: 8,

            // Bloom pre-screening (opt-in)
            bloom_prescreen: false,
            bloom_capacity: 10_000,
            bloom_fpr: 0.01,

            // LSH acceleration (on by default)
            use_lsh: true,
            lsh_num_tables: 8,
            lsh_bits_per_table: 8,
            lsh_seed: 42,
        }
    }
}
461
/// Main duplicate detector.
///
/// Owns the [`DedupConfig`] it was created with, the [`DedupDatabase`] opened
/// from `config.database_path`, and an optional in-memory Bloom filter.
/// Only compiled when the `sqlite` feature is enabled.
#[cfg(feature = "sqlite")]
pub struct DuplicateDetector {
    /// Thresholds and tuning options supplied to `new`.
    config: DedupConfig,
    /// Index database holding file hashes and fingerprints.
    database: DedupDatabase,
    /// Optional Bloom filter for fast-path duplicate pre-screening.
    ///
    /// `Some` when `DedupConfig::bloom_prescreen` is `true`, `None` otherwise.
    /// `add_file` inserts the hash bytes of each file it indexes so that
    /// definitely-unique files can be rejected without expensive pairwise
    /// comparisons.
    bloom: Option<bloom_filter::BloomFilter>,
}
474
475#[cfg(feature = "sqlite")]
476impl DuplicateDetector {
477    /// Create a new duplicate detector.
478    ///
479    /// When `config.bloom_prescreen` is `true`, a `BloomFilter` is
480    /// created using `config.bloom_capacity` and `config.bloom_fpr`.
481    /// Every file indexed via `add_file` or `par_index_files` will
482    /// automatically populate the filter so it can be used for fast-path
483    /// rejection in subsequent duplicate-detection passes.
484    ///
485    /// # Errors
486    ///
487    /// Returns an error if the database cannot be opened or initialized.
488    pub async fn new(config: DedupConfig) -> DedupResult<Self> {
489        let database = DedupDatabase::open(&config.database_path).await?;
490        let bloom = if config.bloom_prescreen {
491            Some(bloom_filter::BloomFilter::new(
492                config.bloom_capacity,
493                config.bloom_fpr,
494            ))
495        } else {
496            None
497        };
498        Ok(Self {
499            config,
500            database,
501            bloom,
502        })
503    }
504
505    /// Add a file to the deduplication index.
506    ///
507    /// If bloom pre-screening is enabled, the file's BLAKE3 hash bytes are
508    /// also inserted into the in-memory Bloom filter so that future
509    /// `might_be_duplicate` calls can provide fast-path rejection.
510    ///
511    /// # Errors
512    ///
513    /// Returns an error if the file cannot be read or processed.
514    pub async fn add_file(&mut self, path: impl AsRef<Path>) -> DedupResult<()> {
515        let path = path.as_ref();
516        if !path.exists() {
517            return Err(DedupError::FileNotFound(path.to_path_buf()));
518        }
519
520        // Compute hash
521        let file_hash = hash::compute_file_hash(path)?;
522
523        // Populate bloom filter (fast-path pre-screener) if enabled.
524        if let Some(ref mut bloom) = self.bloom {
525            bloom.insert(file_hash.as_bytes());
526        }
527
528        // Store in database
529        self.database.insert_file(path, &file_hash.to_hex()).await?;
530
531        Ok(())
532    }
533
534    /// Add multiple files sequentially.
535    ///
536    /// # Errors
537    ///
538    /// Returns an error if any file cannot be processed.
539    pub async fn add_files(&mut self, paths: &[impl AsRef<Path>]) -> DedupResult<Vec<String>> {
540        let mut errors = Vec::new();
541
542        for path in paths {
543            if let Err(e) = self.add_file(path).await {
544                errors.push(format!("{}: {}", path.as_ref().display(), e));
545            }
546        }
547
548        Ok(errors)
549    }
550
551    /// Add multiple files to the index using parallel hash computation.
552    ///
553    /// This method computes file hashes (BLAKE3) in parallel using rayon, then
554    /// merges the results into the database sequentially.  The parallelism
555    /// benefit is greatest for large libraries where hash I/O and computation
556    /// dominate.  Database inserts are performed sequentially afterwards
557    /// because they require exclusive `&mut self` access.
558    ///
559    /// Errors from individual files are collected and returned rather than
560    /// aborting the entire batch.
561    ///
562    /// # Errors
563    ///
564    /// Returns the list of per-file error strings.  An empty `Vec` means all
565    /// files were indexed successfully.
566    pub async fn par_index_files<P>(&mut self, paths: &[P]) -> DedupResult<Vec<String>>
567    where
568        P: AsRef<Path> + Sync,
569    {
570        use rayon::prelude::*;
571
572        // Phase 1: compute hashes in parallel (CPU-intensive, embarrassingly parallel).
573        let hash_results: Vec<(PathBuf, DedupResult<hash::FileHash>)> = paths
574            .par_iter()
575            .map(|p| {
576                let path = p.as_ref().to_path_buf();
577                if !path.exists() {
578                    return (path.clone(), Err(DedupError::FileNotFound(path)));
579                }
580                let result = hash::compute_file_hash(&path);
581                (path, result)
582            })
583            .collect();
584
585        // Phase 2: merge into DB sequentially (requires exclusive &mut self).
586        let mut errors = Vec::new();
587        for (path, result) in hash_results {
588            match result {
589                Ok(file_hash) => {
590                    if let Err(e) = self.database.insert_file(&path, &file_hash.to_hex()).await {
591                        errors.push(format!("{}: {}", path.display(), e));
592                    }
593                }
594                Err(e) => {
595                    errors.push(format!("{}: {}", path.display(), e));
596                }
597            }
598        }
599
600        Ok(errors)
601    }
602
603    /// Find duplicates using the specified strategy.
604    ///
605    /// # Errors
606    ///
607    /// Returns an error if duplicate detection fails.
608    pub async fn find_duplicates(
609        &self,
610        strategy: DetectionStrategy,
611    ) -> DedupResult<DuplicateReport> {
612        self.find_duplicates_with_progress(strategy, &progress::NullReporter)
613            .await
614    }
615
616    /// Find duplicates with progress reporting.
617    ///
618    /// Like `find_duplicates` but emits progress events through the
619    /// supplied [`ProgressReporter`](progress::ProgressReporter).  This is
620    /// the primary integration point for large-library deduplication where
621    /// the caller wants to display a progress bar or support cancellation.
622    ///
623    /// # Errors
624    ///
625    /// Returns an error if duplicate detection fails.
626    pub async fn find_duplicates_with_progress(
627        &self,
628        strategy: DetectionStrategy,
629        reporter: &dyn progress::ProgressReporter,
630    ) -> DedupResult<DuplicateReport> {
631        use progress::{ProgressEvent, ProgressTracker};
632
633        let run_start = std::time::SystemTime::now()
634            .duration_since(std::time::UNIX_EPOCH)
635            .unwrap_or_default()
636            .as_millis() as u64;
637
638        let mut report = DuplicateReport::new();
639
640        // Count phases for total progress.
641        let phase_count = [
642            strategy.includes_hash(),
643            strategy.includes_perceptual(),
644            strategy.includes_ssim(),
645            strategy.includes_histogram(),
646            strategy.includes_feature_match(),
647            strategy.includes_audio(),
648            strategy.includes_metadata(),
649        ]
650        .iter()
651        .filter(|&&b| b)
652        .count();
653
654        let mut completed_phases = 0usize;
655
656        // Exact hash duplicates
657        if strategy.includes_hash() {
658            if reporter.is_cancelled() {
659                return Ok(report);
660            }
661            let mut tracker = ProgressTracker::new(reporter, "exact_hash", 0);
662            let hash_dups = self.find_hash_duplicates().await?;
663            tracker.tick_batch(1);
664            let groups_found = hash_dups.len();
665            report.add_groups(hash_dups);
666            tracker.complete(groups_found);
667            completed_phases += 1;
668        }
669
670        // Perceptual hash duplicates
671        if strategy.includes_perceptual() {
672            if reporter.is_cancelled() {
673                return Ok(report);
674            }
675            let mut tracker = ProgressTracker::new(reporter, "perceptual_hash", 0);
676            let perceptual_dups = self.find_perceptual_duplicates().await?;
677            tracker.tick_batch(1);
678            let groups_found = perceptual_dups.len();
679            report.add_groups(perceptual_dups);
680            tracker.complete(groups_found);
681            completed_phases += 1;
682        }
683
684        // SSIM duplicates
685        if strategy.includes_ssim() {
686            if reporter.is_cancelled() {
687                return Ok(report);
688            }
689            let mut tracker = ProgressTracker::new(reporter, "ssim", 0);
690            let ssim_dups = self.find_ssim_duplicates().await?;
691            tracker.tick_batch(1);
692            let groups_found = ssim_dups.len();
693            report.add_groups(ssim_dups);
694            tracker.complete(groups_found);
695            completed_phases += 1;
696        }
697
698        // Histogram duplicates
699        if strategy.includes_histogram() {
700            if reporter.is_cancelled() {
701                return Ok(report);
702            }
703            let mut tracker = ProgressTracker::new(reporter, "histogram", 0);
704            let histogram_dups = self.find_histogram_duplicates().await?;
705            tracker.tick_batch(1);
706            let groups_found = histogram_dups.len();
707            report.add_groups(histogram_dups);
708            tracker.complete(groups_found);
709            completed_phases += 1;
710        }
711
712        // Feature match duplicates
713        if strategy.includes_feature_match() {
714            if reporter.is_cancelled() {
715                return Ok(report);
716            }
717            let mut tracker = ProgressTracker::new(reporter, "feature_match", 0);
718            let feature_dups = self.find_feature_duplicates().await?;
719            tracker.tick_batch(1);
720            let groups_found = feature_dups.len();
721            report.add_groups(feature_dups);
722            tracker.complete(groups_found);
723            completed_phases += 1;
724        }
725
726        // Audio fingerprint duplicates
727        if strategy.includes_audio() {
728            if reporter.is_cancelled() {
729                return Ok(report);
730            }
731            let mut tracker = ProgressTracker::new(reporter, "audio_fingerprint", 0);
732            let audio_dups = self.find_audio_duplicates().await?;
733            tracker.tick_batch(1);
734            let groups_found = audio_dups.len();
735            report.add_groups(audio_dups);
736            tracker.complete(groups_found);
737            completed_phases += 1;
738        }
739
740        // Metadata duplicates
741        if strategy.includes_metadata() {
742            if reporter.is_cancelled() {
743                return Ok(report);
744            }
745            let mut tracker = ProgressTracker::new(reporter, "metadata", 0);
746            let metadata_dups = self.find_metadata_duplicates().await?;
747            tracker.tick_batch(1);
748            let groups_found = metadata_dups.len();
749            report.add_groups(metadata_dups);
750            tracker.complete(groups_found);
751            completed_phases += 1;
752        }
753
754        // Emit run completed event.
755        let run_end = std::time::SystemTime::now()
756            .duration_since(std::time::UNIX_EPOCH)
757            .unwrap_or_default()
758            .as_millis() as u64;
759
760        reporter.on_event(&ProgressEvent::RunCompleted {
761            total_groups: report.groups.len(),
762            total_elapsed_ms: run_end.saturating_sub(run_start),
763        });
764
765        let _ = (phase_count, completed_phases); // used for bookkeeping
766
767        Ok(report)
768    }
769
770    /// Find exact duplicates by cryptographic hash.
771    async fn find_hash_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
772        let duplicates = self.database.find_duplicate_hashes().await?;
773        let mut groups = Vec::new();
774
775        for (hash, files) in duplicates {
776            if files.len() > 1 {
777                groups.push(DuplicateGroup {
778                    files,
779                    scores: vec![SimilarityScore {
780                        method: "exact_hash".to_string(),
781                        score: 1.0,
782                        metadata: vec![("hash".to_string(), hash)],
783                    }],
784                });
785            }
786        }
787
788        Ok(groups)
789    }
790
791    /// Find perceptual hash duplicates.
792    ///
793    /// When `config.use_lsh` is enabled (the default), uses a
794    /// [`BitLshIndex`](lsh_index::BitLshIndex) for sub-quadratic performance.
795    /// Otherwise falls back to O(n^2) pairwise comparison.
796    ///
797    /// Loads perceptual hashes stored in the `fingerprints` table under the key
798    /// `"phash"`.  Pairs with a Hamming distance below the threshold derived
799    /// from `config.perceptual_threshold` are grouped together.
800    async fn find_perceptual_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
801        // Threshold: perceptual_threshold is 0.0-1.0 similarity.
802        // Hamming distance over 64 bits → distance ≤ (1 - threshold) * 64.
803        let max_hamming = ((1.0 - self.config.perceptual_threshold) * 64.0) as u32;
804
805        // Fetch all stored perceptual hash fingerprints.
806        let stored = self.database.get_all_fingerprints_by_type("phash").await?;
807
808        // Build a list of (path, PerceptualHash) from stored hex strings.
809        let mut hashes: Vec<(String, visual::PerceptualHash)> = Vec::new();
810        for (path, hex) in stored {
811            if let Ok(value) = u64::from_str_radix(&hex, 16) {
812                hashes.push((path, visual::PerceptualHash::new(value, 64)));
813            }
814        }
815
816        // If no stored hashes, nothing to compare.
817        if hashes.len() < 2 {
818            return Ok(Vec::new());
819        }
820
821        // Bloom filter pre-screening: discard definitely-unique perceptual hashes
822        // before any expensive pairwise or LSH comparison.
823        //
824        // Strategy: quantise each 64-bit pHash down to its top 16 bits and run
825        // the items through the shared `prescreen_perceptual_hashes` helper.
826        // Items whose quantised hash has never been seen before are provably
827        // unique (no false negatives in a Bloom filter) and are dropped from
828        // the candidate set.  Remaining items are forwarded to the LSH/pairwise
829        // pass as before.
830        let hashes: Vec<(String, visual::PerceptualHash)> = if self.config.bloom_prescreen {
831            let raw: Vec<u64> = hashes.iter().map(|(_, ph)| ph.hash()).collect();
832            let prescreen = bloom_filter::prescreen_perceptual_hashes(
833                &raw,
834                16, // quantize_bits: top 16 bits capture coarse visual similarity
835                self.config.bloom_capacity,
836                self.config.bloom_fpr,
837            );
838            prescreen
839                .candidates
840                .iter()
841                .filter_map(|&idx| hashes.get(idx).cloned())
842                .collect()
843        } else {
844            hashes
845        };
846
847        // After bloom pre-screening, re-check candidate count.
848        if hashes.len() < 2 {
849            return Ok(Vec::new());
850        }
851
852        if self.config.use_lsh {
853            self.find_perceptual_duplicates_lsh(&hashes, max_hamming)
854        } else {
855            group_by_pairwise_similarity(
856                &hashes,
857                max_hamming,
858                |h1, h2| h1.hamming_distance(h2),
859                |h1, h2| h1.similarity(h2),
860                "perceptual_hash",
861            )
862        }
863    }
864
865    /// LSH-accelerated perceptual hash duplicate detection.
866    ///
867    /// Replaces the O(n^2) pairwise comparison with sub-quadratic LSH
868    /// candidate generation followed by exact Hamming distance verification.
869    fn find_perceptual_duplicates_lsh(
870        &self,
871        hashes: &[(String, visual::PerceptualHash)],
872        max_hamming: u32,
873    ) -> DedupResult<Vec<DuplicateGroup>> {
874        // Build id <-> path mapping.
875        let id_hashes: Vec<(u64, u64)> = hashes
876            .iter()
877            .enumerate()
878            .map(|(i, (_, ph))| (i as u64, ph.hash()))
879            .collect();
880
881        // Run LSH dedup pass.
882        let lsh_result = lsh_index::lsh_dedup_pass(
883            &id_hashes,
884            max_hamming,
885            self.config.lsh_num_tables,
886            self.config.lsh_bits_per_table,
887            self.config.lsh_seed,
888        );
889
890        // Group by transitive closure.
891        let all_ids: Vec<u64> = (0..hashes.len() as u64).collect();
892        let groups = lsh_index::group_by_lsh_pairs(&lsh_result.pairs, &all_ids);
893
894        // Convert back to DuplicateGroup with paths.
895        let mut result = Vec::new();
896        for group_ids in &groups {
897            let files: Vec<String> = group_ids
898                .iter()
899                .filter_map(|&id| hashes.get(id as usize).map(|(p, _)| p.clone()))
900                .collect();
901
902            if files.len() < 2 {
903                continue;
904            }
905
906            // Find best pairwise similarity within the group for scoring.
907            let mut best_sim = 0.0f64;
908            for i in 0..group_ids.len() {
909                for j in (i + 1)..group_ids.len() {
910                    let ia = group_ids[i] as usize;
911                    let ib = group_ids[j] as usize;
912                    if let (Some((_, ha)), Some((_, hb))) = (hashes.get(ia), hashes.get(ib)) {
913                        let sim = ha.similarity(hb);
914                        if sim > best_sim {
915                            best_sim = sim;
916                        }
917                    }
918                }
919            }
920
921            result.push(DuplicateGroup {
922                files,
923                scores: vec![SimilarityScore {
924                    method: "perceptual_hash_lsh".to_string(),
925                    score: best_sim,
926                    metadata: vec![
927                        (
928                            "lsh_candidates".to_string(),
929                            lsh_result.candidates_checked.to_string(),
930                        ),
931                        (
932                            "comparison_ratio".to_string(),
933                            format!("{:.4}", lsh_result.comparison_ratio()),
934                        ),
935                    ],
936                }],
937            });
938        }
939
940        Ok(result)
941    }
942
943    /// Find SSIM duplicates.
944    ///
945    /// Retrieves stored thumbnail pixel data (type `"thumbnail"`) from the
946    /// fingerprints table, reconstructs grayscale `Image` objects, and
947    /// computes the Structural Similarity Index (SSIM) between every unique
948    /// pair.  Pairs with SSIM above `config.ssim_threshold` are grouped.
949    ///
950    /// Thumbnail resolution is controlled by `config.thumbnail_resolution`.
951    async fn find_ssim_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
952        let threshold = self.config.ssim_threshold;
953        let res = self.config.thumbnail_resolution.max(4);
954        let expected_bytes = res * res;
955
956        // Thumbnail images are stored hex-encoded in the fingerprints table.
957        let stored = self
958            .database
959            .get_all_fingerprints_by_type("thumbnail")
960            .await?;
961
962        // Decode hex → bytes → Image (configurable resolution, grayscale).
963        let mut images: Vec<(String, visual::Image)> = Vec::new();
964        for (path, hex) in stored {
965            let bytes = decode_hex_bytes(&hex)?;
966            // Accept thumbnails matching the configured resolution.
967            if bytes.len() == expected_bytes {
968                if let Ok(img) = visual::Image::from_data(res, res, 1, bytes) {
969                    images.push((path, img));
970                }
971            }
972        }
973
974        if images.len() < 2 {
975            return Ok(Vec::new());
976        }
977
978        let ssim_params = visual::SsimParams::default();
979        let mut groups: Vec<DuplicateGroup> = Vec::new();
980        let mut assigned = vec![false; images.len()];
981
982        for i in 0..images.len() {
983            if assigned[i] {
984                continue;
985            }
986            let mut group_files = vec![images[i].0.clone()];
987            let mut best_score = 0.0f64;
988
989            for j in (i + 1)..images.len() {
990                if assigned[j] {
991                    continue;
992                }
993                let ssim = visual::compute_ssim(&images[i].1, &images[j].1, &ssim_params);
994                if ssim >= threshold {
995                    group_files.push(images[j].0.clone());
996                    assigned[j] = true;
997                    if ssim > best_score {
998                        best_score = ssim;
999                    }
1000                }
1001            }
1002
1003            if group_files.len() > 1 {
1004                assigned[i] = true;
1005                groups.push(DuplicateGroup {
1006                    files: group_files,
1007                    scores: vec![SimilarityScore {
1008                        method: "ssim".to_string(),
1009                        score: best_score,
1010                        metadata: Vec::new(),
1011                    }],
1012                });
1013            }
1014        }
1015
1016        Ok(groups)
1017    }
1018
1019    /// Find histogram duplicates.
1020    ///
1021    /// Loads stored colour histogram fingerprints (type `"histogram"`) from
1022    /// the database.  The data is a JSON-encoded flat array of `u32` bin
1023    /// counts (three channels × 256 bins = 768 values).  Histogram
1024    /// correlation is computed between every pair; pairs above
1025    /// `config.histogram_threshold` are grouped.
1026    async fn find_histogram_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1027        let threshold = self.config.histogram_threshold;
1028
1029        let stored = self
1030            .database
1031            .get_all_fingerprints_by_type("histogram")
1032            .await?;
1033
1034        // Decode stored JSON histogram data → Vec<Vec<u32>>.
1035        let mut histograms: Vec<(String, Vec<Vec<u32>>)> = Vec::new();
1036        for (path, json_str) in stored {
1037            if let Ok(flat) = serde_json::from_str::<Vec<u32>>(&json_str) {
1038                // Each channel has 256 bins; infer channel count.
1039                if flat.len() % 256 == 0 && !flat.is_empty() {
1040                    let channels = flat.len() / 256;
1041                    let hist: Vec<Vec<u32>> = (0..channels)
1042                        .map(|c| flat[c * 256..(c + 1) * 256].to_vec())
1043                        .collect();
1044                    histograms.push((path, hist));
1045                }
1046            }
1047        }
1048
1049        if histograms.len() < 2 {
1050            return Ok(Vec::new());
1051        }
1052
1053        let mut groups: Vec<DuplicateGroup> = Vec::new();
1054        let mut assigned = vec![false; histograms.len()];
1055
1056        for i in 0..histograms.len() {
1057            if assigned[i] {
1058                continue;
1059            }
1060            let mut group_files = vec![histograms[i].0.clone()];
1061            let mut best_score = 0.0f64;
1062
1063            for j in (i + 1)..histograms.len() {
1064                if assigned[j] {
1065                    continue;
1066                }
1067                let corr = visual::compare_histograms(&histograms[i].1, &histograms[j].1);
1068                if corr >= threshold {
1069                    group_files.push(histograms[j].0.clone());
1070                    assigned[j] = true;
1071                    if corr > best_score {
1072                        best_score = corr;
1073                    }
1074                }
1075            }
1076
1077            if group_files.len() > 1 {
1078                assigned[i] = true;
1079                groups.push(DuplicateGroup {
1080                    files: group_files,
1081                    scores: vec![SimilarityScore {
1082                        method: "histogram".to_string(),
1083                        score: best_score,
1084                        metadata: Vec::new(),
1085                    }],
1086                });
1087            }
1088        }
1089
1090        Ok(groups)
1091    }
1092
1093    /// Find feature match duplicates.
1094    ///
1095    /// Loads stored feature-vector fingerprints (type `"feature_vector"`) from
1096    /// the database.  Each feature vector is a JSON-encoded `Vec<f64>`.
1097    /// Cosine similarity is computed between every pair; pairs whose cosine
1098    /// similarity exceeds `config.perceptual_threshold` (reused as a generic
1099    /// visual similarity threshold) are grouped.
1100    async fn find_feature_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1101        let threshold = self.config.perceptual_threshold;
1102
1103        let stored = self
1104            .database
1105            .get_all_fingerprints_by_type("feature_vector")
1106            .await?;
1107
1108        // Decode JSON feature vectors.
1109        let mut vectors: Vec<(String, Vec<f64>)> = Vec::new();
1110        for (path, json_str) in stored {
1111            if let Ok(vec) = serde_json::from_str::<Vec<f64>>(&json_str) {
1112                if !vec.is_empty() {
1113                    vectors.push((path, vec));
1114                }
1115            }
1116        }
1117
1118        if vectors.len() < 2 {
1119            return Ok(Vec::new());
1120        }
1121
1122        let mut groups: Vec<DuplicateGroup> = Vec::new();
1123        let mut assigned = vec![false; vectors.len()];
1124
1125        for i in 0..vectors.len() {
1126            if assigned[i] {
1127                continue;
1128            }
1129            let mut group_files = vec![vectors[i].0.clone()];
1130            let mut best_score = 0.0f64;
1131
1132            for j in (i + 1)..vectors.len() {
1133                if assigned[j] {
1134                    continue;
1135                }
1136                let sim = cosine_similarity(&vectors[i].1, &vectors[j].1);
1137                if sim >= threshold {
1138                    group_files.push(vectors[j].0.clone());
1139                    assigned[j] = true;
1140                    if sim > best_score {
1141                        best_score = sim;
1142                    }
1143                }
1144            }
1145
1146            if group_files.len() > 1 {
1147                assigned[i] = true;
1148                groups.push(DuplicateGroup {
1149                    files: group_files,
1150                    scores: vec![SimilarityScore {
1151                        method: "feature_vector".to_string(),
1152                        score: best_score,
1153                        metadata: Vec::new(),
1154                    }],
1155                });
1156            }
1157        }
1158
1159        Ok(groups)
1160    }
1161
1162    /// Find audio fingerprint duplicates.
1163    ///
1164    /// Loads stored audio fingerprint data (type `"audio_fingerprint"`) from
1165    /// the database.  Each fingerprint is stored as a hex string of bytes.
1166    /// Pairs whose bit-level Hamming distance is within the threshold derived
1167    /// from `config.audio_threshold` are grouped together.
1168    async fn find_audio_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1169        let threshold = self.config.audio_threshold;
1170
1171        let stored = self
1172            .database
1173            .get_all_fingerprints_by_type("audio_fingerprint")
1174            .await?;
1175
1176        // Decode hex fingerprints → AudioFingerprint.
1177        let mut fingerprints: Vec<(String, audio::AudioFingerprint)> = Vec::new();
1178        for (path, hex) in stored {
1179            let bytes = decode_hex_bytes(&hex)?;
1180            if !bytes.is_empty() {
1181                fingerprints.push((path, audio::AudioFingerprint::new(bytes, 11025, 0.0)));
1182            }
1183        }
1184
1185        if fingerprints.len() < 2 {
1186            return Ok(Vec::new());
1187        }
1188
1189        let mut groups: Vec<DuplicateGroup> = Vec::new();
1190        let mut assigned = vec![false; fingerprints.len()];
1191
1192        for i in 0..fingerprints.len() {
1193            if assigned[i] {
1194                continue;
1195            }
1196            let mut group_files = vec![fingerprints[i].0.clone()];
1197            let mut best_score = 0.0f64;
1198
1199            for j in (i + 1)..fingerprints.len() {
1200                if assigned[j] {
1201                    continue;
1202                }
1203                let sim = fingerprints[i].1.similarity(&fingerprints[j].1);
1204                if sim >= threshold {
1205                    group_files.push(fingerprints[j].0.clone());
1206                    assigned[j] = true;
1207                    if sim > best_score {
1208                        best_score = sim;
1209                    }
1210                }
1211            }
1212
1213            if group_files.len() > 1 {
1214                assigned[i] = true;
1215                groups.push(DuplicateGroup {
1216                    files: group_files,
1217                    scores: vec![SimilarityScore {
1218                        method: "audio_fingerprint".to_string(),
1219                        score: best_score,
1220                        metadata: Vec::new(),
1221                    }],
1222                });
1223            }
1224        }
1225
1226        Ok(groups)
1227    }
1228
1229    /// Find metadata duplicates.
1230    ///
1231    /// Fetches all files with their stored metadata from the database and
1232    /// compares every unique pair using `metadata::compare_metadata`.  The
1233    /// key signals for a "near-duplicate" are:
1234    ///
1235    /// - Duration within ±1 second of each other.
1236    /// - Same video resolution (or both without resolution data).
1237    /// - Same video and audio codec.
1238    ///
1239    /// The overall weighted metadata similarity must exceed
1240    /// `config.metadata_threshold`.
1241    async fn find_metadata_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1242        use metadata::{compare_metadata, MediaMetadata};
1243        use std::path::PathBuf;
1244
1245        let threshold = self.config.metadata_threshold;
1246
1247        let rows = self.database.get_all_files_with_metadata().await?;
1248
1249        if rows.len() < 2 {
1250            return Ok(Vec::new());
1251        }
1252
1253        // Reconstruct MediaMetadata objects from the DB rows.
1254        let media_meta: Vec<MediaMetadata> = rows
1255            .iter()
1256            .map(
1257                |(path, duration, width, height, video_codec, audio_codec, container)| {
1258                    let fs_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1259                    let mut m = MediaMetadata::new(PathBuf::from(path), fs_size);
1260                    m.duration = *duration;
1261                    m.width = width.map(|v| v as u32);
1262                    m.height = height.map(|v| v as u32);
1263                    m.video_codec = video_codec.clone();
1264                    m.audio_codec = audio_codec.clone();
1265                    m.container = container.clone();
1266                    m
1267                },
1268            )
1269            .collect();
1270
1271        let paths: Vec<String> = rows.iter().map(|(p, ..)| p.clone()).collect();
1272
1273        let mut groups: Vec<DuplicateGroup> = Vec::new();
1274        let mut assigned = vec![false; media_meta.len()];
1275
1276        for i in 0..media_meta.len() {
1277            if assigned[i] {
1278                continue;
1279            }
1280            let mut group_files = vec![paths[i].clone()];
1281            let mut best_score = 0.0f64;
1282            let mut best_duration_diff: Option<f64> = None;
1283
1284            for j in (i + 1)..media_meta.len() {
1285                if assigned[j] {
1286                    continue;
1287                }
1288
1289                // Fast pre-filter: duration must match within ±1 second
1290                // when both files have duration information stored.
1291                let duration_ok = match (media_meta[i].duration, media_meta[j].duration) {
1292                    (Some(d1), Some(d2)) => (d1 - d2).abs() <= 1.0,
1293                    _ => true, // No duration data → don't discard
1294                };
1295                if !duration_ok {
1296                    continue;
1297                }
1298
1299                let sim = compare_metadata(&media_meta[i], &media_meta[j]);
1300                let score = sim.overall_score();
1301                if score >= threshold {
1302                    group_files.push(paths[j].clone());
1303                    assigned[j] = true;
1304                    if score > best_score {
1305                        best_score = score;
1306                        best_duration_diff = match (media_meta[i].duration, media_meta[j].duration)
1307                        {
1308                            (Some(d1), Some(d2)) => Some((d1 - d2).abs()),
1309                            _ => None,
1310                        };
1311                    }
1312                }
1313            }
1314
1315            if group_files.len() > 1 {
1316                assigned[i] = true;
1317                let mut score_entry = SimilarityScore {
1318                    method: "metadata".to_string(),
1319                    score: best_score,
1320                    metadata: Vec::new(),
1321                };
1322                if let Some(diff) = best_duration_diff {
1323                    score_entry
1324                        .metadata
1325                        .push(("duration_diff_secs".to_string(), format!("{diff:.3}")));
1326                }
1327                groups.push(DuplicateGroup {
1328                    files: group_files,
1329                    scores: vec![score_entry],
1330                });
1331            }
1332        }
1333
1334        Ok(groups)
1335    }
1336
1337    /// Get database statistics.
1338    ///
1339    /// # Errors
1340    ///
1341    /// Returns an error if database query fails.
1342    pub async fn get_stats(&self) -> DedupResult<DedupStats> {
1343        let total_files = self.database.count_files().await?;
1344        let total_hashes = self.database.count_unique_hashes().await?;
1345
1346        Ok(DedupStats {
1347            total_files,
1348            total_hashes,
1349            duplicate_files: total_files.saturating_sub(total_hashes),
1350        })
1351    }
1352
1353    /// Close the database.
1354    pub async fn close(self) -> DedupResult<()> {
1355        self.database.close().await?;
1356        Ok(())
1357    }
1358
1359    /// Fast-path bloom filter check: does this hash *possibly* exist in the index?
1360    ///
1361    /// Returns `true` if the Bloom filter reports the hash *might* be a
1362    /// duplicate (i.e., the same bytes were inserted previously).  Returns
1363    /// `false` only if the hash is **definitely** not present — meaning the
1364    /// file is provably unique and expensive pairwise comparisons can be
1365    /// skipped entirely.
1366    ///
1367    /// When bloom pre-screening is disabled (`config.bloom_prescreen == false`)
1368    /// this always returns `true` so callers always fall through to the full
1369    /// comparison path.
1370    #[must_use]
1371    pub fn might_be_duplicate(&self, hash_bytes: &[u8]) -> bool {
1372        match &self.bloom {
1373            Some(bloom) => bloom.contains(hash_bytes),
1374            None => true,
1375        }
1376    }
1377
1378    /// Reset the in-memory bloom filter without touching the database.
1379    ///
1380    /// Useful after a bulk-index session to free the bloom filter's memory,
1381    /// or to rebuild it from scratch with a different capacity.  The database
1382    /// index is not affected.
1383    pub fn reset_bloom(&mut self) {
1384        if let Some(ref mut bloom) = self.bloom {
1385            bloom.clear();
1386        }
1387    }
1388}
1389
/// Deduplication statistics.
///
/// A plain-data summary of the index.  It can be compared with `==` and
/// zero-initialised through `Default`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct DedupStats {
    /// Total number of indexed files
    pub total_files: usize,

    /// Total number of unique hashes
    pub total_hashes: usize,

    /// Number of duplicate files (indexed files in excess of the unique hashes)
    pub duplicate_files: usize,
}
1402
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a database path inside the system temp directory.
    ///
    /// The file name is `<prefix>_<nanos>.db`, where `<nanos>` is the
    /// sub-second nanosecond part of the current time, so separate tests use
    /// separate files.
    #[cfg(feature = "sqlite")]
    fn temp_db_path(prefix: &str) -> std::path::PathBuf {
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .subsec_nanos();
        std::env::temp_dir().join(format!("{prefix}_{nanos}.db"))
    }

    #[test]
    fn test_detection_strategy() {
        assert!(DetectionStrategy::ExactHash.includes_hash());
        assert!(!DetectionStrategy::ExactHash.includes_perceptual());

        assert!(DetectionStrategy::All.includes_hash());
        assert!(DetectionStrategy::All.includes_perceptual());
        assert!(DetectionStrategy::All.includes_audio());

        assert!(DetectionStrategy::Fast.includes_hash());
        assert!(DetectionStrategy::Fast.includes_perceptual());
        assert!(!DetectionStrategy::Fast.includes_ssim());
    }

    #[test]
    fn test_config_default() {
        let config = DedupConfig::default();
        assert_eq!(config.perceptual_threshold, 0.95);
        assert_eq!(config.ssim_threshold, 0.90);
        assert!(config.parallel);
    }

    #[test]
    fn test_config_lsh_defaults() {
        let config = DedupConfig::default();
        assert!(config.use_lsh);
        assert_eq!(config.lsh_num_tables, 8);
        assert_eq!(config.lsh_bits_per_table, 8);
        assert_eq!(config.lsh_seed, 42);
    }

    #[test]
    fn test_config_bloom_defaults() {
        let config = DedupConfig::default();
        // bloom_prescreen is off by default; capacity and fpr are set
        assert!(!config.bloom_prescreen);
        assert_eq!(config.bloom_capacity, 10_000);
        assert!((config.bloom_fpr - 0.01f32).abs() < f32::EPSILON);
    }

    /// `par_index_files` accepts an empty slice and reports no errors.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_empty_slice() {
        use std::path::PathBuf;

        let db_path = temp_db_path("oxidedup_test_par");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        // If the detector cannot be created the test is skipped silently.
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let no_paths: &[PathBuf] = &[];
            let errors = detector
                .par_index_files(no_paths)
                .await
                .expect("par_index_files should succeed on empty input");
            assert!(errors.is_empty(), "No errors expected for empty input");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// par_index_files returns per-file errors for non-existent paths (no panic).
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_nonexistent_paths() {
        // Imported locally, like the sibling test, so this test does not depend
        // on `PathBuf` happening to be in scope via `use super::*`.
        use std::path::PathBuf;

        let db_path = temp_db_path("oxidedup_test_par_ne");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let missing = vec![
                PathBuf::from("/nonexistent/path/a.mp4"),
                PathBuf::from("/nonexistent/path/b.mp4"),
            ];
            let errors = detector
                .par_index_files(&missing)
                .await
                .expect("par_index_files should return Ok even when files are missing");
            assert_eq!(errors.len(), 2, "Should have one error per missing file");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    // ---- Bloom filter wiring tests ----

    /// When bloom_prescreen is false (default), might_be_duplicate always returns true.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_no_bloom_always_true() {
        let db_path = temp_db_path("oxidedup_bloom_noscreen");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: false,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            // Without a bloom filter, every hash is a "maybe duplicate"
            assert!(
                detector.might_be_duplicate(b"some_hash_bytes"),
                "Should always return true when bloom is disabled"
            );
            assert!(
                detector.might_be_duplicate(b""),
                "Empty bytes: should return true without bloom"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// When bloom_prescreen is enabled, unknown hashes return false from might_be_duplicate.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_with_bloom_unknown_hash() {
        let db_path = temp_db_path("oxidedup_bloom_unknown");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            // A freshly created detector has an empty bloom filter — unknown hashes
            // must return false (definitely not a duplicate)
            assert!(
                !detector.might_be_duplicate(b"never_inserted_hash"),
                "Unknown hash should return false from a fresh bloom filter"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// reset_bloom clears the filter so previously-seen hashes return false.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_reset_bloom_clears_state() {
        let db_path = temp_db_path("oxidedup_bloom_reset");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            // Manually insert into the bloom filter by inserting known bytes
            if let Some(ref mut bloom) = detector.bloom {
                bloom.insert(b"known_hash");
            }
            // Now it should report a potential duplicate
            assert!(
                detector.might_be_duplicate(b"known_hash"),
                "After insert, bloom should report potential duplicate"
            );
            // After reset, the same hash must not be found
            detector.reset_bloom();
            assert!(
                !detector.might_be_duplicate(b"known_hash"),
                "After reset_bloom, hash should not be found"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }
}
oximedia_dedup/lib.rs

oximedia_dedup/
lib.rs