1#![warn(missing_docs)]
42#![allow(clippy::module_name_repetitions)]
43#![allow(clippy::similar_names)]
44#![allow(clippy::cast_possible_truncation)]
45#![allow(clippy::cast_sign_loss)]
46#![allow(clippy::cast_precision_loss)]
47#![allow(clippy::too_many_arguments)]
48#![allow(dead_code)]
49
50pub mod audio;
51pub mod bloom_filter;
52pub mod cluster;
53pub mod content_id;
54pub mod content_signature;
55pub mod cross_format;
56#[cfg(feature = "sqlite")]
57pub mod database;
58pub mod dedup_cache;
59pub mod dedup_index;
60pub mod dedup_policy;
61pub mod dedup_report;
62pub mod dedup_report_ext;
63pub mod dedup_stats;
64pub mod frame_hash;
65pub mod fuzzy_match;
66pub mod hash;
67pub mod hash_store;
68pub mod incremental;
69pub mod lsh_index;
70pub mod merge_strategy;
71pub mod metadata;
72pub mod near_duplicate;
73pub mod perceptual_hash;
74pub mod phash;
75pub mod progress;
76pub mod report;
77pub mod rolling_hash;
78pub mod segment_dedup;
79pub mod similarity_index;
80pub mod video_dedup;
81pub mod video_segment_dedup;
82pub mod visual;
83
84#[cfg(feature = "sqlite")]
85use std::path::Path;
86use std::path::PathBuf;
87use thiserror::Error;
88
89#[cfg(feature = "sqlite")]
90pub use database::DedupDatabase;
91pub use report::{DuplicateGroup, DuplicateReport, SimilarityScore};
92
93#[cfg(feature = "sqlite")]
104fn decode_hex_bytes(hex: &str) -> DedupResult<Vec<u8>> {
105 if hex.len() % 2 != 0 {
106 return Err(DedupError::Hash(format!(
107 "odd-length hex string: len={}",
108 hex.len()
109 )));
110 }
111 (0..hex.len())
112 .step_by(2)
113 .map(|i| {
114 u8::from_str_radix(&hex[i..i + 2], 16)
115 .map_err(|e| DedupError::Hash(format!("invalid hex byte at {i}: {e}")))
116 })
117 .collect()
118}
119
120#[cfg(feature = "sqlite")]
/// Computes the cosine similarity between two vectors.
///
/// The dot product is taken over the positions both slices share (the longer
/// slice is truncated by `zip`), while each magnitude is computed over the
/// full slice. Returns `0.0` when either magnitude is below
/// [`f64::EPSILON`], which covers empty and all-zero inputs.
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    // Euclidean norm of a slice.
    let norm = |v: &[f64]| v.iter().map(|c| c * c).sum::<f64>().sqrt();

    let (norm_a, norm_b) = (norm(a), norm(b));
    if norm_a < f64::EPSILON || norm_b < f64::EPSILON {
        return 0.0;
    }

    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f64>();
    dot / (norm_a * norm_b)
}
133
134#[cfg(feature = "sqlite")]
140fn group_by_pairwise_similarity<H, FDist, FSim>(
141 items: &[(String, H)],
142 max_distance: u32,
143 dist_fn: FDist,
144 sim_fn: FSim,
145 method: &str,
146) -> DedupResult<Vec<DuplicateGroup>>
147where
148 FDist: Fn(&H, &H) -> u32,
149 FSim: Fn(&H, &H) -> f64,
150{
151 let mut groups: Vec<DuplicateGroup> = Vec::new();
152 let mut assigned = vec![false; items.len()];
153
154 for i in 0..items.len() {
155 if assigned[i] {
156 continue;
157 }
158 let mut group_files = vec![items[i].0.clone()];
159 let mut best_score = 0.0f64;
160
161 for j in (i + 1)..items.len() {
162 if assigned[j] {
163 continue;
164 }
165 let dist = dist_fn(&items[i].1, &items[j].1);
166 if dist <= max_distance {
167 let sim = sim_fn(&items[i].1, &items[j].1);
168 group_files.push(items[j].0.clone());
169 assigned[j] = true;
170 if sim > best_score {
171 best_score = sim;
172 }
173 }
174 }
175
176 if group_files.len() > 1 {
177 assigned[i] = true;
178 groups.push(DuplicateGroup {
179 files: group_files,
180 scores: vec![SimilarityScore {
181 method: method.to_string(),
182 score: best_score,
183 metadata: Vec::new(),
184 }],
185 });
186 }
187 }
188
189 Ok(groups)
190}
191
192#[derive(Error, Debug)]
194pub enum DedupError {
195 #[error("I/O error: {0}")]
197 Io(#[from] std::io::Error),
198
199 #[cfg(feature = "sqlite")]
201 #[error("Database error: {0}")]
202 Database(#[from] sqlx::Error),
203
204 #[cfg(not(feature = "sqlite"))]
206 #[error("Database error: {0}")]
207 Database(String),
208
209 #[error("Hashing error: {0}")]
211 Hash(String),
212
213 #[error("Visual processing error: {0}")]
215 Visual(String),
216
217 #[error("Audio processing error: {0}")]
219 Audio(String),
220
221 #[error("Metadata processing error: {0}")]
223 Metadata(String),
224
225 #[error("File not found: {0}")]
227 FileNotFound(PathBuf),
228
229 #[error("Invalid configuration: {0}")]
231 InvalidConfig(String),
232
233 #[error("OxiMedia core error: {0}")]
235 Core(#[from] oximedia_core::OxiError),
236}
237
238pub type DedupResult<T> = Result<T, DedupError>;
240
/// Selects which detection passes a duplicate search runs.
///
/// The `includes_*` predicates below define exactly which passes each
/// variant enables.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Exact file-hash comparison only.
    ExactHash,

    /// Perceptual-hash comparison only.
    PerceptualHash,

    /// SSIM comparison only.
    Ssim,

    /// Histogram comparison only.
    Histogram,

    /// Feature-matching comparison only.
    FeatureMatch,

    /// Audio-fingerprint comparison only.
    AudioFingerprint,

    /// Metadata comparison only.
    Metadata,

    /// Every pass: hash, perceptual, SSIM, histogram, feature match, audio
    /// and metadata.
    All,

    /// The visual passes: perceptual hash, SSIM, histogram and feature match.
    VisualAll,

    /// A reduced set: exact hash, perceptual hash and metadata.
    Fast,
}

impl DetectionStrategy {
    /// Returns `true` when the exact-hash pass is part of this strategy.
    #[must_use]
    pub fn includes_hash(self) -> bool {
        matches!(self, Self::All | Self::Fast | Self::ExactHash)
    }

    /// Returns `true` when the perceptual-hash pass is part of this strategy.
    #[must_use]
    pub fn includes_perceptual(self) -> bool {
        matches!(
            self,
            Self::All | Self::VisualAll | Self::Fast | Self::PerceptualHash
        )
    }

    /// Returns `true` when the SSIM pass is part of this strategy.
    #[must_use]
    pub fn includes_ssim(self) -> bool {
        matches!(self, Self::All | Self::VisualAll | Self::Ssim)
    }

    /// Returns `true` when the histogram pass is part of this strategy.
    #[must_use]
    pub fn includes_histogram(self) -> bool {
        matches!(self, Self::All | Self::VisualAll | Self::Histogram)
    }

    /// Returns `true` when the feature-matching pass is part of this strategy.
    #[must_use]
    pub fn includes_feature_match(self) -> bool {
        matches!(self, Self::All | Self::VisualAll | Self::FeatureMatch)
    }

    /// Returns `true` when the audio-fingerprint pass is part of this strategy.
    #[must_use]
    pub fn includes_audio(self) -> bool {
        matches!(self, Self::All | Self::AudioFingerprint)
    }

    /// Returns `true` when the metadata pass is part of this strategy.
    #[must_use]
    pub fn includes_metadata(self) -> bool {
        matches!(self, Self::All | Self::Fast | Self::Metadata)
    }
}
321
/// Tunable settings for duplicate detection.
#[derive(Debug, Clone)]
pub struct DedupConfig {
    /// Path of the database file opened by the detector.
    pub database_path: PathBuf,

    /// Perceptual similarity threshold. The perceptual pass converts it to a
    /// maximum Hamming distance as `(1 - threshold) * 64`; the feature-vector
    /// pass also uses it as its minimum cosine similarity.
    pub perceptual_threshold: f64,

    /// Minimum SSIM score for two thumbnails to be grouped.
    pub ssim_threshold: f64,

    /// Minimum histogram comparison score for two files to be grouped.
    pub histogram_threshold: f64,

    /// Feature-match threshold. Not read by the detector code in this file;
    /// presumably a minimum number of matched features (confirm with its
    /// consumers).
    pub feature_match_threshold: usize,

    /// Minimum audio-fingerprint similarity for two files to be grouped.
    pub audio_threshold: f64,

    /// Minimum overall metadata score for two files to be grouped.
    pub metadata_threshold: f64,

    /// Parallel-processing switch. Not read by the detector code in this
    /// file (confirm with its consumers).
    pub parallel: bool,

    /// Number of frames to sample. Not read by the detector code in this
    /// file (confirm with its consumers).
    pub sample_frames: usize,

    /// Chunk size. Not read by the detector code in this file; presumably a
    /// byte count used while hashing (confirm with its consumers).
    pub chunk_size: usize,

    /// Side length of stored thumbnails. The SSIM pass raises values below 4
    /// to 4 and only accepts thumbnails of exactly `resolution * resolution`
    /// bytes.
    pub thumbnail_resolution: usize,

    /// Enables the bloom filter: one is created alongside the detector and
    /// perceptual hashes are pre-screened before comparison.
    pub bloom_prescreen: bool,

    /// Capacity argument passed to the bloom filter.
    pub bloom_capacity: usize,

    /// Rate argument passed to the bloom filter; presumably the target
    /// false-positive rate (confirm against `bloom_filter`).
    pub bloom_fpr: f32,

    /// Uses the LSH index for perceptual grouping instead of comparing every
    /// pair.
    pub use_lsh: bool,

    /// Number of LSH tables passed to the LSH pass.
    pub lsh_num_tables: usize,

    /// Bits per LSH table passed to the LSH pass.
    pub lsh_bits_per_table: usize,

    /// Seed passed to the LSH pass.
    pub lsh_seed: u64,
}

impl Default for DedupConfig {
    /// Builds the default configuration: LSH enabled, bloom pre-screening
    /// disabled, and the database stored in `oximedia_dedup.db`.
    fn default() -> Self {
        Self {
            // Storage.
            database_path: "oximedia_dedup.db".into(),

            // Similarity thresholds.
            perceptual_threshold: 0.95,
            ssim_threshold: 0.90,
            histogram_threshold: 0.85,
            feature_match_threshold: 50,
            audio_threshold: 0.90,
            metadata_threshold: 0.80,

            // Processing.
            parallel: true,
            sample_frames: 10,
            chunk_size: 4096,
            thumbnail_resolution: 8,

            // Bloom pre-screening.
            bloom_prescreen: false,
            bloom_capacity: 10_000,
            bloom_fpr: 0.01,

            // Locality-sensitive hashing.
            use_lsh: true,
            lsh_num_tables: 8,
            lsh_bits_per_table: 8,
            lsh_seed: 42,
        }
    }
}
416
/// Duplicate detector backed by a database of indexed files.
///
/// Only available with the `sqlite` feature.
#[cfg(feature = "sqlite")]
pub struct DuplicateDetector {
    /// Settings supplied at construction time.
    config: DedupConfig,
    /// Storage for indexed files and their fingerprints.
    database: DedupDatabase,
    /// Bloom filter of indexed file hashes; `Some` only when
    /// `config.bloom_prescreen` was enabled at construction.
    bloom: Option<bloom_filter::BloomFilter>,
}
429
430#[cfg(feature = "sqlite")]
431impl DuplicateDetector {
432 pub async fn new(config: DedupConfig) -> DedupResult<Self> {
444 let database = DedupDatabase::open(&config.database_path).await?;
445 let bloom = if config.bloom_prescreen {
446 Some(bloom_filter::BloomFilter::new(
447 config.bloom_capacity,
448 config.bloom_fpr,
449 ))
450 } else {
451 None
452 };
453 Ok(Self {
454 config,
455 database,
456 bloom,
457 })
458 }
459
460 pub async fn add_file(&mut self, path: impl AsRef<Path>) -> DedupResult<()> {
470 let path = path.as_ref();
471 if !path.exists() {
472 return Err(DedupError::FileNotFound(path.to_path_buf()));
473 }
474
475 let file_hash = hash::compute_file_hash(path)?;
477
478 if let Some(ref mut bloom) = self.bloom {
480 bloom.insert(file_hash.as_bytes());
481 }
482
483 self.database.insert_file(path, &file_hash.to_hex()).await?;
485
486 Ok(())
487 }
488
489 pub async fn add_files(&mut self, paths: &[impl AsRef<Path>]) -> DedupResult<Vec<String>> {
495 let mut errors = Vec::new();
496
497 for path in paths {
498 if let Err(e) = self.add_file(path).await {
499 errors.push(format!("{}: {}", path.as_ref().display(), e));
500 }
501 }
502
503 Ok(errors)
504 }
505
506 pub async fn par_index_files<P>(&mut self, paths: &[P]) -> DedupResult<Vec<String>>
522 where
523 P: AsRef<Path> + Sync,
524 {
525 use rayon::prelude::*;
526
527 let hash_results: Vec<(PathBuf, DedupResult<hash::FileHash>)> = paths
529 .par_iter()
530 .map(|p| {
531 let path = p.as_ref().to_path_buf();
532 if !path.exists() {
533 return (path.clone(), Err(DedupError::FileNotFound(path)));
534 }
535 let result = hash::compute_file_hash(&path);
536 (path, result)
537 })
538 .collect();
539
540 let mut errors = Vec::new();
542 for (path, result) in hash_results {
543 match result {
544 Ok(file_hash) => {
545 if let Err(e) = self.database.insert_file(&path, &file_hash.to_hex()).await {
546 errors.push(format!("{}: {}", path.display(), e));
547 }
548 }
549 Err(e) => {
550 errors.push(format!("{}: {}", path.display(), e));
551 }
552 }
553 }
554
555 Ok(errors)
556 }
557
558 pub async fn find_duplicates(
564 &self,
565 strategy: DetectionStrategy,
566 ) -> DedupResult<DuplicateReport> {
567 self.find_duplicates_with_progress(strategy, &progress::NullReporter)
568 .await
569 }
570
571 pub async fn find_duplicates_with_progress(
582 &self,
583 strategy: DetectionStrategy,
584 reporter: &dyn progress::ProgressReporter,
585 ) -> DedupResult<DuplicateReport> {
586 use progress::{ProgressEvent, ProgressTracker};
587
588 let run_start = std::time::SystemTime::now()
589 .duration_since(std::time::UNIX_EPOCH)
590 .unwrap_or_default()
591 .as_millis() as u64;
592
593 let mut report = DuplicateReport::new();
594
595 let phase_count = [
597 strategy.includes_hash(),
598 strategy.includes_perceptual(),
599 strategy.includes_ssim(),
600 strategy.includes_histogram(),
601 strategy.includes_feature_match(),
602 strategy.includes_audio(),
603 strategy.includes_metadata(),
604 ]
605 .iter()
606 .filter(|&&b| b)
607 .count();
608
609 let mut completed_phases = 0usize;
610
611 if strategy.includes_hash() {
613 if reporter.is_cancelled() {
614 return Ok(report);
615 }
616 let mut tracker = ProgressTracker::new(reporter, "exact_hash", 0);
617 let hash_dups = self.find_hash_duplicates().await?;
618 tracker.tick_batch(1);
619 let groups_found = hash_dups.len();
620 report.add_groups(hash_dups);
621 tracker.complete(groups_found);
622 completed_phases += 1;
623 }
624
625 if strategy.includes_perceptual() {
627 if reporter.is_cancelled() {
628 return Ok(report);
629 }
630 let mut tracker = ProgressTracker::new(reporter, "perceptual_hash", 0);
631 let perceptual_dups = self.find_perceptual_duplicates().await?;
632 tracker.tick_batch(1);
633 let groups_found = perceptual_dups.len();
634 report.add_groups(perceptual_dups);
635 tracker.complete(groups_found);
636 completed_phases += 1;
637 }
638
639 if strategy.includes_ssim() {
641 if reporter.is_cancelled() {
642 return Ok(report);
643 }
644 let mut tracker = ProgressTracker::new(reporter, "ssim", 0);
645 let ssim_dups = self.find_ssim_duplicates().await?;
646 tracker.tick_batch(1);
647 let groups_found = ssim_dups.len();
648 report.add_groups(ssim_dups);
649 tracker.complete(groups_found);
650 completed_phases += 1;
651 }
652
653 if strategy.includes_histogram() {
655 if reporter.is_cancelled() {
656 return Ok(report);
657 }
658 let mut tracker = ProgressTracker::new(reporter, "histogram", 0);
659 let histogram_dups = self.find_histogram_duplicates().await?;
660 tracker.tick_batch(1);
661 let groups_found = histogram_dups.len();
662 report.add_groups(histogram_dups);
663 tracker.complete(groups_found);
664 completed_phases += 1;
665 }
666
667 if strategy.includes_feature_match() {
669 if reporter.is_cancelled() {
670 return Ok(report);
671 }
672 let mut tracker = ProgressTracker::new(reporter, "feature_match", 0);
673 let feature_dups = self.find_feature_duplicates().await?;
674 tracker.tick_batch(1);
675 let groups_found = feature_dups.len();
676 report.add_groups(feature_dups);
677 tracker.complete(groups_found);
678 completed_phases += 1;
679 }
680
681 if strategy.includes_audio() {
683 if reporter.is_cancelled() {
684 return Ok(report);
685 }
686 let mut tracker = ProgressTracker::new(reporter, "audio_fingerprint", 0);
687 let audio_dups = self.find_audio_duplicates().await?;
688 tracker.tick_batch(1);
689 let groups_found = audio_dups.len();
690 report.add_groups(audio_dups);
691 tracker.complete(groups_found);
692 completed_phases += 1;
693 }
694
695 if strategy.includes_metadata() {
697 if reporter.is_cancelled() {
698 return Ok(report);
699 }
700 let mut tracker = ProgressTracker::new(reporter, "metadata", 0);
701 let metadata_dups = self.find_metadata_duplicates().await?;
702 tracker.tick_batch(1);
703 let groups_found = metadata_dups.len();
704 report.add_groups(metadata_dups);
705 tracker.complete(groups_found);
706 completed_phases += 1;
707 }
708
709 let run_end = std::time::SystemTime::now()
711 .duration_since(std::time::UNIX_EPOCH)
712 .unwrap_or_default()
713 .as_millis() as u64;
714
715 reporter.on_event(&ProgressEvent::RunCompleted {
716 total_groups: report.groups.len(),
717 total_elapsed_ms: run_end.saturating_sub(run_start),
718 });
719
720 let _ = (phase_count, completed_phases); Ok(report)
723 }
724
725 async fn find_hash_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
727 let duplicates = self.database.find_duplicate_hashes().await?;
728 let mut groups = Vec::new();
729
730 for (hash, files) in duplicates {
731 if files.len() > 1 {
732 groups.push(DuplicateGroup {
733 files,
734 scores: vec![SimilarityScore {
735 method: "exact_hash".to_string(),
736 score: 1.0,
737 metadata: vec![("hash".to_string(), hash)],
738 }],
739 });
740 }
741 }
742
743 Ok(groups)
744 }
745
746 async fn find_perceptual_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
756 let max_hamming = ((1.0 - self.config.perceptual_threshold) * 64.0) as u32;
759
760 let stored = self.database.get_all_fingerprints_by_type("phash").await?;
762
763 let mut hashes: Vec<(String, visual::PerceptualHash)> = Vec::new();
765 for (path, hex) in stored {
766 if let Ok(value) = u64::from_str_radix(&hex, 16) {
767 hashes.push((path, visual::PerceptualHash::new(value, 64)));
768 }
769 }
770
771 if hashes.len() < 2 {
773 return Ok(Vec::new());
774 }
775
776 let hashes: Vec<(String, visual::PerceptualHash)> = if self.config.bloom_prescreen {
786 let raw: Vec<u64> = hashes.iter().map(|(_, ph)| ph.hash()).collect();
787 let prescreen = bloom_filter::prescreen_perceptual_hashes(
788 &raw,
789 16, self.config.bloom_capacity,
791 self.config.bloom_fpr,
792 );
793 prescreen
794 .candidates
795 .iter()
796 .filter_map(|&idx| hashes.get(idx).cloned())
797 .collect()
798 } else {
799 hashes
800 };
801
802 if hashes.len() < 2 {
804 return Ok(Vec::new());
805 }
806
807 if self.config.use_lsh {
808 self.find_perceptual_duplicates_lsh(&hashes, max_hamming)
809 } else {
810 group_by_pairwise_similarity(
811 &hashes,
812 max_hamming,
813 |h1, h2| h1.hamming_distance(h2),
814 |h1, h2| h1.similarity(h2),
815 "perceptual_hash",
816 )
817 }
818 }
819
820 fn find_perceptual_duplicates_lsh(
825 &self,
826 hashes: &[(String, visual::PerceptualHash)],
827 max_hamming: u32,
828 ) -> DedupResult<Vec<DuplicateGroup>> {
829 let id_hashes: Vec<(u64, u64)> = hashes
831 .iter()
832 .enumerate()
833 .map(|(i, (_, ph))| (i as u64, ph.hash()))
834 .collect();
835
836 let lsh_result = lsh_index::lsh_dedup_pass(
838 &id_hashes,
839 max_hamming,
840 self.config.lsh_num_tables,
841 self.config.lsh_bits_per_table,
842 self.config.lsh_seed,
843 );
844
845 let all_ids: Vec<u64> = (0..hashes.len() as u64).collect();
847 let groups = lsh_index::group_by_lsh_pairs(&lsh_result.pairs, &all_ids);
848
849 let mut result = Vec::new();
851 for group_ids in &groups {
852 let files: Vec<String> = group_ids
853 .iter()
854 .filter_map(|&id| hashes.get(id as usize).map(|(p, _)| p.clone()))
855 .collect();
856
857 if files.len() < 2 {
858 continue;
859 }
860
861 let mut best_sim = 0.0f64;
863 for i in 0..group_ids.len() {
864 for j in (i + 1)..group_ids.len() {
865 let ia = group_ids[i] as usize;
866 let ib = group_ids[j] as usize;
867 if let (Some((_, ha)), Some((_, hb))) = (hashes.get(ia), hashes.get(ib)) {
868 let sim = ha.similarity(hb);
869 if sim > best_sim {
870 best_sim = sim;
871 }
872 }
873 }
874 }
875
876 result.push(DuplicateGroup {
877 files,
878 scores: vec![SimilarityScore {
879 method: "perceptual_hash_lsh".to_string(),
880 score: best_sim,
881 metadata: vec![
882 (
883 "lsh_candidates".to_string(),
884 lsh_result.candidates_checked.to_string(),
885 ),
886 (
887 "comparison_ratio".to_string(),
888 format!("{:.4}", lsh_result.comparison_ratio()),
889 ),
890 ],
891 }],
892 });
893 }
894
895 Ok(result)
896 }
897
898 async fn find_ssim_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
907 let threshold = self.config.ssim_threshold;
908 let res = self.config.thumbnail_resolution.max(4);
909 let expected_bytes = res * res;
910
911 let stored = self
913 .database
914 .get_all_fingerprints_by_type("thumbnail")
915 .await?;
916
917 let mut images: Vec<(String, visual::Image)> = Vec::new();
919 for (path, hex) in stored {
920 let bytes = decode_hex_bytes(&hex)?;
921 if bytes.len() == expected_bytes {
923 if let Ok(img) = visual::Image::from_data(res, res, 1, bytes) {
924 images.push((path, img));
925 }
926 }
927 }
928
929 if images.len() < 2 {
930 return Ok(Vec::new());
931 }
932
933 let ssim_params = visual::SsimParams::default();
934 let mut groups: Vec<DuplicateGroup> = Vec::new();
935 let mut assigned = vec![false; images.len()];
936
937 for i in 0..images.len() {
938 if assigned[i] {
939 continue;
940 }
941 let mut group_files = vec![images[i].0.clone()];
942 let mut best_score = 0.0f64;
943
944 for j in (i + 1)..images.len() {
945 if assigned[j] {
946 continue;
947 }
948 let ssim = visual::compute_ssim(&images[i].1, &images[j].1, &ssim_params);
949 if ssim >= threshold {
950 group_files.push(images[j].0.clone());
951 assigned[j] = true;
952 if ssim > best_score {
953 best_score = ssim;
954 }
955 }
956 }
957
958 if group_files.len() > 1 {
959 assigned[i] = true;
960 groups.push(DuplicateGroup {
961 files: group_files,
962 scores: vec![SimilarityScore {
963 method: "ssim".to_string(),
964 score: best_score,
965 metadata: Vec::new(),
966 }],
967 });
968 }
969 }
970
971 Ok(groups)
972 }
973
974 async fn find_histogram_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
982 let threshold = self.config.histogram_threshold;
983
984 let stored = self
985 .database
986 .get_all_fingerprints_by_type("histogram")
987 .await?;
988
989 let mut histograms: Vec<(String, Vec<Vec<u32>>)> = Vec::new();
991 for (path, json_str) in stored {
992 if let Ok(flat) = serde_json::from_str::<Vec<u32>>(&json_str) {
993 if flat.len() % 256 == 0 && !flat.is_empty() {
995 let channels = flat.len() / 256;
996 let hist: Vec<Vec<u32>> = (0..channels)
997 .map(|c| flat[c * 256..(c + 1) * 256].to_vec())
998 .collect();
999 histograms.push((path, hist));
1000 }
1001 }
1002 }
1003
1004 if histograms.len() < 2 {
1005 return Ok(Vec::new());
1006 }
1007
1008 let mut groups: Vec<DuplicateGroup> = Vec::new();
1009 let mut assigned = vec![false; histograms.len()];
1010
1011 for i in 0..histograms.len() {
1012 if assigned[i] {
1013 continue;
1014 }
1015 let mut group_files = vec![histograms[i].0.clone()];
1016 let mut best_score = 0.0f64;
1017
1018 for j in (i + 1)..histograms.len() {
1019 if assigned[j] {
1020 continue;
1021 }
1022 let corr = visual::compare_histograms(&histograms[i].1, &histograms[j].1);
1023 if corr >= threshold {
1024 group_files.push(histograms[j].0.clone());
1025 assigned[j] = true;
1026 if corr > best_score {
1027 best_score = corr;
1028 }
1029 }
1030 }
1031
1032 if group_files.len() > 1 {
1033 assigned[i] = true;
1034 groups.push(DuplicateGroup {
1035 files: group_files,
1036 scores: vec![SimilarityScore {
1037 method: "histogram".to_string(),
1038 score: best_score,
1039 metadata: Vec::new(),
1040 }],
1041 });
1042 }
1043 }
1044
1045 Ok(groups)
1046 }
1047
1048 async fn find_feature_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1056 let threshold = self.config.perceptual_threshold;
1057
1058 let stored = self
1059 .database
1060 .get_all_fingerprints_by_type("feature_vector")
1061 .await?;
1062
1063 let mut vectors: Vec<(String, Vec<f64>)> = Vec::new();
1065 for (path, json_str) in stored {
1066 if let Ok(vec) = serde_json::from_str::<Vec<f64>>(&json_str) {
1067 if !vec.is_empty() {
1068 vectors.push((path, vec));
1069 }
1070 }
1071 }
1072
1073 if vectors.len() < 2 {
1074 return Ok(Vec::new());
1075 }
1076
1077 let mut groups: Vec<DuplicateGroup> = Vec::new();
1078 let mut assigned = vec![false; vectors.len()];
1079
1080 for i in 0..vectors.len() {
1081 if assigned[i] {
1082 continue;
1083 }
1084 let mut group_files = vec![vectors[i].0.clone()];
1085 let mut best_score = 0.0f64;
1086
1087 for j in (i + 1)..vectors.len() {
1088 if assigned[j] {
1089 continue;
1090 }
1091 let sim = cosine_similarity(&vectors[i].1, &vectors[j].1);
1092 if sim >= threshold {
1093 group_files.push(vectors[j].0.clone());
1094 assigned[j] = true;
1095 if sim > best_score {
1096 best_score = sim;
1097 }
1098 }
1099 }
1100
1101 if group_files.len() > 1 {
1102 assigned[i] = true;
1103 groups.push(DuplicateGroup {
1104 files: group_files,
1105 scores: vec![SimilarityScore {
1106 method: "feature_vector".to_string(),
1107 score: best_score,
1108 metadata: Vec::new(),
1109 }],
1110 });
1111 }
1112 }
1113
1114 Ok(groups)
1115 }
1116
1117 async fn find_audio_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1124 let threshold = self.config.audio_threshold;
1125
1126 let stored = self
1127 .database
1128 .get_all_fingerprints_by_type("audio_fingerprint")
1129 .await?;
1130
1131 let mut fingerprints: Vec<(String, audio::AudioFingerprint)> = Vec::new();
1133 for (path, hex) in stored {
1134 let bytes = decode_hex_bytes(&hex)?;
1135 if !bytes.is_empty() {
1136 fingerprints.push((path, audio::AudioFingerprint::new(bytes, 11025, 0.0)));
1137 }
1138 }
1139
1140 if fingerprints.len() < 2 {
1141 return Ok(Vec::new());
1142 }
1143
1144 let mut groups: Vec<DuplicateGroup> = Vec::new();
1145 let mut assigned = vec![false; fingerprints.len()];
1146
1147 for i in 0..fingerprints.len() {
1148 if assigned[i] {
1149 continue;
1150 }
1151 let mut group_files = vec![fingerprints[i].0.clone()];
1152 let mut best_score = 0.0f64;
1153
1154 for j in (i + 1)..fingerprints.len() {
1155 if assigned[j] {
1156 continue;
1157 }
1158 let sim = fingerprints[i].1.similarity(&fingerprints[j].1);
1159 if sim >= threshold {
1160 group_files.push(fingerprints[j].0.clone());
1161 assigned[j] = true;
1162 if sim > best_score {
1163 best_score = sim;
1164 }
1165 }
1166 }
1167
1168 if group_files.len() > 1 {
1169 assigned[i] = true;
1170 groups.push(DuplicateGroup {
1171 files: group_files,
1172 scores: vec![SimilarityScore {
1173 method: "audio_fingerprint".to_string(),
1174 score: best_score,
1175 metadata: Vec::new(),
1176 }],
1177 });
1178 }
1179 }
1180
1181 Ok(groups)
1182 }
1183
1184 async fn find_metadata_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1197 use metadata::{compare_metadata, MediaMetadata};
1198 use std::path::PathBuf;
1199
1200 let threshold = self.config.metadata_threshold;
1201
1202 let rows = self.database.get_all_files_with_metadata().await?;
1203
1204 if rows.len() < 2 {
1205 return Ok(Vec::new());
1206 }
1207
1208 let media_meta: Vec<MediaMetadata> = rows
1210 .iter()
1211 .map(
1212 |(path, duration, width, height, video_codec, audio_codec, container)| {
1213 let fs_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1214 let mut m = MediaMetadata::new(PathBuf::from(path), fs_size);
1215 m.duration = *duration;
1216 m.width = width.map(|v| v as u32);
1217 m.height = height.map(|v| v as u32);
1218 m.video_codec = video_codec.clone();
1219 m.audio_codec = audio_codec.clone();
1220 m.container = container.clone();
1221 m
1222 },
1223 )
1224 .collect();
1225
1226 let paths: Vec<String> = rows.iter().map(|(p, ..)| p.clone()).collect();
1227
1228 let mut groups: Vec<DuplicateGroup> = Vec::new();
1229 let mut assigned = vec![false; media_meta.len()];
1230
1231 for i in 0..media_meta.len() {
1232 if assigned[i] {
1233 continue;
1234 }
1235 let mut group_files = vec![paths[i].clone()];
1236 let mut best_score = 0.0f64;
1237 let mut best_duration_diff: Option<f64> = None;
1238
1239 for j in (i + 1)..media_meta.len() {
1240 if assigned[j] {
1241 continue;
1242 }
1243
1244 let duration_ok = match (media_meta[i].duration, media_meta[j].duration) {
1247 (Some(d1), Some(d2)) => (d1 - d2).abs() <= 1.0,
1248 _ => true, };
1250 if !duration_ok {
1251 continue;
1252 }
1253
1254 let sim = compare_metadata(&media_meta[i], &media_meta[j]);
1255 let score = sim.overall_score();
1256 if score >= threshold {
1257 group_files.push(paths[j].clone());
1258 assigned[j] = true;
1259 if score > best_score {
1260 best_score = score;
1261 best_duration_diff = match (media_meta[i].duration, media_meta[j].duration)
1262 {
1263 (Some(d1), Some(d2)) => Some((d1 - d2).abs()),
1264 _ => None,
1265 };
1266 }
1267 }
1268 }
1269
1270 if group_files.len() > 1 {
1271 assigned[i] = true;
1272 let mut score_entry = SimilarityScore {
1273 method: "metadata".to_string(),
1274 score: best_score,
1275 metadata: Vec::new(),
1276 };
1277 if let Some(diff) = best_duration_diff {
1278 score_entry
1279 .metadata
1280 .push(("duration_diff_secs".to_string(), format!("{diff:.3}")));
1281 }
1282 groups.push(DuplicateGroup {
1283 files: group_files,
1284 scores: vec![score_entry],
1285 });
1286 }
1287 }
1288
1289 Ok(groups)
1290 }
1291
1292 pub async fn get_stats(&self) -> DedupResult<DedupStats> {
1298 let total_files = self.database.count_files().await?;
1299 let total_hashes = self.database.count_unique_hashes().await?;
1300
1301 Ok(DedupStats {
1302 total_files,
1303 total_hashes,
1304 duplicate_files: total_files.saturating_sub(total_hashes),
1305 })
1306 }
1307
1308 pub async fn close(self) -> DedupResult<()> {
1310 self.database.close().await?;
1311 Ok(())
1312 }
1313
1314 #[must_use]
1326 pub fn might_be_duplicate(&self, hash_bytes: &[u8]) -> bool {
1327 match &self.bloom {
1328 Some(bloom) => bloom.contains(hash_bytes),
1329 None => true,
1330 }
1331 }
1332
1333 pub fn reset_bloom(&mut self) {
1339 if let Some(ref mut bloom) = self.bloom {
1340 bloom.clear();
1341 }
1342 }
1343}
1344
/// Summary counts describing the indexed files.
#[derive(Debug, Clone)]
pub struct DedupStats {
    /// Number of files recorded in the database.
    pub total_files: usize,

    /// Number of distinct file hashes recorded in the database.
    pub total_hashes: usize,

    /// Number of files beyond the first for each hash
    /// (`total_files - total_hashes`, saturating at zero).
    pub duplicate_files: usize,
}
1357
#[cfg(test)]
mod tests {
    use super::*;

    /// Sub-second nanosecond reading of the wall clock, used to vary the
    /// names of scratch database files between test runs.
    #[cfg(feature = "sqlite")]
    fn clock_nanos() -> u32 {
        std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .subsec_nanos()
    }

    /// Each strategy enables exactly the passes it advertises.
    #[test]
    fn test_detection_strategy() {
        let exact = DetectionStrategy::ExactHash;
        assert!(exact.includes_hash());
        assert!(!exact.includes_perceptual());

        let all = DetectionStrategy::All;
        assert!(all.includes_hash());
        assert!(all.includes_perceptual());
        assert!(all.includes_audio());

        let fast = DetectionStrategy::Fast;
        assert!(fast.includes_hash());
        assert!(fast.includes_perceptual());
        assert!(!fast.includes_ssim());
    }

    /// The default thresholds and the parallel flag have their documented values.
    #[test]
    fn test_config_default() {
        let DedupConfig {
            perceptual_threshold,
            ssim_threshold,
            parallel,
            ..
        } = DedupConfig::default();
        assert_eq!(perceptual_threshold, 0.95);
        assert_eq!(ssim_threshold, 0.90);
        assert!(parallel);
    }

    /// LSH is on by default with 8 tables of 8 bits and seed 42.
    #[test]
    fn test_config_lsh_defaults() {
        let defaults = DedupConfig::default();
        assert!(defaults.use_lsh);
        assert_eq!(defaults.lsh_num_tables, 8);
        assert_eq!(defaults.lsh_bits_per_table, 8);
        assert_eq!(defaults.lsh_seed, 42);
    }

    /// Bloom pre-screening is off by default.
    #[test]
    fn test_config_bloom_defaults() {
        let defaults = DedupConfig::default();
        assert!(!defaults.bloom_prescreen);
        assert_eq!(defaults.bloom_capacity, 10_000);
        assert!((defaults.bloom_fpr - 0.01f32).abs() < f32::EPSILON);
    }

    /// Indexing an empty slice succeeds and reports no errors.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_empty_slice() {
        use std::path::PathBuf;
        let db_path =
            std::env::temp_dir().join(format!("oxidedup_test_par_{}.db", clock_nanos()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        // If the database cannot be opened the test is skipped.
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let no_paths: &[PathBuf] = &[];
            let errors = detector
                .par_index_files(no_paths)
                .await
                .expect("par_index_files should succeed on empty input");
            assert!(errors.is_empty(), "No errors expected for empty input");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// Missing files produce one error message each without failing the call.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_nonexistent_paths() {
        let db_path =
            std::env::temp_dir().join(format!("oxidedup_test_par_ne_{}.db", clock_nanos()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let missing = vec![
                PathBuf::from("/nonexistent/path/a.mp4"),
                PathBuf::from("/nonexistent/path/b.mp4"),
            ];
            let errors = detector
                .par_index_files(&missing)
                .await
                .expect("par_index_files should return Ok even when files are missing");
            assert_eq!(errors.len(), 2, "Should have one error per missing file");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// Without a bloom filter every hash is reported as a possible duplicate.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_no_bloom_always_true() {
        let db_path =
            std::env::temp_dir().join(format!("oxidedup_bloom_noscreen_{}.db", clock_nanos()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: false,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            assert!(
                detector.might_be_duplicate(b"some_hash_bytes"),
                "Should always return true when bloom is disabled"
            );
            assert!(
                detector.might_be_duplicate(b""),
                "Empty bytes: should return true without bloom"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// A freshly created bloom filter does not know an arbitrary hash.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_with_bloom_unknown_hash() {
        let db_path =
            std::env::temp_dir().join(format!("oxidedup_bloom_unknown_{}.db", clock_nanos()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            assert!(
                !detector.might_be_duplicate(b"never_inserted_hash"),
                "Unknown hash should return false from a fresh bloom filter"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// `reset_bloom` forgets hashes that were previously inserted.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_reset_bloom_clears_state() {
        let db_path =
            std::env::temp_dir().join(format!("oxidedup_bloom_reset_{}.db", clock_nanos()));
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            // Seed the filter directly so no real file is needed.
            if let Some(filter) = detector.bloom.as_mut() {
                filter.insert(b"known_hash");
            }
            assert!(
                detector.might_be_duplicate(b"known_hash"),
                "After insert, bloom should report potential duplicate"
            );
            detector.reset_bloom();
            assert!(
                !detector.might_be_duplicate(b"known_hash"),
                "After reset_bloom, hash should not be found"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }
}