1#![warn(missing_docs)]
86#![allow(clippy::module_name_repetitions)]
87#![allow(clippy::similar_names)]
88#![allow(clippy::cast_possible_truncation)]
89#![allow(clippy::cast_sign_loss)]
90#![allow(clippy::cast_precision_loss)]
91#![allow(clippy::too_many_arguments)]
92#![allow(dead_code)]
93
94pub mod audio;
95pub mod bloom_filter;
96pub mod cluster;
97pub mod content_id;
98pub mod content_signature;
99pub mod cross_format;
100#[cfg(feature = "sqlite")]
101pub mod database;
102pub mod dedup_cache;
103pub mod dedup_index;
104pub mod dedup_policy;
105pub mod dedup_report;
106pub mod dedup_report_ext;
107pub mod dedup_stats;
108pub mod frame_hash;
109pub mod fuzzy_match;
110pub mod hash;
111pub mod hash_store;
112pub mod incremental;
113pub mod lsh_index;
114pub mod merge_strategy;
115pub mod metadata;
116pub mod near_duplicate;
117pub mod perceptual_hash;
118pub mod phash;
119pub mod progress;
120pub mod report;
121pub mod rolling_hash;
122pub mod segment_dedup;
123pub mod similarity_index;
124pub mod video_dedup;
125pub mod video_segment_dedup;
126pub mod visual;
127
128#[cfg(feature = "sqlite")]
129use std::path::Path;
130use std::path::PathBuf;
131use thiserror::Error;
132
133#[cfg(feature = "sqlite")]
134pub use database::DedupDatabase;
135pub use merge_strategy::{AppliedAction, MergeExecutor, MergeReport};
136pub use report::{DuplicateGroup, DuplicateReport, SimilarityScore};
137
/// Decodes a hexadecimal string into the bytes it encodes.
///
/// Each byte is written as exactly two ASCII hex digits (upper or lower case).
/// An empty string decodes to an empty vector.
///
/// The input is processed as raw bytes rather than by slicing the `str`, so a
/// stored value containing multi-byte UTF-8 characters yields an error instead
/// of panicking on a non-character boundary. Sign characters are rejected as
/// well: only `0-9`, `a-f` and `A-F` are accepted.
///
/// # Errors
///
/// Returns [`DedupError::Hash`] when `hex` has an odd length, or when any
/// two-character pair contains something other than an ASCII hex digit. The
/// message carries the byte offset of the offending pair.
#[cfg(feature = "sqlite")]
fn decode_hex_bytes(hex: &str) -> DedupResult<Vec<u8>> {
    if hex.len() % 2 != 0 {
        return Err(DedupError::Hash(format!(
            "odd-length hex string: len={}",
            hex.len()
        )));
    }

    // `None` for anything that is not an ASCII hex digit (including bytes >= 0x80).
    let nibble = |byte: u8| char::from(byte).to_digit(16).map(|digit| digit as u8);

    hex.as_bytes()
        .chunks_exact(2)
        .enumerate()
        .map(|(pair_idx, pair)| match (nibble(pair[0]), nibble(pair[1])) {
            (Some(high), Some(low)) => Ok((high << 4) | low),
            _ => Err(DedupError::Hash(format!(
                "invalid hex byte at {}: invalid digit found in string",
                pair_idx * 2
            ))),
        })
        .collect()
}
164
165#[cfg(feature = "sqlite")]
/// Cosine similarity between two `f64` vectors.
///
/// The dot product runs over the common prefix of `a` and `b`, while each
/// magnitude is taken over the whole slice, so a shorter vector behaves as if
/// it were padded with zeros.
///
/// Returns `0.0` when either magnitude is below `f64::EPSILON` (which covers
/// empty and all-zero inputs). NaN components propagate to a NaN result.
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    let magnitude = |v: &[f64]| v.iter().map(|c| c * c).sum::<f64>().sqrt();

    let dot = a.iter().zip(b).map(|(lhs, rhs)| lhs * rhs).sum::<f64>();
    let norm_a = magnitude(a);
    let norm_b = magnitude(b);

    // Kept as a `<` test so that a NaN magnitude falls through to the division.
    if norm_a < f64::EPSILON || norm_b < f64::EPSILON {
        0.0
    } else {
        dot / (norm_a * norm_b)
    }
}
178
/// Greedily clusters `items` by pairwise distance against an anchor item.
///
/// Items are visited in order. Each item that has not yet been placed in a
/// group becomes an anchor; every later, still-unplaced item whose
/// `dist_fn(anchor, item)` is at most `max_distance` joins the anchor's group.
/// Comparisons are always made against the anchor, never between members.
///
/// A group is emitted only when it has at least two files. Its single
/// [`SimilarityScore`] is labelled `method` and carries the highest
/// `sim_fn(anchor, member)` value seen (starting from `0.0`).
///
/// This function never returns `Err`; the `DedupResult` wrapper matches the
/// signatures of the callers that return it directly.
#[cfg(feature = "sqlite")]
fn group_by_pairwise_similarity<H, FDist, FSim>(
    items: &[(String, H)],
    max_distance: u32,
    dist_fn: FDist,
    sim_fn: FSim,
    method: &str,
) -> DedupResult<Vec<DuplicateGroup>>
where
    FDist: Fn(&H, &H) -> u32,
    FSim: Fn(&H, &H) -> f64,
{
    let mut clusters: Vec<DuplicateGroup> = Vec::new();
    let mut taken = vec![false; items.len()];

    for (anchor_idx, (anchor_name, anchor_hash)) in items.iter().enumerate() {
        if taken[anchor_idx] {
            continue;
        }

        let mut members = vec![anchor_name.clone()];
        let mut top_similarity = 0.0_f64;

        for (other_idx, (other_name, other_hash)) in items.iter().enumerate().skip(anchor_idx + 1)
        {
            // Short-circuit keeps `dist_fn` from running on already-grouped items.
            if taken[other_idx] || dist_fn(anchor_hash, other_hash) > max_distance {
                continue;
            }

            let similarity = sim_fn(anchor_hash, other_hash);
            members.push(other_name.clone());
            taken[other_idx] = true;
            if similarity > top_similarity {
                top_similarity = similarity;
            }
        }

        if members.len() < 2 {
            continue;
        }

        taken[anchor_idx] = true;
        clusters.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: method.to_string(),
                score: top_similarity,
                metadata: Vec::new(),
            }],
        });
    }

    Ok(clusters)
}
236
237#[derive(Error, Debug)]
239pub enum DedupError {
240 #[error("I/O error: {0}")]
242 Io(#[from] std::io::Error),
243
244 #[cfg(feature = "sqlite")]
246 #[error("Database error: {0}")]
247 Database(#[from] sqlx::Error),
248
249 #[cfg(not(feature = "sqlite"))]
251 #[error("Database error: {0}")]
252 Database(String),
253
254 #[error("Hashing error: {0}")]
256 Hash(String),
257
258 #[error("Visual processing error: {0}")]
260 Visual(String),
261
262 #[error("Audio processing error: {0}")]
264 Audio(String),
265
266 #[error("Metadata processing error: {0}")]
268 Metadata(String),
269
270 #[error("File not found: {0}")]
272 FileNotFound(PathBuf),
273
274 #[error("Invalid configuration: {0}")]
276 InvalidConfig(String),
277
278 #[error("OxiMedia core error: {0}")]
280 Core(#[from] oximedia_core::OxiError),
281}
282
283pub type DedupResult<T> = Result<T, DedupError>;
285
/// Selects which detection phases a duplicate search runs.
///
/// The first seven variants each enable exactly one phase. `All`, `VisualAll`
/// and `Fast` are presets; the `includes_*` methods report which phases each
/// preset covers.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DetectionStrategy {
    /// Exact file-hash comparison only.
    ExactHash,

    /// Perceptual-hash comparison only.
    PerceptualHash,

    /// SSIM comparison only.
    Ssim,

    /// Histogram comparison only.
    Histogram,

    /// Feature-match comparison only.
    FeatureMatch,

    /// Audio-fingerprint comparison only.
    AudioFingerprint,

    /// Metadata comparison only.
    Metadata,

    /// Every phase.
    All,

    /// Perceptual hash, SSIM, histogram and feature match.
    VisualAll,

    /// Exact hash, perceptual hash and metadata.
    Fast,
}

impl DetectionStrategy {
    /// Whether the exact-hash phase is part of this strategy.
    #[must_use]
    pub fn includes_hash(self) -> bool {
        match self {
            Self::ExactHash | Self::All | Self::Fast => true,
            Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::VisualAll => false,
        }
    }

    /// Whether the perceptual-hash phase is part of this strategy.
    #[must_use]
    pub fn includes_perceptual(self) -> bool {
        match self {
            Self::PerceptualHash | Self::All | Self::VisualAll | Self::Fast => true,
            Self::ExactHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata => false,
        }
    }

    /// Whether the SSIM phase is part of this strategy.
    #[must_use]
    pub fn includes_ssim(self) -> bool {
        match self {
            Self::Ssim | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Whether the histogram phase is part of this strategy.
    #[must_use]
    pub fn includes_histogram(self) -> bool {
        match self {
            Self::Histogram | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Whether the feature-match phase is part of this strategy.
    #[must_use]
    pub fn includes_feature_match(self) -> bool {
        match self {
            Self::FeatureMatch | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Whether the audio-fingerprint phase is part of this strategy.
    #[must_use]
    pub fn includes_audio(self) -> bool {
        match self {
            Self::AudioFingerprint | Self::All => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::Metadata
            | Self::VisualAll
            | Self::Fast => false,
        }
    }

    /// Whether the metadata phase is part of this strategy.
    #[must_use]
    pub fn includes_metadata(self) -> bool {
        match self {
            Self::Metadata | Self::All | Self::Fast => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::VisualAll => false,
        }
    }
}
366
/// Tunable settings for duplicate detection.
///
/// Use [`DedupConfig::default`] and override individual fields with struct
/// update syntax.
#[derive(Clone, Debug)]
pub struct DedupConfig {
    /// Location of the database opened by `DuplicateDetector::new`.
    pub database_path: PathBuf,

    /// Perceptual similarity threshold. The perceptual-hash search converts it
    /// to a maximum Hamming distance of `(1 - threshold) * 64`, truncated; the
    /// feature-vector search uses it as the minimum cosine similarity.
    pub perceptual_threshold: f64,

    /// Minimum SSIM value for two thumbnails to be grouped.
    pub ssim_threshold: f64,

    /// Minimum histogram comparison score for two files to be grouped.
    pub histogram_threshold: f64,

    /// Feature-match threshold.
    /// NOTE(review): not read by the detector code in this file — confirm its
    /// consumer and unit.
    pub feature_match_threshold: usize,

    /// Minimum audio-fingerprint similarity for two files to be grouped.
    pub audio_threshold: f64,

    /// Minimum overall metadata score for two files to be grouped.
    pub metadata_threshold: f64,

    /// Parallel-processing switch.
    /// NOTE(review): not read by the detector code in this file.
    pub parallel: bool,

    /// Number of frames to sample.
    /// NOTE(review): not read by the detector code in this file.
    pub sample_frames: usize,

    /// Chunk size.
    /// NOTE(review): not read by the detector code in this file; unit unconfirmed.
    pub chunk_size: usize,

    /// Side length of the square, single-channel thumbnails compared by the
    /// SSIM search; values below 4 are raised to 4 there.
    pub thumbnail_resolution: usize,

    /// Enables the bloom filter: the detector keeps one for indexed file
    /// hashes, and the perceptual search pre-screens its candidates.
    pub bloom_prescreen: bool,

    /// Capacity passed to the bloom filter constructor and pre-screen.
    pub bloom_capacity: usize,

    /// False-positive rate passed to the bloom filter constructor and pre-screen.
    pub bloom_fpr: f32,

    /// Selects the LSH-based perceptual search instead of the pairwise one.
    pub use_lsh: bool,

    /// Number of LSH tables handed to the LSH pass.
    pub lsh_num_tables: usize,

    /// Bits per LSH table handed to the LSH pass.
    pub lsh_bits_per_table: usize,

    /// Seed handed to the LSH pass.
    pub lsh_seed: u64,
}

impl Default for DedupConfig {
    /// Builds the stock configuration: LSH enabled, bloom pre-screen disabled.
    fn default() -> Self {
        DedupConfig {
            database_path: "oximedia_dedup.db".into(),

            // Similarity thresholds.
            perceptual_threshold: 0.95,
            ssim_threshold: 0.90,
            histogram_threshold: 0.85,
            feature_match_threshold: 50,
            audio_threshold: 0.90,
            metadata_threshold: 0.80,

            // Processing knobs.
            parallel: true,
            sample_frames: 10,
            chunk_size: 4096,
            thumbnail_resolution: 8,

            // Bloom filter.
            bloom_prescreen: false,
            bloom_capacity: 10_000,
            bloom_fpr: 0.01,

            // Locality-sensitive hashing.
            use_lsh: true,
            lsh_num_tables: 8,
            lsh_bits_per_table: 8,
            lsh_seed: 42,
        }
    }
}
461
/// Database-backed duplicate detector (available with the `sqlite` feature).
///
/// Files are indexed with `add_file` / `add_files` / `par_index_files` and
/// searched with `find_duplicates`.
#[cfg(feature = "sqlite")]
pub struct DuplicateDetector {
    /// Settings captured at construction time.
    config: DedupConfig,
    /// Handle to the database holding indexed files and fingerprints.
    database: DedupDatabase,
    /// Bloom filter over indexed file hashes; `Some` only when
    /// `config.bloom_prescreen` was set at construction.
    bloom: Option<bloom_filter::BloomFilter>,
}
474
475#[cfg(feature = "sqlite")]
476impl DuplicateDetector {
477 pub async fn new(config: DedupConfig) -> DedupResult<Self> {
489 let database = DedupDatabase::open(&config.database_path).await?;
490 let bloom = if config.bloom_prescreen {
491 Some(bloom_filter::BloomFilter::new(
492 config.bloom_capacity,
493 config.bloom_fpr,
494 ))
495 } else {
496 None
497 };
498 Ok(Self {
499 config,
500 database,
501 bloom,
502 })
503 }
504
505 pub async fn add_file(&mut self, path: impl AsRef<Path>) -> DedupResult<()> {
515 let path = path.as_ref();
516 if !path.exists() {
517 return Err(DedupError::FileNotFound(path.to_path_buf()));
518 }
519
520 let file_hash = hash::compute_file_hash(path)?;
522
523 if let Some(ref mut bloom) = self.bloom {
525 bloom.insert(file_hash.as_bytes());
526 }
527
528 self.database.insert_file(path, &file_hash.to_hex()).await?;
530
531 Ok(())
532 }
533
534 pub async fn add_files(&mut self, paths: &[impl AsRef<Path>]) -> DedupResult<Vec<String>> {
540 let mut errors = Vec::new();
541
542 for path in paths {
543 if let Err(e) = self.add_file(path).await {
544 errors.push(format!("{}: {}", path.as_ref().display(), e));
545 }
546 }
547
548 Ok(errors)
549 }
550
551 pub async fn par_index_files<P>(&mut self, paths: &[P]) -> DedupResult<Vec<String>>
567 where
568 P: AsRef<Path> + Sync,
569 {
570 use rayon::prelude::*;
571
572 let hash_results: Vec<(PathBuf, DedupResult<hash::FileHash>)> = paths
574 .par_iter()
575 .map(|p| {
576 let path = p.as_ref().to_path_buf();
577 if !path.exists() {
578 return (path.clone(), Err(DedupError::FileNotFound(path)));
579 }
580 let result = hash::compute_file_hash(&path);
581 (path, result)
582 })
583 .collect();
584
585 let mut errors = Vec::new();
587 for (path, result) in hash_results {
588 match result {
589 Ok(file_hash) => {
590 if let Err(e) = self.database.insert_file(&path, &file_hash.to_hex()).await {
591 errors.push(format!("{}: {}", path.display(), e));
592 }
593 }
594 Err(e) => {
595 errors.push(format!("{}: {}", path.display(), e));
596 }
597 }
598 }
599
600 Ok(errors)
601 }
602
603 pub async fn find_duplicates(
609 &self,
610 strategy: DetectionStrategy,
611 ) -> DedupResult<DuplicateReport> {
612 self.find_duplicates_with_progress(strategy, &progress::NullReporter)
613 .await
614 }
615
616 pub async fn find_duplicates_with_progress(
627 &self,
628 strategy: DetectionStrategy,
629 reporter: &dyn progress::ProgressReporter,
630 ) -> DedupResult<DuplicateReport> {
631 use progress::{ProgressEvent, ProgressTracker};
632
633 let run_start = std::time::SystemTime::now()
634 .duration_since(std::time::UNIX_EPOCH)
635 .unwrap_or_default()
636 .as_millis() as u64;
637
638 let mut report = DuplicateReport::new();
639
640 let phase_count = [
642 strategy.includes_hash(),
643 strategy.includes_perceptual(),
644 strategy.includes_ssim(),
645 strategy.includes_histogram(),
646 strategy.includes_feature_match(),
647 strategy.includes_audio(),
648 strategy.includes_metadata(),
649 ]
650 .iter()
651 .filter(|&&b| b)
652 .count();
653
654 let mut completed_phases = 0usize;
655
656 if strategy.includes_hash() {
658 if reporter.is_cancelled() {
659 return Ok(report);
660 }
661 let mut tracker = ProgressTracker::new(reporter, "exact_hash", 0);
662 let hash_dups = self.find_hash_duplicates().await?;
663 tracker.tick_batch(1);
664 let groups_found = hash_dups.len();
665 report.add_groups(hash_dups);
666 tracker.complete(groups_found);
667 completed_phases += 1;
668 }
669
670 if strategy.includes_perceptual() {
672 if reporter.is_cancelled() {
673 return Ok(report);
674 }
675 let mut tracker = ProgressTracker::new(reporter, "perceptual_hash", 0);
676 let perceptual_dups = self.find_perceptual_duplicates().await?;
677 tracker.tick_batch(1);
678 let groups_found = perceptual_dups.len();
679 report.add_groups(perceptual_dups);
680 tracker.complete(groups_found);
681 completed_phases += 1;
682 }
683
684 if strategy.includes_ssim() {
686 if reporter.is_cancelled() {
687 return Ok(report);
688 }
689 let mut tracker = ProgressTracker::new(reporter, "ssim", 0);
690 let ssim_dups = self.find_ssim_duplicates().await?;
691 tracker.tick_batch(1);
692 let groups_found = ssim_dups.len();
693 report.add_groups(ssim_dups);
694 tracker.complete(groups_found);
695 completed_phases += 1;
696 }
697
698 if strategy.includes_histogram() {
700 if reporter.is_cancelled() {
701 return Ok(report);
702 }
703 let mut tracker = ProgressTracker::new(reporter, "histogram", 0);
704 let histogram_dups = self.find_histogram_duplicates().await?;
705 tracker.tick_batch(1);
706 let groups_found = histogram_dups.len();
707 report.add_groups(histogram_dups);
708 tracker.complete(groups_found);
709 completed_phases += 1;
710 }
711
712 if strategy.includes_feature_match() {
714 if reporter.is_cancelled() {
715 return Ok(report);
716 }
717 let mut tracker = ProgressTracker::new(reporter, "feature_match", 0);
718 let feature_dups = self.find_feature_duplicates().await?;
719 tracker.tick_batch(1);
720 let groups_found = feature_dups.len();
721 report.add_groups(feature_dups);
722 tracker.complete(groups_found);
723 completed_phases += 1;
724 }
725
726 if strategy.includes_audio() {
728 if reporter.is_cancelled() {
729 return Ok(report);
730 }
731 let mut tracker = ProgressTracker::new(reporter, "audio_fingerprint", 0);
732 let audio_dups = self.find_audio_duplicates().await?;
733 tracker.tick_batch(1);
734 let groups_found = audio_dups.len();
735 report.add_groups(audio_dups);
736 tracker.complete(groups_found);
737 completed_phases += 1;
738 }
739
740 if strategy.includes_metadata() {
742 if reporter.is_cancelled() {
743 return Ok(report);
744 }
745 let mut tracker = ProgressTracker::new(reporter, "metadata", 0);
746 let metadata_dups = self.find_metadata_duplicates().await?;
747 tracker.tick_batch(1);
748 let groups_found = metadata_dups.len();
749 report.add_groups(metadata_dups);
750 tracker.complete(groups_found);
751 completed_phases += 1;
752 }
753
754 let run_end = std::time::SystemTime::now()
756 .duration_since(std::time::UNIX_EPOCH)
757 .unwrap_or_default()
758 .as_millis() as u64;
759
760 reporter.on_event(&ProgressEvent::RunCompleted {
761 total_groups: report.groups.len(),
762 total_elapsed_ms: run_end.saturating_sub(run_start),
763 });
764
765 let _ = (phase_count, completed_phases); Ok(report)
768 }
769
770 async fn find_hash_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
772 let duplicates = self.database.find_duplicate_hashes().await?;
773 let mut groups = Vec::new();
774
775 for (hash, files) in duplicates {
776 if files.len() > 1 {
777 groups.push(DuplicateGroup {
778 files,
779 scores: vec![SimilarityScore {
780 method: "exact_hash".to_string(),
781 score: 1.0,
782 metadata: vec![("hash".to_string(), hash)],
783 }],
784 });
785 }
786 }
787
788 Ok(groups)
789 }
790
791 async fn find_perceptual_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
801 let max_hamming = ((1.0 - self.config.perceptual_threshold) * 64.0) as u32;
804
805 let stored = self.database.get_all_fingerprints_by_type("phash").await?;
807
808 let mut hashes: Vec<(String, visual::PerceptualHash)> = Vec::new();
810 for (path, hex) in stored {
811 if let Ok(value) = u64::from_str_radix(&hex, 16) {
812 hashes.push((path, visual::PerceptualHash::new(value, 64)));
813 }
814 }
815
816 if hashes.len() < 2 {
818 return Ok(Vec::new());
819 }
820
821 let hashes: Vec<(String, visual::PerceptualHash)> = if self.config.bloom_prescreen {
831 let raw: Vec<u64> = hashes.iter().map(|(_, ph)| ph.hash()).collect();
832 let prescreen = bloom_filter::prescreen_perceptual_hashes(
833 &raw,
834 16, self.config.bloom_capacity,
836 self.config.bloom_fpr,
837 );
838 prescreen
839 .candidates
840 .iter()
841 .filter_map(|&idx| hashes.get(idx).cloned())
842 .collect()
843 } else {
844 hashes
845 };
846
847 if hashes.len() < 2 {
849 return Ok(Vec::new());
850 }
851
852 if self.config.use_lsh {
853 self.find_perceptual_duplicates_lsh(&hashes, max_hamming)
854 } else {
855 group_by_pairwise_similarity(
856 &hashes,
857 max_hamming,
858 |h1, h2| h1.hamming_distance(h2),
859 |h1, h2| h1.similarity(h2),
860 "perceptual_hash",
861 )
862 }
863 }
864
865 fn find_perceptual_duplicates_lsh(
870 &self,
871 hashes: &[(String, visual::PerceptualHash)],
872 max_hamming: u32,
873 ) -> DedupResult<Vec<DuplicateGroup>> {
874 let id_hashes: Vec<(u64, u64)> = hashes
876 .iter()
877 .enumerate()
878 .map(|(i, (_, ph))| (i as u64, ph.hash()))
879 .collect();
880
881 let lsh_result = lsh_index::lsh_dedup_pass(
883 &id_hashes,
884 max_hamming,
885 self.config.lsh_num_tables,
886 self.config.lsh_bits_per_table,
887 self.config.lsh_seed,
888 );
889
890 let all_ids: Vec<u64> = (0..hashes.len() as u64).collect();
892 let groups = lsh_index::group_by_lsh_pairs(&lsh_result.pairs, &all_ids);
893
894 let mut result = Vec::new();
896 for group_ids in &groups {
897 let files: Vec<String> = group_ids
898 .iter()
899 .filter_map(|&id| hashes.get(id as usize).map(|(p, _)| p.clone()))
900 .collect();
901
902 if files.len() < 2 {
903 continue;
904 }
905
906 let mut best_sim = 0.0f64;
908 for i in 0..group_ids.len() {
909 for j in (i + 1)..group_ids.len() {
910 let ia = group_ids[i] as usize;
911 let ib = group_ids[j] as usize;
912 if let (Some((_, ha)), Some((_, hb))) = (hashes.get(ia), hashes.get(ib)) {
913 let sim = ha.similarity(hb);
914 if sim > best_sim {
915 best_sim = sim;
916 }
917 }
918 }
919 }
920
921 result.push(DuplicateGroup {
922 files,
923 scores: vec![SimilarityScore {
924 method: "perceptual_hash_lsh".to_string(),
925 score: best_sim,
926 metadata: vec![
927 (
928 "lsh_candidates".to_string(),
929 lsh_result.candidates_checked.to_string(),
930 ),
931 (
932 "comparison_ratio".to_string(),
933 format!("{:.4}", lsh_result.comparison_ratio()),
934 ),
935 ],
936 }],
937 });
938 }
939
940 Ok(result)
941 }
942
943 async fn find_ssim_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
952 let threshold = self.config.ssim_threshold;
953 let res = self.config.thumbnail_resolution.max(4);
954 let expected_bytes = res * res;
955
956 let stored = self
958 .database
959 .get_all_fingerprints_by_type("thumbnail")
960 .await?;
961
962 let mut images: Vec<(String, visual::Image)> = Vec::new();
964 for (path, hex) in stored {
965 let bytes = decode_hex_bytes(&hex)?;
966 if bytes.len() == expected_bytes {
968 if let Ok(img) = visual::Image::from_data(res, res, 1, bytes) {
969 images.push((path, img));
970 }
971 }
972 }
973
974 if images.len() < 2 {
975 return Ok(Vec::new());
976 }
977
978 let ssim_params = visual::SsimParams::default();
979 let mut groups: Vec<DuplicateGroup> = Vec::new();
980 let mut assigned = vec![false; images.len()];
981
982 for i in 0..images.len() {
983 if assigned[i] {
984 continue;
985 }
986 let mut group_files = vec![images[i].0.clone()];
987 let mut best_score = 0.0f64;
988
989 for j in (i + 1)..images.len() {
990 if assigned[j] {
991 continue;
992 }
993 let ssim = visual::compute_ssim(&images[i].1, &images[j].1, &ssim_params);
994 if ssim >= threshold {
995 group_files.push(images[j].0.clone());
996 assigned[j] = true;
997 if ssim > best_score {
998 best_score = ssim;
999 }
1000 }
1001 }
1002
1003 if group_files.len() > 1 {
1004 assigned[i] = true;
1005 groups.push(DuplicateGroup {
1006 files: group_files,
1007 scores: vec![SimilarityScore {
1008 method: "ssim".to_string(),
1009 score: best_score,
1010 metadata: Vec::new(),
1011 }],
1012 });
1013 }
1014 }
1015
1016 Ok(groups)
1017 }
1018
1019 async fn find_histogram_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1027 let threshold = self.config.histogram_threshold;
1028
1029 let stored = self
1030 .database
1031 .get_all_fingerprints_by_type("histogram")
1032 .await?;
1033
1034 let mut histograms: Vec<(String, Vec<Vec<u32>>)> = Vec::new();
1036 for (path, json_str) in stored {
1037 if let Ok(flat) = serde_json::from_str::<Vec<u32>>(&json_str) {
1038 if flat.len() % 256 == 0 && !flat.is_empty() {
1040 let channels = flat.len() / 256;
1041 let hist: Vec<Vec<u32>> = (0..channels)
1042 .map(|c| flat[c * 256..(c + 1) * 256].to_vec())
1043 .collect();
1044 histograms.push((path, hist));
1045 }
1046 }
1047 }
1048
1049 if histograms.len() < 2 {
1050 return Ok(Vec::new());
1051 }
1052
1053 let mut groups: Vec<DuplicateGroup> = Vec::new();
1054 let mut assigned = vec![false; histograms.len()];
1055
1056 for i in 0..histograms.len() {
1057 if assigned[i] {
1058 continue;
1059 }
1060 let mut group_files = vec![histograms[i].0.clone()];
1061 let mut best_score = 0.0f64;
1062
1063 for j in (i + 1)..histograms.len() {
1064 if assigned[j] {
1065 continue;
1066 }
1067 let corr = visual::compare_histograms(&histograms[i].1, &histograms[j].1);
1068 if corr >= threshold {
1069 group_files.push(histograms[j].0.clone());
1070 assigned[j] = true;
1071 if corr > best_score {
1072 best_score = corr;
1073 }
1074 }
1075 }
1076
1077 if group_files.len() > 1 {
1078 assigned[i] = true;
1079 groups.push(DuplicateGroup {
1080 files: group_files,
1081 scores: vec![SimilarityScore {
1082 method: "histogram".to_string(),
1083 score: best_score,
1084 metadata: Vec::new(),
1085 }],
1086 });
1087 }
1088 }
1089
1090 Ok(groups)
1091 }
1092
1093 async fn find_feature_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1101 let threshold = self.config.perceptual_threshold;
1102
1103 let stored = self
1104 .database
1105 .get_all_fingerprints_by_type("feature_vector")
1106 .await?;
1107
1108 let mut vectors: Vec<(String, Vec<f64>)> = Vec::new();
1110 for (path, json_str) in stored {
1111 if let Ok(vec) = serde_json::from_str::<Vec<f64>>(&json_str) {
1112 if !vec.is_empty() {
1113 vectors.push((path, vec));
1114 }
1115 }
1116 }
1117
1118 if vectors.len() < 2 {
1119 return Ok(Vec::new());
1120 }
1121
1122 let mut groups: Vec<DuplicateGroup> = Vec::new();
1123 let mut assigned = vec![false; vectors.len()];
1124
1125 for i in 0..vectors.len() {
1126 if assigned[i] {
1127 continue;
1128 }
1129 let mut group_files = vec![vectors[i].0.clone()];
1130 let mut best_score = 0.0f64;
1131
1132 for j in (i + 1)..vectors.len() {
1133 if assigned[j] {
1134 continue;
1135 }
1136 let sim = cosine_similarity(&vectors[i].1, &vectors[j].1);
1137 if sim >= threshold {
1138 group_files.push(vectors[j].0.clone());
1139 assigned[j] = true;
1140 if sim > best_score {
1141 best_score = sim;
1142 }
1143 }
1144 }
1145
1146 if group_files.len() > 1 {
1147 assigned[i] = true;
1148 groups.push(DuplicateGroup {
1149 files: group_files,
1150 scores: vec![SimilarityScore {
1151 method: "feature_vector".to_string(),
1152 score: best_score,
1153 metadata: Vec::new(),
1154 }],
1155 });
1156 }
1157 }
1158
1159 Ok(groups)
1160 }
1161
1162 async fn find_audio_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1169 let threshold = self.config.audio_threshold;
1170
1171 let stored = self
1172 .database
1173 .get_all_fingerprints_by_type("audio_fingerprint")
1174 .await?;
1175
1176 let mut fingerprints: Vec<(String, audio::AudioFingerprint)> = Vec::new();
1178 for (path, hex) in stored {
1179 let bytes = decode_hex_bytes(&hex)?;
1180 if !bytes.is_empty() {
1181 fingerprints.push((path, audio::AudioFingerprint::new(bytes, 11025, 0.0)));
1182 }
1183 }
1184
1185 if fingerprints.len() < 2 {
1186 return Ok(Vec::new());
1187 }
1188
1189 let mut groups: Vec<DuplicateGroup> = Vec::new();
1190 let mut assigned = vec![false; fingerprints.len()];
1191
1192 for i in 0..fingerprints.len() {
1193 if assigned[i] {
1194 continue;
1195 }
1196 let mut group_files = vec![fingerprints[i].0.clone()];
1197 let mut best_score = 0.0f64;
1198
1199 for j in (i + 1)..fingerprints.len() {
1200 if assigned[j] {
1201 continue;
1202 }
1203 let sim = fingerprints[i].1.similarity(&fingerprints[j].1);
1204 if sim >= threshold {
1205 group_files.push(fingerprints[j].0.clone());
1206 assigned[j] = true;
1207 if sim > best_score {
1208 best_score = sim;
1209 }
1210 }
1211 }
1212
1213 if group_files.len() > 1 {
1214 assigned[i] = true;
1215 groups.push(DuplicateGroup {
1216 files: group_files,
1217 scores: vec![SimilarityScore {
1218 method: "audio_fingerprint".to_string(),
1219 score: best_score,
1220 metadata: Vec::new(),
1221 }],
1222 });
1223 }
1224 }
1225
1226 Ok(groups)
1227 }
1228
1229 async fn find_metadata_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1242 use metadata::{compare_metadata, MediaMetadata};
1243 use std::path::PathBuf;
1244
1245 let threshold = self.config.metadata_threshold;
1246
1247 let rows = self.database.get_all_files_with_metadata().await?;
1248
1249 if rows.len() < 2 {
1250 return Ok(Vec::new());
1251 }
1252
1253 let media_meta: Vec<MediaMetadata> = rows
1255 .iter()
1256 .map(
1257 |(path, duration, width, height, video_codec, audio_codec, container)| {
1258 let fs_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1259 let mut m = MediaMetadata::new(PathBuf::from(path), fs_size);
1260 m.duration = *duration;
1261 m.width = width.map(|v| v as u32);
1262 m.height = height.map(|v| v as u32);
1263 m.video_codec = video_codec.clone();
1264 m.audio_codec = audio_codec.clone();
1265 m.container = container.clone();
1266 m
1267 },
1268 )
1269 .collect();
1270
1271 let paths: Vec<String> = rows.iter().map(|(p, ..)| p.clone()).collect();
1272
1273 let mut groups: Vec<DuplicateGroup> = Vec::new();
1274 let mut assigned = vec![false; media_meta.len()];
1275
1276 for i in 0..media_meta.len() {
1277 if assigned[i] {
1278 continue;
1279 }
1280 let mut group_files = vec![paths[i].clone()];
1281 let mut best_score = 0.0f64;
1282 let mut best_duration_diff: Option<f64> = None;
1283
1284 for j in (i + 1)..media_meta.len() {
1285 if assigned[j] {
1286 continue;
1287 }
1288
1289 let duration_ok = match (media_meta[i].duration, media_meta[j].duration) {
1292 (Some(d1), Some(d2)) => (d1 - d2).abs() <= 1.0,
1293 _ => true, };
1295 if !duration_ok {
1296 continue;
1297 }
1298
1299 let sim = compare_metadata(&media_meta[i], &media_meta[j]);
1300 let score = sim.overall_score();
1301 if score >= threshold {
1302 group_files.push(paths[j].clone());
1303 assigned[j] = true;
1304 if score > best_score {
1305 best_score = score;
1306 best_duration_diff = match (media_meta[i].duration, media_meta[j].duration)
1307 {
1308 (Some(d1), Some(d2)) => Some((d1 - d2).abs()),
1309 _ => None,
1310 };
1311 }
1312 }
1313 }
1314
1315 if group_files.len() > 1 {
1316 assigned[i] = true;
1317 let mut score_entry = SimilarityScore {
1318 method: "metadata".to_string(),
1319 score: best_score,
1320 metadata: Vec::new(),
1321 };
1322 if let Some(diff) = best_duration_diff {
1323 score_entry
1324 .metadata
1325 .push(("duration_diff_secs".to_string(), format!("{diff:.3}")));
1326 }
1327 groups.push(DuplicateGroup {
1328 files: group_files,
1329 scores: vec![score_entry],
1330 });
1331 }
1332 }
1333
1334 Ok(groups)
1335 }
1336
1337 pub async fn get_stats(&self) -> DedupResult<DedupStats> {
1343 let total_files = self.database.count_files().await?;
1344 let total_hashes = self.database.count_unique_hashes().await?;
1345
1346 Ok(DedupStats {
1347 total_files,
1348 total_hashes,
1349 duplicate_files: total_files.saturating_sub(total_hashes),
1350 })
1351 }
1352
1353 pub async fn close(self) -> DedupResult<()> {
1355 self.database.close().await?;
1356 Ok(())
1357 }
1358
1359 #[must_use]
1371 pub fn might_be_duplicate(&self, hash_bytes: &[u8]) -> bool {
1372 match &self.bloom {
1373 Some(bloom) => bloom.contains(hash_bytes),
1374 None => true,
1375 }
1376 }
1377
1378 pub fn reset_bloom(&mut self) {
1384 if let Some(ref mut bloom) = self.bloom {
1385 bloom.clear();
1386 }
1387 }
1388}
1389
/// Summary counters returned by `DuplicateDetector::get_stats`.
#[derive(Clone, Debug)]
pub struct DedupStats {
    /// Number of files recorded in the database.
    pub total_files: usize,

    /// Number of distinct file hashes recorded in the database.
    pub total_hashes: usize,

    /// `total_files` minus `total_hashes`, saturating at zero.
    pub duplicate_files: usize,
}
1402
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a database path in the system temp directory. The file name is
    /// `prefix`, then the sub-second nanoseconds of the current time, then `.db`.
    #[cfg(feature = "sqlite")]
    fn scratch_db_path(prefix: &str) -> PathBuf {
        let dir = std::env::temp_dir();
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .subsec_nanos();
        dir.join(format!("{prefix}{nanos}.db"))
    }

    /// Default configuration pointed at `database_path`, with a 1000-entry
    /// bloom filter at a 1% false-positive rate enabled.
    #[cfg(feature = "sqlite")]
    fn bloom_enabled_config(database_path: PathBuf) -> DedupConfig {
        DedupConfig {
            database_path,
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        }
    }

    /// Presets and single-method strategies report the expected phases.
    #[test]
    fn test_detection_strategy() {
        let exact = DetectionStrategy::ExactHash;
        assert!(exact.includes_hash());
        assert!(!exact.includes_perceptual());

        let all = DetectionStrategy::All;
        assert!(all.includes_hash());
        assert!(all.includes_perceptual());
        assert!(all.includes_audio());

        let fast = DetectionStrategy::Fast;
        assert!(fast.includes_hash());
        assert!(fast.includes_perceptual());
        assert!(!fast.includes_ssim());
    }

    /// Core thresholds of the default configuration.
    #[test]
    fn test_config_default() {
        let defaults = DedupConfig::default();
        assert_eq!(defaults.perceptual_threshold, 0.95);
        assert_eq!(defaults.ssim_threshold, 0.90);
        assert!(defaults.parallel);
    }

    /// LSH settings of the default configuration.
    #[test]
    fn test_config_lsh_defaults() {
        let defaults = DedupConfig::default();
        assert!(defaults.use_lsh);
        assert_eq!(defaults.lsh_num_tables, 8);
        assert_eq!(defaults.lsh_bits_per_table, 8);
        assert_eq!(defaults.lsh_seed, 42);
    }

    /// Bloom settings of the default configuration.
    #[test]
    fn test_config_bloom_defaults() {
        let defaults = DedupConfig::default();
        assert!(!defaults.bloom_prescreen);
        assert_eq!(defaults.bloom_capacity, 10_000);
        assert!((defaults.bloom_fpr - 0.01f32).abs() < f32::EPSILON);
    }

    /// An empty path slice indexes nothing and reports no errors.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_empty_slice() {
        let db_path = scratch_db_path("oxidedup_test_par_");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        // The test is a no-op when the database cannot be opened.
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let no_paths: &[PathBuf] = &[];
            let errors = detector
                .par_index_files(no_paths)
                .await
                .expect("par_index_files should succeed on empty input");
            assert!(errors.is_empty(), "No errors expected for empty input");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// Missing files are reported per path without failing the call.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_nonexistent_paths() {
        let db_path = scratch_db_path("oxidedup_test_par_ne_");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let missing = vec![
                PathBuf::from("/nonexistent/path/a.mp4"),
                PathBuf::from("/nonexistent/path/b.mp4"),
            ];
            let errors = detector
                .par_index_files(&missing)
                .await
                .expect("par_index_files should return Ok even when files are missing");
            assert_eq!(errors.len(), 2, "Should have one error per missing file");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// Without a bloom filter every hash is a potential duplicate.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_no_bloom_always_true() {
        let db_path = scratch_db_path("oxidedup_bloom_noscreen_");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: false,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            assert!(
                detector.might_be_duplicate(b"some_hash_bytes"),
                "Should always return true when bloom is disabled"
            );
            assert!(
                detector.might_be_duplicate(b""),
                "Empty bytes: should return true without bloom"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// A fresh bloom filter does not claim to contain an unseen hash.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_with_bloom_unknown_hash() {
        let db_path = scratch_db_path("oxidedup_bloom_unknown_");
        let config = bloom_enabled_config(db_path.clone());
        if let Ok(detector) = DuplicateDetector::new(config).await {
            assert!(
                !detector.might_be_duplicate(b"never_inserted_hash"),
                "Unknown hash should return false from a fresh bloom filter"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// `reset_bloom` forgets hashes that were previously inserted.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_reset_bloom_clears_state() {
        let db_path = scratch_db_path("oxidedup_bloom_reset_");
        let config = bloom_enabled_config(db_path.clone());
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            if let Some(filter) = detector.bloom.as_mut() {
                filter.insert(b"known_hash");
            }
            assert!(
                detector.might_be_duplicate(b"known_hash"),
                "After insert, bloom should report potential duplicate"
            );

            detector.reset_bloom();
            assert!(
                !detector.might_be_duplicate(b"known_hash"),
                "After reset_bloom, hash should not be found"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }
}