1#![warn(missing_docs)]
86#![allow(clippy::module_name_repetitions)]
87#![allow(clippy::similar_names)]
88#![allow(clippy::cast_possible_truncation)]
89#![allow(clippy::cast_sign_loss)]
90#![allow(clippy::cast_precision_loss)]
91#![allow(clippy::too_many_arguments)]
92#![allow(dead_code)]
93
94pub mod audio;
95pub mod audio_fingerprint;
96pub mod bloom_filter;
97pub mod bloom_prescreen;
98pub mod chromagram;
99pub mod cluster;
100pub mod content_id;
101pub mod content_signature;
102pub mod cross_format;
103#[cfg(feature = "sqlite")]
104pub mod database;
105pub mod dedup_cache;
106pub mod dedup_index;
107pub mod dedup_policy;
108pub mod dedup_queue;
109pub mod dedup_report;
110pub mod dedup_report_detailed;
111pub mod dedup_report_ext;
112pub mod dedup_stats;
113pub mod exact_match;
114pub mod frame_hash;
115pub mod fuzzy_match;
116pub mod hash;
117pub mod hash_store;
118pub mod hierarchical;
119pub mod incremental;
120pub mod lsh_index;
121pub mod merge_strategy;
122pub mod metadata;
123pub mod minhash;
124pub mod near_duplicate;
125pub mod near_duplicate_cluster;
126pub mod network_dedup;
127pub mod parallel_indexer;
128pub mod perceptual_hash;
129pub mod persistent_cache;
130pub mod phash;
131pub mod progress;
132pub mod report;
133pub mod rolling_hash;
134pub mod segment_dedup;
135pub mod signature_store;
136pub mod similarity_index;
137pub mod space_savings;
138pub mod stream_dedup;
139pub mod video_dedup;
140pub mod video_dedup_pipeline;
141pub mod video_segment_dedup;
142pub mod visual;
143
144#[cfg(feature = "sqlite")]
145use std::path::Path;
146use std::path::PathBuf;
147use thiserror::Error;
148
149#[cfg(feature = "sqlite")]
150pub use database::DedupDatabase;
151pub use merge_strategy::{AppliedAction, MergeExecutor, MergeReport};
152pub use report::{DuplicateGroup, DuplicateReport, SimilarityScore};
153
/// Decodes a hexadecimal string into raw bytes.
///
/// Every pair of characters becomes one byte, high nibble first. Upper- and
/// lower-case digits are both accepted. An empty string decodes to an empty
/// vector.
///
/// # Errors
///
/// Returns [`DedupError::Hash`] when the input has an odd length or contains
/// anything other than ASCII hexadecimal digits.
#[cfg(feature = "sqlite")]
fn decode_hex_bytes(hex: &str) -> DedupResult<Vec<u8>> {
    /// Numeric value of one ASCII hex digit, or `None` for any other byte.
    fn nibble(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }

    if hex.len() % 2 != 0 {
        return Err(DedupError::Hash(format!(
            "odd-length hex string: len={}",
            hex.len()
        )));
    }

    // Decode from the raw bytes rather than `str` sub-slices: slicing a `str`
    // panics when a two-byte window splits a multi-byte UTF-8 character, and
    // `u8::from_str_radix` would additionally accept a leading `+` sign.
    hex.as_bytes()
        .chunks_exact(2)
        .enumerate()
        .map(|(pair_idx, pair)| match (nibble(pair[0]), nibble(pair[1])) {
            (Some(high), Some(low)) => Ok((high << 4) | low),
            _ => Err(DedupError::Hash(format!(
                "invalid hex byte at {}: invalid digit found in string",
                pair_idx * 2
            ))),
        })
        .collect()
}
180
181#[cfg(feature = "sqlite")]
/// Cosine similarity between two vectors of `f64` components.
///
/// The dot product pairs components positionally and stops at the shorter
/// slice, while each magnitude is taken over the full slice. Returns `0.0`
/// when either magnitude is smaller than `f64::EPSILON` (which includes
/// empty and all-zero inputs).
fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
    let magnitude = |v: &[f64]| v.iter().map(|c| c * c).sum::<f64>().sqrt();

    let dot = a.iter().zip(b).map(|(lhs, rhs)| lhs * rhs).sum::<f64>();
    let (norm_a, norm_b) = (magnitude(a), magnitude(b));

    // Guard against dividing by a (near-)zero norm.
    let degenerate = norm_a < f64::EPSILON || norm_b < f64::EPSILON;
    if degenerate {
        0.0
    } else {
        dot / (norm_a * norm_b)
    }
}
194
/// Greedily clusters `items` by pairwise distance to an anchor element.
///
/// Items are visited in order. Each item that has not yet been claimed by an
/// earlier cluster becomes an anchor, and every later unclaimed item whose
/// `dist_fn` distance to the anchor is at most `max_distance` joins the
/// anchor's cluster. Members are compared against the anchor only, never
/// against each other. Anchors that attract no members produce no group.
///
/// Each emitted group carries a single [`SimilarityScore`] labelled `method`
/// whose score is the largest `sim_fn` value observed between the anchor and
/// a member (starting from `0.0`).
///
/// # Errors
///
/// The current implementation always returns `Ok`.
#[cfg(feature = "sqlite")]
fn group_by_pairwise_similarity<H, FDist, FSim>(
    items: &[(String, H)],
    max_distance: u32,
    dist_fn: FDist,
    sim_fn: FSim,
    method: &str,
) -> DedupResult<Vec<DuplicateGroup>>
where
    FDist: Fn(&H, &H) -> u32,
    FSim: Fn(&H, &H) -> f64,
{
    let mut claimed = vec![false; items.len()];
    let mut clusters = Vec::new();

    for (anchor_idx, (anchor_path, anchor_hash)) in items.iter().enumerate() {
        if claimed[anchor_idx] {
            continue;
        }

        let mut members = vec![anchor_path.clone()];
        let mut top_score = 0.0_f64;

        for (candidate_idx, (path, hash)) in items.iter().enumerate().skip(anchor_idx + 1) {
            if claimed[candidate_idx] || dist_fn(anchor_hash, hash) > max_distance {
                continue;
            }
            let score = sim_fn(anchor_hash, hash);
            members.push(path.clone());
            claimed[candidate_idx] = true;
            if score > top_score {
                top_score = score;
            }
        }

        // An anchor on its own is not a duplicate group.
        if members.len() < 2 {
            continue;
        }

        claimed[anchor_idx] = true;
        clusters.push(DuplicateGroup {
            files: members,
            scores: vec![SimilarityScore {
                method: method.to_owned(),
                score: top_score,
                metadata: Vec::new(),
            }],
        });
    }

    Ok(clusters)
}
252
/// Errors produced by the deduplication crate.
#[derive(Error, Debug)]
pub enum DedupError {
    /// An I/O operation failed; converted automatically from [`std::io::Error`].
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Database failure reported by `sqlx` (variant shape used when the
    /// `sqlite` feature is enabled).
    #[cfg(feature = "sqlite")]
    #[error("Database error: {0}")]
    Database(#[from] sqlx::Error),

    /// Database failure described by a plain message (variant shape used when
    /// the `sqlite` feature is disabled).
    #[cfg(not(feature = "sqlite"))]
    #[error("Database error: {0}")]
    Database(String),

    /// Hash computation or hash decoding failed.
    #[error("Hashing error: {0}")]
    Hash(String),

    /// Visual (image/frame) processing failed.
    #[error("Visual processing error: {0}")]
    Visual(String),

    /// Audio processing failed.
    #[error("Audio processing error: {0}")]
    Audio(String),

    /// Metadata processing failed.
    #[error("Metadata processing error: {0}")]
    Metadata(String),

    /// The given path does not exist.
    #[error("File not found: {0}")]
    FileNotFound(PathBuf),

    /// A configuration value was rejected.
    #[error("Invalid configuration: {0}")]
    InvalidConfig(String),

    /// Error propagated from `oximedia_core`; converted automatically from
    /// `oximedia_core::OxiError`.
    #[error("OxiMedia core error: {0}")]
    Core(#[from] oximedia_core::OxiError),
}
298
/// Convenience alias for results whose error type is [`DedupError`].
pub type DedupResult<T> = Result<T, DedupError>;
301
/// Selects which duplicate-detection methods a run should execute.
///
/// The single-method variants enable exactly one detector; the composite
/// variants ([`All`](Self::All), [`VisualAll`](Self::VisualAll),
/// [`Fast`](Self::Fast)) enable several, as reported by the `includes_*`
/// predicates.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Exact file-hash comparison only.
    ExactHash,

    /// Perceptual-hash comparison only.
    PerceptualHash,

    /// SSIM comparison only.
    Ssim,

    /// Histogram comparison only.
    Histogram,

    /// Feature matching only.
    FeatureMatch,

    /// Audio-fingerprint comparison only.
    AudioFingerprint,

    /// Metadata comparison only.
    Metadata,

    /// Every detection method.
    All,

    /// All visual methods: perceptual hash, SSIM, histogram and feature match.
    VisualAll,

    /// Exact hash, perceptual hash and metadata.
    Fast,
}

impl DetectionStrategy {
    /// Returns `true` if this strategy runs exact-hash detection.
    #[must_use]
    pub fn includes_hash(self) -> bool {
        match self {
            Self::ExactHash | Self::All | Self::Fast => true,
            Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::VisualAll => false,
        }
    }

    /// Returns `true` if this strategy runs perceptual-hash detection.
    #[must_use]
    pub fn includes_perceptual(self) -> bool {
        match self {
            Self::PerceptualHash | Self::All | Self::VisualAll | Self::Fast => true,
            Self::ExactHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata => false,
        }
    }

    /// Returns `true` if this strategy runs SSIM detection.
    #[must_use]
    pub fn includes_ssim(self) -> bool {
        match self {
            Self::Ssim | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Returns `true` if this strategy runs histogram detection.
    #[must_use]
    pub fn includes_histogram(self) -> bool {
        match self {
            Self::Histogram | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Returns `true` if this strategy runs feature-match detection.
    #[must_use]
    pub fn includes_feature_match(self) -> bool {
        match self {
            Self::FeatureMatch | Self::All | Self::VisualAll => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::AudioFingerprint
            | Self::Metadata
            | Self::Fast => false,
        }
    }

    /// Returns `true` if this strategy runs audio-fingerprint detection.
    #[must_use]
    pub fn includes_audio(self) -> bool {
        match self {
            Self::AudioFingerprint | Self::All => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::Metadata
            | Self::VisualAll
            | Self::Fast => false,
        }
    }

    /// Returns `true` if this strategy runs metadata detection.
    #[must_use]
    pub fn includes_metadata(self) -> bool {
        match self {
            Self::Metadata | Self::All | Self::Fast => true,
            Self::ExactHash
            | Self::PerceptualHash
            | Self::Ssim
            | Self::Histogram
            | Self::FeatureMatch
            | Self::AudioFingerprint
            | Self::VisualAll => false,
        }
    }
}
382
/// Tuning parameters for duplicate detection.
#[derive(Debug, Clone)]
pub struct DedupConfig {
    /// Path of the database file opened by the detector.
    pub database_path: PathBuf,

    /// Perceptual-hash similarity cutoff. The detector converts it to a
    /// maximum Hamming distance of `(1 - threshold) * 64` bits and also uses
    /// it as the cosine-similarity cutoff for feature vectors.
    pub perceptual_threshold: f64,

    /// Minimum SSIM value for two thumbnails to be grouped.
    pub ssim_threshold: f64,

    /// Minimum histogram comparison score for two files to be grouped.
    pub histogram_threshold: f64,

    /// Feature-match threshold. NOTE(review): not read anywhere in this
    /// file; presumably a minimum match count — confirm against its consumer.
    pub feature_match_threshold: usize,

    /// Minimum audio-fingerprint similarity for two files to be grouped.
    pub audio_threshold: f64,

    /// Minimum overall metadata score for two files to be grouped.
    pub metadata_threshold: f64,

    /// Parallel-processing switch. NOTE(review): not read in this file —
    /// confirm against its consumer.
    pub parallel: bool,

    /// Number of frames to sample. NOTE(review): not read in this file —
    /// confirm against its consumer.
    pub sample_frames: usize,

    /// Chunk size. NOTE(review): not read in this file; units unconfirmed.
    pub chunk_size: usize,

    /// Side length of the square thumbnails compared by SSIM. Values below 4
    /// are raised to 4, and stored thumbnails must be exactly `res * res`
    /// bytes long.
    pub thumbnail_resolution: usize,

    /// Enables the Bloom filter: one is created with the detector, and
    /// perceptual hashes are pre-screened before pairwise comparison.
    pub bloom_prescreen: bool,

    /// First argument to `BloomFilter::new` (presumably the expected number
    /// of items — confirm against `bloom_filter`).
    pub bloom_capacity: usize,

    /// Second argument to `BloomFilter::new` (presumably the target
    /// false-positive rate — confirm against `bloom_filter`).
    pub bloom_fpr: f32,

    /// Use the LSH index for perceptual-hash grouping instead of the
    /// all-pairs comparison.
    pub use_lsh: bool,

    /// Number of LSH tables passed to the LSH pass.
    pub lsh_num_tables: usize,

    /// Bits per LSH table passed to the LSH pass.
    pub lsh_bits_per_table: usize,

    /// Seed passed to the LSH pass.
    pub lsh_seed: u64,
}

impl Default for DedupConfig {
    /// Builds the stock configuration: database at `oximedia_dedup.db`,
    /// Bloom pre-screening off, LSH on.
    fn default() -> Self {
        Self {
            // Storage.
            database_path: "oximedia_dedup.db".into(),

            // Per-method similarity cutoffs.
            perceptual_threshold: 0.95,
            ssim_threshold: 0.90,
            histogram_threshold: 0.85,
            audio_threshold: 0.90,
            metadata_threshold: 0.80,
            feature_match_threshold: 50,

            // Processing knobs.
            parallel: true,
            sample_frames: 10,
            chunk_size: 4096,
            thumbnail_resolution: 8,

            // Bloom-filter pre-screening.
            bloom_prescreen: false,
            bloom_capacity: 10_000,
            bloom_fpr: 0.01,

            // Locality-sensitive hashing.
            use_lsh: true,
            lsh_num_tables: 8,
            lsh_bits_per_table: 8,
            lsh_seed: 42,
        }
    }
}
477
/// Duplicate detector backed by a [`DedupDatabase`].
///
/// Only available when the `sqlite` feature is enabled.
#[cfg(feature = "sqlite")]
pub struct DuplicateDetector {
    /// Thresholds and tuning knobs supplied at construction.
    config: DedupConfig,
    /// Database handle opened from `config.database_path`.
    database: DedupDatabase,
    /// Bloom filter of indexed file hashes; `Some` only when
    /// `config.bloom_prescreen` was set at construction.
    bloom: Option<bloom_filter::BloomFilter>,
}
490
491#[cfg(feature = "sqlite")]
492impl DuplicateDetector {
493 pub async fn new(config: DedupConfig) -> DedupResult<Self> {
505 let database = DedupDatabase::open(&config.database_path).await?;
506 let bloom = if config.bloom_prescreen {
507 Some(bloom_filter::BloomFilter::new(
508 config.bloom_capacity,
509 config.bloom_fpr,
510 ))
511 } else {
512 None
513 };
514 Ok(Self {
515 config,
516 database,
517 bloom,
518 })
519 }
520
521 pub async fn add_file(&mut self, path: impl AsRef<Path>) -> DedupResult<()> {
531 let path = path.as_ref();
532 if !path.exists() {
533 return Err(DedupError::FileNotFound(path.to_path_buf()));
534 }
535
536 let file_hash = hash::compute_file_hash(path)?;
538
539 if let Some(ref mut bloom) = self.bloom {
541 bloom.insert(file_hash.as_bytes());
542 }
543
544 self.database.insert_file(path, &file_hash.to_hex()).await?;
546
547 Ok(())
548 }
549
550 pub async fn add_files(&mut self, paths: &[impl AsRef<Path>]) -> DedupResult<Vec<String>> {
556 let mut errors = Vec::new();
557
558 for path in paths {
559 if let Err(e) = self.add_file(path).await {
560 errors.push(format!("{}: {}", path.as_ref().display(), e));
561 }
562 }
563
564 Ok(errors)
565 }
566
567 pub async fn par_index_files<P>(&mut self, paths: &[P]) -> DedupResult<Vec<String>>
583 where
584 P: AsRef<Path> + Sync,
585 {
586 use rayon::prelude::*;
587
588 let hash_results: Vec<(PathBuf, DedupResult<hash::FileHash>)> = paths
590 .par_iter()
591 .map(|p| {
592 let path = p.as_ref().to_path_buf();
593 if !path.exists() {
594 return (path.clone(), Err(DedupError::FileNotFound(path)));
595 }
596 let result = hash::compute_file_hash(&path);
597 (path, result)
598 })
599 .collect();
600
601 let mut errors = Vec::new();
603 for (path, result) in hash_results {
604 match result {
605 Ok(file_hash) => {
606 if let Err(e) = self.database.insert_file(&path, &file_hash.to_hex()).await {
607 errors.push(format!("{}: {}", path.display(), e));
608 }
609 }
610 Err(e) => {
611 errors.push(format!("{}: {}", path.display(), e));
612 }
613 }
614 }
615
616 Ok(errors)
617 }
618
619 pub async fn find_duplicates(
625 &self,
626 strategy: DetectionStrategy,
627 ) -> DedupResult<DuplicateReport> {
628 self.find_duplicates_with_progress(strategy, &progress::NullReporter)
629 .await
630 }
631
632 pub async fn find_duplicates_with_progress(
643 &self,
644 strategy: DetectionStrategy,
645 reporter: &dyn progress::ProgressReporter,
646 ) -> DedupResult<DuplicateReport> {
647 use progress::{ProgressEvent, ProgressTracker};
648
649 let run_start = std::time::SystemTime::now()
650 .duration_since(std::time::UNIX_EPOCH)
651 .unwrap_or_default()
652 .as_millis() as u64;
653
654 let mut report = DuplicateReport::new();
655
656 let phase_count = [
658 strategy.includes_hash(),
659 strategy.includes_perceptual(),
660 strategy.includes_ssim(),
661 strategy.includes_histogram(),
662 strategy.includes_feature_match(),
663 strategy.includes_audio(),
664 strategy.includes_metadata(),
665 ]
666 .iter()
667 .filter(|&&b| b)
668 .count();
669
670 let mut completed_phases = 0usize;
671
672 if strategy.includes_hash() {
674 if reporter.is_cancelled() {
675 return Ok(report);
676 }
677 let mut tracker = ProgressTracker::new(reporter, "exact_hash", 0);
678 let hash_dups = self.find_hash_duplicates().await?;
679 tracker.tick_batch(1);
680 let groups_found = hash_dups.len();
681 report.add_groups(hash_dups);
682 tracker.complete(groups_found);
683 completed_phases += 1;
684 }
685
686 if strategy.includes_perceptual() {
688 if reporter.is_cancelled() {
689 return Ok(report);
690 }
691 let mut tracker = ProgressTracker::new(reporter, "perceptual_hash", 0);
692 let perceptual_dups = self.find_perceptual_duplicates().await?;
693 tracker.tick_batch(1);
694 let groups_found = perceptual_dups.len();
695 report.add_groups(perceptual_dups);
696 tracker.complete(groups_found);
697 completed_phases += 1;
698 }
699
700 if strategy.includes_ssim() {
702 if reporter.is_cancelled() {
703 return Ok(report);
704 }
705 let mut tracker = ProgressTracker::new(reporter, "ssim", 0);
706 let ssim_dups = self.find_ssim_duplicates().await?;
707 tracker.tick_batch(1);
708 let groups_found = ssim_dups.len();
709 report.add_groups(ssim_dups);
710 tracker.complete(groups_found);
711 completed_phases += 1;
712 }
713
714 if strategy.includes_histogram() {
716 if reporter.is_cancelled() {
717 return Ok(report);
718 }
719 let mut tracker = ProgressTracker::new(reporter, "histogram", 0);
720 let histogram_dups = self.find_histogram_duplicates().await?;
721 tracker.tick_batch(1);
722 let groups_found = histogram_dups.len();
723 report.add_groups(histogram_dups);
724 tracker.complete(groups_found);
725 completed_phases += 1;
726 }
727
728 if strategy.includes_feature_match() {
730 if reporter.is_cancelled() {
731 return Ok(report);
732 }
733 let mut tracker = ProgressTracker::new(reporter, "feature_match", 0);
734 let feature_dups = self.find_feature_duplicates().await?;
735 tracker.tick_batch(1);
736 let groups_found = feature_dups.len();
737 report.add_groups(feature_dups);
738 tracker.complete(groups_found);
739 completed_phases += 1;
740 }
741
742 if strategy.includes_audio() {
744 if reporter.is_cancelled() {
745 return Ok(report);
746 }
747 let mut tracker = ProgressTracker::new(reporter, "audio_fingerprint", 0);
748 let audio_dups = self.find_audio_duplicates().await?;
749 tracker.tick_batch(1);
750 let groups_found = audio_dups.len();
751 report.add_groups(audio_dups);
752 tracker.complete(groups_found);
753 completed_phases += 1;
754 }
755
756 if strategy.includes_metadata() {
758 if reporter.is_cancelled() {
759 return Ok(report);
760 }
761 let mut tracker = ProgressTracker::new(reporter, "metadata", 0);
762 let metadata_dups = self.find_metadata_duplicates().await?;
763 tracker.tick_batch(1);
764 let groups_found = metadata_dups.len();
765 report.add_groups(metadata_dups);
766 tracker.complete(groups_found);
767 completed_phases += 1;
768 }
769
770 let run_end = std::time::SystemTime::now()
772 .duration_since(std::time::UNIX_EPOCH)
773 .unwrap_or_default()
774 .as_millis() as u64;
775
776 reporter.on_event(&ProgressEvent::RunCompleted {
777 total_groups: report.groups.len(),
778 total_elapsed_ms: run_end.saturating_sub(run_start),
779 });
780
781 let _ = (phase_count, completed_phases); Ok(report)
784 }
785
786 async fn find_hash_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
788 let duplicates = self.database.find_duplicate_hashes().await?;
789 let mut groups = Vec::new();
790
791 for (hash, files) in duplicates {
792 if files.len() > 1 {
793 groups.push(DuplicateGroup {
794 files,
795 scores: vec![SimilarityScore {
796 method: "exact_hash".to_string(),
797 score: 1.0,
798 metadata: vec![("hash".to_string(), hash)],
799 }],
800 });
801 }
802 }
803
804 Ok(groups)
805 }
806
807 async fn find_perceptual_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
817 let max_hamming = ((1.0 - self.config.perceptual_threshold) * 64.0) as u32;
820
821 let stored = self.database.get_all_fingerprints_by_type("phash").await?;
823
824 let mut hashes: Vec<(String, visual::PerceptualHash)> = Vec::new();
826 for (path, hex) in stored {
827 if let Ok(value) = u64::from_str_radix(&hex, 16) {
828 hashes.push((path, visual::PerceptualHash::new(value, 64)));
829 }
830 }
831
832 if hashes.len() < 2 {
834 return Ok(Vec::new());
835 }
836
837 let hashes: Vec<(String, visual::PerceptualHash)> = if self.config.bloom_prescreen {
847 let raw: Vec<u64> = hashes.iter().map(|(_, ph)| ph.hash()).collect();
848 let prescreen = bloom_filter::prescreen_perceptual_hashes(
849 &raw,
850 16, self.config.bloom_capacity,
852 self.config.bloom_fpr,
853 );
854 prescreen
855 .candidates
856 .iter()
857 .filter_map(|&idx| hashes.get(idx).cloned())
858 .collect()
859 } else {
860 hashes
861 };
862
863 if hashes.len() < 2 {
865 return Ok(Vec::new());
866 }
867
868 if self.config.use_lsh {
869 self.find_perceptual_duplicates_lsh(&hashes, max_hamming)
870 } else {
871 group_by_pairwise_similarity(
872 &hashes,
873 max_hamming,
874 |h1, h2| h1.hamming_distance(h2),
875 |h1, h2| h1.similarity(h2),
876 "perceptual_hash",
877 )
878 }
879 }
880
881 fn find_perceptual_duplicates_lsh(
886 &self,
887 hashes: &[(String, visual::PerceptualHash)],
888 max_hamming: u32,
889 ) -> DedupResult<Vec<DuplicateGroup>> {
890 let id_hashes: Vec<(u64, u64)> = hashes
892 .iter()
893 .enumerate()
894 .map(|(i, (_, ph))| (i as u64, ph.hash()))
895 .collect();
896
897 let lsh_result = lsh_index::lsh_dedup_pass(
899 &id_hashes,
900 max_hamming,
901 self.config.lsh_num_tables,
902 self.config.lsh_bits_per_table,
903 self.config.lsh_seed,
904 );
905
906 let all_ids: Vec<u64> = (0..hashes.len() as u64).collect();
908 let groups = lsh_index::group_by_lsh_pairs(&lsh_result.pairs, &all_ids);
909
910 let mut result = Vec::new();
912 for group_ids in &groups {
913 let files: Vec<String> = group_ids
914 .iter()
915 .filter_map(|&id| hashes.get(id as usize).map(|(p, _)| p.clone()))
916 .collect();
917
918 if files.len() < 2 {
919 continue;
920 }
921
922 let mut best_sim = 0.0f64;
924 for i in 0..group_ids.len() {
925 for j in (i + 1)..group_ids.len() {
926 let ia = group_ids[i] as usize;
927 let ib = group_ids[j] as usize;
928 if let (Some((_, ha)), Some((_, hb))) = (hashes.get(ia), hashes.get(ib)) {
929 let sim = ha.similarity(hb);
930 if sim > best_sim {
931 best_sim = sim;
932 }
933 }
934 }
935 }
936
937 result.push(DuplicateGroup {
938 files,
939 scores: vec![SimilarityScore {
940 method: "perceptual_hash_lsh".to_string(),
941 score: best_sim,
942 metadata: vec![
943 (
944 "lsh_candidates".to_string(),
945 lsh_result.candidates_checked.to_string(),
946 ),
947 (
948 "comparison_ratio".to_string(),
949 format!("{:.4}", lsh_result.comparison_ratio()),
950 ),
951 ],
952 }],
953 });
954 }
955
956 Ok(result)
957 }
958
959 async fn find_ssim_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
968 let threshold = self.config.ssim_threshold;
969 let res = self.config.thumbnail_resolution.max(4);
970 let expected_bytes = res * res;
971
972 let stored = self
974 .database
975 .get_all_fingerprints_by_type("thumbnail")
976 .await?;
977
978 let mut images: Vec<(String, visual::Image)> = Vec::new();
980 for (path, hex) in stored {
981 let bytes = decode_hex_bytes(&hex)?;
982 if bytes.len() == expected_bytes {
984 if let Ok(img) = visual::Image::from_data(res, res, 1, bytes) {
985 images.push((path, img));
986 }
987 }
988 }
989
990 if images.len() < 2 {
991 return Ok(Vec::new());
992 }
993
994 let ssim_params = visual::SsimParams::default();
995 let mut groups: Vec<DuplicateGroup> = Vec::new();
996 let mut assigned = vec![false; images.len()];
997
998 for i in 0..images.len() {
999 if assigned[i] {
1000 continue;
1001 }
1002 let mut group_files = vec![images[i].0.clone()];
1003 let mut best_score = 0.0f64;
1004
1005 for j in (i + 1)..images.len() {
1006 if assigned[j] {
1007 continue;
1008 }
1009 let ssim = visual::compute_ssim(&images[i].1, &images[j].1, &ssim_params);
1010 if ssim >= threshold {
1011 group_files.push(images[j].0.clone());
1012 assigned[j] = true;
1013 if ssim > best_score {
1014 best_score = ssim;
1015 }
1016 }
1017 }
1018
1019 if group_files.len() > 1 {
1020 assigned[i] = true;
1021 groups.push(DuplicateGroup {
1022 files: group_files,
1023 scores: vec![SimilarityScore {
1024 method: "ssim".to_string(),
1025 score: best_score,
1026 metadata: Vec::new(),
1027 }],
1028 });
1029 }
1030 }
1031
1032 Ok(groups)
1033 }
1034
1035 async fn find_histogram_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1043 let threshold = self.config.histogram_threshold;
1044
1045 let stored = self
1046 .database
1047 .get_all_fingerprints_by_type("histogram")
1048 .await?;
1049
1050 let mut histograms: Vec<(String, Vec<Vec<u32>>)> = Vec::new();
1052 for (path, json_str) in stored {
1053 if let Ok(flat) = serde_json::from_str::<Vec<u32>>(&json_str) {
1054 if flat.len() % 256 == 0 && !flat.is_empty() {
1056 let channels = flat.len() / 256;
1057 let hist: Vec<Vec<u32>> = (0..channels)
1058 .map(|c| flat[c * 256..(c + 1) * 256].to_vec())
1059 .collect();
1060 histograms.push((path, hist));
1061 }
1062 }
1063 }
1064
1065 if histograms.len() < 2 {
1066 return Ok(Vec::new());
1067 }
1068
1069 let mut groups: Vec<DuplicateGroup> = Vec::new();
1070 let mut assigned = vec![false; histograms.len()];
1071
1072 for i in 0..histograms.len() {
1073 if assigned[i] {
1074 continue;
1075 }
1076 let mut group_files = vec![histograms[i].0.clone()];
1077 let mut best_score = 0.0f64;
1078
1079 for j in (i + 1)..histograms.len() {
1080 if assigned[j] {
1081 continue;
1082 }
1083 let corr = visual::compare_histograms(&histograms[i].1, &histograms[j].1);
1084 if corr >= threshold {
1085 group_files.push(histograms[j].0.clone());
1086 assigned[j] = true;
1087 if corr > best_score {
1088 best_score = corr;
1089 }
1090 }
1091 }
1092
1093 if group_files.len() > 1 {
1094 assigned[i] = true;
1095 groups.push(DuplicateGroup {
1096 files: group_files,
1097 scores: vec![SimilarityScore {
1098 method: "histogram".to_string(),
1099 score: best_score,
1100 metadata: Vec::new(),
1101 }],
1102 });
1103 }
1104 }
1105
1106 Ok(groups)
1107 }
1108
1109 async fn find_feature_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1117 let threshold = self.config.perceptual_threshold;
1118
1119 let stored = self
1120 .database
1121 .get_all_fingerprints_by_type("feature_vector")
1122 .await?;
1123
1124 let mut vectors: Vec<(String, Vec<f64>)> = Vec::new();
1126 for (path, json_str) in stored {
1127 if let Ok(vec) = serde_json::from_str::<Vec<f64>>(&json_str) {
1128 if !vec.is_empty() {
1129 vectors.push((path, vec));
1130 }
1131 }
1132 }
1133
1134 if vectors.len() < 2 {
1135 return Ok(Vec::new());
1136 }
1137
1138 let mut groups: Vec<DuplicateGroup> = Vec::new();
1139 let mut assigned = vec![false; vectors.len()];
1140
1141 for i in 0..vectors.len() {
1142 if assigned[i] {
1143 continue;
1144 }
1145 let mut group_files = vec![vectors[i].0.clone()];
1146 let mut best_score = 0.0f64;
1147
1148 for j in (i + 1)..vectors.len() {
1149 if assigned[j] {
1150 continue;
1151 }
1152 let sim = cosine_similarity(&vectors[i].1, &vectors[j].1);
1153 if sim >= threshold {
1154 group_files.push(vectors[j].0.clone());
1155 assigned[j] = true;
1156 if sim > best_score {
1157 best_score = sim;
1158 }
1159 }
1160 }
1161
1162 if group_files.len() > 1 {
1163 assigned[i] = true;
1164 groups.push(DuplicateGroup {
1165 files: group_files,
1166 scores: vec![SimilarityScore {
1167 method: "feature_vector".to_string(),
1168 score: best_score,
1169 metadata: Vec::new(),
1170 }],
1171 });
1172 }
1173 }
1174
1175 Ok(groups)
1176 }
1177
1178 async fn find_audio_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1185 let threshold = self.config.audio_threshold;
1186
1187 let stored = self
1188 .database
1189 .get_all_fingerprints_by_type("audio_fingerprint")
1190 .await?;
1191
1192 let mut fingerprints: Vec<(String, audio::AudioFingerprint)> = Vec::new();
1194 for (path, hex) in stored {
1195 let bytes = decode_hex_bytes(&hex)?;
1196 if !bytes.is_empty() {
1197 fingerprints.push((path, audio::AudioFingerprint::new(bytes, 11025, 0.0)));
1198 }
1199 }
1200
1201 if fingerprints.len() < 2 {
1202 return Ok(Vec::new());
1203 }
1204
1205 let mut groups: Vec<DuplicateGroup> = Vec::new();
1206 let mut assigned = vec![false; fingerprints.len()];
1207
1208 for i in 0..fingerprints.len() {
1209 if assigned[i] {
1210 continue;
1211 }
1212 let mut group_files = vec![fingerprints[i].0.clone()];
1213 let mut best_score = 0.0f64;
1214
1215 for j in (i + 1)..fingerprints.len() {
1216 if assigned[j] {
1217 continue;
1218 }
1219 let sim = fingerprints[i].1.similarity(&fingerprints[j].1);
1220 if sim >= threshold {
1221 group_files.push(fingerprints[j].0.clone());
1222 assigned[j] = true;
1223 if sim > best_score {
1224 best_score = sim;
1225 }
1226 }
1227 }
1228
1229 if group_files.len() > 1 {
1230 assigned[i] = true;
1231 groups.push(DuplicateGroup {
1232 files: group_files,
1233 scores: vec![SimilarityScore {
1234 method: "audio_fingerprint".to_string(),
1235 score: best_score,
1236 metadata: Vec::new(),
1237 }],
1238 });
1239 }
1240 }
1241
1242 Ok(groups)
1243 }
1244
1245 async fn find_metadata_duplicates(&self) -> DedupResult<Vec<DuplicateGroup>> {
1258 use metadata::{compare_metadata, MediaMetadata};
1259 use std::path::PathBuf;
1260
1261 let threshold = self.config.metadata_threshold;
1262
1263 let rows = self.database.get_all_files_with_metadata().await?;
1264
1265 if rows.len() < 2 {
1266 return Ok(Vec::new());
1267 }
1268
1269 let media_meta: Vec<MediaMetadata> = rows
1271 .iter()
1272 .map(
1273 |(path, duration, width, height, video_codec, audio_codec, container)| {
1274 let fs_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
1275 let mut m = MediaMetadata::new(PathBuf::from(path), fs_size);
1276 m.duration = *duration;
1277 m.width = width.map(|v| v as u32);
1278 m.height = height.map(|v| v as u32);
1279 m.video_codec = video_codec.clone();
1280 m.audio_codec = audio_codec.clone();
1281 m.container = container.clone();
1282 m
1283 },
1284 )
1285 .collect();
1286
1287 let paths: Vec<String> = rows.iter().map(|(p, ..)| p.clone()).collect();
1288
1289 let mut groups: Vec<DuplicateGroup> = Vec::new();
1290 let mut assigned = vec![false; media_meta.len()];
1291
1292 for i in 0..media_meta.len() {
1293 if assigned[i] {
1294 continue;
1295 }
1296 let mut group_files = vec![paths[i].clone()];
1297 let mut best_score = 0.0f64;
1298 let mut best_duration_diff: Option<f64> = None;
1299
1300 for j in (i + 1)..media_meta.len() {
1301 if assigned[j] {
1302 continue;
1303 }
1304
1305 let duration_ok = match (media_meta[i].duration, media_meta[j].duration) {
1308 (Some(d1), Some(d2)) => (d1 - d2).abs() <= 1.0,
1309 _ => true, };
1311 if !duration_ok {
1312 continue;
1313 }
1314
1315 let sim = compare_metadata(&media_meta[i], &media_meta[j]);
1316 let score = sim.overall_score();
1317 if score >= threshold {
1318 group_files.push(paths[j].clone());
1319 assigned[j] = true;
1320 if score > best_score {
1321 best_score = score;
1322 best_duration_diff = match (media_meta[i].duration, media_meta[j].duration)
1323 {
1324 (Some(d1), Some(d2)) => Some((d1 - d2).abs()),
1325 _ => None,
1326 };
1327 }
1328 }
1329 }
1330
1331 if group_files.len() > 1 {
1332 assigned[i] = true;
1333 let mut score_entry = SimilarityScore {
1334 method: "metadata".to_string(),
1335 score: best_score,
1336 metadata: Vec::new(),
1337 };
1338 if let Some(diff) = best_duration_diff {
1339 score_entry
1340 .metadata
1341 .push(("duration_diff_secs".to_string(), format!("{diff:.3}")));
1342 }
1343 groups.push(DuplicateGroup {
1344 files: group_files,
1345 scores: vec![score_entry],
1346 });
1347 }
1348 }
1349
1350 Ok(groups)
1351 }
1352
1353 pub async fn get_stats(&self) -> DedupResult<DedupStats> {
1359 let total_files = self.database.count_files().await?;
1360 let total_hashes = self.database.count_unique_hashes().await?;
1361
1362 Ok(DedupStats {
1363 total_files,
1364 total_hashes,
1365 duplicate_files: total_files.saturating_sub(total_hashes),
1366 })
1367 }
1368
1369 pub async fn close(self) -> DedupResult<()> {
1371 self.database.close().await?;
1372 Ok(())
1373 }
1374
1375 #[must_use]
1387 pub fn might_be_duplicate(&self, hash_bytes: &[u8]) -> bool {
1388 match &self.bloom {
1389 Some(bloom) => bloom.contains(hash_bytes),
1390 None => true,
1391 }
1392 }
1393
1394 pub fn reset_bloom(&mut self) {
1400 if let Some(ref mut bloom) = self.bloom {
1401 bloom.clear();
1402 }
1403 }
1404}
1405
/// Summary counters describing the indexed files.
#[derive(Debug, Clone)]
pub struct DedupStats {
    /// Number of files recorded in the database.
    pub total_files: usize,

    /// Number of distinct file hashes recorded in the database.
    pub total_hashes: usize,

    /// Number of files in excess of the distinct hash count
    /// (`total_files - total_hashes`, saturating at zero).
    pub duplicate_files: usize,
}
1418
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `<temp dir>/<prefix><nanos>.db`, where `nanos` is the
    /// sub-second nanosecond part of the current time.
    #[cfg(feature = "sqlite")]
    fn scratch_db_path(prefix: &str) -> PathBuf {
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .subsec_nanos();
        std::env::temp_dir().join(format!("{prefix}{nanos}.db"))
    }

    /// Default configuration pointed at `db_path` with a small Bloom filter
    /// enabled.
    #[cfg(feature = "sqlite")]
    fn bloom_enabled_config(db_path: PathBuf) -> DedupConfig {
        DedupConfig {
            database_path: db_path,
            bloom_prescreen: true,
            bloom_capacity: 1000,
            bloom_fpr: 0.01,
            ..DedupConfig::default()
        }
    }

    /// Spot-checks which detectors the single and composite strategies
    /// enable.
    #[test]
    fn test_detection_strategy() {
        let exact = DetectionStrategy::ExactHash;
        assert!(exact.includes_hash());
        assert!(!exact.includes_perceptual());

        let all = DetectionStrategy::All;
        assert!(all.includes_hash());
        assert!(all.includes_perceptual());
        assert!(all.includes_audio());

        let fast = DetectionStrategy::Fast;
        assert!(fast.includes_hash());
        assert!(fast.includes_perceptual());
        assert!(!fast.includes_ssim());
    }

    /// The default thresholds and the parallel flag have their documented
    /// values.
    #[test]
    fn test_config_default() {
        let defaults = DedupConfig::default();
        assert_eq!(defaults.perceptual_threshold, 0.95);
        assert_eq!(defaults.ssim_threshold, 0.90);
        assert!(defaults.parallel);
    }

    /// LSH is on by default with 8 tables, 8 bits per table and seed 42.
    #[test]
    fn test_config_lsh_defaults() {
        let defaults = DedupConfig::default();
        assert!(defaults.use_lsh);
        assert_eq!(defaults.lsh_num_tables, 8);
        assert_eq!(defaults.lsh_bits_per_table, 8);
        assert_eq!(defaults.lsh_seed, 42);
    }

    /// Bloom pre-screening is off by default; capacity and rate still have
    /// defaults.
    #[test]
    fn test_config_bloom_defaults() {
        let defaults = DedupConfig::default();
        assert!(!defaults.bloom_prescreen);
        assert_eq!(defaults.bloom_capacity, 10_000);
        assert!((defaults.bloom_fpr - 0.01f32).abs() < f32::EPSILON);
    }

    /// Indexing an empty slice succeeds and reports no errors.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_empty_slice() {
        let db_path = scratch_db_path("oxidedup_test_par_");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        // The body is skipped when the database cannot be opened.
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let no_paths: &[PathBuf] = &[];
            let errors = detector
                .par_index_files(no_paths)
                .await
                .expect("par_index_files should succeed on empty input");
            assert!(errors.is_empty(), "No errors expected for empty input");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// Missing files yield one error string each while the call itself
    /// succeeds.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_par_index_files_nonexistent_paths() {
        let db_path = scratch_db_path("oxidedup_test_par_ne_");
        let config = DedupConfig {
            database_path: db_path.clone(),
            ..DedupConfig::default()
        };
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            let missing = vec![
                PathBuf::from("/nonexistent/path/a.mp4"),
                PathBuf::from("/nonexistent/path/b.mp4"),
            ];
            let errors = detector
                .par_index_files(&missing)
                .await
                .expect("par_index_files should return Ok even when files are missing");
            assert_eq!(errors.len(), 2, "Should have one error per missing file");
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// Without a Bloom filter every hash is reported as a possible
    /// duplicate.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_no_bloom_always_true() {
        let db_path = scratch_db_path("oxidedup_bloom_noscreen_");
        let config = DedupConfig {
            database_path: db_path.clone(),
            bloom_prescreen: false,
            ..DedupConfig::default()
        };
        if let Ok(detector) = DuplicateDetector::new(config).await {
            assert!(
                detector.might_be_duplicate(b"some_hash_bytes"),
                "Should always return true when bloom is disabled"
            );
            assert!(
                detector.might_be_duplicate(b""),
                "Empty bytes: should return true without bloom"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// A fresh Bloom filter does not report a hash that was never inserted.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_might_be_duplicate_with_bloom_unknown_hash() {
        let db_path = scratch_db_path("oxidedup_bloom_unknown_");
        let config = bloom_enabled_config(db_path.clone());
        if let Ok(detector) = DuplicateDetector::new(config).await {
            assert!(
                !detector.might_be_duplicate(b"never_inserted_hash"),
                "Unknown hash should return false from a fresh bloom filter"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }

    /// `reset_bloom` forgets previously inserted hashes.
    #[tokio::test]
    #[cfg(feature = "sqlite")]
    async fn test_reset_bloom_clears_state() {
        let db_path = scratch_db_path("oxidedup_bloom_reset_");
        let config = bloom_enabled_config(db_path.clone());
        if let Ok(mut detector) = DuplicateDetector::new(config).await {
            if let Some(filter) = detector.bloom.as_mut() {
                filter.insert(b"known_hash");
            }
            assert!(
                detector.might_be_duplicate(b"known_hash"),
                "After insert, bloom should report potential duplicate"
            );
            detector.reset_bloom();
            assert!(
                !detector.might_be_duplicate(b"known_hash"),
                "After reset_bloom, hash should not be found"
            );
            let _ = detector.close().await;
        }
        let _ = std::fs::remove_file(&db_path);
    }
}