1#![allow(dead_code)]
16#![allow(clippy::cast_precision_loss)]
17
18use std::collections::HashMap;
19
/// Action suggested for the files of a duplicate group.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecommendedAction {
    /// Delete every duplicate, keeping only the representative file.
    DeleteDuplicates,
    /// Replace each duplicate with a symbolic link to the representative.
    SymlinkDuplicates,
    /// Replace each duplicate with a hard link to the representative.
    HardlinkDuplicates,
    /// Move the duplicates into an archive directory for manual review.
    ArchiveDuplicates,
    /// Similarity confidence is too low for an automated action.
    ManualReview,
    /// The group has a single member, so nothing needs to be done.
    NoAction,
}

impl RecommendedAction {
    /// Returns the short identifier of this action (for example `"delete"`).
    #[must_use]
    pub fn label(&self) -> &'static str {
        self.texts().0
    }

    /// Returns a one-sentence, human-readable explanation of this action.
    #[must_use]
    pub fn description(&self) -> &'static str {
        self.texts().1
    }

    /// Single lookup backing both accessors: `(label, description)` per variant.
    fn texts(&self) -> (&'static str, &'static str) {
        match *self {
            Self::NoAction => (
                "no_action",
                "Single-member group; no action required.",
            ),
            Self::ManualReview => (
                "manual_review",
                "Similarity confidence is insufficient for automated action; review manually.",
            ),
            Self::ArchiveDuplicates => (
                "archive",
                "Move duplicates to an archive directory for manual review.",
            ),
            Self::HardlinkDuplicates => (
                "hardlink",
                "Replace duplicate files with hard links to the representative.",
            ),
            Self::SymlinkDuplicates => (
                "symlink",
                "Replace duplicate files with symbolic links to the representative.",
            ),
            Self::DeleteDuplicates => (
                "delete",
                "Delete all duplicate files, keeping only the representative.",
            ),
        }
    }
}
76
/// Coarse confidence bucket derived from a similarity score.
///
/// Variants are declared from weakest to strongest so that the derived
/// ordering satisfies `Low < Medium < High < Exact`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ConfidenceTier {
    /// Score below `0.75` (also the result for a NaN score).
    Low,
    /// Score of at least `0.75` but below `0.90`.
    Medium,
    /// Score of at least `0.90` but below `0.98`.
    High,
    /// Score of at least `0.98`.
    Exact,
}

impl ConfidenceTier {
    /// Maps a similarity `score` to its tier.
    ///
    /// The first threshold the score reaches (checked from highest to lowest)
    /// decides the tier; a score that reaches none of them, including NaN,
    /// yields [`ConfidenceTier::Low`].
    #[must_use]
    pub fn from_score(score: f64) -> Self {
        // Inclusive lower bounds, ordered from the strictest tier downwards.
        const FLOORS: [(f64, ConfidenceTier); 3] = [
            (0.98, ConfidenceTier::Exact),
            (0.90, ConfidenceTier::High),
            (0.75, ConfidenceTier::Medium),
        ];
        FLOORS
            .iter()
            .find(|floor| score >= floor.0)
            .map_or(ConfidenceTier::Low, |floor| floor.1)
    }

    /// Returns the lowercase name of the tier.
    #[must_use]
    pub fn label(self) -> &'static str {
        match self {
            Self::Low => "low",
            Self::Medium => "medium",
            Self::High => "high",
            Self::Exact => "exact",
        }
    }
}
124
/// One file belonging to a duplicate group.
#[derive(Debug, Clone)]
pub struct FileEntry {
    /// Path of the file, stored as given by the caller.
    pub path: String,
    /// Size of the file in bytes.
    pub size_bytes: u64,
    /// Whether this file is the one kept as the group's representative.
    pub is_representative: bool,
    /// Similarity score of this file relative to the representative.
    pub similarity_to_rep: f64,
}

impl FileEntry {
    /// Creates an entry that is not (yet) marked as the representative.
    #[must_use]
    pub fn new(path: impl Into<String>, size_bytes: u64, similarity_to_rep: f64) -> Self {
        let path: String = path.into();
        FileEntry {
            is_representative: false,
            similarity_to_rep,
            size_bytes,
            path,
        }
    }
}
155
156#[derive(Debug, Clone, Default)]
162pub struct SpaceSavingsEstimate {
163 pub reclaimable_bytes: u64,
165 pub retained_bytes: u64,
167 pub savings_ratio: f64,
169}
170
171impl SpaceSavingsEstimate {
172 #[must_use]
174 pub fn from_entries(entries: &[FileEntry]) -> Self {
175 let total_bytes: u64 = entries.iter().map(|e| e.size_bytes).sum();
176 let retained_bytes: u64 = entries
177 .iter()
178 .filter(|e| e.is_representative)
179 .map(|e| e.size_bytes)
180 .sum();
181 let reclaimable_bytes = total_bytes.saturating_sub(retained_bytes);
182 let savings_ratio = if total_bytes > 0 {
183 reclaimable_bytes as f64 / total_bytes as f64
184 } else {
185 0.0
186 };
187 Self {
188 reclaimable_bytes,
189 retained_bytes,
190 savings_ratio,
191 }
192 }
193
194 pub fn merge(&mut self, other: &Self) {
196 self.reclaimable_bytes += other.reclaimable_bytes;
197 self.retained_bytes += other.retained_bytes;
198 let total = self.reclaimable_bytes + self.retained_bytes;
199 self.savings_ratio = if total > 0 {
200 self.reclaimable_bytes as f64 / total as f64
201 } else {
202 0.0
203 };
204 }
205
206 #[must_use]
208 pub fn description(&self) -> String {
209 format!(
210 "{} bytes reclaimable / {} bytes retained ({:.1}% savings)",
211 self.reclaimable_bytes,
212 self.retained_bytes,
213 self.savings_ratio * 100.0,
214 )
215 }
216}
217
218#[derive(Debug, Clone)]
225pub struct DetailedDuplicateGroup {
226 pub id: usize,
228 pub method: String,
230 pub files: Vec<FileEntry>,
232 pub mean_similarity: f64,
234 pub confidence_tier: ConfidenceTier,
236 pub action: RecommendedAction,
238 pub space_savings: SpaceSavingsEstimate,
240 pub metadata: HashMap<String, String>,
242}
243
244impl DetailedDuplicateGroup {
245 #[must_use]
247 pub fn new(id: usize, method: impl Into<String>, mean_similarity: f64) -> Self {
248 let confidence_tier = ConfidenceTier::from_score(mean_similarity);
249 Self {
250 id,
251 method: method.into(),
252 files: Vec::new(),
253 mean_similarity,
254 confidence_tier,
255 action: RecommendedAction::NoAction,
256 space_savings: SpaceSavingsEstimate::default(),
257 metadata: HashMap::new(),
258 }
259 }
260
261 pub fn add_file(&mut self, entry: FileEntry) {
263 self.files.push(entry);
264 }
265
266 #[must_use]
268 pub fn size(&self) -> usize {
269 self.files.len()
270 }
271
272 #[must_use]
274 pub fn is_duplicate(&self) -> bool {
275 self.files.len() >= 2
276 }
277
278 pub fn compute_space_savings(&mut self) {
280 self.space_savings = SpaceSavingsEstimate::from_entries(&self.files);
281 }
282
283 pub fn select_largest_representative(&mut self) {
286 if self.files.is_empty() {
287 return;
288 }
289 for f in &mut self.files {
291 f.is_representative = false;
292 }
293 let best_idx = self
294 .files
295 .iter()
296 .enumerate()
297 .max_by_key(|(_, e)| e.size_bytes)
298 .map(|(i, _)| i)
299 .unwrap_or(0);
300 self.files[best_idx].is_representative = true;
301 }
302
303 pub fn select_highest_similarity_representative(&mut self) {
307 if self.files.is_empty() {
308 return;
309 }
310 for f in &mut self.files {
311 f.is_representative = false;
312 }
313 let best_idx = self
314 .files
315 .iter()
316 .enumerate()
317 .max_by(|(_, a), (_, b)| {
318 a.similarity_to_rep
319 .partial_cmp(&b.similarity_to_rep)
320 .unwrap_or(std::cmp::Ordering::Equal)
321 })
322 .map(|(i, _)| i)
323 .unwrap_or(0);
324 self.files[best_idx].is_representative = true;
325 }
326
327 pub fn assign_action(
333 &mut self,
334 exact_action: RecommendedAction,
335 high_action: RecommendedAction,
336 fallback_action: RecommendedAction,
337 ) {
338 if !self.is_duplicate() {
339 self.action = RecommendedAction::NoAction;
340 return;
341 }
342 self.action = match self.confidence_tier {
343 ConfidenceTier::Exact => exact_action,
344 ConfidenceTier::High => high_action,
345 _ => fallback_action,
346 };
347 }
348
349 pub fn set_metadata(&mut self, key: impl Into<String>, value: impl Into<String>) {
351 self.metadata.insert(key.into(), value.into());
352 }
353
354 #[must_use]
356 pub fn representative_path(&self) -> Option<&str> {
357 self.files
358 .iter()
359 .find(|e| e.is_representative)
360 .map(|e| e.path.as_str())
361 }
362}
363
364#[derive(Debug, Clone)]
370pub struct DetailedReport {
371 pub groups: Vec<DetailedDuplicateGroup>,
373 pub total_space_savings: SpaceSavingsEstimate,
375 pub total_files_examined: usize,
377 pub tier_counts: HashMap<String, usize>,
379 pub action_counts: HashMap<String, usize>,
381}
382
383impl DetailedReport {
384 #[must_use]
386 pub fn group_count(&self) -> usize {
387 self.groups.len()
388 }
389
390 #[must_use]
392 pub fn duplicate_file_count(&self) -> usize {
393 self.groups.iter().map(|g| g.size()).sum()
394 }
395
396 #[must_use]
398 pub fn summary(&self) -> String {
399 format!(
400 "DetailedReport: {} groups | {} duplicate files | {} files examined\n\
401 Space: {}\n\
402 Tiers: {:?}\n\
403 Actions: {:?}",
404 self.group_count(),
405 self.duplicate_file_count(),
406 self.total_files_examined,
407 self.total_space_savings.description(),
408 self.tier_counts,
409 self.action_counts,
410 )
411 }
412}
413
414#[derive(Debug, Default)]
434pub struct DetailedReportBuilder {
435 groups: Vec<DetailedDuplicateGroup>,
436 total_files_examined: usize,
437}
438
439impl DetailedReportBuilder {
440 #[must_use]
442 pub fn new() -> Self {
443 Self::default()
444 }
445
446 #[must_use]
448 pub fn total_files_examined(mut self, n: usize) -> Self {
449 self.total_files_examined = n;
450 self
451 }
452
453 #[must_use]
455 pub fn add_group(mut self, group: DetailedDuplicateGroup) -> Self {
456 self.groups.push(group);
457 self
458 }
459
460 #[must_use]
465 pub fn add_simple_group(
466 mut self,
467 id: usize,
468 method: impl Into<String>,
469 mean_similarity: f64,
470 files: Vec<(String, u64)>,
471 ) -> Self {
472 let mut group = DetailedDuplicateGroup::new(id, method, mean_similarity);
473 for (path, size) in files {
474 group.add_file(FileEntry::new(path, size, mean_similarity));
475 }
476 group.select_largest_representative();
477 group.assign_action(
478 RecommendedAction::DeleteDuplicates,
479 RecommendedAction::HardlinkDuplicates,
480 RecommendedAction::ManualReview,
481 );
482 group.compute_space_savings();
483 self.groups.push(group);
484 self
485 }
486
487 #[must_use]
489 pub fn build(self) -> DetailedReport {
490 let mut total_space_savings = SpaceSavingsEstimate::default();
491 let mut tier_counts: HashMap<String, usize> = HashMap::new();
492 let mut action_counts: HashMap<String, usize> = HashMap::new();
493
494 for group in &self.groups {
495 total_space_savings.merge(&group.space_savings);
496 *tier_counts
497 .entry(group.confidence_tier.label().to_string())
498 .or_insert(0) += 1;
499 *action_counts
500 .entry(group.action.label().to_string())
501 .or_insert(0) += 1;
502 }
503
504 DetailedReport {
505 groups: self.groups,
506 total_space_savings,
507 total_files_examined: self.total_files_examined,
508 tier_counts,
509 action_counts,
510 }
511 }
512}
513
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a non-representative [`FileEntry`].
    fn entry(path: &str, size: u64, sim: f64) -> FileEntry {
        FileEntry::new(path, size, sim)
    }

    /// Builds a group from `(path, size, similarity)` triples, applies the
    /// delete / hardlink / manual-review policy and returns the chosen action.
    fn action_for(method: &str, mean: f64, members: &[(&str, u64, f64)]) -> RecommendedAction {
        let mut group = DetailedDuplicateGroup::new(0, method, mean);
        for &(path, size, sim) in members {
            group.add_file(entry(path, size, sim));
        }
        group.assign_action(
            RecommendedAction::DeleteDuplicates,
            RecommendedAction::HardlinkDuplicates,
            RecommendedAction::ManualReview,
        );
        group.action
    }

    /// Two `(path, size)` pairs in the shape `add_simple_group` expects.
    fn pair(first: (&str, u64), second: (&str, u64)) -> Vec<(String, u64)> {
        vec![
            (first.0.to_string(), first.1),
            (second.0.to_string(), second.1),
        ]
    }

    /// Count stored under `key`, or zero when the key is absent.
    fn count_of(counts: &std::collections::HashMap<String, usize>, key: &str) -> usize {
        counts.get(key).copied().unwrap_or(0)
    }

    #[test]
    fn test_confidence_tier_from_score() {
        let cases = [
            (1.0, ConfidenceTier::Exact),
            (0.98, ConfidenceTier::Exact),
            (0.95, ConfidenceTier::High),
            (0.90, ConfidenceTier::High),
            (0.80, ConfidenceTier::Medium),
            (0.75, ConfidenceTier::Medium),
            (0.50, ConfidenceTier::Low),
            (0.0, ConfidenceTier::Low),
        ];
        for (score, expected) in cases.iter() {
            assert_eq!(ConfidenceTier::from_score(*score), *expected);
        }
    }

    #[test]
    fn test_confidence_tier_ordering() {
        assert!(ConfidenceTier::Low < ConfidenceTier::Exact);
        assert!(ConfidenceTier::Medium < ConfidenceTier::High);
    }

    #[test]
    fn test_recommended_action_labels() {
        let expected = [
            (RecommendedAction::DeleteDuplicates, "delete"),
            (RecommendedAction::SymlinkDuplicates, "symlink"),
            (RecommendedAction::HardlinkDuplicates, "hardlink"),
            (RecommendedAction::ArchiveDuplicates, "archive"),
            (RecommendedAction::ManualReview, "manual_review"),
            (RecommendedAction::NoAction, "no_action"),
        ];
        for (action, label) in expected.iter() {
            assert_eq!(action.label(), *label);
        }
    }

    #[test]
    fn test_space_savings_from_entries() {
        let kept = FileEntry {
            is_representative: true,
            ..entry("a.mp4", 1000, 1.0)
        };
        let files = [kept, entry("b.mp4", 800, 0.95)];
        let est = SpaceSavingsEstimate::from_entries(&files);
        assert_eq!(est.retained_bytes, 1000);
        assert_eq!(est.reclaimable_bytes, 800);
        let expected_ratio = 800.0 / 1800.0;
        assert!((est.savings_ratio - expected_ratio).abs() < 1e-9);
    }

    #[test]
    fn test_space_savings_empty() {
        let no_files: [FileEntry; 0] = [];
        let est = SpaceSavingsEstimate::from_entries(&no_files);
        assert_eq!(est.reclaimable_bytes, 0);
        assert_eq!(est.savings_ratio, 0.0);
    }

    #[test]
    fn test_space_savings_merge() {
        let mut merged = SpaceSavingsEstimate {
            reclaimable_bytes: 500,
            retained_bytes: 1000,
            savings_ratio: 0.333,
        };
        merged.merge(&SpaceSavingsEstimate {
            reclaimable_bytes: 300,
            retained_bytes: 700,
            savings_ratio: 0.3,
        });
        assert_eq!(merged.reclaimable_bytes, 800);
        assert_eq!(merged.retained_bytes, 1700);
        assert!((merged.savings_ratio - 800.0 / 2500.0).abs() < 1e-9);
    }

    #[test]
    fn test_group_select_largest_representative() {
        let mut group = DetailedDuplicateGroup::new(0, "phash", 0.95);
        let members = [("small.mp4", 100), ("large.mp4", 9000), ("medium.mp4", 500)];
        for &(path, size) in members.iter() {
            group.add_file(entry(path, size, 0.95));
        }
        group.select_largest_representative();
        assert_eq!(group.representative_path(), Some("large.mp4"));
    }

    #[test]
    fn test_group_assign_action_exact() {
        let action = action_for("hash", 0.999, &[("a.mp4", 100, 1.0), ("b.mp4", 100, 1.0)]);
        assert_eq!(action, RecommendedAction::DeleteDuplicates);
    }

    #[test]
    fn test_group_assign_action_low_confidence() {
        let action = action_for("ssim", 0.65, &[("a.mp4", 100, 0.65), ("b.mp4", 100, 0.65)]);
        assert_eq!(action, RecommendedAction::ManualReview);
    }

    #[test]
    fn test_group_single_member_no_action() {
        let action = action_for("phash", 1.0, &[("only.mp4", 500, 1.0)]);
        assert_eq!(action, RecommendedAction::NoAction);
    }

    #[test]
    fn test_group_metadata() {
        let mut group = DetailedDuplicateGroup::new(0, "phash", 0.95);
        for &(key, value) in [("codec", "h264"), ("resolution", "1920x1080")].iter() {
            group.set_metadata(key, value);
        }
        let codec = group.metadata.get("codec").map(String::as_str);
        assert_eq!(codec, Some("h264"));
        assert_eq!(group.metadata.len(), 2);
    }

    #[test]
    fn test_report_builder_empty() {
        let builder = DetailedReportBuilder::new().total_files_examined(50);
        let report = builder.build();
        assert_eq!(report.total_files_examined, 50);
        assert!(report.groups.is_empty());
        assert_eq!(report.group_count(), 0);
        assert_eq!(report.duplicate_file_count(), 0);
    }

    #[test]
    fn test_report_builder_with_groups() {
        let report = DetailedReportBuilder::new()
            .total_files_examined(200)
            .add_simple_group(0, "phash", 0.96, pair(("a.mp4", 2000), ("b.mp4", 1500)))
            .add_simple_group(1, "ssim", 0.82, pair(("c.mp4", 1000), ("d.mp4", 900)))
            .build();

        assert_eq!(report.group_count(), 2);
        assert_eq!(report.duplicate_file_count(), 4);
        assert!(report.total_space_savings.reclaimable_bytes > 0);
        assert!(!report.summary().is_empty());
    }

    #[test]
    fn test_report_tier_and_action_counts() {
        let report = DetailedReportBuilder::new()
            .add_simple_group(0, "phash", 0.999, pair(("a.mp4", 1000), ("b.mp4", 800)))
            .add_simple_group(1, "ssim", 0.60, pair(("c.mp4", 500), ("d.mp4", 400)))
            .build();

        assert_eq!(count_of(&report.tier_counts, "exact"), 1);
        assert_eq!(count_of(&report.tier_counts, "low"), 1);
        assert_eq!(count_of(&report.action_counts, "delete"), 1);
        assert_eq!(count_of(&report.action_counts, "manual_review"), 1);
    }
}