Skip to main content

oximedia_dedup/
dedup_report_detailed.rs

1//! Detailed deduplication reporting with disk-space savings, confidence
2//! scores, and action recommendations.
3//!
4//! This module extends the base deduplication machinery with:
5//!
6//! - [`DetailedDuplicateGroup`]: Duplicate group with per-file metadata,
7//!   size-savings estimates, confidence scores, and recommended action.
8//! - [`RecommendedAction`]: Enum of actions the user or automation can take.
9//! - [`SpaceSavingsEstimate`]: Byte-level space savings broken down by
10//!   duplicate tier.
11//! - [`DetailedReport`]: Aggregate report over all groups.
12//! - [`DetailedReportBuilder`]: Fluent builder for assembling a report
13//!   incrementally.
14
15#![allow(dead_code)]
16#![allow(clippy::cast_precision_loss)]
17
18use std::collections::HashMap;
19
20// ---------------------------------------------------------------------------
21// RecommendedAction
22// ---------------------------------------------------------------------------
23
/// Action suggested for a group of duplicate files.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecommendedAction {
    /// Remove every file other than the best representative.
    DeleteDuplicates,
    /// Swap each duplicate for a symbolic link pointing at the representative.
    SymlinkDuplicates,
    /// Swap each duplicate for a hard link to the representative.
    HardlinkDuplicates,
    /// Relocate duplicates into an archive directory so they can be reviewed
    /// by hand.
    ArchiveDuplicates,
    /// Confidence is too low to act automatically; ask for a manual review.
    ManualReview,
    /// The group holds a single file, so there is nothing to do.
    NoAction,
}

impl RecommendedAction {
    /// Short, machine-readable label identifying this action
    /// (for example `"delete"` or `"manual_review"`).
    #[must_use]
    pub fn label(&self) -> &'static str {
        match *self {
            Self::NoAction => "no_action",
            Self::ManualReview => "manual_review",
            Self::ArchiveDuplicates => "archive",
            Self::HardlinkDuplicates => "hardlink",
            Self::SymlinkDuplicates => "symlink",
            Self::DeleteDuplicates => "delete",
        }
    }

    /// One-sentence, human-readable explanation of this action.
    #[must_use]
    pub fn description(&self) -> &'static str {
        match *self {
            Self::NoAction => "Single-member group; no action required.",
            Self::ManualReview => {
                "Similarity confidence is insufficient for automated action; review manually."
            }
            Self::ArchiveDuplicates => "Move duplicates to an archive directory for manual review.",
            Self::HardlinkDuplicates => {
                "Replace duplicate files with hard links to the representative."
            }
            Self::SymlinkDuplicates => {
                "Replace duplicate files with symbolic links to the representative."
            }
            Self::DeleteDuplicates => {
                "Delete all duplicate files, keeping only the representative."
            }
        }
    }
}
76
77// ---------------------------------------------------------------------------
78// ConfidenceTier
79// ---------------------------------------------------------------------------
80
/// Bucketed confidence level derived from a similarity score.
///
/// Variants are declared weakest-first, so the derived ordering yields
/// `Low < Medium < High < Exact`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ConfidenceTier {
    /// Score below 0.75: possible duplicates; manual review recommended.
    Low,
    /// Score in `[0.75, 0.90)`: probable duplicates.
    Medium,
    /// Score in `[0.90, 0.98)`: near-identical duplicates.
    High,
    /// Score of 0.98 or more: likely bitwise identical after normalization.
    Exact,
}

impl ConfidenceTier {
    /// Map a similarity score onto its tier.
    ///
    /// Thresholds are checked from the highest down. A NaN score fails every
    /// `>=` comparison and therefore lands in [`ConfidenceTier::Low`].
    #[must_use]
    pub fn from_score(score: f64) -> Self {
        match score {
            s if s >= 0.98 => Self::Exact,
            s if s >= 0.90 => Self::High,
            s if s >= 0.75 => Self::Medium,
            _ => Self::Low,
        }
    }

    /// Short lowercase label for this tier.
    #[must_use]
    pub fn label(self) -> &'static str {
        match self {
            Self::Low => "low",
            Self::Medium => "medium",
            Self::High => "high",
            Self::Exact => "exact",
        }
    }
}
124
125// ---------------------------------------------------------------------------
126// FileEntry
127// ---------------------------------------------------------------------------
128
/// Per-file metadata for one member of a duplicate group.
#[derive(Debug, Clone)]
pub struct FileEntry {
    /// Path (or other identifier) of the file.
    pub path: String,
    /// Size of the file in bytes.
    pub size_bytes: u64,
    /// `true` when this file is the group's chosen representative.
    pub is_representative: bool,
    /// Similarity score relative to the representative (1.0 for the
    /// representative itself).
    pub similarity_to_rep: f64,
}

impl FileEntry {
    /// Build an entry from its path, size and similarity score.
    ///
    /// The entry starts out with `is_representative` set to `false`.
    #[must_use]
    pub fn new(path: impl Into<String>, size_bytes: u64, similarity_to_rep: f64) -> Self {
        let path: String = path.into();
        Self {
            similarity_to_rep,
            is_representative: false,
            size_bytes,
            path,
        }
    }
}
155
156// ---------------------------------------------------------------------------
157// SpaceSavingsEstimate
158// ---------------------------------------------------------------------------
159
160/// Space savings estimate for a single duplicate group or a whole report.
161#[derive(Debug, Clone, Default)]
162pub struct SpaceSavingsEstimate {
163    /// Bytes that would be freed by deleting non-representative files.
164    pub reclaimable_bytes: u64,
165    /// Bytes occupied by the representative files (lower bound of total kept).
166    pub retained_bytes: u64,
167    /// Fraction of total group storage that is reclaimable (0.0 – 1.0).
168    pub savings_ratio: f64,
169}
170
171impl SpaceSavingsEstimate {
172    /// Compute estimates from a slice of [`FileEntry`]s.
173    #[must_use]
174    pub fn from_entries(entries: &[FileEntry]) -> Self {
175        let total_bytes: u64 = entries.iter().map(|e| e.size_bytes).sum();
176        let retained_bytes: u64 = entries
177            .iter()
178            .filter(|e| e.is_representative)
179            .map(|e| e.size_bytes)
180            .sum();
181        let reclaimable_bytes = total_bytes.saturating_sub(retained_bytes);
182        let savings_ratio = if total_bytes > 0 {
183            reclaimable_bytes as f64 / total_bytes as f64
184        } else {
185            0.0
186        };
187        Self {
188            reclaimable_bytes,
189            retained_bytes,
190            savings_ratio,
191        }
192    }
193
194    /// Merge another estimate into this one (accumulate totals).
195    pub fn merge(&mut self, other: &Self) {
196        self.reclaimable_bytes += other.reclaimable_bytes;
197        self.retained_bytes += other.retained_bytes;
198        let total = self.reclaimable_bytes + self.retained_bytes;
199        self.savings_ratio = if total > 0 {
200            self.reclaimable_bytes as f64 / total as f64
201        } else {
202            0.0
203        };
204    }
205
206    /// Return a human-readable string.
207    #[must_use]
208    pub fn description(&self) -> String {
209        format!(
210            "{} bytes reclaimable / {} bytes retained ({:.1}% savings)",
211            self.reclaimable_bytes,
212            self.retained_bytes,
213            self.savings_ratio * 100.0,
214        )
215    }
216}
217
218// ---------------------------------------------------------------------------
219// DetailedDuplicateGroup
220// ---------------------------------------------------------------------------
221
222/// A single group of duplicate or near-duplicate files, enriched with
223/// detailed metadata.
224#[derive(Debug, Clone)]
225pub struct DetailedDuplicateGroup {
226    /// Group identifier.
227    pub id: usize,
228    /// Detection method that found this group (e.g. "phash", "ssim").
229    pub method: String,
230    /// Member files.
231    pub files: Vec<FileEntry>,
232    /// Mean pairwise similarity within this group.
233    pub mean_similarity: f64,
234    /// Confidence tier derived from `mean_similarity`.
235    pub confidence_tier: ConfidenceTier,
236    /// Recommended action for this group.
237    pub action: RecommendedAction,
238    /// Space savings estimate for this group.
239    pub space_savings: SpaceSavingsEstimate,
240    /// Arbitrary key-value metadata (e.g. codec, container, resolution).
241    pub metadata: HashMap<String, String>,
242}
243
244impl DetailedDuplicateGroup {
245    /// Create a new group.
246    #[must_use]
247    pub fn new(id: usize, method: impl Into<String>, mean_similarity: f64) -> Self {
248        let confidence_tier = ConfidenceTier::from_score(mean_similarity);
249        Self {
250            id,
251            method: method.into(),
252            files: Vec::new(),
253            mean_similarity,
254            confidence_tier,
255            action: RecommendedAction::NoAction,
256            space_savings: SpaceSavingsEstimate::default(),
257            metadata: HashMap::new(),
258        }
259    }
260
261    /// Add a file entry to this group.
262    pub fn add_file(&mut self, entry: FileEntry) {
263        self.files.push(entry);
264    }
265
266    /// Number of files.
267    #[must_use]
268    pub fn size(&self) -> usize {
269        self.files.len()
270    }
271
272    /// True when the group has at least two members.
273    #[must_use]
274    pub fn is_duplicate(&self) -> bool {
275        self.files.len() >= 2
276    }
277
278    /// Compute and cache the space savings estimate.
279    pub fn compute_space_savings(&mut self) {
280        self.space_savings = SpaceSavingsEstimate::from_entries(&self.files);
281    }
282
283    /// Assign a representative by picking the file with the largest size
284    /// (on the assumption that higher-quality originals tend to be larger).
285    pub fn select_largest_representative(&mut self) {
286        if self.files.is_empty() {
287            return;
288        }
289        // Clear existing representative flags.
290        for f in &mut self.files {
291            f.is_representative = false;
292        }
293        let best_idx = self
294            .files
295            .iter()
296            .enumerate()
297            .max_by_key(|(_, e)| e.size_bytes)
298            .map(|(i, _)| i)
299            .unwrap_or(0);
300        self.files[best_idx].is_representative = true;
301    }
302
303    /// Assign a representative by picking the file with the highest
304    /// similarity_to_rep score (useful when the representative is
305    /// externally determined).
306    pub fn select_highest_similarity_representative(&mut self) {
307        if self.files.is_empty() {
308            return;
309        }
310        for f in &mut self.files {
311            f.is_representative = false;
312        }
313        let best_idx = self
314            .files
315            .iter()
316            .enumerate()
317            .max_by(|(_, a), (_, b)| {
318                a.similarity_to_rep
319                    .partial_cmp(&b.similarity_to_rep)
320                    .unwrap_or(std::cmp::Ordering::Equal)
321            })
322            .map(|(i, _)| i)
323            .unwrap_or(0);
324        self.files[best_idx].is_representative = true;
325    }
326
327    /// Assign the recommended action based on confidence tier and a policy.
328    ///
329    /// - `exact_action`: action when `confidence_tier == Exact`.
330    /// - `high_action`: action when `confidence_tier == High`.
331    /// - `fallback_action`: action for Medium/Low confidence.
332    pub fn assign_action(
333        &mut self,
334        exact_action: RecommendedAction,
335        high_action: RecommendedAction,
336        fallback_action: RecommendedAction,
337    ) {
338        if !self.is_duplicate() {
339            self.action = RecommendedAction::NoAction;
340            return;
341        }
342        self.action = match self.confidence_tier {
343            ConfidenceTier::Exact => exact_action,
344            ConfidenceTier::High => high_action,
345            _ => fallback_action,
346        };
347    }
348
349    /// Insert a metadata key-value pair.
350    pub fn set_metadata(&mut self, key: impl Into<String>, value: impl Into<String>) {
351        self.metadata.insert(key.into(), value.into());
352    }
353
354    /// Return the path of the representative file, if one is marked.
355    #[must_use]
356    pub fn representative_path(&self) -> Option<&str> {
357        self.files
358            .iter()
359            .find(|e| e.is_representative)
360            .map(|e| e.path.as_str())
361    }
362}
363
364// ---------------------------------------------------------------------------
365// DetailedReport
366// ---------------------------------------------------------------------------
367
368/// Aggregate detailed report over all duplicate groups found.
369#[derive(Debug, Clone)]
370pub struct DetailedReport {
371    /// All duplicate groups (size ≥ 2).
372    pub groups: Vec<DetailedDuplicateGroup>,
373    /// Aggregated space savings across all groups.
374    pub total_space_savings: SpaceSavingsEstimate,
375    /// Number of files examined in total.
376    pub total_files_examined: usize,
377    /// Breakdown of group counts by confidence tier.
378    pub tier_counts: HashMap<String, usize>,
379    /// Breakdown of group counts by recommended action.
380    pub action_counts: HashMap<String, usize>,
381}
382
383impl DetailedReport {
384    /// Number of duplicate groups.
385    #[must_use]
386    pub fn group_count(&self) -> usize {
387        self.groups.len()
388    }
389
390    /// Total number of files that are in duplicate groups.
391    #[must_use]
392    pub fn duplicate_file_count(&self) -> usize {
393        self.groups.iter().map(|g| g.size()).sum()
394    }
395
396    /// Return a multi-line human-readable summary.
397    #[must_use]
398    pub fn summary(&self) -> String {
399        format!(
400            "DetailedReport: {} groups | {} duplicate files | {} files examined\n\
401             Space: {}\n\
402             Tiers: {:?}\n\
403             Actions: {:?}",
404            self.group_count(),
405            self.duplicate_file_count(),
406            self.total_files_examined,
407            self.total_space_savings.description(),
408            self.tier_counts,
409            self.action_counts,
410        )
411    }
412}
413
414// ---------------------------------------------------------------------------
415// DetailedReportBuilder
416// ---------------------------------------------------------------------------
417
418/// Fluent builder for assembling a [`DetailedReport`].
419///
420/// # Example
421/// ```
422/// use oximedia_dedup::dedup_report_detailed::{
423///     DetailedReportBuilder, FileEntry, RecommendedAction,
424/// };
425///
426/// let report = DetailedReportBuilder::new()
427///     .total_files_examined(100)
428///     .build();
429///
430/// assert_eq!(report.total_files_examined, 100);
431/// assert!(report.groups.is_empty());
432/// ```
433#[derive(Debug, Default)]
434pub struct DetailedReportBuilder {
435    groups: Vec<DetailedDuplicateGroup>,
436    total_files_examined: usize,
437}
438
439impl DetailedReportBuilder {
440    /// Create a new builder.
441    #[must_use]
442    pub fn new() -> Self {
443        Self::default()
444    }
445
446    /// Set the total number of files examined.
447    #[must_use]
448    pub fn total_files_examined(mut self, n: usize) -> Self {
449        self.total_files_examined = n;
450        self
451    }
452
453    /// Add a pre-built [`DetailedDuplicateGroup`] to the report.
454    #[must_use]
455    pub fn add_group(mut self, group: DetailedDuplicateGroup) -> Self {
456        self.groups.push(group);
457        self
458    }
459
460    /// Convenience: add a simple group from paths and sizes.
461    ///
462    /// The group will use the largest-file-is-representative heuristic and
463    /// the default action policy (Exact→delete, High→hardlink, else review).
464    #[must_use]
465    pub fn add_simple_group(
466        mut self,
467        id: usize,
468        method: impl Into<String>,
469        mean_similarity: f64,
470        files: Vec<(String, u64)>,
471    ) -> Self {
472        let mut group = DetailedDuplicateGroup::new(id, method, mean_similarity);
473        for (path, size) in files {
474            group.add_file(FileEntry::new(path, size, mean_similarity));
475        }
476        group.select_largest_representative();
477        group.assign_action(
478            RecommendedAction::DeleteDuplicates,
479            RecommendedAction::HardlinkDuplicates,
480            RecommendedAction::ManualReview,
481        );
482        group.compute_space_savings();
483        self.groups.push(group);
484        self
485    }
486
487    /// Build the final [`DetailedReport`].
488    #[must_use]
489    pub fn build(self) -> DetailedReport {
490        let mut total_space_savings = SpaceSavingsEstimate::default();
491        let mut tier_counts: HashMap<String, usize> = HashMap::new();
492        let mut action_counts: HashMap<String, usize> = HashMap::new();
493
494        for group in &self.groups {
495            total_space_savings.merge(&group.space_savings);
496            *tier_counts
497                .entry(group.confidence_tier.label().to_string())
498                .or_insert(0) += 1;
499            *action_counts
500                .entry(group.action.label().to_string())
501                .or_insert(0) += 1;
502        }
503
504        DetailedReport {
505            groups: self.groups,
506            total_space_savings,
507            total_files_examined: self.total_files_examined,
508            tier_counts,
509            action_counts,
510        }
511    }
512}
513
514// ---------------------------------------------------------------------------
515// Tests
516// ---------------------------------------------------------------------------
517
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a non-representative [`FileEntry`].
    fn entry(path: &str, size: u64, sim: f64) -> FileEntry {
        FileEntry::new(path, size, sim)
    }

    /// Apply the delete / hardlink / manual-review policy used by these tests.
    fn apply_default_policy(group: &mut DetailedDuplicateGroup) {
        group.assign_action(
            RecommendedAction::DeleteDuplicates,
            RecommendedAction::HardlinkDuplicates,
            RecommendedAction::ManualReview,
        );
    }

    /// Turn borrowed `(path, size)` pairs into the owned form the builder takes.
    fn sized_paths(items: &[(&str, u64)]) -> Vec<(String, u64)> {
        items
            .iter()
            .map(|&(path, size)| (path.to_string(), size))
            .collect()
    }

    #[test]
    fn test_confidence_tier_from_score() {
        let cases = [
            (1.0, ConfidenceTier::Exact),
            (0.98, ConfidenceTier::Exact),
            (0.95, ConfidenceTier::High),
            (0.90, ConfidenceTier::High),
            (0.80, ConfidenceTier::Medium),
            (0.75, ConfidenceTier::Medium),
            (0.50, ConfidenceTier::Low),
            (0.0, ConfidenceTier::Low),
        ];
        for (score, expected) in cases {
            assert_eq!(ConfidenceTier::from_score(score), expected);
        }
    }

    #[test]
    fn test_confidence_tier_ordering() {
        assert!(ConfidenceTier::Exact > ConfidenceTier::Low);
        assert!(ConfidenceTier::High > ConfidenceTier::Medium);
    }

    #[test]
    fn test_recommended_action_labels() {
        let cases = [
            (RecommendedAction::DeleteDuplicates, "delete"),
            (RecommendedAction::SymlinkDuplicates, "symlink"),
            (RecommendedAction::HardlinkDuplicates, "hardlink"),
            (RecommendedAction::ArchiveDuplicates, "archive"),
            (RecommendedAction::ManualReview, "manual_review"),
            (RecommendedAction::NoAction, "no_action"),
        ];
        for (action, expected) in cases {
            assert_eq!(action.label(), expected);
        }
    }

    #[test]
    fn test_space_savings_from_entries() {
        let mut kept = entry("a.mp4", 1000, 1.0);
        kept.is_representative = true;
        let files = [kept, entry("b.mp4", 800, 0.95)];

        let est = SpaceSavingsEstimate::from_entries(&files);
        assert_eq!(est.retained_bytes, 1000);
        assert_eq!(est.reclaimable_bytes, 800);
        assert!((est.savings_ratio - 800.0 / 1800.0).abs() < 1e-9);
    }

    #[test]
    fn test_space_savings_empty() {
        let est = SpaceSavingsEstimate::from_entries(&[]);
        assert_eq!(est.reclaimable_bytes, 0);
        assert_eq!(est.savings_ratio, 0.0);
    }

    #[test]
    fn test_space_savings_merge() {
        let mut acc = SpaceSavingsEstimate {
            reclaimable_bytes: 500,
            retained_bytes: 1000,
            savings_ratio: 0.333,
        };
        let extra = SpaceSavingsEstimate {
            reclaimable_bytes: 300,
            retained_bytes: 700,
            savings_ratio: 0.3,
        };
        acc.merge(&extra);

        assert_eq!(acc.reclaimable_bytes, 800);
        assert_eq!(acc.retained_bytes, 1700);
        let expected_ratio = 800.0 / 2500.0;
        assert!((acc.savings_ratio - expected_ratio).abs() < 1e-9);
    }

    #[test]
    fn test_group_select_largest_representative() {
        let mut group = DetailedDuplicateGroup::new(0, "phash", 0.95);
        for (path, size) in [("small.mp4", 100), ("large.mp4", 9000), ("medium.mp4", 500)] {
            group.add_file(entry(path, size, 0.95));
        }
        group.select_largest_representative();
        assert_eq!(group.representative_path(), Some("large.mp4"));
    }

    #[test]
    fn test_group_assign_action_exact() {
        let mut group = DetailedDuplicateGroup::new(0, "hash", 0.999);
        for path in ["a.mp4", "b.mp4"] {
            group.add_file(entry(path, 100, 1.0));
        }
        apply_default_policy(&mut group);
        assert_eq!(group.action, RecommendedAction::DeleteDuplicates);
    }

    #[test]
    fn test_group_assign_action_low_confidence() {
        let mut group = DetailedDuplicateGroup::new(0, "ssim", 0.65);
        for path in ["a.mp4", "b.mp4"] {
            group.add_file(entry(path, 100, 0.65));
        }
        apply_default_policy(&mut group);
        assert_eq!(group.action, RecommendedAction::ManualReview);
    }

    #[test]
    fn test_group_single_member_no_action() {
        let mut group = DetailedDuplicateGroup::new(0, "phash", 1.0);
        group.add_file(entry("only.mp4", 500, 1.0));
        apply_default_policy(&mut group);
        assert_eq!(group.action, RecommendedAction::NoAction);
    }

    #[test]
    fn test_group_metadata() {
        let mut group = DetailedDuplicateGroup::new(0, "phash", 0.95);
        for (key, value) in [("codec", "h264"), ("resolution", "1920x1080")] {
            group.set_metadata(key, value);
        }
        assert_eq!(
            group.metadata.get("codec").map(String::as_str),
            Some("h264")
        );
        assert_eq!(group.metadata.len(), 2);
    }

    #[test]
    fn test_report_builder_empty() {
        let report = DetailedReportBuilder::new()
            .total_files_examined(50)
            .build();

        assert_eq!(report.total_files_examined, 50);
        assert!(report.groups.is_empty());
        assert_eq!(report.group_count(), 0);
        assert_eq!(report.duplicate_file_count(), 0);
    }

    #[test]
    fn test_report_builder_with_groups() {
        let first = sized_paths(&[("a.mp4", 2000), ("b.mp4", 1500)]);
        let second = sized_paths(&[("c.mp4", 1000), ("d.mp4", 900)]);

        let report = DetailedReportBuilder::new()
            .total_files_examined(200)
            .add_simple_group(0, "phash", 0.96, first)
            .add_simple_group(1, "ssim", 0.82, second)
            .build();

        assert_eq!(report.group_count(), 2);
        assert_eq!(report.duplicate_file_count(), 4);
        assert!(report.total_space_savings.reclaimable_bytes > 0);
        assert!(!report.summary().is_empty());
    }

    #[test]
    fn test_report_tier_and_action_counts() {
        let exact_group = sized_paths(&[("a.mp4", 1000), ("b.mp4", 800)]);
        let low_group = sized_paths(&[("c.mp4", 500), ("d.mp4", 400)]);

        let report = DetailedReportBuilder::new()
            .add_simple_group(0, "phash", 0.999, exact_group)
            .add_simple_group(1, "ssim", 0.60, low_group)
            .build();

        // A missing key counts as zero.
        let tally = |counts: &std::collections::HashMap<String, usize>, key: &str| {
            counts.get(key).copied().unwrap_or(0)
        };

        // Exact-confidence group → delete; Low-confidence → manual_review.
        assert_eq!(tally(&report.tier_counts, "exact"), 1);
        assert_eq!(tally(&report.tier_counts, "low"), 1);
        assert_eq!(tally(&report.action_counts, "delete"), 1);
        assert_eq!(tally(&report.action_counts, "manual_review"), 1);
    }
}