Skip to main content

oximedia_dedup/
dedup_stats.rs

1//! Extended deduplication statistics: space savings, group statistics, action recommendations.
2
3#![allow(dead_code)]
4#![allow(clippy::cast_precision_loss)]
5#![allow(clippy::too_many_arguments)]
6
7use std::path::PathBuf;
8
/// Aggregate size and similarity figures for one group of duplicate files.
#[derive(Debug, Clone)]
pub struct GroupStats {
    /// How many files belong to the group.
    pub count: usize,
    /// Combined size of every member, in bytes.
    pub total_bytes: u64,
    /// Size in bytes of the file chosen to be kept (0 when it is not a member).
    pub representative_bytes: u64,
    /// Bytes freed if everything except the representative is removed.
    pub reclaimable_bytes: u64,
    /// Mean similarity score for the group, exactly as supplied by the caller.
    pub avg_similarity: f64,
}

impl GroupStats {
    /// Build the stats for a group from its `(path, size)` members.
    ///
    /// `representative` is looked up among `members` by path equality and the
    /// first match supplies `representative_bytes`. When no member matches,
    /// `representative_bytes` is 0 and the whole group counts as reclaimable.
    /// `avg_similarity` is stored unchanged.
    #[must_use]
    pub fn compute(
        members: &[(PathBuf, u64)],
        representative: &PathBuf,
        avg_similarity: f64,
    ) -> Self {
        let mut total_bytes: u64 = 0;
        let mut keeper_size: Option<u64> = None;
        for (path, size) in members {
            total_bytes += *size;
            // Only the first member matching the representative path counts.
            if keeper_size.is_none() && path == representative {
                keeper_size = Some(*size);
            }
        }
        let representative_bytes = keeper_size.unwrap_or_default();

        Self {
            count: members.len(),
            total_bytes,
            representative_bytes,
            reclaimable_bytes: total_bytes.saturating_sub(representative_bytes),
            avg_similarity,
        }
    }
}
48
49/// Overall deduplication space savings report.
50#[derive(Debug, Clone, Default)]
51pub struct SpaceSavingsReport {
52    /// Total number of files scanned.
53    pub total_files: usize,
54    /// Total bytes across all scanned files.
55    pub total_bytes: u64,
56    /// Number of duplicate groups found.
57    pub duplicate_groups: usize,
58    /// Total number of files that are duplicates (redundant copies).
59    pub duplicate_files: usize,
60    /// Total bytes reclaimable by removing duplicates.
61    pub reclaimable_bytes: u64,
62    /// Percentage of storage that could be freed.
63    pub savings_percent: f64,
64    /// Per-group stats.
65    pub group_stats: Vec<GroupStats>,
66}
67
68impl SpaceSavingsReport {
69    /// Create a new empty report.
70    #[must_use]
71    pub fn new() -> Self {
72        Self::default()
73    }
74
75    /// Add a group's stats to the report.
76    pub fn add_group(&mut self, stats: GroupStats) {
77        self.duplicate_groups += 1;
78        self.duplicate_files += stats.count.saturating_sub(1);
79        self.reclaimable_bytes += stats.reclaimable_bytes;
80        self.group_stats.push(stats);
81    }
82
83    /// Finalise the report by computing derived fields.
84    pub fn finalise(&mut self, total_files: usize, total_bytes: u64) {
85        self.total_files = total_files;
86        self.total_bytes = total_bytes;
87        self.savings_percent = if total_bytes > 0 {
88            100.0 * self.reclaimable_bytes as f64 / total_bytes as f64
89        } else {
90            0.0
91        };
92    }
93
94    /// Human-readable summary line.
95    #[must_use]
96    pub fn summary(&self) -> String {
97        format!(
98            "{} duplicate groups | {} redundant files | {} MB reclaimable ({:.1}%)",
99            self.duplicate_groups,
100            self.duplicate_files,
101            self.reclaimable_bytes / (1024 * 1024),
102            self.savings_percent,
103        )
104    }
105}
106
/// Severity of a recommended action.
///
/// Variants are declared from least to most urgent, so the derived ordering
/// satisfies `Info < Suggestion < Warning < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ActionSeverity {
    /// Informational only.
    Info,
    /// Suggested optimisation.
    Suggestion,
    /// Strongly recommended.
    Warning,
    /// Critical (e.g. very large duplicate sets).
    Critical,
}

/// A recommended action for a duplicate group.
#[derive(Debug, Clone)]
pub struct ActionRecommendation {
    /// Severity of this recommendation.
    pub severity: ActionSeverity,
    /// Human-readable message.
    pub message: String,
    /// Files recommended for deletion; empty when no file to keep is known.
    pub files_to_remove: Vec<PathBuf>,
    /// The file to keep, if one was designated.
    pub keep: Option<PathBuf>,
}

impl ActionRecommendation {
    /// Build a recommendation for one duplicate group.
    ///
    /// Severity is derived from `reclaimable_bytes`:
    /// more than 1_000_000_000 is `Critical`, more than 100_000_000 is
    /// `Warning`, any other non-zero amount is `Suggestion`, and zero is
    /// `Info`.
    ///
    /// When `representative` is `Some`, every member whose path differs from
    /// it is listed in `files_to_remove`. When it is `None` there is no file
    /// designated to survive, so nothing is recommended for removal: listing
    /// all members would advise deleting every copy of the content. The
    /// message then asks for a representative to be chosen instead.
    #[must_use]
    pub fn from_group(
        representative: Option<PathBuf>,
        members: &[PathBuf],
        reclaimable_bytes: u64,
    ) -> Self {
        const CRITICAL_THRESHOLD_BYTES: u64 = 1_000_000_000;
        const WARNING_THRESHOLD_BYTES: u64 = 100_000_000;
        const BYTES_PER_MB: u64 = 1024 * 1024;

        let severity = if reclaimable_bytes > CRITICAL_THRESHOLD_BYTES {
            ActionSeverity::Critical
        } else if reclaimable_bytes > WARNING_THRESHOLD_BYTES {
            ActionSeverity::Warning
        } else if reclaimable_bytes > 0 {
            ActionSeverity::Suggestion
        } else {
            ActionSeverity::Info
        };

        let reclaimable_mb = reclaimable_bytes / BYTES_PER_MB;

        let (files_to_remove, message) = match representative.as_ref() {
            Some(keep) => {
                let removable: Vec<PathBuf> =
                    members.iter().filter(|p| *p != keep).cloned().collect();
                let message = format!(
                    "Remove {} duplicate(s) to reclaim {} MB",
                    removable.len(),
                    reclaimable_mb,
                );
                (removable, message)
            }
            // Without a keeper, never suggest deleting the whole group.
            None => (
                Vec::new(),
                format!(
                    "No representative selected for {} file(s); choose one to keep before reclaiming {} MB",
                    members.len(),
                    reclaimable_mb,
                ),
            ),
        };

        Self {
            severity,
            message,
            files_to_remove,
            keep: representative,
        }
    }
}
171
172/// Generate action recommendations for all groups.
173#[must_use]
174pub fn generate_recommendations(
175    report: &SpaceSavingsReport,
176    clusters: &[ClusterInfo],
177) -> Vec<ActionRecommendation> {
178    clusters
179        .iter()
180        .zip(report.group_stats.iter())
181        .map(|(cluster, stats)| {
182            ActionRecommendation::from_group(
183                cluster.representative.clone(),
184                &cluster.members,
185                stats.reclaimable_bytes,
186            )
187        })
188        .collect()
189}
190
/// Minimal description of a duplicate cluster, used when generating
/// recommendations.
#[derive(Debug, Clone)]
pub struct ClusterInfo {
    /// Paths of every file in the cluster.
    pub members: Vec<PathBuf>,
    /// Path of the file that should be kept, if one has been chosen.
    pub representative: Option<PathBuf>,
}

impl ClusterInfo {
    /// Bundle a member list and an optional representative into a cluster.
    ///
    /// Both arguments are stored as given; no check is made that the
    /// representative is one of the members.
    #[must_use]
    pub fn new(members: Vec<PathBuf>, representative: Option<PathBuf>) -> Self {
        ClusterInfo { members, representative }
    }
}
210
/// Histogram of similarity scores over equal-width buckets spanning `[0.0, 1.0]`.
#[derive(Debug, Clone)]
pub struct SimilarityHistogram {
    /// Sample count per bucket. With `n` buckets, bucket `i` counts scores in
    /// `[i / n, (i + 1) / n)`; the last bucket also counts a score of 1.0.
    pub buckets: Vec<u64>,
    /// Number of buckets requested at construction.
    pub num_buckets: usize,
}

impl SimilarityHistogram {
    /// Create a new histogram with `num_buckets` equal-width buckets, all empty.
    ///
    /// A histogram created with zero buckets is valid but cannot hold
    /// samples: [`SimilarityHistogram::record`] ignores every score.
    #[must_use]
    pub fn new(num_buckets: usize) -> Self {
        Self {
            buckets: vec![0; num_buckets],
            num_buckets,
        }
    }

    /// Record a similarity score.
    ///
    /// Scores outside `[0.0, 1.0]` are clamped into that range first. NaN
    /// scores are ignored, since they would otherwise be counted as zero
    /// similarity. Nothing is recorded when the histogram has no buckets.
    pub fn record(&mut self, score: f64) {
        // Use the real vector length so indexing stays in bounds even if the
        // public `num_buckets` field has been changed independently.
        let bucket_count = self.buckets.len();
        if bucket_count == 0 || score.is_nan() {
            return;
        }
        let score = score.clamp(0.0, 1.0);
        // A score of exactly 1.0 maps to `bucket_count`; fold it into the last bucket.
        let idx = ((score * bucket_count as f64) as usize).min(bucket_count - 1);
        self.buckets[idx] += 1;
    }

    /// Total number of recorded samples.
    #[must_use]
    pub fn total(&self) -> u64 {
        self.buckets.iter().sum()
    }

    /// Mode bucket index (the bucket with the most samples).
    ///
    /// When several buckets share the highest count, the highest such index
    /// is returned. Returns 0 for a histogram with no buckets.
    #[must_use]
    pub fn mode_bucket(&self) -> usize {
        self.buckets
            .iter()
            .enumerate()
            .max_by_key(|(_, &v)| v)
            .map(|(i, _)| i)
            .unwrap_or(0)
    }
}
254
// Unit tests for the statistics, report, recommendation and histogram types
// defined in this module.
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a `PathBuf` from a string literal.
    fn pb(s: &str) -> PathBuf {
        PathBuf::from(s)
    }

    // The representative is a member: its size is excluded from the
    // reclaimable total.
    #[test]
    fn test_group_stats_compute() {
        let members = vec![(pb("a.mp4"), 1000u64), (pb("b.mp4"), 2000u64)];
        let rep = pb("a.mp4");
        let stats = GroupStats::compute(&members, &rep, 0.95);
        assert_eq!(stats.count, 2);
        assert_eq!(stats.total_bytes, 3000);
        assert_eq!(stats.representative_bytes, 1000);
        assert_eq!(stats.reclaimable_bytes, 2000);
        assert!((stats.avg_similarity - 0.95).abs() < 1e-9);
    }

    // The representative is not a member: its size falls back to 0 and the
    // whole group is reclaimable.
    #[test]
    fn test_group_stats_missing_representative() {
        let members = vec![(pb("a.mp4"), 500u64)];
        let rep = pb("missing.mp4");
        let stats = GroupStats::compute(&members, &rep, 0.0);
        assert_eq!(stats.representative_bytes, 0);
        assert_eq!(stats.reclaimable_bytes, 500);
    }

    // One group of three files: two redundant copies, 6 MB of 20 MB = 30 %.
    #[test]
    fn test_space_savings_report_add_and_finalise() {
        let mut report = SpaceSavingsReport::new();
        let stats = GroupStats {
            count: 3,
            total_bytes: 9_000_000,
            representative_bytes: 3_000_000,
            reclaimable_bytes: 6_000_000,
            avg_similarity: 0.98,
        };
        report.add_group(stats);
        report.finalise(10, 20_000_000);
        assert_eq!(report.duplicate_groups, 1);
        assert_eq!(report.duplicate_files, 2);
        assert_eq!(report.reclaimable_bytes, 6_000_000);
        assert!((report.savings_percent - 30.0).abs() < 1e-6);
    }

    // A zero byte total must not divide by zero; the percentage stays 0.
    #[test]
    fn test_space_savings_report_zero_total() {
        let mut report = SpaceSavingsReport::new();
        report.finalise(0, 0);
        assert_eq!(report.savings_percent, 0.0);
    }

    // The summary line reports the group and redundant-file counts.
    #[test]
    fn test_space_savings_report_summary() {
        let mut report = SpaceSavingsReport::new();
        let stats = GroupStats {
            count: 2,
            total_bytes: 2_097_152,
            representative_bytes: 1_048_576,
            reclaimable_bytes: 1_048_576,
            avg_similarity: 0.9,
        };
        report.add_group(stats);
        report.finalise(5, 10_485_760);
        let s = report.summary();
        assert!(s.contains("1 duplicate groups"));
        assert!(s.contains("1 redundant files"));
    }

    // Severities compare in declaration order, least urgent first.
    #[test]
    fn test_action_severity_ordering() {
        assert!(ActionSeverity::Info < ActionSeverity::Suggestion);
        assert!(ActionSeverity::Suggestion < ActionSeverity::Warning);
        assert!(ActionSeverity::Warning < ActionSeverity::Critical);
    }

    // More than 1_000_000_000 reclaimable bytes is critical; the
    // representative is kept and the other member is listed for removal.
    #[test]
    fn test_action_recommendation_critical() {
        let members = vec![pb("a.mp4"), pb("b.mp4")];
        let rec = ActionRecommendation::from_group(Some(pb("a.mp4")), &members, 2_000_000_000);
        assert_eq!(rec.severity, ActionSeverity::Critical);
        assert_eq!(rec.files_to_remove.len(), 1);
        assert_eq!(rec.keep, Some(pb("a.mp4")));
    }

    // A non-zero amount at or below 100_000_000 bytes is only a suggestion.
    #[test]
    fn test_action_recommendation_suggestion() {
        let members = vec![pb("a.mp4"), pb("b.mp4")];
        let rec = ActionRecommendation::from_group(Some(pb("a.mp4")), &members, 50_000_000);
        assert_eq!(rec.severity, ActionSeverity::Suggestion);
    }

    // Nothing reclaimable: informational, and the sole member (the
    // representative) is not listed for removal.
    #[test]
    fn test_action_recommendation_info() {
        let members = vec![pb("a.mp4")];
        let rec = ActionRecommendation::from_group(Some(pb("a.mp4")), &members, 0);
        assert_eq!(rec.severity, ActionSeverity::Info);
        assert!(rec.files_to_remove.is_empty());
    }

    // One cluster paired with one group's stats yields one recommendation.
    #[test]
    fn test_generate_recommendations() {
        let mut report = SpaceSavingsReport::new();
        report.add_group(GroupStats {
            count: 2,
            total_bytes: 200,
            representative_bytes: 100,
            reclaimable_bytes: 100,
            avg_similarity: 0.9,
        });
        report.finalise(2, 200);
        let clusters = vec![ClusterInfo::new(
            vec![pb("a.mp4"), pb("b.mp4")],
            Some(pb("a.mp4")),
        )];
        let recs = generate_recommendations(&report, &clusters);
        assert_eq!(recs.len(), 1);
        assert_eq!(recs[0].files_to_remove.len(), 1);
    }

    // Every recorded score, including the 0.0 and 1.0 endpoints, is counted.
    #[test]
    fn test_similarity_histogram_record_and_total() {
        let mut h = SimilarityHistogram::new(10);
        h.record(0.0);
        h.record(0.5);
        h.record(0.5);
        h.record(1.0);
        assert_eq!(h.total(), 4);
    }

    // Out-of-range scores are clamped rather than dropped or panicking.
    #[test]
    fn test_similarity_histogram_clamp() {
        let mut h = SimilarityHistogram::new(10);
        h.record(-0.1); // clamps to 0.0
        h.record(1.5); // clamps to 1.0
        assert_eq!(h.total(), 2);
    }

    // Three samples in the top bucket outweigh one sample in a low bucket.
    #[test]
    fn test_similarity_histogram_mode() {
        let mut h = SimilarityHistogram::new(10);
        h.record(0.95);
        h.record(0.96);
        h.record(0.97);
        h.record(0.1);
        // Bucket 9 (0.9-1.0) should be the mode.
        assert_eq!(h.mode_bucket(), 9);
    }
}
405}