Skip to main content

oximedia_dedup/
dedup_report_ext.rs

1//! Extended deduplication reporting and statistics.
2//!
3//! Augments the base [`report`](crate::report) module with:
4//! - **`ReportBuilder`**: fluent builder for assembling detailed reports
5//! - **`SizeDistribution`**: histogram of duplicate file sizes
6//! - **`FormatBreakdown`**: per-format duplicate statistics
7//! - **`ReportSummary`**: human-readable summary generation
8
9#![allow(dead_code)]
10#![allow(clippy::cast_precision_loss)]
11
12use std::collections::HashMap;
13use std::path::{Path, PathBuf};
14
15// ---------------------------------------------------------------------------
16// DuplicateEntry
17// ---------------------------------------------------------------------------
18
/// One file taking part in a duplicate set, as it appears in a report.
#[derive(Debug, Clone)]
pub struct DuplicateEntry {
    /// Location of the file.
    pub path: PathBuf,
    /// Size of the file, in bytes.
    pub size: u64,
    /// Hash digest of the file; entries with equal digests are grouped together.
    pub digest: String,
    /// Extension of `path`, lowercased and without the leading dot.
    /// Empty when the path has no extension or the extension is not valid UTF-8.
    pub extension: String,
}

impl DuplicateEntry {
    /// Build an entry from its path, byte size and digest.
    ///
    /// The `extension` field is derived from `path`; it is not supplied by
    /// the caller.
    pub fn new(path: PathBuf, size: u64, digest: &str) -> Self {
        // A missing or non-UTF-8 extension is recorded as the empty string.
        let extension = match path.extension().and_then(|raw| raw.to_str()) {
            Some(ext) => ext.to_lowercase(),
            None => String::new(),
        };
        let digest = digest.to_string();

        Self {
            path,
            size,
            digest,
            extension,
        }
    }
}
48
49// ---------------------------------------------------------------------------
50// ReportBuilder
51// ---------------------------------------------------------------------------
52
53/// Fluent builder for constructing an extended dedup report.
54pub struct ReportBuilder {
55    /// All duplicate entries.
56    entries: Vec<DuplicateEntry>,
57    /// Title of the report.
58    title: String,
59    /// Minimum group size to include.
60    min_group_size: usize,
61    /// Only include files larger than this threshold.
62    min_file_size: u64,
63}
64
65impl ReportBuilder {
66    /// Start building a new report.
67    #[must_use]
68    pub fn new() -> Self {
69        Self {
70            entries: Vec::new(),
71            title: "Deduplication Report".to_string(),
72            min_group_size: 2,
73            min_file_size: 0,
74        }
75    }
76
77    /// Set the report title.
78    #[must_use]
79    pub fn title(mut self, title: &str) -> Self {
80        self.title = title.to_string();
81        self
82    }
83
84    /// Set minimum duplicate group size to include.
85    #[must_use]
86    pub fn min_group_size(mut self, n: usize) -> Self {
87        self.min_group_size = n;
88        self
89    }
90
91    /// Set minimum file size filter.
92    #[must_use]
93    pub fn min_file_size(mut self, bytes: u64) -> Self {
94        self.min_file_size = bytes;
95        self
96    }
97
98    /// Add a duplicate entry.
99    pub fn add_entry(&mut self, entry: DuplicateEntry) {
100        self.entries.push(entry);
101    }
102
103    /// Add multiple entries.
104    pub fn add_entries(&mut self, entries: impl IntoIterator<Item = DuplicateEntry>) {
105        self.entries.extend(entries);
106    }
107
108    /// Build the final report.
109    #[must_use]
110    pub fn build(self) -> ExtendedReport {
111        // Group entries by digest
112        let mut groups: HashMap<String, Vec<DuplicateEntry>> = HashMap::new();
113        for entry in self.entries {
114            if entry.size >= self.min_file_size {
115                groups.entry(entry.digest.clone()).or_default().push(entry);
116            }
117        }
118
119        // Filter by min group size
120        let dup_groups: Vec<DuplicateGroup> = groups
121            .into_iter()
122            .filter(|(_, v)| v.len() >= self.min_group_size)
123            .map(|(digest, files)| {
124                let total_size: u64 = files.iter().map(|f| f.size).sum();
125                let recoverable = files.iter().skip(1).map(|f| f.size).sum();
126                DuplicateGroup {
127                    digest,
128                    files,
129                    total_size,
130                    recoverable_bytes: recoverable,
131                }
132            })
133            .collect();
134
135        let total_files: usize = dup_groups.iter().map(|g| g.files.len()).sum();
136        let total_recoverable: u64 = dup_groups.iter().map(|g| g.recoverable_bytes).sum();
137
138        ExtendedReport {
139            title: self.title,
140            groups: dup_groups,
141            total_duplicate_files: total_files,
142            total_recoverable_bytes: total_recoverable,
143        }
144    }
145}
146
147impl Default for ReportBuilder {
148    fn default() -> Self {
149        Self::new()
150    }
151}
152
153// ---------------------------------------------------------------------------
154// DuplicateGroup / ExtendedReport
155// ---------------------------------------------------------------------------
156
157/// A group of duplicate files sharing the same digest.
158#[derive(Debug, Clone)]
159pub struct DuplicateGroup {
160    /// The shared digest.
161    pub digest: String,
162    /// Files in this group.
163    pub files: Vec<DuplicateEntry>,
164    /// Total size of all files in the group.
165    pub total_size: u64,
166    /// Bytes recoverable by keeping only one copy.
167    pub recoverable_bytes: u64,
168}
169
170/// Full extended deduplication report.
171#[derive(Debug, Clone)]
172pub struct ExtendedReport {
173    /// Report title.
174    pub title: String,
175    /// Duplicate groups.
176    pub groups: Vec<DuplicateGroup>,
177    /// Total number of duplicate files.
178    pub total_duplicate_files: usize,
179    /// Total bytes recoverable.
180    pub total_recoverable_bytes: u64,
181}
182
183impl ExtendedReport {
184    /// Return the number of duplicate groups.
185    #[must_use]
186    pub fn group_count(&self) -> usize {
187        self.groups.len()
188    }
189
190    /// Return a per-extension breakdown of duplicates.
191    #[must_use]
192    pub fn format_breakdown(&self) -> FormatBreakdown {
193        let mut by_ext: HashMap<String, ExtStats> = HashMap::new();
194
195        for group in &self.groups {
196            for file in &group.files {
197                let ext = if file.extension.is_empty() {
198                    "(none)".to_string()
199                } else {
200                    file.extension.clone()
201                };
202                let stats = by_ext.entry(ext).or_insert_with(ExtStats::default);
203                stats.file_count += 1;
204                stats.total_bytes += file.size;
205            }
206        }
207
208        FormatBreakdown {
209            by_extension: by_ext,
210        }
211    }
212
213    /// Return a human-readable summary string.
214    #[must_use]
215    pub fn summary_text(&self) -> String {
216        format!(
217            "{}: {} duplicate groups, {} files, {:.2} MB recoverable",
218            self.title,
219            self.groups.len(),
220            self.total_duplicate_files,
221            self.total_recoverable_bytes as f64 / (1024.0 * 1024.0),
222        )
223    }
224
225    /// Build a size distribution histogram with the given bucket boundaries.
226    #[must_use]
227    pub fn size_distribution(&self, bucket_boundaries: &[u64]) -> SizeDistribution {
228        let mut buckets = vec![0u64; bucket_boundaries.len() + 1];
229
230        for group in &self.groups {
231            for file in &group.files {
232                let idx = bucket_boundaries
233                    .iter()
234                    .position(|&b| file.size < b)
235                    .unwrap_or(bucket_boundaries.len());
236                buckets[idx] += 1;
237            }
238        }
239
240        SizeDistribution {
241            boundaries: bucket_boundaries.to_vec(),
242            counts: buckets,
243        }
244    }
245
246    /// Filter groups, keeping only those containing a file under `prefix`.
247    #[must_use]
248    pub fn filter_by_path(&self, prefix: &Path) -> Vec<&DuplicateGroup> {
249        self.groups
250            .iter()
251            .filter(|g| g.files.iter().any(|f| f.path.starts_with(prefix)))
252            .collect()
253    }
254}
255
256// ---------------------------------------------------------------------------
257// FormatBreakdown
258// ---------------------------------------------------------------------------
259
/// Counters accumulated for a single file extension.
#[derive(Debug, Clone, Default)]
pub struct ExtStats {
    /// How many files carry this extension.
    pub file_count: usize,
    /// Combined size of those files, in bytes.
    pub total_bytes: u64,
}
268
269/// Duplicate statistics broken down by file extension.
270#[derive(Debug, Clone)]
271pub struct FormatBreakdown {
272    /// Map from extension to stats.
273    pub by_extension: HashMap<String, ExtStats>,
274}
275
276impl FormatBreakdown {
277    /// Return the extension with the most duplicate files.
278    #[must_use]
279    pub fn most_common_ext(&self) -> Option<(&str, usize)> {
280        self.by_extension
281            .iter()
282            .max_by_key(|(_, s)| s.file_count)
283            .map(|(ext, s)| (ext.as_str(), s.file_count))
284    }
285}
286
287// ---------------------------------------------------------------------------
288// SizeDistribution
289// ---------------------------------------------------------------------------
290
/// File-size histogram.
#[derive(Debug, Clone)]
pub struct SizeDistribution {
    /// Exclusive upper bound of each bucket.
    pub boundaries: Vec<u64>,
    /// Number of files counted in each bucket.
    pub counts: Vec<u64>,
}

impl SizeDistribution {
    /// Sum of the file counts of every bucket.
    #[must_use]
    pub fn total(&self) -> u64 {
        let per_bucket = self.counts.iter().copied();
        per_bucket.sum()
    }
}
307
308// ---------------------------------------------------------------------------
309// Tests
310// ---------------------------------------------------------------------------
311
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: two `hash1` files (1 MB each), two `hash2` files (500 kB each)
    /// and a single `hash3` file (200 kB).
    fn sample_entries() -> Vec<DuplicateEntry> {
        let specs: Vec<(&str, u64, &str)> = vec![
            ("/a.mp4", 1_000_000, "hash1"),
            ("/b.mp4", 1_000_000, "hash1"),
            ("/c.mov", 500_000, "hash2"),
            ("/d.mov", 500_000, "hash2"),
            ("/e.wav", 200_000, "hash3"),
        ];
        specs
            .into_iter()
            .map(|(path, size, digest)| DuplicateEntry::new(PathBuf::from(path), size, digest))
            .collect()
    }

    /// Load the fixture entries into `builder` and build the report.
    fn sample_report(mut builder: ReportBuilder) -> ExtendedReport {
        builder.add_entries(sample_entries());
        builder.build()
    }

    #[test]
    fn test_report_builder_basic() {
        let report = sample_report(ReportBuilder::new());
        // Only hash1 and hash2 reach the default minimum of two files.
        assert_eq!(report.group_count(), 2);
    }

    #[test]
    fn test_report_builder_title() {
        let report = ReportBuilder::new().title("My Report").build();
        assert_eq!(report.title, "My Report");
    }

    #[test]
    fn test_report_builder_min_group_size() {
        let report = sample_report(ReportBuilder::new().min_group_size(3));
        // No digest in the fixture has three or more files.
        assert_eq!(report.group_count(), 0);
    }

    #[test]
    fn test_report_builder_min_file_size() {
        let report = sample_report(ReportBuilder::new().min_file_size(600_000));
        // Only the 1 MB hash1 files survive the size filter.
        assert_eq!(report.group_count(), 1);
    }

    #[test]
    fn test_recoverable_bytes() {
        let report = sample_report(ReportBuilder::new());
        // hash1 frees 1 MB and hash2 frees 500 kB.
        assert_eq!(report.total_recoverable_bytes, 1_500_000);
    }

    #[test]
    fn test_summary_text() {
        let report = sample_report(ReportBuilder::new().title("Test"));
        let text = report.summary_text();
        assert!(text.contains("Test"));
        assert!(text.contains("duplicate groups"));
    }

    #[test]
    fn test_format_breakdown() {
        let breakdown = sample_report(ReportBuilder::new()).format_breakdown();
        assert!(breakdown.by_extension.contains_key("mp4"));
        assert!(breakdown.by_extension.contains_key("mov"));
    }

    #[test]
    fn test_most_common_ext() {
        let breakdown = sample_report(ReportBuilder::new()).format_breakdown();
        let (ext, count) = breakdown
            .most_common_ext()
            .expect("operation should succeed");
        // mp4 and mov are tied at two files each, so either may be reported.
        assert!(count >= 2);
        assert!(ext == "mp4" || ext == "mov");
    }

    #[test]
    fn test_size_distribution() {
        let report = sample_report(ReportBuilder::new());
        let dist = report.size_distribution(&[100_000, 750_000, 2_000_000]);
        // Two groups of two files each.
        assert_eq!(dist.total(), 4);
    }

    #[test]
    fn test_filter_by_path() {
        let mut builder = ReportBuilder::new();
        builder.add_entries(vec![
            DuplicateEntry::new(PathBuf::from("/archive/a.mp4"), 100, "h1"),
            DuplicateEntry::new(PathBuf::from("/other/b.mp4"), 100, "h1"),
        ]);
        let report = builder.build();
        let filtered = report.filter_by_path(Path::new("/archive"));
        assert_eq!(filtered.len(), 1);
    }

    #[test]
    fn test_empty_report() {
        let report = ReportBuilder::new().build();
        assert_eq!(report.group_count(), 0);
        assert_eq!(report.total_duplicate_files, 0);
        assert_eq!(report.total_recoverable_bytes, 0);
    }

    #[test]
    fn test_duplicate_entry_extension() {
        let entry = DuplicateEntry::new(PathBuf::from("/foo.MP4"), 0, "x");
        assert_eq!(entry.extension, "mp4");
    }
}