1#![allow(dead_code)]
10#![allow(clippy::cast_precision_loss)]
11
12use std::collections::HashMap;
13use std::path::{Path, PathBuf};
14
/// One file taking part in duplicate detection.
///
/// Entries carry everything the report needs about a file: where it is, how
/// large it is, the digest used to match it against other files, and its
/// normalised extension.
#[derive(Debug, Clone)]
pub struct DuplicateEntry {
    /// Location of the file.
    pub path: PathBuf,
    /// File size in bytes.
    pub size: u64,
    /// Digest string supplied by the caller; entries with equal digests are
    /// treated as duplicates of each other.
    pub digest: String,
    /// Lowercased extension of `path` as computed by [`DuplicateEntry::new`];
    /// empty when the path has no extension or it is not valid UTF-8.
    pub extension: String,
}

impl DuplicateEntry {
    /// Creates an entry for `path` with the given `size` and `digest`.
    ///
    /// The `extension` field is derived from `path`: it is the lowercased
    /// extension, or an empty string when there is none (or when it cannot be
    /// represented as UTF-8).
    pub fn new(path: PathBuf, size: u64, digest: &str) -> Self {
        // Compute the extension before `path` is moved into the struct.
        let extension = match path.extension().and_then(|ext| ext.to_str()) {
            Some(ext) => ext.to_lowercase(),
            None => String::new(),
        };

        Self {
            path,
            size,
            digest: digest.to_owned(),
            extension,
        }
    }
}
48
49pub struct ReportBuilder {
55 entries: Vec<DuplicateEntry>,
57 title: String,
59 min_group_size: usize,
61 min_file_size: u64,
63}
64
65impl ReportBuilder {
66 #[must_use]
68 pub fn new() -> Self {
69 Self {
70 entries: Vec::new(),
71 title: "Deduplication Report".to_string(),
72 min_group_size: 2,
73 min_file_size: 0,
74 }
75 }
76
77 #[must_use]
79 pub fn title(mut self, title: &str) -> Self {
80 self.title = title.to_string();
81 self
82 }
83
84 #[must_use]
86 pub fn min_group_size(mut self, n: usize) -> Self {
87 self.min_group_size = n;
88 self
89 }
90
91 #[must_use]
93 pub fn min_file_size(mut self, bytes: u64) -> Self {
94 self.min_file_size = bytes;
95 self
96 }
97
98 pub fn add_entry(&mut self, entry: DuplicateEntry) {
100 self.entries.push(entry);
101 }
102
103 pub fn add_entries(&mut self, entries: impl IntoIterator<Item = DuplicateEntry>) {
105 self.entries.extend(entries);
106 }
107
108 #[must_use]
110 pub fn build(self) -> ExtendedReport {
111 let mut groups: HashMap<String, Vec<DuplicateEntry>> = HashMap::new();
113 for entry in self.entries {
114 if entry.size >= self.min_file_size {
115 groups.entry(entry.digest.clone()).or_default().push(entry);
116 }
117 }
118
119 let dup_groups: Vec<DuplicateGroup> = groups
121 .into_iter()
122 .filter(|(_, v)| v.len() >= self.min_group_size)
123 .map(|(digest, files)| {
124 let total_size: u64 = files.iter().map(|f| f.size).sum();
125 let recoverable = files.iter().skip(1).map(|f| f.size).sum();
126 DuplicateGroup {
127 digest,
128 files,
129 total_size,
130 recoverable_bytes: recoverable,
131 }
132 })
133 .collect();
134
135 let total_files: usize = dup_groups.iter().map(|g| g.files.len()).sum();
136 let total_recoverable: u64 = dup_groups.iter().map(|g| g.recoverable_bytes).sum();
137
138 ExtendedReport {
139 title: self.title,
140 groups: dup_groups,
141 total_duplicate_files: total_files,
142 total_recoverable_bytes: total_recoverable,
143 }
144 }
145}
146
147impl Default for ReportBuilder {
148 fn default() -> Self {
149 Self::new()
150 }
151}
152
153#[derive(Debug, Clone)]
159pub struct DuplicateGroup {
160 pub digest: String,
162 pub files: Vec<DuplicateEntry>,
164 pub total_size: u64,
166 pub recoverable_bytes: u64,
168}
169
170#[derive(Debug, Clone)]
172pub struct ExtendedReport {
173 pub title: String,
175 pub groups: Vec<DuplicateGroup>,
177 pub total_duplicate_files: usize,
179 pub total_recoverable_bytes: u64,
181}
182
183impl ExtendedReport {
184 #[must_use]
186 pub fn group_count(&self) -> usize {
187 self.groups.len()
188 }
189
190 #[must_use]
192 pub fn format_breakdown(&self) -> FormatBreakdown {
193 let mut by_ext: HashMap<String, ExtStats> = HashMap::new();
194
195 for group in &self.groups {
196 for file in &group.files {
197 let ext = if file.extension.is_empty() {
198 "(none)".to_string()
199 } else {
200 file.extension.clone()
201 };
202 let stats = by_ext.entry(ext).or_insert_with(ExtStats::default);
203 stats.file_count += 1;
204 stats.total_bytes += file.size;
205 }
206 }
207
208 FormatBreakdown {
209 by_extension: by_ext,
210 }
211 }
212
213 #[must_use]
215 pub fn summary_text(&self) -> String {
216 format!(
217 "{}: {} duplicate groups, {} files, {:.2} MB recoverable",
218 self.title,
219 self.groups.len(),
220 self.total_duplicate_files,
221 self.total_recoverable_bytes as f64 / (1024.0 * 1024.0),
222 )
223 }
224
225 #[must_use]
227 pub fn size_distribution(&self, bucket_boundaries: &[u64]) -> SizeDistribution {
228 let mut buckets = vec![0u64; bucket_boundaries.len() + 1];
229
230 for group in &self.groups {
231 for file in &group.files {
232 let idx = bucket_boundaries
233 .iter()
234 .position(|&b| file.size < b)
235 .unwrap_or(bucket_boundaries.len());
236 buckets[idx] += 1;
237 }
238 }
239
240 SizeDistribution {
241 boundaries: bucket_boundaries.to_vec(),
242 counts: buckets,
243 }
244 }
245
246 #[must_use]
248 pub fn filter_by_path(&self, prefix: &Path) -> Vec<&DuplicateGroup> {
249 self.groups
250 .iter()
251 .filter(|g| g.files.iter().any(|f| f.path.starts_with(prefix)))
252 .collect()
253 }
254}
255
/// Running totals for one file extension.
#[derive(Debug, Clone, Default)]
pub struct ExtStats {
    /// Number of files seen with this extension.
    pub file_count: usize,
    /// Combined size, in bytes, of those files.
    pub total_bytes: u64,
}

/// Per-extension statistics keyed by extension string.
#[derive(Debug, Clone)]
pub struct FormatBreakdown {
    /// Statistics for each extension.
    pub by_extension: HashMap<String, ExtStats>,
}

impl FormatBreakdown {
    /// Returns the extension with the highest file count together with that
    /// count, or `None` when the breakdown is empty.
    ///
    /// When several extensions share the highest count, the lexicographically
    /// smallest extension is returned, so the answer does not depend on
    /// `HashMap` iteration order.
    #[must_use]
    pub fn most_common_ext(&self) -> Option<(&str, usize)> {
        self.by_extension
            .iter()
            .max_by(|(ext_a, stats_a), (ext_b, stats_b)| {
                stats_a
                    .file_count
                    .cmp(&stats_b.file_count)
                    // Reversed name comparison: on equal counts the smaller
                    // name ranks as the "greater" candidate and wins.
                    .then_with(|| ext_b.cmp(ext_a))
            })
            .map(|(ext, stats)| (ext.as_str(), stats.file_count))
    }
}
286
/// File counts bucketed by size.
#[derive(Debug, Clone)]
pub struct SizeDistribution {
    /// Bucket boundaries the counts were computed against.
    pub boundaries: Vec<u64>,
    /// Number of files recorded for each bucket.
    pub counts: Vec<u64>,
}

impl SizeDistribution {
    /// Total number of files across all buckets.
    #[must_use]
    pub fn total(&self) -> u64 {
        let mut running = 0u64;
        for &count in &self.counts {
            running += count;
        }
        running
    }
}
307
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: two `hash1` files, two `hash2` files and a lone `hash3` file.
    fn sample_entries() -> Vec<DuplicateEntry> {
        [
            ("/a.mp4", 1_000_000_u64, "hash1"),
            ("/b.mp4", 1_000_000, "hash1"),
            ("/c.mov", 500_000, "hash2"),
            ("/d.mov", 500_000, "hash2"),
            ("/e.wav", 200_000, "hash3"),
        ]
        .iter()
        .map(|&(path, size, digest)| DuplicateEntry::new(PathBuf::from(path), size, digest))
        .collect()
    }

    /// Feeds the fixture entries into `builder` and builds the report.
    fn report_from(mut builder: ReportBuilder) -> ExtendedReport {
        builder.add_entries(sample_entries());
        builder.build()
    }

    #[test]
    fn test_report_builder_basic() {
        let report = report_from(ReportBuilder::new());
        // `hash3` has a single file, so only two groups qualify.
        assert_eq!(report.group_count(), 2);
    }

    #[test]
    fn test_report_builder_title() {
        let report = ReportBuilder::new().title("My Report").build();
        assert_eq!(report.title, "My Report");
    }

    #[test]
    fn test_report_builder_min_group_size() {
        let report = report_from(ReportBuilder::new().min_group_size(3));
        // No digest in the fixture occurs three times.
        assert_eq!(report.group_count(), 0);
    }

    #[test]
    fn test_report_builder_min_file_size() {
        let report = report_from(ReportBuilder::new().min_file_size(600_000));
        // Only the 1_000_000-byte `hash1` files pass the size filter.
        assert_eq!(report.group_count(), 1);
    }

    #[test]
    fn test_recoverable_bytes() {
        let report = report_from(ReportBuilder::new());
        // One redundant copy per group: 1_000_000 + 500_000.
        assert_eq!(report.total_recoverable_bytes, 1_500_000);
    }

    #[test]
    fn test_summary_text() {
        let summary = report_from(ReportBuilder::new().title("Test")).summary_text();
        assert!(summary.contains("Test"));
        assert!(summary.contains("duplicate groups"));
    }

    #[test]
    fn test_format_breakdown() {
        let breakdown = report_from(ReportBuilder::new()).format_breakdown();
        for ext in ["mp4", "mov"].iter() {
            assert!(breakdown.by_extension.contains_key(*ext));
        }
    }

    #[test]
    fn test_most_common_ext() {
        let breakdown = report_from(ReportBuilder::new()).format_breakdown();
        let (ext, count) = breakdown
            .most_common_ext()
            .expect("operation should succeed");
        assert!(count >= 2);
        // Both extensions occur twice, so either is an acceptable winner.
        assert!(ext == "mp4" || ext == "mov");
    }

    #[test]
    fn test_size_distribution() {
        let report = report_from(ReportBuilder::new());
        let dist = report.size_distribution(&[100_000, 750_000, 2_000_000]);
        // Four files belong to duplicate groups.
        assert_eq!(dist.total(), 4);
    }

    #[test]
    fn test_filter_by_path() {
        let mut builder = ReportBuilder::new();
        builder.add_entries(vec![
            DuplicateEntry::new(PathBuf::from("/archive/a.mp4"), 100, "h1"),
            DuplicateEntry::new(PathBuf::from("/other/b.mp4"), 100, "h1"),
        ]);
        let report = builder.build();
        let under_archive = report.filter_by_path(Path::new("/archive"));
        assert_eq!(under_archive.len(), 1);
    }

    #[test]
    fn test_empty_report() {
        let report = ReportBuilder::new().build();
        assert_eq!(report.group_count(), 0);
        assert_eq!(report.total_duplicate_files, 0);
        assert_eq!(report.total_recoverable_bytes, 0);
    }

    #[test]
    fn test_duplicate_entry_extension() {
        let entry = DuplicateEntry::new(PathBuf::from("/foo.MP4"), 0, "x");
        assert_eq!(entry.extension, "mp4");
    }
}