1use crate::{DedupError, DedupResult};
11use serde::{Deserialize, Serialize};
12
13#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct DuplicateReport {
16 pub groups: Vec<DuplicateGroup>,
18
19 pub total_duplicates: usize,
21
22 pub wasted_space: u64,
24
25 pub timestamp: i64,
27}
28
29impl DuplicateReport {
30 #[must_use]
32 pub fn new() -> Self {
33 Self {
34 groups: Vec::new(),
35 total_duplicates: 0,
36 wasted_space: 0,
37 timestamp: current_timestamp(),
38 }
39 }
40
41 pub fn add_group(&mut self, group: DuplicateGroup) {
43 if group.files.len() > 1 {
44 self.total_duplicates += group.files.len() - 1;
45 self.wasted_space += group.estimated_savings();
46 self.groups.push(group);
47 }
48 }
49
50 pub fn add_groups(&mut self, groups: Vec<DuplicateGroup>) {
52 for group in groups {
53 self.add_group(group);
54 }
55 }
56
57 pub fn sort_by_savings(&mut self) {
59 self.groups
60 .sort_by(|a, b| b.estimated_savings().cmp(&a.estimated_savings()));
61 }
62
63 pub fn sort_by_similarity(&mut self) {
65 self.groups.sort_by(|a, b| {
66 b.max_similarity()
67 .partial_cmp(&a.max_similarity())
68 .unwrap_or(std::cmp::Ordering::Equal)
69 });
70 }
71
72 pub fn filter_by_similarity(&mut self, threshold: f64) {
74 self.groups.retain(|g| g.max_similarity() >= threshold);
75 self.recalculate_stats();
76 }
77
78 fn recalculate_stats(&mut self) {
80 self.total_duplicates = self.groups.iter().map(|g| g.files.len() - 1).sum();
81 self.wasted_space = self.groups.iter().map(|g| g.estimated_savings()).sum();
82 }
83
84 pub fn to_json(&self) -> DedupResult<String> {
90 serde_json::to_string_pretty(self)
91 .map_err(|e| DedupError::Hash(format!("JSON serialization failed: {e}")))
92 }
93
94 pub fn to_json_file(&self, path: impl AsRef<std::path::Path>) -> DedupResult<()> {
100 let json = self.to_json()?;
101 std::fs::write(path, json)?;
102 Ok(())
103 }
104
105 #[must_use]
107 pub fn to_html(&self) -> String {
108 let mut html = String::from(
109 r#"<!DOCTYPE html>
110<html>
111<head>
112 <meta charset="UTF-8">
113 <title>OxiMedia Duplicate Detection Report</title>
114 <style>
115 body {
116 font-family: Arial, sans-serif;
117 margin: 20px;
118 background-color: #f5f5f5;
119 }
120 h1 {
121 color: #333;
122 }
123 .summary {
124 background-color: white;
125 padding: 20px;
126 border-radius: 8px;
127 margin-bottom: 20px;
128 box-shadow: 0 2px 4px rgba(0,0,0,0.1);
129 }
130 .group {
131 background-color: white;
132 padding: 15px;
133 border-radius: 8px;
134 margin-bottom: 15px;
135 box-shadow: 0 2px 4px rgba(0,0,0,0.1);
136 }
137 .file {
138 padding: 8px;
139 margin: 5px 0;
140 background-color: #f9f9f9;
141 border-left: 3px solid #4CAF50;
142 }
143 .score {
144 display: inline-block;
145 padding: 4px 8px;
146 background-color: #2196F3;
147 color: white;
148 border-radius: 4px;
149 font-size: 12px;
150 margin-left: 10px;
151 }
152 .savings {
153 color: #4CAF50;
154 font-weight: bold;
155 }
156 </style>
157</head>
158<body>
159 <h1>OxiMedia Duplicate Detection Report</h1>
160"#,
161 );
162
163 html.push_str(&format!(
165 r#"
166 <div class="summary">
167 <h2>Summary</h2>
168 <p><strong>Total Duplicate Groups:</strong> {}</p>
169 <p><strong>Total Duplicate Files:</strong> {}</p>
170 <p class="savings"><strong>Potential Storage Savings:</strong> {}</p>
171 <p><strong>Generated:</strong> {}</p>
172 </div>
173"#,
174 self.groups.len(),
175 self.total_duplicates,
176 format_bytes(self.wasted_space),
177 format_timestamp(self.timestamp)
178 ));
179
180 html.push_str(" <h2>Duplicate Groups</h2>\n");
182
183 for (i, group) in self.groups.iter().enumerate() {
184 html.push_str(&format!(
185 r#"
186 <div class="group">
187 <h3>Group {} <span class="score">Similarity: {:.1}%</span> <span class="savings">Savings: {}</span></h3>
188"#,
189 i + 1,
190 group.max_similarity() * 100.0,
191 format_bytes(group.estimated_savings())
192 ));
193
194 for file in &group.files {
195 html.push_str(&format!(
196 r#" <div class="file">{}</div>
197"#,
198 html_escape(file)
199 ));
200 }
201
202 if !group.scores.is_empty() {
203 html.push_str(" <p><strong>Similarity Details:</strong></p>\n");
204 html.push_str(" <ul>\n");
205 for score in &group.scores {
206 html.push_str(&format!(
207 " <li>{}: {:.1}%</li>\n",
208 score.method,
209 score.score * 100.0
210 ));
211 }
212 html.push_str(" </ul>\n");
213 }
214
215 html.push_str(" </div>\n");
216 }
217
218 html.push_str(
219 r#"
220</body>
221</html>
222"#,
223 );
224
225 html
226 }
227
228 pub fn to_html_file(&self, path: impl AsRef<std::path::Path>) -> DedupResult<()> {
234 let html = self.to_html();
235 std::fs::write(path, html)?;
236 Ok(())
237 }
238
239 #[must_use]
241 pub fn group_count(&self) -> usize {
242 self.groups.len()
243 }
244
245 #[must_use]
247 pub fn get_recommendations(&self) -> Vec<Recommendation> {
248 let mut recommendations = Vec::new();
249
250 for group in &self.groups {
251 if let Some(rec) = group.recommend_action() {
252 recommendations.push(rec);
253 }
254 }
255
256 recommendations.sort_by(|a, b| {
258 b.priority
259 .partial_cmp(&a.priority)
260 .unwrap_or(std::cmp::Ordering::Equal)
261 });
262
263 recommendations
264 }
265}
266
267impl Default for DuplicateReport {
268 fn default() -> Self {
269 Self::new()
270 }
271}
272
273#[derive(Debug, Clone, Serialize, Deserialize)]
275pub struct DuplicateGroup {
276 pub files: Vec<String>,
278
279 pub scores: Vec<SimilarityScore>,
281}
282
283impl DuplicateGroup {
284 #[must_use]
286 pub fn new(files: Vec<String>) -> Self {
287 Self {
288 files,
289 scores: Vec::new(),
290 }
291 }
292
293 pub fn add_score(&mut self, score: SimilarityScore) {
295 self.scores.push(score);
296 }
297
298 #[must_use]
300 pub fn max_similarity(&self) -> f64 {
301 self.scores.iter().map(|s| s.score).fold(0.0f64, f64::max)
302 }
303
304 #[must_use]
306 pub fn avg_similarity(&self) -> f64 {
307 if self.scores.is_empty() {
308 return 0.0;
309 }
310 let sum: f64 = self.scores.iter().map(|s| s.score).sum();
311 sum / self.scores.len() as f64
312 }
313
314 #[must_use]
316 pub fn estimated_savings(&self) -> u64 {
317 if self.files.len() <= 1 {
318 return 0;
319 }
320
321 let mut total_size = 0u64;
323 for file in &self.files {
324 if let Ok(metadata) = std::fs::metadata(file) {
325 total_size += metadata.len();
326 }
327 }
328
329 let mut largest = 0u64;
331 for file in &self.files {
332 if let Ok(metadata) = std::fs::metadata(file) {
333 largest = largest.max(metadata.len());
334 }
335 }
336
337 total_size.saturating_sub(largest)
338 }
339
340 #[must_use]
342 pub fn recommend_action(&self) -> Option<Recommendation> {
343 if self.files.len() <= 1 {
344 return None;
345 }
346
347 let mut best_file = None;
353 let mut best_score = 0.0f64;
354
355 for file in &self.files {
356 let mut score = 0.0;
357
358 let path_score = 1.0 / (file.len() as f64 + 1.0);
360 score += path_score * 0.3;
361
362 if let Ok(metadata) = std::fs::metadata(file) {
364 score += (metadata.len() as f64 / 1_000_000.0).min(1.0) * 0.4;
365
366 if let Ok(modified) = metadata.modified() {
368 if let Ok(duration) = modified.duration_since(std::time::UNIX_EPOCH) {
369 let age_days = (current_timestamp() - duration.as_secs() as i64) / 86400;
370 score += (1.0 / (age_days as f64 + 1.0)) * 0.3;
371 }
372 }
373 }
374
375 if score > best_score {
376 best_score = score;
377 best_file = Some(file.clone());
378 }
379 }
380
381 let keep_file = best_file?;
382 let delete_files: Vec<String> = self
383 .files
384 .iter()
385 .filter(|f| *f != &keep_file)
386 .cloned()
387 .collect();
388
389 Some(Recommendation {
390 action: RecommendationAction::DeleteDuplicates,
391 keep_file,
392 delete_files,
393 reason: format!(
394 "Keep the best quality file, remove {} duplicate(s)",
395 self.files.len() - 1
396 ),
397 priority: self.estimated_savings() as f64,
398 })
399 }
400}
401
402#[derive(Debug, Clone, Serialize, Deserialize)]
404pub struct SimilarityScore {
405 pub method: String,
407
408 pub score: f64,
410
411 pub metadata: Vec<(String, String)>,
413}
414
415impl SimilarityScore {
416 #[must_use]
418 pub fn new(method: String, score: f64) -> Self {
419 Self {
420 method,
421 score,
422 metadata: Vec::new(),
423 }
424 }
425
426 pub fn add_metadata(&mut self, key: String, value: String) {
428 self.metadata.push((key, value));
429 }
430}
431
432#[derive(Debug, Clone, Serialize, Deserialize)]
434pub struct Recommendation {
435 pub action: RecommendationAction,
437
438 pub keep_file: String,
440
441 pub delete_files: Vec<String>,
443
444 pub reason: String,
446
447 pub priority: f64,
449}
450
451#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
453pub enum RecommendationAction {
454 DeleteDuplicates,
456
457 CreateSymlinks,
459
460 Archive,
462
463 ManualReview,
465}
466
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// If the system clock reads earlier than the epoch, the elapsed duration
/// falls back to zero and the function returns `0`.
fn current_timestamp() -> i64 {
    let elapsed = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch,
        Err(_) => std::time::Duration::default(),
    };
    elapsed.as_secs() as i64
}
474
/// Formats a Unix timestamp (seconds since the epoch) as
/// `YYYY-MM-DD HH:MM:SS UTC`.
///
/// The conversion is pure integer arithmetic on the proleptic Gregorian
/// calendar, so it accepts any `i64`, including timestamps before 1970, and
/// never panics. Its output does not depend on the platform's `Debug`
/// rendering of `SystemTime`.
fn format_timestamp(timestamp: i64) -> String {
    const SECS_PER_DAY: i64 = 86_400;

    // Euclidean division keeps the time of day non-negative before 1970.
    let days = timestamp.div_euclid(SECS_PER_DAY);
    let secs_of_day = timestamp.rem_euclid(SECS_PER_DAY);

    // Days-to-civil-date conversion over 400-year eras, with the era starting
    // on 0000-03-01 so that the leap day is the last day of the shifted year.
    let shifted = days + 719_468;
    let era = shifted.div_euclid(146_097);
    let day_of_era = shifted.rem_euclid(146_097);
    let year_of_era =
        (day_of_era - day_of_era / 1_460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);
    let shifted_month = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * shifted_month + 2) / 5 + 1;
    let month = if shifted_month < 10 {
        shifted_month + 3
    } else {
        shifted_month - 9
    };
    // January and February belong to the following civil year.
    let year = year_of_era + era * 400 + i64::from(month <= 2);

    format!(
        "{year:04}-{month:02}-{day:02} {:02}:{:02}:{:02} UTC",
        secs_of_day / 3_600,
        secs_of_day % 3_600 / 60,
        secs_of_day % 60
    )
}
481
/// Renders a byte count using binary units (1 KB = 1024 bytes).
///
/// The largest unit that does not exceed `bytes` is used, with two decimal
/// places. Values below 1024 are printed as a whole number of bytes.
fn format_bytes(bytes: u64) -> String {
    // Ordered from the largest unit down, so the first match is the one to use.
    const UNITS: [(u64, &str); 4] = [
        (1 << 40, "TB"),
        (1 << 30, "GB"),
        (1 << 20, "MB"),
        (1 << 10, "KB"),
    ];

    match UNITS.iter().find(|(threshold, _)| bytes >= *threshold) {
        Some((threshold, suffix)) => {
            format!("{:.2} {}", bytes as f64 / *threshold as f64, suffix)
        }
        None => format!("{} bytes", bytes),
    }
}
501
/// Escapes the five characters that are significant in HTML text and
/// attribute values: `&`, `<`, `>`, `"` and `'`.
///
/// The input is scanned once, so the `&` that starts an emitted entity is
/// never escaped a second time.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
510
#[cfg(test)]
mod tests {
    use super::*;

    /// A new report starts with no groups and zeroed counters.
    #[test]
    fn test_report_creation() {
        let report = DuplicateReport::new();
        assert_eq!(report.groups.len(), 0);
        assert_eq!(report.total_duplicates, 0);
    }

    /// A two-file group counts as one duplicate.
    #[test]
    fn test_add_group() {
        let mut report = DuplicateReport::new();

        let group = DuplicateGroup::new(vec!["file1.mp4".to_string(), "file2.mp4".to_string()]);

        report.add_group(group);

        assert_eq!(report.groups.len(), 1);
        assert_eq!(report.total_duplicates, 1);
    }

    /// Maximum and average similarity are derived from the attached scores.
    #[test]
    fn test_duplicate_group() {
        let mut group = DuplicateGroup::new(vec![
            "file1.mp4".to_string(),
            "file2.mp4".to_string(),
            "file3.mp4".to_string(),
        ]);

        assert_eq!(group.files.len(), 3);

        group.add_score(SimilarityScore::new("hash".to_string(), 1.0));
        group.add_score(SimilarityScore::new("phash".to_string(), 0.95));

        assert_eq!(group.max_similarity(), 1.0);
        assert!((group.avg_similarity() - 0.975).abs() < 0.001);
    }

    /// Scores keep their method and value, and accept metadata entries.
    #[test]
    fn test_similarity_score() {
        let mut score = SimilarityScore::new("test".to_string(), 0.95);
        assert_eq!(score.method, "test");
        assert_eq!(score.score, 0.95);

        score.add_metadata("key".to_string(), "value".to_string());
        assert_eq!(score.metadata.len(), 1);
    }

    /// Each unit boundary switches to the next larger binary unit.
    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(500), "500 bytes");
        assert_eq!(format_bytes(1024), "1.00 KB");
        assert_eq!(format_bytes(1024 * 1024), "1.00 MB");
        assert_eq!(format_bytes(1024 * 1024 * 1024), "1.00 GB");
        assert_eq!(format_bytes(1024u64 * 1024 * 1024 * 1024), "1.00 TB");
    }

    /// Markup-significant characters come back as HTML entities.
    #[test]
    fn test_html_escape() {
        assert_eq!(html_escape("test"), "test");
        assert_eq!(html_escape("<script>"), "&lt;script&gt;");
        assert_eq!(html_escape("a & b"), "a &amp; b");
        assert_eq!(html_escape("\"quoted\""), "&quot;quoted&quot;");
    }

    /// The JSON export contains every file path.
    #[test]
    fn test_json_export() {
        let mut report = DuplicateReport::new();
        let group = DuplicateGroup::new(vec!["file1.mp4".to_string(), "file2.mp4".to_string()]);
        report.add_group(group);

        let json = report.to_json().expect("operation should succeed");
        assert!(json.contains("file1.mp4"));
        assert!(json.contains("file2.mp4"));
    }

    /// The HTML export is a full document that lists every file path.
    #[test]
    fn test_html_export() {
        let mut report = DuplicateReport::new();
        let group = DuplicateGroup::new(vec!["file1.mp4".to_string(), "file2.mp4".to_string()]);
        report.add_group(group);

        let html = report.to_html();
        assert!(html.contains("<!DOCTYPE html>"));
        assert!(html.contains("file1.mp4"));
        assert!(html.contains("file2.mp4"));
    }

    /// Sorting by similarity puts the highest-scoring group first.
    #[test]
    fn test_sort_by_similarity() {
        let mut report = DuplicateReport::new();

        let mut group1 = DuplicateGroup::new(vec!["a".to_string(), "b".to_string()]);
        group1.add_score(SimilarityScore::new("test".to_string(), 0.9));

        let mut group2 = DuplicateGroup::new(vec!["c".to_string(), "d".to_string()]);
        group2.add_score(SimilarityScore::new("test".to_string(), 0.95));

        report.add_group(group1);
        report.add_group(group2);

        report.sort_by_similarity();

        assert_eq!(report.groups[0].max_similarity(), 0.95);
        assert_eq!(report.groups[1].max_similarity(), 0.9);
    }

    /// Filtering removes groups below the threshold.
    #[test]
    fn test_filter_by_similarity() {
        let mut report = DuplicateReport::new();

        let mut group1 = DuplicateGroup::new(vec!["a".to_string(), "b".to_string()]);
        group1.add_score(SimilarityScore::new("test".to_string(), 0.7));

        let mut group2 = DuplicateGroup::new(vec!["c".to_string(), "d".to_string()]);
        group2.add_score(SimilarityScore::new("test".to_string(), 0.95));

        report.add_group(group1);
        report.add_group(group2);

        report.filter_by_similarity(0.8);

        assert_eq!(report.groups.len(), 1);
        assert_eq!(report.groups[0].max_similarity(), 0.95);
    }
}