Skip to main content

oximedia_dedup/
report.rs

1//! Duplicate detection reports and recommendations.
2//!
3//! This module provides:
4//! - Duplicate group reporting
5//! - Similarity scoring and ranking
6//! - Storage savings estimation
7//! - HTML and JSON export
8//! - Deduplication recommendations
9
10use crate::{DedupError, DedupResult};
11use serde::{Deserialize, Serialize};
12
13/// Duplicate detection report.
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct DuplicateReport {
16    /// Groups of duplicate files
17    pub groups: Vec<DuplicateGroup>,
18
19    /// Total number of duplicates
20    pub total_duplicates: usize,
21
22    /// Total wasted space in bytes
23    pub wasted_space: u64,
24
25    /// Report generation timestamp
26    pub timestamp: i64,
27}
28
29impl DuplicateReport {
30    /// Create a new empty report.
31    #[must_use]
32    pub fn new() -> Self {
33        Self {
34            groups: Vec::new(),
35            total_duplicates: 0,
36            wasted_space: 0,
37            timestamp: current_timestamp(),
38        }
39    }
40
41    /// Add a duplicate group.
42    pub fn add_group(&mut self, group: DuplicateGroup) {
43        if group.files.len() > 1 {
44            self.total_duplicates += group.files.len() - 1;
45            self.wasted_space += group.estimated_savings();
46            self.groups.push(group);
47        }
48    }
49
50    /// Add multiple groups.
51    pub fn add_groups(&mut self, groups: Vec<DuplicateGroup>) {
52        for group in groups {
53            self.add_group(group);
54        }
55    }
56
57    /// Sort groups by wasted space (descending).
58    pub fn sort_by_savings(&mut self) {
59        self.groups
60            .sort_by(|a, b| b.estimated_savings().cmp(&a.estimated_savings()));
61    }
62
63    /// Sort groups by similarity score (descending).
64    pub fn sort_by_similarity(&mut self) {
65        self.groups.sort_by(|a, b| {
66            b.max_similarity()
67                .partial_cmp(&a.max_similarity())
68                .unwrap_or(std::cmp::Ordering::Equal)
69        });
70    }
71
72    /// Filter groups by minimum similarity.
73    pub fn filter_by_similarity(&mut self, threshold: f64) {
74        self.groups.retain(|g| g.max_similarity() >= threshold);
75        self.recalculate_stats();
76    }
77
78    /// Recalculate statistics.
79    fn recalculate_stats(&mut self) {
80        self.total_duplicates = self.groups.iter().map(|g| g.files.len() - 1).sum();
81        self.wasted_space = self.groups.iter().map(|g| g.estimated_savings()).sum();
82    }
83
84    /// Export to JSON.
85    ///
86    /// # Errors
87    ///
88    /// Returns an error if serialization fails.
89    pub fn to_json(&self) -> DedupResult<String> {
90        serde_json::to_string_pretty(self)
91            .map_err(|e| DedupError::Hash(format!("JSON serialization failed: {e}")))
92    }
93
94    /// Export to JSON file.
95    ///
96    /// # Errors
97    ///
98    /// Returns an error if writing fails.
99    pub fn to_json_file(&self, path: impl AsRef<std::path::Path>) -> DedupResult<()> {
100        let json = self.to_json()?;
101        std::fs::write(path, json)?;
102        Ok(())
103    }
104
105    /// Export to HTML report.
106    #[must_use]
107    pub fn to_html(&self) -> String {
108        let mut html = String::from(
109            r#"<!DOCTYPE html>
110<html>
111<head>
112    <meta charset="UTF-8">
113    <title>OxiMedia Duplicate Detection Report</title>
114    <style>
115        body {
116            font-family: Arial, sans-serif;
117            margin: 20px;
118            background-color: #f5f5f5;
119        }
120        h1 {
121            color: #333;
122        }
123        .summary {
124            background-color: white;
125            padding: 20px;
126            border-radius: 8px;
127            margin-bottom: 20px;
128            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
129        }
130        .group {
131            background-color: white;
132            padding: 15px;
133            border-radius: 8px;
134            margin-bottom: 15px;
135            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
136        }
137        .file {
138            padding: 8px;
139            margin: 5px 0;
140            background-color: #f9f9f9;
141            border-left: 3px solid #4CAF50;
142        }
143        .score {
144            display: inline-block;
145            padding: 4px 8px;
146            background-color: #2196F3;
147            color: white;
148            border-radius: 4px;
149            font-size: 12px;
150            margin-left: 10px;
151        }
152        .savings {
153            color: #4CAF50;
154            font-weight: bold;
155        }
156    </style>
157</head>
158<body>
159    <h1>OxiMedia Duplicate Detection Report</h1>
160"#,
161        );
162
163        // Summary section
164        html.push_str(&format!(
165            r#"
166    <div class="summary">
167        <h2>Summary</h2>
168        <p><strong>Total Duplicate Groups:</strong> {}</p>
169        <p><strong>Total Duplicate Files:</strong> {}</p>
170        <p class="savings"><strong>Potential Storage Savings:</strong> {}</p>
171        <p><strong>Generated:</strong> {}</p>
172    </div>
173"#,
174            self.groups.len(),
175            self.total_duplicates,
176            format_bytes(self.wasted_space),
177            format_timestamp(self.timestamp)
178        ));
179
180        // Duplicate groups
181        html.push_str("    <h2>Duplicate Groups</h2>\n");
182
183        for (i, group) in self.groups.iter().enumerate() {
184            html.push_str(&format!(
185                r#"
186    <div class="group">
187        <h3>Group {} <span class="score">Similarity: {:.1}%</span> <span class="savings">Savings: {}</span></h3>
188"#,
189                i + 1,
190                group.max_similarity() * 100.0,
191                format_bytes(group.estimated_savings())
192            ));
193
194            for file in &group.files {
195                html.push_str(&format!(
196                    r#"        <div class="file">{}</div>
197"#,
198                    html_escape(file)
199                ));
200            }
201
202            if !group.scores.is_empty() {
203                html.push_str("        <p><strong>Similarity Details:</strong></p>\n");
204                html.push_str("        <ul>\n");
205                for score in &group.scores {
206                    html.push_str(&format!(
207                        "            <li>{}: {:.1}%</li>\n",
208                        score.method,
209                        score.score * 100.0
210                    ));
211                }
212                html.push_str("        </ul>\n");
213            }
214
215            html.push_str("    </div>\n");
216        }
217
218        html.push_str(
219            r#"
220</body>
221</html>
222"#,
223        );
224
225        html
226    }
227
228    /// Export to HTML file.
229    ///
230    /// # Errors
231    ///
232    /// Returns an error if writing fails.
233    pub fn to_html_file(&self, path: impl AsRef<std::path::Path>) -> DedupResult<()> {
234        let html = self.to_html();
235        std::fs::write(path, html)?;
236        Ok(())
237    }
238
239    /// Get total number of groups.
240    #[must_use]
241    pub fn group_count(&self) -> usize {
242        self.groups.len()
243    }
244
245    /// Get recommendations for deduplication.
246    #[must_use]
247    pub fn get_recommendations(&self) -> Vec<Recommendation> {
248        let mut recommendations = Vec::new();
249
250        for group in &self.groups {
251            if let Some(rec) = group.recommend_action() {
252                recommendations.push(rec);
253            }
254        }
255
256        // Sort by priority (savings)
257        recommendations.sort_by(|a, b| {
258            b.priority
259                .partial_cmp(&a.priority)
260                .unwrap_or(std::cmp::Ordering::Equal)
261        });
262
263        recommendations
264    }
265}
266
267impl Default for DuplicateReport {
268    fn default() -> Self {
269        Self::new()
270    }
271}
272
273/// Group of duplicate files.
274#[derive(Debug, Clone, Serialize, Deserialize)]
275pub struct DuplicateGroup {
276    /// File paths in this group
277    pub files: Vec<String>,
278
279    /// Similarity scores
280    pub scores: Vec<SimilarityScore>,
281}
282
283impl DuplicateGroup {
284    /// Create a new duplicate group.
285    #[must_use]
286    pub fn new(files: Vec<String>) -> Self {
287        Self {
288            files,
289            scores: Vec::new(),
290        }
291    }
292
293    /// Add a similarity score.
294    pub fn add_score(&mut self, score: SimilarityScore) {
295        self.scores.push(score);
296    }
297
298    /// Get maximum similarity score.
299    #[must_use]
300    pub fn max_similarity(&self) -> f64 {
301        self.scores.iter().map(|s| s.score).fold(0.0f64, f64::max)
302    }
303
304    /// Get average similarity score.
305    #[must_use]
306    pub fn avg_similarity(&self) -> f64 {
307        if self.scores.is_empty() {
308            return 0.0;
309        }
310        let sum: f64 = self.scores.iter().map(|s| s.score).sum();
311        sum / self.scores.len() as f64
312    }
313
314    /// Estimate storage savings if duplicates are removed.
315    #[must_use]
316    pub fn estimated_savings(&self) -> u64 {
317        if self.files.len() <= 1 {
318            return 0;
319        }
320
321        // Calculate total size of all files
322        let mut total_size = 0u64;
323        for file in &self.files {
324            if let Ok(metadata) = std::fs::metadata(file) {
325                total_size += metadata.len();
326            }
327        }
328
329        // Savings = total - largest file
330        let mut largest = 0u64;
331        for file in &self.files {
332            if let Ok(metadata) = std::fs::metadata(file) {
333                largest = largest.max(metadata.len());
334            }
335        }
336
337        total_size.saturating_sub(largest)
338    }
339
340    /// Recommend which files to keep/delete.
341    #[must_use]
342    pub fn recommend_action(&self) -> Option<Recommendation> {
343        if self.files.len() <= 1 {
344            return None;
345        }
346
347        // Recommend keeping the file with the best quality indicators:
348        // 1. Shortest path (likely original location)
349        // 2. Largest file size (likely highest quality)
350        // 3. Most recent modification time
351
352        let mut best_file = None;
353        let mut best_score = 0.0f64;
354
355        for file in &self.files {
356            let mut score = 0.0;
357
358            // Shorter path is better
359            let path_score = 1.0 / (file.len() as f64 + 1.0);
360            score += path_score * 0.3;
361
362            // Larger size is better
363            if let Ok(metadata) = std::fs::metadata(file) {
364                score += (metadata.len() as f64 / 1_000_000.0).min(1.0) * 0.4;
365
366                // More recent modification is better
367                if let Ok(modified) = metadata.modified() {
368                    if let Ok(duration) = modified.duration_since(std::time::UNIX_EPOCH) {
369                        let age_days = (current_timestamp() - duration.as_secs() as i64) / 86400;
370                        score += (1.0 / (age_days as f64 + 1.0)) * 0.3;
371                    }
372                }
373            }
374
375            if score > best_score {
376                best_score = score;
377                best_file = Some(file.clone());
378            }
379        }
380
381        let keep_file = best_file?;
382        let delete_files: Vec<String> = self
383            .files
384            .iter()
385            .filter(|f| *f != &keep_file)
386            .cloned()
387            .collect();
388
389        Some(Recommendation {
390            action: RecommendationAction::DeleteDuplicates,
391            keep_file,
392            delete_files,
393            reason: format!(
394                "Keep the best quality file, remove {} duplicate(s)",
395                self.files.len() - 1
396            ),
397            priority: self.estimated_savings() as f64,
398        })
399    }
400}
401
402/// Similarity score with metadata.
403#[derive(Debug, Clone, Serialize, Deserialize)]
404pub struct SimilarityScore {
405    /// Detection method name
406    pub method: String,
407
408    /// Similarity score (0.0-1.0)
409    pub score: f64,
410
411    /// Additional metadata
412    pub metadata: Vec<(String, String)>,
413}
414
415impl SimilarityScore {
416    /// Create a new similarity score.
417    #[must_use]
418    pub fn new(method: String, score: f64) -> Self {
419        Self {
420            method,
421            score,
422            metadata: Vec::new(),
423        }
424    }
425
426    /// Add metadata.
427    pub fn add_metadata(&mut self, key: String, value: String) {
428        self.metadata.push((key, value));
429    }
430}
431
432/// Deduplication recommendation.
433#[derive(Debug, Clone, Serialize, Deserialize)]
434pub struct Recommendation {
435    /// Recommended action
436    pub action: RecommendationAction,
437
438    /// File to keep
439    pub keep_file: String,
440
441    /// Files to delete
442    pub delete_files: Vec<String>,
443
444    /// Reason for recommendation
445    pub reason: String,
446
447    /// Priority score (higher = more important)
448    pub priority: f64,
449}
450
451/// Recommended action type.
452#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
453pub enum RecommendationAction {
454    /// Delete duplicate files
455    DeleteDuplicates,
456
457    /// Create symbolic links
458    CreateSymlinks,
459
460    /// Move to archive
461    Archive,
462
463    /// Manual review needed
464    ManualReview,
465}
466
/// Seconds elapsed since the Unix epoch according to the system clock.
///
/// Yields `0` if the clock reports a time before the epoch.
fn current_timestamp() -> i64 {
    let now = std::time::SystemTime::now();
    match now.duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
474
/// Format a Unix timestamp (seconds since the epoch) as a readable string.
///
/// The output is the `Debug` rendering of the corresponding
/// [`std::time::SystemTime`], so its exact shape is platform-dependent.
/// Negative timestamps are interpreted as instants before the epoch. If the
/// instant cannot be represented on this platform, the raw second count is
/// returned instead of panicking.
fn format_timestamp(timestamp: i64) -> String {
    // Simple formatting - in production, use chrono or similar
    let offset = std::time::Duration::from_secs(timestamp.unsigned_abs());

    // Casting a negative value to `u64` would yield an enormous duration and
    // make `UNIX_EPOCH + duration` panic, so step backwards from the epoch
    // instead and use the checked operations in both directions.
    let datetime = if timestamp >= 0 {
        std::time::UNIX_EPOCH.checked_add(offset)
    } else {
        std::time::UNIX_EPOCH.checked_sub(offset)
    };

    match datetime {
        Some(datetime) => format!("{:?}", datetime),
        None => format!("{timestamp} seconds since the Unix epoch"),
    }
}
481
/// Render a byte count using binary units (KB = 1024 bytes, up to TB).
///
/// Values of at least one KB are shown with two decimals in the largest unit
/// that does not exceed them; smaller values are shown as a plain byte count.
fn format_bytes(bytes: u64) -> String {
    // Largest unit first so the first match is the right one.
    const UNITS: [(u64, &str); 4] = [
        (1 << 40, "TB"),
        (1 << 30, "GB"),
        (1 << 20, "MB"),
        (1 << 10, "KB"),
    ];

    match UNITS.iter().find(|&&(size, _)| bytes >= size) {
        Some(&(size, label)) => format!("{:.2} {}", bytes as f64 / size as f64, label),
        None => format!("{} bytes", bytes),
    }
}
501
/// Replace the HTML-significant characters `&`, `<`, `>`, `"` and `'` with
/// their entity forms, leaving every other character unchanged.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
510
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a score-less group from path literals.
    fn group_of(paths: &[&str]) -> DuplicateGroup {
        DuplicateGroup::new(paths.iter().map(|p| (*p).to_string()).collect())
    }

    /// Build a group carrying a single score from the "test" method.
    fn scored_group(paths: &[&str], score: f64) -> DuplicateGroup {
        let mut group = group_of(paths);
        group.add_score(SimilarityScore::new("test".to_string(), score));
        group
    }

    /// Report holding one two-file group, shared by the export tests.
    fn report_with_one_pair() -> DuplicateReport {
        let mut report = DuplicateReport::new();
        report.add_group(group_of(&["file1.mp4", "file2.mp4"]));
        report
    }

    // A fresh report starts empty.
    #[test]
    fn test_report_creation() {
        let report = DuplicateReport::new();
        assert!(report.groups.is_empty());
        assert_eq!(report.total_duplicates, 0);
    }

    // Adding a two-file group records one group and one duplicate.
    #[test]
    fn test_add_group() {
        let report = report_with_one_pair();

        assert_eq!(report.groups.len(), 1);
        assert_eq!(report.total_duplicates, 1);
    }

    // Max and average similarity reflect the scores that were added.
    #[test]
    fn test_duplicate_group() {
        let mut group = group_of(&["file1.mp4", "file2.mp4", "file3.mp4"]);
        assert_eq!(group.files.len(), 3);

        for (method, value) in [("hash", 1.0), ("phash", 0.95)] {
            group.add_score(SimilarityScore::new(method.to_string(), value));
        }

        assert_eq!(group.max_similarity(), 1.0);
        assert!((group.avg_similarity() - 0.975).abs() < 0.001);
    }

    // Constructor stores its arguments; metadata entries accumulate.
    #[test]
    fn test_similarity_score() {
        let mut score = SimilarityScore::new("test".to_string(), 0.95);
        assert_eq!(score.method, "test");
        assert_eq!(score.score, 0.95);

        score.add_metadata("key".to_string(), "value".to_string());
        assert_eq!(score.metadata.len(), 1);
    }

    // One sample per unit, each exactly at the unit boundary (except bytes).
    #[test]
    fn test_format_bytes() {
        let cases: [(u64, &str); 5] = [
            (500, "500 bytes"),
            (1024, "1.00 KB"),
            (1024 * 1024, "1.00 MB"),
            (1024 * 1024 * 1024, "1.00 GB"),
            (1024u64 * 1024 * 1024 * 1024, "1.00 TB"),
        ];
        for (input, expected) in cases {
            assert_eq!(format_bytes(input), expected);
        }
    }

    // Plain text passes through; markup characters become entities.
    #[test]
    fn test_html_escape() {
        let cases = [
            ("test", "test"),
            ("<script>", "&lt;script&gt;"),
            ("a & b", "a &amp; b"),
            ("\"quoted\"", "&quot;quoted&quot;"),
        ];
        for (input, expected) in cases {
            assert_eq!(html_escape(input), expected);
        }
    }

    // JSON output mentions every file of the group.
    #[test]
    fn test_json_export() {
        let json = report_with_one_pair()
            .to_json()
            .expect("operation should succeed");

        for name in ["file1.mp4", "file2.mp4"] {
            assert!(json.contains(name));
        }
    }

    // HTML output is a full document that mentions every file of the group.
    #[test]
    fn test_html_export() {
        let html = report_with_one_pair().to_html();

        for needle in ["<!DOCTYPE html>", "file1.mp4", "file2.mp4"] {
            assert!(html.contains(needle));
        }
    }

    // Sorting puts the most similar group first.
    #[test]
    fn test_sort_by_similarity() {
        let mut report = DuplicateReport::new();
        report.add_group(scored_group(&["a", "b"], 0.9));
        report.add_group(scored_group(&["c", "d"], 0.95));

        report.sort_by_similarity();

        assert_eq!(report.groups[0].max_similarity(), 0.95);
        assert_eq!(report.groups[1].max_similarity(), 0.9);
    }

    // Groups below the threshold are dropped.
    #[test]
    fn test_filter_by_similarity() {
        let mut report = DuplicateReport::new();
        report.add_group(scored_group(&["a", "b"], 0.7));
        report.add_group(scored_group(&["c", "d"], 0.95));

        report.filter_by_similarity(0.8);

        assert_eq!(report.groups.len(), 1);
        assert_eq!(report.groups[0].max_similarity(), 0.95);
    }
}