#![allow(dead_code)]
#![allow(clippy::cast_precision_loss)]
use std::collections::HashMap;
/// Action suggested for the non-representative members of a duplicate group.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RecommendedAction {
    /// Delete every duplicate, keeping only the representative.
    DeleteDuplicates,
    /// Replace duplicates with symbolic links to the representative.
    SymlinkDuplicates,
    /// Replace duplicates with hard links to the representative.
    HardlinkDuplicates,
    /// Move duplicates to an archive directory for manual review.
    ArchiveDuplicates,
    /// Confidence is too low for an automated action.
    ManualReview,
    /// Nothing to do (single-member group).
    NoAction,
}

impl RecommendedAction {
    /// Returns the short, stable identifier for this action
    /// (for example `"delete"` or `"manual_review"`).
    #[must_use]
    pub fn label(&self) -> &'static str {
        match *self {
            Self::NoAction => "no_action",
            Self::ManualReview => "manual_review",
            Self::ArchiveDuplicates => "archive",
            Self::HardlinkDuplicates => "hardlink",
            Self::SymlinkDuplicates => "symlink",
            Self::DeleteDuplicates => "delete",
        }
    }

    /// Returns a one-sentence, human-readable explanation of this action.
    #[must_use]
    pub fn description(&self) -> &'static str {
        match *self {
            Self::NoAction => "Single-member group; no action required.",
            Self::ManualReview => {
                "Similarity confidence is insufficient for automated action; review manually."
            }
            Self::ArchiveDuplicates => "Move duplicates to an archive directory for manual review.",
            Self::HardlinkDuplicates => {
                "Replace duplicate files with hard links to the representative."
            }
            Self::SymlinkDuplicates => {
                "Replace duplicate files with symbolic links to the representative."
            }
            Self::DeleteDuplicates => {
                "Delete all duplicate files, keeping only the representative."
            }
        }
    }
}
/// Coarse confidence bucket derived from a similarity score.
///
/// Variants are declared from weakest to strongest so that the derived
/// ordering satisfies `Low < Medium < High < Exact`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ConfidenceTier {
    /// Score below `0.75` (also the result for a NaN score).
    Low,
    /// Score in `[0.75, 0.90)`.
    Medium,
    /// Score in `[0.90, 0.98)`.
    High,
    /// Score of `0.98` or above.
    Exact,
}

impl ConfidenceTier {
    /// Buckets `score` into a tier using inclusive lower bounds of
    /// `0.98` (exact), `0.90` (high) and `0.75` (medium).
    ///
    /// Any score that meets none of the bounds, including NaN, is `Low`.
    #[must_use]
    pub fn from_score(score: f64) -> Self {
        match score {
            s if s >= 0.98 => Self::Exact,
            s if s >= 0.90 => Self::High,
            s if s >= 0.75 => Self::Medium,
            _ => Self::Low,
        }
    }

    /// Returns the lowercase name of the tier.
    #[must_use]
    pub fn label(self) -> &'static str {
        match self {
            Self::Low => "low",
            Self::Medium => "medium",
            Self::High => "high",
            Self::Exact => "exact",
        }
    }
}
/// One file inside a duplicate group.
#[derive(Debug, Clone)]
pub struct FileEntry {
    /// Path of the file, stored as an owned string.
    pub path: String,
    /// File size in bytes.
    pub size_bytes: u64,
    /// Whether this file is the one kept as the group's representative.
    pub is_representative: bool,
    /// Similarity score recorded for this file (by its name, relative to the
    /// group's representative).
    pub similarity_to_rep: f64,
}

impl FileEntry {
    /// Creates an entry that is not yet marked as representative.
    #[must_use]
    pub fn new(path: impl Into<String>, size_bytes: u64, similarity_to_rep: f64) -> Self {
        let path: String = path.into();
        Self {
            similarity_to_rep,
            // The representative flag is assigned later, once the whole group is known.
            is_representative: false,
            size_bytes,
            path,
        }
    }
}
/// Estimate of how much disk space de-duplication would free.
#[derive(Debug, Clone, Default)]
pub struct SpaceSavingsEstimate {
    /// Bytes occupied by non-representative files (could be freed).
    pub reclaimable_bytes: u64,
    /// Bytes occupied by representative files (kept).
    pub retained_bytes: u64,
    /// `reclaimable_bytes / (reclaimable_bytes + retained_bytes)`, or `0.0`
    /// when both are zero.
    pub savings_ratio: f64,
}

impl SpaceSavingsEstimate {
    /// Builds an estimate from the members of one group.
    ///
    /// Entries flagged `is_representative` count as retained; every other
    /// entry counts as reclaimable. Byte totals saturate at `u64::MAX`
    /// instead of overflowing.
    #[must_use]
    pub fn from_entries(entries: &[FileEntry]) -> Self {
        let mut retained_bytes: u64 = 0;
        let mut reclaimable_bytes: u64 = 0;
        for entry in entries {
            let bucket = if entry.is_representative {
                &mut retained_bytes
            } else {
                &mut reclaimable_bytes
            };
            *bucket = bucket.saturating_add(entry.size_bytes);
        }
        let total_bytes = retained_bytes.saturating_add(reclaimable_bytes);
        Self {
            reclaimable_bytes,
            retained_bytes,
            savings_ratio: Self::ratio(reclaimable_bytes, total_bytes),
        }
    }

    /// Adds `other` into `self` and recomputes the ratio from the new totals.
    ///
    /// Sums saturate at `u64::MAX`; a plain `+=` would panic in debug builds
    /// and wrap silently in release builds.
    pub fn merge(&mut self, other: &Self) {
        self.reclaimable_bytes = self.reclaimable_bytes.saturating_add(other.reclaimable_bytes);
        self.retained_bytes = self.retained_bytes.saturating_add(other.retained_bytes);
        let total = self.reclaimable_bytes.saturating_add(self.retained_bytes);
        self.savings_ratio = Self::ratio(self.reclaimable_bytes, total);
    }

    /// Formats the estimate as a single human-readable line.
    #[must_use]
    pub fn description(&self) -> String {
        format!(
            "{} bytes reclaimable / {} bytes retained ({:.1}% savings)",
            self.reclaimable_bytes,
            self.retained_bytes,
            self.savings_ratio * 100.0,
        )
    }

    /// Fraction of `total` that is reclaimable; `0.0` for an empty total so
    /// the division never produces NaN.
    fn ratio(reclaimable: u64, total: u64) -> f64 {
        if total > 0 {
            reclaimable as f64 / total as f64
        } else {
            0.0
        }
    }
}
/// A set of files judged to be duplicates of each other, together with the
/// derived confidence tier, recommended action and space estimate.
#[derive(Debug, Clone)]
pub struct DetailedDuplicateGroup {
    /// Caller-supplied identifier of the group.
    pub id: usize,
    /// Name of the detection method that produced the group.
    pub method: String,
    /// Members of the group, in insertion order.
    pub files: Vec<FileEntry>,
    /// Mean similarity score supplied at construction.
    pub mean_similarity: f64,
    /// Tier derived from `mean_similarity` when the group was created.
    pub confidence_tier: ConfidenceTier,
    /// Action chosen by [`Self::assign_action`]; `NoAction` until then.
    pub action: RecommendedAction,
    /// Result of the last [`Self::compute_space_savings`] call; all zeros until then.
    pub space_savings: SpaceSavingsEstimate,
    /// Free-form key/value annotations.
    pub metadata: HashMap<String, String>,
}

impl DetailedDuplicateGroup {
    /// Creates an empty group whose confidence tier is derived from
    /// `mean_similarity`.
    #[must_use]
    pub fn new(id: usize, method: impl Into<String>, mean_similarity: f64) -> Self {
        Self {
            id,
            method: method.into(),
            files: Vec::new(),
            mean_similarity,
            confidence_tier: ConfidenceTier::from_score(mean_similarity),
            action: RecommendedAction::NoAction,
            space_savings: SpaceSavingsEstimate::default(),
            metadata: HashMap::new(),
        }
    }

    /// Appends a member. Does not refresh `space_savings` or `action`.
    pub fn add_file(&mut self, entry: FileEntry) {
        self.files.push(entry);
    }

    /// Number of files in the group.
    #[must_use]
    pub fn size(&self) -> usize {
        self.files.len()
    }

    /// `true` when the group holds at least two files.
    #[must_use]
    pub fn is_duplicate(&self) -> bool {
        self.files.len() >= 2
    }

    /// Recomputes `space_savings` from the current members and their
    /// representative flags.
    pub fn compute_space_savings(&mut self) {
        self.space_savings = SpaceSavingsEstimate::from_entries(&self.files);
    }

    /// Marks the largest file as the sole representative.
    ///
    /// When several files share the maximum size the last of them wins
    /// (the behaviour of `Iterator::max_by_key`). An empty group is left
    /// untouched.
    pub fn select_largest_representative(&mut self) {
        let largest = self
            .files
            .iter()
            .enumerate()
            .max_by_key(|(_, e)| e.size_bytes)
            .map(|(i, _)| i);
        if let Some(index) = largest {
            self.mark_representative(index);
        }
    }

    /// Marks the file with the highest `similarity_to_rep` as the sole
    /// representative.
    ///
    /// NaN scores rank below every real score, so a NaN entry can only be
    /// chosen when all scores are NaN. Ties go to the last tied file. An
    /// empty group is left untouched.
    pub fn select_highest_similarity_representative(&mut self) {
        let most_similar = self
            .files
            .iter()
            .enumerate()
            .max_by(|(_, a), (_, b)| Self::cmp_similarity(a.similarity_to_rep, b.similarity_to_rep))
            .map(|(i, _)| i);
        if let Some(index) = most_similar {
            self.mark_representative(index);
        }
    }

    /// Chooses `action` from the confidence tier.
    ///
    /// Groups with fewer than two files always get `NoAction`. Otherwise
    /// `Exact` maps to `exact_action`, `High` to `high_action`, and
    /// `Medium`/`Low` to `fallback_action`.
    pub fn assign_action(
        &mut self,
        exact_action: RecommendedAction,
        high_action: RecommendedAction,
        fallback_action: RecommendedAction,
    ) {
        if !self.is_duplicate() {
            self.action = RecommendedAction::NoAction;
            return;
        }
        // Spelled out without a catch-all so a new tier forces a decision here.
        self.action = match self.confidence_tier {
            ConfidenceTier::Exact => exact_action,
            ConfidenceTier::High => high_action,
            ConfidenceTier::Medium | ConfidenceTier::Low => fallback_action,
        };
    }

    /// Inserts or overwrites a metadata entry.
    pub fn set_metadata(&mut self, key: impl Into<String>, value: impl Into<String>) {
        self.metadata.insert(key.into(), value.into());
    }

    /// Path of the first file flagged as representative, if any.
    #[must_use]
    pub fn representative_path(&self) -> Option<&str> {
        self.files
            .iter()
            .find(|e| e.is_representative)
            .map(|e| e.path.as_str())
    }

    /// Sets the representative flag on `index` and clears it everywhere else.
    fn mark_representative(&mut self, index: usize) {
        for (i, file) in self.files.iter_mut().enumerate() {
            file.is_representative = i == index;
        }
    }

    /// Orders similarity scores with NaN as the smallest value.
    ///
    /// Mapping every NaN comparison to `Equal` is not transitive and makes
    /// the winner depend on element order; ranking NaN lowest keeps the
    /// comparison consistent.
    fn cmp_similarity(a: f64, b: f64) -> std::cmp::Ordering {
        use std::cmp::Ordering;
        match (a.is_nan(), b.is_nan()) {
            (true, true) => Ordering::Equal,
            (true, false) => Ordering::Less,
            (false, true) => Ordering::Greater,
            (false, false) => a.partial_cmp(&b).unwrap_or(Ordering::Equal),
        }
    }
}
/// Aggregated result of a de-duplication run.
#[derive(Debug, Clone)]
pub struct DetailedReport {
    /// All groups contained in the report.
    pub groups: Vec<DetailedDuplicateGroup>,
    /// Space estimate accumulated over every group.
    pub total_space_savings: SpaceSavingsEstimate,
    /// Number of files that were examined, as reported by the caller.
    pub total_files_examined: usize,
    /// Number of groups per confidence-tier label.
    pub tier_counts: HashMap<String, usize>,
    /// Number of groups per recommended-action label.
    pub action_counts: HashMap<String, usize>,
}

impl DetailedReport {
    /// Number of groups in the report.
    #[must_use]
    pub fn group_count(&self) -> usize {
        self.groups.len()
    }

    /// Total number of files across all groups (representatives included).
    #[must_use]
    pub fn duplicate_file_count(&self) -> usize {
        self.groups.iter().map(|g| g.size()).sum()
    }

    /// Renders a four-line textual summary of the report.
    ///
    /// The tier and action counters are printed in ascending key order so
    /// the output is identical across runs; debug-printing the `HashMap`s
    /// directly would expose their unspecified iteration order.
    #[must_use]
    pub fn summary(&self) -> String {
        use std::collections::BTreeMap;

        // Sorted views share the `{"key": value, ...}` Debug layout of the maps.
        let tiers: BTreeMap<&str, usize> = self
            .tier_counts
            .iter()
            .map(|(label, count)| (label.as_str(), *count))
            .collect();
        let actions: BTreeMap<&str, usize> = self
            .action_counts
            .iter()
            .map(|(label, count)| (label.as_str(), *count))
            .collect();

        format!(
            "DetailedReport: {} groups | {} duplicate files | {} files examined\n\
             Space: {}\n\
             Tiers: {:?}\n\
             Actions: {:?}",
            self.group_count(),
            self.duplicate_file_count(),
            self.total_files_examined,
            self.total_space_savings.description(),
            tiers,
            actions,
        )
    }
}
/// Chainable builder that collects groups and produces a [`DetailedReport`].
#[derive(Debug, Default)]
pub struct DetailedReportBuilder {
    /// Groups added so far, in insertion order.
    groups: Vec<DetailedDuplicateGroup>,
    /// Examined-file count to copy into the finished report.
    total_files_examined: usize,
}

impl DetailedReportBuilder {
    /// Starts an empty builder (no groups, zero files examined).
    #[must_use]
    pub fn new() -> Self {
        Self {
            groups: Vec::new(),
            total_files_examined: 0,
        }
    }

    /// Records how many files were examined in total.
    #[must_use]
    pub fn total_files_examined(self, n: usize) -> Self {
        Self {
            total_files_examined: n,
            ..self
        }
    }

    /// Appends a fully prepared group as-is.
    #[must_use]
    pub fn add_group(mut self, group: DetailedDuplicateGroup) -> Self {
        self.groups.push(group);
        self
    }

    /// Builds a group from `(path, size)` pairs and appends it.
    ///
    /// Every file receives `mean_similarity` as its own score. The largest
    /// file becomes the representative, the action is chosen as delete
    /// (exact), hardlink (high) or manual review (otherwise), and the space
    /// estimate is computed before the group is stored.
    #[must_use]
    pub fn add_simple_group(
        self,
        id: usize,
        method: impl Into<String>,
        mean_similarity: f64,
        files: Vec<(String, u64)>,
    ) -> Self {
        let mut group = DetailedDuplicateGroup::new(id, method, mean_similarity);
        files
            .into_iter()
            .map(|(path, size_bytes)| FileEntry::new(path, size_bytes, mean_similarity))
            .for_each(|entry| group.add_file(entry));

        // The representative must be chosen before the savings are computed,
        // because the estimate is derived from the representative flags.
        group.select_largest_representative();
        group.assign_action(
            RecommendedAction::DeleteDuplicates,
            RecommendedAction::HardlinkDuplicates,
            RecommendedAction::ManualReview,
        );
        group.compute_space_savings();
        self.add_group(group)
    }

    /// Consumes the builder and aggregates per-group data into a report:
    /// merged space estimate plus group counts per tier label and per
    /// action label.
    #[must_use]
    pub fn build(self) -> DetailedReport {
        let Self {
            groups,
            total_files_examined,
        } = self;

        let mut total_space_savings = SpaceSavingsEstimate::default();
        let mut tier_counts: HashMap<String, usize> = HashMap::new();
        let mut action_counts: HashMap<String, usize> = HashMap::new();

        for g in &groups {
            total_space_savings.merge(&g.space_savings);
            let tier_key = g.confidence_tier.label().to_string();
            *tier_counts.entry(tier_key).or_default() += 1;
            let action_key = g.action.label().to_string();
            *action_counts.entry(action_key).or_default() += 1;
        }

        DetailedReport {
            groups,
            total_space_savings,
            total_files_examined,
            tier_counts,
            action_counts,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a non-representative [`FileEntry`].
    fn entry(path: &str, size: u64, sim: f64) -> FileEntry {
        FileEntry::new(path, size, sim)
    }

    /// Applies the delete / hardlink / manual-review policy used by these tests.
    fn assign_default_policy(group: &mut DetailedDuplicateGroup) {
        group.assign_action(
            RecommendedAction::DeleteDuplicates,
            RecommendedAction::HardlinkDuplicates,
            RecommendedAction::ManualReview,
        );
    }

    #[test]
    fn test_confidence_tier_from_score() {
        let cases = [
            (1.0, ConfidenceTier::Exact),
            (0.98, ConfidenceTier::Exact),
            (0.95, ConfidenceTier::High),
            (0.90, ConfidenceTier::High),
            (0.80, ConfidenceTier::Medium),
            (0.75, ConfidenceTier::Medium),
            (0.50, ConfidenceTier::Low),
            (0.0, ConfidenceTier::Low),
        ];
        for &(score, expected) in cases.iter() {
            assert_eq!(ConfidenceTier::from_score(score), expected);
        }
    }

    #[test]
    fn test_confidence_tier_ordering() {
        assert!(ConfidenceTier::Low < ConfidenceTier::Exact);
        assert!(ConfidenceTier::Medium < ConfidenceTier::High);
    }

    #[test]
    fn test_recommended_action_labels() {
        let cases = [
            (RecommendedAction::DeleteDuplicates, "delete"),
            (RecommendedAction::SymlinkDuplicates, "symlink"),
            (RecommendedAction::HardlinkDuplicates, "hardlink"),
            (RecommendedAction::ArchiveDuplicates, "archive"),
            (RecommendedAction::ManualReview, "manual_review"),
            (RecommendedAction::NoAction, "no_action"),
        ];
        for (action, expected) in cases.iter() {
            assert_eq!(action.label(), *expected);
        }
    }

    #[test]
    fn test_space_savings_from_entries() {
        let kept = FileEntry {
            is_representative: true,
            ..entry("a.mp4", 1000, 1.0)
        };
        let files = vec![kept, entry("b.mp4", 800, 0.95)];
        let est = SpaceSavingsEstimate::from_entries(&files);
        assert_eq!(est.retained_bytes, 1000);
        assert_eq!(est.reclaimable_bytes, 800);
        let expected_ratio = 800.0 / 1800.0;
        assert!((est.savings_ratio - expected_ratio).abs() < 1e-9);
    }

    #[test]
    fn test_space_savings_empty() {
        let no_files: [FileEntry; 0] = [];
        let est = SpaceSavingsEstimate::from_entries(&no_files);
        assert_eq!(est.reclaimable_bytes, 0);
        assert_eq!(est.savings_ratio, 0.0);
    }

    #[test]
    fn test_space_savings_merge() {
        let mut acc = SpaceSavingsEstimate {
            reclaimable_bytes: 500,
            retained_bytes: 1000,
            savings_ratio: 0.333,
        };
        let addend = SpaceSavingsEstimate {
            reclaimable_bytes: 300,
            retained_bytes: 700,
            savings_ratio: 0.3,
        };
        acc.merge(&addend);
        assert_eq!(acc.reclaimable_bytes, 800);
        assert_eq!(acc.retained_bytes, 1700);
        let expected_ratio = 800.0 / 2500.0;
        assert!((acc.savings_ratio - expected_ratio).abs() < 1e-9);
    }

    #[test]
    fn test_group_select_largest_representative() {
        let mut group = DetailedDuplicateGroup::new(0, "phash", 0.95);
        for &(path, size) in [("small.mp4", 100), ("large.mp4", 9000), ("medium.mp4", 500)].iter() {
            group.add_file(entry(path, size, 0.95));
        }
        group.select_largest_representative();
        assert_eq!(group.representative_path(), Some("large.mp4"));
    }

    #[test]
    fn test_group_assign_action_exact() {
        let mut group = DetailedDuplicateGroup::new(0, "hash", 0.999);
        group.add_file(entry("a.mp4", 100, 1.0));
        group.add_file(entry("b.mp4", 100, 1.0));
        assign_default_policy(&mut group);
        assert_eq!(group.action, RecommendedAction::DeleteDuplicates);
    }

    #[test]
    fn test_group_assign_action_low_confidence() {
        let mut group = DetailedDuplicateGroup::new(0, "ssim", 0.65);
        group.add_file(entry("a.mp4", 100, 0.65));
        group.add_file(entry("b.mp4", 100, 0.65));
        assign_default_policy(&mut group);
        assert_eq!(group.action, RecommendedAction::ManualReview);
    }

    #[test]
    fn test_group_single_member_no_action() {
        let mut group = DetailedDuplicateGroup::new(0, "phash", 1.0);
        group.add_file(entry("only.mp4", 500, 1.0));
        assign_default_policy(&mut group);
        assert_eq!(group.action, RecommendedAction::NoAction);
    }

    #[test]
    fn test_group_metadata() {
        let mut group = DetailedDuplicateGroup::new(0, "phash", 0.95);
        group.set_metadata("codec", "h264");
        group.set_metadata("resolution", "1920x1080");
        let codec = group.metadata.get("codec").map(|value| value.as_str());
        assert_eq!(codec, Some("h264"));
        assert_eq!(group.metadata.len(), 2);
    }

    #[test]
    fn test_report_builder_empty() {
        let builder = DetailedReportBuilder::new().total_files_examined(50);
        let report = builder.build();
        assert_eq!(report.total_files_examined, 50);
        assert!(report.groups.is_empty());
        assert_eq!(report.group_count(), 0);
        assert_eq!(report.duplicate_file_count(), 0);
    }

    #[test]
    fn test_report_builder_with_groups() {
        let phash_files = vec![("a.mp4".to_string(), 2000), ("b.mp4".to_string(), 1500)];
        let ssim_files = vec![("c.mp4".to_string(), 1000), ("d.mp4".to_string(), 900)];
        let report = DetailedReportBuilder::new()
            .total_files_examined(200)
            .add_simple_group(0, "phash", 0.96, phash_files)
            .add_simple_group(1, "ssim", 0.82, ssim_files)
            .build();
        assert_eq!(report.group_count(), 2);
        assert_eq!(report.duplicate_file_count(), 4);
        assert!(report.total_space_savings.reclaimable_bytes > 0);
        assert!(!report.summary().is_empty());
    }

    #[test]
    fn test_report_tier_and_action_counts() {
        let exact_files = vec![("a.mp4".to_string(), 1000), ("b.mp4".to_string(), 800)];
        let low_files = vec![("c.mp4".to_string(), 500), ("d.mp4".to_string(), 400)];
        let report = DetailedReportBuilder::new()
            .add_simple_group(0, "phash", 0.999, exact_files)
            .add_simple_group(1, "ssim", 0.60, low_files)
            .build();

        // A missing key counts as zero, exactly like `.copied().unwrap_or(0)`.
        let tier = |label: &str| report.tier_counts.get(label).map_or(0, |n| *n);
        let action = |label: &str| report.action_counts.get(label).map_or(0, |n| *n);

        assert_eq!(tier("exact"), 1);
        assert_eq!(tier("low"), 1);
        assert_eq!(action("delete"), 1);
        assert_eq!(action("manual_review"), 1);
    }
}