#![allow(dead_code)]
/// Action that can be taken for a file identified as a duplicate.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DedupAction {
    /// Delete the duplicate.
    Delete,
    /// Move the duplicate to quarantine.
    Quarantine,
    /// Replace the duplicate with a symlink.
    Symlink,
    /// Keep both copies.
    Keep,
    /// Flag the duplicate for review.
    Review,
    /// Skip the duplicate (log only).
    Skip,
}

impl DedupAction {
    /// Returns `true` for the actions classified as destructive, i.e.
    /// [`Self::Delete`] and [`Self::Quarantine`]; every other action yields
    /// `false`.
    #[must_use]
    pub const fn is_destructive(self) -> bool {
        // Exhaustive on purpose: adding a variant forces a decision here.
        match self {
            Self::Delete | Self::Quarantine => true,
            Self::Symlink | Self::Keep | Self::Review | Self::Skip => false,
        }
    }

    /// Returns a short, static, human-readable label for the action.
    #[must_use]
    pub const fn description(self) -> &'static str {
        let label = match self {
            Self::Skip => "skip / log only",
            Self::Review => "flag for review",
            Self::Keep => "keep both copies",
            Self::Symlink => "replace with symlink",
            Self::Quarantine => "move to quarantine",
            Self::Delete => "delete duplicate",
        };
        label
    }
}
/// Settings that drive the pairwise deduplication policy.
#[derive(Debug, Clone)]
pub struct DedupPolicyConfig {
    /// Strict-mode flag, readable through [`DedupPolicyConfig::strict_mode`].
    // NOTE(review): the policy code visible in this file produces the same
    // decision regardless of this flag — confirm the intended semantics.
    pub strict_mode: bool,
    /// Similarity threshold; candidates scoring below it are skipped.
    pub min_similarity: f64,
    /// Action applied when the similarity is exactly `1.0`.
    pub exact_action: DedupAction,
    /// Action applied when the similarity reaches the threshold but is not
    /// exactly `1.0`.
    pub near_action: DedupAction,
    /// When `true`, files flagged as originals are kept instead of being
    /// subjected to `exact_action` / `near_action`.
    pub protect_originals: bool,
}

impl Default for DedupPolicyConfig {
    /// Defaults: strict mode off, threshold `0.95`, exact duplicates are
    /// quarantined, near duplicates are sent to review, originals protected.
    fn default() -> Self {
        let exact_action = DedupAction::Quarantine;
        let near_action = DedupAction::Review;
        Self {
            protect_originals: true,
            near_action,
            exact_action,
            min_similarity: 0.95,
            strict_mode: false,
        }
    }
}

impl DedupPolicyConfig {
    /// Returns the value of the `strict_mode` field.
    #[must_use]
    pub const fn strict_mode(&self) -> bool {
        let Self { strict_mode, .. } = self;
        *strict_mode
    }

    /// Returns the value of the `min_similarity` field.
    #[must_use]
    pub fn min_similarity(&self) -> f64 {
        let Self { min_similarity, .. } = self;
        *min_similarity
    }
}
/// Outcome of evaluating one candidate duplicate.
#[derive(Debug, Clone)]
pub struct DedupDecision {
    /// Similarity score the decision was based on.
    pub similarity: f64,
    /// Action selected for the candidate.
    pub action: DedupAction,
    /// Whether the decision should be reviewed before being acted upon.
    pub needs_review: bool,
    /// Optional human-readable explanation of the decision.
    pub reason: Option<String>,
}

impl DedupDecision {
    /// Builds a decision and derives its `needs_review` flag.
    ///
    /// Review is required when:
    /// * `action` is [`DedupAction::Review`], or
    /// * `action` is destructive and `similarity` is not known to be at
    ///   least `1.0` — i.e. it is below `1.0` **or NaN**.
    ///
    /// `similarity` and `reason` are stored unchanged.
    #[must_use]
    pub fn new(similarity: f64, action: DedupAction, reason: Option<String>) -> Self {
        // NaN compares false against everything, so a bare `similarity < 1.0`
        // would let a destructive action with an undefined score through
        // without review. Treat NaN as "not an exact match".
        let not_exact = similarity.is_nan() || similarity < 1.0;
        let needs_review =
            matches!(action, DedupAction::Review) || (action.is_destructive() && not_exact);
        Self {
            similarity,
            action,
            needs_review,
            reason,
        }
    }

    /// Returns the `needs_review` flag computed at construction time.
    #[must_use]
    pub fn requires_review(&self) -> bool {
        self.needs_review
    }
}
/// Pairwise deduplication policy: maps a similarity score to a
/// [`DedupDecision`] according to a [`DedupPolicyConfig`].
#[derive(Debug, Clone)]
pub struct DedupPolicy {
    config: DedupPolicyConfig,
}

impl DedupPolicy {
    /// Creates a policy that evaluates candidates against `config`.
    #[must_use]
    pub fn new(config: DedupPolicyConfig) -> Self {
        Self { config }
    }

    /// Decides what to do with a candidate duplicate.
    ///
    /// The checks run in this order; the first match wins:
    /// 1. `similarity` is NaN → [`DedupAction::Skip`].
    /// 2. `similarity` is below `min_similarity` → [`DedupAction::Skip`].
    /// 3. `is_original` and `protect_originals` are both set →
    ///    [`DedupAction::Keep`].
    /// 4. `similarity` is exactly `1.0` → the configured `exact_action`.
    /// 5. Otherwise → the configured `near_action`.
    ///
    /// `strict_mode` is not consulted: the previous implementation selected
    /// `exact_action` in both branches of its strict-mode conditional, so the
    /// flag never influenced the outcome.
    #[must_use]
    pub fn should_dedup(&self, similarity: f64, is_original: bool) -> DedupDecision {
        let cfg = &self.config;

        // NaN compares false against the threshold, so without this guard an
        // undefined score would fall through to `near_action`, which may be
        // destructive.
        if similarity.is_nan() {
            return DedupDecision::new(
                similarity,
                DedupAction::Skip,
                Some("similarity is NaN; cannot compare against threshold".to_string()),
            );
        }

        if similarity < cfg.min_similarity {
            return DedupDecision::new(
                similarity,
                DedupAction::Skip,
                Some(format!(
                    "similarity {similarity:.3} below threshold {:.3}",
                    cfg.min_similarity
                )),
            );
        }

        if is_original && cfg.protect_originals {
            return DedupDecision::new(
                similarity,
                DedupAction::Keep,
                Some("file is marked as original".to_string()),
            );
        }

        // Exact equality is intentional: only a score of precisely 1.0 counts
        // as an exact duplicate.
        #[allow(clippy::float_cmp)]
        let is_exact = similarity == 1.0;
        if is_exact {
            return DedupDecision::new(
                similarity,
                cfg.exact_action,
                Some("exact duplicate detected".to_string()),
            );
        }

        DedupDecision::new(
            similarity,
            cfg.near_action,
            Some(format!("near-duplicate at {similarity:.3}")),
        )
    }

    /// Returns the configuration this policy was built with.
    #[must_use]
    pub const fn config(&self) -> &DedupPolicyConfig {
        &self.config
    }
}

impl Default for DedupPolicy {
    /// Builds a policy from [`DedupPolicyConfig::default`].
    fn default() -> Self {
        Self::new(DedupPolicyConfig::default())
    }
}
/// Rule used to pick which member of a duplicate group is kept.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum KeepCriterion {
    /// Prefer the most recently modified file.
    Newest,
    /// Prefer the least recently modified file.
    Oldest,
    /// Prefer the file with the greatest size.
    LargestFile,
    /// Prefer the file with the smallest size.
    SmallestFile,
    /// Prefer the file with the shortest path string.
    ShortestPath,
    /// Prefer the file with the longest path string.
    LongestPath,
}
/// Policy applied to a whole group of duplicate files.
#[derive(Debug, Clone)]
pub struct GroupPolicy {
    /// Criterion that selects the single file to keep.
    pub keep: KeepCriterion,
    /// Action recorded for every file that is not kept.
    pub action: DedupAction,
    /// Similarity threshold associated with the policy.
    // NOTE(review): `apply_group_policy` in this file does not read this
    // field — confirm where it is meant to be enforced.
    pub min_similarity: f64,
}

impl Default for GroupPolicy {
    /// Defaults: keep the largest file, send the rest to review, threshold
    /// `0.95`.
    fn default() -> Self {
        let keep = KeepCriterion::LargestFile;
        let action = DedupAction::Review;
        Self {
            min_similarity: 0.95,
            action,
            keep,
        }
    }
}
/// Result of applying a group policy to a set of duplicate files.
#[derive(Debug, Clone)]
pub struct GroupDecision {
    /// Index (into the input slice) of the file selected to be kept.
    pub keep_index: usize,
    /// Path of the file selected to be kept.
    pub keep_path: String,
    /// `(index, path)` of every other file in the group.
    pub duplicates: Vec<(usize, String)>,
    /// Action to apply to the entries in `duplicates`.
    pub action: DedupAction,
    /// Human-readable summary of the decision.
    pub reason: String,
}
/// Scores `path` under `criterion`; a higher score marks a better candidate
/// to keep.
///
/// Size and time criteria read filesystem metadata. When the metadata (or
/// modification time) is unavailable the fallbacks are:
/// * `LargestFile` / `Newest` → `0.0`
/// * `SmallestFile` → `1.0 / f64::MAX`
/// * `Oldest` → `0.0`
///
/// "Smaller is better" criteria are expressed as reciprocals so that the
/// caller can always maximise the score; a zero size or an empty path scores
/// `0.0` instead of dividing by zero.
fn score_file(path: &str, criterion: KeepCriterion) -> f64 {
    // File size in bytes, or `None` when the metadata cannot be read.
    fn size_in_bytes(path: &str) -> Option<f64> {
        std::fs::metadata(path).ok().map(|meta| meta.len() as f64)
    }

    // Modification time as seconds since the Unix epoch, or `None` when the
    // metadata/mtime is unavailable or the mtime precedes the epoch.
    fn modified_secs(path: &str) -> Option<f64> {
        let modified = std::fs::metadata(path).ok()?.modified().ok()?;
        modified
            .duration_since(std::time::UNIX_EPOCH)
            .ok()
            .map(|elapsed| elapsed.as_secs_f64())
    }

    match criterion {
        KeepCriterion::Newest => modified_secs(path).unwrap_or(0.0),
        KeepCriterion::Oldest => match modified_secs(path) {
            // `+ 1.0` keeps the reciprocal finite for an mtime of exactly 0.
            Some(secs) if secs < f64::MAX => 1.0 / (secs + 1.0),
            _ => 0.0,
        },
        KeepCriterion::LargestFile => size_in_bytes(path).unwrap_or(0.0),
        KeepCriterion::SmallestFile => {
            let bytes = size_in_bytes(path).unwrap_or(f64::MAX);
            if bytes > 0.0 {
                1.0 / bytes
            } else {
                0.0
            }
        }
        KeepCriterion::ShortestPath => match path.len() {
            0 => 0.0,
            len => 1.0 / len as f64,
        },
        KeepCriterion::LongestPath => path.len() as f64,
    }
}
/// Chooses which file of a duplicate group to keep and records the policy's
/// action for the rest.
///
/// Every entry of `files` is scored with `score_file` under `policy.keep`;
/// the highest score wins and, on ties, the earliest entry is kept. All other
/// entries are returned in input order as `(index, path)` pairs.
///
/// Returns `None` when `files` has fewer than two entries.
#[must_use]
pub fn apply_group_policy(files: &[String], policy: &GroupPolicy) -> Option<GroupDecision> {
    if files.len() <= 1 {
        return None;
    }

    // Strict `>` means a later file only replaces the current winner when it
    // scores strictly higher, so the first of several equal scores is kept.
    let (keep_index, _top_score) = files.iter().enumerate().fold(
        (0_usize, f64::NEG_INFINITY),
        |(winner, top), (idx, candidate)| {
            let score = score_file(candidate, policy.keep);
            if score > top {
                (idx, score)
            } else {
                (winner, top)
            }
        },
    );

    let mut duplicates: Vec<(usize, String)> = Vec::new();
    for (idx, candidate) in files.iter().enumerate() {
        if idx != keep_index {
            duplicates.push((idx, candidate.clone()));
        }
    }

    let reason = format!(
        "keep by {:?}, apply {:?} to {} duplicate(s)",
        policy.keep,
        policy.action,
        files.len() - 1
    );

    Some(GroupDecision {
        keep_index,
        keep_path: files[keep_index].clone(),
        duplicates,
        action: policy.action,
        reason,
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns string literals into the owned path list the group API expects.
    fn owned(paths: &[&str]) -> Vec<String> {
        paths.iter().map(|p| (*p).to_string()).collect()
    }

    // ---- DedupAction -------------------------------------------------------

    #[test]
    fn test_action_is_destructive_delete() {
        let action = DedupAction::Delete;
        assert!(action.is_destructive());
    }

    #[test]
    fn test_action_is_destructive_quarantine() {
        let action = DedupAction::Quarantine;
        assert!(action.is_destructive());
    }

    #[test]
    fn test_action_not_destructive_keep() {
        let action = DedupAction::Keep;
        assert!(!action.is_destructive());
    }

    #[test]
    fn test_action_not_destructive_symlink() {
        let action = DedupAction::Symlink;
        assert!(!action.is_destructive());
    }

    #[test]
    fn test_action_description_nonempty() {
        let every_action = [
            DedupAction::Delete,
            DedupAction::Quarantine,
            DedupAction::Symlink,
            DedupAction::Keep,
            DedupAction::Review,
            DedupAction::Skip,
        ];
        for action in every_action {
            let text = action.description();
            assert!(!text.is_empty());
        }
    }

    // ---- DedupPolicyConfig / DedupPolicy ------------------------------------

    #[test]
    fn test_policy_config_defaults() {
        let defaults = DedupPolicyConfig::default();
        assert!(!defaults.strict_mode());
        assert!((defaults.min_similarity() - 0.95).abs() < 1e-9);
        assert!(defaults.protect_originals);
    }

    #[test]
    fn test_policy_skip_below_threshold() {
        let outcome = DedupPolicy::default().should_dedup(0.50, false);
        assert_eq!(outcome.action, DedupAction::Skip);
        assert!(!outcome.requires_review());
    }

    #[test]
    fn test_policy_exact_duplicate() {
        let outcome = DedupPolicy::default().should_dedup(1.0, false);
        assert_eq!(outcome.action, DedupAction::Quarantine);
    }

    #[test]
    fn test_policy_near_duplicate() {
        let outcome = DedupPolicy::default().should_dedup(0.97, false);
        assert_eq!(outcome.action, DedupAction::Review);
    }

    #[test]
    fn test_policy_protect_original() {
        let outcome = DedupPolicy::default().should_dedup(1.0, true);
        assert_eq!(outcome.action, DedupAction::Keep);
    }

    // ---- DedupDecision -------------------------------------------------------

    #[test]
    fn test_decision_requires_review_for_review_action() {
        let outcome = DedupDecision::new(0.97, DedupAction::Review, None);
        assert!(outcome.requires_review());
    }

    #[test]
    fn test_decision_requires_review_destructive_near_dup() {
        let outcome = DedupDecision::new(0.97, DedupAction::Delete, None);
        assert!(outcome.requires_review());
    }

    #[test]
    fn test_decision_no_review_for_exact_destructive() {
        let outcome = DedupDecision::new(1.0, DedupAction::Delete, None);
        assert!(!outcome.requires_review());
    }

    #[test]
    fn test_decision_skip_no_review() {
        let outcome = DedupDecision::new(0.5, DedupAction::Skip, None);
        assert!(!outcome.requires_review());
    }

    #[test]
    fn test_policy_config_strict_mode_toggle() {
        let strict = DedupPolicyConfig {
            strict_mode: true,
            ..DedupPolicyConfig::default()
        };
        assert!(strict.strict_mode());
    }

    // ---- Group policy --------------------------------------------------------

    #[test]
    fn test_keep_criterion_shortest_path() {
        let group = owned(&[
            "/a/b/c/deep/path/file.mp4",
            "/short.mp4",
            "/medium/file.mp4",
        ]);
        let rule = GroupPolicy {
            keep: KeepCriterion::ShortestPath,
            action: DedupAction::Delete,
            min_similarity: 0.95,
        };
        let outcome = apply_group_policy(&group, &rule).expect("should produce a decision");
        assert_eq!(outcome.keep_path, "/short.mp4");
        assert_eq!(outcome.duplicates.len(), 2);
        assert_eq!(outcome.action, DedupAction::Delete);
    }

    #[test]
    fn test_keep_criterion_longest_path() {
        let group = owned(&["/short.mp4", "/a/b/c/deep/path/file.mp4"]);
        let rule = GroupPolicy {
            keep: KeepCriterion::LongestPath,
            action: DedupAction::Quarantine,
            min_similarity: 0.95,
        };
        let outcome = apply_group_policy(&group, &rule).expect("should produce a decision");
        assert_eq!(outcome.keep_path, "/a/b/c/deep/path/file.mp4");
    }

    #[test]
    fn test_group_policy_default() {
        let GroupPolicy {
            keep,
            action,
            min_similarity,
        } = GroupPolicy::default();
        assert_eq!(keep, KeepCriterion::LargestFile);
        assert_eq!(action, DedupAction::Review);
        assert!((min_similarity - 0.95).abs() < f64::EPSILON);
    }

    #[test]
    fn test_group_policy_too_few_files() {
        let group = owned(&["only_one.mp4"]);
        let outcome = apply_group_policy(&group, &GroupPolicy::default());
        assert!(outcome.is_none());
    }

    #[test]
    fn test_group_decision_reason_contains_criterion() {
        let group = owned(&["a.mp4", "b.mp4"]);
        let rule = GroupPolicy {
            keep: KeepCriterion::Newest,
            action: DedupAction::Symlink,
            min_similarity: 0.9,
        };
        let outcome = apply_group_policy(&group, &rule).expect("should produce a decision");
        assert!(outcome.reason.contains("Newest"));
        assert!(outcome.reason.contains("Symlink"));
    }

    /// Every keep criterion must yield a decision for a two-file group, even
    /// when the files cannot be inspected.
    #[test]
    fn test_keep_criterion_all_variants_non_destructive() {
        let group = owned(&["a.mp4", "b.mp4"]);
        let every_criterion = [
            KeepCriterion::Newest,
            KeepCriterion::Oldest,
            KeepCriterion::LargestFile,
            KeepCriterion::SmallestFile,
            KeepCriterion::ShortestPath,
            KeepCriterion::LongestPath,
        ];
        for keep in every_criterion {
            let rule = GroupPolicy {
                keep,
                action: DedupAction::Skip,
                min_similarity: 0.5,
            };
            let outcome = apply_group_policy(&group, &rule);
            assert!(outcome.is_some());
        }
    }
}