#![allow(dead_code)]
/// Action that can be applied to a file identified as a duplicate.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DedupAction {
    /// Delete the duplicate.
    Delete,
    /// Move the duplicate to quarantine.
    Quarantine,
    /// Replace the duplicate with a symlink.
    Symlink,
    /// Keep both copies.
    Keep,
    /// Flag the duplicate for review.
    Review,
    /// Skip the file (log only).
    Skip,
}

impl DedupAction {
    /// Returns `true` for the actions this module classifies as destructive:
    /// [`Self::Delete`] and [`Self::Quarantine`]. Every other action returns `false`.
    #[must_use]
    pub const fn is_destructive(self) -> bool {
        // Exhaustive on purpose: a newly added variant must be classified explicitly.
        match self {
            Self::Delete | Self::Quarantine => true,
            Self::Symlink | Self::Keep | Self::Review | Self::Skip => false,
        }
    }

    /// Returns a short, static, human-readable description of the action.
    #[must_use]
    pub const fn description(self) -> &'static str {
        let text = match self {
            Self::Skip => "skip / log only",
            Self::Review => "flag for review",
            Self::Keep => "keep both copies",
            Self::Symlink => "replace with symlink",
            Self::Quarantine => "move to quarantine",
            Self::Delete => "delete duplicate",
        };
        text
    }
}
/// Tunable settings consumed by [`DedupPolicy`] when deciding what to do with a
/// candidate duplicate.
#[derive(Debug, Clone)]
pub struct DedupPolicyConfig {
    /// Strict-mode flag.
    ///
    /// NOTE(review): the decision logic visible in this file produces the same
    /// result whether this is set or not; confirm the intended semantics.
    pub strict_mode: bool,
    /// Similarity threshold; candidates scoring below it are skipped.
    pub min_similarity: f64,
    /// Action applied when the similarity is exactly `1.0`.
    pub exact_action: DedupAction,
    /// Action applied when the similarity reaches the threshold but is not exactly `1.0`.
    pub near_action: DedupAction,
    /// When `true`, a file flagged as an original is kept instead of being acted on.
    pub protect_originals: bool,
}

impl Default for DedupPolicyConfig {
    /// Non-strict, threshold `0.95`, quarantine exact duplicates, send
    /// near-duplicates to review, and protect originals.
    fn default() -> Self {
        Self {
            strict_mode: false,
            min_similarity: 0.95,
            exact_action: DedupAction::Quarantine,
            near_action: DedupAction::Review,
            protect_originals: true,
        }
    }
}

impl DedupPolicyConfig {
    /// Returns the strict-mode flag.
    #[must_use]
    pub const fn strict_mode(&self) -> bool {
        self.strict_mode
    }

    /// Returns the similarity threshold.
    ///
    /// `const` like [`Self::strict_mode`], so both getters are usable in constant contexts.
    #[must_use]
    pub const fn min_similarity(&self) -> f64 {
        self.min_similarity
    }
}
/// Result of evaluating one candidate duplicate: the action to take, the score
/// it was based on, and whether a human should confirm it.
#[derive(Debug, Clone)]
pub struct DedupDecision {
    /// Similarity score the decision was based on.
    pub similarity: f64,
    /// Action selected for the candidate.
    pub action: DedupAction,
    /// Whether the decision should be confirmed by a reviewer (computed in [`Self::new`]).
    pub needs_review: bool,
    /// Optional human-readable explanation of the decision.
    pub reason: Option<String>,
}

impl DedupDecision {
    /// Builds a decision and derives its `needs_review` flag.
    ///
    /// Review is required when `action` is [`DedupAction::Review`], or when the
    /// action is destructive and the match is not known to be exact, meaning the
    /// similarity is below `1.0` **or is NaN**. A NaN score carries no evidence
    /// of an exact match, so it must not let a destructive action through
    /// unreviewed.
    #[must_use]
    pub fn new(similarity: f64, action: DedupAction, reason: Option<String>) -> Self {
        // `NaN < 1.0` is false, so NaN has to be checked explicitly.
        let not_proven_exact = similarity.is_nan() || similarity < 1.0;
        let needs_review =
            matches!(action, DedupAction::Review) || (action.is_destructive() && not_proven_exact);
        Self {
            similarity,
            action,
            needs_review,
            reason,
        }
    }

    /// Returns whether this decision should be confirmed by a reviewer.
    #[must_use]
    pub const fn requires_review(&self) -> bool {
        self.needs_review
    }
}
/// Decides which [`DedupAction`] applies to a candidate duplicate, based on a
/// [`DedupPolicyConfig`].
#[derive(Debug, Clone)]
pub struct DedupPolicy {
    /// Settings this policy evaluates candidates against.
    config: DedupPolicyConfig,
}

impl DedupPolicy {
    /// Creates a policy that uses `config`.
    #[must_use]
    pub fn new(config: DedupPolicyConfig) -> Self {
        Self { config }
    }

    /// Decides what to do with a candidate whose similarity score is `similarity`.
    ///
    /// Rules are checked in this order; the first match wins:
    ///
    /// 1. `similarity` is NaN: [`DedupAction::Skip`]. A NaN score cannot be
    ///    compared against the threshold, so it must not fall through to a
    ///    (possibly destructive) duplicate action.
    /// 2. `similarity` is below `min_similarity`: [`DedupAction::Skip`].
    /// 3. `is_original` is set and `protect_originals` is enabled: [`DedupAction::Keep`].
    /// 4. `similarity` is exactly `1.0`: the configured `exact_action`.
    /// 5. Otherwise: the configured `near_action`.
    ///
    /// `strict_mode` does not influence the result: exact matches always use
    /// `exact_action`.
    ///
    /// Every returned decision carries a `reason` string.
    #[must_use]
    pub fn should_dedup(&self, similarity: f64, is_original: bool) -> DedupDecision {
        let cfg = &self.config;

        // `NaN < threshold` is false, so without this guard NaN would be
        // treated as a near-duplicate.
        if similarity.is_nan() {
            return DedupDecision::new(
                similarity,
                DedupAction::Skip,
                Some("similarity is NaN; cannot compare against threshold".to_string()),
            );
        }

        if similarity < cfg.min_similarity {
            return DedupDecision::new(
                similarity,
                DedupAction::Skip,
                Some(format!(
                    "similarity {similarity:.3} below threshold {:.3}",
                    cfg.min_similarity
                )),
            );
        }

        if is_original && cfg.protect_originals {
            return DedupDecision::new(
                similarity,
                DedupAction::Keep,
                Some("file is marked as original".to_string()),
            );
        }

        // Exact float comparison is intended: only a score of exactly 1.0
        // counts as an exact duplicate.
        #[allow(clippy::float_cmp)]
        if similarity == 1.0 {
            return DedupDecision::new(
                similarity,
                cfg.exact_action,
                Some("exact duplicate detected".to_string()),
            );
        }

        DedupDecision::new(
            similarity,
            cfg.near_action,
            Some(format!("near-duplicate at {similarity:.3}")),
        )
    }

    /// Returns the configuration this policy was built with.
    #[must_use]
    pub const fn config(&self) -> &DedupPolicyConfig {
        &self.config
    }
}

impl Default for DedupPolicy {
    /// Builds a policy from [`DedupPolicyConfig::default`].
    fn default() -> Self {
        Self::new(DedupPolicyConfig::default())
    }
}
/// Rule used to choose which member of a duplicate group is kept.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum KeepCriterion {
    /// Keep the file with the most recent modification time.
    Newest,
    /// Keep the file with the earliest modification time.
    Oldest,
    /// Keep the file with the largest size on disk.
    LargestFile,
    /// Keep the file with the smallest size on disk.
    SmallestFile,
    /// Keep the file whose path string is shortest.
    ShortestPath,
    /// Keep the file whose path string is longest.
    LongestPath,
}
/// Policy applied to a whole group of duplicate files.
#[derive(Debug, Clone)]
pub struct GroupPolicy {
    /// Criterion that selects the member of the group to keep.
    pub keep: KeepCriterion,
    /// Action recorded for every member that is not kept.
    pub action: DedupAction,
    /// Similarity threshold associated with the group.
    ///
    /// NOTE(review): `apply_group_policy` in this file does not read this field;
    /// presumably callers use it when forming groups. Confirm.
    pub min_similarity: f64,
}

impl Default for GroupPolicy {
    /// Keep the largest file, flag the rest for review, threshold `0.95`.
    fn default() -> Self {
        let keep = KeepCriterion::LargestFile;
        let action = DedupAction::Review;
        let min_similarity = 0.95;
        GroupPolicy {
            keep,
            action,
            min_similarity,
        }
    }
}
/// Outcome of applying a [`GroupPolicy`] to a group of duplicate files.
#[derive(Debug, Clone)]
pub struct GroupDecision {
    /// Index, within the evaluated file list, of the file to keep.
    pub keep_index: usize,
    /// Path of the file to keep.
    pub keep_path: String,
    /// `(index, path)` of every other file in the group.
    pub duplicates: Vec<(usize, String)>,
    /// Action to apply to the entries in `duplicates`.
    pub action: DedupAction,
    /// Human-readable summary of the decision.
    pub reason: String,
}
/// Scores `path` under `criterion`; among the members of a group, the one with
/// the highest score is the one to keep.
///
/// Only the relative order of scores for the same criterion is meaningful.
///
/// * `LargestFile` / `SmallestFile`: file size in bytes, negated for
///   `SmallestFile` so that a zero-byte file ranks as the smallest.
/// * `Newest` / `Oldest`: modification time in seconds relative to the Unix
///   epoch (negative for pre-epoch timestamps), negated for `Oldest`.
/// * `ShortestPath` / `LongestPath`: byte length of the path string; an empty
///   path scores lowest under `ShortestPath`.
///
/// For the size and time criteria, a file whose metadata cannot be read scores
/// `f64::NEG_INFINITY`, so it can never be preferred over a readable file.
fn score_file(path: &str, criterion: KeepCriterion) -> f64 {
    // Lowest possible score: an unreadable candidate must never win against a readable one.
    const UNREADABLE: f64 = f64::NEG_INFINITY;

    // File size in bytes, or `None` when the metadata cannot be read.
    let size_in_bytes = || -> Option<f64> {
        let meta = std::fs::metadata(path).ok()?;
        Some(meta.len() as f64)
    };

    // Signed seconds between the Unix epoch and the modification time, or
    // `None` when the metadata or the timestamp is unavailable.
    let modified_secs = || -> Option<f64> {
        let modified = std::fs::metadata(path).ok()?.modified().ok()?;
        Some(match modified.duration_since(std::time::UNIX_EPOCH) {
            Ok(after_epoch) => after_epoch.as_secs_f64(),
            // The error carries how far *before* the epoch the timestamp lies.
            Err(before_epoch) => -before_epoch.duration().as_secs_f64(),
        })
    };

    match criterion {
        KeepCriterion::LargestFile => size_in_bytes().unwrap_or(UNREADABLE),
        KeepCriterion::SmallestFile => size_in_bytes().map_or(UNREADABLE, |size| -size),
        KeepCriterion::Newest => modified_secs().unwrap_or(UNREADABLE),
        KeepCriterion::Oldest => modified_secs().map_or(UNREADABLE, |secs| -secs),
        KeepCriterion::ShortestPath => {
            if path.is_empty() {
                0.0
            } else {
                1.0 / path.len() as f64
            }
        }
        KeepCriterion::LongestPath => path.len() as f64,
    }
}
/// Chooses which file of a duplicate group to keep and lists the rest as duplicates.
///
/// Each path in `files` is scored with `score_file` under `policy.keep`; the
/// highest score wins, and on equal scores the earliest entry wins. All other
/// entries are returned as `(index, path)` pairs together with `policy.action`.
///
/// Returns `None` when `files` has fewer than two entries, because there is
/// nothing to deduplicate.
#[must_use]
pub fn apply_group_policy(files: &[String], policy: &GroupPolicy) -> Option<GroupDecision> {
    if files.len() < 2 {
        return None;
    }

    // Strict `>` keeps the first of several equally scored candidates.
    let (keep_index, _) = files.iter().enumerate().fold(
        (0_usize, f64::NEG_INFINITY),
        |(winner, winning_score), (idx, candidate)| {
            let score = score_file(candidate, policy.keep);
            if score > winning_score {
                (idx, score)
            } else {
                (winner, winning_score)
            }
        },
    );

    let duplicates: Vec<(usize, String)> = files
        .iter()
        .enumerate()
        .filter_map(|(idx, candidate)| (idx != keep_index).then(|| (idx, candidate.clone())))
        .collect();

    let reason = format!(
        "keep by {:?}, apply {:?} to {} duplicate(s)",
        policy.keep,
        policy.action,
        files.len() - 1
    );

    Some(GroupDecision {
        keep_index,
        keep_path: files[keep_index].clone(),
        duplicates,
        action: policy.action,
        reason,
    })
}
/// Strategy for resolving a group of duplicate files.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GroupAction {
    /// Keep the most recently modified file.
    KeepNewest,
    /// Keep the highest-quality file.
    KeepHighestQuality,
    /// Keep the first file of the group.
    KeepFirst,
    /// Keep nothing: every file in the group is deleted.
    Delete,
}

impl GroupAction {
    /// Returns the stable kebab-case label of this action.
    #[must_use]
    pub const fn label(self) -> &'static str {
        let name = match self {
            Self::Delete => "delete-all",
            Self::KeepFirst => "keep-first",
            Self::KeepHighestQuality => "keep-highest-quality",
            Self::KeepNewest => "keep-newest",
        };
        name
    }

    /// Returns `true` only for [`Self::Delete`], the action that keeps no file.
    #[must_use]
    pub const fn deletes_all(self) -> bool {
        match self {
            Self::Delete => true,
            Self::KeepNewest | Self::KeepHighestQuality | Self::KeepFirst => false,
        }
    }
}
/// Picks the file of `group` that should survive under `action`.
///
/// * [`GroupAction::KeepFirst`]: the first path of the group.
/// * [`GroupAction::Delete`]: `None`, because no file is kept.
/// * [`GroupAction::KeepNewest`]: the path with the latest modification time.
///   Timestamps are compared at full precision, so files modified within the
///   same second are still ordered correctly. A file whose metadata or
///   timestamp cannot be read ranks below every readable file.
/// * [`GroupAction::KeepHighestQuality`]: the path with the largest file size
///   (size is the quality proxy used here); an unreadable file counts as size `0`.
///
/// When several candidates compare equal, the last of them is returned.
/// Returns `None` for an empty group.
#[must_use]
pub fn select_keeper(
    group: &[std::path::PathBuf],
    action: &GroupAction,
) -> Option<std::path::PathBuf> {
    if group.is_empty() {
        return None;
    }
    match action {
        GroupAction::KeepFirst => group.first().cloned(),
        GroupAction::Delete => None,
        GroupAction::KeepNewest => group
            .iter()
            // `Option<SystemTime>` orders `None` (unreadable) below every `Some(_)`.
            .max_by_key(|path: &&std::path::PathBuf| {
                std::fs::metadata(path.as_path())
                    .and_then(|meta| meta.modified())
                    .ok()
            })
            .cloned(),
        GroupAction::KeepHighestQuality => group
            .iter()
            .max_by_key(|path: &&std::path::PathBuf| {
                std::fs::metadata(path.as_path())
                    .map(|meta| meta.len())
                    .unwrap_or(0)
            })
            .cloned(),
    }
}
// Unit tests for the dedup action/policy types, group-policy selection, and
// keeper selection defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// --- DedupAction ---
#[test]
fn test_action_is_destructive_delete() {
assert!(DedupAction::Delete.is_destructive());
}
#[test]
fn test_action_is_destructive_quarantine() {
assert!(DedupAction::Quarantine.is_destructive());
}
#[test]
fn test_action_not_destructive_keep() {
assert!(!DedupAction::Keep.is_destructive());
}
#[test]
fn test_action_not_destructive_symlink() {
assert!(!DedupAction::Symlink.is_destructive());
}
#[test]
fn test_action_description_nonempty() {
for action in [
DedupAction::Delete,
DedupAction::Quarantine,
DedupAction::Symlink,
DedupAction::Keep,
DedupAction::Review,
DedupAction::Skip,
] {
assert!(!action.description().is_empty());
}
}
// --- DedupPolicyConfig / DedupPolicy::should_dedup ---
#[test]
fn test_policy_config_defaults() {
let cfg = DedupPolicyConfig::default();
assert!(!cfg.strict_mode());
assert!((cfg.min_similarity() - 0.95).abs() < 1e-9);
assert!(cfg.protect_originals);
}
#[test]
fn test_policy_skip_below_threshold() {
let policy = DedupPolicy::default();
let decision = policy.should_dedup(0.50, false);
assert_eq!(decision.action, DedupAction::Skip);
assert!(!decision.requires_review());
}
#[test]
fn test_policy_exact_duplicate() {
let policy = DedupPolicy::default();
let decision = policy.should_dedup(1.0, false);
assert_eq!(decision.action, DedupAction::Quarantine);
}
#[test]
fn test_policy_near_duplicate() {
let policy = DedupPolicy::default();
let decision = policy.should_dedup(0.97, false);
assert_eq!(decision.action, DedupAction::Review);
}
#[test]
fn test_policy_protect_original() {
let policy = DedupPolicy::default();
let decision = policy.should_dedup(1.0, true);
assert_eq!(decision.action, DedupAction::Keep);
}
// --- DedupDecision review flag ---
#[test]
fn test_decision_requires_review_for_review_action() {
let d = DedupDecision::new(0.97, DedupAction::Review, None);
assert!(d.requires_review());
}
#[test]
fn test_decision_requires_review_destructive_near_dup() {
let d = DedupDecision::new(0.97, DedupAction::Delete, None);
assert!(d.requires_review());
}
#[test]
fn test_decision_no_review_for_exact_destructive() {
let d = DedupDecision::new(1.0, DedupAction::Delete, None);
assert!(!d.requires_review());
}
#[test]
fn test_decision_skip_no_review() {
let d = DedupDecision::new(0.5, DedupAction::Skip, None);
assert!(!d.requires_review());
}
#[test]
fn test_policy_config_strict_mode_toggle() {
let mut cfg = DedupPolicyConfig::default();
cfg.strict_mode = true;
assert!(cfg.strict_mode());
}
// --- apply_group_policy ---
// The path-length criteria depend only on the path strings, so these tests
// do not touch the filesystem contents.
#[test]
fn test_keep_criterion_shortest_path() {
let files = vec![
"/a/b/c/deep/path/file.mp4".to_string(),
"/short.mp4".to_string(),
"/medium/file.mp4".to_string(),
];
let policy = GroupPolicy {
keep: KeepCriterion::ShortestPath,
action: DedupAction::Delete,
min_similarity: 0.95,
};
let decision = apply_group_policy(&files, &policy).expect("should produce a decision");
assert_eq!(decision.keep_path, "/short.mp4");
assert_eq!(decision.duplicates.len(), 2);
assert_eq!(decision.action, DedupAction::Delete);
}
#[test]
fn test_keep_criterion_longest_path() {
let files = vec![
"/short.mp4".to_string(),
"/a/b/c/deep/path/file.mp4".to_string(),
];
let policy = GroupPolicy {
keep: KeepCriterion::LongestPath,
action: DedupAction::Quarantine,
min_similarity: 0.95,
};
let decision = apply_group_policy(&files, &policy).expect("should produce a decision");
assert_eq!(decision.keep_path, "/a/b/c/deep/path/file.mp4");
}
#[test]
fn test_group_policy_default() {
let policy = GroupPolicy::default();
assert_eq!(policy.keep, KeepCriterion::LargestFile);
assert_eq!(policy.action, DedupAction::Review);
assert!((policy.min_similarity - 0.95).abs() < f64::EPSILON);
}
#[test]
fn test_group_policy_too_few_files() {
let files = vec!["only_one.mp4".to_string()];
let policy = GroupPolicy::default();
assert!(apply_group_policy(&files, &policy).is_none());
}
// The reason string is built with `{:?}`, so it contains the variant names.
#[test]
fn test_group_decision_reason_contains_criterion() {
let files = vec!["a.mp4".to_string(), "b.mp4".to_string()];
let policy = GroupPolicy {
keep: KeepCriterion::Newest,
action: DedupAction::Symlink,
min_similarity: 0.9,
};
let decision = apply_group_policy(&files, &policy).expect("should produce a decision");
assert!(decision.reason.contains("Newest"));
assert!(decision.reason.contains("Symlink"));
}
// The two paths are relative names this test never creates; it only checks
// that every criterion yields a decision, not which file is chosen.
#[test]
fn test_keep_criterion_all_variants_non_destructive() {
let files = vec!["a.mp4".to_string(), "b.mp4".to_string()];
for criterion in [
KeepCriterion::Newest,
KeepCriterion::Oldest,
KeepCriterion::LargestFile,
KeepCriterion::SmallestFile,
KeepCriterion::ShortestPath,
KeepCriterion::LongestPath,
] {
let policy = GroupPolicy {
keep: criterion,
action: DedupAction::Skip,
min_similarity: 0.5,
};
let decision = apply_group_policy(&files, &policy);
assert!(decision.is_some());
}
}
// --- GroupAction ---
#[test]
fn test_group_action_label_nonempty() {
for action in [
GroupAction::KeepNewest,
GroupAction::KeepHighestQuality,
GroupAction::KeepFirst,
GroupAction::Delete,
] {
assert!(!action.label().is_empty());
}
}
#[test]
fn test_group_action_delete_is_delete_all() {
assert!(GroupAction::Delete.deletes_all());
assert!(!GroupAction::KeepFirst.deletes_all());
}
// --- select_keeper ---
// The tests below create real files in a fixed-name directory under the
// system temp dir and remove that directory at the end; directory creation
// and removal errors are deliberately ignored.
#[test]
fn test_policy_keep_first_returns_first() {
let dir = std::env::temp_dir().join("oximedia_policy_keep_first");
let _ = std::fs::create_dir_all(&dir);
let f1 = dir.join("first.bin");
let f2 = dir.join("second.bin");
std::fs::write(&f1, b"aaa").expect("write");
std::fs::write(&f2, b"bbb").expect("write");
let group = vec![f1.clone(), f2];
let keeper = select_keeper(&group, &GroupAction::KeepFirst);
assert_eq!(keeper, Some(f1));
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn test_policy_delete_returns_none() {
let dir = std::env::temp_dir().join("oximedia_policy_delete");
let _ = std::fs::create_dir_all(&dir);
let f1 = dir.join("a.bin");
let f2 = dir.join("b.bin");
std::fs::write(&f1, b"x").expect("write");
std::fs::write(&f2, b"y").expect("write");
let group = vec![f1, f2];
let keeper = select_keeper(&group, &GroupAction::Delete);
assert!(keeper.is_none(), "Delete action should return None");
let _ = std::fs::remove_dir_all(&dir);
}
// Both files are written back-to-back without controlling their modification
// times, so this only asserts that the keeper is a member of the group, not
// which of the two is chosen.
#[test]
fn test_policy_keep_newest_returns_latest() {
let dir = std::env::temp_dir().join("oximedia_policy_newest");
let _ = std::fs::create_dir_all(&dir);
let f_old = dir.join("old.bin");
let f_new = dir.join("new.bin");
std::fs::write(&f_old, b"old").expect("write old");
std::fs::write(&f_new, b"new").expect("write new");
let group = vec![f_old.clone(), f_new.clone()];
let keeper = select_keeper(&group, &GroupAction::KeepNewest);
assert!(
keeper == Some(f_old.clone()) || keeper == Some(f_new.clone()),
"KeepNewest must return one of the group paths"
);
let _ = std::fs::remove_dir_all(&dir);
}
// File size stands in for quality: the 500-byte file must beat the 100-byte one.
#[test]
fn test_policy_keep_highest_quality_largest_file() {
let dir = std::env::temp_dir().join("oximedia_policy_quality");
let _ = std::fs::create_dir_all(&dir);
let f_small = dir.join("small.bin");
let f_large = dir.join("large.bin");
std::fs::write(&f_small, &[0u8; 100]).expect("write small");
std::fs::write(&f_large, &[0u8; 500]).expect("write large");
let group = vec![f_small, f_large.clone()];
let keeper = select_keeper(&group, &GroupAction::KeepHighestQuality);
assert_eq!(
keeper,
Some(f_large),
"KeepHighestQuality should pick the largest file"
);
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn test_select_keeper_empty_group() {
let keeper = select_keeper(&[], &GroupAction::KeepFirst);
assert!(keeper.is_none(), "Empty group should always return None");
}
}