1#![allow(dead_code)]
4#![allow(clippy::cast_precision_loss)]
5#![allow(clippy::too_many_arguments)]
6
7use std::collections::{HashMap, HashSet};
8use std::path::PathBuf;
9
10#[derive(Debug, Clone)]
12pub struct DuplicateCluster {
13 pub id: usize,
15 pub members: Vec<PathBuf>,
17 pub edges: Vec<(usize, usize, f64)>,
19 pub representative: Option<PathBuf>,
21}
22
23impl DuplicateCluster {
24 #[must_use]
26 pub fn new(id: usize) -> Self {
27 Self {
28 id,
29 members: Vec::new(),
30 edges: Vec::new(),
31 representative: None,
32 }
33 }
34
35 pub fn add_member(&mut self, path: PathBuf) {
37 self.members.push(path);
38 }
39
40 pub fn add_edge(&mut self, a: usize, b: usize, score: f64) {
42 self.edges.push((a, b, score));
43 }
44
45 #[must_use]
47 pub fn size(&self) -> usize {
48 self.members.len()
49 }
50
51 #[must_use]
53 pub fn is_duplicate_group(&self) -> bool {
54 self.members.len() >= 2
55 }
56
57 #[must_use]
59 pub fn average_similarity(&self) -> f64 {
60 if self.edges.is_empty() {
61 return 0.0;
62 }
63 let sum: f64 = self.edges.iter().map(|(_, _, s)| *s).sum();
64 sum / self.edges.len() as f64
65 }
66
67 pub fn select_representative(&mut self) {
69 if self.members.is_empty() {
70 return;
71 }
72 if self.members.len() == 1 {
73 self.representative = Some(self.members[0].clone());
74 return;
75 }
76 let n = self.members.len();
77 let mut scores = vec![0.0f64; n];
78 let mut counts = vec![0usize; n];
79 for &(a, b, s) in &self.edges {
80 if a < n && b < n {
81 scores[a] += s;
82 scores[b] += s;
83 counts[a] += 1;
84 counts[b] += 1;
85 }
86 }
87 let avg: Vec<f64> = scores
88 .iter()
89 .zip(counts.iter())
90 .map(|(s, &c)| if c > 0 { *s / c as f64 } else { 0.0 })
91 .collect();
92 let best = avg
93 .iter()
94 .enumerate()
95 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
96 .map(|(i, _)| i)
97 .unwrap_or(0);
98 self.representative = Some(self.members[best].clone());
99 }
100}
101
102#[derive(Debug, Clone, Copy, PartialEq, Eq)]
104pub enum MergeStrategy {
105 SingleLinkage,
107 CompleteLinkage,
109 AverageLinkage,
111}
112
113#[derive(Debug, Clone)]
115pub struct SimilarityPair {
116 pub path_a: PathBuf,
118 pub path_b: PathBuf,
120 pub score: f64,
122}
123
124impl SimilarityPair {
125 #[must_use]
127 pub fn new(path_a: PathBuf, path_b: PathBuf, score: f64) -> Self {
128 Self {
129 path_a,
130 path_b,
131 score,
132 }
133 }
134}
135
136#[derive(Debug, Default)]
138pub struct ClusterBuilder {
139 threshold: f64,
140 strategy: MergeStrategyInner,
141}
142
143#[derive(Debug, Clone, Copy, Default)]
144enum MergeStrategyInner {
145 #[default]
146 SingleLinkage,
147 CompleteLinkage,
148 AverageLinkage,
149}
150
151impl ClusterBuilder {
152 #[must_use]
154 pub fn new(threshold: f64) -> Self {
155 Self {
156 threshold,
157 strategy: MergeStrategyInner::SingleLinkage,
158 }
159 }
160
161 #[must_use]
163 pub fn with_strategy(mut self, strategy: MergeStrategy) -> Self {
164 self.strategy = match strategy {
165 MergeStrategy::SingleLinkage => MergeStrategyInner::SingleLinkage,
166 MergeStrategy::CompleteLinkage => MergeStrategyInner::CompleteLinkage,
167 MergeStrategy::AverageLinkage => MergeStrategyInner::AverageLinkage,
168 };
169 self
170 }
171
172 #[must_use]
174 pub fn build(&self, pairs: &[SimilarityPair]) -> Vec<DuplicateCluster> {
175 let mut path_set: HashSet<&PathBuf> = HashSet::new();
177 for p in pairs {
178 path_set.insert(&p.path_a);
179 path_set.insert(&p.path_b);
180 }
181 let paths: Vec<&PathBuf> = path_set.into_iter().collect();
182 let idx: HashMap<&PathBuf, usize> =
183 paths.iter().enumerate().map(|(i, p)| (*p, i)).collect();
184 let n = paths.len();
185 let mut parent: Vec<usize> = (0..n).collect();
186
187 let valid_pairs: Vec<&SimilarityPair> =
189 pairs.iter().filter(|p| p.score >= self.threshold).collect();
190
191 fn find(parent: &mut Vec<usize>, x: usize) -> usize {
193 let mut root = x;
194 while parent[root] != root {
195 root = parent[root];
196 }
197 let mut cur = x;
198 while cur != root {
199 let next = parent[cur];
200 parent[cur] = root;
201 cur = next;
202 }
203 root
204 }
205
206 for pair in &valid_pairs {
207 let a = idx[&pair.path_a];
208 let b = idx[&pair.path_b];
209 let ra = find(&mut parent, a);
210 let rb = find(&mut parent, b);
211 if ra != rb {
212 parent[rb] = ra;
213 }
214 }
215
216 let mut groups: HashMap<usize, Vec<usize>> = HashMap::new();
218 for i in 0..n {
219 let root = find(&mut parent, i);
220 groups.entry(root).or_default().push(i);
221 }
222
223 let mut clusters = Vec::new();
225 for (cid, (_, members)) in groups.iter().enumerate() {
226 let mut cluster = DuplicateCluster::new(cid);
227 let local_idx: HashMap<usize, usize> = members
228 .iter()
229 .enumerate()
230 .map(|(li, &gi)| (gi, li))
231 .collect();
232 for &gi in members {
233 cluster.add_member(paths[gi].clone());
234 }
235 for pair in &valid_pairs {
236 let a = idx[&pair.path_a];
237 let b = idx[&pair.path_b];
238 if let (Some(&la), Some(&lb)) = (local_idx.get(&a), local_idx.get(&b)) {
239 cluster.add_edge(la, lb, pair.score);
240 }
241 }
242 cluster.select_representative();
243 clusters.push(cluster);
244 }
245 clusters
246 }
247}
248
249#[must_use]
269pub fn transitive_closure_groups(
270 pairs: &[(PathBuf, PathBuf, f64)],
271 threshold: f64,
272) -> Vec<DuplicateCluster> {
273 if pairs.is_empty() {
274 return Vec::new();
275 }
276
277 let mut path_to_idx: HashMap<&PathBuf, usize> = HashMap::new();
279 let mut idx_to_path: Vec<&PathBuf> = Vec::new();
280
281 for (a, b, _) in pairs {
282 if !path_to_idx.contains_key(a) {
283 let idx = idx_to_path.len();
284 path_to_idx.insert(a, idx);
285 idx_to_path.push(a);
286 }
287 if !path_to_idx.contains_key(b) {
288 let idx = idx_to_path.len();
289 path_to_idx.insert(b, idx);
290 idx_to_path.push(b);
291 }
292 }
293
294 let n = idx_to_path.len();
295 let mut parent: Vec<usize> = (0..n).collect();
296 let mut rank: Vec<usize> = vec![0; n];
297
298 fn find_root(parent: &mut [usize], x: usize) -> usize {
300 let mut root = x;
301 while parent[root] != root {
302 root = parent[root];
303 }
304 let mut cur = x;
306 while cur != root {
307 let next = parent[cur];
308 parent[cur] = root;
309 cur = next;
310 }
311 root
312 }
313
314 fn union(parent: &mut [usize], rank: &mut [usize], a: usize, b: usize) {
315 let ra = find_root(parent, a);
316 let rb = find_root(parent, b);
317 if ra == rb {
318 return;
319 }
320 match rank[ra].cmp(&rank[rb]) {
321 std::cmp::Ordering::Less => parent[ra] = rb,
322 std::cmp::Ordering::Greater => parent[rb] = ra,
323 std::cmp::Ordering::Equal => {
324 parent[rb] = ra;
325 rank[ra] += 1;
326 }
327 }
328 }
329
330 let mut valid_edges: Vec<(usize, usize, f64)> = Vec::new();
332 for (a, b, score) in pairs {
333 if *score >= threshold {
334 let ia = path_to_idx[a];
335 let ib = path_to_idx[b];
336 union(&mut parent, &mut rank, ia, ib);
337 valid_edges.push((ia, ib, *score));
338 }
339 }
340
341 let mut groups: HashMap<usize, Vec<usize>> = HashMap::new();
343 for i in 0..n {
344 let root = find_root(&mut parent, i);
345 groups.entry(root).or_default().push(i);
346 }
347
348 let mut clusters = Vec::new();
350 for (cid, (_, members)) in groups.iter().filter(|(_, m)| m.len() >= 2).enumerate() {
351 let mut cluster = DuplicateCluster::new(cid);
352 let local_idx: HashMap<usize, usize> = members
353 .iter()
354 .enumerate()
355 .map(|(local, &global)| (global, local))
356 .collect();
357
358 for &gi in members {
359 cluster.add_member(idx_to_path[gi].clone());
360 }
361
362 for &(ea, eb, score) in &valid_edges {
364 if let (Some(&la), Some(&lb)) = (local_idx.get(&ea), local_idx.get(&eb)) {
365 cluster.add_edge(la, lb, score);
366 }
367 }
368
369 cluster.select_representative();
370 clusters.push(cluster);
371 }
372
373 clusters
374}
375
376#[must_use]
381pub fn transitive_groups_from_pairs(
382 pairs: &[SimilarityPair],
383 threshold: f64,
384) -> Vec<DuplicateCluster> {
385 let triples: Vec<(PathBuf, PathBuf, f64)> = pairs
386 .iter()
387 .map(|p| (p.path_a.clone(), p.path_b.clone(), p.score))
388 .collect();
389 transitive_closure_groups(&triples, threshold)
390}
391
392#[must_use]
394pub fn merge_clusters(mut a: DuplicateCluster, b: DuplicateCluster) -> DuplicateCluster {
395 let offset = a.members.len();
396 for member in b.members {
397 a.members.push(member);
398 }
399 for (ea, eb, score) in b.edges {
400 a.edges.push((ea + offset, eb + offset, score));
401 }
402 a.id = a.id.min(b.id);
403 a.select_representative();
404 a
405}
406
407#[derive(Debug, Clone)]
409pub struct ClusterSummary {
410 pub total_clusters: usize,
412 pub files_in_duplicates: usize,
414 pub largest_cluster_size: usize,
416 pub average_cluster_size: f64,
418}
419
420impl ClusterSummary {
421 #[must_use]
423 pub fn from_clusters(clusters: &[DuplicateCluster]) -> Self {
424 let dup_clusters: Vec<&DuplicateCluster> =
425 clusters.iter().filter(|c| c.is_duplicate_group()).collect();
426 let total_clusters = dup_clusters.len();
427 let files_in_duplicates: usize = dup_clusters.iter().map(|c| c.size()).sum();
428 let largest_cluster_size = dup_clusters.iter().map(|c| c.size()).max().unwrap_or(0);
429 let average_cluster_size = if total_clusters > 0 {
430 files_in_duplicates as f64 / total_clusters as f64
431 } else {
432 0.0
433 };
434 Self {
435 total_clusters,
436 files_in_duplicates,
437 largest_cluster_size,
438 average_cluster_size,
439 }
440 }
441}
442
443#[cfg(test)]
444mod tests {
445 use super::*;
446
447 fn pb(s: &str) -> PathBuf {
448 PathBuf::from(s)
449 }
450
451 #[test]
452 fn test_cluster_new() {
453 let c = DuplicateCluster::new(0);
454 assert_eq!(c.id, 0);
455 assert!(c.members.is_empty());
456 assert!(c.edges.is_empty());
457 assert!(c.representative.is_none());
458 }
459
460 #[test]
461 fn test_cluster_add_member() {
462 let mut c = DuplicateCluster::new(1);
463 c.add_member(pb("a.mp4"));
464 c.add_member(pb("b.mp4"));
465 assert_eq!(c.size(), 2);
466 assert!(c.is_duplicate_group());
467 }
468
469 #[test]
470 fn test_cluster_single_member_not_duplicate() {
471 let mut c = DuplicateCluster::new(0);
472 c.add_member(pb("a.mp4"));
473 assert!(!c.is_duplicate_group());
474 }
475
476 #[test]
477 fn test_cluster_average_similarity_empty_edges() {
478 let c = DuplicateCluster::new(0);
479 assert_eq!(c.average_similarity(), 0.0);
480 }
481
482 #[test]
483 fn test_cluster_average_similarity() {
484 let mut c = DuplicateCluster::new(0);
485 c.add_member(pb("a.mp4"));
486 c.add_member(pb("b.mp4"));
487 c.add_edge(0, 1, 0.8);
488 c.add_edge(0, 1, 0.6);
489 assert!((c.average_similarity() - 0.7).abs() < 1e-9);
490 }
491
492 #[test]
493 fn test_cluster_select_representative_single() {
494 let mut c = DuplicateCluster::new(0);
495 c.add_member(pb("only.mp4"));
496 c.select_representative();
497 assert_eq!(c.representative, Some(pb("only.mp4")));
498 }
499
500 #[test]
501 fn test_cluster_select_representative_two() {
502 let mut c = DuplicateCluster::new(0);
503 c.add_member(pb("a.mp4"));
504 c.add_member(pb("b.mp4"));
505 c.add_edge(0, 1, 0.9);
506 c.select_representative();
507 assert!(c.representative.is_some());
508 }
509
510 #[test]
511 fn test_builder_groups_by_threshold() {
512 let pairs = vec![
513 SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.95),
514 SimilarityPair::new(pb("a.mp4"), pb("c.mp4"), 0.93),
515 SimilarityPair::new(pb("x.mp4"), pb("y.mp4"), 0.40), ];
517 let builder = ClusterBuilder::new(0.90);
518 let clusters = builder.build(&pairs);
519 let dup_clusters: Vec<&DuplicateCluster> =
521 clusters.iter().filter(|c| c.is_duplicate_group()).collect();
522 assert_eq!(dup_clusters.len(), 1);
523 assert_eq!(dup_clusters[0].size(), 3);
524 }
525
526 #[test]
527 fn test_builder_separate_clusters() {
528 let pairs = vec![
529 SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.95),
530 SimilarityPair::new(pb("x.mp4"), pb("y.mp4"), 0.92),
531 ];
532 let builder = ClusterBuilder::new(0.90);
533 let clusters = builder.build(&pairs);
534 let dup_clusters: Vec<&DuplicateCluster> =
535 clusters.iter().filter(|c| c.is_duplicate_group()).collect();
536 assert_eq!(dup_clusters.len(), 2);
537 }
538
539 #[test]
540 fn test_builder_with_strategy_complete_linkage() {
541 let pairs = vec![SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.95)];
542 let builder = ClusterBuilder::new(0.90).with_strategy(MergeStrategy::CompleteLinkage);
543 let clusters = builder.build(&pairs);
544 assert!(!clusters.is_empty());
545 }
546
547 #[test]
548 fn test_merge_clusters() {
549 let mut a = DuplicateCluster::new(0);
550 a.add_member(pb("a.mp4"));
551 a.add_edge(0, 0, 1.0);
552
553 let mut b = DuplicateCluster::new(1);
554 b.add_member(pb("b.mp4"));
555 b.add_edge(0, 0, 0.9);
556
557 let merged = merge_clusters(a, b);
558 assert_eq!(merged.size(), 2);
559 assert_eq!(merged.id, 0);
560 }
561
562 #[test]
563 fn test_cluster_summary_empty() {
564 let summary = ClusterSummary::from_clusters(&[]);
565 assert_eq!(summary.total_clusters, 0);
566 assert_eq!(summary.files_in_duplicates, 0);
567 assert_eq!(summary.largest_cluster_size, 0);
568 assert_eq!(summary.average_cluster_size, 0.0);
569 }
570
571 #[test]
572 fn test_cluster_summary_with_clusters() {
573 let mut c1 = DuplicateCluster::new(0);
574 c1.add_member(pb("a.mp4"));
575 c1.add_member(pb("b.mp4"));
576 c1.add_member(pb("c.mp4"));
577
578 let mut c2 = DuplicateCluster::new(1);
579 c2.add_member(pb("x.mp4"));
580 c2.add_member(pb("y.mp4"));
581
582 let mut c3 = DuplicateCluster::new(2);
583 c3.add_member(pb("solo.mp4")); let summary = ClusterSummary::from_clusters(&[c1, c2, c3]);
586 assert_eq!(summary.total_clusters, 2);
587 assert_eq!(summary.files_in_duplicates, 5);
588 assert_eq!(summary.largest_cluster_size, 3);
589 assert!((summary.average_cluster_size - 2.5).abs() < 1e-9);
590 }
591
592 #[test]
593 fn test_similarity_pair_new() {
594 let p = SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.75);
595 assert_eq!(p.score, 0.75);
596 assert_eq!(p.path_a, pb("a.mp4"));
597 assert_eq!(p.path_b, pb("b.mp4"));
598 }
599
600 #[test]
603 fn test_transitive_closure_empty() {
604 let groups = transitive_closure_groups(&[], 0.5);
605 assert!(groups.is_empty());
606 }
607
608 #[test]
609 fn test_transitive_closure_single_pair() {
610 let pairs = vec![(pb("a.mp4"), pb("b.mp4"), 0.95)];
611 let groups = transitive_closure_groups(&pairs, 0.9);
612 assert_eq!(groups.len(), 1);
613 assert_eq!(groups[0].size(), 2);
614 }
615
616 #[test]
617 fn test_transitive_closure_chain() {
618 let pairs = vec![
620 (pb("a.mp4"), pb("b.mp4"), 0.95),
621 (pb("b.mp4"), pb("c.mp4"), 0.92),
622 ];
623 let groups = transitive_closure_groups(&pairs, 0.9);
624 assert_eq!(groups.len(), 1);
625 assert_eq!(groups[0].size(), 3);
626 let members: HashSet<_> = groups[0].members.iter().collect();
628 assert!(members.contains(&pb("a.mp4")));
629 assert!(members.contains(&pb("b.mp4")));
630 assert!(members.contains(&pb("c.mp4")));
631 }
632
633 #[test]
634 fn test_transitive_closure_two_components() {
635 let pairs = vec![
637 (pb("a.mp4"), pb("b.mp4"), 0.95),
638 (pb("x.mp4"), pb("y.mp4"), 0.93),
639 ];
640 let groups = transitive_closure_groups(&pairs, 0.9);
641 assert_eq!(groups.len(), 2);
642 }
643
644 #[test]
645 fn test_transitive_closure_long_chain() {
646 let pairs = vec![
648 (pb("a.mp4"), pb("b.mp4"), 0.95),
649 (pb("b.mp4"), pb("c.mp4"), 0.94),
650 (pb("c.mp4"), pb("d.mp4"), 0.93),
651 (pb("d.mp4"), pb("e.mp4"), 0.92),
652 ];
653 let groups = transitive_closure_groups(&pairs, 0.9);
654 assert_eq!(groups.len(), 1);
655 assert_eq!(groups[0].size(), 5);
656 }
657
658 #[test]
659 fn test_transitive_closure_threshold_filters() {
660 let pairs = vec![
662 (pb("a.mp4"), pb("b.mp4"), 0.95),
663 (pb("b.mp4"), pb("c.mp4"), 0.80),
664 ];
665 let groups = transitive_closure_groups(&pairs, 0.9);
666 assert_eq!(groups.len(), 1);
667 assert_eq!(groups[0].size(), 2); }
669
670 #[test]
671 fn test_transitive_closure_star_topology() {
672 let pairs = vec![
674 (pb("hub.mp4"), pb("a.mp4"), 0.96),
675 (pb("hub.mp4"), pb("b.mp4"), 0.94),
676 (pb("hub.mp4"), pb("c.mp4"), 0.93),
677 (pb("hub.mp4"), pb("d.mp4"), 0.91),
678 ];
679 let groups = transitive_closure_groups(&pairs, 0.9);
680 assert_eq!(groups.len(), 1);
681 assert_eq!(groups[0].size(), 5);
682 }
683
684 #[test]
685 fn test_transitive_closure_selects_representative() {
686 let pairs = vec![
687 (pb("a.mp4"), pb("b.mp4"), 0.95),
688 (pb("b.mp4"), pb("c.mp4"), 0.93),
689 ];
690 let groups = transitive_closure_groups(&pairs, 0.9);
691 assert_eq!(groups.len(), 1);
692 assert!(groups[0].representative.is_some());
693 }
694
695 #[test]
696 fn test_transitive_groups_from_pairs_convenience() {
697 let pairs = vec![
698 SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.95),
699 SimilarityPair::new(pb("b.mp4"), pb("c.mp4"), 0.93),
700 SimilarityPair::new(pb("x.mp4"), pb("y.mp4"), 0.40), ];
702 let groups = transitive_groups_from_pairs(&pairs, 0.9);
703 assert_eq!(groups.len(), 1);
704 assert_eq!(groups[0].size(), 3);
705 }
706
707 #[test]
708 fn test_transitive_closure_edges_attached() {
709 let pairs = vec![
710 (pb("a.mp4"), pb("b.mp4"), 0.95),
711 (pb("b.mp4"), pb("c.mp4"), 0.93),
712 ];
713 let groups = transitive_closure_groups(&pairs, 0.9);
714 assert_eq!(groups[0].edges.len(), 2);
715 }
716}