Skip to main content

oximedia_dedup/
cluster.rs

1//! Duplicate clustering: similarity groups, cluster merging, representative selection.
2
3#![allow(dead_code)]
4#![allow(clippy::cast_precision_loss)]
5#![allow(clippy::too_many_arguments)]
6
7use std::collections::{HashMap, HashSet};
8use std::path::PathBuf;
9
10/// A cluster of near-duplicate media files.
11#[derive(Debug, Clone)]
12pub struct DuplicateCluster {
13    /// Unique cluster identifier.
14    pub id: usize,
15    /// Members of this cluster (file paths).
16    pub members: Vec<PathBuf>,
17    /// Pairwise similarity scores (index_a, index_b, score).
18    pub edges: Vec<(usize, usize, f64)>,
19    /// The representative file selected for this cluster.
20    pub representative: Option<PathBuf>,
21}
22
23impl DuplicateCluster {
24    /// Create a new cluster with the given id.
25    #[must_use]
26    pub fn new(id: usize) -> Self {
27        Self {
28            id,
29            members: Vec::new(),
30            edges: Vec::new(),
31            representative: None,
32        }
33    }
34
35    /// Add a member file to the cluster.
36    pub fn add_member(&mut self, path: PathBuf) {
37        self.members.push(path);
38    }
39
40    /// Record a similarity edge between two member indices.
41    pub fn add_edge(&mut self, a: usize, b: usize, score: f64) {
42        self.edges.push((a, b, score));
43    }
44
45    /// Number of members.
46    #[must_use]
47    pub fn size(&self) -> usize {
48        self.members.len()
49    }
50
51    /// Returns true if the cluster has at least two members.
52    #[must_use]
53    pub fn is_duplicate_group(&self) -> bool {
54        self.members.len() >= 2
55    }
56
57    /// Average similarity score across all edges.
58    #[must_use]
59    pub fn average_similarity(&self) -> f64 {
60        if self.edges.is_empty() {
61            return 0.0;
62        }
63        let sum: f64 = self.edges.iter().map(|(_, _, s)| *s).sum();
64        sum / self.edges.len() as f64
65    }
66
67    /// Select the representative member: the one with the highest average similarity to others.
68    pub fn select_representative(&mut self) {
69        if self.members.is_empty() {
70            return;
71        }
72        if self.members.len() == 1 {
73            self.representative = Some(self.members[0].clone());
74            return;
75        }
76        let n = self.members.len();
77        let mut scores = vec![0.0f64; n];
78        let mut counts = vec![0usize; n];
79        for &(a, b, s) in &self.edges {
80            if a < n && b < n {
81                scores[a] += s;
82                scores[b] += s;
83                counts[a] += 1;
84                counts[b] += 1;
85            }
86        }
87        let avg: Vec<f64> = scores
88            .iter()
89            .zip(counts.iter())
90            .map(|(s, &c)| if c > 0 { *s / c as f64 } else { 0.0 })
91            .collect();
92        let best = avg
93            .iter()
94            .enumerate()
95            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
96            .map(|(i, _)| i)
97            .unwrap_or(0);
98        self.representative = Some(self.members[best].clone());
99    }
100}
101
102/// Strategy for merging clusters.
103#[derive(Debug, Clone, Copy, PartialEq, Eq)]
104pub enum MergeStrategy {
105    /// Merge if any edge exceeds the threshold (single-linkage).
106    SingleLinkage,
107    /// Merge only if all pairs exceed the threshold (complete-linkage).
108    CompleteLinkage,
109    /// Merge if average similarity exceeds the threshold (average-linkage).
110    AverageLinkage,
111}
112
113/// Similarity pair between two files.
114#[derive(Debug, Clone)]
115pub struct SimilarityPair {
116    /// Path to the first file.
117    pub path_a: PathBuf,
118    /// Path to the second file.
119    pub path_b: PathBuf,
120    /// Similarity score in [0.0, 1.0].
121    pub score: f64,
122}
123
124impl SimilarityPair {
125    /// Create a new similarity pair.
126    #[must_use]
127    pub fn new(path_a: PathBuf, path_b: PathBuf, score: f64) -> Self {
128        Self {
129            path_a,
130            path_b,
131            score,
132        }
133    }
134}
135
136/// Cluster builder that groups files from similarity pairs.
137#[derive(Debug, Default)]
138pub struct ClusterBuilder {
139    threshold: f64,
140    strategy: MergeStrategyInner,
141}
142
143#[derive(Debug, Clone, Copy, Default)]
144enum MergeStrategyInner {
145    #[default]
146    SingleLinkage,
147    CompleteLinkage,
148    AverageLinkage,
149}
150
151impl ClusterBuilder {
152    /// Create a builder with the given similarity threshold.
153    #[must_use]
154    pub fn new(threshold: f64) -> Self {
155        Self {
156            threshold,
157            strategy: MergeStrategyInner::SingleLinkage,
158        }
159    }
160
161    /// Set the merge strategy.
162    #[must_use]
163    pub fn with_strategy(mut self, strategy: MergeStrategy) -> Self {
164        self.strategy = match strategy {
165            MergeStrategy::SingleLinkage => MergeStrategyInner::SingleLinkage,
166            MergeStrategy::CompleteLinkage => MergeStrategyInner::CompleteLinkage,
167            MergeStrategy::AverageLinkage => MergeStrategyInner::AverageLinkage,
168        };
169        self
170    }
171
172    /// Build clusters from similarity pairs using Union-Find.
173    #[must_use]
174    pub fn build(&self, pairs: &[SimilarityPair]) -> Vec<DuplicateCluster> {
175        // Collect all unique paths.
176        let mut path_set: HashSet<&PathBuf> = HashSet::new();
177        for p in pairs {
178            path_set.insert(&p.path_a);
179            path_set.insert(&p.path_b);
180        }
181        let paths: Vec<&PathBuf> = path_set.into_iter().collect();
182        let idx: HashMap<&PathBuf, usize> =
183            paths.iter().enumerate().map(|(i, p)| (*p, i)).collect();
184        let n = paths.len();
185        let mut parent: Vec<usize> = (0..n).collect();
186
187        // Filter pairs by threshold.
188        let valid_pairs: Vec<&SimilarityPair> =
189            pairs.iter().filter(|p| p.score >= self.threshold).collect();
190
191        // Union-Find helpers (iterative path compression).
192        fn find(parent: &mut Vec<usize>, x: usize) -> usize {
193            let mut root = x;
194            while parent[root] != root {
195                root = parent[root];
196            }
197            let mut cur = x;
198            while cur != root {
199                let next = parent[cur];
200                parent[cur] = root;
201                cur = next;
202            }
203            root
204        }
205
206        for pair in &valid_pairs {
207            let a = idx[&pair.path_a];
208            let b = idx[&pair.path_b];
209            let ra = find(&mut parent, a);
210            let rb = find(&mut parent, b);
211            if ra != rb {
212                parent[rb] = ra;
213            }
214        }
215
216        // Group by root.
217        let mut groups: HashMap<usize, Vec<usize>> = HashMap::new();
218        for i in 0..n {
219            let root = find(&mut parent, i);
220            groups.entry(root).or_default().push(i);
221        }
222
223        // Build DuplicateCluster per group.
224        let mut clusters = Vec::new();
225        for (cid, (_, members)) in groups.iter().enumerate() {
226            let mut cluster = DuplicateCluster::new(cid);
227            let local_idx: HashMap<usize, usize> = members
228                .iter()
229                .enumerate()
230                .map(|(li, &gi)| (gi, li))
231                .collect();
232            for &gi in members {
233                cluster.add_member(paths[gi].clone());
234            }
235            for pair in &valid_pairs {
236                let a = idx[&pair.path_a];
237                let b = idx[&pair.path_b];
238                if let (Some(&la), Some(&lb)) = (local_idx.get(&a), local_idx.get(&b)) {
239                    cluster.add_edge(la, lb, pair.score);
240                }
241            }
242            cluster.select_representative();
243            clusters.push(cluster);
244        }
245        clusters
246    }
247}
248
249// ---------------------------------------------------------------------------
250// Transitive closure grouping
251// ---------------------------------------------------------------------------
252
253/// Group items by transitive closure over similarity pairs.
254///
255/// If A is similar to B and B is similar to C, all three are placed into the
256/// same group {A, B, C}, even if A and C were never directly compared.
257///
258/// This uses Union-Find with path compression and union-by-rank for O(n * alpha(n))
259/// amortized performance, where alpha is the inverse Ackermann function.
260///
261/// # Arguments
262/// * `pairs` - Similarity pairs `(path_a, path_b, score)` with score in [0.0, 1.0].
263/// * `threshold` - Minimum similarity score for two items to be considered linked.
264///
265/// # Returns
266/// A list of `DuplicateCluster` instances, each containing all transitively
267/// connected members. Only clusters with 2+ members are returned.
268#[must_use]
269pub fn transitive_closure_groups(
270    pairs: &[(PathBuf, PathBuf, f64)],
271    threshold: f64,
272) -> Vec<DuplicateCluster> {
273    if pairs.is_empty() {
274        return Vec::new();
275    }
276
277    // Collect all unique paths and assign indices.
278    let mut path_to_idx: HashMap<&PathBuf, usize> = HashMap::new();
279    let mut idx_to_path: Vec<&PathBuf> = Vec::new();
280
281    for (a, b, _) in pairs {
282        if !path_to_idx.contains_key(a) {
283            let idx = idx_to_path.len();
284            path_to_idx.insert(a, idx);
285            idx_to_path.push(a);
286        }
287        if !path_to_idx.contains_key(b) {
288            let idx = idx_to_path.len();
289            path_to_idx.insert(b, idx);
290            idx_to_path.push(b);
291        }
292    }
293
294    let n = idx_to_path.len();
295    let mut parent: Vec<usize> = (0..n).collect();
296    let mut rank: Vec<usize> = vec![0; n];
297
298    // Union-Find with path compression and union-by-rank.
299    fn find_root(parent: &mut [usize], x: usize) -> usize {
300        let mut root = x;
301        while parent[root] != root {
302            root = parent[root];
303        }
304        // Path compression
305        let mut cur = x;
306        while cur != root {
307            let next = parent[cur];
308            parent[cur] = root;
309            cur = next;
310        }
311        root
312    }
313
314    fn union(parent: &mut [usize], rank: &mut [usize], a: usize, b: usize) {
315        let ra = find_root(parent, a);
316        let rb = find_root(parent, b);
317        if ra == rb {
318            return;
319        }
320        match rank[ra].cmp(&rank[rb]) {
321            std::cmp::Ordering::Less => parent[ra] = rb,
322            std::cmp::Ordering::Greater => parent[rb] = ra,
323            std::cmp::Ordering::Equal => {
324                parent[rb] = ra;
325                rank[ra] += 1;
326            }
327        }
328    }
329
330    // Filter pairs by threshold and union them.
331    let mut valid_edges: Vec<(usize, usize, f64)> = Vec::new();
332    for (a, b, score) in pairs {
333        if *score >= threshold {
334            let ia = path_to_idx[a];
335            let ib = path_to_idx[b];
336            union(&mut parent, &mut rank, ia, ib);
337            valid_edges.push((ia, ib, *score));
338        }
339    }
340
341    // Group indices by root.
342    let mut groups: HashMap<usize, Vec<usize>> = HashMap::new();
343    for i in 0..n {
344        let root = find_root(&mut parent, i);
345        groups.entry(root).or_default().push(i);
346    }
347
348    // Build DuplicateCluster per group with >= 2 members.
349    let mut clusters = Vec::new();
350    for (cid, (_, members)) in groups.iter().filter(|(_, m)| m.len() >= 2).enumerate() {
351        let mut cluster = DuplicateCluster::new(cid);
352        let local_idx: HashMap<usize, usize> = members
353            .iter()
354            .enumerate()
355            .map(|(local, &global)| (global, local))
356            .collect();
357
358        for &gi in members {
359            cluster.add_member(idx_to_path[gi].clone());
360        }
361
362        // Attach edges within this group.
363        for &(ea, eb, score) in &valid_edges {
364            if let (Some(&la), Some(&lb)) = (local_idx.get(&ea), local_idx.get(&eb)) {
365                cluster.add_edge(la, lb, score);
366            }
367        }
368
369        cluster.select_representative();
370        clusters.push(cluster);
371    }
372
373    clusters
374}
375
376/// Build transitive groups from `SimilarityPair` slices.
377///
378/// Convenience wrapper around [`transitive_closure_groups`] that accepts
379/// the same `SimilarityPair` type used by `ClusterBuilder`.
380#[must_use]
381pub fn transitive_groups_from_pairs(
382    pairs: &[SimilarityPair],
383    threshold: f64,
384) -> Vec<DuplicateCluster> {
385    let triples: Vec<(PathBuf, PathBuf, f64)> = pairs
386        .iter()
387        .map(|p| (p.path_a.clone(), p.path_b.clone(), p.score))
388        .collect();
389    transitive_closure_groups(&triples, threshold)
390}
391
392/// Merge two clusters into one.
393#[must_use]
394pub fn merge_clusters(mut a: DuplicateCluster, b: DuplicateCluster) -> DuplicateCluster {
395    let offset = a.members.len();
396    for member in b.members {
397        a.members.push(member);
398    }
399    for (ea, eb, score) in b.edges {
400        a.edges.push((ea + offset, eb + offset, score));
401    }
402    a.id = a.id.min(b.id);
403    a.select_representative();
404    a
405}
406
407/// Summary of clustering results.
408#[derive(Debug, Clone)]
409pub struct ClusterSummary {
410    /// Total number of clusters found.
411    pub total_clusters: usize,
412    /// Total files in duplicate clusters (>= 2 members).
413    pub files_in_duplicates: usize,
414    /// Largest cluster size.
415    pub largest_cluster_size: usize,
416    /// Average cluster size (for clusters with >= 2 members).
417    pub average_cluster_size: f64,
418}
419
420impl ClusterSummary {
421    /// Build a summary from a slice of clusters.
422    #[must_use]
423    pub fn from_clusters(clusters: &[DuplicateCluster]) -> Self {
424        let dup_clusters: Vec<&DuplicateCluster> =
425            clusters.iter().filter(|c| c.is_duplicate_group()).collect();
426        let total_clusters = dup_clusters.len();
427        let files_in_duplicates: usize = dup_clusters.iter().map(|c| c.size()).sum();
428        let largest_cluster_size = dup_clusters.iter().map(|c| c.size()).max().unwrap_or(0);
429        let average_cluster_size = if total_clusters > 0 {
430            files_in_duplicates as f64 / total_clusters as f64
431        } else {
432            0.0
433        };
434        Self {
435            total_clusters,
436            files_in_duplicates,
437            largest_cluster_size,
438            average_cluster_size,
439        }
440    }
441}
442
443#[cfg(test)]
444mod tests {
445    use super::*;
446
447    fn pb(s: &str) -> PathBuf {
448        PathBuf::from(s)
449    }
450
451    #[test]
452    fn test_cluster_new() {
453        let c = DuplicateCluster::new(0);
454        assert_eq!(c.id, 0);
455        assert!(c.members.is_empty());
456        assert!(c.edges.is_empty());
457        assert!(c.representative.is_none());
458    }
459
460    #[test]
461    fn test_cluster_add_member() {
462        let mut c = DuplicateCluster::new(1);
463        c.add_member(pb("a.mp4"));
464        c.add_member(pb("b.mp4"));
465        assert_eq!(c.size(), 2);
466        assert!(c.is_duplicate_group());
467    }
468
469    #[test]
470    fn test_cluster_single_member_not_duplicate() {
471        let mut c = DuplicateCluster::new(0);
472        c.add_member(pb("a.mp4"));
473        assert!(!c.is_duplicate_group());
474    }
475
476    #[test]
477    fn test_cluster_average_similarity_empty_edges() {
478        let c = DuplicateCluster::new(0);
479        assert_eq!(c.average_similarity(), 0.0);
480    }
481
482    #[test]
483    fn test_cluster_average_similarity() {
484        let mut c = DuplicateCluster::new(0);
485        c.add_member(pb("a.mp4"));
486        c.add_member(pb("b.mp4"));
487        c.add_edge(0, 1, 0.8);
488        c.add_edge(0, 1, 0.6);
489        assert!((c.average_similarity() - 0.7).abs() < 1e-9);
490    }
491
492    #[test]
493    fn test_cluster_select_representative_single() {
494        let mut c = DuplicateCluster::new(0);
495        c.add_member(pb("only.mp4"));
496        c.select_representative();
497        assert_eq!(c.representative, Some(pb("only.mp4")));
498    }
499
500    #[test]
501    fn test_cluster_select_representative_two() {
502        let mut c = DuplicateCluster::new(0);
503        c.add_member(pb("a.mp4"));
504        c.add_member(pb("b.mp4"));
505        c.add_edge(0, 1, 0.9);
506        c.select_representative();
507        assert!(c.representative.is_some());
508    }
509
510    #[test]
511    fn test_builder_groups_by_threshold() {
512        let pairs = vec![
513            SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.95),
514            SimilarityPair::new(pb("a.mp4"), pb("c.mp4"), 0.93),
515            SimilarityPair::new(pb("x.mp4"), pb("y.mp4"), 0.40), // below threshold
516        ];
517        let builder = ClusterBuilder::new(0.90);
518        let clusters = builder.build(&pairs);
519        // a, b, c should be in one cluster; x and y are singletons or separate
520        let dup_clusters: Vec<&DuplicateCluster> =
521            clusters.iter().filter(|c| c.is_duplicate_group()).collect();
522        assert_eq!(dup_clusters.len(), 1);
523        assert_eq!(dup_clusters[0].size(), 3);
524    }
525
526    #[test]
527    fn test_builder_separate_clusters() {
528        let pairs = vec![
529            SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.95),
530            SimilarityPair::new(pb("x.mp4"), pb("y.mp4"), 0.92),
531        ];
532        let builder = ClusterBuilder::new(0.90);
533        let clusters = builder.build(&pairs);
534        let dup_clusters: Vec<&DuplicateCluster> =
535            clusters.iter().filter(|c| c.is_duplicate_group()).collect();
536        assert_eq!(dup_clusters.len(), 2);
537    }
538
539    #[test]
540    fn test_builder_with_strategy_complete_linkage() {
541        let pairs = vec![SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.95)];
542        let builder = ClusterBuilder::new(0.90).with_strategy(MergeStrategy::CompleteLinkage);
543        let clusters = builder.build(&pairs);
544        assert!(!clusters.is_empty());
545    }
546
547    #[test]
548    fn test_merge_clusters() {
549        let mut a = DuplicateCluster::new(0);
550        a.add_member(pb("a.mp4"));
551        a.add_edge(0, 0, 1.0);
552
553        let mut b = DuplicateCluster::new(1);
554        b.add_member(pb("b.mp4"));
555        b.add_edge(0, 0, 0.9);
556
557        let merged = merge_clusters(a, b);
558        assert_eq!(merged.size(), 2);
559        assert_eq!(merged.id, 0);
560    }
561
562    #[test]
563    fn test_cluster_summary_empty() {
564        let summary = ClusterSummary::from_clusters(&[]);
565        assert_eq!(summary.total_clusters, 0);
566        assert_eq!(summary.files_in_duplicates, 0);
567        assert_eq!(summary.largest_cluster_size, 0);
568        assert_eq!(summary.average_cluster_size, 0.0);
569    }
570
571    #[test]
572    fn test_cluster_summary_with_clusters() {
573        let mut c1 = DuplicateCluster::new(0);
574        c1.add_member(pb("a.mp4"));
575        c1.add_member(pb("b.mp4"));
576        c1.add_member(pb("c.mp4"));
577
578        let mut c2 = DuplicateCluster::new(1);
579        c2.add_member(pb("x.mp4"));
580        c2.add_member(pb("y.mp4"));
581
582        let mut c3 = DuplicateCluster::new(2);
583        c3.add_member(pb("solo.mp4")); // singleton, not counted
584
585        let summary = ClusterSummary::from_clusters(&[c1, c2, c3]);
586        assert_eq!(summary.total_clusters, 2);
587        assert_eq!(summary.files_in_duplicates, 5);
588        assert_eq!(summary.largest_cluster_size, 3);
589        assert!((summary.average_cluster_size - 2.5).abs() < 1e-9);
590    }
591
592    #[test]
593    fn test_similarity_pair_new() {
594        let p = SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.75);
595        assert_eq!(p.score, 0.75);
596        assert_eq!(p.path_a, pb("a.mp4"));
597        assert_eq!(p.path_b, pb("b.mp4"));
598    }
599
600    // ---- Transitive closure grouping tests ----
601
602    #[test]
603    fn test_transitive_closure_empty() {
604        let groups = transitive_closure_groups(&[], 0.5);
605        assert!(groups.is_empty());
606    }
607
608    #[test]
609    fn test_transitive_closure_single_pair() {
610        let pairs = vec![(pb("a.mp4"), pb("b.mp4"), 0.95)];
611        let groups = transitive_closure_groups(&pairs, 0.9);
612        assert_eq!(groups.len(), 1);
613        assert_eq!(groups[0].size(), 2);
614    }
615
616    #[test]
617    fn test_transitive_closure_chain() {
618        // A~B, B~C => {A, B, C}
619        let pairs = vec![
620            (pb("a.mp4"), pb("b.mp4"), 0.95),
621            (pb("b.mp4"), pb("c.mp4"), 0.92),
622        ];
623        let groups = transitive_closure_groups(&pairs, 0.9);
624        assert_eq!(groups.len(), 1);
625        assert_eq!(groups[0].size(), 3);
626        // All three should be in the group
627        let members: HashSet<_> = groups[0].members.iter().collect();
628        assert!(members.contains(&pb("a.mp4")));
629        assert!(members.contains(&pb("b.mp4")));
630        assert!(members.contains(&pb("c.mp4")));
631    }
632
633    #[test]
634    fn test_transitive_closure_two_components() {
635        // {A, B} and {X, Y} are separate components
636        let pairs = vec![
637            (pb("a.mp4"), pb("b.mp4"), 0.95),
638            (pb("x.mp4"), pb("y.mp4"), 0.93),
639        ];
640        let groups = transitive_closure_groups(&pairs, 0.9);
641        assert_eq!(groups.len(), 2);
642    }
643
644    #[test]
645    fn test_transitive_closure_long_chain() {
646        // A~B, B~C, C~D, D~E => {A, B, C, D, E}
647        let pairs = vec![
648            (pb("a.mp4"), pb("b.mp4"), 0.95),
649            (pb("b.mp4"), pb("c.mp4"), 0.94),
650            (pb("c.mp4"), pb("d.mp4"), 0.93),
651            (pb("d.mp4"), pb("e.mp4"), 0.92),
652        ];
653        let groups = transitive_closure_groups(&pairs, 0.9);
654        assert_eq!(groups.len(), 1);
655        assert_eq!(groups[0].size(), 5);
656    }
657
658    #[test]
659    fn test_transitive_closure_threshold_filters() {
660        // A~B at 0.95, B~C at 0.80 (below threshold)
661        let pairs = vec![
662            (pb("a.mp4"), pb("b.mp4"), 0.95),
663            (pb("b.mp4"), pb("c.mp4"), 0.80),
664        ];
665        let groups = transitive_closure_groups(&pairs, 0.9);
666        assert_eq!(groups.len(), 1);
667        assert_eq!(groups[0].size(), 2); // Only A and B; C is excluded
668    }
669
670    #[test]
671    fn test_transitive_closure_star_topology() {
672        // Hub~A, Hub~B, Hub~C, Hub~D => {Hub, A, B, C, D}
673        let pairs = vec![
674            (pb("hub.mp4"), pb("a.mp4"), 0.96),
675            (pb("hub.mp4"), pb("b.mp4"), 0.94),
676            (pb("hub.mp4"), pb("c.mp4"), 0.93),
677            (pb("hub.mp4"), pb("d.mp4"), 0.91),
678        ];
679        let groups = transitive_closure_groups(&pairs, 0.9);
680        assert_eq!(groups.len(), 1);
681        assert_eq!(groups[0].size(), 5);
682    }
683
684    #[test]
685    fn test_transitive_closure_selects_representative() {
686        let pairs = vec![
687            (pb("a.mp4"), pb("b.mp4"), 0.95),
688            (pb("b.mp4"), pb("c.mp4"), 0.93),
689        ];
690        let groups = transitive_closure_groups(&pairs, 0.9);
691        assert_eq!(groups.len(), 1);
692        assert!(groups[0].representative.is_some());
693    }
694
695    #[test]
696    fn test_transitive_groups_from_pairs_convenience() {
697        let pairs = vec![
698            SimilarityPair::new(pb("a.mp4"), pb("b.mp4"), 0.95),
699            SimilarityPair::new(pb("b.mp4"), pb("c.mp4"), 0.93),
700            SimilarityPair::new(pb("x.mp4"), pb("y.mp4"), 0.40), // below threshold
701        ];
702        let groups = transitive_groups_from_pairs(&pairs, 0.9);
703        assert_eq!(groups.len(), 1);
704        assert_eq!(groups[0].size(), 3);
705    }
706
707    #[test]
708    fn test_transitive_closure_edges_attached() {
709        let pairs = vec![
710            (pb("a.mp4"), pb("b.mp4"), 0.95),
711            (pb("b.mp4"), pb("c.mp4"), 0.93),
712        ];
713        let groups = transitive_closure_groups(&pairs, 0.9);
714        assert_eq!(groups[0].edges.len(), 2);
715    }
716}