Skip to main content

fallow_cli/report/
dupes_grouping.rs

1//! Per-group attribution for `fallow dupes --group-by`.
2//!
3//! For each `CloneGroup`, every instance is attributed to a group key (owner,
4//! directory, package, or section) via the same [`OwnershipResolver`] used by
5//! `check` and `health`. The group itself is then attributed to its
6//! **largest owner**: the key with the most instances in that clone group.
7//! Ties are broken alphabetically (lexicographic ascending).
8//!
9//! This mirrors jscpd's majority-owner attribution and avoids the
10//! positional non-determinism that a "first-instance-wins" rule would
11//! introduce, since `DuplicationReport::sort()` already orders instances
12//! deterministically by file path then line.
13
14use std::collections::BTreeMap;
15use std::path::Path;
16
17use fallow_core::duplicates::{
18    CloneFamily, CloneGroup, CloneInstance, DuplicationReport, DuplicationStats,
19};
20use rustc_hash::FxHashSet;
21use serde::Serialize;
22
23use super::grouping::OwnershipResolver;
24use super::relative_path;
25use crate::baseline::recompute_stats;
26use crate::codeowners::UNOWNED_LABEL;
27
28/// Resolve the group key for a single instance file.
29fn key_for_instance(instance: &CloneInstance, root: &Path, resolver: &OwnershipResolver) -> String {
30    resolver.resolve(relative_path(&instance.file, root))
31}
32
33/// Pick the largest owner for a clone group: most instances wins, ties broken
34/// alphabetically (smallest key wins).
35///
36/// Iterates a `BTreeMap` so iteration order is alphabetical. The first key
37/// to reach the running maximum wins, which means equal counts resolve to the
38/// alphabetically-smallest key.
39pub fn largest_owner(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> String {
40    let mut counts: BTreeMap<String, u32> = BTreeMap::new();
41    for instance in &group.instances {
42        let key = key_for_instance(instance, root, resolver);
43        *counts.entry(key).or_insert(0) += 1;
44    }
45    if counts.is_empty() {
46        return UNOWNED_LABEL.to_string();
47    }
48    let mut best_key: Option<String> = None;
49    let mut best_count: u32 = 0;
50    for (key, count) in counts {
51        if best_key.is_none() || count > best_count {
52            best_count = count;
53            best_key = Some(key);
54        }
55    }
56    best_key.unwrap_or_else(|| UNOWNED_LABEL.to_string())
57}
58
/// A clone instance plus its per-instance owner key (for inline JSON / SARIF
/// rendering).
///
/// When serialized, the fields of the wrapped instance are flattened into this
/// struct's own object, so they sit at the same level as `owner` rather than
/// under a nested `instance` key.
#[derive(Debug, Clone, Serialize)]
pub struct AttributedInstance {
    /// The original clone instance; `#[serde(flatten)]` inlines its fields
    /// into the serialized form of this struct.
    #[serde(flatten)]
    pub instance: CloneInstance,
    /// Group key for this specific instance (owner / directory / package / section).
    pub owner: String,
}
69
/// A clone group annotated with its largest owner and per-instance owners.
///
/// Built by `AttributedCloneGroup::from_group`; the token and line counts are
/// copied unchanged from the source `CloneGroup`.
#[derive(Debug, Clone, Serialize)]
pub struct AttributedCloneGroup {
    /// The largest-owner attribution for the whole group (most instances,
    /// alphabetical tiebreak).
    pub primary_owner: String,
    /// Token count, copied from the source group.
    pub token_count: usize,
    /// Line count per instance, copied from the source group.
    pub line_count: usize,
    /// Per-instance attribution: every instance of the source group, each
    /// paired with the key resolved for its own file.
    pub instances: Vec<AttributedInstance>,
}
83
84impl AttributedCloneGroup {
85    fn from_group(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> Self {
86        let primary_owner = largest_owner(group, root, resolver);
87        let instances = group
88            .instances
89            .iter()
90            .map(|instance| AttributedInstance {
91                owner: key_for_instance(instance, root, resolver),
92                instance: instance.clone(),
93            })
94            .collect();
95        Self {
96            primary_owner,
97            token_count: group.token_count,
98            line_count: group.line_count,
99            instances,
100        }
101    }
102}
103
/// A single grouped duplication bucket: everything attributed to one resolver
/// key.
#[derive(Debug, Clone, Serialize)]
pub struct DuplicationGroup {
    /// Group label (owner / directory / package / section).
    pub key: String,
    /// Per-group dedup-aware stats computed over the FULL group (pre-truncate).
    pub stats: DuplicationStats,
    /// Clone groups attributed to this owner (largest-owner rule).
    pub clone_groups: Vec<AttributedCloneGroup>,
    /// Clone families from the project report that share at least one file
    /// with the instances of this bucket's clone groups.
    pub clone_families: Vec<CloneFamily>,
}
116
/// Wrapper carrying the resolver mode label and grouped buckets.
///
/// Produced by `build_duplication_grouping`.
#[derive(Debug, Clone, Serialize)]
pub struct DuplicationGrouping {
    /// Resolver mode label (`"owner"`, `"directory"`, `"package"`, `"section"`),
    /// taken from the resolver's `mode_label()`.
    pub mode: &'static str,
    /// One bucket per resolver key, sorted most clone groups first
    /// (alphabetical tiebreak on the key) with `(unowned)` pinned last.
    pub groups: Vec<DuplicationGroup>,
}
126
127/// Build the grouped duplication payload from a project-level report.
128///
129/// Aggregation is performed BEFORE any `--top` truncation so per-group stats
130/// reflect the full group, not just the rendered top-N.
131pub fn build_duplication_grouping(
132    report: &DuplicationReport,
133    root: &Path,
134    resolver: &OwnershipResolver,
135) -> DuplicationGrouping {
136    // Bucket clone groups by largest owner.
137    let mut buckets: BTreeMap<String, Vec<AttributedCloneGroup>> = BTreeMap::new();
138    for group in &report.clone_groups {
139        let attributed = AttributedCloneGroup::from_group(group, root, resolver);
140        buckets
141            .entry(attributed.primary_owner.clone())
142            .or_default()
143            .push(attributed);
144    }
145
146    // For each bucket, recompute stats from its clone groups by reusing
147    // `recompute_stats`. Use the original (non-attributed) clone groups to
148    // feed the helper so we share the dedup logic with the project report.
149    let mut groups: Vec<DuplicationGroup> = buckets
150        .into_iter()
151        .map(|(key, attributed_groups)| {
152            // Reconstruct a partial DuplicationReport for stats recomputation.
153            let original_groups: Vec<CloneGroup> = attributed_groups
154                .iter()
155                .map(|ag| CloneGroup {
156                    instances: ag.instances.iter().map(|i| i.instance.clone()).collect(),
157                    token_count: ag.token_count,
158                    line_count: ag.line_count,
159                })
160                .collect();
161            let mut subset = DuplicationReport {
162                clone_groups: original_groups,
163                clone_families: Vec::new(),
164                mirrored_directories: Vec::new(),
165                stats: DuplicationStats {
166                    total_files: report.stats.total_files,
167                    files_with_clones: 0,
168                    total_lines: report.stats.total_lines,
169                    duplicated_lines: 0,
170                    total_tokens: report.stats.total_tokens,
171                    duplicated_tokens: 0,
172                    clone_groups: 0,
173                    clone_instances: 0,
174                    duplication_percentage: 0.0,
175                    clone_groups_below_min_occurrences: report
176                        .stats
177                        .clone_groups_below_min_occurrences,
178                },
179            };
180            subset.stats = recompute_stats(&subset);
181
182            // Restrict clone families to those whose group memberships overlap
183            // this bucket. Using a file-set membership check matches how the
184            // project-level report treats families: a family's groups must all
185            // share its file set.
186            let bucket_files: FxHashSet<&Path> = attributed_groups
187                .iter()
188                .flat_map(|ag| ag.instances.iter().map(|i| i.instance.file.as_path()))
189                .collect();
190            let clone_families: Vec<CloneFamily> = report
191                .clone_families
192                .iter()
193                .filter(|f| f.files.iter().any(|fp| bucket_files.contains(fp.as_path())))
194                .cloned()
195                .collect();
196
197            DuplicationGroup {
198                key,
199                stats: subset.stats,
200                clone_groups: attributed_groups,
201                clone_families,
202            }
203        })
204        .collect();
205
206    // Sort: most clone groups first, alphabetical tiebreak, (unowned) last.
207    groups.sort_by(|a, b| {
208        let a_unowned = a.key == UNOWNED_LABEL;
209        let b_unowned = b.key == UNOWNED_LABEL;
210        match (a_unowned, b_unowned) {
211            (true, false) => std::cmp::Ordering::Greater,
212            (false, true) => std::cmp::Ordering::Less,
213            _ => b
214                .clone_groups
215                .len()
216                .cmp(&a.clone_groups.len())
217                .then_with(|| a.key.cmp(&b.key)),
218        }
219    });
220
221    DuplicationGrouping {
222        mode: resolver.mode_label(),
223        groups,
224    }
225}
226
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use fallow_core::duplicates::{CloneInstance, DuplicationStats};

    use super::*;
    use crate::codeowners::CodeOwners;

    /// Fixture: a clone instance in `path` with `start_line`/`end_line` set to
    /// `start`/`end`, zeroed columns and an empty fragment.
    fn instance(path: &str, start: usize, end: usize) -> CloneInstance {
        let file = PathBuf::from(path);
        CloneInstance {
            file,
            start_line: start,
            end_line: end,
            start_col: 0,
            end_col: 0,
            fragment: String::new(),
        }
    }

    /// Fixture: a clone group over `instances` with fixed token/line counts.
    fn group(instances: Vec<CloneInstance>) -> CloneGroup {
        CloneGroup {
            instances,
            token_count: 50,
            line_count: 10,
        }
    }

    /// Fixture: a clone group with one instance (lines 1 to 10) per path.
    fn group_of(paths: &[&str]) -> CloneGroup {
        group(paths.iter().map(|&path| instance(path, 1, 10)).collect())
    }

    /// Fixture: a report holding `groups`, no families, and minimal stats.
    fn report(groups: Vec<CloneGroup>) -> DuplicationReport {
        let stats = DuplicationStats {
            total_files: 10,
            total_lines: 1000,
            ..Default::default()
        };
        DuplicationReport {
            clone_groups: groups,
            clone_families: vec![],
            mirrored_directories: vec![],
            stats,
        }
    }

    /// Project root shared by every fixture path.
    fn root() -> &'static Path {
        Path::new("/root")
    }

    #[test]
    fn largest_owner_majority_wins() {
        let clones = group_of(&["/root/src/a.ts", "/root/src/b.ts", "/root/lib/c.ts"]);
        let key = largest_owner(&clones, root(), &OwnershipResolver::Directory);
        assert_eq!(key, "src", "src has 2 instances vs lib's 1");
    }

    #[test]
    fn largest_owner_alphabetical_tiebreak() {
        // One instance each, so the tie resolves alphabetically: lib < src.
        let clones = group_of(&["/root/src/a.ts", "/root/lib/b.ts"]);
        let key = largest_owner(&clones, root(), &OwnershipResolver::Directory);
        assert_eq!(key, "lib");
    }

    #[test]
    fn largest_owner_three_way_tie_alphabetical() {
        let clones = group_of(&["/root/zeta/a.ts", "/root/alpha/b.ts", "/root/beta/c.ts"]);
        let key = largest_owner(&clones, root(), &OwnershipResolver::Directory);
        assert_eq!(key, "alpha");
    }

    #[test]
    fn build_grouping_partitions_clone_groups() {
        let in_src = group_of(&["/root/src/a.ts", "/root/src/b.ts"]);
        let in_lib = group_of(&["/root/lib/x.ts", "/root/lib/y.ts"]);
        let project = report(vec![in_src, in_lib]);
        let grouping =
            build_duplication_grouping(&project, root(), &OwnershipResolver::Directory);
        assert_eq!(grouping.groups.len(), 2);
        // Each directory must own exactly one clone group.
        for key in &["lib", "src"] {
            let bucket = grouping.groups.iter().find(|g| g.key == *key).unwrap();
            assert_eq!(bucket.clone_groups.len(), 1);
        }
    }

    #[test]
    fn build_grouping_unowned_pinned_last() {
        let resolver = OwnershipResolver::Owner(CodeOwners::parse("/src/ @frontend\n").unwrap());
        // The src group is attributed to @frontend; no rule covers docs, so
        // that group ends up unowned.
        let owned = group_of(&["/root/src/a.ts", "/root/src/b.ts"]);
        let unowned = group_of(&["/root/docs/a.md", "/root/docs/b.md"]);
        let project = report(vec![owned, unowned]);
        let grouping = build_duplication_grouping(&project, root(), &resolver);
        assert_eq!(grouping.groups.len(), 2);
        // The unowned bucket must come last.
        let last = grouping.groups.last().unwrap();
        assert_eq!(last.key, UNOWNED_LABEL);
    }

    #[test]
    fn build_grouping_per_instance_owner_inline() {
        let mixed = group_of(&["/root/src/a.ts", "/root/src/b.ts", "/root/lib/c.ts"]);
        let project = report(vec![mixed]);
        let grouping =
            build_duplication_grouping(&project, root(), &OwnershipResolver::Directory);
        // src=2 vs lib=1, so the group lands in the src bucket while each
        // instance keeps the owner resolved for its own file.
        assert_eq!(grouping.groups.len(), 1);
        let bucket = &grouping.groups[0];
        assert_eq!(bucket.key, "src");
        assert_eq!(bucket.clone_groups.len(), 1);
        let attributed = &bucket.clone_groups[0];
        assert_eq!(attributed.primary_owner, "src");
        assert_eq!(attributed.instances.len(), 3);
        assert!(attributed.instances.iter().any(|i| i.owner == "src"));
        assert!(attributed.instances.iter().any(|i| i.owner == "lib"));
    }

    #[test]
    fn empty_report_produces_empty_grouping() {
        let project = DuplicationReport::default();
        let grouping =
            build_duplication_grouping(&project, root(), &OwnershipResolver::Directory);
        assert!(grouping.groups.is_empty());
    }
}
371}