Skip to main content

fallow_cli/report/
dupes_grouping.rs

1//! Per-group attribution for `fallow dupes --group-by`.
2//!
3//! For each `CloneGroup`, every instance is attributed to a group key (owner,
4//! directory, package, or section) via the same [`OwnershipResolver`] used by
5//! `check` and `health`. The group itself is then attributed to its
6//! **largest owner**: the key with the most instances in that clone group.
7//! Ties are broken alphabetically (lexicographic ascending).
8//!
9//! This mirrors jscpd's majority-owner attribution and avoids the
10//! positional non-determinism that a "first-instance-wins" rule would
11//! introduce, since `DuplicationReport::sort()` already orders instances
12//! deterministically by file path then line.
13
14use std::collections::BTreeMap;
15use std::path::Path;
16
17use fallow_core::duplicates::{
18    CloneFamily, CloneGroup, CloneInstance, DuplicationReport, DuplicationStats,
19};
20use rustc_hash::FxHashSet;
21use serde::Serialize;
22
23use super::grouping::OwnershipResolver;
24use super::relative_path;
25use crate::baseline::recompute_stats;
26use crate::codeowners::UNOWNED_LABEL;
27
28/// Resolve the group key for a single instance file.
29fn key_for_instance(instance: &CloneInstance, root: &Path, resolver: &OwnershipResolver) -> String {
30    resolver.resolve(relative_path(&instance.file, root))
31}
32
33/// Pick the largest owner for a clone group: most instances wins, ties broken
34/// alphabetically (smallest key wins).
35///
36/// Iterates a `BTreeMap` so iteration order is alphabetical. The first key
37/// to reach the running maximum wins, which means equal counts resolve to the
38/// alphabetically-smallest key.
39pub fn largest_owner(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> String {
40    let mut counts: BTreeMap<String, u32> = BTreeMap::new();
41    for instance in &group.instances {
42        let key = key_for_instance(instance, root, resolver);
43        *counts.entry(key).or_insert(0) += 1;
44    }
45    if counts.is_empty() {
46        return UNOWNED_LABEL.to_string();
47    }
48    let mut best_key: Option<String> = None;
49    let mut best_count: u32 = 0;
50    for (key, count) in counts {
51        if best_key.is_none() || count > best_count {
52            best_count = count;
53            best_key = Some(key);
54        }
55    }
56    best_key.unwrap_or_else(|| UNOWNED_LABEL.to_string())
57}
58
59/// A clone instance plus its per-instance owner key (for inline JSON / SARIF
60/// rendering).
61#[derive(Debug, Clone, Serialize)]
62pub struct AttributedInstance {
63    /// The original clone instance.
64    #[serde(flatten)]
65    pub instance: CloneInstance,
66    /// Group key for this specific instance (owner / directory / package / section).
67    pub owner: String,
68}
69
70/// A clone group annotated with its largest owner and per-instance owners.
71#[derive(Debug, Clone, Serialize)]
72pub struct AttributedCloneGroup {
73    /// The largest-owner attribution for the whole group (most instances,
74    /// alphabetical tiebreak).
75    pub primary_owner: String,
76    /// Token count, copied from the source group.
77    pub token_count: usize,
78    /// Line count per instance.
79    pub line_count: usize,
80    /// Per-instance attribution.
81    pub instances: Vec<AttributedInstance>,
82}
83
84impl AttributedCloneGroup {
85    fn from_group(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> Self {
86        let primary_owner = largest_owner(group, root, resolver);
87        let instances = group
88            .instances
89            .iter()
90            .map(|instance| AttributedInstance {
91                owner: key_for_instance(instance, root, resolver),
92                instance: instance.clone(),
93            })
94            .collect();
95        Self {
96            primary_owner,
97            token_count: group.token_count,
98            line_count: group.line_count,
99            instances,
100        }
101    }
102}
103
104/// A single grouped duplication bucket.
105#[derive(Debug, Clone, Serialize)]
106pub struct DuplicationGroup {
107    /// Group label (owner / directory / package / section).
108    pub key: String,
109    /// Per-group dedup-aware stats computed over the FULL group (pre-truncate).
110    pub stats: DuplicationStats,
111    /// Clone groups attributed to this owner (largest-owner rule).
112    pub clone_groups: Vec<AttributedCloneGroup>,
113    /// Clone families restricted to this group's clone groups.
114    pub clone_families: Vec<CloneFamily>,
115}
116
117/// Wrapper carrying the resolver mode label and grouped buckets.
118#[derive(Debug, Clone, Serialize)]
119pub struct DuplicationGrouping {
120    /// Resolver mode label (`"owner"`, `"directory"`, `"package"`, `"section"`).
121    pub mode: &'static str,
122    /// One bucket per resolver key, sorted most clone groups first with
123    /// `(unowned)` pinned last.
124    pub groups: Vec<DuplicationGroup>,
125}
126
127/// Build the grouped duplication payload from a project-level report.
128///
129/// Aggregation is performed BEFORE any `--top` truncation so per-group stats
130/// reflect the full group, not just the rendered top-N.
131pub fn build_duplication_grouping(
132    report: &DuplicationReport,
133    root: &Path,
134    resolver: &OwnershipResolver,
135) -> DuplicationGrouping {
136    // Bucket clone groups by largest owner.
137    let mut buckets: BTreeMap<String, Vec<AttributedCloneGroup>> = BTreeMap::new();
138    for group in &report.clone_groups {
139        let attributed = AttributedCloneGroup::from_group(group, root, resolver);
140        buckets
141            .entry(attributed.primary_owner.clone())
142            .or_default()
143            .push(attributed);
144    }
145
146    // For each bucket, recompute stats from its clone groups by reusing
147    // `recompute_stats`. Use the original (non-attributed) clone groups to
148    // feed the helper so we share the dedup logic with the project report.
149    let mut groups: Vec<DuplicationGroup> = buckets
150        .into_iter()
151        .map(|(key, attributed_groups)| {
152            // Reconstruct a partial DuplicationReport for stats recomputation.
153            let original_groups: Vec<CloneGroup> = attributed_groups
154                .iter()
155                .map(|ag| CloneGroup {
156                    instances: ag.instances.iter().map(|i| i.instance.clone()).collect(),
157                    token_count: ag.token_count,
158                    line_count: ag.line_count,
159                })
160                .collect();
161            let mut subset = DuplicationReport {
162                clone_groups: original_groups,
163                clone_families: Vec::new(),
164                mirrored_directories: Vec::new(),
165                stats: DuplicationStats {
166                    total_files: report.stats.total_files,
167                    files_with_clones: 0,
168                    total_lines: report.stats.total_lines,
169                    duplicated_lines: 0,
170                    total_tokens: report.stats.total_tokens,
171                    duplicated_tokens: 0,
172                    clone_groups: 0,
173                    clone_instances: 0,
174                    duplication_percentage: 0.0,
175                },
176            };
177            subset.stats = recompute_stats(&subset);
178
179            // Restrict clone families to those whose group memberships overlap
180            // this bucket. Using a file-set membership check matches how the
181            // project-level report treats families: a family's groups must all
182            // share its file set.
183            let bucket_files: FxHashSet<&Path> = attributed_groups
184                .iter()
185                .flat_map(|ag| ag.instances.iter().map(|i| i.instance.file.as_path()))
186                .collect();
187            let clone_families: Vec<CloneFamily> = report
188                .clone_families
189                .iter()
190                .filter(|f| f.files.iter().any(|fp| bucket_files.contains(fp.as_path())))
191                .cloned()
192                .collect();
193
194            DuplicationGroup {
195                key,
196                stats: subset.stats,
197                clone_groups: attributed_groups,
198                clone_families,
199            }
200        })
201        .collect();
202
203    // Sort: most clone groups first, alphabetical tiebreak, (unowned) last.
204    groups.sort_by(|a, b| {
205        let a_unowned = a.key == UNOWNED_LABEL;
206        let b_unowned = b.key == UNOWNED_LABEL;
207        match (a_unowned, b_unowned) {
208            (true, false) => std::cmp::Ordering::Greater,
209            (false, true) => std::cmp::Ordering::Less,
210            _ => b
211                .clone_groups
212                .len()
213                .cmp(&a.clone_groups.len())
214                .then_with(|| a.key.cmp(&b.key)),
215        }
216    });
217
218    DuplicationGrouping {
219        mode: resolver.mode_label(),
220        groups,
221    }
222}
223
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use fallow_core::duplicates::{CloneInstance, DuplicationStats};

    use super::*;
    use crate::codeowners::CodeOwners;

    /// Project root that every fixture path lives under.
    const ROOT: &str = "/root";

    /// Fixture: a clone instance in `path` covering lines 1-10.
    fn clone_at(path: &str) -> CloneInstance {
        CloneInstance {
            file: PathBuf::from(path),
            start_line: 1,
            end_line: 10,
            start_col: 0,
            end_col: 0,
            fragment: String::new(),
        }
    }

    /// Fixture: a 50-token / 10-line clone group with one instance per path.
    fn group_of(paths: &[&str]) -> CloneGroup {
        CloneGroup {
            instances: paths.iter().map(|path| clone_at(path)).collect(),
            token_count: 50,
            line_count: 10,
        }
    }

    /// Fixture: a report over `groups` with 10 files / 1000 lines in total.
    fn report_of(groups: Vec<CloneGroup>) -> DuplicationReport {
        DuplicationReport {
            clone_groups: groups,
            clone_families: vec![],
            mirrored_directories: vec![],
            stats: DuplicationStats {
                total_files: 10,
                total_lines: 1000,
                ..Default::default()
            },
        }
    }

    /// Largest owner of `paths` under the directory resolver.
    fn directory_owner(paths: &[&str]) -> String {
        largest_owner(
            &group_of(paths),
            Path::new(ROOT),
            &OwnershipResolver::Directory,
        )
    }

    #[test]
    fn largest_owner_majority_wins() {
        let key = directory_owner(&["/root/src/a.ts", "/root/src/b.ts", "/root/lib/c.ts"]);
        assert_eq!(key, "src", "src has 2 instances vs lib's 1");
    }

    #[test]
    fn largest_owner_alphabetical_tiebreak() {
        // One instance each, so the alphabetical rule decides: lib < src.
        let key = directory_owner(&["/root/src/a.ts", "/root/lib/b.ts"]);
        assert_eq!(key, "lib");
    }

    #[test]
    fn largest_owner_three_way_tie_alphabetical() {
        let key = directory_owner(&["/root/zeta/a.ts", "/root/alpha/b.ts", "/root/beta/c.ts"]);
        assert_eq!(key, "alpha");
    }

    #[test]
    fn build_grouping_partitions_clone_groups() {
        let src_group = group_of(&["/root/src/a.ts", "/root/src/b.ts"]);
        let lib_group = group_of(&["/root/lib/x.ts", "/root/lib/y.ts"]);
        let project = report_of(vec![src_group, lib_group]);

        let grouping =
            build_duplication_grouping(&project, Path::new(ROOT), &OwnershipResolver::Directory);

        assert_eq!(grouping.groups.len(), 2);
        let lib = grouping.groups.iter().find(|g| g.key == "lib").unwrap();
        let src = grouping.groups.iter().find(|g| g.key == "src").unwrap();
        assert_eq!(lib.clone_groups.len(), 1);
        assert_eq!(src.clone_groups.len(), 1);
    }

    #[test]
    fn build_grouping_unowned_pinned_last() {
        let resolver = OwnershipResolver::Owner(CodeOwners::parse("/src/ @frontend\n").unwrap());
        // `src` matches the @frontend rule; `docs` matches nothing -> unowned.
        let owned = group_of(&["/root/src/a.ts", "/root/src/b.ts"]);
        let unowned = group_of(&["/root/docs/a.md", "/root/docs/b.md"]);
        let project = report_of(vec![owned, unowned]);

        let grouping = build_duplication_grouping(&project, Path::new(ROOT), &resolver);

        assert_eq!(grouping.groups.len(), 2);
        // The unowned bucket has to come last.
        assert_eq!(grouping.groups.last().unwrap().key, UNOWNED_LABEL);
    }

    #[test]
    fn build_grouping_per_instance_owner_inline() {
        let mixed = group_of(&["/root/src/a.ts", "/root/src/b.ts", "/root/lib/c.ts"]);
        let project = report_of(vec![mixed]);

        let grouping =
            build_duplication_grouping(&project, Path::new(ROOT), &OwnershipResolver::Directory);

        // src=2 and lib=1, so the group lands in `src`, while each instance
        // still reports the owner of its own file.
        assert_eq!(grouping.groups.len(), 1);
        let bucket = &grouping.groups[0];
        assert_eq!(bucket.key, "src");
        assert_eq!(bucket.clone_groups.len(), 1);
        let attributed = &bucket.clone_groups[0];
        assert_eq!(attributed.primary_owner, "src");
        assert_eq!(attributed.instances.len(), 3);
        let owners: Vec<&str> = attributed
            .instances
            .iter()
            .map(|i| i.owner.as_str())
            .collect();
        assert!(owners.contains(&"src"));
        assert!(owners.contains(&"lib"));
    }

    #[test]
    fn empty_report_produces_empty_grouping() {
        let project = DuplicationReport::default();
        let grouping =
            build_duplication_grouping(&project, Path::new(ROOT), &OwnershipResolver::Directory);
        assert!(grouping.groups.is_empty());
    }
}