Skip to main content

fallow_cli/report/
dupes_grouping.rs

1//! Per-group attribution for `fallow dupes --group-by`.
2//!
3//! For each `CloneGroup`, every instance is attributed to a group key (owner,
4//! directory, package, or section) via the same [`OwnershipResolver`] used by
5//! `check` and `health`. The group itself is then attributed to its
6//! **largest owner**: the key with the most instances in that clone group.
7//! Ties are broken alphabetically (lexicographic ascending).
8//!
9//! This mirrors jscpd's majority-owner attribution and avoids the
10//! positional non-determinism that a "first-instance-wins" rule would
11//! introduce, since `DuplicationReport::sort()` already orders instances
12//! deterministically by file path then line.
13
14use std::collections::BTreeMap;
15use std::path::Path;
16
17use fallow_core::duplicates::{CloneGroup, CloneInstance, DuplicationReport, DuplicationStats};
18use rustc_hash::FxHashSet;
19use serde::Serialize;
20
21use super::grouping::OwnershipResolver;
22use super::relative_path;
23use crate::baseline::recompute_stats;
24use crate::codeowners::UNOWNED_LABEL;
25use crate::output_dupes::{AttributedCloneGroupFinding, CloneFamilyFinding};
26
27/// Resolve the group key for a single instance file.
28fn key_for_instance(instance: &CloneInstance, root: &Path, resolver: &OwnershipResolver) -> String {
29    resolver.resolve(relative_path(&instance.file, root))
30}
31
32/// Pick the largest owner for a clone group: most instances wins, ties broken
33/// alphabetically (smallest key wins).
34///
35/// Iterates a `BTreeMap` so iteration order is alphabetical. The first key
36/// to reach the running maximum wins, which means equal counts resolve to the
37/// alphabetically-smallest key.
38pub fn largest_owner(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> String {
39    let mut counts: BTreeMap<String, u32> = BTreeMap::new();
40    for instance in &group.instances {
41        let key = key_for_instance(instance, root, resolver);
42        *counts.entry(key).or_insert(0) += 1;
43    }
44    if counts.is_empty() {
45        return UNOWNED_LABEL.to_string();
46    }
47    let mut best_key: Option<String> = None;
48    let mut best_count: u32 = 0;
49    for (key, count) in counts {
50        if best_key.is_none() || count > best_count {
51            best_count = count;
52            best_key = Some(key);
53        }
54    }
55    best_key.unwrap_or_else(|| UNOWNED_LABEL.to_string())
56}
57
58/// A clone instance plus its per-instance owner key (for inline JSON / SARIF
59/// rendering).
60///
61/// Each instance carries its own `owner` field alongside the standard
62/// `CloneInstance` shape (file / start_line / end_line / start_col / end_col /
63/// fragment), so consumers can attribute instances to resolver keys without
64/// re-resolving paths.
65#[derive(Debug, Clone, Serialize)]
66#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
67pub struct AttributedInstance {
68    /// The original clone instance.
69    #[serde(flatten)]
70    pub instance: CloneInstance,
71    /// Resolver key for this specific instance (per-instance, not the
72    /// group-level largest-owner).
73    pub owner: String,
74}
75
76/// A clone group annotated with its largest-owner attribution and per-instance
77/// owner keys.
78#[derive(Debug, Clone, Serialize)]
79#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
80pub struct AttributedCloneGroup {
81    /// Largest-owner attribution: the resolver key with the most instances in
82    /// this clone group. Ties broken alphabetically (smallest key wins).
83    pub primary_owner: String,
84    pub token_count: usize,
85    pub line_count: usize,
86    /// Each instance carries its own `owner` field alongside the standard
87    /// CloneInstance shape.
88    pub instances: Vec<AttributedInstance>,
89}
90
91impl AttributedCloneGroup {
92    fn from_group(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> Self {
93        let primary_owner = largest_owner(group, root, resolver);
94        let instances = group
95            .instances
96            .iter()
97            .map(|instance| AttributedInstance {
98                owner: key_for_instance(instance, root, resolver),
99                instance: instance.clone(),
100            })
101            .collect();
102        Self {
103            primary_owner,
104            token_count: group.token_count,
105            line_count: group.line_count,
106            instances,
107        }
108    }
109}
110
111/// A single grouped duplication bucket. Per-group `stats` are dedup-aware and
112/// computed over the FULL group BEFORE any `--top` truncation.
113#[derive(Debug, Clone, Serialize)]
114#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
115pub struct DuplicationGroup {
116    /// Group label (owner / directory / package / section). `(unowned)` for
117    /// files with no CODEOWNERS rule, `(no section)` for pre-section rules in
118    /// section mode.
119    pub key: String,
120    pub stats: DuplicationStats,
121    /// Clone groups attributed to this owner, each wrapped with the typed
122    /// `actions[]` array. Each group's `primary_owner` is its largest-owner
123    /// key; per-instance `owner` lets consumers see cross-bucket fan-out
124    /// without re-resolving paths.
125    pub clone_groups: Vec<AttributedCloneGroupFinding>,
126    /// Clone families overlapping this bucket, each wrapped with the typed
127    /// `actions[]` array.
128    pub clone_families: Vec<CloneFamilyFinding>,
129}
130
131/// Wrapper carrying the resolver mode label and grouped buckets.
132#[derive(Debug, Clone, Serialize)]
133pub struct DuplicationGrouping {
134    /// Resolver mode label (`"owner"`, `"directory"`, `"package"`, `"section"`).
135    pub mode: &'static str,
136    /// One bucket per resolver key, sorted most clone groups first with
137    /// `(unowned)` pinned last.
138    pub groups: Vec<DuplicationGroup>,
139}
140
141/// Build the grouped duplication payload from a project-level report.
142///
143/// Aggregation is performed BEFORE any `--top` truncation so per-group stats
144/// reflect the full group, not just the rendered top-N.
145pub fn build_duplication_grouping(
146    report: &DuplicationReport,
147    root: &Path,
148    resolver: &OwnershipResolver,
149) -> DuplicationGrouping {
150    // Bucket clone groups by largest owner.
151    let mut buckets: BTreeMap<String, Vec<AttributedCloneGroup>> = BTreeMap::new();
152    for group in &report.clone_groups {
153        let attributed = AttributedCloneGroup::from_group(group, root, resolver);
154        buckets
155            .entry(attributed.primary_owner.clone())
156            .or_default()
157            .push(attributed);
158    }
159
160    // For each bucket, recompute stats from its clone groups by reusing
161    // `recompute_stats`. Use the original (non-attributed) clone groups to
162    // feed the helper so we share the dedup logic with the project report.
163    let mut groups: Vec<DuplicationGroup> = buckets
164        .into_iter()
165        .map(|(key, attributed_groups)| {
166            // Reconstruct a partial DuplicationReport for stats recomputation.
167            let original_groups: Vec<CloneGroup> = attributed_groups
168                .iter()
169                .map(|ag| CloneGroup {
170                    instances: ag.instances.iter().map(|i| i.instance.clone()).collect(),
171                    token_count: ag.token_count,
172                    line_count: ag.line_count,
173                })
174                .collect();
175            let mut subset = DuplicationReport {
176                clone_groups: original_groups,
177                clone_families: Vec::new(),
178                mirrored_directories: Vec::new(),
179                stats: DuplicationStats {
180                    total_files: report.stats.total_files,
181                    files_with_clones: 0,
182                    total_lines: report.stats.total_lines,
183                    duplicated_lines: 0,
184                    total_tokens: report.stats.total_tokens,
185                    duplicated_tokens: 0,
186                    clone_groups: 0,
187                    clone_instances: 0,
188                    duplication_percentage: 0.0,
189                    clone_groups_below_min_occurrences: report
190                        .stats
191                        .clone_groups_below_min_occurrences,
192                },
193            };
194            subset.stats = recompute_stats(&subset);
195
196            // Restrict clone families to those whose group memberships overlap
197            // this bucket. Using a file-set membership check matches how the
198            // project-level report treats families: a family's groups must all
199            // share its file set.
200            let bucket_files: FxHashSet<&Path> = attributed_groups
201                .iter()
202                .flat_map(|ag| ag.instances.iter().map(|i| i.instance.file.as_path()))
203                .collect();
204            let clone_families: Vec<CloneFamilyFinding> = report
205                .clone_families
206                .iter()
207                .filter(|f| f.files.iter().any(|fp| bucket_files.contains(fp.as_path())))
208                .cloned()
209                .map(CloneFamilyFinding::with_actions)
210                .collect();
211
212            let clone_groups: Vec<AttributedCloneGroupFinding> = attributed_groups
213                .into_iter()
214                .map(AttributedCloneGroupFinding::with_actions)
215                .collect();
216
217            DuplicationGroup {
218                key,
219                stats: subset.stats,
220                clone_groups,
221                clone_families,
222            }
223        })
224        .collect();
225
226    // Sort: most clone groups first, alphabetical tiebreak, (unowned) last.
227    groups.sort_by(|a, b| {
228        let a_unowned = a.key == UNOWNED_LABEL;
229        let b_unowned = b.key == UNOWNED_LABEL;
230        match (a_unowned, b_unowned) {
231            (true, false) => std::cmp::Ordering::Greater,
232            (false, true) => std::cmp::Ordering::Less,
233            _ => b
234                .clone_groups
235                .len()
236                .cmp(&a.clone_groups.len())
237                .then_with(|| a.key.cmp(&b.key)),
238        }
239    });
240
241    DuplicationGrouping {
242        mode: resolver.mode_label(),
243        groups,
244    }
245}
246
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use fallow_core::duplicates::{CloneInstance, DuplicationStats};

    use super::*;
    use crate::codeowners::CodeOwners;

    /// Project root that every fixture path lives under.
    const ROOT: &str = "/root";

    /// A clone instance at `path` spanning lines 1 to 10, with zeroed columns
    /// and an empty fragment.
    fn clone_at(path: &str) -> CloneInstance {
        CloneInstance {
            file: PathBuf::from(path),
            start_line: 1,
            end_line: 10,
            start_col: 0,
            end_col: 0,
            fragment: String::new(),
        }
    }

    /// A clone group (50 tokens, 10 lines) with one instance per path.
    fn group_of(paths: &[&str]) -> CloneGroup {
        CloneGroup {
            instances: paths.iter().copied().map(clone_at).collect(),
            token_count: 50,
            line_count: 10,
        }
    }

    /// A report over `clone_groups` with 10 files / 1000 lines in total and
    /// all other stats left at their defaults.
    fn report_of(clone_groups: Vec<CloneGroup>) -> DuplicationReport {
        let stats = DuplicationStats {
            total_files: 10,
            total_lines: 1000,
            ..Default::default()
        };
        DuplicationReport {
            clone_groups,
            clone_families: vec![],
            mirrored_directories: vec![],
            stats,
        }
    }

    /// Largest owner of `clones` under the directory resolver.
    fn directory_owner(clones: &CloneGroup) -> String {
        largest_owner(clones, Path::new(ROOT), &OwnershipResolver::Directory)
    }

    /// Grouping of `report` under the directory resolver.
    fn directory_grouping(report: &DuplicationReport) -> DuplicationGrouping {
        build_duplication_grouping(report, Path::new(ROOT), &OwnershipResolver::Directory)
    }

    #[test]
    fn largest_owner_majority_wins() {
        let clones = group_of(&["/root/src/a.ts", "/root/src/b.ts", "/root/lib/c.ts"]);
        let owner = directory_owner(&clones);
        assert_eq!(owner, "src", "src has 2 instances vs lib's 1");
    }

    #[test]
    fn largest_owner_alphabetical_tiebreak() {
        // One instance each, so the alphabetical rule decides: lib < src.
        let clones = group_of(&["/root/src/a.ts", "/root/lib/b.ts"]);
        assert_eq!(directory_owner(&clones), "lib");
    }

    #[test]
    fn largest_owner_three_way_tie_alphabetical() {
        let clones = group_of(&["/root/zeta/a.ts", "/root/alpha/b.ts", "/root/beta/c.ts"]);
        assert_eq!(directory_owner(&clones), "alpha");
    }

    #[test]
    fn build_grouping_partitions_clone_groups() {
        let in_src = group_of(&["/root/src/a.ts", "/root/src/b.ts"]);
        let in_lib = group_of(&["/root/lib/x.ts", "/root/lib/y.ts"]);
        let grouping = directory_grouping(&report_of(vec![in_src, in_lib]));

        assert_eq!(grouping.groups.len(), 2);
        let lib = grouping.groups.iter().find(|g| g.key == "lib").unwrap();
        let src = grouping.groups.iter().find(|g| g.key == "src").unwrap();
        assert_eq!(lib.clone_groups.len(), 1);
        assert_eq!(src.clone_groups.len(), 1);
    }

    #[test]
    fn build_grouping_unowned_pinned_last() {
        // `/src/` is owned by @frontend; `docs` matches no rule and is unowned.
        let resolver = OwnershipResolver::Owner(CodeOwners::parse("/src/ @frontend\n").unwrap());
        let owned = group_of(&["/root/src/a.ts", "/root/src/b.ts"]);
        let unowned = group_of(&["/root/docs/a.md", "/root/docs/b.md"]);
        let source = report_of(vec![owned, unowned]);

        let grouping = build_duplication_grouping(&source, Path::new(ROOT), &resolver);

        assert_eq!(grouping.groups.len(), 2);
        // The unowned bucket must come last.
        assert_eq!(grouping.groups.last().unwrap().key, UNOWNED_LABEL);
    }

    #[test]
    fn build_grouping_per_instance_owner_inline() {
        let mixed = group_of(&["/root/src/a.ts", "/root/src/b.ts", "/root/lib/c.ts"]);
        let grouping = directory_grouping(&report_of(vec![mixed]));

        // src=2 and lib=1, so the single bucket is `src`; every instance still
        // reports its own owner.
        assert_eq!(grouping.groups.len(), 1);
        let bucket = &grouping.groups[0];
        assert_eq!(bucket.key, "src");
        assert_eq!(bucket.clone_groups.len(), 1);

        let finding = &bucket.clone_groups[0];
        let attributed = &finding.group;
        assert_eq!(attributed.primary_owner, "src");
        assert_eq!(attributed.instances.len(), 3);

        let owners: Vec<&str> = attributed
            .instances
            .iter()
            .map(|i| i.owner.as_str())
            .collect();
        assert!(owners.contains(&"src"));
        assert!(owners.contains(&"lib"));

        // Every AttributedCloneGroupFinding carries the canonical 2-action
        // array (extract-shared + suppress-line).
        assert_eq!(finding.actions.len(), 2);
    }

    #[test]
    fn empty_report_produces_empty_grouping() {
        let grouping = directory_grouping(&DuplicationReport::default());
        assert!(grouping.groups.is_empty());
    }
}
395}