Skip to main content

fallow_cli/report/
dupes_grouping.rs

1//! Per-group attribution for `fallow dupes --group-by`.
2//!
3//! For each `CloneGroup`, every instance is attributed to a group key (owner,
4//! directory, package, or section) via the same [`OwnershipResolver`] used by
5//! `check` and `health`. The group itself is then attributed to its
6//! **largest owner**: the key with the most instances in that clone group.
7//! Ties are broken alphabetically (lexicographic ascending).
8//!
9//! This mirrors jscpd's majority-owner attribution and avoids the
10//! positional non-determinism that a "first-instance-wins" rule would
11//! introduce, since `DuplicationReport::sort()` already orders instances
12//! deterministically by file path then line.
13
14use std::collections::BTreeMap;
15use std::path::Path;
16
17use fallow_core::duplicates::{
18    CloneFingerprintSet, CloneGroup, CloneInstance, DuplicationReport, DuplicationStats,
19};
20use rustc_hash::FxHashSet;
21use serde::Serialize;
22
23use super::grouping::OwnershipResolver;
24use super::relative_path;
25use crate::baseline::recompute_stats;
26use crate::codeowners::UNOWNED_LABEL;
27use crate::output_dupes::{AttributedCloneGroupFinding, CloneFamilyFinding};
28
29/// Resolve the group key for a single instance file.
30fn key_for_instance(instance: &CloneInstance, root: &Path, resolver: &OwnershipResolver) -> String {
31    resolver.resolve(relative_path(&instance.file, root))
32}
33
34/// Pick the largest owner for a clone group: most instances wins, ties broken
35/// alphabetically (smallest key wins).
36///
37/// Iterates a `BTreeMap` so iteration order is alphabetical. The first key
38/// to reach the running maximum wins, which means equal counts resolve to the
39/// alphabetically-smallest key.
40pub fn largest_owner(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> String {
41    let mut counts: BTreeMap<String, u32> = BTreeMap::new();
42    for instance in &group.instances {
43        let key = key_for_instance(instance, root, resolver);
44        *counts.entry(key).or_insert(0) += 1;
45    }
46    if counts.is_empty() {
47        return UNOWNED_LABEL.to_string();
48    }
49    let mut best_key: Option<String> = None;
50    let mut best_count: u32 = 0;
51    for (key, count) in counts {
52        if best_key.is_none() || count > best_count {
53            best_count = count;
54            best_key = Some(key);
55        }
56    }
57    best_key.unwrap_or_else(|| UNOWNED_LABEL.to_string())
58}
59
/// A clone instance plus its per-instance owner key (for inline JSON / SARIF
/// rendering).
///
/// Each instance carries its own `owner` field alongside the standard
/// `CloneInstance` shape (file / start_line / end_line / start_col / end_col /
/// fragment), so consumers can attribute instances to resolver keys without
/// re-resolving paths.
#[derive(Debug, Clone, Serialize)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct AttributedInstance {
    /// The original clone instance. Flattened on serialization, so its
    /// fields appear at the same level as `owner` instead of under a nested
    /// `instance` key.
    #[serde(flatten)]
    pub instance: CloneInstance,
    /// Resolver key for this specific instance (per-instance, not the
    /// group-level largest-owner).
    pub owner: String,
}
77
/// A clone group annotated with its largest-owner attribution and per-instance
/// owner keys.
#[derive(Debug, Clone, Serialize)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct AttributedCloneGroup {
    /// Largest-owner attribution: the resolver key with the most instances in
    /// this clone group. Ties broken alphabetically (smallest key wins).
    pub primary_owner: String,
    /// Token count carried over unchanged from the source `CloneGroup`.
    pub token_count: usize,
    /// Line count carried over unchanged from the source `CloneGroup`.
    pub line_count: usize,
    /// Each instance carries its own `owner` field alongside the standard
    /// CloneInstance shape.
    pub instances: Vec<AttributedInstance>,
}
92
93impl AttributedCloneGroup {
94    fn from_group(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> Self {
95        let primary_owner = largest_owner(group, root, resolver);
96        let instances = group
97            .instances
98            .iter()
99            .map(|instance| AttributedInstance {
100                owner: key_for_instance(instance, root, resolver),
101                instance: instance.clone(),
102            })
103            .collect();
104        Self {
105            primary_owner,
106            token_count: group.token_count,
107            line_count: group.line_count,
108            instances,
109        }
110    }
111
112    fn fingerprint(&self, fingerprints: &CloneFingerprintSet) -> String {
113        let instances: Vec<_> = self
114            .instances
115            .iter()
116            .map(|instance| instance.instance.clone())
117            .collect();
118        fingerprints.fingerprint_for_parts(&instances, self.token_count, self.line_count)
119    }
120}
121
/// A single grouped duplication bucket. Per-group `stats` are dedup-aware and
/// computed over the FULL group BEFORE any `--top` truncation.
#[derive(Debug, Clone, Serialize)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct DuplicationGroup {
    /// Group label (owner / directory / package / section). `(unowned)` for
    /// files with no CODEOWNERS rule, `(no section)` for pre-section rules in
    /// section mode.
    pub key: String,
    /// Duplication statistics recomputed over only the clone groups
    /// attributed to this bucket; project-wide totals (files, lines, tokens)
    /// are seeded from the full report before recomputation.
    pub stats: DuplicationStats,
    /// Clone groups attributed to this owner, each wrapped with the typed
    /// `actions[]` array. Each group's `primary_owner` is its largest-owner
    /// key; per-instance `owner` lets consumers see cross-bucket fan-out
    /// without re-resolving paths.
    pub clone_groups: Vec<AttributedCloneGroupFinding>,
    /// Clone families overlapping this bucket, each wrapped with the typed
    /// `actions[]` array.
    pub clone_families: Vec<CloneFamilyFinding>,
}
141
/// Wrapper carrying the resolver mode label and grouped buckets.
///
/// This is the value returned by `build_duplication_grouping`.
#[derive(Debug, Clone, Serialize)]
pub struct DuplicationGrouping {
    /// Resolver mode label (`"owner"`, `"directory"`, `"package"`, `"section"`).
    pub mode: &'static str,
    /// One bucket per resolver key, sorted most clone groups first with
    /// `(unowned)` pinned last. Buckets with equal clone-group counts are
    /// ordered by key, ascending.
    pub groups: Vec<DuplicationGroup>,
}
151
152/// Build the grouped duplication payload from a project-level report.
153///
154/// Aggregation is performed BEFORE any `--top` truncation so per-group stats
155/// reflect the full group, not just the rendered top-N.
156pub fn build_duplication_grouping(
157    report: &DuplicationReport,
158    root: &Path,
159    resolver: &OwnershipResolver,
160) -> DuplicationGrouping {
161    let fingerprints = CloneFingerprintSet::from_groups(&report.clone_groups);
162    let buckets = build_attributed_clone_buckets(report, root, resolver);
163    let mut groups: Vec<DuplicationGroup> = buckets
164        .into_iter()
165        .map(|(key, groups)| duplication_group(key, groups, report, &fingerprints))
166        .collect();
167    sort_duplication_groups(&mut groups);
168
169    DuplicationGrouping {
170        mode: resolver.mode_label(),
171        groups,
172    }
173}
174
175fn build_attributed_clone_buckets(
176    report: &DuplicationReport,
177    root: &Path,
178    resolver: &OwnershipResolver,
179) -> BTreeMap<String, Vec<AttributedCloneGroup>> {
180    let mut buckets: BTreeMap<String, Vec<AttributedCloneGroup>> = BTreeMap::new();
181    for group in &report.clone_groups {
182        let attributed = AttributedCloneGroup::from_group(group, root, resolver);
183        buckets
184            .entry(attributed.primary_owner.clone())
185            .or_default()
186            .push(attributed);
187    }
188    buckets
189}
190
191fn duplication_group(
192    key: String,
193    attributed_groups: Vec<AttributedCloneGroup>,
194    report: &DuplicationReport,
195    fingerprints: &CloneFingerprintSet,
196) -> DuplicationGroup {
197    let mut subset = duplication_subset_report(&attributed_groups, report);
198    subset.stats = recompute_stats(&subset);
199    let clone_families = clone_families_for_bucket(&attributed_groups, report, fingerprints);
200    let clone_groups = attributed_groups
201        .into_iter()
202        .map(|group| {
203            let fingerprint = group.fingerprint(fingerprints);
204            AttributedCloneGroupFinding::with_fingerprint(group, fingerprint)
205        })
206        .collect();
207
208    DuplicationGroup {
209        key,
210        stats: subset.stats,
211        clone_groups,
212        clone_families,
213    }
214}
215
216fn duplication_subset_report(
217    attributed_groups: &[AttributedCloneGroup],
218    report: &DuplicationReport,
219) -> DuplicationReport {
220    DuplicationReport {
221        clone_groups: attributed_groups
222            .iter()
223            .map(|group| CloneGroup {
224                instances: group
225                    .instances
226                    .iter()
227                    .map(|instance| instance.instance.clone())
228                    .collect(),
229                token_count: group.token_count,
230                line_count: group.line_count,
231            })
232            .collect(),
233        clone_families: Vec::new(),
234        mirrored_directories: Vec::new(),
235        stats: DuplicationStats {
236            total_files: report.stats.total_files,
237            files_with_clones: 0,
238            total_lines: report.stats.total_lines,
239            duplicated_lines: 0,
240            total_tokens: report.stats.total_tokens,
241            duplicated_tokens: 0,
242            clone_groups: 0,
243            clone_instances: 0,
244            duplication_percentage: 0.0,
245            clone_groups_below_min_occurrences: report.stats.clone_groups_below_min_occurrences,
246        },
247    }
248}
249
250fn clone_families_for_bucket(
251    attributed_groups: &[AttributedCloneGroup],
252    report: &DuplicationReport,
253    fingerprints: &CloneFingerprintSet,
254) -> Vec<CloneFamilyFinding> {
255    let bucket_files: FxHashSet<&Path> = attributed_groups
256        .iter()
257        .flat_map(|group| group.instances.iter().map(|i| i.instance.file.as_path()))
258        .collect();
259
260    report
261        .clone_families
262        .iter()
263        .filter(|family| {
264            family
265                .files
266                .iter()
267                .any(|path| bucket_files.contains(path.as_path()))
268        })
269        .map(|family| CloneFamilyFinding::with_fingerprints(family.clone(), fingerprints))
270        .collect()
271}
272
273fn sort_duplication_groups(groups: &mut [DuplicationGroup]) {
274    groups.sort_by(|a, b| {
275        let a_unowned = a.key == UNOWNED_LABEL;
276        let b_unowned = b.key == UNOWNED_LABEL;
277        match (a_unowned, b_unowned) {
278            (true, false) => std::cmp::Ordering::Greater,
279            (false, true) => std::cmp::Ordering::Less,
280            _ => b
281                .clone_groups
282                .len()
283                .cmp(&a.clone_groups.len())
284                .then_with(|| a.key.cmp(&b.key)),
285        }
286    });
287}
288
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use fallow_core::duplicates::{CloneInstance, DuplicationStats};

    use super::*;
    use crate::codeowners::CodeOwners;

    /// Build a clone instance for `path` spanning `start..=end`, with zeroed
    /// columns and an empty fragment.
    fn instance(path: &str, start: usize, end: usize) -> CloneInstance {
        CloneInstance {
            file: PathBuf::from(path),
            start_line: start,
            end_line: end,
            start_col: 0,
            end_col: 0,
            fragment: String::new(),
        }
    }

    /// Build a clone group with fixed token / line counts.
    fn group(instances: Vec<CloneInstance>) -> CloneGroup {
        CloneGroup {
            instances,
            token_count: 50,
            line_count: 10,
        }
    }

    /// Build a report holding `groups`, with fixed project-wide totals and no
    /// clone families or mirrored directories.
    fn report(groups: Vec<CloneGroup>) -> DuplicationReport {
        DuplicationReport {
            clone_groups: groups,
            clone_families: vec![],
            mirrored_directories: vec![],
            stats: DuplicationStats {
                total_files: 10,
                total_lines: 1000,
                ..Default::default()
            },
        }
    }

    #[test]
    fn largest_owner_majority_wins() {
        let r = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
            instance("/root/lib/c.ts", 1, 10),
        ]);
        let key = largest_owner(&r, Path::new("/root"), &OwnershipResolver::Directory);
        assert_eq!(key, "src", "src has 2 instances vs lib's 1");
    }

    #[test]
    fn largest_owner_alphabetical_tiebreak() {
        let r = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/lib/b.ts", 1, 10),
        ]);
        let key = largest_owner(&r, Path::new("/root"), &OwnershipResolver::Directory);
        assert_eq!(key, "lib");
    }

    #[test]
    fn largest_owner_three_way_tie_alphabetical() {
        let r = group(vec![
            instance("/root/zeta/a.ts", 1, 10),
            instance("/root/alpha/b.ts", 1, 10),
            instance("/root/beta/c.ts", 1, 10),
        ]);
        let key = largest_owner(&r, Path::new("/root"), &OwnershipResolver::Directory);
        assert_eq!(key, "alpha");
    }

    /// A group without instances has nothing to count, so it falls back to
    /// the unowned label.
    #[test]
    fn largest_owner_empty_group_is_unowned() {
        let r = group(vec![]);
        let key = largest_owner(&r, Path::new("/root"), &OwnershipResolver::Directory);
        assert_eq!(key, UNOWNED_LABEL);
    }

    #[test]
    fn build_grouping_partitions_clone_groups() {
        let g1 = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
        ]);
        let g2 = group(vec![
            instance("/root/lib/x.ts", 1, 10),
            instance("/root/lib/y.ts", 1, 10),
        ]);
        let r = report(vec![g1, g2]);
        let grouping =
            build_duplication_grouping(&r, Path::new("/root"), &OwnershipResolver::Directory);
        assert_eq!(grouping.groups.len(), 2);
        let lib = grouping.groups.iter().find(|g| g.key == "lib").unwrap();
        let src = grouping.groups.iter().find(|g| g.key == "src").unwrap();
        assert_eq!(lib.clone_groups.len(), 1);
        assert_eq!(src.clone_groups.len(), 1);
    }

    /// The bucket with more clone groups must come first even though its key
    /// ("src") sorts after the smaller bucket's key ("lib") alphabetically.
    #[test]
    fn build_grouping_sorts_most_clone_groups_first() {
        let g_lib = group(vec![
            instance("/root/lib/x.ts", 1, 10),
            instance("/root/lib/y.ts", 1, 10),
        ]);
        let g_src_one = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
        ]);
        let g_src_two = group(vec![
            instance("/root/src/c.ts", 20, 30),
            instance("/root/src/d.ts", 20, 30),
        ]);
        let r = report(vec![g_lib, g_src_one, g_src_two]);
        let grouping =
            build_duplication_grouping(&r, Path::new("/root"), &OwnershipResolver::Directory);
        let keys: Vec<&str> = grouping.groups.iter().map(|g| g.key.as_str()).collect();
        assert_eq!(keys, vec!["src", "lib"]);
        assert_eq!(grouping.groups[0].clone_groups.len(), 2);
        assert_eq!(grouping.groups[1].clone_groups.len(), 1);
    }

    #[test]
    fn build_grouping_unowned_pinned_last() {
        let co = CodeOwners::parse("/src/ @frontend\n").unwrap();
        let resolver = OwnershipResolver::Owner(co);
        let g_src = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
        ]);
        let g_docs = group(vec![
            instance("/root/docs/a.md", 1, 10),
            instance("/root/docs/b.md", 1, 10),
        ]);
        let r = report(vec![g_src, g_docs]);
        let grouping = build_duplication_grouping(&r, Path::new("/root"), &resolver);
        assert_eq!(grouping.groups.len(), 2);
        assert_eq!(grouping.groups.last().unwrap().key, UNOWNED_LABEL);
    }

    #[test]
    fn build_grouping_per_instance_owner_inline() {
        let g = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
            instance("/root/lib/c.ts", 1, 10),
        ]);
        let r = report(vec![g]);
        let grouping =
            build_duplication_grouping(&r, Path::new("/root"), &OwnershipResolver::Directory);
        assert_eq!(grouping.groups.len(), 1);
        let bucket = &grouping.groups[0];
        assert_eq!(bucket.key, "src");
        assert_eq!(bucket.clone_groups.len(), 1);
        let finding = &bucket.clone_groups[0];
        let cg = &finding.group;
        assert_eq!(cg.primary_owner, "src");
        assert_eq!(cg.instances.len(), 3);
        let owners: Vec<&str> = cg.instances.iter().map(|i| i.owner.as_str()).collect();
        assert!(owners.contains(&"src"));
        assert!(owners.contains(&"lib"));
        assert_eq!(finding.actions.len(), 2);
    }

    #[test]
    fn empty_report_produces_empty_grouping() {
        let r = DuplicationReport::default();
        let grouping =
            build_duplication_grouping(&r, Path::new("/root"), &OwnershipResolver::Directory);
        assert!(grouping.groups.is_empty());
    }
}