1use std::collections::BTreeMap;
15use std::path::Path;
16
17use fallow_core::duplicates::{CloneGroup, CloneInstance, DuplicationReport, DuplicationStats};
18use rustc_hash::FxHashSet;
19use serde::Serialize;
20
21use super::grouping::OwnershipResolver;
22use super::relative_path;
23use crate::baseline::recompute_stats;
24use crate::codeowners::UNOWNED_LABEL;
25use crate::output_dupes::{AttributedCloneGroupFinding, CloneFamilyFinding};
26
27fn key_for_instance(instance: &CloneInstance, root: &Path, resolver: &OwnershipResolver) -> String {
29 resolver.resolve(relative_path(&instance.file, root))
30}
31
32pub fn largest_owner(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> String {
39 let mut counts: BTreeMap<String, u32> = BTreeMap::new();
40 for instance in &group.instances {
41 let key = key_for_instance(instance, root, resolver);
42 *counts.entry(key).or_insert(0) += 1;
43 }
44 if counts.is_empty() {
45 return UNOWNED_LABEL.to_string();
46 }
47 let mut best_key: Option<String> = None;
48 let mut best_count: u32 = 0;
49 for (key, count) in counts {
50 if best_key.is_none() || count > best_count {
51 best_count = count;
52 best_key = Some(key);
53 }
54 }
55 best_key.unwrap_or_else(|| UNOWNED_LABEL.to_string())
56}
57
58#[derive(Debug, Clone, Serialize)]
66#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
67pub struct AttributedInstance {
68 #[serde(flatten)]
70 pub instance: CloneInstance,
71 pub owner: String,
74}
75
76#[derive(Debug, Clone, Serialize)]
79#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
80pub struct AttributedCloneGroup {
81 pub primary_owner: String,
84 pub token_count: usize,
85 pub line_count: usize,
86 pub instances: Vec<AttributedInstance>,
89}
90
91impl AttributedCloneGroup {
92 fn from_group(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> Self {
93 let primary_owner = largest_owner(group, root, resolver);
94 let instances = group
95 .instances
96 .iter()
97 .map(|instance| AttributedInstance {
98 owner: key_for_instance(instance, root, resolver),
99 instance: instance.clone(),
100 })
101 .collect();
102 Self {
103 primary_owner,
104 token_count: group.token_count,
105 line_count: group.line_count,
106 instances,
107 }
108 }
109}
110
111#[derive(Debug, Clone, Serialize)]
114#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
115pub struct DuplicationGroup {
116 pub key: String,
120 pub stats: DuplicationStats,
121 pub clone_groups: Vec<AttributedCloneGroupFinding>,
126 pub clone_families: Vec<CloneFamilyFinding>,
129}
130
131#[derive(Debug, Clone, Serialize)]
133pub struct DuplicationGrouping {
134 pub mode: &'static str,
136 pub groups: Vec<DuplicationGroup>,
139}
140
141pub fn build_duplication_grouping(
146 report: &DuplicationReport,
147 root: &Path,
148 resolver: &OwnershipResolver,
149) -> DuplicationGrouping {
150 let mut buckets: BTreeMap<String, Vec<AttributedCloneGroup>> = BTreeMap::new();
152 for group in &report.clone_groups {
153 let attributed = AttributedCloneGroup::from_group(group, root, resolver);
154 buckets
155 .entry(attributed.primary_owner.clone())
156 .or_default()
157 .push(attributed);
158 }
159
160 let mut groups: Vec<DuplicationGroup> = buckets
164 .into_iter()
165 .map(|(key, attributed_groups)| {
166 let original_groups: Vec<CloneGroup> = attributed_groups
168 .iter()
169 .map(|ag| CloneGroup {
170 instances: ag.instances.iter().map(|i| i.instance.clone()).collect(),
171 token_count: ag.token_count,
172 line_count: ag.line_count,
173 })
174 .collect();
175 let mut subset = DuplicationReport {
176 clone_groups: original_groups,
177 clone_families: Vec::new(),
178 mirrored_directories: Vec::new(),
179 stats: DuplicationStats {
180 total_files: report.stats.total_files,
181 files_with_clones: 0,
182 total_lines: report.stats.total_lines,
183 duplicated_lines: 0,
184 total_tokens: report.stats.total_tokens,
185 duplicated_tokens: 0,
186 clone_groups: 0,
187 clone_instances: 0,
188 duplication_percentage: 0.0,
189 clone_groups_below_min_occurrences: report
190 .stats
191 .clone_groups_below_min_occurrences,
192 },
193 };
194 subset.stats = recompute_stats(&subset);
195
196 let bucket_files: FxHashSet<&Path> = attributed_groups
201 .iter()
202 .flat_map(|ag| ag.instances.iter().map(|i| i.instance.file.as_path()))
203 .collect();
204 let clone_families: Vec<CloneFamilyFinding> = report
205 .clone_families
206 .iter()
207 .filter(|f| f.files.iter().any(|fp| bucket_files.contains(fp.as_path())))
208 .cloned()
209 .map(CloneFamilyFinding::with_actions)
210 .collect();
211
212 let clone_groups: Vec<AttributedCloneGroupFinding> = attributed_groups
213 .into_iter()
214 .map(AttributedCloneGroupFinding::with_actions)
215 .collect();
216
217 DuplicationGroup {
218 key,
219 stats: subset.stats,
220 clone_groups,
221 clone_families,
222 }
223 })
224 .collect();
225
226 groups.sort_by(|a, b| {
228 let a_unowned = a.key == UNOWNED_LABEL;
229 let b_unowned = b.key == UNOWNED_LABEL;
230 match (a_unowned, b_unowned) {
231 (true, false) => std::cmp::Ordering::Greater,
232 (false, true) => std::cmp::Ordering::Less,
233 _ => b
234 .clone_groups
235 .len()
236 .cmp(&a.clone_groups.len())
237 .then_with(|| a.key.cmp(&b.key)),
238 }
239 });
240
241 DuplicationGrouping {
242 mode: resolver.mode_label(),
243 groups,
244 }
245}
246
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use fallow_core::duplicates::{CloneInstance, DuplicationStats};

    use super::*;
    use crate::codeowners::CodeOwners;

    /// Project root that every fixture path in this module lives under.
    fn root() -> &'static Path {
        Path::new("/root")
    }

    /// Clone instance for `path` with the given start and end lines; columns
    /// are zero and the fragment is empty.
    fn instance(path: &str, start_line: usize, end_line: usize) -> CloneInstance {
        CloneInstance {
            file: PathBuf::from(path),
            start_line,
            end_line,
            start_col: 0,
            end_col: 0,
            fragment: String::new(),
        }
    }

    /// Clone group over `instances` with fixed token and line counts.
    fn group(instances: Vec<CloneInstance>) -> CloneGroup {
        CloneGroup {
            instances,
            token_count: 50,
            line_count: 10,
        }
    }

    /// Report holding `groups`, with no families or mirrored directories and
    /// only the file and line totals filled in.
    fn report(groups: Vec<CloneGroup>) -> DuplicationReport {
        DuplicationReport {
            clone_groups: groups,
            clone_families: Vec::new(),
            mirrored_directories: Vec::new(),
            stats: DuplicationStats {
                total_files: 10,
                total_lines: 1000,
                ..DuplicationStats::default()
            },
        }
    }

    #[test]
    fn largest_owner_majority_wins() {
        let clones = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
            instance("/root/lib/c.ts", 1, 10),
        ]);
        let owner = largest_owner(&clones, root(), &OwnershipResolver::Directory);
        assert_eq!(owner, "src", "src has 2 instances vs lib's 1");
    }

    #[test]
    fn largest_owner_alphabetical_tiebreak() {
        let clones = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/lib/b.ts", 1, 10),
        ]);
        let owner = largest_owner(&clones, root(), &OwnershipResolver::Directory);
        // One instance each: the tie goes to the smaller key.
        assert_eq!(owner, "lib");
    }

    #[test]
    fn largest_owner_three_way_tie_alphabetical() {
        let clones = group(vec![
            instance("/root/zeta/a.ts", 1, 10),
            instance("/root/alpha/b.ts", 1, 10),
            instance("/root/beta/c.ts", 1, 10),
        ]);
        let owner = largest_owner(&clones, root(), &OwnershipResolver::Directory);
        assert_eq!(owner, "alpha");
    }

    #[test]
    fn build_grouping_partitions_clone_groups() {
        let src_clones = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
        ]);
        let lib_clones = group(vec![
            instance("/root/lib/x.ts", 1, 10),
            instance("/root/lib/y.ts", 1, 10),
        ]);
        let dup_report = report(vec![src_clones, lib_clones]);
        let grouping =
            build_duplication_grouping(&dup_report, root(), &OwnershipResolver::Directory);
        assert_eq!(grouping.groups.len(), 2);
        let lib_bucket = grouping
            .groups
            .iter()
            .find(|bucket| bucket.key == "lib")
            .unwrap();
        let src_bucket = grouping
            .groups
            .iter()
            .find(|bucket| bucket.key == "src")
            .unwrap();
        assert_eq!(lib_bucket.clone_groups.len(), 1);
        assert_eq!(src_bucket.clone_groups.len(), 1);
    }

    #[test]
    fn build_grouping_unowned_pinned_last() {
        let resolver = OwnershipResolver::Owner(CodeOwners::parse("/src/ @frontend\n").unwrap());
        // Only /src/ has a CODEOWNERS rule; the docs/ files match none.
        let owned_clones = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
        ]);
        let unowned_clones = group(vec![
            instance("/root/docs/a.md", 1, 10),
            instance("/root/docs/b.md", 1, 10),
        ]);
        let dup_report = report(vec![owned_clones, unowned_clones]);
        let grouping = build_duplication_grouping(&dup_report, root(), &resolver);
        assert_eq!(grouping.groups.len(), 2);
        assert_eq!(grouping.groups.last().unwrap().key, UNOWNED_LABEL);
    }

    #[test]
    fn build_grouping_per_instance_owner_inline() {
        let mixed_clones = group(vec![
            instance("/root/src/a.ts", 1, 10),
            instance("/root/src/b.ts", 1, 10),
            instance("/root/lib/c.ts", 1, 10),
        ]);
        let dup_report = report(vec![mixed_clones]);
        let grouping =
            build_duplication_grouping(&dup_report, root(), &OwnershipResolver::Directory);
        // A single clone group lands in exactly one bucket: its primary owner's.
        assert_eq!(grouping.groups.len(), 1);
        let bucket = &grouping.groups[0];
        assert_eq!(bucket.key, "src");
        assert_eq!(bucket.clone_groups.len(), 1);
        let finding = &bucket.clone_groups[0];
        let attributed = &finding.group;
        assert_eq!(attributed.primary_owner, "src");
        assert_eq!(attributed.instances.len(), 3);
        // Each instance keeps its own owner even though the bucket is "src".
        assert!(attributed.instances.iter().any(|i| i.owner == "src"));
        assert!(attributed.instances.iter().any(|i| i.owner == "lib"));
        // The action count comes from `AttributedCloneGroupFinding::with_actions`,
        // which is defined outside this file.
        assert_eq!(finding.actions.len(), 2);
    }

    #[test]
    fn empty_report_produces_empty_grouping() {
        let empty = DuplicationReport::default();
        let grouping = build_duplication_grouping(&empty, root(), &OwnershipResolver::Directory);
        assert!(grouping.groups.is_empty());
    }
}