1use std::collections::BTreeMap;
15use std::path::Path;
16
17use fallow_core::duplicates::{
18 CloneFingerprintSet, CloneGroup, CloneInstance, DuplicationReport, DuplicationStats,
19};
20use rustc_hash::FxHashSet;
21use serde::Serialize;
22
23use super::grouping::OwnershipResolver;
24use super::relative_path;
25use crate::baseline::recompute_stats;
26use crate::codeowners::UNOWNED_LABEL;
27use crate::output_dupes::{AttributedCloneGroupFinding, CloneFamilyFinding};
28
29fn key_for_instance(instance: &CloneInstance, root: &Path, resolver: &OwnershipResolver) -> String {
31 resolver.resolve(relative_path(&instance.file, root))
32}
33
34pub fn largest_owner(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> String {
41 let mut counts: BTreeMap<String, u32> = BTreeMap::new();
42 for instance in &group.instances {
43 let key = key_for_instance(instance, root, resolver);
44 *counts.entry(key).or_insert(0) += 1;
45 }
46 if counts.is_empty() {
47 return UNOWNED_LABEL.to_string();
48 }
49 let mut best_key: Option<String> = None;
50 let mut best_count: u32 = 0;
51 for (key, count) in counts {
52 if best_key.is_none() || count > best_count {
53 best_count = count;
54 best_key = Some(key);
55 }
56 }
57 best_key.unwrap_or_else(|| UNOWNED_LABEL.to_string())
58}
59
60#[derive(Debug, Clone, Serialize)]
68#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
69pub struct AttributedInstance {
70 #[serde(flatten)]
72 pub instance: CloneInstance,
73 pub owner: String,
76}
77
78#[derive(Debug, Clone, Serialize)]
81#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
82pub struct AttributedCloneGroup {
83 pub primary_owner: String,
86 pub token_count: usize,
87 pub line_count: usize,
88 pub instances: Vec<AttributedInstance>,
91}
92
93impl AttributedCloneGroup {
94 fn from_group(group: &CloneGroup, root: &Path, resolver: &OwnershipResolver) -> Self {
95 let primary_owner = largest_owner(group, root, resolver);
96 let instances = group
97 .instances
98 .iter()
99 .map(|instance| AttributedInstance {
100 owner: key_for_instance(instance, root, resolver),
101 instance: instance.clone(),
102 })
103 .collect();
104 Self {
105 primary_owner,
106 token_count: group.token_count,
107 line_count: group.line_count,
108 instances,
109 }
110 }
111
112 fn fingerprint(&self, fingerprints: &CloneFingerprintSet) -> String {
113 let instances: Vec<_> = self
114 .instances
115 .iter()
116 .map(|instance| instance.instance.clone())
117 .collect();
118 fingerprints.fingerprint_for_parts(&instances, self.token_count, self.line_count)
119 }
120}
121
122#[derive(Debug, Clone, Serialize)]
125#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
126pub struct DuplicationGroup {
127 pub key: String,
131 pub stats: DuplicationStats,
132 pub clone_groups: Vec<AttributedCloneGroupFinding>,
137 pub clone_families: Vec<CloneFamilyFinding>,
140}
141
142#[derive(Debug, Clone, Serialize)]
144pub struct DuplicationGrouping {
145 pub mode: &'static str,
147 pub groups: Vec<DuplicationGroup>,
150}
151
152pub fn build_duplication_grouping(
157 report: &DuplicationReport,
158 root: &Path,
159 resolver: &OwnershipResolver,
160) -> DuplicationGrouping {
161 let fingerprints = CloneFingerprintSet::from_groups(&report.clone_groups);
162 let buckets = build_attributed_clone_buckets(report, root, resolver);
163 let mut groups: Vec<DuplicationGroup> = buckets
164 .into_iter()
165 .map(|(key, groups)| duplication_group(key, groups, report, &fingerprints))
166 .collect();
167 sort_duplication_groups(&mut groups);
168
169 DuplicationGrouping {
170 mode: resolver.mode_label(),
171 groups,
172 }
173}
174
175fn build_attributed_clone_buckets(
176 report: &DuplicationReport,
177 root: &Path,
178 resolver: &OwnershipResolver,
179) -> BTreeMap<String, Vec<AttributedCloneGroup>> {
180 let mut buckets: BTreeMap<String, Vec<AttributedCloneGroup>> = BTreeMap::new();
181 for group in &report.clone_groups {
182 let attributed = AttributedCloneGroup::from_group(group, root, resolver);
183 buckets
184 .entry(attributed.primary_owner.clone())
185 .or_default()
186 .push(attributed);
187 }
188 buckets
189}
190
191fn duplication_group(
192 key: String,
193 attributed_groups: Vec<AttributedCloneGroup>,
194 report: &DuplicationReport,
195 fingerprints: &CloneFingerprintSet,
196) -> DuplicationGroup {
197 let mut subset = duplication_subset_report(&attributed_groups, report);
198 subset.stats = recompute_stats(&subset);
199 let clone_families = clone_families_for_bucket(&attributed_groups, report, fingerprints);
200 let clone_groups = attributed_groups
201 .into_iter()
202 .map(|group| {
203 let fingerprint = group.fingerprint(fingerprints);
204 AttributedCloneGroupFinding::with_fingerprint(group, fingerprint)
205 })
206 .collect();
207
208 DuplicationGroup {
209 key,
210 stats: subset.stats,
211 clone_groups,
212 clone_families,
213 }
214}
215
216fn duplication_subset_report(
217 attributed_groups: &[AttributedCloneGroup],
218 report: &DuplicationReport,
219) -> DuplicationReport {
220 DuplicationReport {
221 clone_groups: attributed_groups
222 .iter()
223 .map(|group| CloneGroup {
224 instances: group
225 .instances
226 .iter()
227 .map(|instance| instance.instance.clone())
228 .collect(),
229 token_count: group.token_count,
230 line_count: group.line_count,
231 })
232 .collect(),
233 clone_families: Vec::new(),
234 mirrored_directories: Vec::new(),
235 stats: DuplicationStats {
236 total_files: report.stats.total_files,
237 files_with_clones: 0,
238 total_lines: report.stats.total_lines,
239 duplicated_lines: 0,
240 total_tokens: report.stats.total_tokens,
241 duplicated_tokens: 0,
242 clone_groups: 0,
243 clone_instances: 0,
244 duplication_percentage: 0.0,
245 clone_groups_below_min_occurrences: report.stats.clone_groups_below_min_occurrences,
246 },
247 }
248}
249
250fn clone_families_for_bucket(
251 attributed_groups: &[AttributedCloneGroup],
252 report: &DuplicationReport,
253 fingerprints: &CloneFingerprintSet,
254) -> Vec<CloneFamilyFinding> {
255 let bucket_files: FxHashSet<&Path> = attributed_groups
256 .iter()
257 .flat_map(|group| group.instances.iter().map(|i| i.instance.file.as_path()))
258 .collect();
259
260 report
261 .clone_families
262 .iter()
263 .filter(|family| {
264 family
265 .files
266 .iter()
267 .any(|path| bucket_files.contains(path.as_path()))
268 })
269 .map(|family| CloneFamilyFinding::with_fingerprints(family.clone(), fingerprints))
270 .collect()
271}
272
273fn sort_duplication_groups(groups: &mut [DuplicationGroup]) {
274 groups.sort_by(|a, b| {
275 let a_unowned = a.key == UNOWNED_LABEL;
276 let b_unowned = b.key == UNOWNED_LABEL;
277 match (a_unowned, b_unowned) {
278 (true, false) => std::cmp::Ordering::Greater,
279 (false, true) => std::cmp::Ordering::Less,
280 _ => b
281 .clone_groups
282 .len()
283 .cmp(&a.clone_groups.len())
284 .then_with(|| a.key.cmp(&b.key)),
285 }
286 });
287}
288
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use fallow_core::duplicates::{CloneInstance, DuplicationStats};

    use super::*;
    use crate::codeowners::CodeOwners;

    /// Project root shared by every fixture path in this module.
    const ROOT: &str = "/root";

    /// Creates a clone instance in `path` spanning lines `start` to `end`, with zeroed
    /// columns and an empty fragment.
    fn instance(path: &str, start: usize, end: usize) -> CloneInstance {
        CloneInstance {
            file: PathBuf::from(path),
            start_line: start,
            end_line: end,
            start_col: 0,
            end_col: 0,
            fragment: String::new(),
        }
    }

    /// Wraps `instances` in a clone group with fixed token and line counts.
    fn group(instances: Vec<CloneInstance>) -> CloneGroup {
        CloneGroup {
            instances,
            token_count: 50,
            line_count: 10,
        }
    }

    /// Builds a clone group with one instance (lines 1 to 10) per path.
    fn group_of(paths: &[&str]) -> CloneGroup {
        group(paths.iter().map(|path| instance(path, 1, 10)).collect())
    }

    /// Builds a report around `groups` with 10 total files and 1000 total lines;
    /// every other stat keeps its default value.
    fn report(groups: Vec<CloneGroup>) -> DuplicationReport {
        DuplicationReport {
            clone_groups: groups,
            clone_families: vec![],
            mirrored_directories: vec![],
            stats: DuplicationStats {
                total_files: 10,
                total_lines: 1000,
                ..Default::default()
            },
        }
    }

    /// Runs [`largest_owner`] over `paths` with directory-based ownership.
    fn directory_winner(paths: &[&str]) -> String {
        largest_owner(
            &group_of(paths),
            Path::new(ROOT),
            &OwnershipResolver::Directory,
        )
    }

    /// Groups `duplication` with directory-based ownership.
    fn group_by_directory(duplication: &DuplicationReport) -> DuplicationGrouping {
        build_duplication_grouping(duplication, Path::new(ROOT), &OwnershipResolver::Directory)
    }

    #[test]
    fn largest_owner_majority_wins() {
        let winner = directory_winner(&["/root/src/a.ts", "/root/src/b.ts", "/root/lib/c.ts"]);
        assert_eq!(winner, "src", "src has 2 instances vs lib's 1");
    }

    #[test]
    fn largest_owner_alphabetical_tiebreak() {
        let winner = directory_winner(&["/root/src/a.ts", "/root/lib/b.ts"]);
        assert_eq!(winner, "lib");
    }

    #[test]
    fn largest_owner_three_way_tie_alphabetical() {
        let winner = directory_winner(&["/root/zeta/a.ts", "/root/alpha/b.ts", "/root/beta/c.ts"]);
        assert_eq!(winner, "alpha");
    }

    #[test]
    fn build_grouping_partitions_clone_groups() {
        let duplication = report(vec![
            group_of(&["/root/src/a.ts", "/root/src/b.ts"]),
            group_of(&["/root/lib/x.ts", "/root/lib/y.ts"]),
        ]);
        let grouping = group_by_directory(&duplication);

        assert_eq!(grouping.groups.len(), 2);
        for expected_key in ["lib", "src"] {
            let bucket = grouping
                .groups
                .iter()
                .find(|bucket| bucket.key == expected_key)
                .unwrap();
            assert_eq!(bucket.clone_groups.len(), 1);
        }
    }

    #[test]
    fn build_grouping_unowned_pinned_last() {
        let owners = CodeOwners::parse("/src/ @frontend\n").unwrap();
        let resolver = OwnershipResolver::Owner(owners);
        let duplication = report(vec![
            group_of(&["/root/src/a.ts", "/root/src/b.ts"]),
            group_of(&["/root/docs/a.md", "/root/docs/b.md"]),
        ]);

        let grouping = build_duplication_grouping(&duplication, Path::new(ROOT), &resolver);

        assert_eq!(grouping.groups.len(), 2);
        assert_eq!(grouping.groups.last().unwrap().key, UNOWNED_LABEL);
    }

    #[test]
    fn build_grouping_per_instance_owner_inline() {
        let duplication = report(vec![group_of(&[
            "/root/src/a.ts",
            "/root/src/b.ts",
            "/root/lib/c.ts",
        ])]);
        let grouping = group_by_directory(&duplication);

        // The whole clone group lands in the bucket of its primary owner.
        assert_eq!(grouping.groups.len(), 1);
        let bucket = &grouping.groups[0];
        assert_eq!(bucket.key, "src");
        assert_eq!(bucket.clone_groups.len(), 1);

        // Each instance still reports its own owner.
        let finding = &bucket.clone_groups[0];
        let attributed = &finding.group;
        assert_eq!(attributed.primary_owner, "src");
        assert_eq!(attributed.instances.len(), 3);
        assert!(attributed.instances.iter().any(|entry| entry.owner == "src"));
        assert!(attributed.instances.iter().any(|entry| entry.owner == "lib"));
        assert_eq!(finding.actions.len(), 2);
    }

    #[test]
    fn empty_report_produces_empty_grouping() {
        let grouping = group_by_directory(&DuplicationReport::default());
        assert!(grouping.groups.is_empty());
    }
}