1mod cache;
9pub mod deepdive;
10pub mod detect;
11pub mod families;
12pub mod normalize;
13mod shingle_filter;
14pub mod token_types;
15mod token_visitor;
16pub mod tokenize;
17pub(crate) mod types;
18
19use rustc_hash::FxHashMap;
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
24use rayon::prelude::*;
25use rustc_hash::FxHashSet;
26
27use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
28pub use deepdive::{
29 CloneFingerprintKey, CloneFingerprintSet, FINGERPRINT_PREFIX, clone_fingerprint,
30 dominant_identifier, fingerprint_for_fragment, group_refactoring_suggestion,
31};
32use detect::CloneDetector;
33use normalize::normalize_and_hash_resolved;
34use tokenize::{tokenize_file, tokenize_file_cross_language};
35pub use types::{
36 CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
37 DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
38 RefactoringKind, RefactoringSuggestion,
39};
40
41use crate::discover::{self, DiscoveredFile};
42use crate::suppress::{self, IssueKind, Suppression};
43
/// Glob patterns that duplication analysis skips when
/// `DuplicatesConfig::ignore_defaults` is enabled.
///
/// The position of each pattern in this slice is significant: per-pattern
/// skip counters are indexed by it, so keep the order stable when editing.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
60
61#[derive(Clone)]
62pub(super) struct TokenizedFile {
63 path: PathBuf,
64 hashed_tokens: Vec<normalize::HashedToken>,
65 file_tokens: tokenize::FileTokens,
66 metadata: Option<std::fs::Metadata>,
67 cache_hit: bool,
68 suppressions: Vec<Suppression>,
69}
70
71struct IgnoreSet {
72 all: GlobSet,
73 defaults: Vec<(&'static str, GlobMatcher)>,
74}
75
76impl IgnoreSet {
77 fn is_match(&self, path: &Path) -> bool {
78 self.all.is_match(path)
79 }
80
81 fn default_match_index(&self, path: &Path) -> Option<usize> {
82 self.defaults
83 .iter()
84 .position(|(_, matcher)| matcher.is_match(path))
85 }
86}
87
88struct DuplicationRun {
89 report: DuplicationReport,
90 default_ignore_skips: DefaultIgnoreSkips,
91}
92
93struct DuplicationTokenizeContext<'a> {
94 root: &'a Path,
95 config: &'a DuplicatesConfig,
96 extra_ignores: Option<&'a IgnoreSet>,
97 default_skip_counts: &'a [AtomicUsize],
98 token_cache: Option<&'a TokenCache>,
99 token_cache_mode: TokenCacheMode,
100 normalization: fallow_config::ResolvedNormalization,
101 strip_types: bool,
102 skip_imports: bool,
103}
104
105pub fn find_duplicates(
114 root: &Path,
115 files: &[DiscoveredFile],
116 config: &DuplicatesConfig,
117) -> DuplicationReport {
118 find_duplicates_inner(root, files, config, None, None).report
119}
120
121pub fn find_duplicates_with_default_ignore_skips(
124 root: &Path,
125 files: &[DiscoveredFile],
126 config: &DuplicatesConfig,
127) -> (DuplicationReport, DefaultIgnoreSkips) {
128 let run = find_duplicates_inner(root, files, config, None, None);
129 (run.report, run.default_ignore_skips)
130}
131
132pub fn find_duplicates_cached(
134 root: &Path,
135 files: &[DiscoveredFile],
136 config: &DuplicatesConfig,
137 cache_root: &Path,
138) -> DuplicationReport {
139 find_duplicates_inner(root, files, config, None, Some(cache_root)).report
140}
141
142pub fn find_duplicates_cached_with_default_ignore_skips(
145 root: &Path,
146 files: &[DiscoveredFile],
147 config: &DuplicatesConfig,
148 cache_root: &Path,
149) -> (DuplicationReport, DefaultIgnoreSkips) {
150 let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
151 (run.report, run.default_ignore_skips)
152}
153
154#[expect(
160 clippy::implicit_hasher,
161 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
162)]
163pub fn find_duplicates_touching_files(
164 root: &Path,
165 files: &[DiscoveredFile],
166 config: &DuplicatesConfig,
167 focus_files: &FxHashSet<PathBuf>,
168) -> DuplicationReport {
169 find_duplicates_inner(root, files, config, Some(focus_files), None).report
170}
171
172#[expect(
175 clippy::implicit_hasher,
176 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
177)]
178pub fn find_duplicates_touching_files_with_default_ignore_skips(
179 root: &Path,
180 files: &[DiscoveredFile],
181 config: &DuplicatesConfig,
182 focus_files: &FxHashSet<PathBuf>,
183) -> (DuplicationReport, DefaultIgnoreSkips) {
184 let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
185 (run.report, run.default_ignore_skips)
186}
187
188#[expect(
190 clippy::implicit_hasher,
191 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
192)]
193pub fn find_duplicates_touching_files_cached(
194 root: &Path,
195 files: &[DiscoveredFile],
196 config: &DuplicatesConfig,
197 focus_files: &FxHashSet<PathBuf>,
198 cache_root: &Path,
199) -> DuplicationReport {
200 find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
201}
202
203#[expect(
206 clippy::implicit_hasher,
207 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
208)]
209pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
210 root: &Path,
211 files: &[DiscoveredFile],
212 config: &DuplicatesConfig,
213 focus_files: &FxHashSet<PathBuf>,
214 cache_root: &Path,
215) -> (DuplicationReport, DefaultIgnoreSkips) {
216 let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
217 (run.report, run.default_ignore_skips)
218}
219
220fn find_duplicates_inner(
221 root: &Path,
222 files: &[DiscoveredFile],
223 config: &DuplicatesConfig,
224 focus_files: Option<&FxHashSet<PathBuf>>,
225 cache_root: Option<&Path>,
226) -> DuplicationRun {
227 let _span = tracing::info_span!("find_duplicates").entered();
228
229 let extra_ignores = build_ignore_set(config);
230 let default_skip_counts = extra_ignores
231 .as_ref()
232 .map(|ignores| {
233 std::iter::repeat_with(|| AtomicUsize::new(0))
234 .take(ignores.defaults.len())
235 .collect::<Vec<_>>()
236 })
237 .unwrap_or_default();
238
239 let normalization =
240 fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
241
242 let strip_types = config.cross_language;
243 let skip_imports = config.ignore_imports;
244
245 tracing::debug!(
246 ignore_imports = skip_imports,
247 "duplication tokenization config"
248 );
249
250 let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
251 let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
252 let token_cache = cache_root.map(TokenCache::load);
253
254 let mut file_data = tokenize_duplication_files(
255 files,
256 &DuplicationTokenizeContext {
257 root,
258 config,
259 extra_ignores: extra_ignores.as_ref(),
260 default_skip_counts: &default_skip_counts,
261 token_cache: token_cache.as_ref(),
262 token_cache_mode,
263 normalization,
264 strip_types,
265 skip_imports,
266 },
267 );
268
269 if let (Some(cache_root), Some(cache)) = (cache_root, token_cache) {
270 save_duplication_token_cache(cache_root, cache, files, &file_data, token_cache_mode);
271 }
272
273 tracing::info!(
274 files = file_data.len(),
275 "tokenized files for duplication analysis"
276 );
277
278 if let Some(focus_files) = focus_files
279 && file_data.len() >= config.min_corpus_size_for_shingle_filter
280 {
281 shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
282 }
283
284 let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
285 .iter()
286 .filter(|file| !file.suppressions.is_empty())
287 .map(|file| (file.path.clone(), file.suppressions.clone()))
288 .collect();
289
290 let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
291 file_data
292 .into_iter()
293 .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
294 .collect();
295
296 let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
297 let mut report = if let Some(focus_files) = focus_files {
298 detector.detect_touching_files(detector_data, focus_files)
299 } else {
300 detector.detect(detector_data)
301 };
302
303 if !suppressions_by_file.is_empty() {
304 apply_line_suppressions(&mut report, &suppressions_by_file);
305 }
306
307 apply_min_occurrences_filter(&mut report, config.min_occurrences);
308
309 let default_ignore_skips =
310 build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
311
312 report.clone_families = families::group_into_families(&report.clone_groups, root);
313
314 report.mirrored_directories =
315 families::detect_mirrored_directories(&report.clone_families, root);
316
317 report.sort();
318
319 DuplicationRun {
320 report,
321 default_ignore_skips,
322 }
323}
324
325fn tokenize_duplication_files(
326 files: &[DiscoveredFile],
327 ctx: &DuplicationTokenizeContext<'_>,
328) -> Vec<TokenizedFile> {
329 files
330 .par_iter()
331 .filter_map(|file| tokenize_duplication_file(file, ctx))
332 .collect()
333}
334
335fn tokenize_duplication_file(
336 file: &DiscoveredFile,
337 ctx: &DuplicationTokenizeContext<'_>,
338) -> Option<TokenizedFile> {
339 if should_skip_duplicate_file(file, ctx) {
340 return None;
341 }
342
343 let metadata = std::fs::metadata(&file.path).ok()?;
344 let cached_entry = ctx
345 .token_cache
346 .and_then(|cache| cache.get(&file.path, &metadata, ctx.token_cache_mode));
347 let cache_hit = cached_entry.is_some();
348 let (mut entry, suppressions) = duplication_token_cache_entry(file, ctx, cached_entry)?;
349 if entry.file_tokens.tokens.is_empty() || entry.hashed_tokens.len() < ctx.config.min_tokens {
350 return None;
351 }
352
353 Some(TokenizedFile {
354 path: file.path.clone(),
355 hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
356 file_tokens: entry.file_tokens,
357 metadata: Some(metadata),
358 cache_hit,
359 suppressions,
360 })
361}
362
363fn should_skip_duplicate_file(file: &DiscoveredFile, ctx: &DuplicationTokenizeContext<'_>) -> bool {
364 let relative = file.path.strip_prefix(ctx.root).unwrap_or(&file.path);
365 let Some(ignores) = ctx.extra_ignores else {
366 return false;
367 };
368 if let Some(index) = ignores.default_match_index(relative) {
369 ctx.default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
370 return true;
371 }
372 ignores.is_match(relative)
373}
374
375fn duplication_token_cache_entry(
376 file: &DiscoveredFile,
377 ctx: &DuplicationTokenizeContext<'_>,
378 cached_entry: Option<TokenCacheEntry>,
379) -> Option<(TokenCacheEntry, Vec<Suppression>)> {
380 if let Some(entry) = cached_entry {
381 let suppressions = entry.suppressions.clone();
382 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
383 return None;
384 }
385 return Some((entry, suppressions));
386 }
387
388 let source = std::fs::read_to_string(&file.path).ok()?;
389 let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
390 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
391 return None;
392 }
393 let file_tokens = tokenize_duplication_source(file, ctx, &source);
394 if file_tokens.tokens.is_empty() {
395 return None;
396 }
397 let hashed_tokens = normalize_and_hash_resolved(&file_tokens.tokens, ctx.normalization);
398 Some((
399 TokenCacheEntry {
400 hashed_tokens,
401 file_tokens,
402 suppressions: suppressions.clone(),
403 },
404 suppressions,
405 ))
406}
407
408fn tokenize_duplication_source(
409 file: &DiscoveredFile,
410 ctx: &DuplicationTokenizeContext<'_>,
411 source: &str,
412) -> tokenize::FileTokens {
413 if ctx.strip_types {
414 tokenize_file_cross_language(&file.path, source, true, ctx.skip_imports)
415 } else {
416 tokenize_file(&file.path, source, ctx.skip_imports)
417 }
418}
419
420fn save_duplication_token_cache(
421 cache_root: &Path,
422 mut cache: TokenCache,
423 files: &[DiscoveredFile],
424 file_data: &[TokenizedFile],
425 mode: TokenCacheMode,
426) {
427 for file in file_data {
428 if !file.cache_hit
429 && let Some(metadata) = &file.metadata
430 {
431 cache.insert(
432 &file.path,
433 metadata,
434 mode,
435 &file.hashed_tokens,
436 &file.file_tokens,
437 &file.suppressions,
438 );
439 }
440 }
441 cache.retain_paths(files);
442 match cache.save_if_dirty() {
443 Ok(true) => {
444 tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
445 }
446 Ok(false) => {
447 tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
448 }
449 Err(err) => tracing::warn!("Failed to save duplication token cache: {err}"),
450 }
451}
452
453fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
468 if min <= 2 {
469 return;
470 }
471 let before = report.clone_groups.len();
472 report
473 .clone_groups
474 .retain(|group| group.instances.len() >= min);
475 let hidden = before - report.clone_groups.len();
476 if hidden == 0 {
477 return;
478 }
479 report.stats.clone_groups_below_min_occurrences = hidden;
480 report.stats.clone_groups = report.clone_groups.len();
481 report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
482}
483
484#[expect(
486 clippy::cast_possible_truncation,
487 reason = "line numbers are bounded by source size"
488)]
489fn apply_line_suppressions(
490 report: &mut DuplicationReport,
491 suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
492) {
493 report.clone_groups.retain_mut(|group| {
494 group.instances.retain(|instance| {
495 if let Some(supps) = suppressions_by_file.get(&instance.file) {
496 for line in instance.start_line..=instance.end_line {
497 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
498 return false;
499 }
500 }
501 }
502 true
503 });
504 group.instances.len() >= 2
505 });
506}
507
508#[must_use]
512pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
513 let resolved = crate::default_config(root);
514 let files = discover::discover_files_with_plugin_scopes(&resolved);
515 find_duplicates(root, &files, config)
516}
517
518#[expect(
520 clippy::expect_used,
521 reason = "duplicate ignore globs are validated before clone detection"
522)]
523fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
524 if !config.ignore_defaults && config.ignore.is_empty() {
525 return None;
526 }
527
528 let mut builder = GlobSetBuilder::new();
529 let mut defaults = Vec::new();
530
531 if config.ignore_defaults {
532 for pattern in DUPES_DEFAULT_IGNORES {
533 let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
534 defaults.push((*pattern, glob.compile_matcher()));
535 builder.add(glob);
536 }
537 }
538
539 for pattern in &config.ignore {
540 builder.add(
541 Glob::new(pattern)
542 .expect("duplicates.ignore pattern was validated at config load time"),
543 );
544 }
545
546 builder.build().ok().map(|all| IgnoreSet { all, defaults })
547}
548
549fn build_default_ignore_skips(
550 ignores: Option<&IgnoreSet>,
551 counts: &[AtomicUsize],
552) -> DefaultIgnoreSkips {
553 let Some(ignores) = ignores else {
554 return DefaultIgnoreSkips::default();
555 };
556
557 let by_pattern = ignores
558 .defaults
559 .iter()
560 .zip(counts)
561 .filter_map(|((pattern, _), count)| {
562 let count = count.load(Ordering::Relaxed);
563 (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
564 })
565 .collect::<Vec<_>>();
566 let total = by_pattern.iter().map(|entry| entry.count).sum();
567
568 DefaultIgnoreSkips { total, by_pattern }
569}
570
571#[cfg(test)]
572mod tests {
573 use super::*;
574 use crate::discover::FileId;
575
576 #[test]
577 fn find_duplicates_empty_files() {
578 let config = DuplicatesConfig::default();
579 let report = find_duplicates(Path::new("/tmp"), &[], &config);
580 assert!(report.clone_groups.is_empty());
581 assert!(report.clone_families.is_empty());
582 assert_eq!(report.stats.total_files, 0);
583 }
584
585 #[test]
586 fn build_ignore_set_empty() {
587 let config = DuplicatesConfig {
588 ignore_defaults: false,
589 ..DuplicatesConfig::default()
590 };
591 assert!(build_ignore_set(&config).is_none());
592 }
593
594 #[test]
595 fn build_ignore_set_valid_patterns() {
596 let config = DuplicatesConfig {
597 ignore_defaults: false,
598 ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
599 ..DuplicatesConfig::default()
600 };
601 let set = build_ignore_set(&config);
602 assert!(set.is_some());
603 let set = set.unwrap();
604 assert!(set.is_match(Path::new("src/foo.test.ts")));
605 assert!(set.is_match(Path::new("src/bar.spec.ts")));
606 assert!(!set.is_match(Path::new("src/baz.ts")));
607 }
608
609 #[test]
610 fn build_ignore_set_merges_defaults_with_user_patterns() {
611 let config = DuplicatesConfig {
612 ignore: vec!["**/foo/**".to_string()],
613 ..DuplicatesConfig::default()
614 };
615 let set = build_ignore_set(&config).expect("ignore set");
616 assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
617 assert!(set.is_match(Path::new("src/foo/generated.js")));
618 }
619
620 #[test]
621 fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
622 let config = DuplicatesConfig {
623 ignore_defaults: false,
624 ignore: vec!["**/foo/**".to_string()],
625 ..DuplicatesConfig::default()
626 };
627 let set = build_ignore_set(&config).expect("ignore set");
628 assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
629 assert!(set.is_match(Path::new("src/foo/generated.js")));
630 }
631
632 #[test]
633 fn find_duplicates_with_real_files() {
634 let dir = tempfile::tempdir().expect("create temp dir");
635 let src_dir = dir.path().join("src");
636 std::fs::create_dir_all(&src_dir).expect("create src dir");
637
638 let code = r#"
639export function processData(input: string): string {
640 const trimmed = input.trim();
641 if (trimmed.length === 0) {
642 return "";
643 }
644 const parts = trimmed.split(",");
645 const filtered = parts.filter(p => p.length > 0);
646 const mapped = filtered.map(p => p.toUpperCase());
647 return mapped.join(", ");
648}
649
650export function validateInput(data: string): boolean {
651 if (data === null || data === undefined) {
652 return false;
653 }
654 const cleaned = data.trim();
655 if (cleaned.length < 3) {
656 return false;
657 }
658 return true;
659}
660"#;
661
662 std::fs::write(src_dir.join("original.ts"), code).expect("write original");
663 std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
664 std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
665 .expect("write package.json");
666
667 let files = vec![
668 DiscoveredFile {
669 id: FileId(0),
670 path: src_dir.join("original.ts"),
671 size_bytes: code.len() as u64,
672 },
673 DiscoveredFile {
674 id: FileId(1),
675 path: src_dir.join("copy.ts"),
676 size_bytes: code.len() as u64,
677 },
678 ];
679
680 let config = DuplicatesConfig {
681 min_tokens: 10,
682 min_lines: 2,
683 ..DuplicatesConfig::default()
684 };
685
686 let report = find_duplicates(dir.path(), &files, &config);
687 assert!(
688 !report.clone_groups.is_empty(),
689 "Should detect clones in identical files"
690 );
691 assert!(report.stats.files_with_clones >= 2);
692
693 assert!(
694 !report.clone_families.is_empty(),
695 "Should group clones into families"
696 );
697 }
698
699 #[test]
700 fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
701 let dir = tempfile::tempdir().expect("create temp dir");
702 let src_dir = dir.path().join("src");
703 std::fs::create_dir_all(&src_dir).expect("create src dir");
704
705 let code = "export function same(input: number): number {\n const doubled = input * 2;\n return doubled + 1;\n}\n";
706 let first = src_dir.join("first.ts");
707 let second = src_dir.join("second.ts");
708 std::fs::write(&first, code).expect("write first");
709 std::fs::write(&second, code).expect("write second");
710
711 let files = vec![
712 DiscoveredFile {
713 id: FileId(0),
714 path: first,
715 size_bytes: code.len() as u64,
716 },
717 DiscoveredFile {
718 id: FileId(1),
719 path: second,
720 size_bytes: code.len() as u64,
721 },
722 ];
723 let config = DuplicatesConfig {
724 min_tokens: 5,
725 min_lines: 2,
726 ..DuplicatesConfig::default()
727 };
728 let cache_root = dir.path().join(".fallow");
729
730 let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
731
732 assert!(!report.clone_groups.is_empty());
733 assert!(
734 !cache_root.exists(),
735 "small projects should avoid token-cache IO overhead"
736 );
737 }
738
739 #[test]
740 fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
741 let dir = tempfile::tempdir().expect("create temp dir");
742 let src_dir = dir.path().join("src");
743 std::fs::create_dir_all(&src_dir).expect("create src dir");
744
745 let focused_code = r"
746export function focused(input: number): number {
747 const doubled = input * 2;
748 const shifted = doubled + 10;
749 return shifted / 2;
750}
751";
752 let untouched_code = r#"
753export function untouched(input: string): string {
754 const lowered = input.toLowerCase();
755 const padded = lowered.padStart(10, "x");
756 return padded.slice(0, 8);
757}
758"#;
759
760 let changed_path = src_dir.join("changed.ts");
761 let focused_copy_path = src_dir.join("focused-copy.ts");
762 let untouched_a_path = src_dir.join("untouched-a.ts");
763 let untouched_b_path = src_dir.join("untouched-b.ts");
764 std::fs::write(&changed_path, focused_code).expect("write changed");
765 std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
766 std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
767 std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
768
769 let files = vec![
770 DiscoveredFile {
771 id: FileId(0),
772 path: changed_path.clone(),
773 size_bytes: focused_code.len() as u64,
774 },
775 DiscoveredFile {
776 id: FileId(1),
777 path: focused_copy_path,
778 size_bytes: focused_code.len() as u64,
779 },
780 DiscoveredFile {
781 id: FileId(2),
782 path: untouched_a_path,
783 size_bytes: untouched_code.len() as u64,
784 },
785 DiscoveredFile {
786 id: FileId(3),
787 path: untouched_b_path,
788 size_bytes: untouched_code.len() as u64,
789 },
790 ];
791
792 let config = DuplicatesConfig {
793 mode: DetectionMode::Strict,
794 min_tokens: 5,
795 min_lines: 2,
796 min_corpus_size_for_shingle_filter: 1,
797 ..DuplicatesConfig::default()
798 };
799 let mut focus = FxHashSet::default();
800 focus.insert(changed_path.clone());
801
802 let full_report = find_duplicates(dir.path(), &files, &config);
803 let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
804 let expected_touching = full_report
805 .clone_groups
806 .iter()
807 .filter(|group| {
808 group
809 .instances
810 .iter()
811 .any(|instance| instance.file == changed_path)
812 })
813 .count();
814
815 assert!(
816 !report.clone_groups.is_empty(),
817 "focused file should still match an unchanged duplicate"
818 );
819 assert_eq!(
820 report.clone_groups.len(),
821 expected_touching,
822 "focused shingle filtering must not drop clone groups touching the focused file"
823 );
824 assert!(report.clone_groups.iter().all(|group| {
825 group
826 .instances
827 .iter()
828 .any(|instance| instance.file == changed_path)
829 }));
830 }
831
832 #[test]
833 fn file_wide_suppression_excludes_file() {
834 let dir = tempfile::tempdir().expect("create temp dir");
835 let src_dir = dir.path().join("src");
836 std::fs::create_dir_all(&src_dir).expect("create src dir");
837
838 let code = r#"
839export function processData(input: string): string {
840 const trimmed = input.trim();
841 if (trimmed.length === 0) {
842 return "";
843 }
844 const parts = trimmed.split(",");
845 const filtered = parts.filter(p => p.length > 0);
846 const mapped = filtered.map(p => p.toUpperCase());
847 return mapped.join(", ");
848}
849"#;
850 let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");
851
852 std::fs::write(src_dir.join("original.ts"), code).expect("write original");
853 std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
854 std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
855 .expect("write package.json");
856
857 let files = vec![
858 DiscoveredFile {
859 id: FileId(0),
860 path: src_dir.join("original.ts"),
861 size_bytes: code.len() as u64,
862 },
863 DiscoveredFile {
864 id: FileId(1),
865 path: src_dir.join("suppressed.ts"),
866 size_bytes: suppressed_code.len() as u64,
867 },
868 ];
869
870 let config = DuplicatesConfig {
871 min_tokens: 10,
872 min_lines: 2,
873 ..DuplicatesConfig::default()
874 };
875
876 let report = find_duplicates(dir.path(), &files, &config);
877 assert!(
878 report.clone_groups.is_empty(),
879 "File-wide suppression should exclude file from duplication analysis"
880 );
881 }
882
883 #[test]
884 fn min_occurrences_hides_pairs_and_records_count() {
885 let dir = tempfile::tempdir().expect("create temp dir");
886 let src_dir = dir.path().join("src");
887 std::fs::create_dir_all(&src_dir).expect("create src dir");
888
889 let block_a = r#"
890export function blockA(input: string): string {
891 const trimmed = input.trim();
892 if (trimmed.length === 0) {
893 return "";
894 }
895 const parts = trimmed.split(",");
896 const filtered = parts.filter(p => p.length > 0);
897 const mapped = filtered.map(p => p.toUpperCase());
898 return mapped.join(", ");
899}
900"#;
901 let block_b = r"
902export function blockB(value: number): number {
903 if (value <= 0) {
904 return 0;
905 }
906 let total = 0;
907 for (let i = 1; i <= value; i += 1) {
908 total += i * 2;
909 total -= 1;
910 }
911 return total + 7;
912}
913";
914
915 let pair_a1 = src_dir.join("pair-a1.ts");
916 let pair_a2 = src_dir.join("pair-a2.ts");
917 let triple_b1 = src_dir.join("triple-b1.ts");
918 let triple_b2 = src_dir.join("triple-b2.ts");
919 let triple_b3 = src_dir.join("triple-b3.ts");
920 std::fs::write(&pair_a1, block_a).expect("write");
921 std::fs::write(&pair_a2, block_a).expect("write");
922 std::fs::write(&triple_b1, block_b).expect("write");
923 std::fs::write(&triple_b2, block_b).expect("write");
924 std::fs::write(&triple_b3, block_b).expect("write");
925
926 let files = vec![
927 DiscoveredFile {
928 id: FileId(0),
929 path: pair_a1,
930 size_bytes: block_a.len() as u64,
931 },
932 DiscoveredFile {
933 id: FileId(1),
934 path: pair_a2,
935 size_bytes: block_a.len() as u64,
936 },
937 DiscoveredFile {
938 id: FileId(2),
939 path: triple_b1,
940 size_bytes: block_b.len() as u64,
941 },
942 DiscoveredFile {
943 id: FileId(3),
944 path: triple_b2,
945 size_bytes: block_b.len() as u64,
946 },
947 DiscoveredFile {
948 id: FileId(4),
949 path: triple_b3,
950 size_bytes: block_b.len() as u64,
951 },
952 ];
953
954 let default_config = DuplicatesConfig {
955 min_tokens: 10,
956 min_lines: 2,
957 ..DuplicatesConfig::default()
958 };
959 let baseline = find_duplicates(dir.path(), &files, &default_config);
960 assert_eq!(
961 baseline.clone_groups.len(),
962 2,
963 "default minOccurrences should report both the pair and the triple"
964 );
965 assert_eq!(
966 baseline.stats.clone_groups_below_min_occurrences, 0,
967 "default minOccurrences hides nothing"
968 );
969 let baseline_pct = baseline.stats.duplication_percentage;
970
971 let raised_config = DuplicatesConfig {
972 min_tokens: 10,
973 min_lines: 2,
974 min_occurrences: 3,
975 ..DuplicatesConfig::default()
976 };
977 let report = find_duplicates(dir.path(), &files, &raised_config);
978 assert_eq!(
979 report.clone_groups.len(),
980 1,
981 "minOccurrences=3 should hide the 2-instance group"
982 );
983 assert_eq!(
984 report.clone_groups[0].instances.len(),
985 3,
986 "surviving group must be the 3-instance group"
987 );
988 assert_eq!(
989 report.stats.clone_groups_below_min_occurrences, 1,
990 "the hidden 2-instance group must be counted"
991 );
992 assert_eq!(
993 report.stats.clone_groups, 1,
994 "stats.clone_groups must match the post-filter array length"
995 );
996 assert_eq!(
997 report.stats.clone_instances, 3,
998 "stats.clone_instances must match the surviving instance total"
999 );
1000 assert!(
1001 (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
1002 "duplication_percentage should not shift when minOccurrences changes"
1003 );
1004 }
1005
1006 #[test]
1007 fn min_occurrences_evaluates_after_line_suppressions() {
1008 let dir = tempfile::tempdir().expect("create temp dir");
1009 let src_dir = dir.path().join("src");
1010 std::fs::create_dir_all(&src_dir).expect("create src dir");
1011
1012 let block = r#"
1013export function shared(input: string): string {
1014 const trimmed = input.trim();
1015 if (trimmed.length === 0) {
1016 return "";
1017 }
1018 const parts = trimmed.split(",");
1019 const filtered = parts.filter(p => p.length > 0);
1020 const mapped = filtered.map(p => p.toUpperCase());
1021 return mapped.join(", ");
1022}
1023"#;
1024 let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");
1025
1026 let a = src_dir.join("a.ts");
1027 let b = src_dir.join("b.ts");
1028 let c = src_dir.join("c.ts");
1029 std::fs::write(&a, block).expect("write a");
1030 std::fs::write(&b, block).expect("write b");
1031 std::fs::write(&c, &suppressed).expect("write c");
1032
1033 let files = vec![
1034 DiscoveredFile {
1035 id: FileId(0),
1036 path: a,
1037 size_bytes: block.len() as u64,
1038 },
1039 DiscoveredFile {
1040 id: FileId(1),
1041 path: b,
1042 size_bytes: block.len() as u64,
1043 },
1044 DiscoveredFile {
1045 id: FileId(2),
1046 path: c,
1047 size_bytes: suppressed.len() as u64,
1048 },
1049 ];
1050
1051 let config = DuplicatesConfig {
1052 min_tokens: 10,
1053 min_lines: 2,
1054 min_occurrences: 3,
1055 ..DuplicatesConfig::default()
1056 };
1057 let report = find_duplicates(dir.path(), &files, &config);
1058 assert!(
1059 report.clone_groups.is_empty(),
1060 "post-suppression 2-instance group must be hidden by minOccurrences=3, \
1061 got groups: {:?}",
1062 report
1063 .clone_groups
1064 .iter()
1065 .map(|g| g.instances.len())
1066 .collect::<Vec<_>>()
1067 );
1068 assert_eq!(
1069 report.stats.clone_groups, 0,
1070 "stats.clone_groups must match the empty post-filter array"
1071 );
1072 assert_eq!(
1073 report.stats.clone_instances, 0,
1074 "stats.clone_instances must match the empty post-filter array"
1075 );
1076 }
1077}