1mod cache;
9pub mod deepdive;
10pub mod detect;
11pub mod families;
12pub mod normalize;
13mod shingle_filter;
14pub mod token_types;
15mod token_visitor;
16pub mod tokenize;
17pub(crate) mod types;
18
19use rustc_hash::FxHashMap;
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use globset::{Glob, GlobSet, GlobSetBuilder};
24use rayon::prelude::*;
25use rustc_hash::FxHashSet;
26
27use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
28pub use deepdive::{
29 CloneFingerprintKey, CloneFingerprintSet, FINGERPRINT_PREFIX, clone_fingerprint,
30 dominant_identifier, fingerprint_for_fragment, group_refactoring_suggestion,
31};
32use detect::CloneDetector;
33use normalize::normalize_and_hash_resolved;
34use tokenize::{tokenize_file, tokenize_file_cross_language};
35pub use types::{
36 CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
37 DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
38 RefactoringKind, RefactoringSuggestion,
39};
40
41use crate::discover::{self, DiscoveredFile};
42use crate::suppress::{self, IssueKind, Suppression};
43
/// Glob patterns that duplication analysis ignores by default.
///
/// `build_ignore_set` registers these ahead of any user-supplied
/// `DuplicatesConfig::ignore` patterns whenever
/// `DuplicatesConfig::ignore_defaults` is `true`, so a matching glob index
/// below this list's length identifies a default pattern.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    // Directories that, judging by their names, hold framework/bundler
    // output or caches (presumably generated code — confirm before extending).
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
    // Test, spec and mock files.
    "**/*.test.*",
    "**/*.spec.*",
    "**/__tests__/**",
    "**/__mocks__/**",
];
64
/// Per-file tokenization result handed from the tokenize stage to clone
/// detection and to the token-cache writer.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the file, copied from the `DiscoveredFile` it was built from.
    path: PathBuf,
    /// Hashed tokens, either produced by `normalize_and_hash_resolved` or
    /// taken from a token-cache entry.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Tokenizer output for the file (its `tokens` and `line_count` are read
    /// in this module).
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata read before tokenizing; required to insert the
    /// file into the token cache.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the entry was served by the token cache, in which case it
    /// is not re-inserted on save.
    cache_hit: bool,
    /// Suppressions parsed from the source (or stored alongside the cached
    /// entry); used later for line-level filtering of clone instances.
    suppressions: Vec<Suppression>,
}
74
/// Compiled ignore globs for duplication analysis.
struct IgnoreSet {
    /// All patterns in one set: default patterns first (when enabled),
    /// followed by the user's `ignore` patterns.
    all: GlobSet,
    /// The default patterns that were added to `all`, in insertion order, so
    /// that a glob index below `defaults.len()` maps back to its pattern.
    defaults: Vec<&'static str>,
}
79
/// Which kind of ignore pattern caused a path to be skipped.
enum IgnoreMatch {
    /// A default pattern matched; the payload is its index into
    /// `IgnoreSet::defaults`.
    Default(usize),
    /// Only user-configured patterns matched.
    User,
}
84
85impl IgnoreSet {
86 fn match_path(&self, path: &Path, matches: &mut Vec<usize>) -> Option<IgnoreMatch> {
87 self.all.matches_into(path, matches);
88 let first = matches.first().copied()?;
89 if first < self.defaults.len() {
90 Some(IgnoreMatch::Default(first))
91 } else {
92 Some(IgnoreMatch::User)
93 }
94 }
95}
96
/// Everything a single duplication run produces.
struct DuplicationRun {
    /// The post-processed duplication report.
    report: DuplicationReport,
    /// How many files were skipped because of default ignore patterns.
    default_ignore_skips: DefaultIgnoreSkips,
}
101
/// Read-only state shared by all per-file tokenization tasks.
struct DuplicationTokenizeContext<'a> {
    /// Project root; file paths are made relative to it before glob matching.
    root: &'a Path,
    /// Duplication settings (only `min_tokens` is read during tokenization here).
    config: &'a DuplicatesConfig,
    /// Ignore globs, or `None` when no ignore pattern is active.
    extra_ignores: Option<&'a IgnoreSet>,
    /// One counter per default ignore pattern, incremented on each skip.
    default_skip_counts: &'a [AtomicUsize],
    /// Loaded token cache, if caching is enabled for this run.
    token_cache: Option<&'a TokenCache>,
    /// Cache mode passed to cache lookups.
    token_cache_mode: TokenCacheMode,
    /// Resolved normalization settings used when hashing tokens.
    normalization: fallow_config::ResolvedNormalization,
    /// When `true`, files are tokenized with `tokenize_file_cross_language`.
    strip_types: bool,
    /// Forwarded to the tokenizer as its import-skipping flag.
    skip_imports: bool,
}
113
114pub fn find_duplicates(
123 root: &Path,
124 files: &[DiscoveredFile],
125 config: &DuplicatesConfig,
126) -> DuplicationReport {
127 find_duplicates_inner(root, files, config, None, None).report
128}
129
130pub fn find_duplicates_with_default_ignore_skips(
133 root: &Path,
134 files: &[DiscoveredFile],
135 config: &DuplicatesConfig,
136) -> (DuplicationReport, DefaultIgnoreSkips) {
137 let run = find_duplicates_inner(root, files, config, None, None);
138 (run.report, run.default_ignore_skips)
139}
140
141pub fn find_duplicates_cached(
143 root: &Path,
144 files: &[DiscoveredFile],
145 config: &DuplicatesConfig,
146 cache_root: &Path,
147) -> DuplicationReport {
148 find_duplicates_inner(root, files, config, None, Some(cache_root)).report
149}
150
151pub fn find_duplicates_cached_with_default_ignore_skips(
154 root: &Path,
155 files: &[DiscoveredFile],
156 config: &DuplicatesConfig,
157 cache_root: &Path,
158) -> (DuplicationReport, DefaultIgnoreSkips) {
159 let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
160 (run.report, run.default_ignore_skips)
161}
162
163#[expect(
169 clippy::implicit_hasher,
170 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
171)]
172pub fn find_duplicates_touching_files(
173 root: &Path,
174 files: &[DiscoveredFile],
175 config: &DuplicatesConfig,
176 focus_files: &FxHashSet<PathBuf>,
177) -> DuplicationReport {
178 find_duplicates_inner(root, files, config, Some(focus_files), None).report
179}
180
181#[expect(
184 clippy::implicit_hasher,
185 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
186)]
187pub fn find_duplicates_touching_files_with_default_ignore_skips(
188 root: &Path,
189 files: &[DiscoveredFile],
190 config: &DuplicatesConfig,
191 focus_files: &FxHashSet<PathBuf>,
192) -> (DuplicationReport, DefaultIgnoreSkips) {
193 let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
194 (run.report, run.default_ignore_skips)
195}
196
197#[expect(
199 clippy::implicit_hasher,
200 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
201)]
202pub fn find_duplicates_touching_files_cached(
203 root: &Path,
204 files: &[DiscoveredFile],
205 config: &DuplicatesConfig,
206 focus_files: &FxHashSet<PathBuf>,
207 cache_root: &Path,
208) -> DuplicationReport {
209 find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
210}
211
212#[expect(
215 clippy::implicit_hasher,
216 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
217)]
218pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
219 root: &Path,
220 files: &[DiscoveredFile],
221 config: &DuplicatesConfig,
222 focus_files: &FxHashSet<PathBuf>,
223 cache_root: &Path,
224) -> (DuplicationReport, DefaultIgnoreSkips) {
225 let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
226 (run.report, run.default_ignore_skips)
227}
228
229fn tokenize_corpus_for_duplicates(
233 root: &Path,
234 files: &[DiscoveredFile],
235 config: &DuplicatesConfig,
236 extra_ignores: Option<&IgnoreSet>,
237 default_skip_counts: &[AtomicUsize],
238 cache_root: Option<&Path>,
239) -> (Vec<TokenizedFile>, detect::CorpusTotals) {
240 let normalization =
241 fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
242
243 let strip_types = config.cross_language;
244 let skip_imports = config.ignore_imports;
245
246 tracing::debug!(
247 ignore_imports = skip_imports,
248 "duplication tokenization config"
249 );
250
251 let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
252 let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
253 let token_cache = cache_root.map(TokenCache::load);
254
255 let file_data = tokenize_duplication_files(
256 files,
257 &DuplicationTokenizeContext {
258 root,
259 config,
260 extra_ignores,
261 default_skip_counts,
262 token_cache: token_cache.as_ref(),
263 token_cache_mode,
264 normalization,
265 strip_types,
266 skip_imports,
267 },
268 );
269
270 if let (Some(cache_root), Some(cache)) = (cache_root, token_cache) {
271 save_duplication_token_cache(cache_root, cache, files, &file_data, token_cache_mode);
272 }
273
274 tracing::info!(
275 files = file_data.len(),
276 "tokenized files for duplication analysis"
277 );
278
279 let corpus_totals = detect::CorpusTotals {
280 files: file_data.len(),
281 lines: file_data
282 .iter()
283 .map(|file| file.file_tokens.line_count)
284 .sum(),
285 tokens: file_data.iter().map(|file| file.hashed_tokens.len()).sum(),
286 };
287 (file_data, corpus_totals)
288}
289
290fn detect_and_postprocess(
294 root: &Path,
295 config: &DuplicatesConfig,
296 mut file_data: Vec<TokenizedFile>,
297 corpus_totals: detect::CorpusTotals,
298 focus_files: Option<&FxHashSet<PathBuf>>,
299) -> DuplicationReport {
300 if file_data.len() >= config.min_corpus_size_for_shingle_filter {
301 if let Some(focus_files) = focus_files {
302 shingle_filter::filter_to_focus_candidates(
303 &mut file_data,
304 focus_files,
305 config.min_tokens,
306 );
307 } else {
308 shingle_filter::filter_to_duplicate_candidates(&mut file_data, config.min_tokens);
309 }
310 }
311
312 let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
313 .iter()
314 .filter(|file| !file.suppressions.is_empty())
315 .map(|file| (file.path.clone(), file.suppressions.clone()))
316 .collect();
317
318 let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
319 file_data
320 .into_iter()
321 .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
322 .collect();
323
324 let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
325 let mut report = if let Some(focus_files) = focus_files {
326 detector.detect_touching_files(detector_data, focus_files)
327 } else {
328 detector.detect_with_totals(detector_data, corpus_totals)
329 };
330
331 if !suppressions_by_file.is_empty() {
332 apply_line_suppressions(&mut report, &suppressions_by_file);
333 }
334
335 apply_min_occurrences_filter(&mut report, config.min_occurrences);
336
337 report.clone_families = families::group_into_families(&report.clone_groups, root);
338 report.mirrored_directories =
339 families::detect_mirrored_directories(&report.clone_families, root);
340 report.sort();
341 report
342}
343
344fn find_duplicates_inner(
345 root: &Path,
346 files: &[DiscoveredFile],
347 config: &DuplicatesConfig,
348 focus_files: Option<&FxHashSet<PathBuf>>,
349 cache_root: Option<&Path>,
350) -> DuplicationRun {
351 let _span = tracing::info_span!("find_duplicates").entered();
352
353 let extra_ignores = build_ignore_set(config);
354 let default_skip_counts = extra_ignores
355 .as_ref()
356 .map(|ignores| {
357 std::iter::repeat_with(|| AtomicUsize::new(0))
358 .take(ignores.defaults.len())
359 .collect::<Vec<_>>()
360 })
361 .unwrap_or_default();
362
363 let (file_data, corpus_totals) = tokenize_corpus_for_duplicates(
364 root,
365 files,
366 config,
367 extra_ignores.as_ref(),
368 &default_skip_counts,
369 cache_root,
370 );
371
372 let report = detect_and_postprocess(root, config, file_data, corpus_totals, focus_files);
373
374 let default_ignore_skips =
375 build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
376
377 DuplicationRun {
378 report,
379 default_ignore_skips,
380 }
381}
382
383fn tokenize_duplication_files(
384 files: &[DiscoveredFile],
385 ctx: &DuplicationTokenizeContext<'_>,
386) -> Vec<TokenizedFile> {
387 files
388 .par_iter()
389 .filter_map(|file| tokenize_duplication_file(file, ctx))
390 .collect()
391}
392
393fn tokenize_duplication_file(
394 file: &DiscoveredFile,
395 ctx: &DuplicationTokenizeContext<'_>,
396) -> Option<TokenizedFile> {
397 if should_skip_duplicate_file(file, ctx) {
398 return None;
399 }
400
401 let metadata = std::fs::metadata(&file.path).ok()?;
402 let cached_entry = ctx
403 .token_cache
404 .and_then(|cache| cache.get(&file.path, &metadata, ctx.token_cache_mode));
405 let cache_hit = cached_entry.is_some();
406 let (mut entry, suppressions) = duplication_token_cache_entry(file, ctx, cached_entry)?;
407 if entry.file_tokens.tokens.is_empty() || entry.hashed_tokens.len() < ctx.config.min_tokens {
408 return None;
409 }
410
411 Some(TokenizedFile {
412 path: file.path.clone(),
413 hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
414 file_tokens: entry.file_tokens,
415 metadata: Some(metadata),
416 cache_hit,
417 suppressions,
418 })
419}
420
421fn should_skip_duplicate_file(file: &DiscoveredFile, ctx: &DuplicationTokenizeContext<'_>) -> bool {
422 let relative = file.path.strip_prefix(ctx.root).unwrap_or(&file.path);
423 let Some(ignores) = ctx.extra_ignores else {
424 return false;
425 };
426 let mut matches = Vec::new();
427 match ignores.match_path(relative, &mut matches) {
428 Some(IgnoreMatch::Default(index)) => {
429 ctx.default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
430 true
431 }
432 Some(IgnoreMatch::User) => true,
433 None => false,
434 }
435}
436
437fn duplication_token_cache_entry(
438 file: &DiscoveredFile,
439 ctx: &DuplicationTokenizeContext<'_>,
440 cached_entry: Option<TokenCacheEntry>,
441) -> Option<(TokenCacheEntry, Vec<Suppression>)> {
442 if let Some(entry) = cached_entry {
443 let suppressions = entry.suppressions.clone();
444 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
445 return None;
446 }
447 return Some((entry, suppressions));
448 }
449
450 let source = std::fs::read_to_string(&file.path).ok()?;
451 let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
452 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
453 return None;
454 }
455 let file_tokens = tokenize_duplication_source(file, ctx, &source);
456 if file_tokens.tokens.is_empty() {
457 return None;
458 }
459 let hashed_tokens = normalize_and_hash_resolved(&file_tokens.tokens, ctx.normalization);
460 Some((
461 TokenCacheEntry {
462 hashed_tokens,
463 file_tokens,
464 suppressions: suppressions.clone(),
465 },
466 suppressions,
467 ))
468}
469
470fn tokenize_duplication_source(
471 file: &DiscoveredFile,
472 ctx: &DuplicationTokenizeContext<'_>,
473 source: &str,
474) -> tokenize::FileTokens {
475 if ctx.strip_types {
476 tokenize_file_cross_language(&file.path, source, true, ctx.skip_imports)
477 } else {
478 tokenize_file(&file.path, source, ctx.skip_imports)
479 }
480}
481
482fn save_duplication_token_cache(
483 cache_root: &Path,
484 mut cache: TokenCache,
485 files: &[DiscoveredFile],
486 file_data: &[TokenizedFile],
487 mode: TokenCacheMode,
488) {
489 for file in file_data {
490 if !file.cache_hit
491 && let Some(metadata) = &file.metadata
492 {
493 cache.insert(
494 &file.path,
495 metadata,
496 mode,
497 &cache::TokenPayload {
498 hashed_tokens: &file.hashed_tokens,
499 file_tokens: &file.file_tokens,
500 suppressions: &file.suppressions,
501 },
502 );
503 }
504 }
505 cache.retain_paths(files);
506 match cache.save_if_dirty() {
507 Ok(true) => {
508 tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
509 }
510 Ok(false) => {
511 tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
512 }
513 Err(err) => tracing::warn!("Failed to save duplication token cache: {err}"),
514 }
515}
516
517fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
532 if min <= 2 {
533 return;
534 }
535 let before = report.clone_groups.len();
536 report
537 .clone_groups
538 .retain(|group| group.instances.len() >= min);
539 let hidden = before - report.clone_groups.len();
540 if hidden == 0 {
541 return;
542 }
543 report.stats.clone_groups_below_min_occurrences = hidden;
544 report.stats.clone_groups = report.clone_groups.len();
545 report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
546}
547
548#[expect(
550 clippy::cast_possible_truncation,
551 reason = "line numbers are bounded by source size"
552)]
553fn apply_line_suppressions(
554 report: &mut DuplicationReport,
555 suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
556) {
557 report.clone_groups.retain_mut(|group| {
558 group.instances.retain(|instance| {
559 if let Some(supps) = suppressions_by_file.get(&instance.file) {
560 for line in instance.start_line..=instance.end_line {
561 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
562 return false;
563 }
564 }
565 }
566 true
567 });
568 group.instances.len() >= 2
569 });
570}
571
572#[must_use]
576pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
577 let resolved = crate::default_config(root);
578 let files = discover::discover_files_with_plugin_scopes(&resolved);
579 find_duplicates(root, &files, config)
580}
581
582#[expect(
584 clippy::expect_used,
585 reason = "duplicate ignore globs are validated before clone detection"
586)]
587fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
588 if !config.ignore_defaults && config.ignore.is_empty() {
589 return None;
590 }
591
592 let mut builder = GlobSetBuilder::new();
593 let mut defaults = Vec::new();
594
595 if config.ignore_defaults {
596 for pattern in DUPES_DEFAULT_IGNORES {
597 let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
598 defaults.push(*pattern);
599 builder.add(glob);
600 }
601 }
602
603 for pattern in &config.ignore {
604 builder.add(
605 Glob::new(pattern)
606 .expect("duplicates.ignore pattern was validated at config load time"),
607 );
608 }
609
610 builder.build().ok().map(|all| IgnoreSet { all, defaults })
611}
612
613fn build_default_ignore_skips(
614 ignores: Option<&IgnoreSet>,
615 counts: &[AtomicUsize],
616) -> DefaultIgnoreSkips {
617 let Some(ignores) = ignores else {
618 return DefaultIgnoreSkips::default();
619 };
620
621 let by_pattern = ignores
622 .defaults
623 .iter()
624 .zip(counts)
625 .filter_map(|(pattern, count)| {
626 let count = count.load(Ordering::Relaxed);
627 (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
628 })
629 .collect::<Vec<_>>();
630 let total = by_pattern.iter().map(|entry| entry.count).sum();
631
632 DefaultIgnoreSkips { total, by_pattern }
633}
634
635#[cfg(test)]
636mod tests {
637 use super::*;
638 use crate::discover::FileId;
639
640 fn ignore_set_matches(set: &IgnoreSet, path: &str) -> bool {
641 let mut matches = Vec::new();
642 set.match_path(Path::new(path), &mut matches).is_some()
643 }
644
645 #[test]
646 fn find_duplicates_empty_files() {
647 let config = DuplicatesConfig::default();
648 let report = find_duplicates(Path::new("/tmp"), &[], &config);
649 assert!(report.clone_groups.is_empty());
650 assert!(report.clone_families.is_empty());
651 assert_eq!(report.stats.total_files, 0);
652 }
653
654 #[test]
655 fn build_ignore_set_empty() {
656 let config = DuplicatesConfig {
657 ignore_defaults: false,
658 ..DuplicatesConfig::default()
659 };
660 assert!(build_ignore_set(&config).is_none());
661 }
662
663 #[test]
664 fn build_ignore_set_valid_patterns() {
665 let config = DuplicatesConfig {
666 ignore_defaults: false,
667 ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
668 ..DuplicatesConfig::default()
669 };
670 let set = build_ignore_set(&config);
671 assert!(set.is_some());
672 let set = set.unwrap();
673 assert!(ignore_set_matches(&set, "src/foo.test.ts"));
674 assert!(ignore_set_matches(&set, "src/bar.spec.ts"));
675 assert!(!ignore_set_matches(&set, "src/baz.ts"));
676 }
677
678 #[test]
679 fn build_ignore_set_merges_defaults_with_user_patterns() {
680 let config = DuplicatesConfig {
681 ignore: vec!["**/foo/**".to_string()],
682 ..DuplicatesConfig::default()
683 };
684 let set = build_ignore_set(&config).expect("ignore set");
685 assert!(ignore_set_matches(&set, ".next/static/chunks/app.js"));
686 assert!(ignore_set_matches(&set, "src/foo.test.ts"));
687 assert!(ignore_set_matches(&set, "src/foo.spec.tsx"));
688 assert!(ignore_set_matches(&set, "src/__tests__/foo.ts"));
689 assert!(ignore_set_matches(&set, "src/__mocks__/foo.ts"));
690 assert!(ignore_set_matches(&set, "src/foo/generated.js"));
691 }
692
693 #[test]
694 fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
695 let config = DuplicatesConfig {
696 ignore_defaults: false,
697 ignore: vec!["**/foo/**".to_string()],
698 ..DuplicatesConfig::default()
699 };
700 let set = build_ignore_set(&config).expect("ignore set");
701 assert!(!ignore_set_matches(&set, ".next/static/chunks/app.js"));
702 assert!(!ignore_set_matches(&set, "src/foo.test.ts"));
703 assert!(!ignore_set_matches(&set, "src/foo.spec.tsx"));
704 assert!(!ignore_set_matches(&set, "src/__tests__/foo.ts"));
705 assert!(!ignore_set_matches(&set, "src/__mocks__/foo.ts"));
706 assert!(ignore_set_matches(&set, "src/foo/generated.js"));
707 }
708
709 #[test]
710 fn default_ignores_skip_duplicate_test_files() {
711 let dir = tempfile::tempdir().expect("create temp dir");
712 let tests_dir = dir.path().join("src").join("__tests__");
713 std::fs::create_dir_all(&tests_dir).expect("create tests dir");
714
715 let code = r#"
716export function repeatedTestHelper(input: string): string {
717 const trimmed = input.trim();
718 const lowered = trimmed.toLowerCase();
719 const compact = lowered.replaceAll(" ", "-");
720 return compact;
721}
722"#;
723 let first = tests_dir.join("first.test.ts");
724 let second = tests_dir.join("second.test.ts");
725 std::fs::write(&first, code).expect("write first");
726 std::fs::write(&second, code).expect("write second");
727
728 let files = vec![
729 DiscoveredFile {
730 id: FileId(0),
731 path: first,
732 size_bytes: code.len() as u64,
733 },
734 DiscoveredFile {
735 id: FileId(1),
736 path: second,
737 size_bytes: code.len() as u64,
738 },
739 ];
740 let config = DuplicatesConfig {
741 min_tokens: 5,
742 min_lines: 2,
743 ..DuplicatesConfig::default()
744 };
745
746 let (report, skips) =
747 find_duplicates_with_default_ignore_skips(dir.path(), &files, &config);
748
749 assert!(report.clone_groups.is_empty());
750 assert_eq!(skips.total, 2);
751 }
752
753 #[test]
754 fn ignore_defaults_false_restores_duplicate_test_files() {
755 let dir = tempfile::tempdir().expect("create temp dir");
756 let tests_dir = dir.path().join("src").join("__tests__");
757 std::fs::create_dir_all(&tests_dir).expect("create tests dir");
758
759 let code = r#"
760export function repeatedTestHelper(input: string): string {
761 const trimmed = input.trim();
762 const lowered = trimmed.toLowerCase();
763 const compact = lowered.replaceAll(" ", "-");
764 return compact;
765}
766"#;
767 let first = tests_dir.join("first.test.ts");
768 let second = tests_dir.join("second.test.ts");
769 std::fs::write(&first, code).expect("write first");
770 std::fs::write(&second, code).expect("write second");
771
772 let files = vec![
773 DiscoveredFile {
774 id: FileId(0),
775 path: first,
776 size_bytes: code.len() as u64,
777 },
778 DiscoveredFile {
779 id: FileId(1),
780 path: second,
781 size_bytes: code.len() as u64,
782 },
783 ];
784 let config = DuplicatesConfig {
785 min_tokens: 5,
786 min_lines: 2,
787 ignore_defaults: false,
788 ..DuplicatesConfig::default()
789 };
790
791 let report = find_duplicates(dir.path(), &files, &config);
792
793 assert!(!report.clone_groups.is_empty());
794 }
795
796 #[test]
797 fn find_duplicates_with_real_files() {
798 let dir = tempfile::tempdir().expect("create temp dir");
799 let src_dir = dir.path().join("src");
800 std::fs::create_dir_all(&src_dir).expect("create src dir");
801
802 let code = r#"
803export function processData(input: string): string {
804 const trimmed = input.trim();
805 if (trimmed.length === 0) {
806 return "";
807 }
808 const parts = trimmed.split(",");
809 const filtered = parts.filter(p => p.length > 0);
810 const mapped = filtered.map(p => p.toUpperCase());
811 return mapped.join(", ");
812}
813
814export function validateInput(data: string): boolean {
815 if (data === null || data === undefined) {
816 return false;
817 }
818 const cleaned = data.trim();
819 if (cleaned.length < 3) {
820 return false;
821 }
822 return true;
823}
824"#;
825
826 std::fs::write(src_dir.join("original.ts"), code).expect("write original");
827 std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
828 std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
829 .expect("write package.json");
830
831 let files = vec![
832 DiscoveredFile {
833 id: FileId(0),
834 path: src_dir.join("original.ts"),
835 size_bytes: code.len() as u64,
836 },
837 DiscoveredFile {
838 id: FileId(1),
839 path: src_dir.join("copy.ts"),
840 size_bytes: code.len() as u64,
841 },
842 ];
843
844 let config = DuplicatesConfig {
845 min_tokens: 10,
846 min_lines: 2,
847 ..DuplicatesConfig::default()
848 };
849
850 let report = find_duplicates(dir.path(), &files, &config);
851 assert!(
852 !report.clone_groups.is_empty(),
853 "Should detect clones in identical files"
854 );
855 assert!(report.stats.files_with_clones >= 2);
856
857 assert!(
858 !report.clone_families.is_empty(),
859 "Should group clones into families"
860 );
861 }
862
863 #[test]
864 fn global_shingle_prefilter_preserves_corpus_totals() {
865 let dir = tempfile::tempdir().expect("create temp dir");
866 let src_dir = dir.path().join("src");
867 std::fs::create_dir_all(&src_dir).expect("create src dir");
868
869 let duplicated = r#"
870export function normalizeUser(input: string): string {
871 const trimmed = input.trim();
872 const lowered = trimmed.toLowerCase();
873 const compact = lowered.replaceAll(" ", "-");
874 return compact;
875}
876"#;
877 let unique = r#"
878export function renderInvoice(id: string): string {
879 const prefix = "invoice";
880 const suffix = id.padStart(6, "0");
881 return `${prefix}:${suffix}`;
882}
883"#;
884
885 let original_path = src_dir.join("original.ts");
886 let copy_path = src_dir.join("copy.ts");
887 let unique_path = src_dir.join("unique.ts");
888 std::fs::write(&original_path, duplicated).expect("write original");
889 std::fs::write(©_path, duplicated).expect("write copy");
890 std::fs::write(&unique_path, unique).expect("write unique");
891
892 let files = vec![
893 DiscoveredFile {
894 id: FileId(0),
895 path: original_path,
896 size_bytes: duplicated.len() as u64,
897 },
898 DiscoveredFile {
899 id: FileId(1),
900 path: copy_path,
901 size_bytes: duplicated.len() as u64,
902 },
903 DiscoveredFile {
904 id: FileId(2),
905 path: unique_path,
906 size_bytes: unique.len() as u64,
907 },
908 ];
909 let config = DuplicatesConfig {
910 min_tokens: 5,
911 min_lines: 2,
912 min_corpus_size_for_shingle_filter: 1,
913 ..DuplicatesConfig::default()
914 };
915
916 let report = find_duplicates(dir.path(), &files, &config);
917
918 assert!(!report.clone_groups.is_empty());
919 assert_eq!(report.stats.total_files, 3);
920 assert!(report.stats.total_tokens > report.stats.duplicated_tokens);
921 }
922
923 #[test]
924 fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
925 let dir = tempfile::tempdir().expect("create temp dir");
926 let src_dir = dir.path().join("src");
927 std::fs::create_dir_all(&src_dir).expect("create src dir");
928
929 let code = "export function same(input: number): number {\n const doubled = input * 2;\n return doubled + 1;\n}\n";
930 let first = src_dir.join("first.ts");
931 let second = src_dir.join("second.ts");
932 std::fs::write(&first, code).expect("write first");
933 std::fs::write(&second, code).expect("write second");
934
935 let files = vec![
936 DiscoveredFile {
937 id: FileId(0),
938 path: first,
939 size_bytes: code.len() as u64,
940 },
941 DiscoveredFile {
942 id: FileId(1),
943 path: second,
944 size_bytes: code.len() as u64,
945 },
946 ];
947 let config = DuplicatesConfig {
948 min_tokens: 5,
949 min_lines: 2,
950 ..DuplicatesConfig::default()
951 };
952 let cache_root = dir.path().join(".fallow");
953
954 let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
955
956 assert!(!report.clone_groups.is_empty());
957 assert!(
958 !cache_root.exists(),
959 "small projects should avoid token-cache IO overhead"
960 );
961 }
962
963 #[test]
964 fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
965 let dir = tempfile::tempdir().expect("create temp dir");
966 let src_dir = dir.path().join("src");
967 std::fs::create_dir_all(&src_dir).expect("create src dir");
968
969 let focused_code = r"
970export function focused(input: number): number {
971 const doubled = input * 2;
972 const shifted = doubled + 10;
973 return shifted / 2;
974}
975";
976 let untouched_code = r#"
977export function untouched(input: string): string {
978 const lowered = input.toLowerCase();
979 const padded = lowered.padStart(10, "x");
980 return padded.slice(0, 8);
981}
982"#;
983
984 let changed_path = src_dir.join("changed.ts");
985 let focused_copy_path = src_dir.join("focused-copy.ts");
986 let untouched_a_path = src_dir.join("untouched-a.ts");
987 let untouched_b_path = src_dir.join("untouched-b.ts");
988 std::fs::write(&changed_path, focused_code).expect("write changed");
989 std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
990 std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
991 std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
992
993 let files = vec![
994 DiscoveredFile {
995 id: FileId(0),
996 path: changed_path.clone(),
997 size_bytes: focused_code.len() as u64,
998 },
999 DiscoveredFile {
1000 id: FileId(1),
1001 path: focused_copy_path,
1002 size_bytes: focused_code.len() as u64,
1003 },
1004 DiscoveredFile {
1005 id: FileId(2),
1006 path: untouched_a_path,
1007 size_bytes: untouched_code.len() as u64,
1008 },
1009 DiscoveredFile {
1010 id: FileId(3),
1011 path: untouched_b_path,
1012 size_bytes: untouched_code.len() as u64,
1013 },
1014 ];
1015
1016 let config = DuplicatesConfig {
1017 mode: DetectionMode::Strict,
1018 min_tokens: 5,
1019 min_lines: 2,
1020 min_corpus_size_for_shingle_filter: 1,
1021 ..DuplicatesConfig::default()
1022 };
1023 let mut focus = FxHashSet::default();
1024 focus.insert(changed_path.clone());
1025
1026 let full_report = find_duplicates(dir.path(), &files, &config);
1027 let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
1028 let expected_touching = full_report
1029 .clone_groups
1030 .iter()
1031 .filter(|group| {
1032 group
1033 .instances
1034 .iter()
1035 .any(|instance| instance.file == changed_path)
1036 })
1037 .count();
1038
1039 assert!(
1040 !report.clone_groups.is_empty(),
1041 "focused file should still match an unchanged duplicate"
1042 );
1043 assert_eq!(
1044 report.clone_groups.len(),
1045 expected_touching,
1046 "focused shingle filtering must not drop clone groups touching the focused file"
1047 );
1048 assert!(report.clone_groups.iter().all(|group| {
1049 group
1050 .instances
1051 .iter()
1052 .any(|instance| instance.file == changed_path)
1053 }));
1054 }
1055
1056 #[test]
1057 fn file_wide_suppression_excludes_file() {
1058 let dir = tempfile::tempdir().expect("create temp dir");
1059 let src_dir = dir.path().join("src");
1060 std::fs::create_dir_all(&src_dir).expect("create src dir");
1061
1062 let code = r#"
1063export function processData(input: string): string {
1064 const trimmed = input.trim();
1065 if (trimmed.length === 0) {
1066 return "";
1067 }
1068 const parts = trimmed.split(",");
1069 const filtered = parts.filter(p => p.length > 0);
1070 const mapped = filtered.map(p => p.toUpperCase());
1071 return mapped.join(", ");
1072}
1073"#;
1074 let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");
1075
1076 std::fs::write(src_dir.join("original.ts"), code).expect("write original");
1077 std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
1078 std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
1079 .expect("write package.json");
1080
1081 let files = vec![
1082 DiscoveredFile {
1083 id: FileId(0),
1084 path: src_dir.join("original.ts"),
1085 size_bytes: code.len() as u64,
1086 },
1087 DiscoveredFile {
1088 id: FileId(1),
1089 path: src_dir.join("suppressed.ts"),
1090 size_bytes: suppressed_code.len() as u64,
1091 },
1092 ];
1093
1094 let config = DuplicatesConfig {
1095 min_tokens: 10,
1096 min_lines: 2,
1097 ..DuplicatesConfig::default()
1098 };
1099
1100 let report = find_duplicates(dir.path(), &files, &config);
1101 assert!(
1102 report.clone_groups.is_empty(),
1103 "File-wide suppression should exclude file from duplication analysis"
1104 );
1105 }
1106
1107 #[test]
1108 #[expect(
1109 clippy::too_many_lines,
1110 reason = "test fixture; linear setup/assert, length is not a maintainability concern"
1111 )]
1112 fn min_occurrences_hides_pairs_and_records_count() {
1113 let dir = tempfile::tempdir().expect("create temp dir");
1114 let src_dir = dir.path().join("src");
1115 std::fs::create_dir_all(&src_dir).expect("create src dir");
1116
1117 let block_a = r#"
1118export function blockA(input: string): string {
1119 const trimmed = input.trim();
1120 if (trimmed.length === 0) {
1121 return "";
1122 }
1123 const parts = trimmed.split(",");
1124 const filtered = parts.filter(p => p.length > 0);
1125 const mapped = filtered.map(p => p.toUpperCase());
1126 return mapped.join(", ");
1127}
1128"#;
1129 let block_b = r"
1130export function blockB(value: number): number {
1131 if (value <= 0) {
1132 return 0;
1133 }
1134 let total = 0;
1135 for (let i = 1; i <= value; i += 1) {
1136 total += i * 2;
1137 total -= 1;
1138 }
1139 return total + 7;
1140}
1141";
1142
1143 let pair_a1 = src_dir.join("pair-a1.ts");
1144 let pair_a2 = src_dir.join("pair-a2.ts");
1145 let triple_b1 = src_dir.join("triple-b1.ts");
1146 let triple_b2 = src_dir.join("triple-b2.ts");
1147 let triple_b3 = src_dir.join("triple-b3.ts");
1148 std::fs::write(&pair_a1, block_a).expect("write");
1149 std::fs::write(&pair_a2, block_a).expect("write");
1150 std::fs::write(&triple_b1, block_b).expect("write");
1151 std::fs::write(&triple_b2, block_b).expect("write");
1152 std::fs::write(&triple_b3, block_b).expect("write");
1153
1154 let files = vec![
1155 DiscoveredFile {
1156 id: FileId(0),
1157 path: pair_a1,
1158 size_bytes: block_a.len() as u64,
1159 },
1160 DiscoveredFile {
1161 id: FileId(1),
1162 path: pair_a2,
1163 size_bytes: block_a.len() as u64,
1164 },
1165 DiscoveredFile {
1166 id: FileId(2),
1167 path: triple_b1,
1168 size_bytes: block_b.len() as u64,
1169 },
1170 DiscoveredFile {
1171 id: FileId(3),
1172 path: triple_b2,
1173 size_bytes: block_b.len() as u64,
1174 },
1175 DiscoveredFile {
1176 id: FileId(4),
1177 path: triple_b3,
1178 size_bytes: block_b.len() as u64,
1179 },
1180 ];
1181
1182 let default_config = DuplicatesConfig {
1183 min_tokens: 10,
1184 min_lines: 2,
1185 ..DuplicatesConfig::default()
1186 };
1187 let baseline = find_duplicates(dir.path(), &files, &default_config);
1188 assert_eq!(
1189 baseline.clone_groups.len(),
1190 2,
1191 "default minOccurrences should report both the pair and the triple"
1192 );
1193 assert_eq!(
1194 baseline.stats.clone_groups_below_min_occurrences, 0,
1195 "default minOccurrences hides nothing"
1196 );
1197 let baseline_pct = baseline.stats.duplication_percentage;
1198
1199 let raised_config = DuplicatesConfig {
1200 min_tokens: 10,
1201 min_lines: 2,
1202 min_occurrences: 3,
1203 ..DuplicatesConfig::default()
1204 };
1205 let report = find_duplicates(dir.path(), &files, &raised_config);
1206 assert_eq!(
1207 report.clone_groups.len(),
1208 1,
1209 "minOccurrences=3 should hide the 2-instance group"
1210 );
1211 assert_eq!(
1212 report.clone_groups[0].instances.len(),
1213 3,
1214 "surviving group must be the 3-instance group"
1215 );
1216 assert_eq!(
1217 report.stats.clone_groups_below_min_occurrences, 1,
1218 "the hidden 2-instance group must be counted"
1219 );
1220 assert_eq!(
1221 report.stats.clone_groups, 1,
1222 "stats.clone_groups must match the post-filter array length"
1223 );
1224 assert_eq!(
1225 report.stats.clone_instances, 3,
1226 "stats.clone_instances must match the surviving instance total"
1227 );
1228 assert!(
1229 (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
1230 "duplication_percentage should not shift when minOccurrences changes"
1231 );
1232 }
1233
1234 #[test]
1235 fn min_occurrences_evaluates_after_line_suppressions() {
1236 let dir = tempfile::tempdir().expect("create temp dir");
1237 let src_dir = dir.path().join("src");
1238 std::fs::create_dir_all(&src_dir).expect("create src dir");
1239
1240 let block = r#"
1241export function shared(input: string): string {
1242 const trimmed = input.trim();
1243 if (trimmed.length === 0) {
1244 return "";
1245 }
1246 const parts = trimmed.split(",");
1247 const filtered = parts.filter(p => p.length > 0);
1248 const mapped = filtered.map(p => p.toUpperCase());
1249 return mapped.join(", ");
1250}
1251"#;
1252 let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");
1253
1254 let a = src_dir.join("a.ts");
1255 let b = src_dir.join("b.ts");
1256 let c = src_dir.join("c.ts");
1257 std::fs::write(&a, block).expect("write a");
1258 std::fs::write(&b, block).expect("write b");
1259 std::fs::write(&c, &suppressed).expect("write c");
1260
1261 let files = vec![
1262 DiscoveredFile {
1263 id: FileId(0),
1264 path: a,
1265 size_bytes: block.len() as u64,
1266 },
1267 DiscoveredFile {
1268 id: FileId(1),
1269 path: b,
1270 size_bytes: block.len() as u64,
1271 },
1272 DiscoveredFile {
1273 id: FileId(2),
1274 path: c,
1275 size_bytes: suppressed.len() as u64,
1276 },
1277 ];
1278
1279 let config = DuplicatesConfig {
1280 min_tokens: 10,
1281 min_lines: 2,
1282 min_occurrences: 3,
1283 ..DuplicatesConfig::default()
1284 };
1285 let report = find_duplicates(dir.path(), &files, &config);
1286 assert!(
1287 report.clone_groups.is_empty(),
1288 "post-suppression 2-instance group must be hidden by minOccurrences=3, \
1289 got groups: {:?}",
1290 report
1291 .clone_groups
1292 .iter()
1293 .map(|g| g.instances.len())
1294 .collect::<Vec<_>>()
1295 );
1296 assert_eq!(
1297 report.stats.clone_groups, 0,
1298 "stats.clone_groups must match the empty post-filter array"
1299 );
1300 assert_eq!(
1301 report.stats.clone_instances, 0,
1302 "stats.clone_instances must match the empty post-filter array"
1303 );
1304 }
1305}