1mod cache;
9pub mod deepdive;
10pub mod detect;
11pub mod families;
12pub mod normalize;
13mod shingle_filter;
14pub mod token_types;
15mod token_visitor;
16pub mod tokenize;
17pub(crate) mod types;
18
19use rustc_hash::FxHashMap;
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
24use rayon::prelude::*;
25use rustc_hash::FxHashSet;
26
27use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
28pub use deepdive::{
29 CloneFingerprintKey, CloneFingerprintSet, FINGERPRINT_PREFIX, clone_fingerprint,
30 dominant_identifier, fingerprint_for_fragment, group_refactoring_suggestion,
31};
32use detect::CloneDetector;
33use normalize::normalize_and_hash_resolved;
34use tokenize::{tokenize_file, tokenize_file_cross_language};
35pub use types::{
36 CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
37 DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
38 RefactoringKind, RefactoringSuggestion,
39};
40
41use crate::discover::{self, DiscoveredFile};
42use crate::suppress::{self, IssueKind, Suppression};
43
/// Glob patterns that duplication analysis skips by default.
///
/// `build_ignore_set` adds every entry when `DuplicatesConfig::ignore_defaults`
/// is enabled. Files skipped because of one of these patterns are counted per
/// pattern and reported through `DefaultIgnoreSkips`.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    // NOTE(review): these look like framework/bundler build-output and cache
    // directories — confirm the intent before adding or removing entries.
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
60
/// A source file that passed the ignore, suppression and minimum-size filters,
/// bundled with everything the clone detector and the token cache need.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the file, copied from `DiscoveredFile::path`.
    path: PathBuf,
    /// Normalized and hashed token stream fed to the clone detector.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Tokenizer output for the file (freshly produced or restored from cache).
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata read before tokenization; handed to the token cache
    /// when a freshly tokenized file is inserted.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the tokens were served by the token cache, `false` when the
    /// file was tokenized during this run.
    cache_hit: bool,
    /// Suppressions for this file, either parsed from its source or restored
    /// from the cache entry.
    suppressions: Vec<Suppression>,
}
70
/// Compiled ignore patterns used to drop files before tokenization.
struct IgnoreSet {
    /// Combined matcher containing the default patterns (when enabled) plus the
    /// user-configured `duplicates.ignore` patterns.
    all: GlobSet,
    /// One `(pattern, matcher)` pair per enabled default pattern, kept
    /// separately so skips can be attributed to the pattern that caused them.
    defaults: Vec<(&'static str, GlobMatcher)>,
}
75
76impl IgnoreSet {
77 fn is_match(&self, path: &Path) -> bool {
78 self.all.is_match(path)
79 }
80
81 fn default_match_index(&self, path: &Path) -> Option<usize> {
82 self.defaults
83 .iter()
84 .position(|(_, matcher)| matcher.is_match(path))
85 }
86}
87
/// Everything a single `find_duplicates_inner` run produces.
struct DuplicationRun {
    /// The duplication report after suppression and `min_occurrences` filtering.
    report: DuplicationReport,
    /// How many files were skipped by each default ignore pattern.
    default_ignore_skips: DefaultIgnoreSkips,
}
92
93pub fn find_duplicates(
102 root: &Path,
103 files: &[DiscoveredFile],
104 config: &DuplicatesConfig,
105) -> DuplicationReport {
106 find_duplicates_inner(root, files, config, None, None).report
107}
108
109pub fn find_duplicates_with_default_ignore_skips(
112 root: &Path,
113 files: &[DiscoveredFile],
114 config: &DuplicatesConfig,
115) -> (DuplicationReport, DefaultIgnoreSkips) {
116 let run = find_duplicates_inner(root, files, config, None, None);
117 (run.report, run.default_ignore_skips)
118}
119
120pub fn find_duplicates_cached(
122 root: &Path,
123 files: &[DiscoveredFile],
124 config: &DuplicatesConfig,
125 cache_root: &Path,
126) -> DuplicationReport {
127 find_duplicates_inner(root, files, config, None, Some(cache_root)).report
128}
129
130pub fn find_duplicates_cached_with_default_ignore_skips(
133 root: &Path,
134 files: &[DiscoveredFile],
135 config: &DuplicatesConfig,
136 cache_root: &Path,
137) -> (DuplicationReport, DefaultIgnoreSkips) {
138 let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
139 (run.report, run.default_ignore_skips)
140}
141
142#[expect(
148 clippy::implicit_hasher,
149 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
150)]
151pub fn find_duplicates_touching_files(
152 root: &Path,
153 files: &[DiscoveredFile],
154 config: &DuplicatesConfig,
155 focus_files: &FxHashSet<PathBuf>,
156) -> DuplicationReport {
157 find_duplicates_inner(root, files, config, Some(focus_files), None).report
158}
159
160#[expect(
163 clippy::implicit_hasher,
164 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
165)]
166pub fn find_duplicates_touching_files_with_default_ignore_skips(
167 root: &Path,
168 files: &[DiscoveredFile],
169 config: &DuplicatesConfig,
170 focus_files: &FxHashSet<PathBuf>,
171) -> (DuplicationReport, DefaultIgnoreSkips) {
172 let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
173 (run.report, run.default_ignore_skips)
174}
175
176#[expect(
178 clippy::implicit_hasher,
179 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
180)]
181pub fn find_duplicates_touching_files_cached(
182 root: &Path,
183 files: &[DiscoveredFile],
184 config: &DuplicatesConfig,
185 focus_files: &FxHashSet<PathBuf>,
186 cache_root: &Path,
187) -> DuplicationReport {
188 find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
189}
190
191#[expect(
194 clippy::implicit_hasher,
195 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
196)]
197pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
198 root: &Path,
199 files: &[DiscoveredFile],
200 config: &DuplicatesConfig,
201 focus_files: &FxHashSet<PathBuf>,
202 cache_root: &Path,
203) -> (DuplicationReport, DefaultIgnoreSkips) {
204 let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
205 (run.report, run.default_ignore_skips)
206}
207
/// Shared implementation behind every public `find_duplicates*` entry point.
///
/// Steps, in order:
/// 1. Build the ignore set and one skip counter per default ignore pattern.
/// 2. In parallel, drop ignored and file-suppressed files, then tokenize each
///    remaining file (or take its tokens from the token cache).
/// 3. Insert freshly tokenized files into the token cache and save it.
/// 4. In focus mode, pre-filter the corpus with the shingle filter when the
///    corpus is large enough.
/// 5. Run clone detection, then apply line suppressions and the
///    `min_occurrences` filter.
/// 6. Derive clone families and mirrored directories, then sort the report.
///
/// # Arguments
/// * `root` - prefix stripped from file paths before ignore-glob matching; also
///   passed to family and mirrored-directory grouping.
/// * `files` - the discovered corpus.
/// * `config` - duplication settings (thresholds, ignores, normalization).
/// * `focus_files` - when `Some`, detection goes through
///   `CloneDetector::detect_touching_files` instead of `detect`.
/// * `cache_root` - when `Some` and the corpus has at least
///   `config.min_corpus_size_for_token_cache` files, the token cache is loaded
///   from and saved to this location.
///
/// # Returns
/// The final report together with the per-pattern default-ignore skip counts.
fn find_duplicates_inner(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    focus_files: Option<&FxHashSet<PathBuf>>,
    cache_root: Option<&Path>,
) -> DuplicationRun {
    let _span = tracing::info_span!("find_duplicates").entered();

    let extra_ignores = build_ignore_set(config);
    // One atomic counter per default pattern, index-aligned with
    // `IgnoreSet::defaults`, so the parallel loop below can count skips
    // without a lock. Empty when there is no ignore set.
    let default_skip_counts = extra_ignores
        .as_ref()
        .map(|ignores| {
            std::iter::repeat_with(|| AtomicUsize::new(0))
                .take(ignores.defaults.len())
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    let normalization =
        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);

    let strip_types = config.cross_language;
    let skip_imports = config.ignore_imports;

    tracing::debug!(
        ignore_imports = skip_imports,
        "duplication tokenization config"
    );

    // The cache mode captures every setting that influences the token stream;
    // it is passed to each cache lookup and insert.
    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
    // Small corpora bypass the cache entirely: below the threshold `cache_root`
    // becomes `None`, so nothing is loaded here and nothing is saved later.
    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
    let token_cache = cache_root.map(TokenCache::load);

    let mut file_data: Vec<TokenizedFile> = files
        .par_iter()
        .filter_map(|file| {
            // Ignore globs are matched against the root-relative path; paths
            // outside `root` fall back to the path as given.
            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
            if let Some(ref ignores) = extra_ignores {
                // Default patterns are checked first so the skip is attributed
                // to (and counted for) the first default pattern that matched.
                if let Some(index) = ignores.default_match_index(relative) {
                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
                    return None;
                }
                if ignores.is_match(relative) {
                    return None;
                }
            }

            // Files whose metadata cannot be read are skipped silently.
            let metadata = std::fs::metadata(&file.path).ok()?;

            let cached_entry = token_cache
                .as_ref()
                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
            let cache_hit = cached_entry.is_some();

            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
                // Cache hit: the source is not read; suppressions come from the
                // cached entry.
                let suppressions = entry.suppressions.clone();
                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                    return None;
                }
                (entry, suppressions)
            } else {
                // Cache miss (or no cache): read and tokenize the file. Files
                // that cannot be read as a string are skipped silently.
                let source = std::fs::read_to_string(&file.path).ok()?;
                let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                    return None;
                }

                let file_tokens = if strip_types {
                    // NOTE(review): the literal `true` presumably enables type
                    // stripping — confirm against `tokenize_file_cross_language`.
                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
                } else {
                    tokenize_file(&file.path, &source, skip_imports)
                };
                if file_tokens.tokens.is_empty() {
                    return None;
                }

                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
                let entry = TokenCacheEntry {
                    hashed_tokens: hashed,
                    file_tokens,
                    suppressions: suppressions.clone(),
                };
                (entry, suppressions)
            };
            // This check also covers cached entries, which skipped the
            // emptiness check in the tokenization branch above.
            if entry.file_tokens.tokens.is_empty() {
                return None;
            }
            // Files with fewer hashed tokens than `min_tokens` are dropped.
            if entry.hashed_tokens.len() < config.min_tokens {
                return None;
            }

            Some(TokenizedFile {
                path: file.path.clone(),
                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
                file_tokens: entry.file_tokens,
                metadata: Some(metadata),
                cache_hit,
                suppressions,
            })
        })
        .collect();

    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
        // Only files that were tokenized in this run are inserted; cache hits
        // are already present. Files filtered out above are never inserted.
        for file in &file_data {
            if !file.cache_hit
                && let Some(metadata) = &file.metadata
            {
                cache.insert(
                    &file.path,
                    metadata,
                    token_cache_mode,
                    &file.hashed_tokens,
                    &file.file_tokens,
                    &file.suppressions,
                );
            }
        }
        // NOTE(review): presumably drops entries for paths no longer in
        // `files` — confirm against `TokenCache::retain_paths`.
        cache.retain_paths(files);
        // A failed save is logged and otherwise ignored; analysis continues.
        match cache.save_if_dirty() {
            Ok(true) => {
                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
            }
            Ok(false) => {
                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
            }
            Err(err) => {
                tracing::warn!("Failed to save duplication token cache: {err}");
            }
        }
    }

    tracing::info!(
        files = file_data.len(),
        "tokenized files for duplication analysis"
    );

    // The shingle pre-filter runs after the cache update above (so the cache
    // sees every tokenized file) and only in focus mode on corpora of at least
    // `min_corpus_size_for_shingle_filter` files.
    if let Some(focus_files) = focus_files
        && file_data.len() >= config.min_corpus_size_for_shingle_filter
    {
        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
    }

    // Collected before `file_data` is consumed below; only files that actually
    // carry suppressions get an entry.
    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
        .iter()
        .filter(|file| !file.suppressions.is_empty())
        .map(|file| (file.path.clone(), file.suppressions.clone()))
        .collect();

    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
        file_data
            .into_iter()
            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
            .collect();

    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
    let mut report = if let Some(focus_files) = focus_files {
        detector.detect_touching_files(detector_data, focus_files)
    } else {
        detector.detect(detector_data)
    };

    if !suppressions_by_file.is_empty() {
        apply_line_suppressions(&mut report, &suppressions_by_file);
    }

    // Runs after line suppressions, so the occurrence threshold is evaluated
    // against the instances that survived suppression.
    apply_min_occurrences_filter(&mut report, config.min_occurrences);

    let default_ignore_skips =
        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);

    // Families and mirrored directories are derived from the filtered groups.
    report.clone_families = families::group_into_families(&report.clone_groups, root);

    report.mirrored_directories =
        families::detect_mirrored_directories(&report.clone_families, root);

    report.sort();

    DuplicationRun {
        report,
        default_ignore_skips,
    }
}
408
409fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
424 if min <= 2 {
425 return;
426 }
427 let before = report.clone_groups.len();
428 report
429 .clone_groups
430 .retain(|group| group.instances.len() >= min);
431 let hidden = before - report.clone_groups.len();
432 if hidden == 0 {
433 return;
434 }
435 report.stats.clone_groups_below_min_occurrences = hidden;
436 report.stats.clone_groups = report.clone_groups.len();
437 report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
438}
439
440#[expect(
442 clippy::cast_possible_truncation,
443 reason = "line numbers are bounded by source size"
444)]
445fn apply_line_suppressions(
446 report: &mut DuplicationReport,
447 suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
448) {
449 report.clone_groups.retain_mut(|group| {
450 group.instances.retain(|instance| {
451 if let Some(supps) = suppressions_by_file.get(&instance.file) {
452 for line in instance.start_line..=instance.end_line {
454 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
455 return false;
456 }
457 }
458 }
459 true
460 });
461 group.instances.len() >= 2
463 });
464}
465
466#[must_use]
470pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
471 let resolved = crate::default_config(root);
472 let files = discover::discover_files_with_plugin_scopes(&resolved);
473 find_duplicates(root, &files, config)
474}
475
476fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
478 if !config.ignore_defaults && config.ignore.is_empty() {
479 return None;
480 }
481
482 let mut builder = GlobSetBuilder::new();
483 let mut defaults = Vec::new();
484
485 if config.ignore_defaults {
486 for pattern in DUPES_DEFAULT_IGNORES {
487 let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
488 defaults.push((*pattern, glob.compile_matcher()));
489 builder.add(glob);
490 }
491 }
492
493 for pattern in &config.ignore {
496 builder.add(
497 Glob::new(pattern)
498 .expect("duplicates.ignore pattern was validated at config load time"),
499 );
500 }
501
502 builder.build().ok().map(|all| IgnoreSet { all, defaults })
503}
504
505fn build_default_ignore_skips(
506 ignores: Option<&IgnoreSet>,
507 counts: &[AtomicUsize],
508) -> DefaultIgnoreSkips {
509 let Some(ignores) = ignores else {
510 return DefaultIgnoreSkips::default();
511 };
512
513 let by_pattern = ignores
514 .defaults
515 .iter()
516 .zip(counts)
517 .filter_map(|((pattern, _), count)| {
518 let count = count.load(Ordering::Relaxed);
519 (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
520 })
521 .collect::<Vec<_>>();
522 let total = by_pattern.iter().map(|entry| entry.count).sum();
523
524 DefaultIgnoreSkips { total, by_pattern }
525}
526
527#[cfg(test)]
528mod tests {
529 use super::*;
530 use crate::discover::FileId;
531
532 #[test]
533 fn find_duplicates_empty_files() {
534 let config = DuplicatesConfig::default();
535 let report = find_duplicates(Path::new("/tmp"), &[], &config);
536 assert!(report.clone_groups.is_empty());
537 assert!(report.clone_families.is_empty());
538 assert_eq!(report.stats.total_files, 0);
539 }
540
541 #[test]
542 fn build_ignore_set_empty() {
543 let config = DuplicatesConfig {
544 ignore_defaults: false,
545 ..DuplicatesConfig::default()
546 };
547 assert!(build_ignore_set(&config).is_none());
548 }
549
550 #[test]
551 fn build_ignore_set_valid_patterns() {
552 let config = DuplicatesConfig {
553 ignore_defaults: false,
554 ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
555 ..DuplicatesConfig::default()
556 };
557 let set = build_ignore_set(&config);
558 assert!(set.is_some());
559 let set = set.unwrap();
560 assert!(set.is_match(Path::new("src/foo.test.ts")));
561 assert!(set.is_match(Path::new("src/bar.spec.ts")));
562 assert!(!set.is_match(Path::new("src/baz.ts")));
563 }
564
565 #[test]
566 fn build_ignore_set_merges_defaults_with_user_patterns() {
567 let config = DuplicatesConfig {
568 ignore: vec!["**/foo/**".to_string()],
569 ..DuplicatesConfig::default()
570 };
571 let set = build_ignore_set(&config).expect("ignore set");
572 assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
573 assert!(set.is_match(Path::new("src/foo/generated.js")));
574 }
575
576 #[test]
577 fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
578 let config = DuplicatesConfig {
579 ignore_defaults: false,
580 ignore: vec!["**/foo/**".to_string()],
581 ..DuplicatesConfig::default()
582 };
583 let set = build_ignore_set(&config).expect("ignore set");
584 assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
585 assert!(set.is_match(Path::new("src/foo/generated.js")));
586 }
587
    /// End-to-end check on disk: two byte-identical TypeScript files must
    /// produce clone groups, count both files as containing clones, and yield
    /// at least one clone family.
    #[test]
    fn find_duplicates_with_real_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        // Sources live under `<tmp>/src`; the temp dir itself is the root.
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        // The same content is written twice so every block has an exact copy.
        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Thresholds are lowered so the small fixture qualifies as a clone.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        // Family grouping runs as part of the same call.
        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }
656
    /// With a two-file corpus and the default
    /// `min_corpus_size_for_token_cache`, the cached entry point must still
    /// find the clone but must not create the cache directory.
    #[test]
    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = "export function same(input: number): number {\n const doubled = input * 2;\n return doubled + 1;\n}\n";
        let first = src_dir.join("first.ts");
        let second = src_dir.join("second.ts");
        std::fs::write(&first, code).expect("write first");
        std::fs::write(&second, code).expect("write second");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: first,
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: second,
                size_bytes: code.len() as u64,
            },
        ];
        let config = DuplicatesConfig {
            min_tokens: 5,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };
        // The cache location does not exist yet; the run must not create it.
        let cache_root = dir.path().join(".fallow");

        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);

        assert!(!report.clone_groups.is_empty());
        assert!(
            !cache_root.exists(),
            "small projects should avoid token-cache IO overhead"
        );
    }
696
    /// Focus mode must report exactly the clone groups of a full run that
    /// involve the focused file: the duplicate in an unchanged file is kept,
    /// and the unrelated duplicate pair is left out.
    #[test]
    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let focused_code = r"
export function focused(input: number): number {
    const doubled = input * 2;
    const shifted = doubled + 10;
    return shifted / 2;
}
";
        let untouched_code = r#"
export function untouched(input: string): string {
    const lowered = input.toLowerCase();
    const padded = lowered.padStart(10, "x");
    return padded.slice(0, 8);
}
"#;

        // Two duplicate pairs: one that includes the focused file and one
        // made only of files outside the focus set.
        let changed_path = src_dir.join("changed.ts");
        let focused_copy_path = src_dir.join("focused-copy.ts");
        let untouched_a_path = src_dir.join("untouched-a.ts");
        let untouched_b_path = src_dir.join("untouched-b.ts");
        std::fs::write(&changed_path, focused_code).expect("write changed");
        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: changed_path.clone(),
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: focused_copy_path,
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: untouched_a_path,
                size_bytes: untouched_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(3),
                path: untouched_b_path,
                size_bytes: untouched_code.len() as u64,
            },
        ];

        // `min_corpus_size_for_shingle_filter: 1` forces the shingle
        // pre-filter to run even on this tiny corpus.
        let config = DuplicatesConfig {
            mode: DetectionMode::Strict,
            min_tokens: 5,
            min_lines: 2,
            min_corpus_size_for_shingle_filter: 1,
            ..DuplicatesConfig::default()
        };
        let mut focus = FxHashSet::default();
        focus.insert(changed_path.clone());

        // The unfocused run is the reference for which groups touch the
        // focused file.
        let full_report = find_duplicates(dir.path(), &files, &config);
        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
        let expected_touching = full_report
            .clone_groups
            .iter()
            .filter(|group| {
                group
                    .instances
                    .iter()
                    .any(|instance| instance.file == changed_path)
            })
            .count();

        assert!(
            !report.clone_groups.is_empty(),
            "focused file should still match an unchanged duplicate"
        );
        assert_eq!(
            report.clone_groups.len(),
            expected_touching,
            "focused shingle filtering must not drop clone groups touching the focused file"
        );
        // Every reported group must contain an instance in the focused file.
        assert!(report.clone_groups.iter().all(|group| {
            group
                .instances
                .iter()
                .any(|instance| instance.file == changed_path)
        }));
    }
789
    /// A `fallow-ignore-file code-duplication` comment removes the file from
    /// the analysis, so its otherwise identical twin has nothing to match.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        // Same code, prefixed with the file-wide suppression comment.
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        // With the suppressed file gone, only one copy of the code remains.
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
841
    /// Raising `min_occurrences` to 3 must hide the two-instance group, keep
    /// the three-instance group, record the hidden group in the stats, and
    /// leave `duplication_percentage` unchanged.
    #[test]
    fn min_occurrences_hides_pairs_and_records_count() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // `block_a` is written to two files (a pair) and `block_b` to three
        // files (a triple); the two blocks are different code.
        let block_a = r#"
export function blockA(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        let block_b = r"
export function blockB(value: number): number {
    if (value <= 0) {
        return 0;
    }
    let total = 0;
    for (let i = 1; i <= value; i += 1) {
        total += i * 2;
        total -= 1;
    }
    return total + 7;
}
";

        let pair_a1 = src_dir.join("pair-a1.ts");
        let pair_a2 = src_dir.join("pair-a2.ts");
        let triple_b1 = src_dir.join("triple-b1.ts");
        let triple_b2 = src_dir.join("triple-b2.ts");
        let triple_b3 = src_dir.join("triple-b3.ts");
        std::fs::write(&pair_a1, block_a).expect("write");
        std::fs::write(&pair_a2, block_a).expect("write");
        std::fs::write(&triple_b1, block_b).expect("write");
        std::fs::write(&triple_b2, block_b).expect("write");
        std::fs::write(&triple_b3, block_b).expect("write");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: pair_a1,
                size_bytes: block_a.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: pair_a2,
                size_bytes: block_a.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: triple_b1,
                size_bytes: block_b.len() as u64,
            },
            DiscoveredFile {
                id: FileId(3),
                path: triple_b2,
                size_bytes: block_b.len() as u64,
            },
            DiscoveredFile {
                id: FileId(4),
                path: triple_b3,
                size_bytes: block_b.len() as u64,
            },
        ];

        // Baseline: `min_occurrences` is left at its default.
        let default_config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };
        let baseline = find_duplicates(dir.path(), &files, &default_config);
        assert_eq!(
            baseline.clone_groups.len(),
            2,
            "default minOccurrences should report both the pair and the triple"
        );
        assert_eq!(
            baseline.stats.clone_groups_below_min_occurrences, 0,
            "default minOccurrences hides nothing"
        );
        let baseline_pct = baseline.stats.duplication_percentage;

        // Same thresholds, but groups now need at least three instances.
        let raised_config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            min_occurrences: 3,
            ..DuplicatesConfig::default()
        };
        let report = find_duplicates(dir.path(), &files, &raised_config);
        assert_eq!(
            report.clone_groups.len(),
            1,
            "minOccurrences=3 should hide the 2-instance group"
        );
        assert_eq!(
            report.clone_groups[0].instances.len(),
            3,
            "surviving group must be the 3-instance group"
        );
        assert_eq!(
            report.stats.clone_groups_below_min_occurrences, 1,
            "the hidden 2-instance group must be counted"
        );
        // Group and instance counts in the stats must follow the filtered list.
        assert_eq!(
            report.stats.clone_groups, 1,
            "stats.clone_groups must match the post-filter array length"
        );
        assert_eq!(
            report.stats.clone_instances, 3,
            "stats.clone_instances must match the surviving instance total"
        );
        // The percentage must be the same as in the unfiltered baseline.
        assert!(
            (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
            "duplication_percentage should not shift when minOccurrences changes"
        );
    }
972
    /// The `min_occurrences` threshold must be evaluated against the instances
    /// that remain after suppression: three copies with one suppressed leave a
    /// two-instance group, which `min_occurrences: 3` must hide.
    ///
    /// Note that this fixture suppresses the third copy with a file-wide
    /// `fallow-ignore-file` comment rather than a line-level suppression.
    #[test]
    fn min_occurrences_evaluates_after_line_suppressions() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let block = r#"
export function shared(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");

        // `a` and `b` hold the plain block; `c` holds the suppressed copy.
        let a = src_dir.join("a.ts");
        let b = src_dir.join("b.ts");
        let c = src_dir.join("c.ts");
        std::fs::write(&a, block).expect("write a");
        std::fs::write(&b, block).expect("write b");
        std::fs::write(&c, &suppressed).expect("write c");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: a,
                size_bytes: block.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: b,
                size_bytes: block.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: c,
                size_bytes: suppressed.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            min_occurrences: 3,
            ..DuplicatesConfig::default()
        };
        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "post-suppression 2-instance group must be hidden by minOccurrences=3, \
             got groups: {:?}",
            report
                .clone_groups
                .iter()
                .map(|g| g.instances.len())
                .collect::<Vec<_>>()
        );
        // The stats must agree with the now-empty group list.
        assert_eq!(
            report.stats.clone_groups, 0,
            "stats.clone_groups must match the empty post-filter array"
        );
        assert_eq!(
            report.stats.clone_instances, 0,
            "stats.clone_instances must match the empty post-filter array"
        );
    }
1049}