1mod cache;
9pub mod detect;
10pub mod families;
11pub mod normalize;
12mod shingle_filter;
13pub mod token_types;
14mod token_visitor;
15pub mod tokenize;
16pub(crate) mod types;
17
18use rustc_hash::FxHashMap;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::{AtomicUsize, Ordering};
21
22use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
23use rayon::prelude::*;
24use rustc_hash::FxHashSet;
25
26use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
27use detect::CloneDetector;
28use normalize::normalize_and_hash_resolved;
29use tokenize::{tokenize_file, tokenize_file_cross_language};
30pub use types::{
31 CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
32 DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
33 RefactoringKind, RefactoringSuggestion,
34};
35
36use crate::discover::{self, DiscoveredFile};
37use crate::suppress::{self, IssueKind, Suppression};
38
/// Glob patterns that `build_ignore_set` adds to the duplication ignore set
/// whenever `DuplicatesConfig::ignore_defaults` is enabled.
///
/// Every pattern has the shape `**/<dir>/**`, i.e. it matches any path that
/// passes through a directory with that name at any depth. The directory
/// names look like framework build-output and cache folders (presumably
/// generated content that should not be reported as duplication).
///
/// The order of the entries is significant: `IgnoreSet::default_match_index`
/// reports the position of the first matching pattern, and skip counts are
/// recorded per position.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
55
/// Per-file result of the tokenization stage in `find_duplicates_inner`.
///
/// One value is produced for every file that survived the ignore globs,
/// file-level suppression and the minimum-token filter.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the source file, cloned from the `DiscoveredFile`.
    path: PathBuf,
    /// Normalized and hashed tokens that are handed to the clone detector.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Tokenizer output for the file (freshly computed or taken from the cache).
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata read before tokenizing; passed to the token cache
    /// when the entry is inserted.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the tokens came from the token cache, so the entry does not
    /// need to be inserted again.
    cache_hit: bool,
    /// Suppression comments found in the file; used later for line-level
    /// suppression of clone instances.
    suppressions: Vec<Suppression>,
}
65
/// Compiled ignore globs for a duplication run, as built by `build_ignore_set`.
struct IgnoreSet {
    /// Combined set containing the default patterns (when enabled) and the
    /// user-configured patterns.
    all: GlobSet,
    /// Each default pattern together with its own matcher, so a skip can be
    /// attributed to the specific default pattern that caused it.
    defaults: Vec<(&'static str, GlobMatcher)>,
}
70
71impl IgnoreSet {
72 fn is_match(&self, path: &Path) -> bool {
73 self.all.is_match(path)
74 }
75
76 fn default_match_index(&self, path: &Path) -> Option<usize> {
77 self.defaults
78 .iter()
79 .position(|(_, matcher)| matcher.is_match(path))
80 }
81}
82
/// Everything produced by one call to `find_duplicates_inner`.
///
/// The public entry points either return only `report` or return both fields
/// as a tuple.
struct DuplicationRun {
    /// The duplication report after suppressions, filtering and sorting.
    report: DuplicationReport,
    /// How many files were skipped because of the built-in default ignore
    /// patterns, in total and per pattern.
    default_ignore_skips: DefaultIgnoreSkips,
}
87
88pub fn find_duplicates(
97 root: &Path,
98 files: &[DiscoveredFile],
99 config: &DuplicatesConfig,
100) -> DuplicationReport {
101 find_duplicates_inner(root, files, config, None, None).report
102}
103
104pub fn find_duplicates_with_default_ignore_skips(
107 root: &Path,
108 files: &[DiscoveredFile],
109 config: &DuplicatesConfig,
110) -> (DuplicationReport, DefaultIgnoreSkips) {
111 let run = find_duplicates_inner(root, files, config, None, None);
112 (run.report, run.default_ignore_skips)
113}
114
115pub fn find_duplicates_cached(
117 root: &Path,
118 files: &[DiscoveredFile],
119 config: &DuplicatesConfig,
120 cache_root: &Path,
121) -> DuplicationReport {
122 find_duplicates_inner(root, files, config, None, Some(cache_root)).report
123}
124
125pub fn find_duplicates_cached_with_default_ignore_skips(
128 root: &Path,
129 files: &[DiscoveredFile],
130 config: &DuplicatesConfig,
131 cache_root: &Path,
132) -> (DuplicationReport, DefaultIgnoreSkips) {
133 let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
134 (run.report, run.default_ignore_skips)
135}
136
137#[expect(
143 clippy::implicit_hasher,
144 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
145)]
146pub fn find_duplicates_touching_files(
147 root: &Path,
148 files: &[DiscoveredFile],
149 config: &DuplicatesConfig,
150 focus_files: &FxHashSet<PathBuf>,
151) -> DuplicationReport {
152 find_duplicates_inner(root, files, config, Some(focus_files), None).report
153}
154
155#[expect(
158 clippy::implicit_hasher,
159 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
160)]
161pub fn find_duplicates_touching_files_with_default_ignore_skips(
162 root: &Path,
163 files: &[DiscoveredFile],
164 config: &DuplicatesConfig,
165 focus_files: &FxHashSet<PathBuf>,
166) -> (DuplicationReport, DefaultIgnoreSkips) {
167 let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
168 (run.report, run.default_ignore_skips)
169}
170
171#[expect(
173 clippy::implicit_hasher,
174 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
175)]
176pub fn find_duplicates_touching_files_cached(
177 root: &Path,
178 files: &[DiscoveredFile],
179 config: &DuplicatesConfig,
180 focus_files: &FxHashSet<PathBuf>,
181 cache_root: &Path,
182) -> DuplicationReport {
183 find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
184}
185
186#[expect(
189 clippy::implicit_hasher,
190 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
191)]
192pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
193 root: &Path,
194 files: &[DiscoveredFile],
195 config: &DuplicatesConfig,
196 focus_files: &FxHashSet<PathBuf>,
197 cache_root: &Path,
198) -> (DuplicationReport, DefaultIgnoreSkips) {
199 let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
200 (run.report, run.default_ignore_skips)
201}
202
/// Shared implementation behind the public `find_duplicates*` entry points in
/// this file.
///
/// Pipeline:
/// 1. Build the ignore globs and one skip counter per default pattern.
/// 2. Tokenize every file in parallel, honoring ignore globs, file-level
///    suppressions, the optional token cache and `config.min_tokens`.
/// 3. Write newly tokenized files back to the token cache (when enabled).
/// 4. Optionally narrow the corpus to focus candidates via the shingle filter.
/// 5. Run the clone detector, then apply line suppressions and the
///    `min_occurrences` filter, in that order.
/// 6. Derive clone families and mirrored directories, and sort the report.
///
/// # Parameters
/// - `root`: prefix stripped from file paths before ignore-glob matching; also
///   forwarded to family / mirrored-directory grouping.
/// - `files`: the corpus to analyze.
/// - `config`: detection settings.
/// - `focus_files`: when `Some`, only clone groups touching these files are
///   detected (via `CloneDetector::detect_touching_files`).
/// - `cache_root`: when `Some`, candidate location of the token cache; ignored
///   if the corpus is smaller than `config.min_corpus_size_for_token_cache`.
///
/// # Returns
/// The finished report together with the default-ignore skip counts.
fn find_duplicates_inner(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    focus_files: Option<&FxHashSet<PathBuf>>,
    cache_root: Option<&Path>,
) -> DuplicationRun {
    let _span = tracing::info_span!("find_duplicates").entered();

    // `None` when neither default nor user ignore patterns are configured.
    let extra_ignores = build_ignore_set(config);
    // One atomic counter per default pattern (same order as
    // `ignores.defaults`) so the parallel workers below can record skips.
    // Empty when there is no ignore set.
    let default_skip_counts = extra_ignores
        .as_ref()
        .map(|ignores| {
            std::iter::repeat_with(|| AtomicUsize::new(0))
                .take(ignores.defaults.len())
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    // Combine the detection mode with the configured normalization settings.
    let normalization =
        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);

    let strip_types = config.cross_language;
    let skip_imports = config.ignore_imports;

    tracing::debug!(
        ignore_imports = skip_imports,
        "duplication tokenization config"
    );

    // The cache mode is built from every setting that influences tokenization;
    // it is passed to each cache lookup and insert.
    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
    // Disable the cache entirely for corpora below the configured size.
    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
    let token_cache = cache_root.map(TokenCache::load);

    // Tokenize all files in parallel; `filter_map` drops every file that is
    // ignored, suppressed, unreadable, empty or too short.
    let mut file_data: Vec<TokenizedFile> = files
        .par_iter()
        .filter_map(|file| {
            // Globs are matched against the path relative to `root`; paths
            // outside `root` are matched as-is.
            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
            if let Some(ref ignores) = extra_ignores {
                // Check default patterns first so the skip is counted against
                // the first default pattern that matched.
                if let Some(index) = ignores.default_match_index(relative) {
                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
                    return None;
                }
                // Remaining matches come from user patterns and are not counted.
                if ignores.is_match(relative) {
                    return None;
                }
            }

            // Files whose metadata cannot be read are skipped silently.
            let metadata = std::fs::metadata(&file.path).ok()?;

            let cached_entry = token_cache
                .as_ref()
                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
            let cache_hit = cached_entry.is_some();

            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
                // Cache hit: reuse tokens, but still honor a file-wide suppression.
                let suppressions = entry.suppressions.clone();
                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                    return None;
                }
                (entry, suppressions)
            } else {
                // Cache miss (or cache disabled): read and tokenize the file.
                // Files that cannot be read as a string are skipped silently.
                let source = std::fs::read_to_string(&file.path).ok()?;
                let suppressions = suppress::parse_suppressions_from_source(&source);
                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                    return None;
                }

                // Cross-language mode uses the cross-language tokenizer with
                // its third argument set to `true`.
                let file_tokens = if strip_types {
                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
                } else {
                    tokenize_file(&file.path, &source, skip_imports)
                };
                if file_tokens.tokens.is_empty() {
                    return None;
                }

                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
                // Package the fresh result in the same shape a cache hit yields.
                let entry = TokenCacheEntry {
                    hashed_tokens: hashed,
                    file_tokens,
                    suppressions: suppressions.clone(),
                };
                (entry, suppressions)
            };
            // Applies to cached entries as well as fresh ones.
            if entry.file_tokens.tokens.is_empty() {
                return None;
            }
            // Files shorter than the minimum clone size are dropped.
            if entry.hashed_tokens.len() < config.min_tokens {
                return None;
            }

            Some(TokenizedFile {
                path: file.path.clone(),
                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
                file_tokens: entry.file_tokens,
                metadata: Some(metadata),
                cache_hit,
                suppressions,
            })
        })
        .collect();

    // Persist newly tokenized files. Only files that survived the filters
    // above are present in `file_data`, so only those are inserted.
    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
        for file in &file_data {
            if !file.cache_hit
                && let Some(metadata) = &file.metadata
            {
                cache.insert(
                    &file.path,
                    metadata,
                    token_cache_mode,
                    &file.hashed_tokens,
                    &file.file_tokens,
                    &file.suppressions,
                );
            }
        }
        // NOTE(review): presumably prunes cache entries whose path is not in
        // `files` — confirm against `TokenCache::retain_paths`.
        cache.retain_paths(files);
        // A failed save is logged and otherwise ignored; detection continues.
        match cache.save_if_dirty() {
            Ok(true) => {
                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
            }
            Ok(false) => {
                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
            }
            Err(err) => {
                tracing::warn!("Failed to save duplication token cache: {err}");
            }
        }
    }

    tracing::info!(
        files = file_data.len(),
        "tokenized files for duplication analysis"
    );

    // In focused mode, pre-filter the corpus to candidates for the focus
    // files, but only once the corpus reaches the configured size.
    if let Some(focus_files) = focus_files
        && file_data.len() >= config.min_corpus_size_for_shingle_filter
    {
        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
    }

    // Collect suppressions now: `file_data` is consumed by `into_iter` below.
    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
        .iter()
        .filter(|file| !file.suppressions.is_empty())
        .map(|file| (file.path.clone(), file.suppressions.clone()))
        .collect();

    // Reshape into the tuple form the detector accepts.
    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
        file_data
            .into_iter()
            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
            .collect();

    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
    let mut report = if let Some(focus_files) = focus_files {
        detector.detect_touching_files(detector_data, focus_files)
    } else {
        detector.detect(detector_data)
    };

    // Drop clone instances covered by a line-level suppression.
    if !suppressions_by_file.is_empty() {
        apply_line_suppressions(&mut report, &suppressions_by_file);
    }

    // Runs after line suppressions, so it sees the post-suppression instance
    // counts.
    apply_min_occurrences_filter(&mut report, config.min_occurrences);

    let default_ignore_skips =
        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);

    // Families are derived from the already-filtered clone groups.
    report.clone_families = families::group_into_families(&report.clone_groups, root);

    report.mirrored_directories =
        families::detect_mirrored_directories(&report.clone_families, root);

    report.sort();

    DuplicationRun {
        report,
        default_ignore_skips,
    }
}
403
404fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
419 if min <= 2 {
420 return;
421 }
422 let before = report.clone_groups.len();
423 report
424 .clone_groups
425 .retain(|group| group.instances.len() >= min);
426 let hidden = before - report.clone_groups.len();
427 if hidden == 0 {
428 return;
429 }
430 report.stats.clone_groups_below_min_occurrences = hidden;
431 report.stats.clone_groups = report.clone_groups.len();
432 report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
433}
434
435#[expect(
437 clippy::cast_possible_truncation,
438 reason = "line numbers are bounded by source size"
439)]
440fn apply_line_suppressions(
441 report: &mut DuplicationReport,
442 suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
443) {
444 report.clone_groups.retain_mut(|group| {
445 group.instances.retain(|instance| {
446 if let Some(supps) = suppressions_by_file.get(&instance.file) {
447 for line in instance.start_line..=instance.end_line {
449 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
450 return false;
451 }
452 }
453 }
454 true
455 });
456 group.instances.len() >= 2
458 });
459}
460
461#[must_use]
465pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
466 let resolved = crate::default_config(root);
467 let files = discover::discover_files_with_plugin_scopes(&resolved);
468 find_duplicates(root, &files, config)
469}
470
471fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
473 if !config.ignore_defaults && config.ignore.is_empty() {
474 return None;
475 }
476
477 let mut builder = GlobSetBuilder::new();
478 let mut defaults = Vec::new();
479
480 if config.ignore_defaults {
481 for pattern in DUPES_DEFAULT_IGNORES {
482 match Glob::new(pattern) {
483 Ok(glob) => {
484 defaults.push((*pattern, glob.compile_matcher()));
485 builder.add(glob);
486 }
487 Err(e) => {
488 tracing::warn!("Invalid default duplication ignore pattern '{pattern}': {e}");
489 }
490 }
491 }
492 }
493
494 for pattern in &config.ignore {
495 match Glob::new(pattern) {
496 Ok(glob) => {
497 builder.add(glob);
498 }
499 Err(e) => {
500 tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
501 }
502 }
503 }
504
505 builder.build().ok().map(|all| IgnoreSet { all, defaults })
506}
507
508fn build_default_ignore_skips(
509 ignores: Option<&IgnoreSet>,
510 counts: &[AtomicUsize],
511) -> DefaultIgnoreSkips {
512 let Some(ignores) = ignores else {
513 return DefaultIgnoreSkips::default();
514 };
515
516 let by_pattern = ignores
517 .defaults
518 .iter()
519 .zip(counts)
520 .filter_map(|((pattern, _), count)| {
521 let count = count.load(Ordering::Relaxed);
522 (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
523 })
524 .collect::<Vec<_>>();
525 let total = by_pattern.iter().map(|entry| entry.count).sum();
526
527 DefaultIgnoreSkips { total, by_pattern }
528}
529
// Unit tests for ignore-set construction and the end-to-end duplication
// pipeline. The end-to-end tests write small TypeScript fixtures into a
// temporary directory and run `find_duplicates*` on them.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    /// An empty corpus yields an empty report with zeroed file statistics.
    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    /// With defaults disabled and no user patterns there is no ignore set.
    #[test]
    fn build_ignore_set_empty() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ..DuplicatesConfig::default()
        };
        assert!(build_ignore_set(&config).is_none());
    }

    /// User patterns alone produce a set that matches exactly those patterns.
    #[test]
    fn build_ignore_set_valid_patterns() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match(Path::new("src/foo.test.ts")));
        assert!(set.is_match(Path::new("src/bar.spec.ts")));
        assert!(!set.is_match(Path::new("src/baz.ts")));
    }

    /// With the default config, built-in patterns and user patterns both match.
    #[test]
    fn build_ignore_set_merges_defaults_with_user_patterns() {
        let config = DuplicatesConfig {
            ignore: vec!["**/foo/**".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config).expect("ignore set");
        assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
        assert!(set.is_match(Path::new("src/foo/generated.js")));
    }

    /// Disabling the defaults leaves only the user patterns active.
    #[test]
    fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ignore: vec!["**/foo/**".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config).expect("ignore set");
        assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
        assert!(set.is_match(Path::new("src/foo/generated.js")));
    }

    /// Two byte-identical files on disk must be reported as clones and
    /// grouped into at least one family.
    #[test]
    fn find_duplicates_with_real_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
 const trimmed = input.trim();
 if (trimmed.length === 0) {
 return "";
 }
 const parts = trimmed.split(",");
 const filtered = parts.filter(p => p.length > 0);
 const mapped = filtered.map(p => p.toUpperCase());
 return mapped.join(", ");
}

export function validateInput(data: string): boolean {
 if (data === null || data === undefined) {
 return false;
 }
 const cleaned = data.trim();
 if (cleaned.length < 3) {
 return false;
 }
 return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Lower the thresholds so the small fixture qualifies as a clone.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    /// For a two-file corpus with the default cache threshold, the cached
    /// entry point must still find clones but must not create the cache
    /// directory.
    #[test]
    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = "export function same(input: number): number {\n const doubled = input * 2;\n return doubled + 1;\n}\n";
        let first = src_dir.join("first.ts");
        let second = src_dir.join("second.ts");
        std::fs::write(&first, code).expect("write first");
        std::fs::write(&second, code).expect("write second");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: first,
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: second,
                size_bytes: code.len() as u64,
            },
        ];
        let config = DuplicatesConfig {
            min_tokens: 5,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };
        let cache_root = dir.path().join(".fallow");

        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);

        assert!(!report.clone_groups.is_empty());
        assert!(
            !cache_root.exists(),
            "small projects should avoid token-cache IO overhead"
        );
    }

    /// Focused detection must report exactly the clone groups of the full run
    /// that touch the focused file, and nothing else, even with the shingle
    /// filter forced on.
    #[test]
    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let focused_code = r"
export function focused(input: number): number {
 const doubled = input * 2;
 const shifted = doubled + 10;
 return shifted / 2;
}
";
        let untouched_code = r#"
export function untouched(input: string): string {
 const lowered = input.toLowerCase();
 const padded = lowered.padStart(10, "x");
 return padded.slice(0, 8);
}
"#;

        let changed_path = src_dir.join("changed.ts");
        let focused_copy_path = src_dir.join("focused-copy.ts");
        let untouched_a_path = src_dir.join("untouched-a.ts");
        let untouched_b_path = src_dir.join("untouched-b.ts");
        std::fs::write(&changed_path, focused_code).expect("write changed");
        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: changed_path.clone(),
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: focused_copy_path,
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: untouched_a_path,
                size_bytes: untouched_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(3),
                path: untouched_b_path,
                size_bytes: untouched_code.len() as u64,
            },
        ];

        // A shingle-filter threshold of 1 makes the filter run on this tiny corpus.
        let config = DuplicatesConfig {
            mode: DetectionMode::Strict,
            min_tokens: 5,
            min_lines: 2,
            min_corpus_size_for_shingle_filter: 1,
            ..DuplicatesConfig::default()
        };
        let mut focus = FxHashSet::default();
        focus.insert(changed_path.clone());

        // The unfocused run provides the expected number of groups touching
        // the changed file.
        let full_report = find_duplicates(dir.path(), &files, &config);
        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
        let expected_touching = full_report
            .clone_groups
            .iter()
            .filter(|group| {
                group
                    .instances
                    .iter()
                    .any(|instance| instance.file == changed_path)
            })
            .count();

        assert!(
            !report.clone_groups.is_empty(),
            "focused file should still match an unchanged duplicate"
        );
        assert_eq!(
            report.clone_groups.len(),
            expected_touching,
            "focused shingle filtering must not drop clone groups touching the focused file"
        );
        assert!(report.clone_groups.iter().all(|group| {
            group
                .instances
                .iter()
                .any(|instance| instance.file == changed_path)
        }));
    }

    /// A `fallow-ignore-file code-duplication` comment removes the file from
    /// the analysis, so its only counterpart has nothing left to match.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
 const trimmed = input.trim();
 if (trimmed.length === 0) {
 return "";
 }
 const parts = trimmed.split(",");
 const filtered = parts.filter(p => p.length > 0);
 const mapped = filtered.map(p => p.toUpperCase());
 return mapped.join(", ");
}
"#;
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }

    /// Raising `min_occurrences` to 3 hides the 2-instance group, keeps the
    /// 3-instance group, updates the group/instance statistics and leaves
    /// `duplication_percentage` unchanged.
    #[test]
    fn min_occurrences_hides_pairs_and_records_count() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // `block_a` is written to two files, `block_b` to three.
        let block_a = r#"
export function blockA(input: string): string {
 const trimmed = input.trim();
 if (trimmed.length === 0) {
 return "";
 }
 const parts = trimmed.split(",");
 const filtered = parts.filter(p => p.length > 0);
 const mapped = filtered.map(p => p.toUpperCase());
 return mapped.join(", ");
}
"#;
        let block_b = r"
export function blockB(value: number): number {
 if (value <= 0) {
 return 0;
 }
 let total = 0;
 for (let i = 1; i <= value; i += 1) {
 total += i * 2;
 total -= 1;
 }
 return total + 7;
}
";

        let pair_a1 = src_dir.join("pair-a1.ts");
        let pair_a2 = src_dir.join("pair-a2.ts");
        let triple_b1 = src_dir.join("triple-b1.ts");
        let triple_b2 = src_dir.join("triple-b2.ts");
        let triple_b3 = src_dir.join("triple-b3.ts");
        std::fs::write(&pair_a1, block_a).expect("write");
        std::fs::write(&pair_a2, block_a).expect("write");
        std::fs::write(&triple_b1, block_b).expect("write");
        std::fs::write(&triple_b2, block_b).expect("write");
        std::fs::write(&triple_b3, block_b).expect("write");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: pair_a1,
                size_bytes: block_a.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: pair_a2,
                size_bytes: block_a.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: triple_b1,
                size_bytes: block_b.len() as u64,
            },
            DiscoveredFile {
                id: FileId(3),
                path: triple_b2,
                size_bytes: block_b.len() as u64,
            },
            DiscoveredFile {
                id: FileId(4),
                path: triple_b3,
                size_bytes: block_b.len() as u64,
            },
        ];

        // Baseline run with the default `min_occurrences`.
        let default_config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };
        let baseline = find_duplicates(dir.path(), &files, &default_config);
        assert_eq!(
            baseline.clone_groups.len(),
            2,
            "default minOccurrences should report both the pair and the triple"
        );
        assert_eq!(
            baseline.stats.clone_groups_below_min_occurrences, 0,
            "default minOccurrences hides nothing"
        );
        let baseline_pct = baseline.stats.duplication_percentage;

        // Same corpus with the threshold raised to three occurrences.
        let raised_config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            min_occurrences: 3,
            ..DuplicatesConfig::default()
        };
        let report = find_duplicates(dir.path(), &files, &raised_config);
        assert_eq!(
            report.clone_groups.len(),
            1,
            "minOccurrences=3 should hide the 2-instance group"
        );
        assert_eq!(
            report.clone_groups[0].instances.len(),
            3,
            "surviving group must be the 3-instance group"
        );
        assert_eq!(
            report.stats.clone_groups_below_min_occurrences, 1,
            "the hidden 2-instance group must be counted"
        );
        assert_eq!(
            report.stats.clone_groups, 1,
            "stats.clone_groups must match the post-filter array length"
        );
        assert_eq!(
            report.stats.clone_instances, 3,
            "stats.clone_instances must match the surviving instance total"
        );
        assert!(
            (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
            "duplication_percentage should not shift when minOccurrences changes"
        );
    }

    /// Three copies of one block, one of them in a file-suppressed file:
    /// only two instances remain, so `min_occurrences = 3` must hide the
    /// group and the statistics must report zero groups and instances.
    #[test]
    fn min_occurrences_evaluates_after_line_suppressions() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let block = r#"
export function shared(input: string): string {
 const trimmed = input.trim();
 if (trimmed.length === 0) {
 return "";
 }
 const parts = trimmed.split(",");
 const filtered = parts.filter(p => p.length > 0);
 const mapped = filtered.map(p => p.toUpperCase());
 return mapped.join(", ");
}
"#;
        let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");

        let a = src_dir.join("a.ts");
        let b = src_dir.join("b.ts");
        let c = src_dir.join("c.ts");
        std::fs::write(&a, block).expect("write a");
        std::fs::write(&b, block).expect("write b");
        std::fs::write(&c, &suppressed).expect("write c");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: a,
                size_bytes: block.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: b,
                size_bytes: block.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: c,
                size_bytes: suppressed.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            min_occurrences: 3,
            ..DuplicatesConfig::default()
        };
        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "post-suppression 2-instance group must be hidden by minOccurrences=3, \
             got groups: {:?}",
            report
                .clone_groups
                .iter()
                .map(|g| g.instances.len())
                .collect::<Vec<_>>()
        );
        assert_eq!(
            report.stats.clone_groups, 0,
            "stats.clone_groups must match the empty post-filter array"
        );
        assert_eq!(
            report.stats.clone_instances, 0,
            "stats.clone_instances must match the empty post-filter array"
        );
    }
}