1mod cache;
9pub mod detect;
10pub mod families;
11pub mod normalize;
12mod shingle_filter;
13pub mod token_types;
14mod token_visitor;
15pub mod tokenize;
16pub(crate) mod types;
17
18use rustc_hash::FxHashMap;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::{AtomicUsize, Ordering};
21
22use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
23use rayon::prelude::*;
24use rustc_hash::FxHashSet;
25
26use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
27use detect::CloneDetector;
28use normalize::normalize_and_hash_resolved;
29use tokenize::{tokenize_file, tokenize_file_cross_language};
30pub use types::{
31 CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
32 DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
33 RefactoringKind, RefactoringSuggestion,
34};
35
36use crate::discover::{self, DiscoveredFile};
37use crate::suppress::{self, IssueKind, Suppression};
38
/// Built-in glob patterns that duplication analysis skips when
/// `DuplicatesConfig::ignore_defaults` is enabled.
///
/// `build_ignore_set` compiles every entry, and files are matched using their path
/// relative to the analysis root. The entries look like framework/bundler output
/// and cache directories (NOTE(review): presumably generated code that would only
/// add noise to clone reports; confirm the intent with the config docs).
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
55
/// Per-file output of the tokenization stage in `find_duplicates_inner`.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the source file, cloned from the `DiscoveredFile` it was built from.
    path: PathBuf,
    /// Normalized and hashed token stream that is handed to the clone detector.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Tokenizer output for the file, either freshly produced or taken from the token cache.
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata read before tokenizing; passed along when the entry is
    /// written back to the token cache.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the tokens came from the token cache, so the entry is not re-inserted.
    cache_hit: bool,
    /// Suppression comments found in the file; used later to drop suppressed clone instances.
    suppressions: Vec<Suppression>,
}
65
/// Compiled ignore patterns used to exclude files from duplication analysis.
struct IgnoreSet {
    /// Combined matcher built from the default patterns (when enabled) and the
    /// user-configured `ignore` patterns.
    all: GlobSet,
    /// One `(pattern, matcher)` pair per enabled default pattern, kept separately so
    /// skips can be attributed to the specific built-in pattern that matched.
    defaults: Vec<(&'static str, GlobMatcher)>,
}
70
71impl IgnoreSet {
72 fn is_match(&self, path: &Path) -> bool {
73 self.all.is_match(path)
74 }
75
76 fn default_match_index(&self, path: &Path) -> Option<usize> {
77 self.defaults
78 .iter()
79 .position(|(_, matcher)| matcher.is_match(path))
80 }
81}
82
/// Full result of one `find_duplicates_inner` run; the public entry points pick
/// the parts they expose.
struct DuplicationRun {
    /// The duplication report produced by the clone detector and post-filters.
    report: DuplicationReport,
    /// How many files were skipped because they matched a built-in default ignore pattern.
    default_ignore_skips: DefaultIgnoreSkips,
}
87
88pub fn find_duplicates(
97 root: &Path,
98 files: &[DiscoveredFile],
99 config: &DuplicatesConfig,
100) -> DuplicationReport {
101 find_duplicates_inner(root, files, config, None, None).report
102}
103
104pub fn find_duplicates_with_default_ignore_skips(
107 root: &Path,
108 files: &[DiscoveredFile],
109 config: &DuplicatesConfig,
110) -> (DuplicationReport, DefaultIgnoreSkips) {
111 let run = find_duplicates_inner(root, files, config, None, None);
112 (run.report, run.default_ignore_skips)
113}
114
115pub fn find_duplicates_cached(
117 root: &Path,
118 files: &[DiscoveredFile],
119 config: &DuplicatesConfig,
120 cache_root: &Path,
121) -> DuplicationReport {
122 find_duplicates_inner(root, files, config, None, Some(cache_root)).report
123}
124
125pub fn find_duplicates_cached_with_default_ignore_skips(
128 root: &Path,
129 files: &[DiscoveredFile],
130 config: &DuplicatesConfig,
131 cache_root: &Path,
132) -> (DuplicationReport, DefaultIgnoreSkips) {
133 let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
134 (run.report, run.default_ignore_skips)
135}
136
137#[expect(
143 clippy::implicit_hasher,
144 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
145)]
146pub fn find_duplicates_touching_files(
147 root: &Path,
148 files: &[DiscoveredFile],
149 config: &DuplicatesConfig,
150 focus_files: &FxHashSet<PathBuf>,
151) -> DuplicationReport {
152 find_duplicates_inner(root, files, config, Some(focus_files), None).report
153}
154
155#[expect(
158 clippy::implicit_hasher,
159 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
160)]
161pub fn find_duplicates_touching_files_with_default_ignore_skips(
162 root: &Path,
163 files: &[DiscoveredFile],
164 config: &DuplicatesConfig,
165 focus_files: &FxHashSet<PathBuf>,
166) -> (DuplicationReport, DefaultIgnoreSkips) {
167 let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
168 (run.report, run.default_ignore_skips)
169}
170
171#[expect(
173 clippy::implicit_hasher,
174 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
175)]
176pub fn find_duplicates_touching_files_cached(
177 root: &Path,
178 files: &[DiscoveredFile],
179 config: &DuplicatesConfig,
180 focus_files: &FxHashSet<PathBuf>,
181 cache_root: &Path,
182) -> DuplicationReport {
183 find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
184}
185
186#[expect(
189 clippy::implicit_hasher,
190 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
191)]
192pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
193 root: &Path,
194 files: &[DiscoveredFile],
195 config: &DuplicatesConfig,
196 focus_files: &FxHashSet<PathBuf>,
197 cache_root: &Path,
198) -> (DuplicationReport, DefaultIgnoreSkips) {
199 let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
200 (run.report, run.default_ignore_skips)
201}
202
/// Shared implementation behind every public `find_duplicates*` entry point.
///
/// Pipeline: build the ignore set, tokenize and hash every non-ignored file in
/// parallel (optionally through the token cache), write cache misses back,
/// optionally pre-filter the corpus for focused runs, run the clone detector, then
/// apply line suppressions and the minimum-occurrence filter before grouping the
/// result into families and mirrored directories.
///
/// * `root` - analysis root; ignore patterns are matched against paths relative to it.
/// * `files` - the discovered corpus to analyse.
/// * `config` - duplication settings (mode, thresholds, ignores, cache/shingle cutoffs).
/// * `focus_files` - when `Some`, only clone groups touching these files are detected.
/// * `cache_root` - when `Some`, location offered to the token cache.
///
/// Returns the report together with the per-pattern default-ignore skip counts.
fn find_duplicates_inner(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    focus_files: Option<&FxHashSet<PathBuf>>,
    cache_root: Option<&Path>,
) -> DuplicationRun {
    let _span = tracing::info_span!("find_duplicates").entered();

    let extra_ignores = build_ignore_set(config);
    // One counter per default pattern. Atomics are used because the counters are
    // incremented from inside the parallel iterator below.
    let default_skip_counts = extra_ignores
        .as_ref()
        .map(|ignores| {
            std::iter::repeat_with(|| AtomicUsize::new(0))
                .take(ignores.defaults.len())
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    let normalization =
        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);

    let strip_types = config.cross_language;
    let skip_imports = config.ignore_imports;

    tracing::debug!(
        ignore_imports = skip_imports,
        "duplication tokenization config"
    );

    // The cache mode captures every setting that influences the token stream.
    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
    // Corpora below the configured size skip the token cache entirely.
    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
    let token_cache = cache_root.map(TokenCache::load);

    let mut file_data: Vec<TokenizedFile> = files
        .par_iter()
        .filter_map(|file| {
            // Patterns are matched against the root-relative path; paths outside
            // `root` fall back to the path as given.
            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
            if let Some(ref ignores) = extra_ignores {
                // Default patterns are checked first so the skip is attributed to
                // the built-in pattern even if a user pattern would also match.
                if let Some(index) = ignores.default_match_index(relative) {
                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
                    return None;
                }
                if ignores.is_match(relative) {
                    return None;
                }
            }

            // Files whose metadata cannot be read are skipped.
            let metadata = std::fs::metadata(&file.path).ok()?;

            let cached_entry = token_cache
                .as_ref()
                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
            let cache_hit = cached_entry.is_some();

            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
                // Cache hit: reuse the stored tokens, but still honour a file-wide suppression.
                let suppressions = entry.suppressions.clone();
                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                    return None;
                }
                (entry, suppressions)
            } else {
                // Cache miss (or no cache): read, check suppressions, tokenize and hash.
                let source = std::fs::read_to_string(&file.path).ok()?;
                let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                    return None;
                }

                let file_tokens = if strip_types {
                    // NOTE(review): the literal `true` presumably enables type stripping
                    // for cross-language comparison; confirm against `tokenize`.
                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
                } else {
                    tokenize_file(&file.path, &source, skip_imports)
                };
                if file_tokens.tokens.is_empty() {
                    return None;
                }

                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
                let entry = TokenCacheEntry {
                    hashed_tokens: hashed,
                    file_tokens,
                    suppressions: suppressions.clone(),
                };
                (entry, suppressions)
            };
            // Also covers cached entries, which did not pass through the check above.
            if entry.file_tokens.tokens.is_empty() {
                return None;
            }
            // Files shorter than the minimum clone size cannot contribute a clone.
            if entry.hashed_tokens.len() < config.min_tokens {
                return None;
            }

            Some(TokenizedFile {
                path: file.path.clone(),
                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
                file_tokens: entry.file_tokens,
                metadata: Some(metadata),
                cache_hit,
                suppressions,
            })
        })
        .collect();

    // Write cache misses back. Only files that survived the filters above are in
    // `file_data`, so ignored, suppressed, empty and too-small files are not inserted.
    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
        for file in &file_data {
            if !file.cache_hit
                && let Some(metadata) = &file.metadata
            {
                cache.insert(
                    &file.path,
                    metadata,
                    token_cache_mode,
                    &file.hashed_tokens,
                    &file.file_tokens,
                    &file.suppressions,
                );
            }
        }
        // NOTE(review): presumably prunes entries for paths no longer in `files`; confirm in `cache`.
        cache.retain_paths(files);
        match cache.save_if_dirty() {
            Ok(true) => {
                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
            }
            Ok(false) => {
                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
            }
            Err(err) => {
                // A failed save is logged and otherwise ignored; analysis continues.
                tracing::warn!("Failed to save duplication token cache: {err}");
            }
        }
    }

    tracing::info!(
        files = file_data.len(),
        "tokenized files for duplication analysis"
    );

    // Focused runs on a large enough corpus pre-filter the files before detection.
    if let Some(focus_files) = focus_files
        && file_data.len() >= config.min_corpus_size_for_shingle_filter
    {
        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
    }

    // Keep suppressions keyed by path; `file_data` is consumed by the detector input below.
    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
        .iter()
        .filter(|file| !file.suppressions.is_empty())
        .map(|file| (file.path.clone(), file.suppressions.clone()))
        .collect();

    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
        file_data
            .into_iter()
            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
            .collect();

    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
    let mut report = if let Some(focus_files) = focus_files {
        detector.detect_touching_files(detector_data, focus_files)
    } else {
        detector.detect(detector_data)
    };

    if !suppressions_by_file.is_empty() {
        apply_line_suppressions(&mut report, &suppressions_by_file);
    }

    // Runs after line suppressions so the threshold sees the surviving instance counts.
    apply_min_occurrences_filter(&mut report, config.min_occurrences);

    let default_ignore_skips =
        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);

    // Families and mirrored directories are derived from the filtered clone groups.
    report.clone_families = families::group_into_families(&report.clone_groups, root);

    report.mirrored_directories =
        families::detect_mirrored_directories(&report.clone_families, root);

    report.sort();

    DuplicationRun {
        report,
        default_ignore_skips,
    }
}
403
404fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
419 if min <= 2 {
420 return;
421 }
422 let before = report.clone_groups.len();
423 report
424 .clone_groups
425 .retain(|group| group.instances.len() >= min);
426 let hidden = before - report.clone_groups.len();
427 if hidden == 0 {
428 return;
429 }
430 report.stats.clone_groups_below_min_occurrences = hidden;
431 report.stats.clone_groups = report.clone_groups.len();
432 report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
433}
434
435#[expect(
437 clippy::cast_possible_truncation,
438 reason = "line numbers are bounded by source size"
439)]
440fn apply_line_suppressions(
441 report: &mut DuplicationReport,
442 suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
443) {
444 report.clone_groups.retain_mut(|group| {
445 group.instances.retain(|instance| {
446 if let Some(supps) = suppressions_by_file.get(&instance.file) {
447 for line in instance.start_line..=instance.end_line {
449 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
450 return false;
451 }
452 }
453 }
454 true
455 });
456 group.instances.len() >= 2
458 });
459}
460
461#[must_use]
465pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
466 let resolved = crate::default_config(root);
467 let files = discover::discover_files_with_plugin_scopes(&resolved);
468 find_duplicates(root, &files, config)
469}
470
471fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
473 if !config.ignore_defaults && config.ignore.is_empty() {
474 return None;
475 }
476
477 let mut builder = GlobSetBuilder::new();
478 let mut defaults = Vec::new();
479
480 if config.ignore_defaults {
481 for pattern in DUPES_DEFAULT_IGNORES {
482 let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
483 defaults.push((*pattern, glob.compile_matcher()));
484 builder.add(glob);
485 }
486 }
487
488 for pattern in &config.ignore {
491 builder.add(
492 Glob::new(pattern)
493 .expect("duplicates.ignore pattern was validated at config load time"),
494 );
495 }
496
497 builder.build().ok().map(|all| IgnoreSet { all, defaults })
498}
499
500fn build_default_ignore_skips(
501 ignores: Option<&IgnoreSet>,
502 counts: &[AtomicUsize],
503) -> DefaultIgnoreSkips {
504 let Some(ignores) = ignores else {
505 return DefaultIgnoreSkips::default();
506 };
507
508 let by_pattern = ignores
509 .defaults
510 .iter()
511 .zip(counts)
512 .filter_map(|((pattern, _), count)| {
513 let count = count.load(Ordering::Relaxed);
514 (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
515 })
516 .collect::<Vec<_>>();
517 let total = by_pattern.iter().map(|entry| entry.count).sum();
518
519 DefaultIgnoreSkips { total, by_pattern }
520}
521
522#[cfg(test)]
523mod tests {
524 use super::*;
525 use crate::discover::FileId;
526
527 #[test]
528 fn find_duplicates_empty_files() {
529 let config = DuplicatesConfig::default();
530 let report = find_duplicates(Path::new("/tmp"), &[], &config);
531 assert!(report.clone_groups.is_empty());
532 assert!(report.clone_families.is_empty());
533 assert_eq!(report.stats.total_files, 0);
534 }
535
536 #[test]
537 fn build_ignore_set_empty() {
538 let config = DuplicatesConfig {
539 ignore_defaults: false,
540 ..DuplicatesConfig::default()
541 };
542 assert!(build_ignore_set(&config).is_none());
543 }
544
545 #[test]
546 fn build_ignore_set_valid_patterns() {
547 let config = DuplicatesConfig {
548 ignore_defaults: false,
549 ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
550 ..DuplicatesConfig::default()
551 };
552 let set = build_ignore_set(&config);
553 assert!(set.is_some());
554 let set = set.unwrap();
555 assert!(set.is_match(Path::new("src/foo.test.ts")));
556 assert!(set.is_match(Path::new("src/bar.spec.ts")));
557 assert!(!set.is_match(Path::new("src/baz.ts")));
558 }
559
560 #[test]
561 fn build_ignore_set_merges_defaults_with_user_patterns() {
562 let config = DuplicatesConfig {
563 ignore: vec!["**/foo/**".to_string()],
564 ..DuplicatesConfig::default()
565 };
566 let set = build_ignore_set(&config).expect("ignore set");
567 assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
568 assert!(set.is_match(Path::new("src/foo/generated.js")));
569 }
570
571 #[test]
572 fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
573 let config = DuplicatesConfig {
574 ignore_defaults: false,
575 ignore: vec!["**/foo/**".to_string()],
576 ..DuplicatesConfig::default()
577 };
578 let set = build_ignore_set(&config).expect("ignore set");
579 assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
580 assert!(set.is_match(Path::new("src/foo/generated.js")));
581 }
582
    /// End-to-end check: two byte-identical TypeScript files on disk must be
    /// reported as clones and grouped into at least one clone family.
    #[test]
    fn find_duplicates_with_real_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // Fixture with two functions, written verbatim to both files below.
        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Thresholds are lowered from the defaults for this small fixture.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }
651
    /// With a two-file corpus and the default cache cutoff, the cached entry point
    /// must still find the clone but must not create the cache directory.
    #[test]
    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = "export function same(input: number): number {\n const doubled = input * 2;\n return doubled + 1;\n}\n";
        let first = src_dir.join("first.ts");
        let second = src_dir.join("second.ts");
        std::fs::write(&first, code).expect("write first");
        std::fs::write(&second, code).expect("write second");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: first,
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: second,
                size_bytes: code.len() as u64,
            },
        ];
        let config = DuplicatesConfig {
            min_tokens: 5,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };
        // The cache root is never created up front, so its existence afterwards
        // would show that the run wrote a cache.
        let cache_root = dir.path().join(".fallow");

        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);

        assert!(!report.clone_groups.is_empty());
        assert!(
            !cache_root.exists(),
            "small projects should avoid token-cache IO overhead"
        );
    }
691
    /// Focused detection must report every clone group that touches the focused
    /// file (including matches against unchanged files) and nothing else, even with
    /// the shingle pre-filter forced on.
    #[test]
    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // Two unrelated fixtures: one duplicated between the focused file and a copy,
        // one duplicated between two files that are not in the focus set.
        let focused_code = r"
export function focused(input: number): number {
    const doubled = input * 2;
    const shifted = doubled + 10;
    return shifted / 2;
}
";
        let untouched_code = r#"
export function untouched(input: string): string {
    const lowered = input.toLowerCase();
    const padded = lowered.padStart(10, "x");
    return padded.slice(0, 8);
}
"#;

        let changed_path = src_dir.join("changed.ts");
        let focused_copy_path = src_dir.join("focused-copy.ts");
        let untouched_a_path = src_dir.join("untouched-a.ts");
        let untouched_b_path = src_dir.join("untouched-b.ts");
        std::fs::write(&changed_path, focused_code).expect("write changed");
        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: changed_path.clone(),
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: focused_copy_path,
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: untouched_a_path,
                size_bytes: untouched_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(3),
                path: untouched_b_path,
                size_bytes: untouched_code.len() as u64,
            },
        ];

        // A cutoff of 1 makes the shingle filter run even on this tiny corpus.
        let config = DuplicatesConfig {
            mode: DetectionMode::Strict,
            min_tokens: 5,
            min_lines: 2,
            min_corpus_size_for_shingle_filter: 1,
            ..DuplicatesConfig::default()
        };
        let mut focus = FxHashSet::default();
        focus.insert(changed_path.clone());

        // The unfocused run is the reference for which groups touch the focused file.
        let full_report = find_duplicates(dir.path(), &files, &config);
        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
        let expected_touching = full_report
            .clone_groups
            .iter()
            .filter(|group| {
                group
                    .instances
                    .iter()
                    .any(|instance| instance.file == changed_path)
            })
            .count();

        assert!(
            !report.clone_groups.is_empty(),
            "focused file should still match an unchanged duplicate"
        );
        assert_eq!(
            report.clone_groups.len(),
            expected_touching,
            "focused shingle filtering must not drop clone groups touching the focused file"
        );
        assert!(report.clone_groups.iter().all(|group| {
            group
                .instances
                .iter()
                .any(|instance| instance.file == changed_path)
        }));
    }
784
    /// A `fallow-ignore-file code-duplication` comment removes the file from the
    /// analysis, so its otherwise identical twin has nothing left to match.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        // Same code, prefixed with the file-wide suppression comment.
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
836
    /// Raising `min_occurrences` to 3 must hide the two-instance group, keep the
    /// three-instance group, record the hidden count, and keep the group/instance
    /// stats in line with the filtered list without changing the duplication percentage.
    #[test]
    fn min_occurrences_hides_pairs_and_records_count() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // `block_a` is written to two files (a pair), `block_b` to three (a triple).
        let block_a = r#"
export function blockA(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        let block_b = r"
export function blockB(value: number): number {
    if (value <= 0) {
        return 0;
    }
    let total = 0;
    for (let i = 1; i <= value; i += 1) {
        total += i * 2;
        total -= 1;
    }
    return total + 7;
}
";

        let pair_a1 = src_dir.join("pair-a1.ts");
        let pair_a2 = src_dir.join("pair-a2.ts");
        let triple_b1 = src_dir.join("triple-b1.ts");
        let triple_b2 = src_dir.join("triple-b2.ts");
        let triple_b3 = src_dir.join("triple-b3.ts");
        std::fs::write(&pair_a1, block_a).expect("write");
        std::fs::write(&pair_a2, block_a).expect("write");
        std::fs::write(&triple_b1, block_b).expect("write");
        std::fs::write(&triple_b2, block_b).expect("write");
        std::fs::write(&triple_b3, block_b).expect("write");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: pair_a1,
                size_bytes: block_a.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: pair_a2,
                size_bytes: block_a.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: triple_b1,
                size_bytes: block_b.len() as u64,
            },
            DiscoveredFile {
                id: FileId(3),
                path: triple_b2,
                size_bytes: block_b.len() as u64,
            },
            DiscoveredFile {
                id: FileId(4),
                path: triple_b3,
                size_bytes: block_b.len() as u64,
            },
        ];

        // Baseline: default `min_occurrences`, so both groups are reported.
        let default_config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };
        let baseline = find_duplicates(dir.path(), &files, &default_config);
        assert_eq!(
            baseline.clone_groups.len(),
            2,
            "default minOccurrences should report both the pair and the triple"
        );
        assert_eq!(
            baseline.stats.clone_groups_below_min_occurrences, 0,
            "default minOccurrences hides nothing"
        );
        let baseline_pct = baseline.stats.duplication_percentage;

        // Raised threshold: only groups with three or more instances remain visible.
        let raised_config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            min_occurrences: 3,
            ..DuplicatesConfig::default()
        };
        let report = find_duplicates(dir.path(), &files, &raised_config);
        assert_eq!(
            report.clone_groups.len(),
            1,
            "minOccurrences=3 should hide the 2-instance group"
        );
        assert_eq!(
            report.clone_groups[0].instances.len(),
            3,
            "surviving group must be the 3-instance group"
        );
        assert_eq!(
            report.stats.clone_groups_below_min_occurrences, 1,
            "the hidden 2-instance group must be counted"
        );
        assert_eq!(
            report.stats.clone_groups, 1,
            "stats.clone_groups must match the post-filter array length"
        );
        assert_eq!(
            report.stats.clone_instances, 3,
            "stats.clone_instances must match the surviving instance total"
        );
        // The filter only hides groups; the percentage is expected to stay unchanged.
        assert!(
            (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
            "duplication_percentage should not shift when minOccurrences changes"
        );
    }
967
    /// Three copies of the same block exist, but one file carries a suppression
    /// comment. With `min_occurrences: 3` the remaining two-instance group must be
    /// hidden and the group/instance stats must read zero.
    #[test]
    fn min_occurrences_evaluates_after_line_suppressions() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let block = r#"
export function shared(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        // The third copy is prefixed with a file-wide code-duplication suppression.
        let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");

        let a = src_dir.join("a.ts");
        let b = src_dir.join("b.ts");
        let c = src_dir.join("c.ts");
        std::fs::write(&a, block).expect("write a");
        std::fs::write(&b, block).expect("write b");
        std::fs::write(&c, &suppressed).expect("write c");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: a,
                size_bytes: block.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: b,
                size_bytes: block.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: c,
                size_bytes: suppressed.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            min_occurrences: 3,
            ..DuplicatesConfig::default()
        };
        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "post-suppression 2-instance group must be hidden by minOccurrences=3, \
             got groups: {:?}",
            report
                .clone_groups
                .iter()
                .map(|g| g.instances.len())
                .collect::<Vec<_>>()
        );
        assert_eq!(
            report.stats.clone_groups, 0,
            "stats.clone_groups must match the empty post-filter array"
        );
        assert_eq!(
            report.stats.clone_instances, 0,
            "stats.clone_instances must match the empty post-filter array"
        );
    }
1044}