1mod cache;
9pub mod deepdive;
10pub mod detect;
11pub mod families;
12pub mod normalize;
13mod shingle_filter;
14pub mod token_types;
15mod token_visitor;
16pub mod tokenize;
17pub(crate) mod types;
18
19use rustc_hash::FxHashMap;
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
24use rayon::prelude::*;
25use rustc_hash::FxHashSet;
26
27use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
28pub use deepdive::{
29 CloneFingerprintKey, CloneFingerprintSet, FINGERPRINT_PREFIX, clone_fingerprint,
30 dominant_identifier, fingerprint_for_fragment, group_refactoring_suggestion,
31};
32use detect::CloneDetector;
33use normalize::normalize_and_hash_resolved;
34use tokenize::{tokenize_file, tokenize_file_cross_language};
35pub use types::{
36 CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
37 DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
38 RefactoringKind, RefactoringSuggestion,
39};
40
41use crate::discover::{self, DiscoveredFile};
42use crate::suppress::{self, IssueKind, Suppression};
43
/// Glob patterns that duplication analysis skips by default.
///
/// Each entry names a directory (such as `.next`, `.cache`, or `out`) anywhere
/// in the tree. `build_ignore_set` compiles every pattern listed here when
/// `DuplicatesConfig::ignore_defaults` is enabled, and files skipped because of
/// one of these patterns are counted per pattern in `DefaultIgnoreSkips`.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
60
/// Per-file result of the tokenization pass, carried forward to token-cache
/// persistence, focus filtering, and clone detection.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the file, copied from `DiscoveredFile::path`.
    path: PathBuf,
    /// Normalized, hashed token stream that clone detection compares.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Tokenizer output for the file (freshly tokenized or loaded from the token cache).
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata captured for the file; passed to the token cache
    /// when a freshly tokenized file is inserted.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the tokens came from the token cache, so the file is not
    /// inserted into the cache again.
    cache_hit: bool,
    /// Suppression comments for the file, later used for line-level filtering.
    suppressions: Vec<Suppression>,
}
70
/// Compiled ignore globs for duplication analysis.
struct IgnoreSet {
    /// Every active pattern (default and user-configured) in one matcher.
    all: GlobSet,
    /// Each active default pattern paired with its own matcher, so a skip can
    /// be attributed to the specific default pattern that caused it.
    defaults: Vec<(&'static str, GlobMatcher)>,
}
75
76impl IgnoreSet {
77 fn is_match(&self, path: &Path) -> bool {
78 self.all.is_match(path)
79 }
80
81 fn default_match_index(&self, path: &Path) -> Option<usize> {
82 self.defaults
83 .iter()
84 .position(|(_, matcher)| matcher.is_match(path))
85 }
86}
87
/// Everything produced by one run of `find_duplicates_inner`.
struct DuplicationRun {
    /// The duplication report returned to callers.
    report: DuplicationReport,
    /// Counts of files skipped because they matched a default ignore pattern.
    default_ignore_skips: DefaultIgnoreSkips,
}
92
93pub fn find_duplicates(
102 root: &Path,
103 files: &[DiscoveredFile],
104 config: &DuplicatesConfig,
105) -> DuplicationReport {
106 find_duplicates_inner(root, files, config, None, None).report
107}
108
109pub fn find_duplicates_with_default_ignore_skips(
112 root: &Path,
113 files: &[DiscoveredFile],
114 config: &DuplicatesConfig,
115) -> (DuplicationReport, DefaultIgnoreSkips) {
116 let run = find_duplicates_inner(root, files, config, None, None);
117 (run.report, run.default_ignore_skips)
118}
119
120pub fn find_duplicates_cached(
122 root: &Path,
123 files: &[DiscoveredFile],
124 config: &DuplicatesConfig,
125 cache_root: &Path,
126) -> DuplicationReport {
127 find_duplicates_inner(root, files, config, None, Some(cache_root)).report
128}
129
130pub fn find_duplicates_cached_with_default_ignore_skips(
133 root: &Path,
134 files: &[DiscoveredFile],
135 config: &DuplicatesConfig,
136 cache_root: &Path,
137) -> (DuplicationReport, DefaultIgnoreSkips) {
138 let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
139 (run.report, run.default_ignore_skips)
140}
141
142#[expect(
148 clippy::implicit_hasher,
149 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
150)]
151pub fn find_duplicates_touching_files(
152 root: &Path,
153 files: &[DiscoveredFile],
154 config: &DuplicatesConfig,
155 focus_files: &FxHashSet<PathBuf>,
156) -> DuplicationReport {
157 find_duplicates_inner(root, files, config, Some(focus_files), None).report
158}
159
160#[expect(
163 clippy::implicit_hasher,
164 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
165)]
166pub fn find_duplicates_touching_files_with_default_ignore_skips(
167 root: &Path,
168 files: &[DiscoveredFile],
169 config: &DuplicatesConfig,
170 focus_files: &FxHashSet<PathBuf>,
171) -> (DuplicationReport, DefaultIgnoreSkips) {
172 let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
173 (run.report, run.default_ignore_skips)
174}
175
176#[expect(
178 clippy::implicit_hasher,
179 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
180)]
181pub fn find_duplicates_touching_files_cached(
182 root: &Path,
183 files: &[DiscoveredFile],
184 config: &DuplicatesConfig,
185 focus_files: &FxHashSet<PathBuf>,
186 cache_root: &Path,
187) -> DuplicationReport {
188 find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
189}
190
191#[expect(
194 clippy::implicit_hasher,
195 reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
196)]
197pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
198 root: &Path,
199 files: &[DiscoveredFile],
200 config: &DuplicatesConfig,
201 focus_files: &FxHashSet<PathBuf>,
202 cache_root: &Path,
203) -> (DuplicationReport, DefaultIgnoreSkips) {
204 let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
205 (run.report, run.default_ignore_skips)
206}
207
208fn find_duplicates_inner(
209 root: &Path,
210 files: &[DiscoveredFile],
211 config: &DuplicatesConfig,
212 focus_files: Option<&FxHashSet<PathBuf>>,
213 cache_root: Option<&Path>,
214) -> DuplicationRun {
215 let _span = tracing::info_span!("find_duplicates").entered();
216
217 let extra_ignores = build_ignore_set(config);
218 let default_skip_counts = extra_ignores
219 .as_ref()
220 .map(|ignores| {
221 std::iter::repeat_with(|| AtomicUsize::new(0))
222 .take(ignores.defaults.len())
223 .collect::<Vec<_>>()
224 })
225 .unwrap_or_default();
226
227 let normalization =
228 fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
229
230 let strip_types = config.cross_language;
231 let skip_imports = config.ignore_imports;
232
233 tracing::debug!(
234 ignore_imports = skip_imports,
235 "duplication tokenization config"
236 );
237
238 let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
239 let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
240 let token_cache = cache_root.map(TokenCache::load);
241
242 let mut file_data: Vec<TokenizedFile> = files
243 .par_iter()
244 .filter_map(|file| {
245 let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
246 if let Some(ref ignores) = extra_ignores {
247 if let Some(index) = ignores.default_match_index(relative) {
248 default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
249 return None;
250 }
251 if ignores.is_match(relative) {
252 return None;
253 }
254 }
255
256 let metadata = std::fs::metadata(&file.path).ok()?;
257
258 let cached_entry = token_cache
259 .as_ref()
260 .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
261 let cache_hit = cached_entry.is_some();
262
263 let (mut entry, suppressions) = if let Some(entry) = cached_entry {
264 let suppressions = entry.suppressions.clone();
265 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
266 return None;
267 }
268 (entry, suppressions)
269 } else {
270 let source = std::fs::read_to_string(&file.path).ok()?;
271 let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
272 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
273 return None;
274 }
275
276 let file_tokens = if strip_types {
277 tokenize_file_cross_language(&file.path, &source, true, skip_imports)
278 } else {
279 tokenize_file(&file.path, &source, skip_imports)
280 };
281 if file_tokens.tokens.is_empty() {
282 return None;
283 }
284
285 let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
286 let entry = TokenCacheEntry {
287 hashed_tokens: hashed,
288 file_tokens,
289 suppressions: suppressions.clone(),
290 };
291 (entry, suppressions)
292 };
293 if entry.file_tokens.tokens.is_empty() {
294 return None;
295 }
296 if entry.hashed_tokens.len() < config.min_tokens {
297 return None;
298 }
299
300 Some(TokenizedFile {
301 path: file.path.clone(),
302 hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
303 file_tokens: entry.file_tokens,
304 metadata: Some(metadata),
305 cache_hit,
306 suppressions,
307 })
308 })
309 .collect();
310
311 if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
312 for file in &file_data {
313 if !file.cache_hit
314 && let Some(metadata) = &file.metadata
315 {
316 cache.insert(
317 &file.path,
318 metadata,
319 token_cache_mode,
320 &file.hashed_tokens,
321 &file.file_tokens,
322 &file.suppressions,
323 );
324 }
325 }
326 cache.retain_paths(files);
327 match cache.save_if_dirty() {
328 Ok(true) => {
329 tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
330 }
331 Ok(false) => {
332 tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
333 }
334 Err(err) => {
335 tracing::warn!("Failed to save duplication token cache: {err}");
336 }
337 }
338 }
339
340 tracing::info!(
341 files = file_data.len(),
342 "tokenized files for duplication analysis"
343 );
344
345 if let Some(focus_files) = focus_files
346 && file_data.len() >= config.min_corpus_size_for_shingle_filter
347 {
348 shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
349 }
350
351 let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
352 .iter()
353 .filter(|file| !file.suppressions.is_empty())
354 .map(|file| (file.path.clone(), file.suppressions.clone()))
355 .collect();
356
357 let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
358 file_data
359 .into_iter()
360 .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
361 .collect();
362
363 let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
364 let mut report = if let Some(focus_files) = focus_files {
365 detector.detect_touching_files(detector_data, focus_files)
366 } else {
367 detector.detect(detector_data)
368 };
369
370 if !suppressions_by_file.is_empty() {
371 apply_line_suppressions(&mut report, &suppressions_by_file);
372 }
373
374 apply_min_occurrences_filter(&mut report, config.min_occurrences);
375
376 let default_ignore_skips =
377 build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
378
379 report.clone_families = families::group_into_families(&report.clone_groups, root);
380
381 report.mirrored_directories =
382 families::detect_mirrored_directories(&report.clone_families, root);
383
384 report.sort();
385
386 DuplicationRun {
387 report,
388 default_ignore_skips,
389 }
390}
391
392fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
407 if min <= 2 {
408 return;
409 }
410 let before = report.clone_groups.len();
411 report
412 .clone_groups
413 .retain(|group| group.instances.len() >= min);
414 let hidden = before - report.clone_groups.len();
415 if hidden == 0 {
416 return;
417 }
418 report.stats.clone_groups_below_min_occurrences = hidden;
419 report.stats.clone_groups = report.clone_groups.len();
420 report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
421}
422
423#[expect(
425 clippy::cast_possible_truncation,
426 reason = "line numbers are bounded by source size"
427)]
428fn apply_line_suppressions(
429 report: &mut DuplicationReport,
430 suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
431) {
432 report.clone_groups.retain_mut(|group| {
433 group.instances.retain(|instance| {
434 if let Some(supps) = suppressions_by_file.get(&instance.file) {
435 for line in instance.start_line..=instance.end_line {
436 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
437 return false;
438 }
439 }
440 }
441 true
442 });
443 group.instances.len() >= 2
444 });
445}
446
447#[must_use]
451pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
452 let resolved = crate::default_config(root);
453 let files = discover::discover_files_with_plugin_scopes(&resolved);
454 find_duplicates(root, &files, config)
455}
456
457#[expect(
459 clippy::expect_used,
460 reason = "duplicate ignore globs are validated before clone detection"
461)]
462fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
463 if !config.ignore_defaults && config.ignore.is_empty() {
464 return None;
465 }
466
467 let mut builder = GlobSetBuilder::new();
468 let mut defaults = Vec::new();
469
470 if config.ignore_defaults {
471 for pattern in DUPES_DEFAULT_IGNORES {
472 let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
473 defaults.push((*pattern, glob.compile_matcher()));
474 builder.add(glob);
475 }
476 }
477
478 for pattern in &config.ignore {
479 builder.add(
480 Glob::new(pattern)
481 .expect("duplicates.ignore pattern was validated at config load time"),
482 );
483 }
484
485 builder.build().ok().map(|all| IgnoreSet { all, defaults })
486}
487
488fn build_default_ignore_skips(
489 ignores: Option<&IgnoreSet>,
490 counts: &[AtomicUsize],
491) -> DefaultIgnoreSkips {
492 let Some(ignores) = ignores else {
493 return DefaultIgnoreSkips::default();
494 };
495
496 let by_pattern = ignores
497 .defaults
498 .iter()
499 .zip(counts)
500 .filter_map(|((pattern, _), count)| {
501 let count = count.load(Ordering::Relaxed);
502 (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
503 })
504 .collect::<Vec<_>>();
505 let total = by_pattern.iter().map(|entry| entry.count).sum();
506
507 DefaultIgnoreSkips { total, by_pattern }
508}
509
510#[cfg(test)]
511mod tests {
512 use super::*;
513 use crate::discover::FileId;
514
515 #[test]
516 fn find_duplicates_empty_files() {
517 let config = DuplicatesConfig::default();
518 let report = find_duplicates(Path::new("/tmp"), &[], &config);
519 assert!(report.clone_groups.is_empty());
520 assert!(report.clone_families.is_empty());
521 assert_eq!(report.stats.total_files, 0);
522 }
523
524 #[test]
525 fn build_ignore_set_empty() {
526 let config = DuplicatesConfig {
527 ignore_defaults: false,
528 ..DuplicatesConfig::default()
529 };
530 assert!(build_ignore_set(&config).is_none());
531 }
532
533 #[test]
534 fn build_ignore_set_valid_patterns() {
535 let config = DuplicatesConfig {
536 ignore_defaults: false,
537 ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
538 ..DuplicatesConfig::default()
539 };
540 let set = build_ignore_set(&config);
541 assert!(set.is_some());
542 let set = set.unwrap();
543 assert!(set.is_match(Path::new("src/foo.test.ts")));
544 assert!(set.is_match(Path::new("src/bar.spec.ts")));
545 assert!(!set.is_match(Path::new("src/baz.ts")));
546 }
547
548 #[test]
549 fn build_ignore_set_merges_defaults_with_user_patterns() {
550 let config = DuplicatesConfig {
551 ignore: vec!["**/foo/**".to_string()],
552 ..DuplicatesConfig::default()
553 };
554 let set = build_ignore_set(&config).expect("ignore set");
555 assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
556 assert!(set.is_match(Path::new("src/foo/generated.js")));
557 }
558
559 #[test]
560 fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
561 let config = DuplicatesConfig {
562 ignore_defaults: false,
563 ignore: vec!["**/foo/**".to_string()],
564 ..DuplicatesConfig::default()
565 };
566 let set = build_ignore_set(&config).expect("ignore set");
567 assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
568 assert!(set.is_match(Path::new("src/foo/generated.js")));
569 }
570
571 #[test]
572 fn find_duplicates_with_real_files() {
573 let dir = tempfile::tempdir().expect("create temp dir");
574 let src_dir = dir.path().join("src");
575 std::fs::create_dir_all(&src_dir).expect("create src dir");
576
577 let code = r#"
578export function processData(input: string): string {
579 const trimmed = input.trim();
580 if (trimmed.length === 0) {
581 return "";
582 }
583 const parts = trimmed.split(",");
584 const filtered = parts.filter(p => p.length > 0);
585 const mapped = filtered.map(p => p.toUpperCase());
586 return mapped.join(", ");
587}
588
589export function validateInput(data: string): boolean {
590 if (data === null || data === undefined) {
591 return false;
592 }
593 const cleaned = data.trim();
594 if (cleaned.length < 3) {
595 return false;
596 }
597 return true;
598}
599"#;
600
601 std::fs::write(src_dir.join("original.ts"), code).expect("write original");
602 std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
603 std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
604 .expect("write package.json");
605
606 let files = vec![
607 DiscoveredFile {
608 id: FileId(0),
609 path: src_dir.join("original.ts"),
610 size_bytes: code.len() as u64,
611 },
612 DiscoveredFile {
613 id: FileId(1),
614 path: src_dir.join("copy.ts"),
615 size_bytes: code.len() as u64,
616 },
617 ];
618
619 let config = DuplicatesConfig {
620 min_tokens: 10,
621 min_lines: 2,
622 ..DuplicatesConfig::default()
623 };
624
625 let report = find_duplicates(dir.path(), &files, &config);
626 assert!(
627 !report.clone_groups.is_empty(),
628 "Should detect clones in identical files"
629 );
630 assert!(report.stats.files_with_clones >= 2);
631
632 assert!(
633 !report.clone_families.is_empty(),
634 "Should group clones into families"
635 );
636 }
637
638 #[test]
639 fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
640 let dir = tempfile::tempdir().expect("create temp dir");
641 let src_dir = dir.path().join("src");
642 std::fs::create_dir_all(&src_dir).expect("create src dir");
643
644 let code = "export function same(input: number): number {\n const doubled = input * 2;\n return doubled + 1;\n}\n";
645 let first = src_dir.join("first.ts");
646 let second = src_dir.join("second.ts");
647 std::fs::write(&first, code).expect("write first");
648 std::fs::write(&second, code).expect("write second");
649
650 let files = vec![
651 DiscoveredFile {
652 id: FileId(0),
653 path: first,
654 size_bytes: code.len() as u64,
655 },
656 DiscoveredFile {
657 id: FileId(1),
658 path: second,
659 size_bytes: code.len() as u64,
660 },
661 ];
662 let config = DuplicatesConfig {
663 min_tokens: 5,
664 min_lines: 2,
665 ..DuplicatesConfig::default()
666 };
667 let cache_root = dir.path().join(".fallow");
668
669 let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
670
671 assert!(!report.clone_groups.is_empty());
672 assert!(
673 !cache_root.exists(),
674 "small projects should avoid token-cache IO overhead"
675 );
676 }
677
678 #[test]
679 fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
680 let dir = tempfile::tempdir().expect("create temp dir");
681 let src_dir = dir.path().join("src");
682 std::fs::create_dir_all(&src_dir).expect("create src dir");
683
684 let focused_code = r"
685export function focused(input: number): number {
686 const doubled = input * 2;
687 const shifted = doubled + 10;
688 return shifted / 2;
689}
690";
691 let untouched_code = r#"
692export function untouched(input: string): string {
693 const lowered = input.toLowerCase();
694 const padded = lowered.padStart(10, "x");
695 return padded.slice(0, 8);
696}
697"#;
698
699 let changed_path = src_dir.join("changed.ts");
700 let focused_copy_path = src_dir.join("focused-copy.ts");
701 let untouched_a_path = src_dir.join("untouched-a.ts");
702 let untouched_b_path = src_dir.join("untouched-b.ts");
703 std::fs::write(&changed_path, focused_code).expect("write changed");
704 std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
705 std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
706 std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
707
708 let files = vec![
709 DiscoveredFile {
710 id: FileId(0),
711 path: changed_path.clone(),
712 size_bytes: focused_code.len() as u64,
713 },
714 DiscoveredFile {
715 id: FileId(1),
716 path: focused_copy_path,
717 size_bytes: focused_code.len() as u64,
718 },
719 DiscoveredFile {
720 id: FileId(2),
721 path: untouched_a_path,
722 size_bytes: untouched_code.len() as u64,
723 },
724 DiscoveredFile {
725 id: FileId(3),
726 path: untouched_b_path,
727 size_bytes: untouched_code.len() as u64,
728 },
729 ];
730
731 let config = DuplicatesConfig {
732 mode: DetectionMode::Strict,
733 min_tokens: 5,
734 min_lines: 2,
735 min_corpus_size_for_shingle_filter: 1,
736 ..DuplicatesConfig::default()
737 };
738 let mut focus = FxHashSet::default();
739 focus.insert(changed_path.clone());
740
741 let full_report = find_duplicates(dir.path(), &files, &config);
742 let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
743 let expected_touching = full_report
744 .clone_groups
745 .iter()
746 .filter(|group| {
747 group
748 .instances
749 .iter()
750 .any(|instance| instance.file == changed_path)
751 })
752 .count();
753
754 assert!(
755 !report.clone_groups.is_empty(),
756 "focused file should still match an unchanged duplicate"
757 );
758 assert_eq!(
759 report.clone_groups.len(),
760 expected_touching,
761 "focused shingle filtering must not drop clone groups touching the focused file"
762 );
763 assert!(report.clone_groups.iter().all(|group| {
764 group
765 .instances
766 .iter()
767 .any(|instance| instance.file == changed_path)
768 }));
769 }
770
771 #[test]
772 fn file_wide_suppression_excludes_file() {
773 let dir = tempfile::tempdir().expect("create temp dir");
774 let src_dir = dir.path().join("src");
775 std::fs::create_dir_all(&src_dir).expect("create src dir");
776
777 let code = r#"
778export function processData(input: string): string {
779 const trimmed = input.trim();
780 if (trimmed.length === 0) {
781 return "";
782 }
783 const parts = trimmed.split(",");
784 const filtered = parts.filter(p => p.length > 0);
785 const mapped = filtered.map(p => p.toUpperCase());
786 return mapped.join(", ");
787}
788"#;
789 let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");
790
791 std::fs::write(src_dir.join("original.ts"), code).expect("write original");
792 std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
793 std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
794 .expect("write package.json");
795
796 let files = vec![
797 DiscoveredFile {
798 id: FileId(0),
799 path: src_dir.join("original.ts"),
800 size_bytes: code.len() as u64,
801 },
802 DiscoveredFile {
803 id: FileId(1),
804 path: src_dir.join("suppressed.ts"),
805 size_bytes: suppressed_code.len() as u64,
806 },
807 ];
808
809 let config = DuplicatesConfig {
810 min_tokens: 10,
811 min_lines: 2,
812 ..DuplicatesConfig::default()
813 };
814
815 let report = find_duplicates(dir.path(), &files, &config);
816 assert!(
817 report.clone_groups.is_empty(),
818 "File-wide suppression should exclude file from duplication analysis"
819 );
820 }
821
822 #[test]
823 fn min_occurrences_hides_pairs_and_records_count() {
824 let dir = tempfile::tempdir().expect("create temp dir");
825 let src_dir = dir.path().join("src");
826 std::fs::create_dir_all(&src_dir).expect("create src dir");
827
828 let block_a = r#"
829export function blockA(input: string): string {
830 const trimmed = input.trim();
831 if (trimmed.length === 0) {
832 return "";
833 }
834 const parts = trimmed.split(",");
835 const filtered = parts.filter(p => p.length > 0);
836 const mapped = filtered.map(p => p.toUpperCase());
837 return mapped.join(", ");
838}
839"#;
840 let block_b = r"
841export function blockB(value: number): number {
842 if (value <= 0) {
843 return 0;
844 }
845 let total = 0;
846 for (let i = 1; i <= value; i += 1) {
847 total += i * 2;
848 total -= 1;
849 }
850 return total + 7;
851}
852";
853
854 let pair_a1 = src_dir.join("pair-a1.ts");
855 let pair_a2 = src_dir.join("pair-a2.ts");
856 let triple_b1 = src_dir.join("triple-b1.ts");
857 let triple_b2 = src_dir.join("triple-b2.ts");
858 let triple_b3 = src_dir.join("triple-b3.ts");
859 std::fs::write(&pair_a1, block_a).expect("write");
860 std::fs::write(&pair_a2, block_a).expect("write");
861 std::fs::write(&triple_b1, block_b).expect("write");
862 std::fs::write(&triple_b2, block_b).expect("write");
863 std::fs::write(&triple_b3, block_b).expect("write");
864
865 let files = vec![
866 DiscoveredFile {
867 id: FileId(0),
868 path: pair_a1,
869 size_bytes: block_a.len() as u64,
870 },
871 DiscoveredFile {
872 id: FileId(1),
873 path: pair_a2,
874 size_bytes: block_a.len() as u64,
875 },
876 DiscoveredFile {
877 id: FileId(2),
878 path: triple_b1,
879 size_bytes: block_b.len() as u64,
880 },
881 DiscoveredFile {
882 id: FileId(3),
883 path: triple_b2,
884 size_bytes: block_b.len() as u64,
885 },
886 DiscoveredFile {
887 id: FileId(4),
888 path: triple_b3,
889 size_bytes: block_b.len() as u64,
890 },
891 ];
892
893 let default_config = DuplicatesConfig {
894 min_tokens: 10,
895 min_lines: 2,
896 ..DuplicatesConfig::default()
897 };
898 let baseline = find_duplicates(dir.path(), &files, &default_config);
899 assert_eq!(
900 baseline.clone_groups.len(),
901 2,
902 "default minOccurrences should report both the pair and the triple"
903 );
904 assert_eq!(
905 baseline.stats.clone_groups_below_min_occurrences, 0,
906 "default minOccurrences hides nothing"
907 );
908 let baseline_pct = baseline.stats.duplication_percentage;
909
910 let raised_config = DuplicatesConfig {
911 min_tokens: 10,
912 min_lines: 2,
913 min_occurrences: 3,
914 ..DuplicatesConfig::default()
915 };
916 let report = find_duplicates(dir.path(), &files, &raised_config);
917 assert_eq!(
918 report.clone_groups.len(),
919 1,
920 "minOccurrences=3 should hide the 2-instance group"
921 );
922 assert_eq!(
923 report.clone_groups[0].instances.len(),
924 3,
925 "surviving group must be the 3-instance group"
926 );
927 assert_eq!(
928 report.stats.clone_groups_below_min_occurrences, 1,
929 "the hidden 2-instance group must be counted"
930 );
931 assert_eq!(
932 report.stats.clone_groups, 1,
933 "stats.clone_groups must match the post-filter array length"
934 );
935 assert_eq!(
936 report.stats.clone_instances, 3,
937 "stats.clone_instances must match the surviving instance total"
938 );
939 assert!(
940 (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
941 "duplication_percentage should not shift when minOccurrences changes"
942 );
943 }
944
945 #[test]
946 fn min_occurrences_evaluates_after_line_suppressions() {
947 let dir = tempfile::tempdir().expect("create temp dir");
948 let src_dir = dir.path().join("src");
949 std::fs::create_dir_all(&src_dir).expect("create src dir");
950
951 let block = r#"
952export function shared(input: string): string {
953 const trimmed = input.trim();
954 if (trimmed.length === 0) {
955 return "";
956 }
957 const parts = trimmed.split(",");
958 const filtered = parts.filter(p => p.length > 0);
959 const mapped = filtered.map(p => p.toUpperCase());
960 return mapped.join(", ");
961}
962"#;
963 let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");
964
965 let a = src_dir.join("a.ts");
966 let b = src_dir.join("b.ts");
967 let c = src_dir.join("c.ts");
968 std::fs::write(&a, block).expect("write a");
969 std::fs::write(&b, block).expect("write b");
970 std::fs::write(&c, &suppressed).expect("write c");
971
972 let files = vec![
973 DiscoveredFile {
974 id: FileId(0),
975 path: a,
976 size_bytes: block.len() as u64,
977 },
978 DiscoveredFile {
979 id: FileId(1),
980 path: b,
981 size_bytes: block.len() as u64,
982 },
983 DiscoveredFile {
984 id: FileId(2),
985 path: c,
986 size_bytes: suppressed.len() as u64,
987 },
988 ];
989
990 let config = DuplicatesConfig {
991 min_tokens: 10,
992 min_lines: 2,
993 min_occurrences: 3,
994 ..DuplicatesConfig::default()
995 };
996 let report = find_duplicates(dir.path(), &files, &config);
997 assert!(
998 report.clone_groups.is_empty(),
999 "post-suppression 2-instance group must be hidden by minOccurrences=3, \
1000 got groups: {:?}",
1001 report
1002 .clone_groups
1003 .iter()
1004 .map(|g| g.instances.len())
1005 .collect::<Vec<_>>()
1006 );
1007 assert_eq!(
1008 report.stats.clone_groups, 0,
1009 "stats.clone_groups must match the empty post-filter array"
1010 );
1011 assert_eq!(
1012 report.stats.clone_instances, 0,
1013 "stats.clone_instances must match the empty post-filter array"
1014 );
1015 }
1016}