Skip to main content

fallow_core/duplicates/
mod.rs

1//! Code duplication / clone detection module.
2//!
3//! This module implements suffix array + LCP based clone detection
4//! for TypeScript/JavaScript source files. It supports multiple detection
5//! modes from strict (exact matches only) to semantic (structure-aware
6//! matching that ignores identifier names and literal values).
7
8mod cache;
9pub mod detect;
10pub mod families;
11pub mod normalize;
12mod shingle_filter;
13pub mod token_types;
14mod token_visitor;
15pub mod tokenize;
16pub(crate) mod types;
17
18use rustc_hash::FxHashMap;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::{AtomicUsize, Ordering};
21
22use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
23use rayon::prelude::*;
24use rustc_hash::FxHashSet;
25
26use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
27use detect::CloneDetector;
28use normalize::normalize_and_hash_resolved;
29use tokenize::{tokenize_file, tokenize_file_cross_language};
30pub use types::{
31    CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
32    DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
33    RefactoringKind, RefactoringSuggestion,
34};
35
36use crate::discover::{self, DiscoveredFile};
37use crate::suppress::{self, IssueKind, Suppression};
38
/// Built-in duplicates ignores for generated framework and tool output.
///
/// These are engine policy defaults, not config-file defaults: `duplicates.ignore`
/// stays empty in round-tripped configs, while the analyzer merges these patterns
/// unless `duplicates.ignoreDefaults` is set to `false`.
///
/// Each entry is a glob; files matching one of them are skipped before
/// tokenization and counted per pattern so the skip can be reported.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
55
/// Per-file result of the tokenization stage, carried through to detection.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the source file, copied from the discovered file entry.
    path: PathBuf,
    /// Normalized and hashed tokens fed to the clone detector.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Tokenizer output for the file (freshly tokenized or taken from the cache).
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata read before tokenizing; handed to the token cache
    /// when a freshly tokenized file is inserted.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the tokens came from the persistent token cache.
    cache_hit: bool,
    /// Inline suppressions parsed from the file (or restored from the cache).
    suppressions: Vec<Suppression>,
}
65
/// Compiled ignore patterns applied to files before tokenization.
struct IgnoreSet {
    /// Every active pattern (built-in defaults plus user patterns) in one set.
    all: GlobSet,
    /// Built-in patterns paired with their own matcher, so a skip can be
    /// attributed to the specific default pattern that caused it.
    defaults: Vec<(&'static str, GlobMatcher)>,
}
70
71impl IgnoreSet {
72    fn is_match(&self, path: &Path) -> bool {
73        self.all.is_match(path)
74    }
75
76    fn default_match_index(&self, path: &Path) -> Option<usize> {
77        self.defaults
78            .iter()
79            .position(|(_, matcher)| matcher.is_match(path))
80    }
81}
82
/// Everything produced by one duplication run: the report itself plus the
/// sidecar counts of files skipped by built-in ignore patterns.
struct DuplicationRun {
    /// The duplication report returned to callers.
    report: DuplicationReport,
    /// Per-pattern and total counts of files skipped by built-in ignores.
    default_ignore_skips: DefaultIgnoreSkips,
}
87
88/// Run duplication detection on the given files.
89///
90/// This is the main entry point for the duplication analysis. It:
91/// 1. Reads and tokenizes all source files in parallel
92/// 2. Normalizes tokens according to the detection mode
93/// 3. Runs suffix array + LCP clone detection
94/// 4. Groups clone instances into families with refactoring suggestions
95/// 5. Applies inline suppression filters
96pub fn find_duplicates(
97    root: &Path,
98    files: &[DiscoveredFile],
99    config: &DuplicatesConfig,
100) -> DuplicationReport {
101    find_duplicates_inner(root, files, config, None, None).report
102}
103
104/// Run duplication detection and return human-format sidecar metadata for
105/// files skipped by built-in duplicates ignores.
106pub fn find_duplicates_with_default_ignore_skips(
107    root: &Path,
108    files: &[DiscoveredFile],
109    config: &DuplicatesConfig,
110) -> (DuplicationReport, DefaultIgnoreSkips) {
111    let run = find_duplicates_inner(root, files, config, None, None);
112    (run.report, run.default_ignore_skips)
113}
114
115/// Run duplication detection with the persistent token cache enabled.
116pub fn find_duplicates_cached(
117    root: &Path,
118    files: &[DiscoveredFile],
119    config: &DuplicatesConfig,
120    cache_root: &Path,
121) -> DuplicationReport {
122    find_duplicates_inner(root, files, config, None, Some(cache_root)).report
123}
124
125/// Run cached duplication detection and return human-format sidecar metadata for
126/// files skipped by built-in duplicates ignores.
127pub fn find_duplicates_cached_with_default_ignore_skips(
128    root: &Path,
129    files: &[DiscoveredFile],
130    config: &DuplicatesConfig,
131    cache_root: &Path,
132) -> (DuplicationReport, DefaultIgnoreSkips) {
133    let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
134    (run.report, run.default_ignore_skips)
135}
136
137/// Run duplication detection and only return clone groups touching `focus_files`.
138///
139/// This keeps all files in the matching corpus, which preserves changed-file
140/// versus unchanged-file detection for diff-scoped audit runs, but avoids
141/// materializing duplicate groups that cannot appear in the scoped report.
142#[expect(
143    clippy::implicit_hasher,
144    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
145)]
146pub fn find_duplicates_touching_files(
147    root: &Path,
148    files: &[DiscoveredFile],
149    config: &DuplicatesConfig,
150    focus_files: &FxHashSet<PathBuf>,
151) -> DuplicationReport {
152    find_duplicates_inner(root, files, config, Some(focus_files), None).report
153}
154
155/// Run focused duplication detection and return human-format sidecar metadata
156/// for files skipped by built-in duplicates ignores.
157#[expect(
158    clippy::implicit_hasher,
159    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
160)]
161pub fn find_duplicates_touching_files_with_default_ignore_skips(
162    root: &Path,
163    files: &[DiscoveredFile],
164    config: &DuplicatesConfig,
165    focus_files: &FxHashSet<PathBuf>,
166) -> (DuplicationReport, DefaultIgnoreSkips) {
167    let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
168    (run.report, run.default_ignore_skips)
169}
170
171/// Run focused duplication detection with the persistent token cache enabled.
172#[expect(
173    clippy::implicit_hasher,
174    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
175)]
176pub fn find_duplicates_touching_files_cached(
177    root: &Path,
178    files: &[DiscoveredFile],
179    config: &DuplicatesConfig,
180    focus_files: &FxHashSet<PathBuf>,
181    cache_root: &Path,
182) -> DuplicationReport {
183    find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
184}
185
186/// Run cached focused duplication detection and return human-format sidecar
187/// metadata for files skipped by built-in duplicates ignores.
188#[expect(
189    clippy::implicit_hasher,
190    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
191)]
192pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
193    root: &Path,
194    files: &[DiscoveredFile],
195    config: &DuplicatesConfig,
196    focus_files: &FxHashSet<PathBuf>,
197    cache_root: &Path,
198) -> (DuplicationReport, DefaultIgnoreSkips) {
199    let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
200    (run.report, run.default_ignore_skips)
201}
202
203fn find_duplicates_inner(
204    root: &Path,
205    files: &[DiscoveredFile],
206    config: &DuplicatesConfig,
207    focus_files: Option<&FxHashSet<PathBuf>>,
208    cache_root: Option<&Path>,
209) -> DuplicationRun {
210    let _span = tracing::info_span!("find_duplicates").entered();
211
212    let extra_ignores = build_ignore_set(config);
213    let default_skip_counts = extra_ignores
214        .as_ref()
215        .map(|ignores| {
216            std::iter::repeat_with(|| AtomicUsize::new(0))
217                .take(ignores.defaults.len())
218                .collect::<Vec<_>>()
219        })
220        .unwrap_or_default();
221
222    // Resolve normalization: mode defaults + user overrides
223    let normalization =
224        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
225
226    let strip_types = config.cross_language;
227    let skip_imports = config.ignore_imports;
228
229    tracing::debug!(
230        ignore_imports = skip_imports,
231        "duplication tokenization config"
232    );
233
234    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
235    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
236    let token_cache = cache_root.map(TokenCache::load);
237
238    // Step 1 & 2: Tokenize and normalize all files in parallel, also parse suppressions
239    let mut file_data: Vec<TokenizedFile> = files
240        .par_iter()
241        .filter_map(|file| {
242            // Apply extra ignore patterns
243            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
244            if let Some(ref ignores) = extra_ignores {
245                if let Some(index) = ignores.default_match_index(relative) {
246                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
247                    return None;
248                }
249                if ignores.is_match(relative) {
250                    return None;
251                }
252            }
253
254            let metadata = std::fs::metadata(&file.path).ok()?;
255
256            let cached_entry = token_cache
257                .as_ref()
258                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
259            let cache_hit = cached_entry.is_some();
260
261            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
262                let suppressions = entry.suppressions.clone();
263                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
264                    return None;
265                }
266                (entry, suppressions)
267            } else {
268                let source = std::fs::read_to_string(&file.path).ok()?;
269                let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
270                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
271                    return None;
272                }
273
274                // Tokenize (with optional type stripping for cross-language detection)
275                let file_tokens = if strip_types {
276                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
277                } else {
278                    tokenize_file(&file.path, &source, skip_imports)
279                };
280                if file_tokens.tokens.is_empty() {
281                    return None;
282                }
283
284                // Normalize and hash using resolved normalization flags
285                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
286                let entry = TokenCacheEntry {
287                    hashed_tokens: hashed,
288                    file_tokens,
289                    suppressions: suppressions.clone(),
290                };
291                (entry, suppressions)
292            };
293            if entry.file_tokens.tokens.is_empty() {
294                return None;
295            }
296            if entry.hashed_tokens.len() < config.min_tokens {
297                return None;
298            }
299
300            Some(TokenizedFile {
301                path: file.path.clone(),
302                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
303                file_tokens: entry.file_tokens,
304                metadata: Some(metadata),
305                cache_hit,
306                suppressions,
307            })
308        })
309        .collect();
310
311    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
312        for file in &file_data {
313            if !file.cache_hit
314                && let Some(metadata) = &file.metadata
315            {
316                cache.insert(
317                    &file.path,
318                    metadata,
319                    token_cache_mode,
320                    &file.hashed_tokens,
321                    &file.file_tokens,
322                    &file.suppressions,
323                );
324            }
325        }
326        cache.retain_paths(files);
327        match cache.save_if_dirty() {
328            Ok(true) => {
329                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
330            }
331            Ok(false) => {
332                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
333            }
334            Err(err) => {
335                tracing::warn!("Failed to save duplication token cache: {err}");
336            }
337        }
338    }
339
340    tracing::info!(
341        files = file_data.len(),
342        "tokenized files for duplication analysis"
343    );
344
345    if let Some(focus_files) = focus_files
346        && file_data.len() >= config.min_corpus_size_for_shingle_filter
347    {
348        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
349    }
350
351    // Collect per-file suppressions for line-level filtering
352    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
353        .iter()
354        .filter(|file| !file.suppressions.is_empty())
355        .map(|file| (file.path.clone(), file.suppressions.clone()))
356        .collect();
357
358    // Strip suppressions from the data passed to the detector
359    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
360        file_data
361            .into_iter()
362            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
363            .collect();
364
365    // Step 3 & 4: Detect clones
366    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
367    let mut report = if let Some(focus_files) = focus_files {
368        detector.detect_touching_files(detector_data, focus_files)
369    } else {
370        detector.detect(detector_data)
371    };
372
373    // Step 5: Apply line-level suppressions FIRST, so the post-suppression
374    // instance count is what the min-occurrences filter evaluates. Otherwise
375    // a 3-instance clone group whose third instance is line-suppressed would
376    // survive `--min-occurrences 3` and show up as a 2-instance group.
377    if !suppressions_by_file.is_empty() {
378        apply_line_suppressions(&mut report, &suppressions_by_file);
379    }
380
381    // Step 5b: Apply the min-occurrences filter on the post-suppression set.
382    apply_min_occurrences_filter(&mut report, config.min_occurrences);
383
384    let default_ignore_skips =
385        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
386
387    // Step 6: Group into families with refactoring suggestions
388    report.clone_families = families::group_into_families(&report.clone_groups, root);
389
390    // Step 7: Detect mirrored directory trees
391    report.mirrored_directories =
392        families::detect_mirrored_directories(&report.clone_families, root);
393
394    // Sort all result arrays for deterministic output ordering.
395    // Parallel tokenization (par_iter) doesn't guarantee collection order.
396    report.sort();
397
398    DuplicationRun {
399        report,
400        default_ignore_skips,
401    }
402}
403
404/// Drop clone groups with fewer than `min` instances and record the count on
405/// the stats block. The detector already guarantees `>= 2`, so this is a
406/// no-op when `min <= 2`.
407///
408/// Stats split: `clone_groups` and `clone_instances` are recomputed
409/// post-filter so they match the serialized array length (a CI consumer
410/// reading `stats.clone_groups` and iterating `clone_groups[]` sees the same
411/// count). `duplication_percentage`, `duplicated_lines`, `duplicated_tokens`,
412/// and `files_with_clones` stay pre-filter so the percentage math (lines /
413/// total) stays consistent and `threshold` gates / trend lines don't shift
414/// when the filter changes. The hidden count is disclosed in
415/// `clone_groups_below_min_occurrences`. The surviving groups feed every
416/// downstream step (families, mirrored dirs, --top, baseline, changed-since,
417/// workspace scoping) so there's a single source of truth.
418fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
419    if min <= 2 {
420        return;
421    }
422    let before = report.clone_groups.len();
423    report
424        .clone_groups
425        .retain(|group| group.instances.len() >= min);
426    let hidden = before - report.clone_groups.len();
427    if hidden == 0 {
428        return;
429    }
430    report.stats.clone_groups_below_min_occurrences = hidden;
431    report.stats.clone_groups = report.clone_groups.len();
432    report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
433}
434
435/// Filter out clone instances that are suppressed by line-level comments.
436#[expect(
437    clippy::cast_possible_truncation,
438    reason = "line numbers are bounded by source size"
439)]
440fn apply_line_suppressions(
441    report: &mut DuplicationReport,
442    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
443) {
444    report.clone_groups.retain_mut(|group| {
445        group.instances.retain(|instance| {
446            if let Some(supps) = suppressions_by_file.get(&instance.file) {
447                // Check if any line in the instance range is suppressed
448                for line in instance.start_line..=instance.end_line {
449                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
450                        return false;
451                    }
452                }
453            }
454            true
455        });
456        // Keep group only if it still has 2+ instances
457        group.instances.len() >= 2
458    });
459}
460
461/// Run duplication detection on a project directory using auto-discovered files.
462///
463/// This is a convenience function that handles file discovery internally.
464#[must_use]
465pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
466    let resolved = crate::default_config(root);
467    let files = discover::discover_files_with_plugin_scopes(&resolved);
468    find_duplicates(root, &files, config)
469}
470
471/// Build a merged ignore set from built-in and user-provided duplicates ignores.
472fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
473    if !config.ignore_defaults && config.ignore.is_empty() {
474        return None;
475    }
476
477    let mut builder = GlobSetBuilder::new();
478    let mut defaults = Vec::new();
479
480    if config.ignore_defaults {
481        for pattern in DUPES_DEFAULT_IGNORES {
482            let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
483            defaults.push((*pattern, glob.compile_matcher()));
484            builder.add(glob);
485        }
486    }
487
488    // User patterns were validated at config load time
489    // (see FallowConfig::validate_user_globs).
490    for pattern in &config.ignore {
491        builder.add(
492            Glob::new(pattern)
493                .expect("duplicates.ignore pattern was validated at config load time"),
494        );
495    }
496
497    builder.build().ok().map(|all| IgnoreSet { all, defaults })
498}
499
500fn build_default_ignore_skips(
501    ignores: Option<&IgnoreSet>,
502    counts: &[AtomicUsize],
503) -> DefaultIgnoreSkips {
504    let Some(ignores) = ignores else {
505        return DefaultIgnoreSkips::default();
506    };
507
508    let by_pattern = ignores
509        .defaults
510        .iter()
511        .zip(counts)
512        .filter_map(|((pattern, _), count)| {
513            let count = count.load(Ordering::Relaxed);
514            (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
515        })
516        .collect::<Vec<_>>();
517    let total = by_pattern.iter().map(|entry| entry.count).sum();
518
519    DefaultIgnoreSkips { total, by_pattern }
520}
521
522#[cfg(test)]
523mod tests {
524    use super::*;
525    use crate::discover::FileId;
526
527    #[test]
528    fn find_duplicates_empty_files() {
529        let config = DuplicatesConfig::default();
530        let report = find_duplicates(Path::new("/tmp"), &[], &config);
531        assert!(report.clone_groups.is_empty());
532        assert!(report.clone_families.is_empty());
533        assert_eq!(report.stats.total_files, 0);
534    }
535
536    #[test]
537    fn build_ignore_set_empty() {
538        let config = DuplicatesConfig {
539            ignore_defaults: false,
540            ..DuplicatesConfig::default()
541        };
542        assert!(build_ignore_set(&config).is_none());
543    }
544
545    #[test]
546    fn build_ignore_set_valid_patterns() {
547        let config = DuplicatesConfig {
548            ignore_defaults: false,
549            ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
550            ..DuplicatesConfig::default()
551        };
552        let set = build_ignore_set(&config);
553        assert!(set.is_some());
554        let set = set.unwrap();
555        assert!(set.is_match(Path::new("src/foo.test.ts")));
556        assert!(set.is_match(Path::new("src/bar.spec.ts")));
557        assert!(!set.is_match(Path::new("src/baz.ts")));
558    }
559
560    #[test]
561    fn build_ignore_set_merges_defaults_with_user_patterns() {
562        let config = DuplicatesConfig {
563            ignore: vec!["**/foo/**".to_string()],
564            ..DuplicatesConfig::default()
565        };
566        let set = build_ignore_set(&config).expect("ignore set");
567        assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
568        assert!(set.is_match(Path::new("src/foo/generated.js")));
569    }
570
571    #[test]
572    fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
573        let config = DuplicatesConfig {
574            ignore_defaults: false,
575            ignore: vec!["**/foo/**".to_string()],
576            ..DuplicatesConfig::default()
577        };
578        let set = build_ignore_set(&config).expect("ignore set");
579        assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
580        assert!(set.is_match(Path::new("src/foo/generated.js")));
581    }
582
583    #[test]
584    fn find_duplicates_with_real_files() {
585        // Create a temp directory with duplicate files
586        let dir = tempfile::tempdir().expect("create temp dir");
587        let src_dir = dir.path().join("src");
588        std::fs::create_dir_all(&src_dir).expect("create src dir");
589
590        let code = r#"
591export function processData(input: string): string {
592    const trimmed = input.trim();
593    if (trimmed.length === 0) {
594        return "";
595    }
596    const parts = trimmed.split(",");
597    const filtered = parts.filter(p => p.length > 0);
598    const mapped = filtered.map(p => p.toUpperCase());
599    return mapped.join(", ");
600}
601
602export function validateInput(data: string): boolean {
603    if (data === null || data === undefined) {
604        return false;
605    }
606    const cleaned = data.trim();
607    if (cleaned.length < 3) {
608        return false;
609    }
610    return true;
611}
612"#;
613
614        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
615        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
616        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
617            .expect("write package.json");
618
619        let files = vec![
620            DiscoveredFile {
621                id: FileId(0),
622                path: src_dir.join("original.ts"),
623                size_bytes: code.len() as u64,
624            },
625            DiscoveredFile {
626                id: FileId(1),
627                path: src_dir.join("copy.ts"),
628                size_bytes: code.len() as u64,
629            },
630        ];
631
632        let config = DuplicatesConfig {
633            min_tokens: 10,
634            min_lines: 2,
635            ..DuplicatesConfig::default()
636        };
637
638        let report = find_duplicates(dir.path(), &files, &config);
639        assert!(
640            !report.clone_groups.is_empty(),
641            "Should detect clones in identical files"
642        );
643        assert!(report.stats.files_with_clones >= 2);
644
645        // Should also have clone families
646        assert!(
647            !report.clone_families.is_empty(),
648            "Should group clones into families"
649        );
650    }
651
652    #[test]
653    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
654        let dir = tempfile::tempdir().expect("create temp dir");
655        let src_dir = dir.path().join("src");
656        std::fs::create_dir_all(&src_dir).expect("create src dir");
657
658        let code = "export function same(input: number): number {\n  const doubled = input * 2;\n  return doubled + 1;\n}\n";
659        let first = src_dir.join("first.ts");
660        let second = src_dir.join("second.ts");
661        std::fs::write(&first, code).expect("write first");
662        std::fs::write(&second, code).expect("write second");
663
664        let files = vec![
665            DiscoveredFile {
666                id: FileId(0),
667                path: first,
668                size_bytes: code.len() as u64,
669            },
670            DiscoveredFile {
671                id: FileId(1),
672                path: second,
673                size_bytes: code.len() as u64,
674            },
675        ];
676        let config = DuplicatesConfig {
677            min_tokens: 5,
678            min_lines: 2,
679            ..DuplicatesConfig::default()
680        };
681        let cache_root = dir.path().join(".fallow");
682
683        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
684
685        assert!(!report.clone_groups.is_empty());
686        assert!(
687            !cache_root.exists(),
688            "small projects should avoid token-cache IO overhead"
689        );
690    }
691
692    #[test]
693    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
694        let dir = tempfile::tempdir().expect("create temp dir");
695        let src_dir = dir.path().join("src");
696        std::fs::create_dir_all(&src_dir).expect("create src dir");
697
698        let focused_code = r"
699export function focused(input: number): number {
700    const doubled = input * 2;
701    const shifted = doubled + 10;
702    return shifted / 2;
703}
704";
705        let untouched_code = r#"
706export function untouched(input: string): string {
707    const lowered = input.toLowerCase();
708    const padded = lowered.padStart(10, "x");
709    return padded.slice(0, 8);
710}
711"#;
712
713        let changed_path = src_dir.join("changed.ts");
714        let focused_copy_path = src_dir.join("focused-copy.ts");
715        let untouched_a_path = src_dir.join("untouched-a.ts");
716        let untouched_b_path = src_dir.join("untouched-b.ts");
717        std::fs::write(&changed_path, focused_code).expect("write changed");
718        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
719        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
720        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
721
722        let files = vec![
723            DiscoveredFile {
724                id: FileId(0),
725                path: changed_path.clone(),
726                size_bytes: focused_code.len() as u64,
727            },
728            DiscoveredFile {
729                id: FileId(1),
730                path: focused_copy_path,
731                size_bytes: focused_code.len() as u64,
732            },
733            DiscoveredFile {
734                id: FileId(2),
735                path: untouched_a_path,
736                size_bytes: untouched_code.len() as u64,
737            },
738            DiscoveredFile {
739                id: FileId(3),
740                path: untouched_b_path,
741                size_bytes: untouched_code.len() as u64,
742            },
743        ];
744
745        let config = DuplicatesConfig {
746            mode: DetectionMode::Strict,
747            min_tokens: 5,
748            min_lines: 2,
749            min_corpus_size_for_shingle_filter: 1,
750            ..DuplicatesConfig::default()
751        };
752        let mut focus = FxHashSet::default();
753        focus.insert(changed_path.clone());
754
755        let full_report = find_duplicates(dir.path(), &files, &config);
756        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
757        let expected_touching = full_report
758            .clone_groups
759            .iter()
760            .filter(|group| {
761                group
762                    .instances
763                    .iter()
764                    .any(|instance| instance.file == changed_path)
765            })
766            .count();
767
768        assert!(
769            !report.clone_groups.is_empty(),
770            "focused file should still match an unchanged duplicate"
771        );
772        assert_eq!(
773            report.clone_groups.len(),
774            expected_touching,
775            "focused shingle filtering must not drop clone groups touching the focused file"
776        );
777        assert!(report.clone_groups.iter().all(|group| {
778            group
779                .instances
780                .iter()
781                .any(|instance| instance.file == changed_path)
782        }));
783    }
784
785    #[test]
786    fn file_wide_suppression_excludes_file() {
787        let dir = tempfile::tempdir().expect("create temp dir");
788        let src_dir = dir.path().join("src");
789        std::fs::create_dir_all(&src_dir).expect("create src dir");
790
791        let code = r#"
792export function processData(input: string): string {
793    const trimmed = input.trim();
794    if (trimmed.length === 0) {
795        return "";
796    }
797    const parts = trimmed.split(",");
798    const filtered = parts.filter(p => p.length > 0);
799    const mapped = filtered.map(p => p.toUpperCase());
800    return mapped.join(", ");
801}
802"#;
803        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");
804
805        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
806        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
807        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
808            .expect("write package.json");
809
810        let files = vec![
811            DiscoveredFile {
812                id: FileId(0),
813                path: src_dir.join("original.ts"),
814                size_bytes: code.len() as u64,
815            },
816            DiscoveredFile {
817                id: FileId(1),
818                path: src_dir.join("suppressed.ts"),
819                size_bytes: suppressed_code.len() as u64,
820            },
821        ];
822
823        let config = DuplicatesConfig {
824            min_tokens: 10,
825            min_lines: 2,
826            ..DuplicatesConfig::default()
827        };
828
829        let report = find_duplicates(dir.path(), &files, &config);
830        // With only 2 files and one suppressed, there should be no clones
831        assert!(
832            report.clone_groups.is_empty(),
833            "File-wide suppression should exclude file from duplication analysis"
834        );
835    }
836
837    #[test]
838    fn min_occurrences_hides_pairs_and_records_count() {
839        let dir = tempfile::tempdir().expect("create temp dir");
840        let src_dir = dir.path().join("src");
841        std::fs::create_dir_all(&src_dir).expect("create src dir");
842
843        // Block A: only appears in 2 files (a pair).
844        // Block B: appears in 3 files (a triple).
845        let block_a = r#"
846export function blockA(input: string): string {
847    const trimmed = input.trim();
848    if (trimmed.length === 0) {
849        return "";
850    }
851    const parts = trimmed.split(",");
852    const filtered = parts.filter(p => p.length > 0);
853    const mapped = filtered.map(p => p.toUpperCase());
854    return mapped.join(", ");
855}
856"#;
857        let block_b = r"
858export function blockB(value: number): number {
859    if (value <= 0) {
860        return 0;
861    }
862    let total = 0;
863    for (let i = 1; i <= value; i += 1) {
864        total += i * 2;
865        total -= 1;
866    }
867    return total + 7;
868}
869";
870
871        let pair_a1 = src_dir.join("pair-a1.ts");
872        let pair_a2 = src_dir.join("pair-a2.ts");
873        let triple_b1 = src_dir.join("triple-b1.ts");
874        let triple_b2 = src_dir.join("triple-b2.ts");
875        let triple_b3 = src_dir.join("triple-b3.ts");
876        std::fs::write(&pair_a1, block_a).expect("write");
877        std::fs::write(&pair_a2, block_a).expect("write");
878        std::fs::write(&triple_b1, block_b).expect("write");
879        std::fs::write(&triple_b2, block_b).expect("write");
880        std::fs::write(&triple_b3, block_b).expect("write");
881
882        let files = vec![
883            DiscoveredFile {
884                id: FileId(0),
885                path: pair_a1,
886                size_bytes: block_a.len() as u64,
887            },
888            DiscoveredFile {
889                id: FileId(1),
890                path: pair_a2,
891                size_bytes: block_a.len() as u64,
892            },
893            DiscoveredFile {
894                id: FileId(2),
895                path: triple_b1,
896                size_bytes: block_b.len() as u64,
897            },
898            DiscoveredFile {
899                id: FileId(3),
900                path: triple_b2,
901                size_bytes: block_b.len() as u64,
902            },
903            DiscoveredFile {
904                id: FileId(4),
905                path: triple_b3,
906                size_bytes: block_b.len() as u64,
907            },
908        ];
909
910        // Baseline: minOccurrences = 2 (default). Both groups reported.
911        let default_config = DuplicatesConfig {
912            min_tokens: 10,
913            min_lines: 2,
914            ..DuplicatesConfig::default()
915        };
916        let baseline = find_duplicates(dir.path(), &files, &default_config);
917        assert_eq!(
918            baseline.clone_groups.len(),
919            2,
920            "default minOccurrences should report both the pair and the triple"
921        );
922        assert_eq!(
923            baseline.stats.clone_groups_below_min_occurrences, 0,
924            "default minOccurrences hides nothing"
925        );
926        let baseline_pct = baseline.stats.duplication_percentage;
927
928        // Raised: minOccurrences = 3. Only the triple survives.
929        let raised_config = DuplicatesConfig {
930            min_tokens: 10,
931            min_lines: 2,
932            min_occurrences: 3,
933            ..DuplicatesConfig::default()
934        };
935        let report = find_duplicates(dir.path(), &files, &raised_config);
936        assert_eq!(
937            report.clone_groups.len(),
938            1,
939            "minOccurrences=3 should hide the 2-instance group"
940        );
941        assert_eq!(
942            report.clone_groups[0].instances.len(),
943            3,
944            "surviving group must be the 3-instance group"
945        );
946        assert_eq!(
947            report.stats.clone_groups_below_min_occurrences, 1,
948            "the hidden 2-instance group must be counted"
949        );
950        // `clone_groups` and `clone_instances` reflect the post-filter set so
951        // consumers iterating `clone_groups[]` see a matching count.
952        assert_eq!(
953            report.stats.clone_groups, 1,
954            "stats.clone_groups must match the post-filter array length"
955        );
956        assert_eq!(
957            report.stats.clone_instances, 3,
958            "stats.clone_instances must match the surviving instance total"
959        );
960        // `duplication_percentage` stays pre-filter so threshold gates and
961        // trend lines don't shift when minOccurrences changes.
962        assert!(
963            (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
964            "duplication_percentage should not shift when minOccurrences changes"
965        );
966    }
967
968    #[test]
969    fn min_occurrences_evaluates_after_line_suppressions() {
970        // Three files share a clone. The third file suppresses the clone with
971        // an inline comment. After suppression the group has 2 instances.
972        // With minOccurrences=3 the group must be hidden, NOT reported as a
973        // 2-instance clone. The filter evaluates the post-suppression count,
974        // not the pre-suppression detector output.
975        let dir = tempfile::tempdir().expect("create temp dir");
976        let src_dir = dir.path().join("src");
977        std::fs::create_dir_all(&src_dir).expect("create src dir");
978
979        let block = r#"
980export function shared(input: string): string {
981    const trimmed = input.trim();
982    if (trimmed.length === 0) {
983        return "";
984    }
985    const parts = trimmed.split(",");
986    const filtered = parts.filter(p => p.length > 0);
987    const mapped = filtered.map(p => p.toUpperCase());
988    return mapped.join(", ");
989}
990"#;
991        let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");
992
993        let a = src_dir.join("a.ts");
994        let b = src_dir.join("b.ts");
995        let c = src_dir.join("c.ts");
996        std::fs::write(&a, block).expect("write a");
997        std::fs::write(&b, block).expect("write b");
998        std::fs::write(&c, &suppressed).expect("write c");
999
1000        let files = vec![
1001            DiscoveredFile {
1002                id: FileId(0),
1003                path: a,
1004                size_bytes: block.len() as u64,
1005            },
1006            DiscoveredFile {
1007                id: FileId(1),
1008                path: b,
1009                size_bytes: block.len() as u64,
1010            },
1011            DiscoveredFile {
1012                id: FileId(2),
1013                path: c,
1014                size_bytes: suppressed.len() as u64,
1015            },
1016        ];
1017
1018        let config = DuplicatesConfig {
1019            min_tokens: 10,
1020            min_lines: 2,
1021            min_occurrences: 3,
1022            ..DuplicatesConfig::default()
1023        };
1024        let report = find_duplicates(dir.path(), &files, &config);
1025        assert!(
1026            report.clone_groups.is_empty(),
1027            "post-suppression 2-instance group must be hidden by minOccurrences=3, \
1028             got groups: {:?}",
1029            report
1030                .clone_groups
1031                .iter()
1032                .map(|g| g.instances.len())
1033                .collect::<Vec<_>>()
1034        );
1035        assert_eq!(
1036            report.stats.clone_groups, 0,
1037            "stats.clone_groups must match the empty post-filter array"
1038        );
1039        assert_eq!(
1040            report.stats.clone_instances, 0,
1041            "stats.clone_instances must match the empty post-filter array"
1042        );
1043    }
1044}