// fallow_core/duplicates/mod.rs

//! Code duplication / clone detection module.
//!
//! This module implements suffix array + LCP based clone detection
//! for TypeScript/JavaScript source files. It supports multiple detection
//! modes from strict (exact matches only) to semantic (structure-aware
//! matching that ignores identifier names and literal values).
7
8mod cache;
9pub mod detect;
10pub mod families;
11pub mod normalize;
12mod shingle_filter;
13pub mod token_types;
14mod token_visitor;
15pub mod tokenize;
16pub(crate) mod types;
17
18use rustc_hash::FxHashMap;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::{AtomicUsize, Ordering};
21
22use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
23use rayon::prelude::*;
24use rustc_hash::FxHashSet;
25
26use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
27use detect::CloneDetector;
28use normalize::normalize_and_hash_resolved;
29use tokenize::{tokenize_file, tokenize_file_cross_language};
30pub use types::{
31    CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
32    DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
33    RefactoringKind, RefactoringSuggestion,
34};
35
36use crate::discover::{self, DiscoveredFile};
37use crate::suppress::{self, IssueKind, Suppression};
38
/// Built-in duplicates ignores for generated framework and tool output.
///
/// These are engine policy defaults, not config-file defaults: `duplicates.ignore`
/// stays empty in round-tripped configs, while the analyzer merges these patterns
/// unless `duplicates.ignoreDefaults` is set to `false`.
///
/// Order matters: a skipped file is attributed to the first pattern in this
/// list that matches it, and the position of a pattern is the index used for
/// its skip counter.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
55
56#[derive(Clone)]
57pub(super) struct TokenizedFile {
58    path: PathBuf,
59    hashed_tokens: Vec<normalize::HashedToken>,
60    file_tokens: tokenize::FileTokens,
61    metadata: Option<std::fs::Metadata>,
62    cache_hit: bool,
63    suppressions: Vec<Suppression>,
64}
65
66struct IgnoreSet {
67    all: GlobSet,
68    defaults: Vec<(&'static str, GlobMatcher)>,
69}
70
71impl IgnoreSet {
72    fn is_match(&self, path: &Path) -> bool {
73        self.all.is_match(path)
74    }
75
76    fn default_match_index(&self, path: &Path) -> Option<usize> {
77        self.defaults
78            .iter()
79            .position(|(_, matcher)| matcher.is_match(path))
80    }
81}
82
83struct DuplicationRun {
84    report: DuplicationReport,
85    default_ignore_skips: DefaultIgnoreSkips,
86}
87
88/// Run duplication detection on the given files.
89///
90/// This is the main entry point for the duplication analysis. It:
91/// 1. Reads and tokenizes all source files in parallel
92/// 2. Normalizes tokens according to the detection mode
93/// 3. Runs suffix array + LCP clone detection
94/// 4. Groups clone instances into families with refactoring suggestions
95/// 5. Applies inline suppression filters
96pub fn find_duplicates(
97    root: &Path,
98    files: &[DiscoveredFile],
99    config: &DuplicatesConfig,
100) -> DuplicationReport {
101    find_duplicates_inner(root, files, config, None, None).report
102}
103
104/// Run duplication detection and return human-format sidecar metadata for
105/// files skipped by built-in duplicates ignores.
106pub fn find_duplicates_with_default_ignore_skips(
107    root: &Path,
108    files: &[DiscoveredFile],
109    config: &DuplicatesConfig,
110) -> (DuplicationReport, DefaultIgnoreSkips) {
111    let run = find_duplicates_inner(root, files, config, None, None);
112    (run.report, run.default_ignore_skips)
113}
114
115/// Run duplication detection with the persistent token cache enabled.
116pub fn find_duplicates_cached(
117    root: &Path,
118    files: &[DiscoveredFile],
119    config: &DuplicatesConfig,
120    cache_root: &Path,
121) -> DuplicationReport {
122    find_duplicates_inner(root, files, config, None, Some(cache_root)).report
123}
124
125/// Run cached duplication detection and return human-format sidecar metadata for
126/// files skipped by built-in duplicates ignores.
127pub fn find_duplicates_cached_with_default_ignore_skips(
128    root: &Path,
129    files: &[DiscoveredFile],
130    config: &DuplicatesConfig,
131    cache_root: &Path,
132) -> (DuplicationReport, DefaultIgnoreSkips) {
133    let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
134    (run.report, run.default_ignore_skips)
135}
136
137/// Run duplication detection and only return clone groups touching `focus_files`.
138///
139/// This keeps all files in the matching corpus, which preserves changed-file
140/// versus unchanged-file detection for diff-scoped audit runs, but avoids
141/// materializing duplicate groups that cannot appear in the scoped report.
142#[expect(
143    clippy::implicit_hasher,
144    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
145)]
146pub fn find_duplicates_touching_files(
147    root: &Path,
148    files: &[DiscoveredFile],
149    config: &DuplicatesConfig,
150    focus_files: &FxHashSet<PathBuf>,
151) -> DuplicationReport {
152    find_duplicates_inner(root, files, config, Some(focus_files), None).report
153}
154
155/// Run focused duplication detection and return human-format sidecar metadata
156/// for files skipped by built-in duplicates ignores.
157#[expect(
158    clippy::implicit_hasher,
159    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
160)]
161pub fn find_duplicates_touching_files_with_default_ignore_skips(
162    root: &Path,
163    files: &[DiscoveredFile],
164    config: &DuplicatesConfig,
165    focus_files: &FxHashSet<PathBuf>,
166) -> (DuplicationReport, DefaultIgnoreSkips) {
167    let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
168    (run.report, run.default_ignore_skips)
169}
170
171/// Run focused duplication detection with the persistent token cache enabled.
172#[expect(
173    clippy::implicit_hasher,
174    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
175)]
176pub fn find_duplicates_touching_files_cached(
177    root: &Path,
178    files: &[DiscoveredFile],
179    config: &DuplicatesConfig,
180    focus_files: &FxHashSet<PathBuf>,
181    cache_root: &Path,
182) -> DuplicationReport {
183    find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
184}
185
186/// Run cached focused duplication detection and return human-format sidecar
187/// metadata for files skipped by built-in duplicates ignores.
188#[expect(
189    clippy::implicit_hasher,
190    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
191)]
192pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
193    root: &Path,
194    files: &[DiscoveredFile],
195    config: &DuplicatesConfig,
196    focus_files: &FxHashSet<PathBuf>,
197    cache_root: &Path,
198) -> (DuplicationReport, DefaultIgnoreSkips) {
199    let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
200    (run.report, run.default_ignore_skips)
201}
202
203fn find_duplicates_inner(
204    root: &Path,
205    files: &[DiscoveredFile],
206    config: &DuplicatesConfig,
207    focus_files: Option<&FxHashSet<PathBuf>>,
208    cache_root: Option<&Path>,
209) -> DuplicationRun {
210    let _span = tracing::info_span!("find_duplicates").entered();
211
212    let extra_ignores = build_ignore_set(config);
213    let default_skip_counts = extra_ignores
214        .as_ref()
215        .map(|ignores| {
216            std::iter::repeat_with(|| AtomicUsize::new(0))
217                .take(ignores.defaults.len())
218                .collect::<Vec<_>>()
219        })
220        .unwrap_or_default();
221
222    // Resolve normalization: mode defaults + user overrides
223    let normalization =
224        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
225
226    let strip_types = config.cross_language;
227    let skip_imports = config.ignore_imports;
228
229    tracing::debug!(
230        ignore_imports = skip_imports,
231        "duplication tokenization config"
232    );
233
234    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
235    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
236    let token_cache = cache_root.map(TokenCache::load);
237
238    // Step 1 & 2: Tokenize and normalize all files in parallel, also parse suppressions
239    let mut file_data: Vec<TokenizedFile> = files
240        .par_iter()
241        .filter_map(|file| {
242            // Apply extra ignore patterns
243            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
244            if let Some(ref ignores) = extra_ignores {
245                if let Some(index) = ignores.default_match_index(relative) {
246                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
247                    return None;
248                }
249                if ignores.is_match(relative) {
250                    return None;
251                }
252            }
253
254            let metadata = std::fs::metadata(&file.path).ok()?;
255
256            let cached_entry = token_cache
257                .as_ref()
258                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
259            let cache_hit = cached_entry.is_some();
260
261            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
262                let suppressions = entry.suppressions.clone();
263                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
264                    return None;
265                }
266                (entry, suppressions)
267            } else {
268                let source = std::fs::read_to_string(&file.path).ok()?;
269                let suppressions = suppress::parse_suppressions_from_source(&source);
270                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
271                    return None;
272                }
273
274                // Tokenize (with optional type stripping for cross-language detection)
275                let file_tokens = if strip_types {
276                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
277                } else {
278                    tokenize_file(&file.path, &source, skip_imports)
279                };
280                if file_tokens.tokens.is_empty() {
281                    return None;
282                }
283
284                // Normalize and hash using resolved normalization flags
285                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
286                let entry = TokenCacheEntry {
287                    hashed_tokens: hashed,
288                    file_tokens,
289                    suppressions: suppressions.clone(),
290                };
291                (entry, suppressions)
292            };
293            if entry.file_tokens.tokens.is_empty() {
294                return None;
295            }
296            if entry.hashed_tokens.len() < config.min_tokens {
297                return None;
298            }
299
300            Some(TokenizedFile {
301                path: file.path.clone(),
302                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
303                file_tokens: entry.file_tokens,
304                metadata: Some(metadata),
305                cache_hit,
306                suppressions,
307            })
308        })
309        .collect();
310
311    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
312        for file in &file_data {
313            if !file.cache_hit
314                && let Some(metadata) = &file.metadata
315            {
316                cache.insert(
317                    &file.path,
318                    metadata,
319                    token_cache_mode,
320                    &file.hashed_tokens,
321                    &file.file_tokens,
322                    &file.suppressions,
323                );
324            }
325        }
326        cache.retain_paths(files);
327        match cache.save_if_dirty() {
328            Ok(true) => {
329                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
330            }
331            Ok(false) => {
332                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
333            }
334            Err(err) => {
335                tracing::warn!("Failed to save duplication token cache: {err}");
336            }
337        }
338    }
339
340    tracing::info!(
341        files = file_data.len(),
342        "tokenized files for duplication analysis"
343    );
344
345    if let Some(focus_files) = focus_files
346        && file_data.len() >= config.min_corpus_size_for_shingle_filter
347    {
348        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
349    }
350
351    // Collect per-file suppressions for line-level filtering
352    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
353        .iter()
354        .filter(|file| !file.suppressions.is_empty())
355        .map(|file| (file.path.clone(), file.suppressions.clone()))
356        .collect();
357
358    // Strip suppressions from the data passed to the detector
359    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
360        file_data
361            .into_iter()
362            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
363            .collect();
364
365    // Step 3 & 4: Detect clones
366    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
367    let mut report = if let Some(focus_files) = focus_files {
368        detector.detect_touching_files(detector_data, focus_files)
369    } else {
370        detector.detect(detector_data)
371    };
372
373    // Step 5: Apply line-level suppressions
374    if !suppressions_by_file.is_empty() {
375        apply_line_suppressions(&mut report, &suppressions_by_file);
376    }
377
378    let default_ignore_skips =
379        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
380
381    // Step 6: Group into families with refactoring suggestions
382    report.clone_families = families::group_into_families(&report.clone_groups, root);
383
384    // Step 7: Detect mirrored directory trees
385    report.mirrored_directories =
386        families::detect_mirrored_directories(&report.clone_families, root);
387
388    // Sort all result arrays for deterministic output ordering.
389    // Parallel tokenization (par_iter) doesn't guarantee collection order.
390    report.sort();
391
392    DuplicationRun {
393        report,
394        default_ignore_skips,
395    }
396}
397
398/// Filter out clone instances that are suppressed by line-level comments.
399#[expect(
400    clippy::cast_possible_truncation,
401    reason = "line numbers are bounded by source size"
402)]
403fn apply_line_suppressions(
404    report: &mut DuplicationReport,
405    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
406) {
407    report.clone_groups.retain_mut(|group| {
408        group.instances.retain(|instance| {
409            if let Some(supps) = suppressions_by_file.get(&instance.file) {
410                // Check if any line in the instance range is suppressed
411                for line in instance.start_line..=instance.end_line {
412                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
413                        return false;
414                    }
415                }
416            }
417            true
418        });
419        // Keep group only if it still has 2+ instances
420        group.instances.len() >= 2
421    });
422}
423
424/// Run duplication detection on a project directory using auto-discovered files.
425///
426/// This is a convenience function that handles file discovery internally.
427#[must_use]
428pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
429    let resolved = crate::default_config(root);
430    let files = discover::discover_files_with_plugin_scopes(&resolved);
431    find_duplicates(root, &files, config)
432}
433
434/// Build a merged ignore set from built-in and user-provided duplicates ignores.
435fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
436    if !config.ignore_defaults && config.ignore.is_empty() {
437        return None;
438    }
439
440    let mut builder = GlobSetBuilder::new();
441    let mut defaults = Vec::new();
442
443    if config.ignore_defaults {
444        for pattern in DUPES_DEFAULT_IGNORES {
445            match Glob::new(pattern) {
446                Ok(glob) => {
447                    defaults.push((*pattern, glob.compile_matcher()));
448                    builder.add(glob);
449                }
450                Err(e) => {
451                    tracing::warn!("Invalid default duplication ignore pattern '{pattern}': {e}");
452                }
453            }
454        }
455    }
456
457    for pattern in &config.ignore {
458        match Glob::new(pattern) {
459            Ok(glob) => {
460                builder.add(glob);
461            }
462            Err(e) => {
463                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
464            }
465        }
466    }
467
468    builder.build().ok().map(|all| IgnoreSet { all, defaults })
469}
470
471fn build_default_ignore_skips(
472    ignores: Option<&IgnoreSet>,
473    counts: &[AtomicUsize],
474) -> DefaultIgnoreSkips {
475    let Some(ignores) = ignores else {
476        return DefaultIgnoreSkips::default();
477    };
478
479    let by_pattern = ignores
480        .defaults
481        .iter()
482        .zip(counts)
483        .filter_map(|((pattern, _), count)| {
484            let count = count.load(Ordering::Relaxed);
485            (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
486        })
487        .collect::<Vec<_>>();
488    let total = by_pattern.iter().map(|entry| entry.count).sum();
489
490    DefaultIgnoreSkips { total, by_pattern }
491}
492
#[cfg(test)]
mod tests {
    //! Unit tests for ignore-set construction and end-to-end detection on
    //! temporary on-disk fixtures.

    use super::*;
    use crate::discover::FileId;

    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn build_ignore_set_empty() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ..DuplicatesConfig::default()
        };
        assert!(build_ignore_set(&config).is_none());
    }

    #[test]
    fn build_ignore_set_valid_patterns() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match(Path::new("src/foo.test.ts")));
        assert!(set.is_match(Path::new("src/bar.spec.ts")));
        assert!(!set.is_match(Path::new("src/baz.ts")));
    }

    #[test]
    fn build_ignore_set_merges_defaults_with_user_patterns() {
        let config = DuplicatesConfig {
            ignore: vec!["**/foo/**".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config).expect("ignore set");
        assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
        assert!(set.is_match(Path::new("src/foo/generated.js")));
    }

    #[test]
    fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ignore: vec!["**/foo/**".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config).expect("ignore set");
        assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
        assert!(set.is_match(Path::new("src/foo/generated.js")));
    }

    #[test]
    fn find_duplicates_with_real_files() {
        // Create a temp directory with duplicate files
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        // Should also have clone families
        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    #[test]
    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = "export function same(input: number): number {\n  const doubled = input * 2;\n  return doubled + 1;\n}\n";
        let first = src_dir.join("first.ts");
        let second = src_dir.join("second.ts");
        std::fs::write(&first, code).expect("write first");
        std::fs::write(&second, code).expect("write second");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: first,
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: second,
                size_bytes: code.len() as u64,
            },
        ];
        let config = DuplicatesConfig {
            min_tokens: 5,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };
        let cache_root = dir.path().join(".fallow");

        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);

        assert!(!report.clone_groups.is_empty());
        // The cache directory must never be created for a corpus this small.
        assert!(
            !cache_root.exists(),
            "small projects should avoid token-cache IO overhead"
        );
    }

    #[test]
    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let focused_code = r"
export function focused(input: number): number {
    const doubled = input * 2;
    const shifted = doubled + 10;
    return shifted / 2;
}
";
        let untouched_code = r#"
export function untouched(input: string): string {
    const lowered = input.toLowerCase();
    const padded = lowered.padStart(10, "x");
    return padded.slice(0, 8);
}
"#;

        let changed_path = src_dir.join("changed.ts");
        let focused_copy_path = src_dir.join("focused-copy.ts");
        let untouched_a_path = src_dir.join("untouched-a.ts");
        let untouched_b_path = src_dir.join("untouched-b.ts");
        std::fs::write(&changed_path, focused_code).expect("write changed");
        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: changed_path.clone(),
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: focused_copy_path,
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: untouched_a_path,
                size_bytes: untouched_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(3),
                path: untouched_b_path,
                size_bytes: untouched_code.len() as u64,
            },
        ];

        // A threshold of 1 forces the shingle filter to run on this tiny corpus.
        let config = DuplicatesConfig {
            mode: DetectionMode::Strict,
            min_tokens: 5,
            min_lines: 2,
            min_corpus_size_for_shingle_filter: 1,
            ..DuplicatesConfig::default()
        };
        let mut focus = FxHashSet::default();
        focus.insert(changed_path.clone());

        let full_report = find_duplicates(dir.path(), &files, &config);
        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
        let expected_touching = full_report
            .clone_groups
            .iter()
            .filter(|group| {
                group
                    .instances
                    .iter()
                    .any(|instance| instance.file == changed_path)
            })
            .count();

        assert!(
            !report.clone_groups.is_empty(),
            "focused file should still match an unchanged duplicate"
        );
        assert_eq!(
            report.clone_groups.len(),
            expected_touching,
            "focused shingle filtering must not drop clone groups touching the focused file"
        );
        assert!(report.clone_groups.iter().all(|group| {
            group
                .instances
                .iter()
                .any(|instance| instance.file == changed_path)
        }));
    }

    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        // With only 2 files and one suppressed, there should be no clones
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}
803            report.clone_groups.is_empty(),
804            "File-wide suppression should exclude file from duplication analysis"
805        );
806    }
807}