
fallow_core/duplicates/mod.rs

//! Code duplication / clone detection module.
//!
//! This module implements suffix array + LCP based clone detection
//! for TypeScript/JavaScript source files. It supports multiple detection
//! modes, from strict (exact matches only) to semantic (structure-aware
//! matching that ignores identifier names and literal values).
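//!
//! # Example
//!
//! A minimal sketch of the typical flow (the root path is illustrative, and
//! `files` would come from `crate::discover::discover_files`):
//!
//! ```ignore
//! use std::path::Path;
//!
//! let root = Path::new("/path/to/project");
//! let config = DuplicatesConfig::default();
//! let report = find_duplicates(root, &files, &config);
//! for group in &report.clone_groups {
//!     for instance in &group.instances {
//!         println!("{}:{}-{}", instance.file.display(), instance.start_line, instance.end_line);
//!     }
//! }
//! ```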

mod cache;
pub mod detect;
pub mod families;
pub mod normalize;
mod shingle_filter;
pub mod token_types;
mod token_visitor;
pub mod tokenize;
pub(crate) mod types;

use rustc_hash::FxHashMap;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};

use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
use rayon::prelude::*;
use rustc_hash::FxHashSet;

use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
use detect::CloneDetector;
use normalize::normalize_and_hash_resolved;
use tokenize::{tokenize_file, tokenize_file_cross_language};
pub use types::{
    CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
    DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
    RefactoringKind, RefactoringSuggestion,
};

use crate::discover::{self, DiscoveredFile};
use crate::suppress::{self, IssueKind, Suppression};

/// Built-in duplicates ignores for generated framework and tool output.
///
/// These are engine policy defaults, not config-file defaults: `duplicates.ignore`
/// stays empty in round-tripped configs, while the analyzer merges these patterns
/// unless `duplicates.ignoreDefaults` is set to `false`.
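///
/// A sketch of the opt-out, mirroring the tests below (`**/generated/**` is
/// an illustrative user pattern):
///
/// ```ignore
/// let config = DuplicatesConfig {
///     ignore_defaults: false,                      // drop these built-ins
///     ignore: vec!["**/generated/**".to_string()], // keep user patterns only
///     ..DuplicatesConfig::default()
/// };
/// ```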
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];

#[derive(Clone)]
pub(super) struct TokenizedFile {
    path: PathBuf,
    hashed_tokens: Vec<normalize::HashedToken>,
    file_tokens: tokenize::FileTokens,
    metadata: Option<std::fs::Metadata>,
    cache_hit: bool,
    suppressions: Vec<Suppression>,
}

struct IgnoreSet {
    all: GlobSet,
    defaults: Vec<(&'static str, GlobMatcher)>,
}

impl IgnoreSet {
    fn is_match(&self, path: &Path) -> bool {
        self.all.is_match(path)
    }

    fn default_match_index(&self, path: &Path) -> Option<usize> {
        self.defaults
            .iter()
            .position(|(_, matcher)| matcher.is_match(path))
    }
}

struct DuplicationRun {
    report: DuplicationReport,
    default_ignore_skips: DefaultIgnoreSkips,
}

/// Run duplication detection on the given files.
///
/// This is the main entry point for the duplication analysis. It:
/// 1. Reads and tokenizes all source files in parallel
/// 2. Normalizes tokens according to the detection mode
/// 3. Runs suffix array + LCP clone detection
/// 4. Groups clone instances into families with refactoring suggestions
/// 5. Applies inline suppression filters
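///
/// # Example
///
/// A minimal sketch of a typical call (discovery elided; `root` and `files`
/// are illustrative, and `find_duplicates_in_project` bundles discovery):
///
/// ```ignore
/// let config = DuplicatesConfig::default();
/// let report = find_duplicates(root, &files, &config);
/// // Every surviving group reports at least two instances.
/// assert!(report.clone_groups.iter().all(|g| g.instances.len() >= 2));
/// ```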
pub fn find_duplicates(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
) -> DuplicationReport {
    find_duplicates_inner(root, files, config, None, None).report
}

/// Run duplication detection and return human-format sidecar metadata for
/// files skipped by built-in duplicates ignores.
pub fn find_duplicates_with_default_ignore_skips(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
) -> (DuplicationReport, DefaultIgnoreSkips) {
    let run = find_duplicates_inner(root, files, config, None, None);
    (run.report, run.default_ignore_skips)
}

/// Run duplication detection with the persistent token cache enabled.
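///
/// The cache is only consulted once the corpus reaches
/// `config.min_corpus_size_for_token_cache` files; smaller corpora skip
/// token-cache IO entirely.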
pub fn find_duplicates_cached(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    cache_root: &Path,
) -> DuplicationReport {
    find_duplicates_inner(root, files, config, None, Some(cache_root)).report
}

/// Run cached duplication detection and return human-format sidecar metadata for
/// files skipped by built-in duplicates ignores.
pub fn find_duplicates_cached_with_default_ignore_skips(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    cache_root: &Path,
) -> (DuplicationReport, DefaultIgnoreSkips) {
    let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
    (run.report, run.default_ignore_skips)
}

/// Run duplication detection and only return clone groups touching `focus_files`.
///
/// This keeps all files in the matching corpus, which preserves changed-file
/// versus unchanged-file detection for diff-scoped audit runs, but avoids
/// materializing duplicate groups that cannot appear in the scoped report.
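///
/// A minimal sketch of a focused run (the changed path is illustrative):
///
/// ```ignore
/// use rustc_hash::FxHashSet;
///
/// let mut focus = FxHashSet::default();
/// focus.insert(root.join("src/changed.ts"));
/// let report = find_duplicates_touching_files(root, &files, &config, &focus);
/// // Every group in `report` touches at least one focused file.
/// ```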
#[expect(
    clippy::implicit_hasher,
    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
)]
pub fn find_duplicates_touching_files(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    focus_files: &FxHashSet<PathBuf>,
) -> DuplicationReport {
    find_duplicates_inner(root, files, config, Some(focus_files), None).report
}

/// Run focused duplication detection and return human-format sidecar metadata
/// for files skipped by built-in duplicates ignores.
#[expect(
    clippy::implicit_hasher,
    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
)]
pub fn find_duplicates_touching_files_with_default_ignore_skips(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    focus_files: &FxHashSet<PathBuf>,
) -> (DuplicationReport, DefaultIgnoreSkips) {
    let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
    (run.report, run.default_ignore_skips)
}

/// Run focused duplication detection with the persistent token cache enabled.
#[expect(
    clippy::implicit_hasher,
    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
)]
pub fn find_duplicates_touching_files_cached(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    focus_files: &FxHashSet<PathBuf>,
    cache_root: &Path,
) -> DuplicationReport {
    find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
}

/// Run cached focused duplication detection and return human-format sidecar
/// metadata for files skipped by built-in duplicates ignores.
#[expect(
    clippy::implicit_hasher,
    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
)]
pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    focus_files: &FxHashSet<PathBuf>,
    cache_root: &Path,
) -> (DuplicationReport, DefaultIgnoreSkips) {
    let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
    (run.report, run.default_ignore_skips)
}

fn find_duplicates_inner(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
    focus_files: Option<&FxHashSet<PathBuf>>,
    cache_root: Option<&Path>,
) -> DuplicationRun {
    let _span = tracing::info_span!("find_duplicates").entered();

    let extra_ignores = build_ignore_set(config);
    let default_skip_counts = extra_ignores
        .as_ref()
        .map(|ignores| {
            std::iter::repeat_with(|| AtomicUsize::new(0))
                .take(ignores.defaults.len())
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();

    // Resolve normalization: mode defaults + user overrides
    let normalization =
        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);

    let strip_types = config.cross_language;
    let skip_imports = config.ignore_imports;

    tracing::debug!(
        ignore_imports = skip_imports,
        "duplication tokenization config"
    );

    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
    let token_cache = cache_root.map(TokenCache::load);

    // Step 1 & 2: Tokenize and normalize all files in parallel, also parse suppressions
    let mut file_data: Vec<TokenizedFile> = files
        .par_iter()
        .filter_map(|file| {
            // Apply extra ignore patterns
            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
            if let Some(ref ignores) = extra_ignores {
                if let Some(index) = ignores.default_match_index(relative) {
                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
                    return None;
                }
                if ignores.is_match(relative) {
                    return None;
                }
            }

            let metadata = std::fs::metadata(&file.path).ok()?;

            let cached_entry = token_cache
                .as_ref()
                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
            let cache_hit = cached_entry.is_some();

            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
                let suppressions =
                    suppress::parse_suppressions_from_source(&entry.file_tokens.source);
                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                    return None;
                }
                (entry, suppressions)
            } else {
                let source = std::fs::read_to_string(&file.path).ok()?;
                let suppressions = suppress::parse_suppressions_from_source(&source);
                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                    return None;
                }

                // Tokenize (with optional type stripping for cross-language detection)
                let file_tokens = if strip_types {
                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
                } else {
                    tokenize_file(&file.path, &source, skip_imports)
                };
                if file_tokens.tokens.is_empty() {
                    return None;
                }

                // Normalize and hash using resolved normalization flags
                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
                let entry = TokenCacheEntry {
                    hashed_tokens: hashed,
                    file_tokens,
                };
                (entry, suppressions)
            };
            if entry.file_tokens.tokens.is_empty() {
                return None;
            }
            if entry.hashed_tokens.len() < config.min_tokens {
                return None;
            }

            Some(TokenizedFile {
                path: file.path.clone(),
                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
                file_tokens: entry.file_tokens,
                metadata: Some(metadata),
                cache_hit,
                suppressions,
            })
        })
        .collect();

    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
        for file in &file_data {
            if !file.cache_hit
                && let Some(metadata) = &file.metadata
            {
                cache.insert(
                    &file.path,
                    metadata,
                    token_cache_mode,
                    &file.hashed_tokens,
                    &file.file_tokens,
                );
            }
        }
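        // Evict cache entries for files no longer in the discovered set.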
        cache.retain_paths(files);
        match cache.save_if_dirty() {
            Ok(true) => {
                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
            }
            Ok(false) => {
                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
            }
            Err(err) => {
                tracing::warn!("Failed to save duplication token cache: {err}");
            }
        }
    }

    tracing::info!(
        files = file_data.len(),
        "tokenized files for duplication analysis"
    );

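    // For focused runs on sufficiently large corpora, narrow the corpus to
    // shingle-level candidates for the focus set before full detection.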
    if let Some(focus_files) = focus_files
        && file_data.len() >= config.min_corpus_size_for_shingle_filter
    {
        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
    }

    // Collect per-file suppressions for line-level filtering
    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
        .iter()
        .filter(|file| !file.suppressions.is_empty())
        .map(|file| (file.path.clone(), file.suppressions.clone()))
        .collect();

    // Strip suppressions from the data passed to the detector
    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
        file_data
            .into_iter()
            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
            .collect();

    // Step 3 & 4: Detect clones
    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
    let mut report = if let Some(focus_files) = focus_files {
        detector.detect_touching_files(detector_data, focus_files)
    } else {
        detector.detect(detector_data)
    };

    // Step 5: Apply line-level suppressions
    if !suppressions_by_file.is_empty() {
        apply_line_suppressions(&mut report, &suppressions_by_file);
    }

    let default_ignore_skips =
        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);

    // Step 6: Group into families with refactoring suggestions
    report.clone_families = families::group_into_families(&report.clone_groups, root);

    // Step 7: Detect mirrored directory trees
    report.mirrored_directories =
        families::detect_mirrored_directories(&report.clone_families, root);

    // Sort all result arrays for deterministic output ordering.
    // Parallel tokenization (par_iter) doesn't guarantee collection order.
    report.sort();

    DuplicationRun {
        report,
        default_ignore_skips,
    }
}

/// Filter out clone instances that are suppressed by line-level comments.
#[expect(
    clippy::cast_possible_truncation,
    reason = "line numbers are bounded by source size"
)]
fn apply_line_suppressions(
    report: &mut DuplicationReport,
    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
) {
    report.clone_groups.retain_mut(|group| {
        group.instances.retain(|instance| {
            if let Some(supps) = suppressions_by_file.get(&instance.file) {
                // Check if any line in the instance range is suppressed
                for line in instance.start_line..=instance.end_line {
                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
                        return false;
                    }
                }
            }
            true
        });
        // Keep group only if it still has 2+ instances
        group.instances.len() >= 2
    });
}

/// Run duplication detection on a project directory using auto-discovered files.
///
/// This is a convenience function that handles file discovery internally.
#[must_use]
pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
    let resolved = crate::default_config(root);
    let files = discover::discover_files(&resolved);
    find_duplicates(root, &files, config)
}

/// Build a merged ignore set from built-in and user-provided duplicates ignores.
fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
    if !config.ignore_defaults && config.ignore.is_empty() {
        return None;
    }

    let mut builder = GlobSetBuilder::new();
    let mut defaults = Vec::new();

    if config.ignore_defaults {
        for pattern in DUPES_DEFAULT_IGNORES {
            match Glob::new(pattern) {
                Ok(glob) => {
                    defaults.push((*pattern, glob.compile_matcher()));
                    builder.add(glob);
                }
                Err(e) => {
                    tracing::warn!("Invalid default duplication ignore pattern '{pattern}': {e}");
                }
            }
        }
    }

    for pattern in &config.ignore {
        match Glob::new(pattern) {
            Ok(glob) => {
                builder.add(glob);
            }
            Err(e) => {
                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
            }
        }
    }

    builder.build().ok().map(|all| IgnoreSet { all, defaults })
}

fn build_default_ignore_skips(
    ignores: Option<&IgnoreSet>,
    counts: &[AtomicUsize],
) -> DefaultIgnoreSkips {
    let Some(ignores) = ignores else {
        return DefaultIgnoreSkips::default();
    };

    let by_pattern = ignores
        .defaults
        .iter()
        .zip(counts)
        .filter_map(|((pattern, _), count)| {
            let count = count.load(Ordering::Relaxed);
            (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
        })
        .collect::<Vec<_>>();
    let total = by_pattern.iter().map(|entry| entry.count).sum();

    DefaultIgnoreSkips { total, by_pattern }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn build_ignore_set_empty() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ..DuplicatesConfig::default()
        };
        assert!(build_ignore_set(&config).is_none());
    }

    #[test]
    fn build_ignore_set_valid_patterns() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match(Path::new("src/foo.test.ts")));
        assert!(set.is_match(Path::new("src/bar.spec.ts")));
        assert!(!set.is_match(Path::new("src/baz.ts")));
    }

    #[test]
    fn build_ignore_set_merges_defaults_with_user_patterns() {
        let config = DuplicatesConfig {
            ignore: vec!["**/foo/**".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config).expect("ignore set");
        assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
        assert!(set.is_match(Path::new("src/foo/generated.js")));
    }

    #[test]
    fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
        let config = DuplicatesConfig {
            ignore_defaults: false,
            ignore: vec!["**/foo/**".to_string()],
            ..DuplicatesConfig::default()
        };
        let set = build_ignore_set(&config).expect("ignore set");
        assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
        assert!(set.is_match(Path::new("src/foo/generated.js")));
    }

    #[test]
    fn find_duplicates_with_real_files() {
        // Create a temp directory with duplicate files
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        // Should also have clone families
        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    #[test]
    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = "export function same(input: number): number {\n  const doubled = input * 2;\n  return doubled + 1;\n}\n";
        let first = src_dir.join("first.ts");
        let second = src_dir.join("second.ts");
        std::fs::write(&first, code).expect("write first");
        std::fs::write(&second, code).expect("write second");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: first,
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: second,
                size_bytes: code.len() as u64,
            },
        ];
        let config = DuplicatesConfig {
            min_tokens: 5,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };
        let cache_root = dir.path().join(".fallow");

        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);

        assert!(!report.clone_groups.is_empty());
        assert!(
            !cache_root.exists(),
            "small projects should avoid token-cache IO overhead"
        );
    }

    #[test]
    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let focused_code = r"
export function focused(input: number): number {
    const doubled = input * 2;
    const shifted = doubled + 10;
    return shifted / 2;
}
";
        let untouched_code = r#"
export function untouched(input: string): string {
    const lowered = input.toLowerCase();
    const padded = lowered.padStart(10, "x");
    return padded.slice(0, 8);
}
"#;

        let changed_path = src_dir.join("changed.ts");
        let focused_copy_path = src_dir.join("focused-copy.ts");
        let untouched_a_path = src_dir.join("untouched-a.ts");
        let untouched_b_path = src_dir.join("untouched-b.ts");
        std::fs::write(&changed_path, focused_code).expect("write changed");
        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: changed_path.clone(),
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: focused_copy_path,
                size_bytes: focused_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(2),
                path: untouched_a_path,
                size_bytes: untouched_code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(3),
                path: untouched_b_path,
                size_bytes: untouched_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            mode: DetectionMode::Strict,
            min_tokens: 5,
            min_lines: 2,
            min_corpus_size_for_shingle_filter: 1,
            ..DuplicatesConfig::default()
        };
        let mut focus = FxHashSet::default();
        focus.insert(changed_path.clone());

        let full_report = find_duplicates(dir.path(), &files, &config);
        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
        let expected_touching = full_report
            .clone_groups
            .iter()
            .filter(|group| {
                group
                    .instances
                    .iter()
                    .any(|instance| instance.file == changed_path)
            })
            .count();

        assert!(
            !report.clone_groups.is_empty(),
            "focused file should still match an unchanged duplicate"
        );
        assert_eq!(
            report.clone_groups.len(),
            expected_touching,
            "focused shingle filtering must not drop clone groups touching the focused file"
        );
        assert!(report.clone_groups.iter().all(|group| {
            group
                .instances
                .iter()
                .any(|instance| instance.file == changed_path)
        }));
    }

    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        // With only 2 files and one suppressed, there should be no clones
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}