Skip to main content

fallow_core/duplicates/
mod.rs

1//! Code duplication / clone detection module.
2//!
3//! This module implements suffix array + LCP based clone detection
4//! for TypeScript/JavaScript source files. It supports multiple detection
5//! modes from strict (exact matches only) to semantic (structure-aware
6//! matching that ignores identifier names and literal values).
7
8mod cache;
9pub mod deepdive;
10pub mod detect;
11pub mod families;
12pub mod normalize;
13mod shingle_filter;
14pub mod token_types;
15mod token_visitor;
16pub mod tokenize;
17pub(crate) mod types;
18
19use rustc_hash::FxHashMap;
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
24use rayon::prelude::*;
25use rustc_hash::FxHashSet;
26
27use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
28pub use deepdive::{
29    CloneFingerprintKey, CloneFingerprintSet, FINGERPRINT_PREFIX, clone_fingerprint,
30    dominant_identifier, fingerprint_for_fragment, group_refactoring_suggestion,
31};
32use detect::CloneDetector;
33use normalize::normalize_and_hash_resolved;
34use tokenize::{tokenize_file, tokenize_file_cross_language};
35pub use types::{
36    CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
37    DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
38    RefactoringKind, RefactoringSuggestion,
39};
40
41use crate::discover::{self, DiscoveredFile};
42use crate::suppress::{self, IssueKind, Suppression};
43
/// Glob patterns for generated framework and build-tool output that the
/// duplicates analyzer ignores out of the box.
///
/// This list is engine policy rather than a config-file default:
/// `duplicates.ignore` stays empty in round-tripped configs, and the analyzer
/// merges these patterns with the user's own ignores unless
/// `duplicates.ignoreDefaults` is set to `false`.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
60
61#[derive(Clone)]
62pub(super) struct TokenizedFile {
63    path: PathBuf,
64    hashed_tokens: Vec<normalize::HashedToken>,
65    file_tokens: tokenize::FileTokens,
66    metadata: Option<std::fs::Metadata>,
67    cache_hit: bool,
68    suppressions: Vec<Suppression>,
69}
70
71struct IgnoreSet {
72    all: GlobSet,
73    defaults: Vec<(&'static str, GlobMatcher)>,
74}
75
76impl IgnoreSet {
77    fn is_match(&self, path: &Path) -> bool {
78        self.all.is_match(path)
79    }
80
81    fn default_match_index(&self, path: &Path) -> Option<usize> {
82        self.defaults
83            .iter()
84            .position(|(_, matcher)| matcher.is_match(path))
85    }
86}
87
88struct DuplicationRun {
89    report: DuplicationReport,
90    default_ignore_skips: DefaultIgnoreSkips,
91}
92
93/// Run duplication detection on the given files.
94///
95/// This is the main entry point for the duplication analysis. It:
96/// 1. Reads and tokenizes all source files in parallel
97/// 2. Normalizes tokens according to the detection mode
98/// 3. Runs suffix array + LCP clone detection
99/// 4. Groups clone instances into families with refactoring suggestions
100/// 5. Applies inline suppression filters
101pub fn find_duplicates(
102    root: &Path,
103    files: &[DiscoveredFile],
104    config: &DuplicatesConfig,
105) -> DuplicationReport {
106    find_duplicates_inner(root, files, config, None, None).report
107}
108
109/// Run duplication detection and return human-format sidecar metadata for
110/// files skipped by built-in duplicates ignores.
111pub fn find_duplicates_with_default_ignore_skips(
112    root: &Path,
113    files: &[DiscoveredFile],
114    config: &DuplicatesConfig,
115) -> (DuplicationReport, DefaultIgnoreSkips) {
116    let run = find_duplicates_inner(root, files, config, None, None);
117    (run.report, run.default_ignore_skips)
118}
119
120/// Run duplication detection with the persistent token cache enabled.
121pub fn find_duplicates_cached(
122    root: &Path,
123    files: &[DiscoveredFile],
124    config: &DuplicatesConfig,
125    cache_root: &Path,
126) -> DuplicationReport {
127    find_duplicates_inner(root, files, config, None, Some(cache_root)).report
128}
129
130/// Run cached duplication detection and return human-format sidecar metadata for
131/// files skipped by built-in duplicates ignores.
132pub fn find_duplicates_cached_with_default_ignore_skips(
133    root: &Path,
134    files: &[DiscoveredFile],
135    config: &DuplicatesConfig,
136    cache_root: &Path,
137) -> (DuplicationReport, DefaultIgnoreSkips) {
138    let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
139    (run.report, run.default_ignore_skips)
140}
141
142/// Run duplication detection and only return clone groups touching `focus_files`.
143///
144/// This keeps all files in the matching corpus, which preserves changed-file
145/// versus unchanged-file detection for diff-scoped audit runs, but avoids
146/// materializing duplicate groups that cannot appear in the scoped report.
147#[expect(
148    clippy::implicit_hasher,
149    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
150)]
151pub fn find_duplicates_touching_files(
152    root: &Path,
153    files: &[DiscoveredFile],
154    config: &DuplicatesConfig,
155    focus_files: &FxHashSet<PathBuf>,
156) -> DuplicationReport {
157    find_duplicates_inner(root, files, config, Some(focus_files), None).report
158}
159
160/// Run focused duplication detection and return human-format sidecar metadata
161/// for files skipped by built-in duplicates ignores.
162#[expect(
163    clippy::implicit_hasher,
164    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
165)]
166pub fn find_duplicates_touching_files_with_default_ignore_skips(
167    root: &Path,
168    files: &[DiscoveredFile],
169    config: &DuplicatesConfig,
170    focus_files: &FxHashSet<PathBuf>,
171) -> (DuplicationReport, DefaultIgnoreSkips) {
172    let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
173    (run.report, run.default_ignore_skips)
174}
175
176/// Run focused duplication detection with the persistent token cache enabled.
177#[expect(
178    clippy::implicit_hasher,
179    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
180)]
181pub fn find_duplicates_touching_files_cached(
182    root: &Path,
183    files: &[DiscoveredFile],
184    config: &DuplicatesConfig,
185    focus_files: &FxHashSet<PathBuf>,
186    cache_root: &Path,
187) -> DuplicationReport {
188    find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
189}
190
191/// Run cached focused duplication detection and return human-format sidecar
192/// metadata for files skipped by built-in duplicates ignores.
193#[expect(
194    clippy::implicit_hasher,
195    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
196)]
197pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
198    root: &Path,
199    files: &[DiscoveredFile],
200    config: &DuplicatesConfig,
201    focus_files: &FxHashSet<PathBuf>,
202    cache_root: &Path,
203) -> (DuplicationReport, DefaultIgnoreSkips) {
204    let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
205    (run.report, run.default_ignore_skips)
206}
207
208fn find_duplicates_inner(
209    root: &Path,
210    files: &[DiscoveredFile],
211    config: &DuplicatesConfig,
212    focus_files: Option<&FxHashSet<PathBuf>>,
213    cache_root: Option<&Path>,
214) -> DuplicationRun {
215    let _span = tracing::info_span!("find_duplicates").entered();
216
217    let extra_ignores = build_ignore_set(config);
218    let default_skip_counts = extra_ignores
219        .as_ref()
220        .map(|ignores| {
221            std::iter::repeat_with(|| AtomicUsize::new(0))
222                .take(ignores.defaults.len())
223                .collect::<Vec<_>>()
224        })
225        .unwrap_or_default();
226
227    // Resolve normalization: mode defaults + user overrides
228    let normalization =
229        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
230
231    let strip_types = config.cross_language;
232    let skip_imports = config.ignore_imports;
233
234    tracing::debug!(
235        ignore_imports = skip_imports,
236        "duplication tokenization config"
237    );
238
239    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
240    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
241    let token_cache = cache_root.map(TokenCache::load);
242
243    // Step 1 & 2: Tokenize and normalize all files in parallel, also parse suppressions
244    let mut file_data: Vec<TokenizedFile> = files
245        .par_iter()
246        .filter_map(|file| {
247            // Apply extra ignore patterns
248            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
249            if let Some(ref ignores) = extra_ignores {
250                if let Some(index) = ignores.default_match_index(relative) {
251                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
252                    return None;
253                }
254                if ignores.is_match(relative) {
255                    return None;
256                }
257            }
258
259            let metadata = std::fs::metadata(&file.path).ok()?;
260
261            let cached_entry = token_cache
262                .as_ref()
263                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
264            let cache_hit = cached_entry.is_some();
265
266            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
267                let suppressions = entry.suppressions.clone();
268                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
269                    return None;
270                }
271                (entry, suppressions)
272            } else {
273                let source = std::fs::read_to_string(&file.path).ok()?;
274                let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
275                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
276                    return None;
277                }
278
279                // Tokenize (with optional type stripping for cross-language detection)
280                let file_tokens = if strip_types {
281                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
282                } else {
283                    tokenize_file(&file.path, &source, skip_imports)
284                };
285                if file_tokens.tokens.is_empty() {
286                    return None;
287                }
288
289                // Normalize and hash using resolved normalization flags
290                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
291                let entry = TokenCacheEntry {
292                    hashed_tokens: hashed,
293                    file_tokens,
294                    suppressions: suppressions.clone(),
295                };
296                (entry, suppressions)
297            };
298            if entry.file_tokens.tokens.is_empty() {
299                return None;
300            }
301            if entry.hashed_tokens.len() < config.min_tokens {
302                return None;
303            }
304
305            Some(TokenizedFile {
306                path: file.path.clone(),
307                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
308                file_tokens: entry.file_tokens,
309                metadata: Some(metadata),
310                cache_hit,
311                suppressions,
312            })
313        })
314        .collect();
315
316    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
317        for file in &file_data {
318            if !file.cache_hit
319                && let Some(metadata) = &file.metadata
320            {
321                cache.insert(
322                    &file.path,
323                    metadata,
324                    token_cache_mode,
325                    &file.hashed_tokens,
326                    &file.file_tokens,
327                    &file.suppressions,
328                );
329            }
330        }
331        cache.retain_paths(files);
332        match cache.save_if_dirty() {
333            Ok(true) => {
334                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
335            }
336            Ok(false) => {
337                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
338            }
339            Err(err) => {
340                tracing::warn!("Failed to save duplication token cache: {err}");
341            }
342        }
343    }
344
345    tracing::info!(
346        files = file_data.len(),
347        "tokenized files for duplication analysis"
348    );
349
350    if let Some(focus_files) = focus_files
351        && file_data.len() >= config.min_corpus_size_for_shingle_filter
352    {
353        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
354    }
355
356    // Collect per-file suppressions for line-level filtering
357    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
358        .iter()
359        .filter(|file| !file.suppressions.is_empty())
360        .map(|file| (file.path.clone(), file.suppressions.clone()))
361        .collect();
362
363    // Strip suppressions from the data passed to the detector
364    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
365        file_data
366            .into_iter()
367            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
368            .collect();
369
370    // Step 3 & 4: Detect clones
371    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
372    let mut report = if let Some(focus_files) = focus_files {
373        detector.detect_touching_files(detector_data, focus_files)
374    } else {
375        detector.detect(detector_data)
376    };
377
378    // Step 5: Apply line-level suppressions FIRST, so the post-suppression
379    // instance count is what the min-occurrences filter evaluates. Otherwise
380    // a 3-instance clone group whose third instance is line-suppressed would
381    // survive `--min-occurrences 3` and show up as a 2-instance group.
382    if !suppressions_by_file.is_empty() {
383        apply_line_suppressions(&mut report, &suppressions_by_file);
384    }
385
386    // Step 5b: Apply the min-occurrences filter on the post-suppression set.
387    apply_min_occurrences_filter(&mut report, config.min_occurrences);
388
389    let default_ignore_skips =
390        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
391
392    // Step 6: Group into families with refactoring suggestions
393    report.clone_families = families::group_into_families(&report.clone_groups, root);
394
395    // Step 7: Detect mirrored directory trees
396    report.mirrored_directories =
397        families::detect_mirrored_directories(&report.clone_families, root);
398
399    // Sort all result arrays for deterministic output ordering.
400    // Parallel tokenization (par_iter) doesn't guarantee collection order.
401    report.sort();
402
403    DuplicationRun {
404        report,
405        default_ignore_skips,
406    }
407}
408
409/// Drop clone groups with fewer than `min` instances and record the count on
410/// the stats block. The detector already guarantees `>= 2`, so this is a
411/// no-op when `min <= 2`.
412///
413/// Stats split: `clone_groups` and `clone_instances` are recomputed
414/// post-filter so they match the serialized array length (a CI consumer
415/// reading `stats.clone_groups` and iterating `clone_groups[]` sees the same
416/// count). `duplication_percentage`, `duplicated_lines`, `duplicated_tokens`,
417/// and `files_with_clones` stay pre-filter so the percentage math (lines /
418/// total) stays consistent and `threshold` gates / trend lines don't shift
419/// when the filter changes. The hidden count is disclosed in
420/// `clone_groups_below_min_occurrences`. The surviving groups feed every
421/// downstream step (families, mirrored dirs, --top, baseline, changed-since,
422/// workspace scoping) so there's a single source of truth.
423fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
424    if min <= 2 {
425        return;
426    }
427    let before = report.clone_groups.len();
428    report
429        .clone_groups
430        .retain(|group| group.instances.len() >= min);
431    let hidden = before - report.clone_groups.len();
432    if hidden == 0 {
433        return;
434    }
435    report.stats.clone_groups_below_min_occurrences = hidden;
436    report.stats.clone_groups = report.clone_groups.len();
437    report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
438}
439
440/// Filter out clone instances that are suppressed by line-level comments.
441#[expect(
442    clippy::cast_possible_truncation,
443    reason = "line numbers are bounded by source size"
444)]
445fn apply_line_suppressions(
446    report: &mut DuplicationReport,
447    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
448) {
449    report.clone_groups.retain_mut(|group| {
450        group.instances.retain(|instance| {
451            if let Some(supps) = suppressions_by_file.get(&instance.file) {
452                // Check if any line in the instance range is suppressed
453                for line in instance.start_line..=instance.end_line {
454                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
455                        return false;
456                    }
457                }
458            }
459            true
460        });
461        // Keep group only if it still has 2+ instances
462        group.instances.len() >= 2
463    });
464}
465
466/// Run duplication detection on a project directory using auto-discovered files.
467///
468/// This is a convenience function that handles file discovery internally.
469#[must_use]
470pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
471    let resolved = crate::default_config(root);
472    let files = discover::discover_files_with_plugin_scopes(&resolved);
473    find_duplicates(root, &files, config)
474}
475
476/// Build a merged ignore set from built-in and user-provided duplicates ignores.
477fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
478    if !config.ignore_defaults && config.ignore.is_empty() {
479        return None;
480    }
481
482    let mut builder = GlobSetBuilder::new();
483    let mut defaults = Vec::new();
484
485    if config.ignore_defaults {
486        for pattern in DUPES_DEFAULT_IGNORES {
487            let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
488            defaults.push((*pattern, glob.compile_matcher()));
489            builder.add(glob);
490        }
491    }
492
493    // User patterns were validated at config load time
494    // (see FallowConfig::validate_user_globs).
495    for pattern in &config.ignore {
496        builder.add(
497            Glob::new(pattern)
498                .expect("duplicates.ignore pattern was validated at config load time"),
499        );
500    }
501
502    builder.build().ok().map(|all| IgnoreSet { all, defaults })
503}
504
505fn build_default_ignore_skips(
506    ignores: Option<&IgnoreSet>,
507    counts: &[AtomicUsize],
508) -> DefaultIgnoreSkips {
509    let Some(ignores) = ignores else {
510        return DefaultIgnoreSkips::default();
511    };
512
513    let by_pattern = ignores
514        .defaults
515        .iter()
516        .zip(counts)
517        .filter_map(|((pattern, _), count)| {
518            let count = count.load(Ordering::Relaxed);
519            (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
520        })
521        .collect::<Vec<_>>();
522    let total = by_pattern.iter().map(|entry| entry.count).sum();
523
524    DefaultIgnoreSkips { total, by_pattern }
525}
526
527#[cfg(test)]
528mod tests {
529    use super::*;
530    use crate::discover::FileId;
531
532    #[test]
533    fn find_duplicates_empty_files() {
534        let config = DuplicatesConfig::default();
535        let report = find_duplicates(Path::new("/tmp"), &[], &config);
536        assert!(report.clone_groups.is_empty());
537        assert!(report.clone_families.is_empty());
538        assert_eq!(report.stats.total_files, 0);
539    }
540
541    #[test]
542    fn build_ignore_set_empty() {
543        let config = DuplicatesConfig {
544            ignore_defaults: false,
545            ..DuplicatesConfig::default()
546        };
547        assert!(build_ignore_set(&config).is_none());
548    }
549
550    #[test]
551    fn build_ignore_set_valid_patterns() {
552        let config = DuplicatesConfig {
553            ignore_defaults: false,
554            ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
555            ..DuplicatesConfig::default()
556        };
557        let set = build_ignore_set(&config);
558        assert!(set.is_some());
559        let set = set.unwrap();
560        assert!(set.is_match(Path::new("src/foo.test.ts")));
561        assert!(set.is_match(Path::new("src/bar.spec.ts")));
562        assert!(!set.is_match(Path::new("src/baz.ts")));
563    }
564
565    #[test]
566    fn build_ignore_set_merges_defaults_with_user_patterns() {
567        let config = DuplicatesConfig {
568            ignore: vec!["**/foo/**".to_string()],
569            ..DuplicatesConfig::default()
570        };
571        let set = build_ignore_set(&config).expect("ignore set");
572        assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
573        assert!(set.is_match(Path::new("src/foo/generated.js")));
574    }
575
576    #[test]
577    fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
578        let config = DuplicatesConfig {
579            ignore_defaults: false,
580            ignore: vec!["**/foo/**".to_string()],
581            ..DuplicatesConfig::default()
582        };
583        let set = build_ignore_set(&config).expect("ignore set");
584        assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
585        assert!(set.is_match(Path::new("src/foo/generated.js")));
586    }
587
588    #[test]
589    fn find_duplicates_with_real_files() {
590        // Create a temp directory with duplicate files
591        let dir = tempfile::tempdir().expect("create temp dir");
592        let src_dir = dir.path().join("src");
593        std::fs::create_dir_all(&src_dir).expect("create src dir");
594
595        let code = r#"
596export function processData(input: string): string {
597    const trimmed = input.trim();
598    if (trimmed.length === 0) {
599        return "";
600    }
601    const parts = trimmed.split(",");
602    const filtered = parts.filter(p => p.length > 0);
603    const mapped = filtered.map(p => p.toUpperCase());
604    return mapped.join(", ");
605}
606
607export function validateInput(data: string): boolean {
608    if (data === null || data === undefined) {
609        return false;
610    }
611    const cleaned = data.trim();
612    if (cleaned.length < 3) {
613        return false;
614    }
615    return true;
616}
617"#;
618
619        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
620        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
621        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
622            .expect("write package.json");
623
624        let files = vec![
625            DiscoveredFile {
626                id: FileId(0),
627                path: src_dir.join("original.ts"),
628                size_bytes: code.len() as u64,
629            },
630            DiscoveredFile {
631                id: FileId(1),
632                path: src_dir.join("copy.ts"),
633                size_bytes: code.len() as u64,
634            },
635        ];
636
637        let config = DuplicatesConfig {
638            min_tokens: 10,
639            min_lines: 2,
640            ..DuplicatesConfig::default()
641        };
642
643        let report = find_duplicates(dir.path(), &files, &config);
644        assert!(
645            !report.clone_groups.is_empty(),
646            "Should detect clones in identical files"
647        );
648        assert!(report.stats.files_with_clones >= 2);
649
650        // Should also have clone families
651        assert!(
652            !report.clone_families.is_empty(),
653            "Should group clones into families"
654        );
655    }
656
657    #[test]
658    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
659        let dir = tempfile::tempdir().expect("create temp dir");
660        let src_dir = dir.path().join("src");
661        std::fs::create_dir_all(&src_dir).expect("create src dir");
662
663        let code = "export function same(input: number): number {\n  const doubled = input * 2;\n  return doubled + 1;\n}\n";
664        let first = src_dir.join("first.ts");
665        let second = src_dir.join("second.ts");
666        std::fs::write(&first, code).expect("write first");
667        std::fs::write(&second, code).expect("write second");
668
669        let files = vec![
670            DiscoveredFile {
671                id: FileId(0),
672                path: first,
673                size_bytes: code.len() as u64,
674            },
675            DiscoveredFile {
676                id: FileId(1),
677                path: second,
678                size_bytes: code.len() as u64,
679            },
680        ];
681        let config = DuplicatesConfig {
682            min_tokens: 5,
683            min_lines: 2,
684            ..DuplicatesConfig::default()
685        };
686        let cache_root = dir.path().join(".fallow");
687
688        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
689
690        assert!(!report.clone_groups.is_empty());
691        assert!(
692            !cache_root.exists(),
693            "small projects should avoid token-cache IO overhead"
694        );
695    }
696
697    #[test]
698    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
699        let dir = tempfile::tempdir().expect("create temp dir");
700        let src_dir = dir.path().join("src");
701        std::fs::create_dir_all(&src_dir).expect("create src dir");
702
703        let focused_code = r"
704export function focused(input: number): number {
705    const doubled = input * 2;
706    const shifted = doubled + 10;
707    return shifted / 2;
708}
709";
710        let untouched_code = r#"
711export function untouched(input: string): string {
712    const lowered = input.toLowerCase();
713    const padded = lowered.padStart(10, "x");
714    return padded.slice(0, 8);
715}
716"#;
717
718        let changed_path = src_dir.join("changed.ts");
719        let focused_copy_path = src_dir.join("focused-copy.ts");
720        let untouched_a_path = src_dir.join("untouched-a.ts");
721        let untouched_b_path = src_dir.join("untouched-b.ts");
722        std::fs::write(&changed_path, focused_code).expect("write changed");
723        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
724        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
725        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
726
727        let files = vec![
728            DiscoveredFile {
729                id: FileId(0),
730                path: changed_path.clone(),
731                size_bytes: focused_code.len() as u64,
732            },
733            DiscoveredFile {
734                id: FileId(1),
735                path: focused_copy_path,
736                size_bytes: focused_code.len() as u64,
737            },
738            DiscoveredFile {
739                id: FileId(2),
740                path: untouched_a_path,
741                size_bytes: untouched_code.len() as u64,
742            },
743            DiscoveredFile {
744                id: FileId(3),
745                path: untouched_b_path,
746                size_bytes: untouched_code.len() as u64,
747            },
748        ];
749
750        let config = DuplicatesConfig {
751            mode: DetectionMode::Strict,
752            min_tokens: 5,
753            min_lines: 2,
754            min_corpus_size_for_shingle_filter: 1,
755            ..DuplicatesConfig::default()
756        };
757        let mut focus = FxHashSet::default();
758        focus.insert(changed_path.clone());
759
760        let full_report = find_duplicates(dir.path(), &files, &config);
761        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
762        let expected_touching = full_report
763            .clone_groups
764            .iter()
765            .filter(|group| {
766                group
767                    .instances
768                    .iter()
769                    .any(|instance| instance.file == changed_path)
770            })
771            .count();
772
773        assert!(
774            !report.clone_groups.is_empty(),
775            "focused file should still match an unchanged duplicate"
776        );
777        assert_eq!(
778            report.clone_groups.len(),
779            expected_touching,
780            "focused shingle filtering must not drop clone groups touching the focused file"
781        );
782        assert!(report.clone_groups.iter().all(|group| {
783            group
784                .instances
785                .iter()
786                .any(|instance| instance.file == changed_path)
787        }));
788    }
789
790    #[test]
791    fn file_wide_suppression_excludes_file() {
        // Writes one TypeScript function to two files, `original.ts` and
        // `suppressed.ts`; the second copy is prefixed with a
        // `// fallow-ignore-file code-duplication` line. The test then asserts
        // that `find_duplicates` reports no clone groups at all.
792        let dir = tempfile::tempdir().expect("create temp dir");
793        let src_dir = dir.path().join("src");
794        std::fs::create_dir_all(&src_dir).expect("create src dir");
795
796        let code = r#"
797export function processData(input: string): string {
798    const trimmed = input.trim();
799    if (trimmed.length === 0) {
800        return "";
801    }
802    const parts = trimmed.split(",");
803    const filtered = parts.filter(p => p.length > 0);
804    const mapped = filtered.map(p => p.toUpperCase());
805    return mapped.join(", ");
806}
807"#;
        // Same body as `code`, with the file-wide ignore directive as line one.
808        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");
809
810        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
811        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        // NOTE(review): presumably the package.json marks the temp dir as a
        // project root for the analyzer; confirm against `find_duplicates`.
812        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
813            .expect("write package.json");
814
        // Hand-built file list handed to `find_duplicates`; each `size_bytes`
        // is the byte length of the text written to that path above.
815        let files = vec![
816            DiscoveredFile {
817                id: FileId(0),
818                path: src_dir.join("original.ts"),
819                size_bytes: code.len() as u64,
820            },
821            DiscoveredFile {
822                id: FileId(1),
823                path: src_dir.join("suppressed.ts"),
824                size_bytes: suppressed_code.len() as u64,
825            },
826        ];
827
        // Only `min_tokens` and `min_lines` are overridden; every other field
        // comes from `DuplicatesConfig::default()`.
828        let config = DuplicatesConfig {
829            min_tokens: 10,
830            min_lines: 2,
831            ..DuplicatesConfig::default()
832        };
833
834        let report = find_duplicates(dir.path(), &files, &config);
835        // With only 2 files and one suppressed, there should be no clones
836        assert!(
837            report.clone_groups.is_empty(),
838            "File-wide suppression should exclude file from duplication analysis"
839        );
840    }
841
842    #[test]
843    fn min_occurrences_hides_pairs_and_records_count() {
        // Runs `find_duplicates` twice over the same five files: once with the
        // default `min_occurrences` and once with `min_occurrences: 3`. The
        // first run must report both clone groups; the second must keep only
        // the three-instance group, count the hidden group in
        // `stats.clone_groups_below_min_occurrences`, and leave
        // `stats.duplication_percentage` equal to the baseline value.
844        let dir = tempfile::tempdir().expect("create temp dir");
845        let src_dir = dir.path().join("src");
846        std::fs::create_dir_all(&src_dir).expect("create src dir");
847
848        // Block A: only appears in 2 files (a pair).
849        // Block B: appears in 3 files (a triple).
850        let block_a = r#"
851export function blockA(input: string): string {
852    const trimmed = input.trim();
853    if (trimmed.length === 0) {
854        return "";
855    }
856    const parts = trimmed.split(",");
857    const filtered = parts.filter(p => p.length > 0);
858    const mapped = filtered.map(p => p.toUpperCase());
859    return mapped.join(", ");
860}
861"#;
862        let block_b = r"
863export function blockB(value: number): number {
864    if (value <= 0) {
865        return 0;
866    }
867    let total = 0;
868    for (let i = 1; i <= value; i += 1) {
869        total += i * 2;
870        total -= 1;
871    }
872    return total + 7;
873}
874";
875
        // Two files receive block A verbatim and three receive block B.
876        let pair_a1 = src_dir.join("pair-a1.ts");
877        let pair_a2 = src_dir.join("pair-a2.ts");
878        let triple_b1 = src_dir.join("triple-b1.ts");
879        let triple_b2 = src_dir.join("triple-b2.ts");
880        let triple_b3 = src_dir.join("triple-b3.ts");
881        std::fs::write(&pair_a1, block_a).expect("write");
882        std::fs::write(&pair_a2, block_a).expect("write");
883        std::fs::write(&triple_b1, block_b).expect("write");
884        std::fs::write(&triple_b2, block_b).expect("write");
885        std::fs::write(&triple_b3, block_b).expect("write");
886
        // The path bindings are moved into the file list; `size_bytes` is the
        // byte length of the block written to each path.
887        let files = vec![
888            DiscoveredFile {
889                id: FileId(0),
890                path: pair_a1,
891                size_bytes: block_a.len() as u64,
892            },
893            DiscoveredFile {
894                id: FileId(1),
895                path: pair_a2,
896                size_bytes: block_a.len() as u64,
897            },
898            DiscoveredFile {
899                id: FileId(2),
900                path: triple_b1,
901                size_bytes: block_b.len() as u64,
902            },
903            DiscoveredFile {
904                id: FileId(3),
905                path: triple_b2,
906                size_bytes: block_b.len() as u64,
907            },
908            DiscoveredFile {
909                id: FileId(4),
910                path: triple_b3,
911                size_bytes: block_b.len() as u64,
912            },
913        ];
914
915        // Baseline: minOccurrences = 2 (default). Both groups reported.
916        let default_config = DuplicatesConfig {
917            min_tokens: 10,
918            min_lines: 2,
919            ..DuplicatesConfig::default()
920        };
921        let baseline = find_duplicates(dir.path(), &files, &default_config);
922        assert_eq!(
923            baseline.clone_groups.len(),
924            2,
925            "default minOccurrences should report both the pair and the triple"
926        );
927        assert_eq!(
928            baseline.stats.clone_groups_below_min_occurrences, 0,
929            "default minOccurrences hides nothing"
930        );
        // Kept for the comparison against the raised-threshold run below.
931        let baseline_pct = baseline.stats.duplication_percentage;
932
933        // Raised: minOccurrences = 3. Only the triple survives.
934        let raised_config = DuplicatesConfig {
935            min_tokens: 10,
936            min_lines: 2,
937            min_occurrences: 3,
938            ..DuplicatesConfig::default()
939        };
940        let report = find_duplicates(dir.path(), &files, &raised_config);
941        assert_eq!(
942            report.clone_groups.len(),
943            1,
944            "minOccurrences=3 should hide the 2-instance group"
945        );
946        assert_eq!(
947            report.clone_groups[0].instances.len(),
948            3,
949            "surviving group must be the 3-instance group"
950        );
951        assert_eq!(
952            report.stats.clone_groups_below_min_occurrences, 1,
953            "the hidden 2-instance group must be counted"
954        );
955        // `clone_groups` and `clone_instances` reflect the post-filter set so
956        // consumers iterating `clone_groups[]` see a matching count.
957        assert_eq!(
958            report.stats.clone_groups, 1,
959            "stats.clone_groups must match the post-filter array length"
960        );
961        assert_eq!(
962            report.stats.clone_instances, 3,
963            "stats.clone_instances must match the surviving instance total"
964        );
965        // `duplication_percentage` stays pre-filter so threshold gates and
966        // trend lines don't shift when minOccurrences changes.
        // The two floats are compared through an absolute difference below
        // `f64::EPSILON` instead of `==`.
967        assert!(
968            (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
969            "duplication_percentage should not shift when minOccurrences changes"
970        );
971    }
972
973    #[test]
974    fn min_occurrences_evaluates_after_line_suppressions() {
975        // Three files share a clone. The third file opts out through a leading
976        // `fallow-ignore-file code-duplication` comment, which should leave 2
977        // instances. With minOccurrences=3 the group must be hidden, NOT
978        // reported as a 2-instance clone. The filter evaluates the
979        // post-suppression count, not the pre-suppression detector output.
        // NOTE(review): the test name refers to line suppressions, yet the
        // fixture below uses the file-wide directive; confirm that this
        // exercises the intended suppression path.
980        let dir = tempfile::tempdir().expect("create temp dir");
981        let src_dir = dir.path().join("src");
982        std::fs::create_dir_all(&src_dir).expect("create src dir");
983
984        let block = r#"
985export function shared(input: string): string {
986    const trimmed = input.trim();
987    if (trimmed.length === 0) {
988        return "";
989    }
990    const parts = trimmed.split(",");
991    const filtered = parts.filter(p => p.length > 0);
992    const mapped = filtered.map(p => p.toUpperCase());
993    return mapped.join(", ");
994}
995"#;
        // Same body as `block`, with the file-wide ignore directive as line one.
996        let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");
997
        // `a.ts` and `b.ts` get the plain block; `c.ts` gets the suppressed copy.
998        let a = src_dir.join("a.ts");
999        let b = src_dir.join("b.ts");
1000        let c = src_dir.join("c.ts");
1001        std::fs::write(&a, block).expect("write a");
1002        std::fs::write(&b, block).expect("write b");
1003        std::fs::write(&c, &suppressed).expect("write c");
1004
        // The path bindings are moved into the file list; `size_bytes` is the
        // byte length of the text written to each path.
1005        let files = vec![
1006            DiscoveredFile {
1007                id: FileId(0),
1008                path: a,
1009                size_bytes: block.len() as u64,
1010            },
1011            DiscoveredFile {
1012                id: FileId(1),
1013                path: b,
1014                size_bytes: block.len() as u64,
1015            },
1016            DiscoveredFile {
1017                id: FileId(2),
1018                path: c,
1019                size_bytes: suppressed.len() as u64,
1020            },
1021        ];
1022
        // `min_occurrences: 3` is above the 2 unsuppressed copies on disk.
1023        let config = DuplicatesConfig {
1024            min_tokens: 10,
1025            min_lines: 2,
1026            min_occurrences: 3,
1027            ..DuplicatesConfig::default()
1028        };
1029        let report = find_duplicates(dir.path(), &files, &config);
        // On failure the message lists the instance count of every group that
        // was reported.
1030        assert!(
1031            report.clone_groups.is_empty(),
1032            "post-suppression 2-instance group must be hidden by minOccurrences=3, \
1033             got groups: {:?}",
1034            report
1035                .clone_groups
1036                .iter()
1037                .map(|g| g.instances.len())
1038                .collect::<Vec<_>>()
1039        );
1040        assert_eq!(
1041            report.stats.clone_groups, 0,
1042            "stats.clone_groups must match the empty post-filter array"
1043        );
1044        assert_eq!(
1045            report.stats.clone_instances, 0,
1046            "stats.clone_instances must match the empty post-filter array"
1047        );
1048    }
1049}