Skip to main content

fallow_core/duplicates/
mod.rs

1//! Code duplication / clone detection module.
2//!
3//! This module implements suffix array + LCP based clone detection
4//! for TypeScript/JavaScript source files. It supports multiple detection
5//! modes from strict (exact matches only) to semantic (structure-aware
6//! matching that ignores identifier names and literal values).
7
8mod cache;
9pub mod deepdive;
10pub mod detect;
11pub mod families;
12pub mod normalize;
13mod shingle_filter;
14pub mod token_types;
15mod token_visitor;
16pub mod tokenize;
17pub(crate) mod types;
18
19use rustc_hash::FxHashMap;
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use globset::{Glob, GlobSet, GlobSetBuilder};
24use rayon::prelude::*;
25use rustc_hash::FxHashSet;
26
27use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
28pub use deepdive::{
29    CloneFingerprintKey, CloneFingerprintSet, FINGERPRINT_PREFIX, clone_fingerprint,
30    dominant_identifier, fingerprint_for_fragment, group_refactoring_suggestion,
31};
32use detect::CloneDetector;
33use normalize::normalize_and_hash_resolved;
34use tokenize::{tokenize_file, tokenize_file_cross_language};
35pub use types::{
36    CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
37    DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
38    RefactoringKind, RefactoringSuggestion,
39};
40
41use crate::discover::{self, DiscoveredFile};
42use crate::suppress::{self, IssueKind, Suppression};
43
/// Built-in duplicates ignores for generated framework, tool, and test output.
///
/// These are engine policy defaults, not config-file defaults: `duplicates.ignore`
/// stays empty in round-tripped configs, while the analyzer merges these patterns
/// unless `duplicates.ignoreDefaults` is set to `false`.
///
/// Entry order is significant inside this module: `build_ignore_set` registers
/// these globs first and in this order, and the per-pattern skip counters are
/// indexed by position in this list.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    // Generated framework / tool output directories.
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
    // Test and mock files.
    "**/*.test.*",
    "**/*.spec.*",
    "**/__tests__/**",
    "**/__mocks__/**",
];
64
/// Per-file tokenization result carried from the tokenize phase into clone
/// detection and token-cache persistence.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the discovered file this data belongs to.
    path: PathBuf,
    /// Normalized, hashed tokens; this is the sequence handed to the clone detector.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Tokenizer output for the file (its tokens and line count are read here).
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata for the file, if captured; passed along when a
    /// freshly tokenized file is inserted into the token cache.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the tokens were served from the token cache; such files are
    /// not re-inserted when the cache is saved.
    cache_hit: bool,
    /// Suppressions parsed from the file's source (or restored from the cache).
    suppressions: Vec<Suppression>,
}
74
/// Compiled duplicates ignore globs together with the built-in patterns that
/// were registered ahead of any user patterns.
struct IgnoreSet {
    /// Every active ignore glob: built-in defaults (when enabled) followed by
    /// the user's `duplicates.ignore` patterns.
    all: GlobSet,
    /// The built-in patterns included in `all`, in registration order. A glob
    /// index below `defaults.len()` therefore refers to a built-in pattern.
    defaults: Vec<&'static str>,
}
79
/// Which kind of ignore pattern caused a path to be skipped.
enum IgnoreMatch {
    /// A built-in default matched; carries that pattern's index in
    /// `IgnoreSet::defaults`.
    Default(usize),
    /// The first matching glob was a user-configured pattern.
    User,
}
84
85impl IgnoreSet {
86    fn match_path(&self, path: &Path, matches: &mut Vec<usize>) -> Option<IgnoreMatch> {
87        self.all.matches_into(path, matches);
88        let first = matches.first().copied()?;
89        if first < self.defaults.len() {
90            Some(IgnoreMatch::Default(first))
91        } else {
92            Some(IgnoreMatch::User)
93        }
94    }
95}
96
/// Result of one internal detection run: the report plus the sidecar data the
/// `*_with_default_ignore_skips` entry points expose.
struct DuplicationRun {
    /// The finished duplication report.
    report: DuplicationReport,
    /// Counts of files skipped by built-in ignore patterns during this run.
    default_ignore_skips: DefaultIgnoreSkips,
}
101
/// Read-only settings shared by every per-file tokenization task.
struct DuplicationTokenizeContext<'a> {
    /// Project root; file paths are made relative to it before glob matching.
    root: &'a Path,
    /// Duplicates configuration (its `min_tokens` gates which files are kept).
    config: &'a DuplicatesConfig,
    /// Merged ignore globs, or `None` when no ignore pattern is active.
    extra_ignores: Option<&'a IgnoreSet>,
    /// One counter per built-in ignore pattern, incremented when that pattern
    /// causes a file to be skipped.
    default_skip_counts: &'a [AtomicUsize],
    /// Loaded token cache, when caching is enabled for this run.
    token_cache: Option<&'a TokenCache>,
    /// Cache mode derived from the normalization and tokenizer flags; used for
    /// cache lookups.
    token_cache_mode: TokenCacheMode,
    /// Resolved normalization applied when hashing freshly tokenized files.
    normalization: fallow_config::ResolvedNormalization,
    /// Selects the cross-language tokenizer (set from `config.cross_language`).
    strip_types: bool,
    /// Forwarded to the tokenizer (set from `config.ignore_imports`).
    skip_imports: bool,
}
113
114/// Run duplication detection on the given files.
115///
116/// This is the main entry point for the duplication analysis. It:
117/// 1. Reads and tokenizes all source files in parallel
118/// 2. Normalizes tokens according to the detection mode
119/// 3. Runs suffix array + LCP clone detection
120/// 4. Groups clone instances into families with refactoring suggestions
121/// 5. Applies inline suppression filters
122pub fn find_duplicates(
123    root: &Path,
124    files: &[DiscoveredFile],
125    config: &DuplicatesConfig,
126) -> DuplicationReport {
127    find_duplicates_inner(root, files, config, None, None).report
128}
129
130/// Run duplication detection and return human-format sidecar metadata for
131/// files skipped by built-in duplicates ignores.
132pub fn find_duplicates_with_default_ignore_skips(
133    root: &Path,
134    files: &[DiscoveredFile],
135    config: &DuplicatesConfig,
136) -> (DuplicationReport, DefaultIgnoreSkips) {
137    let run = find_duplicates_inner(root, files, config, None, None);
138    (run.report, run.default_ignore_skips)
139}
140
141/// Run duplication detection with the persistent token cache enabled.
142pub fn find_duplicates_cached(
143    root: &Path,
144    files: &[DiscoveredFile],
145    config: &DuplicatesConfig,
146    cache_root: &Path,
147) -> DuplicationReport {
148    find_duplicates_inner(root, files, config, None, Some(cache_root)).report
149}
150
151/// Run cached duplication detection and return human-format sidecar metadata for
152/// files skipped by built-in duplicates ignores.
153pub fn find_duplicates_cached_with_default_ignore_skips(
154    root: &Path,
155    files: &[DiscoveredFile],
156    config: &DuplicatesConfig,
157    cache_root: &Path,
158) -> (DuplicationReport, DefaultIgnoreSkips) {
159    let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
160    (run.report, run.default_ignore_skips)
161}
162
163/// Run duplication detection and only return clone groups touching `focus_files`.
164///
165/// This keeps all files in the matching corpus, which preserves changed-file
166/// versus unchanged-file detection for diff-scoped audit runs, but avoids
167/// materializing duplicate groups that cannot appear in the scoped report.
168#[expect(
169    clippy::implicit_hasher,
170    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
171)]
172pub fn find_duplicates_touching_files(
173    root: &Path,
174    files: &[DiscoveredFile],
175    config: &DuplicatesConfig,
176    focus_files: &FxHashSet<PathBuf>,
177) -> DuplicationReport {
178    find_duplicates_inner(root, files, config, Some(focus_files), None).report
179}
180
181/// Run focused duplication detection and return human-format sidecar metadata
182/// for files skipped by built-in duplicates ignores.
183#[expect(
184    clippy::implicit_hasher,
185    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
186)]
187pub fn find_duplicates_touching_files_with_default_ignore_skips(
188    root: &Path,
189    files: &[DiscoveredFile],
190    config: &DuplicatesConfig,
191    focus_files: &FxHashSet<PathBuf>,
192) -> (DuplicationReport, DefaultIgnoreSkips) {
193    let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
194    (run.report, run.default_ignore_skips)
195}
196
197/// Run focused duplication detection with the persistent token cache enabled.
198#[expect(
199    clippy::implicit_hasher,
200    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
201)]
202pub fn find_duplicates_touching_files_cached(
203    root: &Path,
204    files: &[DiscoveredFile],
205    config: &DuplicatesConfig,
206    focus_files: &FxHashSet<PathBuf>,
207    cache_root: &Path,
208) -> DuplicationReport {
209    find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
210}
211
212/// Run cached focused duplication detection and return human-format sidecar
213/// metadata for files skipped by built-in duplicates ignores.
214#[expect(
215    clippy::implicit_hasher,
216    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
217)]
218pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
219    root: &Path,
220    files: &[DiscoveredFile],
221    config: &DuplicatesConfig,
222    focus_files: &FxHashSet<PathBuf>,
223    cache_root: &Path,
224) -> (DuplicationReport, DefaultIgnoreSkips) {
225    let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
226    (run.report, run.default_ignore_skips)
227}
228
229/// Tokenize the corpus for duplication detection: resolves normalization and
230/// cache config, tokenizes files (writing the token cache when enabled), and
231/// returns the per-file token data alongside the corpus totals.
232fn tokenize_corpus_for_duplicates(
233    root: &Path,
234    files: &[DiscoveredFile],
235    config: &DuplicatesConfig,
236    extra_ignores: Option<&IgnoreSet>,
237    default_skip_counts: &[AtomicUsize],
238    cache_root: Option<&Path>,
239) -> (Vec<TokenizedFile>, detect::CorpusTotals) {
240    let normalization =
241        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
242
243    let strip_types = config.cross_language;
244    let skip_imports = config.ignore_imports;
245
246    tracing::debug!(
247        ignore_imports = skip_imports,
248        "duplication tokenization config"
249    );
250
251    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
252    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
253    let token_cache = cache_root.map(TokenCache::load);
254
255    let file_data = tokenize_duplication_files(
256        files,
257        &DuplicationTokenizeContext {
258            root,
259            config,
260            extra_ignores,
261            default_skip_counts,
262            token_cache: token_cache.as_ref(),
263            token_cache_mode,
264            normalization,
265            strip_types,
266            skip_imports,
267        },
268    );
269
270    if let (Some(cache_root), Some(cache)) = (cache_root, token_cache) {
271        save_duplication_token_cache(cache_root, cache, files, &file_data, token_cache_mode);
272    }
273
274    tracing::info!(
275        files = file_data.len(),
276        "tokenized files for duplication analysis"
277    );
278
279    let corpus_totals = detect::CorpusTotals {
280        files: file_data.len(),
281        lines: file_data
282            .iter()
283            .map(|file| file.file_tokens.line_count)
284            .sum(),
285        tokens: file_data.iter().map(|file| file.hashed_tokens.len()).sum(),
286    };
287    (file_data, corpus_totals)
288}
289
290/// Run clone detection over tokenized files, then apply suppression and
291/// min-occurrence filters, family grouping, mirrored-directory detection, and
292/// final sorting.
293fn detect_and_postprocess(
294    root: &Path,
295    config: &DuplicatesConfig,
296    mut file_data: Vec<TokenizedFile>,
297    corpus_totals: detect::CorpusTotals,
298    focus_files: Option<&FxHashSet<PathBuf>>,
299) -> DuplicationReport {
300    if file_data.len() >= config.min_corpus_size_for_shingle_filter {
301        if let Some(focus_files) = focus_files {
302            shingle_filter::filter_to_focus_candidates(
303                &mut file_data,
304                focus_files,
305                config.min_tokens,
306            );
307        } else {
308            shingle_filter::filter_to_duplicate_candidates(&mut file_data, config.min_tokens);
309        }
310    }
311
312    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
313        .iter()
314        .filter(|file| !file.suppressions.is_empty())
315        .map(|file| (file.path.clone(), file.suppressions.clone()))
316        .collect();
317
318    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
319        file_data
320            .into_iter()
321            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
322            .collect();
323
324    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
325    let mut report = if let Some(focus_files) = focus_files {
326        detector.detect_touching_files(detector_data, focus_files)
327    } else {
328        detector.detect_with_totals(detector_data, corpus_totals)
329    };
330
331    if !suppressions_by_file.is_empty() {
332        apply_line_suppressions(&mut report, &suppressions_by_file);
333    }
334
335    apply_min_occurrences_filter(&mut report, config.min_occurrences);
336
337    report.clone_families = families::group_into_families(&report.clone_groups, root);
338    report.mirrored_directories =
339        families::detect_mirrored_directories(&report.clone_families, root);
340    report.sort();
341    report
342}
343
344fn find_duplicates_inner(
345    root: &Path,
346    files: &[DiscoveredFile],
347    config: &DuplicatesConfig,
348    focus_files: Option<&FxHashSet<PathBuf>>,
349    cache_root: Option<&Path>,
350) -> DuplicationRun {
351    let _span = tracing::info_span!("find_duplicates").entered();
352
353    let extra_ignores = build_ignore_set(config);
354    let default_skip_counts = extra_ignores
355        .as_ref()
356        .map(|ignores| {
357            std::iter::repeat_with(|| AtomicUsize::new(0))
358                .take(ignores.defaults.len())
359                .collect::<Vec<_>>()
360        })
361        .unwrap_or_default();
362
363    let (file_data, corpus_totals) = tokenize_corpus_for_duplicates(
364        root,
365        files,
366        config,
367        extra_ignores.as_ref(),
368        &default_skip_counts,
369        cache_root,
370    );
371
372    let report = detect_and_postprocess(root, config, file_data, corpus_totals, focus_files);
373
374    let default_ignore_skips =
375        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
376
377    DuplicationRun {
378        report,
379        default_ignore_skips,
380    }
381}
382
383fn tokenize_duplication_files(
384    files: &[DiscoveredFile],
385    ctx: &DuplicationTokenizeContext<'_>,
386) -> Vec<TokenizedFile> {
387    files
388        .par_iter()
389        .filter_map(|file| tokenize_duplication_file(file, ctx))
390        .collect()
391}
392
393fn tokenize_duplication_file(
394    file: &DiscoveredFile,
395    ctx: &DuplicationTokenizeContext<'_>,
396) -> Option<TokenizedFile> {
397    if should_skip_duplicate_file(file, ctx) {
398        return None;
399    }
400
401    let metadata = std::fs::metadata(&file.path).ok()?;
402    let cached_entry = ctx
403        .token_cache
404        .and_then(|cache| cache.get(&file.path, &metadata, ctx.token_cache_mode));
405    let cache_hit = cached_entry.is_some();
406    let (mut entry, suppressions) = duplication_token_cache_entry(file, ctx, cached_entry)?;
407    if entry.file_tokens.tokens.is_empty() || entry.hashed_tokens.len() < ctx.config.min_tokens {
408        return None;
409    }
410
411    Some(TokenizedFile {
412        path: file.path.clone(),
413        hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
414        file_tokens: entry.file_tokens,
415        metadata: Some(metadata),
416        cache_hit,
417        suppressions,
418    })
419}
420
421fn should_skip_duplicate_file(file: &DiscoveredFile, ctx: &DuplicationTokenizeContext<'_>) -> bool {
422    let relative = file.path.strip_prefix(ctx.root).unwrap_or(&file.path);
423    let Some(ignores) = ctx.extra_ignores else {
424        return false;
425    };
426    let mut matches = Vec::new();
427    match ignores.match_path(relative, &mut matches) {
428        Some(IgnoreMatch::Default(index)) => {
429            ctx.default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
430            true
431        }
432        Some(IgnoreMatch::User) => true,
433        None => false,
434    }
435}
436
437fn duplication_token_cache_entry(
438    file: &DiscoveredFile,
439    ctx: &DuplicationTokenizeContext<'_>,
440    cached_entry: Option<TokenCacheEntry>,
441) -> Option<(TokenCacheEntry, Vec<Suppression>)> {
442    if let Some(entry) = cached_entry {
443        let suppressions = entry.suppressions.clone();
444        if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
445            return None;
446        }
447        return Some((entry, suppressions));
448    }
449
450    let source = std::fs::read_to_string(&file.path).ok()?;
451    let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
452    if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
453        return None;
454    }
455    let file_tokens = tokenize_duplication_source(file, ctx, &source);
456    if file_tokens.tokens.is_empty() {
457        return None;
458    }
459    let hashed_tokens = normalize_and_hash_resolved(&file_tokens.tokens, ctx.normalization);
460    Some((
461        TokenCacheEntry {
462            hashed_tokens,
463            file_tokens,
464            suppressions: suppressions.clone(),
465        },
466        suppressions,
467    ))
468}
469
470fn tokenize_duplication_source(
471    file: &DiscoveredFile,
472    ctx: &DuplicationTokenizeContext<'_>,
473    source: &str,
474) -> tokenize::FileTokens {
475    if ctx.strip_types {
476        tokenize_file_cross_language(&file.path, source, true, ctx.skip_imports)
477    } else {
478        tokenize_file(&file.path, source, ctx.skip_imports)
479    }
480}
481
482fn save_duplication_token_cache(
483    cache_root: &Path,
484    mut cache: TokenCache,
485    files: &[DiscoveredFile],
486    file_data: &[TokenizedFile],
487    mode: TokenCacheMode,
488) {
489    for file in file_data {
490        if !file.cache_hit
491            && let Some(metadata) = &file.metadata
492        {
493            cache.insert(
494                &file.path,
495                metadata,
496                mode,
497                &cache::TokenPayload {
498                    hashed_tokens: &file.hashed_tokens,
499                    file_tokens: &file.file_tokens,
500                    suppressions: &file.suppressions,
501                },
502            );
503        }
504    }
505    cache.retain_paths(files);
506    match cache.save_if_dirty() {
507        Ok(true) => {
508            tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
509        }
510        Ok(false) => {
511            tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
512        }
513        Err(err) => tracing::warn!("Failed to save duplication token cache: {err}"),
514    }
515}
516
517/// Drop clone groups with fewer than `min` instances and record the count on
518/// the stats block. The detector already guarantees `>= 2`, so this is a
519/// no-op when `min <= 2`.
520///
521/// Stats split: `clone_groups` and `clone_instances` are recomputed
522/// post-filter so they match the serialized array length (a CI consumer
523/// reading `stats.clone_groups` and iterating `clone_groups[]` sees the same
524/// count). `duplication_percentage`, `duplicated_lines`, `duplicated_tokens`,
525/// and `files_with_clones` stay pre-filter so the percentage math (lines /
526/// total) stays consistent and `threshold` gates / trend lines don't shift
527/// when the filter changes. The hidden count is disclosed in
528/// `clone_groups_below_min_occurrences`. The surviving groups feed every
529/// downstream step (families, mirrored dirs, --top, baseline, changed-since,
530/// workspace scoping) so there's a single source of truth.
531fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
532    if min <= 2 {
533        return;
534    }
535    let before = report.clone_groups.len();
536    report
537        .clone_groups
538        .retain(|group| group.instances.len() >= min);
539    let hidden = before - report.clone_groups.len();
540    if hidden == 0 {
541        return;
542    }
543    report.stats.clone_groups_below_min_occurrences = hidden;
544    report.stats.clone_groups = report.clone_groups.len();
545    report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
546}
547
548/// Filter out clone instances that are suppressed by line-level comments.
549#[expect(
550    clippy::cast_possible_truncation,
551    reason = "line numbers are bounded by source size"
552)]
553fn apply_line_suppressions(
554    report: &mut DuplicationReport,
555    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
556) {
557    report.clone_groups.retain_mut(|group| {
558        group.instances.retain(|instance| {
559            if let Some(supps) = suppressions_by_file.get(&instance.file) {
560                for line in instance.start_line..=instance.end_line {
561                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
562                        return false;
563                    }
564                }
565            }
566            true
567        });
568        group.instances.len() >= 2
569    });
570}
571
572/// Run duplication detection on a project directory using auto-discovered files.
573///
574/// This is a convenience function that handles file discovery internally.
575#[must_use]
576pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
577    let resolved = crate::default_config(root);
578    let files = discover::discover_files_with_plugin_scopes(&resolved);
579    find_duplicates(root, &files, config)
580}
581
582/// Build a merged ignore set from built-in and user-provided duplicates ignores.
583#[expect(
584    clippy::expect_used,
585    reason = "duplicate ignore globs are validated before clone detection"
586)]
587fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
588    if !config.ignore_defaults && config.ignore.is_empty() {
589        return None;
590    }
591
592    let mut builder = GlobSetBuilder::new();
593    let mut defaults = Vec::new();
594
595    if config.ignore_defaults {
596        for pattern in DUPES_DEFAULT_IGNORES {
597            let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
598            defaults.push(*pattern);
599            builder.add(glob);
600        }
601    }
602
603    for pattern in &config.ignore {
604        builder.add(
605            Glob::new(pattern)
606                .expect("duplicates.ignore pattern was validated at config load time"),
607        );
608    }
609
610    builder.build().ok().map(|all| IgnoreSet { all, defaults })
611}
612
613fn build_default_ignore_skips(
614    ignores: Option<&IgnoreSet>,
615    counts: &[AtomicUsize],
616) -> DefaultIgnoreSkips {
617    let Some(ignores) = ignores else {
618        return DefaultIgnoreSkips::default();
619    };
620
621    let by_pattern = ignores
622        .defaults
623        .iter()
624        .zip(counts)
625        .filter_map(|(pattern, count)| {
626            let count = count.load(Ordering::Relaxed);
627            (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
628        })
629        .collect::<Vec<_>>();
630    let total = by_pattern.iter().map(|entry| entry.count).sum();
631
632    DefaultIgnoreSkips { total, by_pattern }
633}
634
635#[cfg(test)]
636mod tests {
637    use super::*;
638    use crate::discover::FileId;
639
640    fn ignore_set_matches(set: &IgnoreSet, path: &str) -> bool {
641        let mut matches = Vec::new();
642        set.match_path(Path::new(path), &mut matches).is_some()
643    }
644
/// An empty file list yields an empty report with zero total files.
#[test]
fn find_duplicates_empty_files() {
    let report = find_duplicates(Path::new("/tmp"), &[], &DuplicatesConfig::default());

    assert_eq!(report.stats.total_files, 0);
    assert!(report.clone_groups.is_empty());
    assert!(report.clone_families.is_empty());
}
653
/// With defaults disabled and no user patterns there is no ignore set at all.
#[test]
fn build_ignore_set_empty() {
    let without_defaults = DuplicatesConfig {
        ignore_defaults: false,
        ..DuplicatesConfig::default()
    };

    let ignore_set = build_ignore_set(&without_defaults);

    assert!(ignore_set.is_none());
}
662
/// User patterns alone produce a set that matches exactly those patterns.
#[test]
fn build_ignore_set_valid_patterns() {
    let user_patterns = ["**/*.test.ts", "**/*.spec.ts"];
    let config = DuplicatesConfig {
        ignore_defaults: false,
        ignore: user_patterns.into_iter().map(String::from).collect(),
        ..DuplicatesConfig::default()
    };

    let set = build_ignore_set(&config);
    assert!(set.is_some());
    let set = set.unwrap();

    for ignored in ["src/foo.test.ts", "src/bar.spec.ts"] {
        assert!(ignore_set_matches(&set, ignored));
    }
    assert!(!ignore_set_matches(&set, "src/baz.ts"));
}
677
/// With defaults enabled, both built-in and user patterns are active.
#[test]
fn build_ignore_set_merges_defaults_with_user_patterns() {
    let config = DuplicatesConfig {
        ignore: vec![String::from("**/foo/**")],
        ..DuplicatesConfig::default()
    };
    let set = build_ignore_set(&config).expect("ignore set");

    let ignored_paths = [
        // Matched by built-in defaults.
        ".next/static/chunks/app.js",
        "src/foo.test.ts",
        "src/foo.spec.tsx",
        "src/__tests__/foo.ts",
        "src/__mocks__/foo.ts",
        // Matched by the user pattern.
        "src/foo/generated.js",
    ];
    for path in ignored_paths {
        assert!(ignore_set_matches(&set, path));
    }
}
692
/// With defaults disabled, only the user pattern is active.
#[test]
fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
    let config = DuplicatesConfig {
        ignore_defaults: false,
        ignore: vec![String::from("**/foo/**")],
        ..DuplicatesConfig::default()
    };
    let set = build_ignore_set(&config).expect("ignore set");

    // Paths that only the built-in defaults would have ignored.
    let default_only_paths = [
        ".next/static/chunks/app.js",
        "src/foo.test.ts",
        "src/foo.spec.tsx",
        "src/__tests__/foo.ts",
        "src/__mocks__/foo.ts",
    ];
    for path in default_only_paths {
        assert!(!ignore_set_matches(&set, path));
    }

    assert!(ignore_set_matches(&set, "src/foo/generated.js"));
}
708
/// Two identical test files under `__tests__` are skipped by the built-in
/// ignores: no clone groups are reported and both skips are counted.
#[test]
fn default_ignores_skip_duplicate_test_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let tests_dir = workspace.path().join("src").join("__tests__");
    std::fs::create_dir_all(&tests_dir).expect("create tests dir");

    let code = r#"
export function repeatedTestHelper(input: string): string {
    const trimmed = input.trim();
    const lowered = trimmed.toLowerCase();
    const compact = lowered.replaceAll(" ", "-");
    return compact;
}
"#;
    let first = tests_dir.join("first.test.ts");
    let second = tests_dir.join("second.test.ts");
    std::fs::write(&first, code).expect("write first");
    std::fs::write(&second, code).expect("write second");

    let size_bytes = code.len() as u64;
    let files = vec![
        DiscoveredFile {
            id: FileId(0),
            path: first,
            size_bytes,
        },
        DiscoveredFile {
            id: FileId(1),
            path: second,
            size_bytes,
        },
    ];
    let config = DuplicatesConfig {
        min_tokens: 5,
        min_lines: 2,
        ..DuplicatesConfig::default()
    };

    let (report, skips) =
        find_duplicates_with_default_ignore_skips(workspace.path(), &files, &config);

    assert!(report.clone_groups.is_empty());
    assert_eq!(skips.total, 2);
}
752
/// With `ignore_defaults` turned off, identical test files are analyzed again
/// and produce clone groups.
#[test]
fn ignore_defaults_false_restores_duplicate_test_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let tests_dir = workspace.path().join("src").join("__tests__");
    std::fs::create_dir_all(&tests_dir).expect("create tests dir");

    let code = r#"
export function repeatedTestHelper(input: string): string {
    const trimmed = input.trim();
    const lowered = trimmed.toLowerCase();
    const compact = lowered.replaceAll(" ", "-");
    return compact;
}
"#;
    let first = tests_dir.join("first.test.ts");
    let second = tests_dir.join("second.test.ts");
    std::fs::write(&first, code).expect("write first");
    std::fs::write(&second, code).expect("write second");

    let size_bytes = code.len() as u64;
    let files = vec![
        DiscoveredFile {
            id: FileId(0),
            path: first,
            size_bytes,
        },
        DiscoveredFile {
            id: FileId(1),
            path: second,
            size_bytes,
        },
    ];
    let config = DuplicatesConfig {
        min_tokens: 5,
        min_lines: 2,
        ignore_defaults: false,
        ..DuplicatesConfig::default()
    };

    let report = find_duplicates(workspace.path(), &files, &config);

    assert!(!report.clone_groups.is_empty());
}
795
/// Two byte-identical source files on disk produce clone groups, are both
/// counted as files with clones, and are grouped into at least one family.
#[test]
fn find_duplicates_with_real_files() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let src_dir = workspace.path().join("src");
    std::fs::create_dir_all(&src_dir).expect("create src dir");

    let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

    let original_path = src_dir.join("original.ts");
    let copy_path = src_dir.join("copy.ts");
    std::fs::write(&original_path, code).expect("write original");
    std::fs::write(&copy_path, code).expect("write copy");
    std::fs::write(workspace.path().join("package.json"), r#"{"name": "test"}"#)
        .expect("write package.json");

    let size_bytes = code.len() as u64;
    let files = vec![
        DiscoveredFile {
            id: FileId(0),
            path: original_path,
            size_bytes,
        },
        DiscoveredFile {
            id: FileId(1),
            path: copy_path,
            size_bytes,
        },
    ];

    let config = DuplicatesConfig {
        min_tokens: 10,
        min_lines: 2,
        ..DuplicatesConfig::default()
    };

    let report = find_duplicates(workspace.path(), &files, &config);

    assert!(
        !report.clone_groups.is_empty(),
        "Should detect clones in identical files"
    );
    assert!(report.stats.files_with_clones >= 2);
    assert!(
        !report.clone_families.is_empty(),
        "Should group clones into families"
    );
}
862
/// With the shingle prefilter forced on (threshold of 1), the report still
/// counts all three files and more total tokens than duplicated tokens.
#[test]
fn global_shingle_prefilter_preserves_corpus_totals() {
    let workspace = tempfile::tempdir().expect("create temp dir");
    let src_dir = workspace.path().join("src");
    std::fs::create_dir_all(&src_dir).expect("create src dir");

    let duplicated = r#"
export function normalizeUser(input: string): string {
    const trimmed = input.trim();
    const lowered = trimmed.toLowerCase();
    const compact = lowered.replaceAll(" ", "-");
    return compact;
}
"#;
    let unique = r#"
export function renderInvoice(id: string): string {
    const prefix = "invoice";
    const suffix = id.padStart(6, "0");
    return `${prefix}:${suffix}`;
}
"#;

    let original_path = src_dir.join("original.ts");
    let copy_path = src_dir.join("copy.ts");
    let unique_path = src_dir.join("unique.ts");
    std::fs::write(&original_path, duplicated).expect("write original");
    std::fs::write(&copy_path, duplicated).expect("write copy");
    std::fs::write(&unique_path, unique).expect("write unique");

    let duplicated_size = duplicated.len() as u64;
    let unique_size = unique.len() as u64;
    let files = vec![
        DiscoveredFile {
            id: FileId(0),
            path: original_path,
            size_bytes: duplicated_size,
        },
        DiscoveredFile {
            id: FileId(1),
            path: copy_path,
            size_bytes: duplicated_size,
        },
        DiscoveredFile {
            id: FileId(2),
            path: unique_path,
            size_bytes: unique_size,
        },
    ];
    let config = DuplicatesConfig {
        min_tokens: 5,
        min_lines: 2,
        min_corpus_size_for_shingle_filter: 1,
        ..DuplicatesConfig::default()
    };

    let report = find_duplicates(workspace.path(), &files, &config);

    assert!(!report.clone_groups.is_empty());
    assert_eq!(report.stats.total_files, 3);
    assert!(report.stats.total_tokens > report.stats.duplicated_tokens);
}
922
923    #[test]
924    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
925        let dir = tempfile::tempdir().expect("create temp dir");
926        let src_dir = dir.path().join("src");
927        std::fs::create_dir_all(&src_dir).expect("create src dir");
928
929        let code = "export function same(input: number): number {\n  const doubled = input * 2;\n  return doubled + 1;\n}\n";
930        let first = src_dir.join("first.ts");
931        let second = src_dir.join("second.ts");
932        std::fs::write(&first, code).expect("write first");
933        std::fs::write(&second, code).expect("write second");
934
935        let files = vec![
936            DiscoveredFile {
937                id: FileId(0),
938                path: first,
939                size_bytes: code.len() as u64,
940            },
941            DiscoveredFile {
942                id: FileId(1),
943                path: second,
944                size_bytes: code.len() as u64,
945            },
946        ];
947        let config = DuplicatesConfig {
948            min_tokens: 5,
949            min_lines: 2,
950            ..DuplicatesConfig::default()
951        };
952        let cache_root = dir.path().join(".fallow");
953
954        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
955
956        assert!(!report.clone_groups.is_empty());
957        assert!(
958            !cache_root.exists(),
959            "small projects should avoid token-cache IO overhead"
960        );
961    }
962
963    #[test]
964    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
965        let dir = tempfile::tempdir().expect("create temp dir");
966        let src_dir = dir.path().join("src");
967        std::fs::create_dir_all(&src_dir).expect("create src dir");
968
969        let focused_code = r"
970export function focused(input: number): number {
971    const doubled = input * 2;
972    const shifted = doubled + 10;
973    return shifted / 2;
974}
975";
976        let untouched_code = r#"
977export function untouched(input: string): string {
978    const lowered = input.toLowerCase();
979    const padded = lowered.padStart(10, "x");
980    return padded.slice(0, 8);
981}
982"#;
983
984        let changed_path = src_dir.join("changed.ts");
985        let focused_copy_path = src_dir.join("focused-copy.ts");
986        let untouched_a_path = src_dir.join("untouched-a.ts");
987        let untouched_b_path = src_dir.join("untouched-b.ts");
988        std::fs::write(&changed_path, focused_code).expect("write changed");
989        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
990        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
991        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
992
993        let files = vec![
994            DiscoveredFile {
995                id: FileId(0),
996                path: changed_path.clone(),
997                size_bytes: focused_code.len() as u64,
998            },
999            DiscoveredFile {
1000                id: FileId(1),
1001                path: focused_copy_path,
1002                size_bytes: focused_code.len() as u64,
1003            },
1004            DiscoveredFile {
1005                id: FileId(2),
1006                path: untouched_a_path,
1007                size_bytes: untouched_code.len() as u64,
1008            },
1009            DiscoveredFile {
1010                id: FileId(3),
1011                path: untouched_b_path,
1012                size_bytes: untouched_code.len() as u64,
1013            },
1014        ];
1015
1016        let config = DuplicatesConfig {
1017            mode: DetectionMode::Strict,
1018            min_tokens: 5,
1019            min_lines: 2,
1020            min_corpus_size_for_shingle_filter: 1,
1021            ..DuplicatesConfig::default()
1022        };
1023        let mut focus = FxHashSet::default();
1024        focus.insert(changed_path.clone());
1025
1026        let full_report = find_duplicates(dir.path(), &files, &config);
1027        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
1028        let expected_touching = full_report
1029            .clone_groups
1030            .iter()
1031            .filter(|group| {
1032                group
1033                    .instances
1034                    .iter()
1035                    .any(|instance| instance.file == changed_path)
1036            })
1037            .count();
1038
1039        assert!(
1040            !report.clone_groups.is_empty(),
1041            "focused file should still match an unchanged duplicate"
1042        );
1043        assert_eq!(
1044            report.clone_groups.len(),
1045            expected_touching,
1046            "focused shingle filtering must not drop clone groups touching the focused file"
1047        );
1048        assert!(report.clone_groups.iter().all(|group| {
1049            group
1050                .instances
1051                .iter()
1052                .any(|instance| instance.file == changed_path)
1053        }));
1054    }
1055
1056    #[test]
1057    fn file_wide_suppression_excludes_file() {
1058        let dir = tempfile::tempdir().expect("create temp dir");
1059        let src_dir = dir.path().join("src");
1060        std::fs::create_dir_all(&src_dir).expect("create src dir");
1061
1062        let code = r#"
1063export function processData(input: string): string {
1064    const trimmed = input.trim();
1065    if (trimmed.length === 0) {
1066        return "";
1067    }
1068    const parts = trimmed.split(",");
1069    const filtered = parts.filter(p => p.length > 0);
1070    const mapped = filtered.map(p => p.toUpperCase());
1071    return mapped.join(", ");
1072}
1073"#;
1074        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");
1075
1076        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
1077        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
1078        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
1079            .expect("write package.json");
1080
1081        let files = vec![
1082            DiscoveredFile {
1083                id: FileId(0),
1084                path: src_dir.join("original.ts"),
1085                size_bytes: code.len() as u64,
1086            },
1087            DiscoveredFile {
1088                id: FileId(1),
1089                path: src_dir.join("suppressed.ts"),
1090                size_bytes: suppressed_code.len() as u64,
1091            },
1092        ];
1093
1094        let config = DuplicatesConfig {
1095            min_tokens: 10,
1096            min_lines: 2,
1097            ..DuplicatesConfig::default()
1098        };
1099
1100        let report = find_duplicates(dir.path(), &files, &config);
1101        assert!(
1102            report.clone_groups.is_empty(),
1103            "File-wide suppression should exclude file from duplication analysis"
1104        );
1105    }
1106
1107    #[test]
1108    #[expect(
1109        clippy::too_many_lines,
1110        reason = "test fixture; linear setup/assert, length is not a maintainability concern"
1111    )]
1112    fn min_occurrences_hides_pairs_and_records_count() {
1113        let dir = tempfile::tempdir().expect("create temp dir");
1114        let src_dir = dir.path().join("src");
1115        std::fs::create_dir_all(&src_dir).expect("create src dir");
1116
1117        let block_a = r#"
1118export function blockA(input: string): string {
1119    const trimmed = input.trim();
1120    if (trimmed.length === 0) {
1121        return "";
1122    }
1123    const parts = trimmed.split(",");
1124    const filtered = parts.filter(p => p.length > 0);
1125    const mapped = filtered.map(p => p.toUpperCase());
1126    return mapped.join(", ");
1127}
1128"#;
1129        let block_b = r"
1130export function blockB(value: number): number {
1131    if (value <= 0) {
1132        return 0;
1133    }
1134    let total = 0;
1135    for (let i = 1; i <= value; i += 1) {
1136        total += i * 2;
1137        total -= 1;
1138    }
1139    return total + 7;
1140}
1141";
1142
1143        let pair_a1 = src_dir.join("pair-a1.ts");
1144        let pair_a2 = src_dir.join("pair-a2.ts");
1145        let triple_b1 = src_dir.join("triple-b1.ts");
1146        let triple_b2 = src_dir.join("triple-b2.ts");
1147        let triple_b3 = src_dir.join("triple-b3.ts");
1148        std::fs::write(&pair_a1, block_a).expect("write");
1149        std::fs::write(&pair_a2, block_a).expect("write");
1150        std::fs::write(&triple_b1, block_b).expect("write");
1151        std::fs::write(&triple_b2, block_b).expect("write");
1152        std::fs::write(&triple_b3, block_b).expect("write");
1153
1154        let files = vec![
1155            DiscoveredFile {
1156                id: FileId(0),
1157                path: pair_a1,
1158                size_bytes: block_a.len() as u64,
1159            },
1160            DiscoveredFile {
1161                id: FileId(1),
1162                path: pair_a2,
1163                size_bytes: block_a.len() as u64,
1164            },
1165            DiscoveredFile {
1166                id: FileId(2),
1167                path: triple_b1,
1168                size_bytes: block_b.len() as u64,
1169            },
1170            DiscoveredFile {
1171                id: FileId(3),
1172                path: triple_b2,
1173                size_bytes: block_b.len() as u64,
1174            },
1175            DiscoveredFile {
1176                id: FileId(4),
1177                path: triple_b3,
1178                size_bytes: block_b.len() as u64,
1179            },
1180        ];
1181
1182        let default_config = DuplicatesConfig {
1183            min_tokens: 10,
1184            min_lines: 2,
1185            ..DuplicatesConfig::default()
1186        };
1187        let baseline = find_duplicates(dir.path(), &files, &default_config);
1188        assert_eq!(
1189            baseline.clone_groups.len(),
1190            2,
1191            "default minOccurrences should report both the pair and the triple"
1192        );
1193        assert_eq!(
1194            baseline.stats.clone_groups_below_min_occurrences, 0,
1195            "default minOccurrences hides nothing"
1196        );
1197        let baseline_pct = baseline.stats.duplication_percentage;
1198
1199        let raised_config = DuplicatesConfig {
1200            min_tokens: 10,
1201            min_lines: 2,
1202            min_occurrences: 3,
1203            ..DuplicatesConfig::default()
1204        };
1205        let report = find_duplicates(dir.path(), &files, &raised_config);
1206        assert_eq!(
1207            report.clone_groups.len(),
1208            1,
1209            "minOccurrences=3 should hide the 2-instance group"
1210        );
1211        assert_eq!(
1212            report.clone_groups[0].instances.len(),
1213            3,
1214            "surviving group must be the 3-instance group"
1215        );
1216        assert_eq!(
1217            report.stats.clone_groups_below_min_occurrences, 1,
1218            "the hidden 2-instance group must be counted"
1219        );
1220        assert_eq!(
1221            report.stats.clone_groups, 1,
1222            "stats.clone_groups must match the post-filter array length"
1223        );
1224        assert_eq!(
1225            report.stats.clone_instances, 3,
1226            "stats.clone_instances must match the surviving instance total"
1227        );
1228        assert!(
1229            (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
1230            "duplication_percentage should not shift when minOccurrences changes"
1231        );
1232    }
1233
1234    #[test]
1235    fn min_occurrences_evaluates_after_line_suppressions() {
1236        let dir = tempfile::tempdir().expect("create temp dir");
1237        let src_dir = dir.path().join("src");
1238        std::fs::create_dir_all(&src_dir).expect("create src dir");
1239
1240        let block = r#"
1241export function shared(input: string): string {
1242    const trimmed = input.trim();
1243    if (trimmed.length === 0) {
1244        return "";
1245    }
1246    const parts = trimmed.split(",");
1247    const filtered = parts.filter(p => p.length > 0);
1248    const mapped = filtered.map(p => p.toUpperCase());
1249    return mapped.join(", ");
1250}
1251"#;
1252        let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");
1253
1254        let a = src_dir.join("a.ts");
1255        let b = src_dir.join("b.ts");
1256        let c = src_dir.join("c.ts");
1257        std::fs::write(&a, block).expect("write a");
1258        std::fs::write(&b, block).expect("write b");
1259        std::fs::write(&c, &suppressed).expect("write c");
1260
1261        let files = vec![
1262            DiscoveredFile {
1263                id: FileId(0),
1264                path: a,
1265                size_bytes: block.len() as u64,
1266            },
1267            DiscoveredFile {
1268                id: FileId(1),
1269                path: b,
1270                size_bytes: block.len() as u64,
1271            },
1272            DiscoveredFile {
1273                id: FileId(2),
1274                path: c,
1275                size_bytes: suppressed.len() as u64,
1276            },
1277        ];
1278
1279        let config = DuplicatesConfig {
1280            min_tokens: 10,
1281            min_lines: 2,
1282            min_occurrences: 3,
1283            ..DuplicatesConfig::default()
1284        };
1285        let report = find_duplicates(dir.path(), &files, &config);
1286        assert!(
1287            report.clone_groups.is_empty(),
1288            "post-suppression 2-instance group must be hidden by minOccurrences=3, \
1289             got groups: {:?}",
1290            report
1291                .clone_groups
1292                .iter()
1293                .map(|g| g.instances.len())
1294                .collect::<Vec<_>>()
1295        );
1296        assert_eq!(
1297            report.stats.clone_groups, 0,
1298            "stats.clone_groups must match the empty post-filter array"
1299        );
1300        assert_eq!(
1301            report.stats.clone_instances, 0,
1302            "stats.clone_instances must match the empty post-filter array"
1303        );
1304    }
1305}