Skip to main content

fallow_core/duplicates/
mod.rs

1//! Code duplication / clone detection module.
2//!
3//! This module implements suffix array + LCP based clone detection
4//! for TypeScript/JavaScript source files. It supports multiple detection
5//! modes from strict (exact matches only) to semantic (structure-aware
6//! matching that ignores identifier names and literal values).
7
8mod cache;
9pub mod deepdive;
10pub mod detect;
11pub mod families;
12pub mod normalize;
13mod shingle_filter;
14pub mod token_types;
15mod token_visitor;
16pub mod tokenize;
17pub(crate) mod types;
18
19use rustc_hash::FxHashMap;
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
24use rayon::prelude::*;
25use rustc_hash::FxHashSet;
26
27use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
28pub use deepdive::{
29    CloneFingerprintKey, CloneFingerprintSet, FINGERPRINT_PREFIX, clone_fingerprint,
30    dominant_identifier, fingerprint_for_fragment, group_refactoring_suggestion,
31};
32use detect::CloneDetector;
33use normalize::normalize_and_hash_resolved;
34use tokenize::{tokenize_file, tokenize_file_cross_language};
35pub use types::{
36    CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
37    DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
38    RefactoringKind, RefactoringSuggestion,
39};
40
41use crate::discover::{self, DiscoveredFile};
42use crate::suppress::{self, IssueKind, Suppression};
43
/// Built-in duplicates ignores for generated framework and tool output.
///
/// These are engine policy defaults, not config-file defaults: `duplicates.ignore`
/// stays empty in round-tripped configs, while the analyzer merges these patterns
/// unless `duplicates.ignoreDefaults` is set to `false`.
///
/// Each entry is a glob. Files are matched against their path relative to the
/// analysis root (or against the full path when the file does not sit under the
/// root), and skips caused by these patterns are counted per pattern.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
60
/// Per-file result of the tokenization stage, carried into clone detection
/// and (when enabled) into the token cache.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the source file, as given by file discovery.
    path: PathBuf,
    /// Normalized and hashed tokens handed to the clone detector.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Raw tokenizer output for the file.
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata captured before tokenizing; needed when the entry
    /// is written to the token cache.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the tokens came from the token cache, in which case the
    /// entry is not re-inserted on save.
    cache_hit: bool,
    /// Suppression comments found in the file.
    suppressions: Vec<Suppression>,
}
70
/// Compiled ignore patterns used to exclude files from duplication analysis.
struct IgnoreSet {
    /// Every active pattern (built-in defaults plus user patterns) in one set.
    all: GlobSet,
    /// Built-in default patterns, each paired with its own matcher so a skip
    /// can be attributed to the specific pattern that caused it.
    defaults: Vec<(&'static str, GlobMatcher)>,
}
75
76impl IgnoreSet {
77    fn is_match(&self, path: &Path) -> bool {
78        self.all.is_match(path)
79    }
80
81    fn default_match_index(&self, path: &Path) -> Option<usize> {
82        self.defaults
83            .iter()
84            .position(|(_, matcher)| matcher.is_match(path))
85    }
86}
87
/// Everything produced by one duplication analysis run.
struct DuplicationRun {
    /// The duplication report returned to callers.
    report: DuplicationReport,
    /// Sidecar metadata about files skipped by the built-in default ignores.
    default_ignore_skips: DefaultIgnoreSkips,
}
92
/// Shared, read-only inputs for tokenizing files in parallel.
struct DuplicationTokenizeContext<'a> {
    /// Analysis root; file paths are made relative to it before ignore matching.
    root: &'a Path,
    /// Duplicates configuration (the tokenize stage reads `min_tokens` from it).
    config: &'a DuplicatesConfig,
    /// Compiled ignore patterns, or `None` when no ignores are active.
    extra_ignores: Option<&'a IgnoreSet>,
    /// One counter per built-in default pattern, incremented whenever that
    /// pattern causes a file to be skipped.
    default_skip_counts: &'a [AtomicUsize],
    /// Token cache to consult, when caching is enabled for this run.
    token_cache: Option<&'a TokenCache>,
    /// Cache mode passed to cache lookups for this run.
    token_cache_mode: TokenCacheMode,
    /// Resolved normalization settings applied when hashing tokens.
    normalization: fallow_config::ResolvedNormalization,
    /// When `true`, files are tokenized with the cross-language tokenizer.
    strip_types: bool,
    /// Forwarded to the tokenizer as its skip-imports flag.
    skip_imports: bool,
}
104
105/// Run duplication detection on the given files.
106///
107/// This is the main entry point for the duplication analysis. It:
108/// 1. Reads and tokenizes all source files in parallel
109/// 2. Normalizes tokens according to the detection mode
110/// 3. Runs suffix array + LCP clone detection
111/// 4. Groups clone instances into families with refactoring suggestions
112/// 5. Applies inline suppression filters
113pub fn find_duplicates(
114    root: &Path,
115    files: &[DiscoveredFile],
116    config: &DuplicatesConfig,
117) -> DuplicationReport {
118    find_duplicates_inner(root, files, config, None, None).report
119}
120
121/// Run duplication detection and return human-format sidecar metadata for
122/// files skipped by built-in duplicates ignores.
123pub fn find_duplicates_with_default_ignore_skips(
124    root: &Path,
125    files: &[DiscoveredFile],
126    config: &DuplicatesConfig,
127) -> (DuplicationReport, DefaultIgnoreSkips) {
128    let run = find_duplicates_inner(root, files, config, None, None);
129    (run.report, run.default_ignore_skips)
130}
131
132/// Run duplication detection with the persistent token cache enabled.
133pub fn find_duplicates_cached(
134    root: &Path,
135    files: &[DiscoveredFile],
136    config: &DuplicatesConfig,
137    cache_root: &Path,
138) -> DuplicationReport {
139    find_duplicates_inner(root, files, config, None, Some(cache_root)).report
140}
141
142/// Run cached duplication detection and return human-format sidecar metadata for
143/// files skipped by built-in duplicates ignores.
144pub fn find_duplicates_cached_with_default_ignore_skips(
145    root: &Path,
146    files: &[DiscoveredFile],
147    config: &DuplicatesConfig,
148    cache_root: &Path,
149) -> (DuplicationReport, DefaultIgnoreSkips) {
150    let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
151    (run.report, run.default_ignore_skips)
152}
153
154/// Run duplication detection and only return clone groups touching `focus_files`.
155///
156/// This keeps all files in the matching corpus, which preserves changed-file
157/// versus unchanged-file detection for diff-scoped audit runs, but avoids
158/// materializing duplicate groups that cannot appear in the scoped report.
159#[expect(
160    clippy::implicit_hasher,
161    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
162)]
163pub fn find_duplicates_touching_files(
164    root: &Path,
165    files: &[DiscoveredFile],
166    config: &DuplicatesConfig,
167    focus_files: &FxHashSet<PathBuf>,
168) -> DuplicationReport {
169    find_duplicates_inner(root, files, config, Some(focus_files), None).report
170}
171
172/// Run focused duplication detection and return human-format sidecar metadata
173/// for files skipped by built-in duplicates ignores.
174#[expect(
175    clippy::implicit_hasher,
176    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
177)]
178pub fn find_duplicates_touching_files_with_default_ignore_skips(
179    root: &Path,
180    files: &[DiscoveredFile],
181    config: &DuplicatesConfig,
182    focus_files: &FxHashSet<PathBuf>,
183) -> (DuplicationReport, DefaultIgnoreSkips) {
184    let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
185    (run.report, run.default_ignore_skips)
186}
187
188/// Run focused duplication detection with the persistent token cache enabled.
189#[expect(
190    clippy::implicit_hasher,
191    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
192)]
193pub fn find_duplicates_touching_files_cached(
194    root: &Path,
195    files: &[DiscoveredFile],
196    config: &DuplicatesConfig,
197    focus_files: &FxHashSet<PathBuf>,
198    cache_root: &Path,
199) -> DuplicationReport {
200    find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
201}
202
203/// Run cached focused duplication detection and return human-format sidecar
204/// metadata for files skipped by built-in duplicates ignores.
205#[expect(
206    clippy::implicit_hasher,
207    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
208)]
209pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
210    root: &Path,
211    files: &[DiscoveredFile],
212    config: &DuplicatesConfig,
213    focus_files: &FxHashSet<PathBuf>,
214    cache_root: &Path,
215) -> (DuplicationReport, DefaultIgnoreSkips) {
216    let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
217    (run.report, run.default_ignore_skips)
218}
219
220fn find_duplicates_inner(
221    root: &Path,
222    files: &[DiscoveredFile],
223    config: &DuplicatesConfig,
224    focus_files: Option<&FxHashSet<PathBuf>>,
225    cache_root: Option<&Path>,
226) -> DuplicationRun {
227    let _span = tracing::info_span!("find_duplicates").entered();
228
229    let extra_ignores = build_ignore_set(config);
230    let default_skip_counts = extra_ignores
231        .as_ref()
232        .map(|ignores| {
233            std::iter::repeat_with(|| AtomicUsize::new(0))
234                .take(ignores.defaults.len())
235                .collect::<Vec<_>>()
236        })
237        .unwrap_or_default();
238
239    let normalization =
240        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
241
242    let strip_types = config.cross_language;
243    let skip_imports = config.ignore_imports;
244
245    tracing::debug!(
246        ignore_imports = skip_imports,
247        "duplication tokenization config"
248    );
249
250    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
251    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
252    let token_cache = cache_root.map(TokenCache::load);
253
254    let mut file_data = tokenize_duplication_files(
255        files,
256        &DuplicationTokenizeContext {
257            root,
258            config,
259            extra_ignores: extra_ignores.as_ref(),
260            default_skip_counts: &default_skip_counts,
261            token_cache: token_cache.as_ref(),
262            token_cache_mode,
263            normalization,
264            strip_types,
265            skip_imports,
266        },
267    );
268
269    if let (Some(cache_root), Some(cache)) = (cache_root, token_cache) {
270        save_duplication_token_cache(cache_root, cache, files, &file_data, token_cache_mode);
271    }
272
273    tracing::info!(
274        files = file_data.len(),
275        "tokenized files for duplication analysis"
276    );
277
278    if let Some(focus_files) = focus_files
279        && file_data.len() >= config.min_corpus_size_for_shingle_filter
280    {
281        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
282    }
283
284    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
285        .iter()
286        .filter(|file| !file.suppressions.is_empty())
287        .map(|file| (file.path.clone(), file.suppressions.clone()))
288        .collect();
289
290    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
291        file_data
292            .into_iter()
293            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
294            .collect();
295
296    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
297    let mut report = if let Some(focus_files) = focus_files {
298        detector.detect_touching_files(detector_data, focus_files)
299    } else {
300        detector.detect(detector_data)
301    };
302
303    if !suppressions_by_file.is_empty() {
304        apply_line_suppressions(&mut report, &suppressions_by_file);
305    }
306
307    apply_min_occurrences_filter(&mut report, config.min_occurrences);
308
309    let default_ignore_skips =
310        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
311
312    report.clone_families = families::group_into_families(&report.clone_groups, root);
313
314    report.mirrored_directories =
315        families::detect_mirrored_directories(&report.clone_families, root);
316
317    report.sort();
318
319    DuplicationRun {
320        report,
321        default_ignore_skips,
322    }
323}
324
325fn tokenize_duplication_files(
326    files: &[DiscoveredFile],
327    ctx: &DuplicationTokenizeContext<'_>,
328) -> Vec<TokenizedFile> {
329    files
330        .par_iter()
331        .filter_map(|file| tokenize_duplication_file(file, ctx))
332        .collect()
333}
334
335fn tokenize_duplication_file(
336    file: &DiscoveredFile,
337    ctx: &DuplicationTokenizeContext<'_>,
338) -> Option<TokenizedFile> {
339    if should_skip_duplicate_file(file, ctx) {
340        return None;
341    }
342
343    let metadata = std::fs::metadata(&file.path).ok()?;
344    let cached_entry = ctx
345        .token_cache
346        .and_then(|cache| cache.get(&file.path, &metadata, ctx.token_cache_mode));
347    let cache_hit = cached_entry.is_some();
348    let (mut entry, suppressions) = duplication_token_cache_entry(file, ctx, cached_entry)?;
349    if entry.file_tokens.tokens.is_empty() || entry.hashed_tokens.len() < ctx.config.min_tokens {
350        return None;
351    }
352
353    Some(TokenizedFile {
354        path: file.path.clone(),
355        hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
356        file_tokens: entry.file_tokens,
357        metadata: Some(metadata),
358        cache_hit,
359        suppressions,
360    })
361}
362
363fn should_skip_duplicate_file(file: &DiscoveredFile, ctx: &DuplicationTokenizeContext<'_>) -> bool {
364    let relative = file.path.strip_prefix(ctx.root).unwrap_or(&file.path);
365    let Some(ignores) = ctx.extra_ignores else {
366        return false;
367    };
368    if let Some(index) = ignores.default_match_index(relative) {
369        ctx.default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
370        return true;
371    }
372    ignores.is_match(relative)
373}
374
375fn duplication_token_cache_entry(
376    file: &DiscoveredFile,
377    ctx: &DuplicationTokenizeContext<'_>,
378    cached_entry: Option<TokenCacheEntry>,
379) -> Option<(TokenCacheEntry, Vec<Suppression>)> {
380    if let Some(entry) = cached_entry {
381        let suppressions = entry.suppressions.clone();
382        if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
383            return None;
384        }
385        return Some((entry, suppressions));
386    }
387
388    let source = std::fs::read_to_string(&file.path).ok()?;
389    let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
390    if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
391        return None;
392    }
393    let file_tokens = tokenize_duplication_source(file, ctx, &source);
394    if file_tokens.tokens.is_empty() {
395        return None;
396    }
397    let hashed_tokens = normalize_and_hash_resolved(&file_tokens.tokens, ctx.normalization);
398    Some((
399        TokenCacheEntry {
400            hashed_tokens,
401            file_tokens,
402            suppressions: suppressions.clone(),
403        },
404        suppressions,
405    ))
406}
407
408fn tokenize_duplication_source(
409    file: &DiscoveredFile,
410    ctx: &DuplicationTokenizeContext<'_>,
411    source: &str,
412) -> tokenize::FileTokens {
413    if ctx.strip_types {
414        tokenize_file_cross_language(&file.path, source, true, ctx.skip_imports)
415    } else {
416        tokenize_file(&file.path, source, ctx.skip_imports)
417    }
418}
419
420fn save_duplication_token_cache(
421    cache_root: &Path,
422    mut cache: TokenCache,
423    files: &[DiscoveredFile],
424    file_data: &[TokenizedFile],
425    mode: TokenCacheMode,
426) {
427    for file in file_data {
428        if !file.cache_hit
429            && let Some(metadata) = &file.metadata
430        {
431            cache.insert(
432                &file.path,
433                metadata,
434                mode,
435                &file.hashed_tokens,
436                &file.file_tokens,
437                &file.suppressions,
438            );
439        }
440    }
441    cache.retain_paths(files);
442    match cache.save_if_dirty() {
443        Ok(true) => {
444            tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
445        }
446        Ok(false) => {
447            tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
448        }
449        Err(err) => tracing::warn!("Failed to save duplication token cache: {err}"),
450    }
451}
452
453/// Drop clone groups with fewer than `min` instances and record the count on
454/// the stats block. The detector already guarantees `>= 2`, so this is a
455/// no-op when `min <= 2`.
456///
457/// Stats split: `clone_groups` and `clone_instances` are recomputed
458/// post-filter so they match the serialized array length (a CI consumer
459/// reading `stats.clone_groups` and iterating `clone_groups[]` sees the same
460/// count). `duplication_percentage`, `duplicated_lines`, `duplicated_tokens`,
461/// and `files_with_clones` stay pre-filter so the percentage math (lines /
462/// total) stays consistent and `threshold` gates / trend lines don't shift
463/// when the filter changes. The hidden count is disclosed in
464/// `clone_groups_below_min_occurrences`. The surviving groups feed every
465/// downstream step (families, mirrored dirs, --top, baseline, changed-since,
466/// workspace scoping) so there's a single source of truth.
467fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
468    if min <= 2 {
469        return;
470    }
471    let before = report.clone_groups.len();
472    report
473        .clone_groups
474        .retain(|group| group.instances.len() >= min);
475    let hidden = before - report.clone_groups.len();
476    if hidden == 0 {
477        return;
478    }
479    report.stats.clone_groups_below_min_occurrences = hidden;
480    report.stats.clone_groups = report.clone_groups.len();
481    report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
482}
483
484/// Filter out clone instances that are suppressed by line-level comments.
485#[expect(
486    clippy::cast_possible_truncation,
487    reason = "line numbers are bounded by source size"
488)]
489fn apply_line_suppressions(
490    report: &mut DuplicationReport,
491    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
492) {
493    report.clone_groups.retain_mut(|group| {
494        group.instances.retain(|instance| {
495            if let Some(supps) = suppressions_by_file.get(&instance.file) {
496                for line in instance.start_line..=instance.end_line {
497                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
498                        return false;
499                    }
500                }
501            }
502            true
503        });
504        group.instances.len() >= 2
505    });
506}
507
508/// Run duplication detection on a project directory using auto-discovered files.
509///
510/// This is a convenience function that handles file discovery internally.
511#[must_use]
512pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
513    let resolved = crate::default_config(root);
514    let files = discover::discover_files_with_plugin_scopes(&resolved);
515    find_duplicates(root, &files, config)
516}
517
518/// Build a merged ignore set from built-in and user-provided duplicates ignores.
519#[expect(
520    clippy::expect_used,
521    reason = "duplicate ignore globs are validated before clone detection"
522)]
523fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
524    if !config.ignore_defaults && config.ignore.is_empty() {
525        return None;
526    }
527
528    let mut builder = GlobSetBuilder::new();
529    let mut defaults = Vec::new();
530
531    if config.ignore_defaults {
532        for pattern in DUPES_DEFAULT_IGNORES {
533            let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
534            defaults.push((*pattern, glob.compile_matcher()));
535            builder.add(glob);
536        }
537    }
538
539    for pattern in &config.ignore {
540        builder.add(
541            Glob::new(pattern)
542                .expect("duplicates.ignore pattern was validated at config load time"),
543        );
544    }
545
546    builder.build().ok().map(|all| IgnoreSet { all, defaults })
547}
548
549fn build_default_ignore_skips(
550    ignores: Option<&IgnoreSet>,
551    counts: &[AtomicUsize],
552) -> DefaultIgnoreSkips {
553    let Some(ignores) = ignores else {
554        return DefaultIgnoreSkips::default();
555    };
556
557    let by_pattern = ignores
558        .defaults
559        .iter()
560        .zip(counts)
561        .filter_map(|((pattern, _), count)| {
562            let count = count.load(Ordering::Relaxed);
563            (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
564        })
565        .collect::<Vec<_>>();
566    let total = by_pattern.iter().map(|entry| entry.count).sum();
567
568    DefaultIgnoreSkips { total, by_pattern }
569}
570
571#[cfg(test)]
572mod tests {
573    use super::*;
574    use crate::discover::FileId;
575
576    #[test]
577    fn find_duplicates_empty_files() {
578        let config = DuplicatesConfig::default();
579        let report = find_duplicates(Path::new("/tmp"), &[], &config);
580        assert!(report.clone_groups.is_empty());
581        assert!(report.clone_families.is_empty());
582        assert_eq!(report.stats.total_files, 0);
583    }
584
585    #[test]
586    fn build_ignore_set_empty() {
587        let config = DuplicatesConfig {
588            ignore_defaults: false,
589            ..DuplicatesConfig::default()
590        };
591        assert!(build_ignore_set(&config).is_none());
592    }
593
594    #[test]
595    fn build_ignore_set_valid_patterns() {
596        let config = DuplicatesConfig {
597            ignore_defaults: false,
598            ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
599            ..DuplicatesConfig::default()
600        };
601        let set = build_ignore_set(&config);
602        assert!(set.is_some());
603        let set = set.unwrap();
604        assert!(set.is_match(Path::new("src/foo.test.ts")));
605        assert!(set.is_match(Path::new("src/bar.spec.ts")));
606        assert!(!set.is_match(Path::new("src/baz.ts")));
607    }
608
609    #[test]
610    fn build_ignore_set_merges_defaults_with_user_patterns() {
611        let config = DuplicatesConfig {
612            ignore: vec!["**/foo/**".to_string()],
613            ..DuplicatesConfig::default()
614        };
615        let set = build_ignore_set(&config).expect("ignore set");
616        assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
617        assert!(set.is_match(Path::new("src/foo/generated.js")));
618    }
619
620    #[test]
621    fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
622        let config = DuplicatesConfig {
623            ignore_defaults: false,
624            ignore: vec!["**/foo/**".to_string()],
625            ..DuplicatesConfig::default()
626        };
627        let set = build_ignore_set(&config).expect("ignore set");
628        assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
629        assert!(set.is_match(Path::new("src/foo/generated.js")));
630    }
631
632    #[test]
633    fn find_duplicates_with_real_files() {
634        let dir = tempfile::tempdir().expect("create temp dir");
635        let src_dir = dir.path().join("src");
636        std::fs::create_dir_all(&src_dir).expect("create src dir");
637
638        let code = r#"
639export function processData(input: string): string {
640    const trimmed = input.trim();
641    if (trimmed.length === 0) {
642        return "";
643    }
644    const parts = trimmed.split(",");
645    const filtered = parts.filter(p => p.length > 0);
646    const mapped = filtered.map(p => p.toUpperCase());
647    return mapped.join(", ");
648}
649
650export function validateInput(data: string): boolean {
651    if (data === null || data === undefined) {
652        return false;
653    }
654    const cleaned = data.trim();
655    if (cleaned.length < 3) {
656        return false;
657    }
658    return true;
659}
660"#;
661
662        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
663        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
664        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
665            .expect("write package.json");
666
667        let files = vec![
668            DiscoveredFile {
669                id: FileId(0),
670                path: src_dir.join("original.ts"),
671                size_bytes: code.len() as u64,
672            },
673            DiscoveredFile {
674                id: FileId(1),
675                path: src_dir.join("copy.ts"),
676                size_bytes: code.len() as u64,
677            },
678        ];
679
680        let config = DuplicatesConfig {
681            min_tokens: 10,
682            min_lines: 2,
683            ..DuplicatesConfig::default()
684        };
685
686        let report = find_duplicates(dir.path(), &files, &config);
687        assert!(
688            !report.clone_groups.is_empty(),
689            "Should detect clones in identical files"
690        );
691        assert!(report.stats.files_with_clones >= 2);
692
693        assert!(
694            !report.clone_families.is_empty(),
695            "Should group clones into families"
696        );
697    }
698
699    #[test]
700    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
701        let dir = tempfile::tempdir().expect("create temp dir");
702        let src_dir = dir.path().join("src");
703        std::fs::create_dir_all(&src_dir).expect("create src dir");
704
705        let code = "export function same(input: number): number {\n  const doubled = input * 2;\n  return doubled + 1;\n}\n";
706        let first = src_dir.join("first.ts");
707        let second = src_dir.join("second.ts");
708        std::fs::write(&first, code).expect("write first");
709        std::fs::write(&second, code).expect("write second");
710
711        let files = vec![
712            DiscoveredFile {
713                id: FileId(0),
714                path: first,
715                size_bytes: code.len() as u64,
716            },
717            DiscoveredFile {
718                id: FileId(1),
719                path: second,
720                size_bytes: code.len() as u64,
721            },
722        ];
723        let config = DuplicatesConfig {
724            min_tokens: 5,
725            min_lines: 2,
726            ..DuplicatesConfig::default()
727        };
728        let cache_root = dir.path().join(".fallow");
729
730        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
731
732        assert!(!report.clone_groups.is_empty());
733        assert!(
734            !cache_root.exists(),
735            "small projects should avoid token-cache IO overhead"
736        );
737    }
738
739    #[test]
740    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
741        let dir = tempfile::tempdir().expect("create temp dir");
742        let src_dir = dir.path().join("src");
743        std::fs::create_dir_all(&src_dir).expect("create src dir");
744
745        let focused_code = r"
746export function focused(input: number): number {
747    const doubled = input * 2;
748    const shifted = doubled + 10;
749    return shifted / 2;
750}
751";
752        let untouched_code = r#"
753export function untouched(input: string): string {
754    const lowered = input.toLowerCase();
755    const padded = lowered.padStart(10, "x");
756    return padded.slice(0, 8);
757}
758"#;
759
760        let changed_path = src_dir.join("changed.ts");
761        let focused_copy_path = src_dir.join("focused-copy.ts");
762        let untouched_a_path = src_dir.join("untouched-a.ts");
763        let untouched_b_path = src_dir.join("untouched-b.ts");
764        std::fs::write(&changed_path, focused_code).expect("write changed");
765        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
766        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
767        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
768
769        let files = vec![
770            DiscoveredFile {
771                id: FileId(0),
772                path: changed_path.clone(),
773                size_bytes: focused_code.len() as u64,
774            },
775            DiscoveredFile {
776                id: FileId(1),
777                path: focused_copy_path,
778                size_bytes: focused_code.len() as u64,
779            },
780            DiscoveredFile {
781                id: FileId(2),
782                path: untouched_a_path,
783                size_bytes: untouched_code.len() as u64,
784            },
785            DiscoveredFile {
786                id: FileId(3),
787                path: untouched_b_path,
788                size_bytes: untouched_code.len() as u64,
789            },
790        ];
791
792        let config = DuplicatesConfig {
793            mode: DetectionMode::Strict,
794            min_tokens: 5,
795            min_lines: 2,
796            min_corpus_size_for_shingle_filter: 1,
797            ..DuplicatesConfig::default()
798        };
799        let mut focus = FxHashSet::default();
800        focus.insert(changed_path.clone());
801
802        let full_report = find_duplicates(dir.path(), &files, &config);
803        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
804        let expected_touching = full_report
805            .clone_groups
806            .iter()
807            .filter(|group| {
808                group
809                    .instances
810                    .iter()
811                    .any(|instance| instance.file == changed_path)
812            })
813            .count();
814
815        assert!(
816            !report.clone_groups.is_empty(),
817            "focused file should still match an unchanged duplicate"
818        );
819        assert_eq!(
820            report.clone_groups.len(),
821            expected_touching,
822            "focused shingle filtering must not drop clone groups touching the focused file"
823        );
824        assert!(report.clone_groups.iter().all(|group| {
825            group
826                .instances
827                .iter()
828                .any(|instance| instance.file == changed_path)
829        }));
830    }
831
832    #[test]
833    fn file_wide_suppression_excludes_file() {
834        let dir = tempfile::tempdir().expect("create temp dir");
835        let src_dir = dir.path().join("src");
836        std::fs::create_dir_all(&src_dir).expect("create src dir");
837
838        let code = r#"
839export function processData(input: string): string {
840    const trimmed = input.trim();
841    if (trimmed.length === 0) {
842        return "";
843    }
844    const parts = trimmed.split(",");
845    const filtered = parts.filter(p => p.length > 0);
846    const mapped = filtered.map(p => p.toUpperCase());
847    return mapped.join(", ");
848}
849"#;
850        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");
851
852        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
853        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
854        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
855            .expect("write package.json");
856
857        let files = vec![
858            DiscoveredFile {
859                id: FileId(0),
860                path: src_dir.join("original.ts"),
861                size_bytes: code.len() as u64,
862            },
863            DiscoveredFile {
864                id: FileId(1),
865                path: src_dir.join("suppressed.ts"),
866                size_bytes: suppressed_code.len() as u64,
867            },
868        ];
869
870        let config = DuplicatesConfig {
871            min_tokens: 10,
872            min_lines: 2,
873            ..DuplicatesConfig::default()
874        };
875
876        let report = find_duplicates(dir.path(), &files, &config);
877        assert!(
878            report.clone_groups.is_empty(),
879            "File-wide suppression should exclude file from duplication analysis"
880        );
881    }
882
883    #[test]
884    fn min_occurrences_hides_pairs_and_records_count() {
885        let dir = tempfile::tempdir().expect("create temp dir");
886        let src_dir = dir.path().join("src");
887        std::fs::create_dir_all(&src_dir).expect("create src dir");
888
889        let block_a = r#"
890export function blockA(input: string): string {
891    const trimmed = input.trim();
892    if (trimmed.length === 0) {
893        return "";
894    }
895    const parts = trimmed.split(",");
896    const filtered = parts.filter(p => p.length > 0);
897    const mapped = filtered.map(p => p.toUpperCase());
898    return mapped.join(", ");
899}
900"#;
901        let block_b = r"
902export function blockB(value: number): number {
903    if (value <= 0) {
904        return 0;
905    }
906    let total = 0;
907    for (let i = 1; i <= value; i += 1) {
908        total += i * 2;
909        total -= 1;
910    }
911    return total + 7;
912}
913";
914
915        let pair_a1 = src_dir.join("pair-a1.ts");
916        let pair_a2 = src_dir.join("pair-a2.ts");
917        let triple_b1 = src_dir.join("triple-b1.ts");
918        let triple_b2 = src_dir.join("triple-b2.ts");
919        let triple_b3 = src_dir.join("triple-b3.ts");
920        std::fs::write(&pair_a1, block_a).expect("write");
921        std::fs::write(&pair_a2, block_a).expect("write");
922        std::fs::write(&triple_b1, block_b).expect("write");
923        std::fs::write(&triple_b2, block_b).expect("write");
924        std::fs::write(&triple_b3, block_b).expect("write");
925
926        let files = vec![
927            DiscoveredFile {
928                id: FileId(0),
929                path: pair_a1,
930                size_bytes: block_a.len() as u64,
931            },
932            DiscoveredFile {
933                id: FileId(1),
934                path: pair_a2,
935                size_bytes: block_a.len() as u64,
936            },
937            DiscoveredFile {
938                id: FileId(2),
939                path: triple_b1,
940                size_bytes: block_b.len() as u64,
941            },
942            DiscoveredFile {
943                id: FileId(3),
944                path: triple_b2,
945                size_bytes: block_b.len() as u64,
946            },
947            DiscoveredFile {
948                id: FileId(4),
949                path: triple_b3,
950                size_bytes: block_b.len() as u64,
951            },
952        ];
953
954        let default_config = DuplicatesConfig {
955            min_tokens: 10,
956            min_lines: 2,
957            ..DuplicatesConfig::default()
958        };
959        let baseline = find_duplicates(dir.path(), &files, &default_config);
960        assert_eq!(
961            baseline.clone_groups.len(),
962            2,
963            "default minOccurrences should report both the pair and the triple"
964        );
965        assert_eq!(
966            baseline.stats.clone_groups_below_min_occurrences, 0,
967            "default minOccurrences hides nothing"
968        );
969        let baseline_pct = baseline.stats.duplication_percentage;
970
971        let raised_config = DuplicatesConfig {
972            min_tokens: 10,
973            min_lines: 2,
974            min_occurrences: 3,
975            ..DuplicatesConfig::default()
976        };
977        let report = find_duplicates(dir.path(), &files, &raised_config);
978        assert_eq!(
979            report.clone_groups.len(),
980            1,
981            "minOccurrences=3 should hide the 2-instance group"
982        );
983        assert_eq!(
984            report.clone_groups[0].instances.len(),
985            3,
986            "surviving group must be the 3-instance group"
987        );
988        assert_eq!(
989            report.stats.clone_groups_below_min_occurrences, 1,
990            "the hidden 2-instance group must be counted"
991        );
992        assert_eq!(
993            report.stats.clone_groups, 1,
994            "stats.clone_groups must match the post-filter array length"
995        );
996        assert_eq!(
997            report.stats.clone_instances, 3,
998            "stats.clone_instances must match the surviving instance total"
999        );
1000        assert!(
1001            (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
1002            "duplication_percentage should not shift when minOccurrences changes"
1003        );
1004    }
1005
1006    #[test]
1007    fn min_occurrences_evaluates_after_line_suppressions() {
1008        let dir = tempfile::tempdir().expect("create temp dir");
1009        let src_dir = dir.path().join("src");
1010        std::fs::create_dir_all(&src_dir).expect("create src dir");
1011
1012        let block = r#"
1013export function shared(input: string): string {
1014    const trimmed = input.trim();
1015    if (trimmed.length === 0) {
1016        return "";
1017    }
1018    const parts = trimmed.split(",");
1019    const filtered = parts.filter(p => p.length > 0);
1020    const mapped = filtered.map(p => p.toUpperCase());
1021    return mapped.join(", ");
1022}
1023"#;
1024        let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");
1025
1026        let a = src_dir.join("a.ts");
1027        let b = src_dir.join("b.ts");
1028        let c = src_dir.join("c.ts");
1029        std::fs::write(&a, block).expect("write a");
1030        std::fs::write(&b, block).expect("write b");
1031        std::fs::write(&c, &suppressed).expect("write c");
1032
1033        let files = vec![
1034            DiscoveredFile {
1035                id: FileId(0),
1036                path: a,
1037                size_bytes: block.len() as u64,
1038            },
1039            DiscoveredFile {
1040                id: FileId(1),
1041                path: b,
1042                size_bytes: block.len() as u64,
1043            },
1044            DiscoveredFile {
1045                id: FileId(2),
1046                path: c,
1047                size_bytes: suppressed.len() as u64,
1048            },
1049        ];
1050
1051        let config = DuplicatesConfig {
1052            min_tokens: 10,
1053            min_lines: 2,
1054            min_occurrences: 3,
1055            ..DuplicatesConfig::default()
1056        };
1057        let report = find_duplicates(dir.path(), &files, &config);
1058        assert!(
1059            report.clone_groups.is_empty(),
1060            "post-suppression 2-instance group must be hidden by minOccurrences=3, \
1061             got groups: {:?}",
1062            report
1063                .clone_groups
1064                .iter()
1065                .map(|g| g.instances.len())
1066                .collect::<Vec<_>>()
1067        );
1068        assert_eq!(
1069            report.stats.clone_groups, 0,
1070            "stats.clone_groups must match the empty post-filter array"
1071        );
1072        assert_eq!(
1073            report.stats.clone_instances, 0,
1074            "stats.clone_instances must match the empty post-filter array"
1075        );
1076    }
1077}