Skip to main content

fallow_core/duplicates/
mod.rs

1//! Code duplication / clone detection module.
2//!
3//! This module implements suffix array + LCP based clone detection
4//! for TypeScript/JavaScript source files. It supports multiple detection
5//! modes from strict (exact matches only) to semantic (structure-aware
6//! matching that ignores identifier names and literal values).
7
8mod cache;
9pub mod deepdive;
10pub mod detect;
11pub mod families;
12pub mod normalize;
13mod shingle_filter;
14pub mod token_types;
15mod token_visitor;
16pub mod tokenize;
17pub(crate) mod types;
18
19use rustc_hash::FxHashMap;
20use std::path::{Path, PathBuf};
21use std::sync::atomic::{AtomicUsize, Ordering};
22
23use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
24use rayon::prelude::*;
25use rustc_hash::FxHashSet;
26
27use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
28pub use deepdive::{
29    CloneFingerprintKey, CloneFingerprintSet, FINGERPRINT_PREFIX, clone_fingerprint,
30    dominant_identifier, fingerprint_for_fragment, group_refactoring_suggestion,
31};
32use detect::CloneDetector;
33use normalize::normalize_and_hash_resolved;
34use tokenize::{tokenize_file, tokenize_file_cross_language};
35pub use types::{
36    CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
37    DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
38    RefactoringKind, RefactoringSuggestion,
39};
40
41use crate::discover::{self, DiscoveredFile};
42use crate::suppress::{self, IssueKind, Suppression};
43
/// Built-in duplicates ignores for generated framework and tool output.
///
/// These are engine policy defaults, not config-file defaults: `duplicates.ignore`
/// stays empty in round-tripped configs, while the analyzer merges these patterns
/// unless `duplicates.ignoreDefaults` is set to `false`.
///
/// Every entry is a glob of the form `**/<dir>/**`. The analyzer matches these
/// against each file's path relative to the analysis root (falling back to the
/// full path when the file is not under the root).
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
60
/// Per-file result of the parallel tokenization step, carried into the
/// token-cache write-back and then into clone detection.
#[derive(Clone)]
pub(super) struct TokenizedFile {
    /// Path of the source file, cloned from the discovered file entry.
    path: PathBuf,
    /// Normalized and hashed tokens, either freshly computed or taken from a
    /// token-cache entry.
    hashed_tokens: Vec<normalize::HashedToken>,
    /// Tokenizer output for the file.
    file_tokens: tokenize::FileTokens,
    /// Filesystem metadata read before tokenizing; handed to the token cache
    /// when the file is inserted.
    metadata: Option<std::fs::Metadata>,
    /// `true` when the tokens came from the token cache, in which case the
    /// file is not written back to it.
    cache_hit: bool,
    /// Suppression comments for this file (parsed from source or taken from
    /// the cache entry).
    suppressions: Vec<Suppression>,
}
70
71struct IgnoreSet {
72    all: GlobSet,
73    defaults: Vec<(&'static str, GlobMatcher)>,
74}
75
76impl IgnoreSet {
77    fn is_match(&self, path: &Path) -> bool {
78        self.all.is_match(path)
79    }
80
81    fn default_match_index(&self, path: &Path) -> Option<usize> {
82        self.defaults
83            .iter()
84            .position(|(_, matcher)| matcher.is_match(path))
85    }
86}
87
/// Internal result of one duplication run: the report plus the sidecar data
/// about files skipped by built-in ignore patterns.
struct DuplicationRun {
    /// The duplication report returned by every public entry point.
    report: DuplicationReport,
    /// Per-pattern counts of files skipped by built-in ignores; only exposed
    /// by the `*_with_default_ignore_skips` entry points.
    default_ignore_skips: DefaultIgnoreSkips,
}
92
93/// Run duplication detection on the given files.
94///
95/// This is the main entry point for the duplication analysis. It:
96/// 1. Reads and tokenizes all source files in parallel
97/// 2. Normalizes tokens according to the detection mode
98/// 3. Runs suffix array + LCP clone detection
99/// 4. Groups clone instances into families with refactoring suggestions
100/// 5. Applies inline suppression filters
101pub fn find_duplicates(
102    root: &Path,
103    files: &[DiscoveredFile],
104    config: &DuplicatesConfig,
105) -> DuplicationReport {
106    find_duplicates_inner(root, files, config, None, None).report
107}
108
109/// Run duplication detection and return human-format sidecar metadata for
110/// files skipped by built-in duplicates ignores.
111pub fn find_duplicates_with_default_ignore_skips(
112    root: &Path,
113    files: &[DiscoveredFile],
114    config: &DuplicatesConfig,
115) -> (DuplicationReport, DefaultIgnoreSkips) {
116    let run = find_duplicates_inner(root, files, config, None, None);
117    (run.report, run.default_ignore_skips)
118}
119
120/// Run duplication detection with the persistent token cache enabled.
121pub fn find_duplicates_cached(
122    root: &Path,
123    files: &[DiscoveredFile],
124    config: &DuplicatesConfig,
125    cache_root: &Path,
126) -> DuplicationReport {
127    find_duplicates_inner(root, files, config, None, Some(cache_root)).report
128}
129
130/// Run cached duplication detection and return human-format sidecar metadata for
131/// files skipped by built-in duplicates ignores.
132pub fn find_duplicates_cached_with_default_ignore_skips(
133    root: &Path,
134    files: &[DiscoveredFile],
135    config: &DuplicatesConfig,
136    cache_root: &Path,
137) -> (DuplicationReport, DefaultIgnoreSkips) {
138    let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
139    (run.report, run.default_ignore_skips)
140}
141
142/// Run duplication detection and only return clone groups touching `focus_files`.
143///
144/// This keeps all files in the matching corpus, which preserves changed-file
145/// versus unchanged-file detection for diff-scoped audit runs, but avoids
146/// materializing duplicate groups that cannot appear in the scoped report.
147#[expect(
148    clippy::implicit_hasher,
149    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
150)]
151pub fn find_duplicates_touching_files(
152    root: &Path,
153    files: &[DiscoveredFile],
154    config: &DuplicatesConfig,
155    focus_files: &FxHashSet<PathBuf>,
156) -> DuplicationReport {
157    find_duplicates_inner(root, files, config, Some(focus_files), None).report
158}
159
160/// Run focused duplication detection and return human-format sidecar metadata
161/// for files skipped by built-in duplicates ignores.
162#[expect(
163    clippy::implicit_hasher,
164    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
165)]
166pub fn find_duplicates_touching_files_with_default_ignore_skips(
167    root: &Path,
168    files: &[DiscoveredFile],
169    config: &DuplicatesConfig,
170    focus_files: &FxHashSet<PathBuf>,
171) -> (DuplicationReport, DefaultIgnoreSkips) {
172    let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
173    (run.report, run.default_ignore_skips)
174}
175
176/// Run focused duplication detection with the persistent token cache enabled.
177#[expect(
178    clippy::implicit_hasher,
179    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
180)]
181pub fn find_duplicates_touching_files_cached(
182    root: &Path,
183    files: &[DiscoveredFile],
184    config: &DuplicatesConfig,
185    focus_files: &FxHashSet<PathBuf>,
186    cache_root: &Path,
187) -> DuplicationReport {
188    find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
189}
190
191/// Run cached focused duplication detection and return human-format sidecar
192/// metadata for files skipped by built-in duplicates ignores.
193#[expect(
194    clippy::implicit_hasher,
195    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
196)]
197pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
198    root: &Path,
199    files: &[DiscoveredFile],
200    config: &DuplicatesConfig,
201    focus_files: &FxHashSet<PathBuf>,
202    cache_root: &Path,
203) -> (DuplicationReport, DefaultIgnoreSkips) {
204    let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
205    (run.report, run.default_ignore_skips)
206}
207
208fn find_duplicates_inner(
209    root: &Path,
210    files: &[DiscoveredFile],
211    config: &DuplicatesConfig,
212    focus_files: Option<&FxHashSet<PathBuf>>,
213    cache_root: Option<&Path>,
214) -> DuplicationRun {
215    let _span = tracing::info_span!("find_duplicates").entered();
216
217    let extra_ignores = build_ignore_set(config);
218    let default_skip_counts = extra_ignores
219        .as_ref()
220        .map(|ignores| {
221            std::iter::repeat_with(|| AtomicUsize::new(0))
222                .take(ignores.defaults.len())
223                .collect::<Vec<_>>()
224        })
225        .unwrap_or_default();
226
227    let normalization =
228        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
229
230    let strip_types = config.cross_language;
231    let skip_imports = config.ignore_imports;
232
233    tracing::debug!(
234        ignore_imports = skip_imports,
235        "duplication tokenization config"
236    );
237
238    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
239    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
240    let token_cache = cache_root.map(TokenCache::load);
241
242    let mut file_data: Vec<TokenizedFile> = files
243        .par_iter()
244        .filter_map(|file| {
245            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
246            if let Some(ref ignores) = extra_ignores {
247                if let Some(index) = ignores.default_match_index(relative) {
248                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
249                    return None;
250                }
251                if ignores.is_match(relative) {
252                    return None;
253                }
254            }
255
256            let metadata = std::fs::metadata(&file.path).ok()?;
257
258            let cached_entry = token_cache
259                .as_ref()
260                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
261            let cache_hit = cached_entry.is_some();
262
263            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
264                let suppressions = entry.suppressions.clone();
265                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
266                    return None;
267                }
268                (entry, suppressions)
269            } else {
270                let source = std::fs::read_to_string(&file.path).ok()?;
271                let suppressions = suppress::parse_suppressions_from_source(&source).suppressions;
272                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
273                    return None;
274                }
275
276                let file_tokens = if strip_types {
277                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
278                } else {
279                    tokenize_file(&file.path, &source, skip_imports)
280                };
281                if file_tokens.tokens.is_empty() {
282                    return None;
283                }
284
285                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
286                let entry = TokenCacheEntry {
287                    hashed_tokens: hashed,
288                    file_tokens,
289                    suppressions: suppressions.clone(),
290                };
291                (entry, suppressions)
292            };
293            if entry.file_tokens.tokens.is_empty() {
294                return None;
295            }
296            if entry.hashed_tokens.len() < config.min_tokens {
297                return None;
298            }
299
300            Some(TokenizedFile {
301                path: file.path.clone(),
302                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
303                file_tokens: entry.file_tokens,
304                metadata: Some(metadata),
305                cache_hit,
306                suppressions,
307            })
308        })
309        .collect();
310
311    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
312        for file in &file_data {
313            if !file.cache_hit
314                && let Some(metadata) = &file.metadata
315            {
316                cache.insert(
317                    &file.path,
318                    metadata,
319                    token_cache_mode,
320                    &file.hashed_tokens,
321                    &file.file_tokens,
322                    &file.suppressions,
323                );
324            }
325        }
326        cache.retain_paths(files);
327        match cache.save_if_dirty() {
328            Ok(true) => {
329                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
330            }
331            Ok(false) => {
332                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
333            }
334            Err(err) => {
335                tracing::warn!("Failed to save duplication token cache: {err}");
336            }
337        }
338    }
339
340    tracing::info!(
341        files = file_data.len(),
342        "tokenized files for duplication analysis"
343    );
344
345    if let Some(focus_files) = focus_files
346        && file_data.len() >= config.min_corpus_size_for_shingle_filter
347    {
348        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
349    }
350
351    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
352        .iter()
353        .filter(|file| !file.suppressions.is_empty())
354        .map(|file| (file.path.clone(), file.suppressions.clone()))
355        .collect();
356
357    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
358        file_data
359            .into_iter()
360            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
361            .collect();
362
363    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
364    let mut report = if let Some(focus_files) = focus_files {
365        detector.detect_touching_files(detector_data, focus_files)
366    } else {
367        detector.detect(detector_data)
368    };
369
370    if !suppressions_by_file.is_empty() {
371        apply_line_suppressions(&mut report, &suppressions_by_file);
372    }
373
374    apply_min_occurrences_filter(&mut report, config.min_occurrences);
375
376    let default_ignore_skips =
377        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
378
379    report.clone_families = families::group_into_families(&report.clone_groups, root);
380
381    report.mirrored_directories =
382        families::detect_mirrored_directories(&report.clone_families, root);
383
384    report.sort();
385
386    DuplicationRun {
387        report,
388        default_ignore_skips,
389    }
390}
391
392/// Drop clone groups with fewer than `min` instances and record the count on
393/// the stats block. The detector already guarantees `>= 2`, so this is a
394/// no-op when `min <= 2`.
395///
396/// Stats split: `clone_groups` and `clone_instances` are recomputed
397/// post-filter so they match the serialized array length (a CI consumer
398/// reading `stats.clone_groups` and iterating `clone_groups[]` sees the same
399/// count). `duplication_percentage`, `duplicated_lines`, `duplicated_tokens`,
400/// and `files_with_clones` stay pre-filter so the percentage math (lines /
401/// total) stays consistent and `threshold` gates / trend lines don't shift
402/// when the filter changes. The hidden count is disclosed in
403/// `clone_groups_below_min_occurrences`. The surviving groups feed every
404/// downstream step (families, mirrored dirs, --top, baseline, changed-since,
405/// workspace scoping) so there's a single source of truth.
406fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
407    if min <= 2 {
408        return;
409    }
410    let before = report.clone_groups.len();
411    report
412        .clone_groups
413        .retain(|group| group.instances.len() >= min);
414    let hidden = before - report.clone_groups.len();
415    if hidden == 0 {
416        return;
417    }
418    report.stats.clone_groups_below_min_occurrences = hidden;
419    report.stats.clone_groups = report.clone_groups.len();
420    report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
421}
422
423/// Filter out clone instances that are suppressed by line-level comments.
424#[expect(
425    clippy::cast_possible_truncation,
426    reason = "line numbers are bounded by source size"
427)]
428fn apply_line_suppressions(
429    report: &mut DuplicationReport,
430    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
431) {
432    report.clone_groups.retain_mut(|group| {
433        group.instances.retain(|instance| {
434            if let Some(supps) = suppressions_by_file.get(&instance.file) {
435                for line in instance.start_line..=instance.end_line {
436                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
437                        return false;
438                    }
439                }
440            }
441            true
442        });
443        group.instances.len() >= 2
444    });
445}
446
447/// Run duplication detection on a project directory using auto-discovered files.
448///
449/// This is a convenience function that handles file discovery internally.
450#[must_use]
451pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
452    let resolved = crate::default_config(root);
453    let files = discover::discover_files_with_plugin_scopes(&resolved);
454    find_duplicates(root, &files, config)
455}
456
457/// Build a merged ignore set from built-in and user-provided duplicates ignores.
458#[expect(
459    clippy::expect_used,
460    reason = "duplicate ignore globs are validated before clone detection"
461)]
462fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
463    if !config.ignore_defaults && config.ignore.is_empty() {
464        return None;
465    }
466
467    let mut builder = GlobSetBuilder::new();
468    let mut defaults = Vec::new();
469
470    if config.ignore_defaults {
471        for pattern in DUPES_DEFAULT_IGNORES {
472            let glob = Glob::new(pattern).expect("default duplication ignore pattern is valid");
473            defaults.push((*pattern, glob.compile_matcher()));
474            builder.add(glob);
475        }
476    }
477
478    for pattern in &config.ignore {
479        builder.add(
480            Glob::new(pattern)
481                .expect("duplicates.ignore pattern was validated at config load time"),
482        );
483    }
484
485    builder.build().ok().map(|all| IgnoreSet { all, defaults })
486}
487
488fn build_default_ignore_skips(
489    ignores: Option<&IgnoreSet>,
490    counts: &[AtomicUsize],
491) -> DefaultIgnoreSkips {
492    let Some(ignores) = ignores else {
493        return DefaultIgnoreSkips::default();
494    };
495
496    let by_pattern = ignores
497        .defaults
498        .iter()
499        .zip(counts)
500        .filter_map(|((pattern, _), count)| {
501            let count = count.load(Ordering::Relaxed);
502            (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
503        })
504        .collect::<Vec<_>>();
505    let total = by_pattern.iter().map(|entry| entry.count).sum();
506
507    DefaultIgnoreSkips { total, by_pattern }
508}
509
510#[cfg(test)]
511mod tests {
512    use super::*;
513    use crate::discover::FileId;
514
515    #[test]
516    fn find_duplicates_empty_files() {
517        let config = DuplicatesConfig::default();
518        let report = find_duplicates(Path::new("/tmp"), &[], &config);
519        assert!(report.clone_groups.is_empty());
520        assert!(report.clone_families.is_empty());
521        assert_eq!(report.stats.total_files, 0);
522    }
523
524    #[test]
525    fn build_ignore_set_empty() {
526        let config = DuplicatesConfig {
527            ignore_defaults: false,
528            ..DuplicatesConfig::default()
529        };
530        assert!(build_ignore_set(&config).is_none());
531    }
532
533    #[test]
534    fn build_ignore_set_valid_patterns() {
535        let config = DuplicatesConfig {
536            ignore_defaults: false,
537            ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
538            ..DuplicatesConfig::default()
539        };
540        let set = build_ignore_set(&config);
541        assert!(set.is_some());
542        let set = set.unwrap();
543        assert!(set.is_match(Path::new("src/foo.test.ts")));
544        assert!(set.is_match(Path::new("src/bar.spec.ts")));
545        assert!(!set.is_match(Path::new("src/baz.ts")));
546    }
547
548    #[test]
549    fn build_ignore_set_merges_defaults_with_user_patterns() {
550        let config = DuplicatesConfig {
551            ignore: vec!["**/foo/**".to_string()],
552            ..DuplicatesConfig::default()
553        };
554        let set = build_ignore_set(&config).expect("ignore set");
555        assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
556        assert!(set.is_match(Path::new("src/foo/generated.js")));
557    }
558
559    #[test]
560    fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
561        let config = DuplicatesConfig {
562            ignore_defaults: false,
563            ignore: vec!["**/foo/**".to_string()],
564            ..DuplicatesConfig::default()
565        };
566        let set = build_ignore_set(&config).expect("ignore set");
567        assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
568        assert!(set.is_match(Path::new("src/foo/generated.js")));
569    }
570
571    #[test]
572    fn find_duplicates_with_real_files() {
573        let dir = tempfile::tempdir().expect("create temp dir");
574        let src_dir = dir.path().join("src");
575        std::fs::create_dir_all(&src_dir).expect("create src dir");
576
577        let code = r#"
578export function processData(input: string): string {
579    const trimmed = input.trim();
580    if (trimmed.length === 0) {
581        return "";
582    }
583    const parts = trimmed.split(",");
584    const filtered = parts.filter(p => p.length > 0);
585    const mapped = filtered.map(p => p.toUpperCase());
586    return mapped.join(", ");
587}
588
589export function validateInput(data: string): boolean {
590    if (data === null || data === undefined) {
591        return false;
592    }
593    const cleaned = data.trim();
594    if (cleaned.length < 3) {
595        return false;
596    }
597    return true;
598}
599"#;
600
601        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
602        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
603        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
604            .expect("write package.json");
605
606        let files = vec![
607            DiscoveredFile {
608                id: FileId(0),
609                path: src_dir.join("original.ts"),
610                size_bytes: code.len() as u64,
611            },
612            DiscoveredFile {
613                id: FileId(1),
614                path: src_dir.join("copy.ts"),
615                size_bytes: code.len() as u64,
616            },
617        ];
618
619        let config = DuplicatesConfig {
620            min_tokens: 10,
621            min_lines: 2,
622            ..DuplicatesConfig::default()
623        };
624
625        let report = find_duplicates(dir.path(), &files, &config);
626        assert!(
627            !report.clone_groups.is_empty(),
628            "Should detect clones in identical files"
629        );
630        assert!(report.stats.files_with_clones >= 2);
631
632        assert!(
633            !report.clone_families.is_empty(),
634            "Should group clones into families"
635        );
636    }
637
638    #[test]
639    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
640        let dir = tempfile::tempdir().expect("create temp dir");
641        let src_dir = dir.path().join("src");
642        std::fs::create_dir_all(&src_dir).expect("create src dir");
643
644        let code = "export function same(input: number): number {\n  const doubled = input * 2;\n  return doubled + 1;\n}\n";
645        let first = src_dir.join("first.ts");
646        let second = src_dir.join("second.ts");
647        std::fs::write(&first, code).expect("write first");
648        std::fs::write(&second, code).expect("write second");
649
650        let files = vec![
651            DiscoveredFile {
652                id: FileId(0),
653                path: first,
654                size_bytes: code.len() as u64,
655            },
656            DiscoveredFile {
657                id: FileId(1),
658                path: second,
659                size_bytes: code.len() as u64,
660            },
661        ];
662        let config = DuplicatesConfig {
663            min_tokens: 5,
664            min_lines: 2,
665            ..DuplicatesConfig::default()
666        };
667        let cache_root = dir.path().join(".fallow");
668
669        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
670
671        assert!(!report.clone_groups.is_empty());
672        assert!(
673            !cache_root.exists(),
674            "small projects should avoid token-cache IO overhead"
675        );
676    }
677
678    #[test]
679    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
680        let dir = tempfile::tempdir().expect("create temp dir");
681        let src_dir = dir.path().join("src");
682        std::fs::create_dir_all(&src_dir).expect("create src dir");
683
684        let focused_code = r"
685export function focused(input: number): number {
686    const doubled = input * 2;
687    const shifted = doubled + 10;
688    return shifted / 2;
689}
690";
691        let untouched_code = r#"
692export function untouched(input: string): string {
693    const lowered = input.toLowerCase();
694    const padded = lowered.padStart(10, "x");
695    return padded.slice(0, 8);
696}
697"#;
698
699        let changed_path = src_dir.join("changed.ts");
700        let focused_copy_path = src_dir.join("focused-copy.ts");
701        let untouched_a_path = src_dir.join("untouched-a.ts");
702        let untouched_b_path = src_dir.join("untouched-b.ts");
703        std::fs::write(&changed_path, focused_code).expect("write changed");
704        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
705        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
706        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
707
708        let files = vec![
709            DiscoveredFile {
710                id: FileId(0),
711                path: changed_path.clone(),
712                size_bytes: focused_code.len() as u64,
713            },
714            DiscoveredFile {
715                id: FileId(1),
716                path: focused_copy_path,
717                size_bytes: focused_code.len() as u64,
718            },
719            DiscoveredFile {
720                id: FileId(2),
721                path: untouched_a_path,
722                size_bytes: untouched_code.len() as u64,
723            },
724            DiscoveredFile {
725                id: FileId(3),
726                path: untouched_b_path,
727                size_bytes: untouched_code.len() as u64,
728            },
729        ];
730
731        let config = DuplicatesConfig {
732            mode: DetectionMode::Strict,
733            min_tokens: 5,
734            min_lines: 2,
735            min_corpus_size_for_shingle_filter: 1,
736            ..DuplicatesConfig::default()
737        };
738        let mut focus = FxHashSet::default();
739        focus.insert(changed_path.clone());
740
741        let full_report = find_duplicates(dir.path(), &files, &config);
742        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
743        let expected_touching = full_report
744            .clone_groups
745            .iter()
746            .filter(|group| {
747                group
748                    .instances
749                    .iter()
750                    .any(|instance| instance.file == changed_path)
751            })
752            .count();
753
754        assert!(
755            !report.clone_groups.is_empty(),
756            "focused file should still match an unchanged duplicate"
757        );
758        assert_eq!(
759            report.clone_groups.len(),
760            expected_touching,
761            "focused shingle filtering must not drop clone groups touching the focused file"
762        );
763        assert!(report.clone_groups.iter().all(|group| {
764            group
765                .instances
766                .iter()
767                .any(|instance| instance.file == changed_path)
768        }));
769    }
770
771    #[test]
772    fn file_wide_suppression_excludes_file() {
773        let dir = tempfile::tempdir().expect("create temp dir");
774        let src_dir = dir.path().join("src");
775        std::fs::create_dir_all(&src_dir).expect("create src dir");
776
777        let code = r#"
778export function processData(input: string): string {
779    const trimmed = input.trim();
780    if (trimmed.length === 0) {
781        return "";
782    }
783    const parts = trimmed.split(",");
784    const filtered = parts.filter(p => p.length > 0);
785    const mapped = filtered.map(p => p.toUpperCase());
786    return mapped.join(", ");
787}
788"#;
789        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");
790
791        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
792        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
793        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
794            .expect("write package.json");
795
796        let files = vec![
797            DiscoveredFile {
798                id: FileId(0),
799                path: src_dir.join("original.ts"),
800                size_bytes: code.len() as u64,
801            },
802            DiscoveredFile {
803                id: FileId(1),
804                path: src_dir.join("suppressed.ts"),
805                size_bytes: suppressed_code.len() as u64,
806            },
807        ];
808
809        let config = DuplicatesConfig {
810            min_tokens: 10,
811            min_lines: 2,
812            ..DuplicatesConfig::default()
813        };
814
815        let report = find_duplicates(dir.path(), &files, &config);
816        assert!(
817            report.clone_groups.is_empty(),
818            "File-wide suppression should exclude file from duplication analysis"
819        );
820    }
821
822    #[test]
823    fn min_occurrences_hides_pairs_and_records_count() {
824        let dir = tempfile::tempdir().expect("create temp dir");
825        let src_dir = dir.path().join("src");
826        std::fs::create_dir_all(&src_dir).expect("create src dir");
827
828        let block_a = r#"
829export function blockA(input: string): string {
830    const trimmed = input.trim();
831    if (trimmed.length === 0) {
832        return "";
833    }
834    const parts = trimmed.split(",");
835    const filtered = parts.filter(p => p.length > 0);
836    const mapped = filtered.map(p => p.toUpperCase());
837    return mapped.join(", ");
838}
839"#;
840        let block_b = r"
841export function blockB(value: number): number {
842    if (value <= 0) {
843        return 0;
844    }
845    let total = 0;
846    for (let i = 1; i <= value; i += 1) {
847        total += i * 2;
848        total -= 1;
849    }
850    return total + 7;
851}
852";
853
854        let pair_a1 = src_dir.join("pair-a1.ts");
855        let pair_a2 = src_dir.join("pair-a2.ts");
856        let triple_b1 = src_dir.join("triple-b1.ts");
857        let triple_b2 = src_dir.join("triple-b2.ts");
858        let triple_b3 = src_dir.join("triple-b3.ts");
859        std::fs::write(&pair_a1, block_a).expect("write");
860        std::fs::write(&pair_a2, block_a).expect("write");
861        std::fs::write(&triple_b1, block_b).expect("write");
862        std::fs::write(&triple_b2, block_b).expect("write");
863        std::fs::write(&triple_b3, block_b).expect("write");
864
865        let files = vec![
866            DiscoveredFile {
867                id: FileId(0),
868                path: pair_a1,
869                size_bytes: block_a.len() as u64,
870            },
871            DiscoveredFile {
872                id: FileId(1),
873                path: pair_a2,
874                size_bytes: block_a.len() as u64,
875            },
876            DiscoveredFile {
877                id: FileId(2),
878                path: triple_b1,
879                size_bytes: block_b.len() as u64,
880            },
881            DiscoveredFile {
882                id: FileId(3),
883                path: triple_b2,
884                size_bytes: block_b.len() as u64,
885            },
886            DiscoveredFile {
887                id: FileId(4),
888                path: triple_b3,
889                size_bytes: block_b.len() as u64,
890            },
891        ];
892
893        let default_config = DuplicatesConfig {
894            min_tokens: 10,
895            min_lines: 2,
896            ..DuplicatesConfig::default()
897        };
898        let baseline = find_duplicates(dir.path(), &files, &default_config);
899        assert_eq!(
900            baseline.clone_groups.len(),
901            2,
902            "default minOccurrences should report both the pair and the triple"
903        );
904        assert_eq!(
905            baseline.stats.clone_groups_below_min_occurrences, 0,
906            "default minOccurrences hides nothing"
907        );
908        let baseline_pct = baseline.stats.duplication_percentage;
909
910        let raised_config = DuplicatesConfig {
911            min_tokens: 10,
912            min_lines: 2,
913            min_occurrences: 3,
914            ..DuplicatesConfig::default()
915        };
916        let report = find_duplicates(dir.path(), &files, &raised_config);
917        assert_eq!(
918            report.clone_groups.len(),
919            1,
920            "minOccurrences=3 should hide the 2-instance group"
921        );
922        assert_eq!(
923            report.clone_groups[0].instances.len(),
924            3,
925            "surviving group must be the 3-instance group"
926        );
927        assert_eq!(
928            report.stats.clone_groups_below_min_occurrences, 1,
929            "the hidden 2-instance group must be counted"
930        );
931        assert_eq!(
932            report.stats.clone_groups, 1,
933            "stats.clone_groups must match the post-filter array length"
934        );
935        assert_eq!(
936            report.stats.clone_instances, 3,
937            "stats.clone_instances must match the surviving instance total"
938        );
939        assert!(
940            (report.stats.duplication_percentage - baseline_pct).abs() < f64::EPSILON,
941            "duplication_percentage should not shift when minOccurrences changes"
942        );
943    }
944
945    #[test]
946    fn min_occurrences_evaluates_after_line_suppressions() {
947        let dir = tempfile::tempdir().expect("create temp dir");
948        let src_dir = dir.path().join("src");
949        std::fs::create_dir_all(&src_dir).expect("create src dir");
950
951        let block = r#"
952export function shared(input: string): string {
953    const trimmed = input.trim();
954    if (trimmed.length === 0) {
955        return "";
956    }
957    const parts = trimmed.split(",");
958    const filtered = parts.filter(p => p.length > 0);
959    const mapped = filtered.map(p => p.toUpperCase());
960    return mapped.join(", ");
961}
962"#;
963        let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");
964
965        let a = src_dir.join("a.ts");
966        let b = src_dir.join("b.ts");
967        let c = src_dir.join("c.ts");
968        std::fs::write(&a, block).expect("write a");
969        std::fs::write(&b, block).expect("write b");
970        std::fs::write(&c, &suppressed).expect("write c");
971
972        let files = vec![
973            DiscoveredFile {
974                id: FileId(0),
975                path: a,
976                size_bytes: block.len() as u64,
977            },
978            DiscoveredFile {
979                id: FileId(1),
980                path: b,
981                size_bytes: block.len() as u64,
982            },
983            DiscoveredFile {
984                id: FileId(2),
985                path: c,
986                size_bytes: suppressed.len() as u64,
987            },
988        ];
989
990        let config = DuplicatesConfig {
991            min_tokens: 10,
992            min_lines: 2,
993            min_occurrences: 3,
994            ..DuplicatesConfig::default()
995        };
996        let report = find_duplicates(dir.path(), &files, &config);
997        assert!(
998            report.clone_groups.is_empty(),
999            "post-suppression 2-instance group must be hidden by minOccurrences=3, \
1000             got groups: {:?}",
1001            report
1002                .clone_groups
1003                .iter()
1004                .map(|g| g.instances.len())
1005                .collect::<Vec<_>>()
1006        );
1007        assert_eq!(
1008            report.stats.clone_groups, 0,
1009            "stats.clone_groups must match the empty post-filter array"
1010        );
1011        assert_eq!(
1012            report.stats.clone_instances, 0,
1013            "stats.clone_instances must match the empty post-filter array"
1014        );
1015    }
1016}