fallow_core/duplicates/
mod.rs

1//! Code duplication / clone detection module.
2//!
3//! This module implements suffix array + LCP based clone detection
4//! for TypeScript/JavaScript source files. It supports multiple detection
5//! modes from strict (exact matches only) to semantic (structure-aware
6//! matching that ignores identifier names and literal values).
7
8mod cache;
9pub mod detect;
10pub mod families;
11pub mod normalize;
12mod shingle_filter;
13pub mod token_types;
14mod token_visitor;
15pub mod tokenize;
16pub(crate) mod types;
17
18use rustc_hash::FxHashMap;
19use std::path::{Path, PathBuf};
20use std::sync::atomic::{AtomicUsize, Ordering};
21
22use globset::{Glob, GlobMatcher, GlobSet, GlobSetBuilder};
23use rayon::prelude::*;
24use rustc_hash::FxHashSet;
25
26use cache::{TokenCache, TokenCacheEntry, TokenCacheMode};
27use detect::CloneDetector;
28use normalize::normalize_and_hash_resolved;
29use tokenize::{tokenize_file, tokenize_file_cross_language};
30pub use types::{
31    CloneFamily, CloneGroup, CloneInstance, DefaultIgnoreSkipCount, DefaultIgnoreSkips,
32    DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats, MirroredDirectory,
33    RefactoringKind, RefactoringSuggestion,
34};
35
36use crate::discover::{self, DiscoveredFile};
37use crate::suppress::{self, IssueKind, Suppression};
38
/// Glob patterns for generated framework and tool output that duplication
/// analysis skips out of the box.
///
/// The list is engine policy rather than a config-file default: a
/// round-tripped config keeps `duplicates.ignore` empty, and the analyzer
/// merges these patterns in at run time unless `duplicates.ignoreDefaults`
/// is set to `false`.
pub const DUPES_DEFAULT_IGNORES: &[&str] = &[
    "**/.next/**",
    "**/.nuxt/**",
    "**/.svelte-kit/**",
    "**/.turbo/**",
    "**/.parcel-cache/**",
    "**/.vite/**",
    "**/.cache/**",
    "**/out/**",
    "**/storybook-static/**",
];
55
56#[derive(Clone)]
57pub(super) struct TokenizedFile {
58    path: PathBuf,
59    hashed_tokens: Vec<normalize::HashedToken>,
60    file_tokens: tokenize::FileTokens,
61    metadata: Option<std::fs::Metadata>,
62    cache_hit: bool,
63    suppressions: Vec<Suppression>,
64}
65
66struct IgnoreSet {
67    all: GlobSet,
68    defaults: Vec<(&'static str, GlobMatcher)>,
69}
70
71impl IgnoreSet {
72    fn is_match(&self, path: &Path) -> bool {
73        self.all.is_match(path)
74    }
75
76    fn default_match_index(&self, path: &Path) -> Option<usize> {
77        self.defaults
78            .iter()
79            .position(|(_, matcher)| matcher.is_match(path))
80    }
81}
82
83struct DuplicationRun {
84    report: DuplicationReport,
85    default_ignore_skips: DefaultIgnoreSkips,
86}
87
88/// Run duplication detection on the given files.
89///
90/// This is the main entry point for the duplication analysis. It:
91/// 1. Reads and tokenizes all source files in parallel
92/// 2. Normalizes tokens according to the detection mode
93/// 3. Runs suffix array + LCP clone detection
94/// 4. Groups clone instances into families with refactoring suggestions
95/// 5. Applies inline suppression filters
96pub fn find_duplicates(
97    root: &Path,
98    files: &[DiscoveredFile],
99    config: &DuplicatesConfig,
100) -> DuplicationReport {
101    find_duplicates_inner(root, files, config, None, None).report
102}
103
104/// Run duplication detection and return human-format sidecar metadata for
105/// files skipped by built-in duplicates ignores.
106pub fn find_duplicates_with_default_ignore_skips(
107    root: &Path,
108    files: &[DiscoveredFile],
109    config: &DuplicatesConfig,
110) -> (DuplicationReport, DefaultIgnoreSkips) {
111    let run = find_duplicates_inner(root, files, config, None, None);
112    (run.report, run.default_ignore_skips)
113}
114
115/// Run duplication detection with the persistent token cache enabled.
116pub fn find_duplicates_cached(
117    root: &Path,
118    files: &[DiscoveredFile],
119    config: &DuplicatesConfig,
120    cache_root: &Path,
121) -> DuplicationReport {
122    find_duplicates_inner(root, files, config, None, Some(cache_root)).report
123}
124
125/// Run cached duplication detection and return human-format sidecar metadata for
126/// files skipped by built-in duplicates ignores.
127pub fn find_duplicates_cached_with_default_ignore_skips(
128    root: &Path,
129    files: &[DiscoveredFile],
130    config: &DuplicatesConfig,
131    cache_root: &Path,
132) -> (DuplicationReport, DefaultIgnoreSkips) {
133    let run = find_duplicates_inner(root, files, config, None, Some(cache_root));
134    (run.report, run.default_ignore_skips)
135}
136
137/// Run duplication detection and only return clone groups touching `focus_files`.
138///
139/// This keeps all files in the matching corpus, which preserves changed-file
140/// versus unchanged-file detection for diff-scoped audit runs, but avoids
141/// materializing duplicate groups that cannot appear in the scoped report.
142#[expect(
143    clippy::implicit_hasher,
144    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
145)]
146pub fn find_duplicates_touching_files(
147    root: &Path,
148    files: &[DiscoveredFile],
149    config: &DuplicatesConfig,
150    focus_files: &FxHashSet<PathBuf>,
151) -> DuplicationReport {
152    find_duplicates_inner(root, files, config, Some(focus_files), None).report
153}
154
155/// Run focused duplication detection and return human-format sidecar metadata
156/// for files skipped by built-in duplicates ignores.
157#[expect(
158    clippy::implicit_hasher,
159    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
160)]
161pub fn find_duplicates_touching_files_with_default_ignore_skips(
162    root: &Path,
163    files: &[DiscoveredFile],
164    config: &DuplicatesConfig,
165    focus_files: &FxHashSet<PathBuf>,
166) -> (DuplicationReport, DefaultIgnoreSkips) {
167    let run = find_duplicates_inner(root, files, config, Some(focus_files), None);
168    (run.report, run.default_ignore_skips)
169}
170
171/// Run focused duplication detection with the persistent token cache enabled.
172#[expect(
173    clippy::implicit_hasher,
174    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
175)]
176pub fn find_duplicates_touching_files_cached(
177    root: &Path,
178    files: &[DiscoveredFile],
179    config: &DuplicatesConfig,
180    focus_files: &FxHashSet<PathBuf>,
181    cache_root: &Path,
182) -> DuplicationReport {
183    find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root)).report
184}
185
186/// Run cached focused duplication detection and return human-format sidecar
187/// metadata for files skipped by built-in duplicates ignores.
188#[expect(
189    clippy::implicit_hasher,
190    reason = "fallow uses FxHashSet for changed-file sets throughout analysis"
191)]
192pub fn find_duplicates_touching_files_cached_with_default_ignore_skips(
193    root: &Path,
194    files: &[DiscoveredFile],
195    config: &DuplicatesConfig,
196    focus_files: &FxHashSet<PathBuf>,
197    cache_root: &Path,
198) -> (DuplicationReport, DefaultIgnoreSkips) {
199    let run = find_duplicates_inner(root, files, config, Some(focus_files), Some(cache_root));
200    (run.report, run.default_ignore_skips)
201}
202
203fn find_duplicates_inner(
204    root: &Path,
205    files: &[DiscoveredFile],
206    config: &DuplicatesConfig,
207    focus_files: Option<&FxHashSet<PathBuf>>,
208    cache_root: Option<&Path>,
209) -> DuplicationRun {
210    let _span = tracing::info_span!("find_duplicates").entered();
211
212    let extra_ignores = build_ignore_set(config);
213    let default_skip_counts = extra_ignores
214        .as_ref()
215        .map(|ignores| {
216            std::iter::repeat_with(|| AtomicUsize::new(0))
217                .take(ignores.defaults.len())
218                .collect::<Vec<_>>()
219        })
220        .unwrap_or_default();
221
222    // Resolve normalization: mode defaults + user overrides
223    let normalization =
224        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
225
226    let strip_types = config.cross_language;
227    let skip_imports = config.ignore_imports;
228
229    tracing::debug!(
230        ignore_imports = skip_imports,
231        "duplication tokenization config"
232    );
233
234    let token_cache_mode = TokenCacheMode::new(normalization, strip_types, skip_imports);
235    let cache_root = cache_root.filter(|_| files.len() >= config.min_corpus_size_for_token_cache);
236    let token_cache = cache_root.map(TokenCache::load);
237
238    // Step 1 & 2: Tokenize and normalize all files in parallel, also parse suppressions
239    let mut file_data: Vec<TokenizedFile> = files
240        .par_iter()
241        .filter_map(|file| {
242            // Apply extra ignore patterns
243            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
244            if let Some(ref ignores) = extra_ignores {
245                if let Some(index) = ignores.default_match_index(relative) {
246                    default_skip_counts[index].fetch_add(1, Ordering::Relaxed);
247                    return None;
248                }
249                if ignores.is_match(relative) {
250                    return None;
251                }
252            }
253
254            let metadata = std::fs::metadata(&file.path).ok()?;
255
256            let cached_entry = token_cache
257                .as_ref()
258                .and_then(|cache| cache.get(&file.path, &metadata, token_cache_mode));
259            let cache_hit = cached_entry.is_some();
260
261            let (mut entry, suppressions) = if let Some(entry) = cached_entry {
262                let suppressions = entry.suppressions.clone();
263                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
264                    return None;
265                }
266                (entry, suppressions)
267            } else {
268                let source = std::fs::read_to_string(&file.path).ok()?;
269                let suppressions = suppress::parse_suppressions_from_source(&source);
270                if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
271                    return None;
272                }
273
274                // Tokenize (with optional type stripping for cross-language detection)
275                let file_tokens = if strip_types {
276                    tokenize_file_cross_language(&file.path, &source, true, skip_imports)
277                } else {
278                    tokenize_file(&file.path, &source, skip_imports)
279                };
280                if file_tokens.tokens.is_empty() {
281                    return None;
282                }
283
284                // Normalize and hash using resolved normalization flags
285                let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
286                let entry = TokenCacheEntry {
287                    hashed_tokens: hashed,
288                    file_tokens,
289                    suppressions: suppressions.clone(),
290                };
291                (entry, suppressions)
292            };
293            if entry.file_tokens.tokens.is_empty() {
294                return None;
295            }
296            if entry.hashed_tokens.len() < config.min_tokens {
297                return None;
298            }
299
300            Some(TokenizedFile {
301                path: file.path.clone(),
302                hashed_tokens: std::mem::take(&mut entry.hashed_tokens),
303                file_tokens: entry.file_tokens,
304                metadata: Some(metadata),
305                cache_hit,
306                suppressions,
307            })
308        })
309        .collect();
310
311    if let (Some(cache_root), Some(mut cache)) = (cache_root, token_cache) {
312        for file in &file_data {
313            if !file.cache_hit
314                && let Some(metadata) = &file.metadata
315            {
316                cache.insert(
317                    &file.path,
318                    metadata,
319                    token_cache_mode,
320                    &file.hashed_tokens,
321                    &file.file_tokens,
322                    &file.suppressions,
323                );
324            }
325        }
326        cache.retain_paths(files);
327        match cache.save_if_dirty() {
328            Ok(true) => {
329                tracing::debug!(cache_root = %cache_root.display(), "saved duplication token cache");
330            }
331            Ok(false) => {
332                tracing::debug!(cache_root = %cache_root.display(), "duplication token cache unchanged");
333            }
334            Err(err) => {
335                tracing::warn!("Failed to save duplication token cache: {err}");
336            }
337        }
338    }
339
340    tracing::info!(
341        files = file_data.len(),
342        "tokenized files for duplication analysis"
343    );
344
345    if let Some(focus_files) = focus_files
346        && file_data.len() >= config.min_corpus_size_for_shingle_filter
347    {
348        shingle_filter::filter_to_focus_candidates(&mut file_data, focus_files, config.min_tokens);
349    }
350
351    // Collect per-file suppressions for line-level filtering
352    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
353        .iter()
354        .filter(|file| !file.suppressions.is_empty())
355        .map(|file| (file.path.clone(), file.suppressions.clone()))
356        .collect();
357
358    // Strip suppressions from the data passed to the detector
359    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
360        file_data
361            .into_iter()
362            .map(|file| (file.path, file.hashed_tokens, file.file_tokens))
363            .collect();
364
365    // Step 3 & 4: Detect clones
366    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
367    let mut report = if let Some(focus_files) = focus_files {
368        detector.detect_touching_files(detector_data, focus_files)
369    } else {
370        detector.detect(detector_data)
371    };
372
373    // Step 5: Apply line-level suppressions FIRST, so the post-suppression
374    // instance count is what the min-occurrences filter evaluates. Otherwise
375    // a 3-instance clone group whose third instance is line-suppressed would
376    // survive `--min-occurrences 3` and show up as a 2-instance group.
377    if !suppressions_by_file.is_empty() {
378        apply_line_suppressions(&mut report, &suppressions_by_file);
379    }
380
381    // Step 5b: Apply the min-occurrences filter on the post-suppression set.
382    apply_min_occurrences_filter(&mut report, config.min_occurrences);
383
384    let default_ignore_skips =
385        build_default_ignore_skips(extra_ignores.as_ref(), &default_skip_counts);
386
387    // Step 6: Group into families with refactoring suggestions
388    report.clone_families = families::group_into_families(&report.clone_groups, root);
389
390    // Step 7: Detect mirrored directory trees
391    report.mirrored_directories =
392        families::detect_mirrored_directories(&report.clone_families, root);
393
394    // Sort all result arrays for deterministic output ordering.
395    // Parallel tokenization (par_iter) doesn't guarantee collection order.
396    report.sort();
397
398    DuplicationRun {
399        report,
400        default_ignore_skips,
401    }
402}
403
404/// Drop clone groups with fewer than `min` instances and record the count on
405/// the stats block. The detector already guarantees `>= 2`, so this is a
406/// no-op when `min <= 2`.
407///
408/// Stats split: `clone_groups` and `clone_instances` are recomputed
409/// post-filter so they match the serialized array length (a CI consumer
410/// reading `stats.clone_groups` and iterating `clone_groups[]` sees the same
411/// count). `duplication_percentage`, `duplicated_lines`, `duplicated_tokens`,
412/// and `files_with_clones` stay pre-filter so the percentage math (lines /
413/// total) stays consistent and `threshold` gates / trend lines don't shift
414/// when the filter changes. The hidden count is disclosed in
415/// `clone_groups_below_min_occurrences`. The surviving groups feed every
416/// downstream step (families, mirrored dirs, --top, baseline, changed-since,
417/// workspace scoping) so there's a single source of truth.
418fn apply_min_occurrences_filter(report: &mut DuplicationReport, min: usize) {
419    if min <= 2 {
420        return;
421    }
422    let before = report.clone_groups.len();
423    report
424        .clone_groups
425        .retain(|group| group.instances.len() >= min);
426    let hidden = before - report.clone_groups.len();
427    if hidden == 0 {
428        return;
429    }
430    report.stats.clone_groups_below_min_occurrences = hidden;
431    report.stats.clone_groups = report.clone_groups.len();
432    report.stats.clone_instances = report.clone_groups.iter().map(|g| g.instances.len()).sum();
433}
434
435/// Filter out clone instances that are suppressed by line-level comments.
436#[expect(
437    clippy::cast_possible_truncation,
438    reason = "line numbers are bounded by source size"
439)]
440fn apply_line_suppressions(
441    report: &mut DuplicationReport,
442    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
443) {
444    report.clone_groups.retain_mut(|group| {
445        group.instances.retain(|instance| {
446            if let Some(supps) = suppressions_by_file.get(&instance.file) {
447                // Check if any line in the instance range is suppressed
448                for line in instance.start_line..=instance.end_line {
449                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
450                        return false;
451                    }
452                }
453            }
454            true
455        });
456        // Keep group only if it still has 2+ instances
457        group.instances.len() >= 2
458    });
459}
460
461/// Run duplication detection on a project directory using auto-discovered files.
462///
463/// This is a convenience function that handles file discovery internally.
464#[must_use]
465pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
466    let resolved = crate::default_config(root);
467    let files = discover::discover_files_with_plugin_scopes(&resolved);
468    find_duplicates(root, &files, config)
469}
470
471/// Build a merged ignore set from built-in and user-provided duplicates ignores.
472fn build_ignore_set(config: &DuplicatesConfig) -> Option<IgnoreSet> {
473    if !config.ignore_defaults && config.ignore.is_empty() {
474        return None;
475    }
476
477    let mut builder = GlobSetBuilder::new();
478    let mut defaults = Vec::new();
479
480    if config.ignore_defaults {
481        for pattern in DUPES_DEFAULT_IGNORES {
482            match Glob::new(pattern) {
483                Ok(glob) => {
484                    defaults.push((*pattern, glob.compile_matcher()));
485                    builder.add(glob);
486                }
487                Err(e) => {
488                    tracing::warn!("Invalid default duplication ignore pattern '{pattern}': {e}");
489                }
490            }
491        }
492    }
493
494    for pattern in &config.ignore {
495        match Glob::new(pattern) {
496            Ok(glob) => {
497                builder.add(glob);
498            }
499            Err(e) => {
500                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
501            }
502        }
503    }
504
505    builder.build().ok().map(|all| IgnoreSet { all, defaults })
506}
507
508fn build_default_ignore_skips(
509    ignores: Option<&IgnoreSet>,
510    counts: &[AtomicUsize],
511) -> DefaultIgnoreSkips {
512    let Some(ignores) = ignores else {
513        return DefaultIgnoreSkips::default();
514    };
515
516    let by_pattern = ignores
517        .defaults
518        .iter()
519        .zip(counts)
520        .filter_map(|((pattern, _), count)| {
521            let count = count.load(Ordering::Relaxed);
522            (count > 0).then_some(DefaultIgnoreSkipCount { pattern, count })
523        })
524        .collect::<Vec<_>>();
525    let total = by_pattern.iter().map(|entry| entry.count).sum();
526
527    DefaultIgnoreSkips { total, by_pattern }
528}
529
530#[cfg(test)]
531mod tests {
532    use super::*;
533    use crate::discover::FileId;
534
535    #[test]
536    fn find_duplicates_empty_files() {
537        let config = DuplicatesConfig::default();
538        let report = find_duplicates(Path::new("/tmp"), &[], &config);
539        assert!(report.clone_groups.is_empty());
540        assert!(report.clone_families.is_empty());
541        assert_eq!(report.stats.total_files, 0);
542    }
543
544    #[test]
545    fn build_ignore_set_empty() {
546        let config = DuplicatesConfig {
547            ignore_defaults: false,
548            ..DuplicatesConfig::default()
549        };
550        assert!(build_ignore_set(&config).is_none());
551    }
552
553    #[test]
554    fn build_ignore_set_valid_patterns() {
555        let config = DuplicatesConfig {
556            ignore_defaults: false,
557            ignore: vec!["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()],
558            ..DuplicatesConfig::default()
559        };
560        let set = build_ignore_set(&config);
561        assert!(set.is_some());
562        let set = set.unwrap();
563        assert!(set.is_match(Path::new("src/foo.test.ts")));
564        assert!(set.is_match(Path::new("src/bar.spec.ts")));
565        assert!(!set.is_match(Path::new("src/baz.ts")));
566    }
567
568    #[test]
569    fn build_ignore_set_merges_defaults_with_user_patterns() {
570        let config = DuplicatesConfig {
571            ignore: vec!["**/foo/**".to_string()],
572            ..DuplicatesConfig::default()
573        };
574        let set = build_ignore_set(&config).expect("ignore set");
575        assert!(set.is_match(Path::new(".next/static/chunks/app.js")));
576        assert!(set.is_match(Path::new("src/foo/generated.js")));
577    }
578
579    #[test]
580    fn build_ignore_set_ignore_defaults_false_uses_only_user_patterns() {
581        let config = DuplicatesConfig {
582            ignore_defaults: false,
583            ignore: vec!["**/foo/**".to_string()],
584            ..DuplicatesConfig::default()
585        };
586        let set = build_ignore_set(&config).expect("ignore set");
587        assert!(!set.is_match(Path::new(".next/static/chunks/app.js")));
588        assert!(set.is_match(Path::new("src/foo/generated.js")));
589    }
590
591    #[test]
592    fn find_duplicates_with_real_files() {
593        // Create a temp directory with duplicate files
594        let dir = tempfile::tempdir().expect("create temp dir");
595        let src_dir = dir.path().join("src");
596        std::fs::create_dir_all(&src_dir).expect("create src dir");
597
598        let code = r#"
599export function processData(input: string): string {
600    const trimmed = input.trim();
601    if (trimmed.length === 0) {
602        return "";
603    }
604    const parts = trimmed.split(",");
605    const filtered = parts.filter(p => p.length > 0);
606    const mapped = filtered.map(p => p.toUpperCase());
607    return mapped.join(", ");
608}
609
610export function validateInput(data: string): boolean {
611    if (data === null || data === undefined) {
612        return false;
613    }
614    const cleaned = data.trim();
615    if (cleaned.length < 3) {
616        return false;
617    }
618    return true;
619}
620"#;
621
622        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
623        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
624        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
625            .expect("write package.json");
626
627        let files = vec![
628            DiscoveredFile {
629                id: FileId(0),
630                path: src_dir.join("original.ts"),
631                size_bytes: code.len() as u64,
632            },
633            DiscoveredFile {
634                id: FileId(1),
635                path: src_dir.join("copy.ts"),
636                size_bytes: code.len() as u64,
637            },
638        ];
639
640        let config = DuplicatesConfig {
641            min_tokens: 10,
642            min_lines: 2,
643            ..DuplicatesConfig::default()
644        };
645
646        let report = find_duplicates(dir.path(), &files, &config);
647        assert!(
648            !report.clone_groups.is_empty(),
649            "Should detect clones in identical files"
650        );
651        assert!(report.stats.files_with_clones >= 2);
652
653        // Should also have clone families
654        assert!(
655            !report.clone_families.is_empty(),
656            "Should group clones into families"
657        );
658    }
659
660    #[test]
661    fn find_duplicates_cached_skips_token_cache_for_small_corpus() {
662        let dir = tempfile::tempdir().expect("create temp dir");
663        let src_dir = dir.path().join("src");
664        std::fs::create_dir_all(&src_dir).expect("create src dir");
665
666        let code = "export function same(input: number): number {\n  const doubled = input * 2;\n  return doubled + 1;\n}\n";
667        let first = src_dir.join("first.ts");
668        let second = src_dir.join("second.ts");
669        std::fs::write(&first, code).expect("write first");
670        std::fs::write(&second, code).expect("write second");
671
672        let files = vec![
673            DiscoveredFile {
674                id: FileId(0),
675                path: first,
676                size_bytes: code.len() as u64,
677            },
678            DiscoveredFile {
679                id: FileId(1),
680                path: second,
681                size_bytes: code.len() as u64,
682            },
683        ];
684        let config = DuplicatesConfig {
685            min_tokens: 5,
686            min_lines: 2,
687            ..DuplicatesConfig::default()
688        };
689        let cache_root = dir.path().join(".fallow");
690
691        let report = find_duplicates_cached(dir.path(), &files, &config, &cache_root);
692
693        assert!(!report.clone_groups.is_empty());
694        assert!(
695            !cache_root.exists(),
696            "small projects should avoid token-cache IO overhead"
697        );
698    }
699
700    #[test]
701    fn find_duplicates_touching_files_keeps_cross_corpus_matches_only_for_focus() {
702        let dir = tempfile::tempdir().expect("create temp dir");
703        let src_dir = dir.path().join("src");
704        std::fs::create_dir_all(&src_dir).expect("create src dir");
705
706        let focused_code = r"
707export function focused(input: number): number {
708    const doubled = input * 2;
709    const shifted = doubled + 10;
710    return shifted / 2;
711}
712";
713        let untouched_code = r#"
714export function untouched(input: string): string {
715    const lowered = input.toLowerCase();
716    const padded = lowered.padStart(10, "x");
717    return padded.slice(0, 8);
718}
719"#;
720
721        let changed_path = src_dir.join("changed.ts");
722        let focused_copy_path = src_dir.join("focused-copy.ts");
723        let untouched_a_path = src_dir.join("untouched-a.ts");
724        let untouched_b_path = src_dir.join("untouched-b.ts");
725        std::fs::write(&changed_path, focused_code).expect("write changed");
726        std::fs::write(&focused_copy_path, focused_code).expect("write focused copy");
727        std::fs::write(&untouched_a_path, untouched_code).expect("write untouched a");
728        std::fs::write(&untouched_b_path, untouched_code).expect("write untouched b");
729
730        let files = vec![
731            DiscoveredFile {
732                id: FileId(0),
733                path: changed_path.clone(),
734                size_bytes: focused_code.len() as u64,
735            },
736            DiscoveredFile {
737                id: FileId(1),
738                path: focused_copy_path,
739                size_bytes: focused_code.len() as u64,
740            },
741            DiscoveredFile {
742                id: FileId(2),
743                path: untouched_a_path,
744                size_bytes: untouched_code.len() as u64,
745            },
746            DiscoveredFile {
747                id: FileId(3),
748                path: untouched_b_path,
749                size_bytes: untouched_code.len() as u64,
750            },
751        ];
752
753        let config = DuplicatesConfig {
754            mode: DetectionMode::Strict,
755            min_tokens: 5,
756            min_lines: 2,
757            min_corpus_size_for_shingle_filter: 1,
758            ..DuplicatesConfig::default()
759        };
760        let mut focus = FxHashSet::default();
761        focus.insert(changed_path.clone());
762
763        let full_report = find_duplicates(dir.path(), &files, &config);
764        let report = find_duplicates_touching_files(dir.path(), &files, &config, &focus);
765        let expected_touching = full_report
766            .clone_groups
767            .iter()
768            .filter(|group| {
769                group
770                    .instances
771                    .iter()
772                    .any(|instance| instance.file == changed_path)
773            })
774            .count();
775
776        assert!(
777            !report.clone_groups.is_empty(),
778            "focused file should still match an unchanged duplicate"
779        );
780        assert_eq!(
781            report.clone_groups.len(),
782            expected_touching,
783            "focused shingle filtering must not drop clone groups touching the focused file"
784        );
785        assert!(report.clone_groups.iter().all(|group| {
786            group
787                .instances
788                .iter()
789                .any(|instance| instance.file == changed_path)
790        }));
791    }
792
#[test]
fn file_wide_suppression_excludes_file() {
    // Two files share an identical function body, but one of them starts with
    // the `fallow-ignore-file code-duplication` directive. The report must
    // therefore contain no clone groups at all.
    let workspace = tempfile::tempdir().expect("create temp dir");
    let root = workspace.path();
    let src_dir = root.join("src");
    std::fs::create_dir_all(&src_dir).expect("create src dir");

    let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
    // Same body, prefixed with the file-wide suppression directive.
    let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

    let original_path = src_dir.join("original.ts");
    let suppressed_path = src_dir.join("suppressed.ts");
    std::fs::write(&original_path, code).expect("write original");
    std::fs::write(&suppressed_path, &suppressed_code).expect("write suppressed");
    std::fs::write(root.join("package.json"), r#"{"name": "test"}"#)
        .expect("write package.json");

    // (id, on-disk path, source length) for each fixture file.
    let files: Vec<DiscoveredFile> = vec![
        (FileId(0), original_path, code.len()),
        (FileId(1), suppressed_path, suppressed_code.len()),
    ]
    .into_iter()
    .map(|(id, path, len)| DiscoveredFile {
        id,
        path,
        size_bytes: len as u64,
    })
    .collect();

    // Low thresholds so the small fixture would qualify as a clone if both
    // files were analysed.
    let config = DuplicatesConfig {
        min_tokens: 10,
        min_lines: 2,
        ..DuplicatesConfig::default()
    };

    let report = find_duplicates(root, &files, &config);
    // Only two files exist and one is suppressed, so nothing can pair up.
    let groups = &report.clone_groups;
    assert!(
        groups.is_empty(),
        "File-wide suppression should exclude file from duplication analysis"
    );
}
844
#[test]
fn min_occurrences_hides_pairs_and_records_count() {
    // Fixture layout: `block_a` is written to two files (a pair) and
    // `block_b` to three files (a triple). Raising `min_occurrences` to 3
    // must hide the pair, count it in the stats, and leave the triple intact.
    let workspace = tempfile::tempdir().expect("create temp dir");
    let root = workspace.path();
    let src_dir = root.join("src");
    std::fs::create_dir_all(&src_dir).expect("create src dir");

    let block_a = r#"
export function blockA(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
    let block_b = r"
export function blockB(value: number): number {
    if (value <= 0) {
        return 0;
    }
    let total = 0;
    for (let i = 1; i <= value; i += 1) {
        total += i * 2;
        total -= 1;
    }
    return total + 7;
}
";

    // (id, file name, contents) for every fixture file, in id order.
    let fixtures = vec![
        (FileId(0), "pair-a1.ts", block_a),
        (FileId(1), "pair-a2.ts", block_a),
        (FileId(2), "triple-b1.ts", block_b),
        (FileId(3), "triple-b2.ts", block_b),
        (FileId(4), "triple-b3.ts", block_b),
    ];

    // Write each fixture to disk and describe it for the analyzer.
    let files: Vec<DiscoveredFile> = fixtures
        .into_iter()
        .map(|(id, name, contents)| {
            let path = src_dir.join(name);
            std::fs::write(&path, contents).expect("write");
            DiscoveredFile {
                id,
                path,
                size_bytes: contents.len() as u64,
            }
        })
        .collect();

    // Baseline run: `min_occurrences` left at its default, so both the pair
    // and the triple are expected in the report.
    let default_config = DuplicatesConfig {
        min_tokens: 10,
        min_lines: 2,
        ..DuplicatesConfig::default()
    };
    let baseline = find_duplicates(root, &files, &default_config);
    assert_eq!(
        baseline.clone_groups.len(),
        2,
        "default minOccurrences should report both the pair and the triple"
    );
    assert_eq!(
        baseline.stats.clone_groups_below_min_occurrences, 0,
        "default minOccurrences hides nothing"
    );
    let baseline_pct = baseline.stats.duplication_percentage;

    // Raised run: with `min_occurrences = 3` only the triple may survive.
    let raised_config = DuplicatesConfig {
        min_tokens: 10,
        min_lines: 2,
        min_occurrences: 3,
        ..DuplicatesConfig::default()
    };
    let report = find_duplicates(root, &files, &raised_config);
    assert_eq!(
        report.clone_groups.len(),
        1,
        "minOccurrences=3 should hide the 2-instance group"
    );
    assert_eq!(
        report.clone_groups[0].instances.len(),
        3,
        "surviving group must be the 3-instance group"
    );
    assert_eq!(
        report.stats.clone_groups_below_min_occurrences, 1,
        "the hidden 2-instance group must be counted"
    );

    // The group and instance counters describe the filtered set, so they
    // line up with what a consumer sees when iterating `clone_groups`.
    assert_eq!(
        report.stats.clone_groups, 1,
        "stats.clone_groups must match the post-filter array length"
    );
    assert_eq!(
        report.stats.clone_instances, 3,
        "stats.clone_instances must match the surviving instance total"
    );

    // The percentage is expected to be computed before the filter, so it
    // must be the same in both runs.
    let pct_drift = (report.stats.duplication_percentage - baseline_pct).abs();
    assert!(
        pct_drift < f64::EPSILON,
        "duplication_percentage should not shift when minOccurrences changes"
    );
}
975
#[test]
fn min_occurrences_evaluates_after_line_suppressions() {
    // Three files contain the same function. The third one carries a
    // suppression directive, which leaves only two unsuppressed copies.
    // With `min_occurrences = 3` that remaining pair must be hidden rather
    // than reported as a 2-instance clone: the threshold applies to the
    // count left after suppression, not to the raw number of copies.
    //
    // NOTE(review): despite the test name, the fixture uses the file-wide
    // `fallow-ignore-file` directive, not a per-line one.
    let workspace = tempfile::tempdir().expect("create temp dir");
    let root = workspace.path();
    let src_dir = root.join("src");
    std::fs::create_dir_all(&src_dir).expect("create src dir");

    let block = r#"
export function shared(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
    let suppressed = format!("// fallow-ignore-file code-duplication\n{block}");

    let path_a = src_dir.join("a.ts");
    let path_b = src_dir.join("b.ts");
    let path_c = src_dir.join("c.ts");
    std::fs::write(&path_a, block).expect("write a");
    std::fs::write(&path_b, block).expect("write b");
    std::fs::write(&path_c, &suppressed).expect("write c");

    // (id, on-disk path, source length) for each fixture file.
    let files: Vec<DiscoveredFile> = vec![
        (FileId(0), path_a, block.len()),
        (FileId(1), path_b, block.len()),
        (FileId(2), path_c, suppressed.len()),
    ]
    .into_iter()
    .map(|(id, path, len)| DiscoveredFile {
        id,
        path,
        size_bytes: len as u64,
    })
    .collect();

    let config = DuplicatesConfig {
        min_tokens: 10,
        min_lines: 2,
        min_occurrences: 3,
        ..DuplicatesConfig::default()
    };
    let report = find_duplicates(root, &files, &config);

    // Instance count per reported group, shown only if the assertion fails.
    let group_sizes = report
        .clone_groups
        .iter()
        .map(|group| group.instances.len())
        .collect::<Vec<_>>();
    assert!(
        report.clone_groups.is_empty(),
        "post-suppression 2-instance group must be hidden by minOccurrences=3, got groups: {:?}",
        group_sizes
    );

    // The counters must agree with the empty group list.
    assert_eq!(
        report.stats.clone_groups, 0,
        "stats.clone_groups must match the empty post-filter array"
    );
    assert_eq!(
        report.stats.clone_instances, 0,
        "stats.clone_instances must match the empty post-filter array"
    );
}
1052}
fallow_core/duplicates/mod.rs

fallow_core/duplicates/
mod.rs