// fallow_core/duplicates/mod.rs

1//! Code duplication / clone detection module.
2//!
3//! This module implements suffix array + LCP based clone detection
4//! for TypeScript/JavaScript source files. It supports multiple detection
5//! modes from strict (exact matches only) to semantic (structure-aware
6//! matching that ignores identifier names and literal values).
7
8pub mod detect;
9pub mod families;
10pub mod normalize;
11pub mod token_types;
12mod token_visitor;
13pub mod tokenize;
14pub(crate) mod types;
15
16use rustc_hash::FxHashMap;
17use std::path::{Path, PathBuf};
18
19use globset::{Glob, GlobSet, GlobSetBuilder};
20use rayon::prelude::*;
21
22use detect::CloneDetector;
23use normalize::normalize_and_hash_resolved;
24use tokenize::{tokenize_file, tokenize_file_cross_language};
25pub use types::{
26    CloneFamily, CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport,
27    DuplicationStats, MirroredDirectory, RefactoringKind, RefactoringSuggestion,
28};
29
30use crate::discover::{self, DiscoveredFile};
31use crate::suppress::{self, IssueKind, Suppression};
32
33/// Run duplication detection on the given files.
34///
35/// This is the main entry point for the duplication analysis. It:
36/// 1. Reads and tokenizes all source files in parallel
37/// 2. Normalizes tokens according to the detection mode
38/// 3. Runs suffix array + LCP clone detection
39/// 4. Groups clone instances into families with refactoring suggestions
40/// 5. Applies inline suppression filters
41pub fn find_duplicates(
42    root: &Path,
43    files: &[DiscoveredFile],
44    config: &DuplicatesConfig,
45) -> DuplicationReport {
46    let _span = tracing::info_span!("find_duplicates").entered();
47
48    // Build extra ignore patterns for duplication analysis
49    let extra_ignores = build_ignore_set(&config.ignore);
50
51    // Resolve normalization: mode defaults + user overrides
52    let normalization =
53        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
54
55    let strip_types = config.cross_language;
56    let skip_imports = config.ignore_imports;
57
58    tracing::debug!(
59        ignore_imports = skip_imports,
60        "duplication tokenization config"
61    );
62
63    // Step 1 & 2: Tokenize and normalize all files in parallel, also parse suppressions
64    let file_data: Vec<(
65        PathBuf,
66        Vec<normalize::HashedToken>,
67        tokenize::FileTokens,
68        Vec<Suppression>,
69    )> = files
70        .par_iter()
71        .filter_map(|file| {
72            // Apply extra ignore patterns
73            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
74            if let Some(ref ignores) = extra_ignores
75                && ignores.is_match(relative)
76            {
77                return None;
78            }
79
80            // Read the file
81            let source = std::fs::read_to_string(&file.path).ok()?;
82
83            // Parse inline suppression comments
84            let suppressions = suppress::parse_suppressions_from_source(&source);
85
86            // Check for file-wide code-duplication suppression
87            if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
88                return None;
89            }
90
91            // Tokenize (with optional type stripping for cross-language detection)
92            let file_tokens = if strip_types {
93                tokenize_file_cross_language(&file.path, &source, true, skip_imports)
94            } else {
95                tokenize_file(&file.path, &source, skip_imports)
96            };
97            if file_tokens.tokens.is_empty() {
98                return None;
99            }
100
101            // Normalize and hash using resolved normalization flags
102            let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
103            if hashed.len() < config.min_tokens {
104                return None;
105            }
106
107            Some((file.path.clone(), hashed, file_tokens, suppressions))
108        })
109        .collect();
110
111    tracing::info!(
112        files = file_data.len(),
113        "tokenized files for duplication analysis"
114    );
115
116    // Collect per-file suppressions for line-level filtering
117    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
118        .iter()
119        .filter(|(_, _, _, supps)| !supps.is_empty())
120        .map(|(path, _, _, supps)| (path.clone(), supps.clone()))
121        .collect();
122
123    // Strip suppressions from the data passed to the detector
124    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
125        file_data
126            .into_iter()
127            .map(|(path, hashed, tokens, _)| (path, hashed, tokens))
128            .collect();
129
130    // Step 3 & 4: Detect clones
131    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
132    let mut report = detector.detect(detector_data);
133
134    // Step 5: Apply line-level suppressions
135    if !suppressions_by_file.is_empty() {
136        apply_line_suppressions(&mut report, &suppressions_by_file);
137    }
138
139    // Step 6: Group into families with refactoring suggestions
140    report.clone_families = families::group_into_families(&report.clone_groups, root);
141
142    // Step 7: Detect mirrored directory trees
143    report.mirrored_directories =
144        families::detect_mirrored_directories(&report.clone_families, root);
145
146    // Sort all result arrays for deterministic output ordering.
147    // Parallel tokenization (par_iter) doesn't guarantee collection order.
148    report.sort();
149
150    report
151}
152
153/// Filter out clone instances that are suppressed by line-level comments.
154#[expect(
155    clippy::cast_possible_truncation,
156    reason = "line numbers are bounded by source size"
157)]
158fn apply_line_suppressions(
159    report: &mut DuplicationReport,
160    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
161) {
162    report.clone_groups.retain_mut(|group| {
163        group.instances.retain(|instance| {
164            if let Some(supps) = suppressions_by_file.get(&instance.file) {
165                // Check if any line in the instance range is suppressed
166                for line in instance.start_line..=instance.end_line {
167                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
168                        return false;
169                    }
170                }
171            }
172            true
173        });
174        // Keep group only if it still has 2+ instances
175        group.instances.len() >= 2
176    });
177}
178
179/// Run duplication detection on a project directory using auto-discovered files.
180///
181/// This is a convenience function that handles file discovery internally.
182#[must_use]
183pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
184    let resolved = crate::default_config(root);
185    let files = discover::discover_files(&resolved);
186    find_duplicates(root, &files, config)
187}
188
189/// Build a `GlobSet` from ignore patterns.
190fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
191    if patterns.is_empty() {
192        return None;
193    }
194
195    let mut builder = GlobSetBuilder::new();
196    for pattern in patterns {
197        match Glob::new(pattern) {
198            Ok(glob) => {
199                builder.add(glob);
200            }
201            Err(e) => {
202                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
203            }
204        }
205    }
206
207    builder.build().ok()
208}
209
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    // No input files should produce a completely empty report.
    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    // An empty pattern list yields no GlobSet at all (None, not an empty set).
    #[test]
    fn build_ignore_set_empty() {
        assert!(build_ignore_set(&[]).is_none());
    }

    // Valid glob patterns match the intended files and nothing else.
    #[test]
    fn build_ignore_set_valid_patterns() {
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    // End-to-end: two byte-identical files on disk must produce clone groups
    // and clone families.
    #[test]
    fn find_duplicates_with_real_files() {
        // Create a temp directory with duplicate files
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // TypeScript fixture long enough to clear the min_tokens threshold below.
        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Lower the thresholds so the small fixture is eligible for detection.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        // Should also have clone families
        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    // A file-wide `fallow-ignore-file` comment must remove the file from the
    // analysis entirely, so its duplicate partner has nothing to match against.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        // Same code, but prefixed with a file-wide suppression comment.
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        // With only 2 files and one suppressed, there should be no clones
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}
359}