Skip to main content

fallow_core/duplicates/
mod.rs

1//! Code duplication / clone detection module.
2//!
3//! This module implements suffix array + LCP based clone detection
4//! for TypeScript/JavaScript source files. It supports multiple detection
5//! modes from strict (exact matches only) to semantic (structure-aware
6//! matching that ignores identifier names and literal values).
7
8pub mod detect;
9pub mod families;
10pub mod normalize;
11pub mod token_types;
12mod token_visitor;
13pub mod tokenize;
14pub(crate) mod types;
15
16use rustc_hash::FxHashMap;
17use std::path::{Path, PathBuf};
18
19use globset::{Glob, GlobSet, GlobSetBuilder};
20use rayon::prelude::*;
21
22use detect::CloneDetector;
23use normalize::normalize_and_hash_resolved;
24use tokenize::{tokenize_file, tokenize_file_cross_language};
25pub use types::{
26    CloneFamily, CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport,
27    DuplicationStats, MirroredDirectory, RefactoringKind, RefactoringSuggestion,
28};
29
30use crate::discover::{self, DiscoveredFile};
31use crate::suppress::{self, IssueKind, Suppression};
32
33/// Run duplication detection on the given files.
34///
35/// This is the main entry point for the duplication analysis. It:
36/// 1. Reads and tokenizes all source files in parallel
37/// 2. Normalizes tokens according to the detection mode
38/// 3. Runs suffix array + LCP clone detection
39/// 4. Groups clone instances into families with refactoring suggestions
40/// 5. Applies inline suppression filters
41pub fn find_duplicates(
42    root: &Path,
43    files: &[DiscoveredFile],
44    config: &DuplicatesConfig,
45) -> DuplicationReport {
46    let _span = tracing::info_span!("find_duplicates").entered();
47
48    // Build extra ignore patterns for duplication analysis
49    let extra_ignores = build_ignore_set(&config.ignore);
50
51    // Resolve normalization: mode defaults + user overrides
52    let normalization =
53        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
54
55    let strip_types = config.cross_language;
56
57    // Step 1 & 2: Tokenize and normalize all files in parallel, also parse suppressions
58    let file_data: Vec<(
59        PathBuf,
60        Vec<normalize::HashedToken>,
61        tokenize::FileTokens,
62        Vec<Suppression>,
63    )> = files
64        .par_iter()
65        .filter_map(|file| {
66            // Apply extra ignore patterns
67            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
68            if let Some(ref ignores) = extra_ignores
69                && ignores.is_match(relative)
70            {
71                return None;
72            }
73
74            // Read the file
75            let source = std::fs::read_to_string(&file.path).ok()?;
76
77            // Parse inline suppression comments
78            let suppressions = suppress::parse_suppressions_from_source(&source);
79
80            // Check for file-wide code-duplication suppression
81            if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
82                return None;
83            }
84
85            // Tokenize (with optional type stripping for cross-language detection)
86            let file_tokens = if strip_types {
87                tokenize_file_cross_language(&file.path, &source, true)
88            } else {
89                tokenize_file(&file.path, &source)
90            };
91            if file_tokens.tokens.is_empty() {
92                return None;
93            }
94
95            // Normalize and hash using resolved normalization flags
96            let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
97            if hashed.len() < config.min_tokens {
98                return None;
99            }
100
101            Some((file.path.clone(), hashed, file_tokens, suppressions))
102        })
103        .collect();
104
105    tracing::info!(
106        files = file_data.len(),
107        "tokenized files for duplication analysis"
108    );
109
110    // Collect per-file suppressions for line-level filtering
111    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
112        .iter()
113        .filter(|(_, _, _, supps)| !supps.is_empty())
114        .map(|(path, _, _, supps)| (path.clone(), supps.clone()))
115        .collect();
116
117    // Strip suppressions from the data passed to the detector
118    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
119        file_data
120            .into_iter()
121            .map(|(path, hashed, tokens, _)| (path, hashed, tokens))
122            .collect();
123
124    // Step 3 & 4: Detect clones
125    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
126    let mut report = detector.detect(detector_data);
127
128    // Step 5: Apply line-level suppressions
129    if !suppressions_by_file.is_empty() {
130        apply_line_suppressions(&mut report, &suppressions_by_file);
131    }
132
133    // Step 6: Group into families with refactoring suggestions
134    report.clone_families = families::group_into_families(&report.clone_groups, root);
135
136    // Step 7: Detect mirrored directory trees
137    report.mirrored_directories =
138        families::detect_mirrored_directories(&report.clone_families, root);
139
140    // Sort all result arrays for deterministic output ordering.
141    // Parallel tokenization (par_iter) doesn't guarantee collection order.
142    report.sort();
143
144    report
145}
146
147/// Filter out clone instances that are suppressed by line-level comments.
148#[expect(
149    clippy::cast_possible_truncation,
150    reason = "line numbers are bounded by source size"
151)]
152fn apply_line_suppressions(
153    report: &mut DuplicationReport,
154    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
155) {
156    report.clone_groups.retain_mut(|group| {
157        group.instances.retain(|instance| {
158            if let Some(supps) = suppressions_by_file.get(&instance.file) {
159                // Check if any line in the instance range is suppressed
160                for line in instance.start_line..=instance.end_line {
161                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
162                        return false;
163                    }
164                }
165            }
166            true
167        });
168        // Keep group only if it still has 2+ instances
169        group.instances.len() >= 2
170    });
171}
172
173/// Run duplication detection on a project directory using auto-discovered files.
174///
175/// This is a convenience function that handles file discovery internally.
176#[must_use]
177pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
178    let resolved = crate::default_config(root);
179    let files = discover::discover_files(&resolved);
180    find_duplicates(root, &files, config)
181}
182
183/// Build a `GlobSet` from ignore patterns.
184fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
185    if patterns.is_empty() {
186        return None;
187    }
188
189    let mut builder = GlobSetBuilder::new();
190    for pattern in patterns {
191        match Glob::new(pattern) {
192            Ok(glob) => {
193                builder.add(glob);
194            }
195            Err(e) => {
196                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
197            }
198        }
199    }
200
201    builder.build().ok()
202}
203
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    // An empty file list must yield an empty report with zeroed stats.
    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    // No patterns configured -> no GlobSet is built at all.
    #[test]
    fn build_ignore_set_empty() {
        assert!(build_ignore_set(&[]).is_none());
    }

    // Valid glob patterns produce a set that matches (only) the intended paths.
    #[test]
    fn build_ignore_set_valid_patterns() {
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    // End-to-end check: two byte-identical files on disk should be reported
    // as clones and grouped into at least one family.
    #[test]
    fn find_duplicates_with_real_files() {
        // Create a temp directory with duplicate files
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // TypeScript fixture long enough to clear min_tokens/min_lines below.
        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Lowered thresholds so the small fixture still registers as a clone.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        // Should also have clone families
        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    // A file-wide `fallow-ignore-file code-duplication` comment must remove
    // the file from analysis entirely, leaving its twin without a clone pair.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        // Same content, but prefixed with the file-wide suppression comment.
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        // With only 2 files and one suppressed, there should be no clones
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}