// fallow_core/duplicates/mod.rs
//! Code duplication / clone detection module.
//!
//! This module implements suffix array + LCP based clone detection
//! for JavaScript/TypeScript source files. It supports multiple detection
//! modes from strict (exact matches only) to semantic (structure-aware
//! matching that ignores identifier names and literal values).

pub mod detect;
pub mod normalize;
pub mod tokenize;
pub mod types;

use std::path::{Path, PathBuf};

use globset::{Glob, GlobSet, GlobSetBuilder};
use rayon::prelude::*;

use detect::CloneDetector;
use normalize::normalize_and_hash;
use tokenize::tokenize_file;
pub use types::{
    CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport, DuplicationStats,
};

use crate::discover::{self, DiscoveredFile};
27/// Run duplication detection on the given files.
28///
29/// This is the main entry point for the duplication analysis. It:
30/// 1. Reads and tokenizes all source files in parallel
31/// 2. Normalizes tokens according to the detection mode
32/// 3. Runs suffix array + LCP clone detection
33/// 4. Groups and reports clone instances
34pub fn find_duplicates(
35    root: &Path,
36    files: &[DiscoveredFile],
37    config: &DuplicatesConfig,
38) -> DuplicationReport {
39    let _span = tracing::info_span!("find_duplicates").entered();
40
41    // Build extra ignore patterns for duplication analysis
42    let extra_ignores = build_ignore_set(&config.ignore);
43
44    // Step 1 & 2: Tokenize and normalize all files in parallel
45    let file_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> = files
46        .par_iter()
47        .filter_map(|file| {
48            // Apply extra ignore patterns
49            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
50            if let Some(ref ignores) = extra_ignores
51                && ignores.is_match(relative)
52            {
53                return None;
54            }
55
56            // Read the file
57            let source = std::fs::read_to_string(&file.path).ok()?;
58
59            // Tokenize
60            let file_tokens = tokenize_file(&file.path, &source);
61            if file_tokens.tokens.is_empty() {
62                return None;
63            }
64
65            // Normalize and hash
66            let hashed = normalize_and_hash(&file_tokens.tokens, config.mode);
67            if hashed.len() < config.min_tokens {
68                return None;
69            }
70
71            Some((file.path.clone(), hashed, file_tokens))
72        })
73        .collect();
74
75    tracing::info!(
76        files = file_data.len(),
77        "tokenized files for duplication analysis"
78    );
79
80    // Step 3 & 4: Detect clones
81    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
82    detector.detect(file_data)
83}
84
85/// Run duplication detection on a project directory using auto-discovered files.
86///
87/// This is a convenience function that handles file discovery internally.
88pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
89    let resolved = crate::default_config(root);
90    let files = discover::discover_files(&resolved);
91    find_duplicates(root, &files, config)
92}
93
94/// Build a GlobSet from ignore patterns.
95fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
96    if patterns.is_empty() {
97        return None;
98    }
99
100    let mut builder = GlobSetBuilder::new();
101    for pattern in patterns {
102        match Glob::new(pattern) {
103            Ok(glob) => {
104                builder.add(glob);
105            }
106            Err(e) => {
107                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
108            }
109        }
110    }
111
112    builder.build().ok()
113}
114
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    #[test]
    fn find_duplicates_empty_files() {
        // An empty file list must yield an empty, well-formed report — not an error.
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn build_ignore_set_empty() {
        // With no patterns configured, no glob set is built at all.
        assert!(build_ignore_set(&[]).is_none());
    }

    #[test]
    fn build_ignore_set_valid_patterns() {
        // Valid globs should match the intended test/spec files and nothing else.
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    #[test]
    fn find_duplicates_with_real_files() {
        // End-to-end check: two byte-identical TypeScript files on disk must be
        // reported as clones of each other.
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        // package.json marks the temp dir as a JS/TS project root for discovery.
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Lower the thresholds so the short fixture above is large enough to
        // trip clone detection.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);
    }
}
205}