Skip to main content

fallow_core/duplicates/
mod.rs

//! Code duplication / clone detection module.
//!
//! This module implements suffix array + LCP based clone detection
//! for JavaScript/TypeScript source files. It supports multiple detection
//! modes from strict (exact matches only) to semantic (structure-aware
//! matching that ignores identifier names and literal values).

8pub mod detect;
9pub mod families;
10pub mod normalize;
11pub mod tokenize;
12pub(crate) mod types;
13
14use std::collections::HashMap;
15use std::path::{Path, PathBuf};
16
17use globset::{Glob, GlobSet, GlobSetBuilder};
18use rayon::prelude::*;
19
20use detect::CloneDetector;
21use normalize::normalize_and_hash;
22use tokenize::tokenize_file;
23pub use types::{
24    CloneFamily, CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport,
25    DuplicationStats, RefactoringKind, RefactoringSuggestion,
26};
27
28use crate::discover::{self, DiscoveredFile};
29use crate::suppress::{self, IssueKind, Suppression};
30
31/// Run duplication detection on the given files.
32///
33/// This is the main entry point for the duplication analysis. It:
34/// 1. Reads and tokenizes all source files in parallel
35/// 2. Normalizes tokens according to the detection mode
36/// 3. Runs suffix array + LCP clone detection
37/// 4. Groups clone instances into families with refactoring suggestions
38/// 5. Applies inline suppression filters
39pub fn find_duplicates(
40    root: &Path,
41    files: &[DiscoveredFile],
42    config: &DuplicatesConfig,
43) -> DuplicationReport {
44    let _span = tracing::info_span!("find_duplicates").entered();
45
46    // Build extra ignore patterns for duplication analysis
47    let extra_ignores = build_ignore_set(&config.ignore);
48
49    // Step 1 & 2: Tokenize and normalize all files in parallel, also parse suppressions
50    let file_data: Vec<(
51        PathBuf,
52        Vec<normalize::HashedToken>,
53        tokenize::FileTokens,
54        Vec<Suppression>,
55    )> = files
56        .par_iter()
57        .filter_map(|file| {
58            // Apply extra ignore patterns
59            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
60            if let Some(ref ignores) = extra_ignores
61                && ignores.is_match(relative)
62            {
63                return None;
64            }
65
66            // Read the file
67            let source = std::fs::read_to_string(&file.path).ok()?;
68
69            // Parse inline suppression comments
70            let suppressions = suppress::parse_suppressions_from_source(&source);
71
72            // Check for file-wide code-duplication suppression
73            if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
74                return None;
75            }
76
77            // Tokenize
78            let file_tokens = tokenize_file(&file.path, &source);
79            if file_tokens.tokens.is_empty() {
80                return None;
81            }
82
83            // Normalize and hash
84            let hashed = normalize_and_hash(&file_tokens.tokens, config.mode);
85            if hashed.len() < config.min_tokens {
86                return None;
87            }
88
89            Some((file.path.clone(), hashed, file_tokens, suppressions))
90        })
91        .collect();
92
93    tracing::info!(
94        files = file_data.len(),
95        "tokenized files for duplication analysis"
96    );
97
98    // Collect per-file suppressions for line-level filtering
99    let suppressions_by_file: HashMap<PathBuf, Vec<Suppression>> = file_data
100        .iter()
101        .filter(|(_, _, _, supps)| !supps.is_empty())
102        .map(|(path, _, _, supps)| (path.clone(), supps.clone()))
103        .collect();
104
105    // Strip suppressions from the data passed to the detector
106    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
107        file_data
108            .into_iter()
109            .map(|(path, hashed, tokens, _)| (path, hashed, tokens))
110            .collect();
111
112    // Step 3 & 4: Detect clones
113    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
114    let mut report = detector.detect(detector_data);
115
116    // Step 5: Apply line-level suppressions
117    if !suppressions_by_file.is_empty() {
118        apply_line_suppressions(&mut report, &suppressions_by_file);
119    }
120
121    // Step 6: Group into families with refactoring suggestions
122    report.clone_families = families::group_into_families(&report.clone_groups);
123
124    report
125}
126
127/// Filter out clone instances that are suppressed by line-level comments.
128fn apply_line_suppressions(
129    report: &mut DuplicationReport,
130    suppressions_by_file: &HashMap<PathBuf, Vec<Suppression>>,
131) {
132    report.clone_groups.retain_mut(|group| {
133        group.instances.retain(|instance| {
134            if let Some(supps) = suppressions_by_file.get(&instance.file) {
135                // Check if any line in the instance range is suppressed
136                for line in instance.start_line..=instance.end_line {
137                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
138                        return false;
139                    }
140                }
141            }
142            true
143        });
144        // Keep group only if it still has 2+ instances
145        group.instances.len() >= 2
146    });
147}
148
149/// Run duplication detection on a project directory using auto-discovered files.
150///
151/// This is a convenience function that handles file discovery internally.
152pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
153    let resolved = crate::default_config(root);
154    let files = discover::discover_files(&resolved);
155    find_duplicates(root, &files, config)
156}
157
158/// Build a GlobSet from ignore patterns.
159fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
160    if patterns.is_empty() {
161        return None;
162    }
163
164    let mut builder = GlobSetBuilder::new();
165    for pattern in patterns {
166        match Glob::new(pattern) {
167            Ok(glob) => {
168                builder.add(glob);
169            }
170            Err(e) => {
171                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
172            }
173        }
174    }
175
176    builder.build().ok()
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    // An empty file list must yield an empty report with zeroed stats.
    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    // No patterns means no GlobSet at all (`None`), not an empty set.
    #[test]
    fn build_ignore_set_empty() {
        assert!(build_ignore_set(&[]).is_none());
    }

    // Valid glob patterns should match the intended paths and nothing else.
    #[test]
    fn build_ignore_set_valid_patterns() {
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    // End-to-end: two byte-identical TypeScript files on disk must produce
    // at least one clone group and at least one clone family.
    #[test]
    fn find_duplicates_with_real_files() {
        // Create a temp directory with duplicate files
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Lower the thresholds so the small fixture reliably trips detection.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        // Should also have clone families
        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    // A `// fallow-ignore-file code-duplication` header must remove the whole
    // file from analysis; with only one file left, no clones can be reported.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        // With only 2 files and one suppressed, there should be no clones
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}