fallow_core/duplicates/mod.rs

//! Code duplication / clone detection module.
//!
//! This module implements clone detection based on suffix arrays and LCP
//! (longest common prefix) arrays for JavaScript/TypeScript source files.
//! It supports multiple detection modes, from strict (exact matches only)
//! to semantic (structure-aware matching that ignores identifier names and
//! literal values).
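//!
//! A minimal usage sketch (assuming this module is reachable as
//! `fallow_core::duplicates` and that `DuplicationReport` exposes public
//! fields; adjust paths to your crate layout):
//!
//! ```no_run
//! use std::path::Path;
//! use fallow_core::duplicates::{find_duplicates_in_project, DuplicatesConfig};
//!
//! let config = DuplicatesConfig::default();
//! let report = find_duplicates_in_project(Path::new("./my-project"), &config);
//! println!("{} clone groups found", report.clone_groups.len());
//! ```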

pub mod detect;
pub mod families;
pub mod normalize;
pub mod tokenize;
pub(crate) mod types;

use rustc_hash::FxHashMap;
use std::path::{Path, PathBuf};

use globset::{Glob, GlobSet, GlobSetBuilder};
use rayon::prelude::*;

use detect::CloneDetector;
use normalize::normalize_and_hash_resolved;
use tokenize::{tokenize_file, tokenize_file_cross_language};
pub use types::{
    CloneFamily, CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport,
    DuplicationStats, RefactoringKind, RefactoringSuggestion,
};

use crate::discover::{self, DiscoveredFile};
use crate::suppress::{self, IssueKind, Suppression};

/// Run duplication detection on the given files.
///
/// This is the main entry point for the duplication analysis. It:
/// 1. Reads and tokenizes all source files in parallel
/// 2. Normalizes tokens according to the detection mode
/// 3. Runs suffix array + LCP clone detection
/// 4. Collects matching token spans into clone groups
/// 5. Applies inline suppression filters
/// 6. Groups clone instances into families with refactoring suggestions
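///
/// Files carrying a file-wide suppression comment
/// (`// fallow-ignore-file code-duplication`, exercised in the tests below)
/// are skipped entirely.
///
/// # Example
///
/// A sketch mirroring the tests below; the project path and file entry are
/// hypothetical, and `DiscoveredFile`/`FileId` are assumed to be reachable
/// from outside the crate:
///
/// ```no_run
/// use std::path::Path;
/// use fallow_core::discover::{DiscoveredFile, FileId};
/// use fallow_core::duplicates::{find_duplicates, DuplicatesConfig};
///
/// let root = Path::new("./my-project");
/// let files = vec![DiscoveredFile {
///     id: FileId(0),
///     path: root.join("src/app.ts"),
///     size_bytes: 0,
/// }];
/// let report = find_duplicates(root, &files, &DuplicatesConfig::default());
/// ```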
pub fn find_duplicates(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
) -> DuplicationReport {
    let _span = tracing::info_span!("find_duplicates").entered();

    // Build extra ignore patterns for duplication analysis
    let extra_ignores = build_ignore_set(&config.ignore);

    // Resolve normalization: mode defaults + user overrides
    let normalization =
        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);

    let strip_types = config.cross_language;

    // Step 1 & 2: Tokenize and normalize all files in parallel; also parse suppressions
    let file_data: Vec<(
        PathBuf,
        Vec<normalize::HashedToken>,
        tokenize::FileTokens,
        Vec<Suppression>,
    )> = files
        .par_iter()
        .filter_map(|file| {
            // Apply extra ignore patterns
            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
            if let Some(ref ignores) = extra_ignores
                && ignores.is_match(relative)
            {
                return None;
            }

            // Read the file
            let source = std::fs::read_to_string(&file.path).ok()?;

            // Parse inline suppression comments
            let suppressions = suppress::parse_suppressions_from_source(&source);

            // Check for file-wide code-duplication suppression
            if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                return None;
            }

            // Tokenize (with optional type stripping for cross-language detection)
            let file_tokens = if strip_types {
                tokenize_file_cross_language(&file.path, &source, true)
            } else {
                tokenize_file(&file.path, &source)
            };
            if file_tokens.tokens.is_empty() {
                return None;
            }

            // Normalize and hash using resolved normalization flags
            let hashed = normalize_and_hash_resolved(&file_tokens.tokens, &normalization);
            if hashed.len() < config.min_tokens {
                return None;
            }

            Some((file.path.clone(), hashed, file_tokens, suppressions))
        })
        .collect();

    tracing::info!(
        files = file_data.len(),
        "tokenized files for duplication analysis"
    );

    // Collect per-file suppressions for line-level filtering
    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
        .iter()
        .filter(|(_, _, _, supps)| !supps.is_empty())
        .map(|(path, _, _, supps)| (path.clone(), supps.clone()))
        .collect();

    // Strip suppressions from the data passed to the detector
    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
        file_data
            .into_iter()
            .map(|(path, hashed, tokens, _)| (path, hashed, tokens))
            .collect();

    // Step 3 & 4: Detect clones
    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
    let mut report = detector.detect(detector_data);

    // Step 5: Apply line-level suppressions
    if !suppressions_by_file.is_empty() {
        apply_line_suppressions(&mut report, &suppressions_by_file);
    }

    // Step 6: Group into families with refactoring suggestions
    report.clone_families = families::group_into_families(&report.clone_groups);

    report
}

/// Filter out clone instances that are suppressed by line-level comments.
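///
/// An instance is dropped when any line in its `start_line..=end_line` range
/// carries a `code-duplication` suppression; groups left with fewer than two
/// instances are removed entirely.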
fn apply_line_suppressions(
    report: &mut DuplicationReport,
    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
) {
    report.clone_groups.retain_mut(|group| {
        group.instances.retain(|instance| {
            if let Some(supps) = suppressions_by_file.get(&instance.file) {
                // Check if any line in the instance range is suppressed
                for line in instance.start_line..=instance.end_line {
                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
                        return false;
                    }
                }
            }
            true
        });
        // Keep group only if it still has 2+ instances
        group.instances.len() >= 2
    });
}

/// Run duplication detection on a project directory using auto-discovered files.
///
/// This is a convenience function that handles file discovery internally.
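///
/// # Example
///
/// A sketch with thresholds borrowed from the tests below (the project path
/// is hypothetical):
///
/// ```no_run
/// use std::path::Path;
/// use fallow_core::duplicates::{find_duplicates_in_project, DuplicatesConfig};
///
/// let config = DuplicatesConfig {
///     min_tokens: 10,
///     min_lines: 2,
///     ..DuplicatesConfig::default()
/// };
/// let report = find_duplicates_in_project(Path::new("./my-project"), &config);
/// ```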
pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
    let resolved = crate::default_config(root);
    let files = discover::discover_files(&resolved);
    find_duplicates(root, &files, config)
}

/// Build a `GlobSet` from ignore patterns.
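///
/// A sketch of the matching behavior (see `build_ignore_set_valid_patterns`
/// below; marked `ignore` since this helper is private):
///
/// ```ignore
/// let set = build_ignore_set(&["**/*.test.ts".to_string()]).unwrap();
/// assert!(set.is_match("src/foo.test.ts"));
/// assert!(!set.is_match("src/foo.ts"));
/// ```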
fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
    if patterns.is_empty() {
        return None;
    }

    let mut builder = GlobSetBuilder::new();
    for pattern in patterns {
        match Glob::new(pattern) {
            Ok(glob) => {
                builder.add(glob);
            }
            Err(e) => {
                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
            }
        }
    }

    builder.build().ok()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn build_ignore_set_empty() {
        assert!(build_ignore_set(&[]).is_none());
    }

    #[test]
    fn build_ignore_set_valid_patterns() {
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    #[test]
    fn find_duplicates_with_real_files() {
        // Create a temp directory with duplicate files
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}

export function validateInput(data: string): boolean {
    if (data === null || data === undefined) {
        return false;
    }
    const cleaned = data.trim();
    if (cleaned.length < 3) {
        return false;
    }
    return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        // Should also have clone families
        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
    const trimmed = input.trim();
    if (trimmed.length === 0) {
        return "";
    }
    const parts = trimmed.split(",");
    const filtered = parts.filter(p => p.length > 0);
    const mapped = filtered.map(p => p.toUpperCase());
    return mapped.join(", ");
}
"#;
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        // With only 2 files and one suppressed, there should be no clones
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}