1pub mod detect;
9pub mod families;
10pub mod normalize;
11pub mod token_types;
12mod token_visitor;
13pub mod tokenize;
14pub(crate) mod types;
15
16use rustc_hash::FxHashMap;
17use std::path::{Path, PathBuf};
18
19use globset::{Glob, GlobSet, GlobSetBuilder};
20use rayon::prelude::*;
21
22use detect::CloneDetector;
23use normalize::normalize_and_hash_resolved;
24use tokenize::{tokenize_file, tokenize_file_cross_language};
25pub use types::{
26 CloneFamily, CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport,
27 DuplicationStats, MirroredDirectory, RefactoringKind, RefactoringSuggestion,
28};
29
30use crate::discover::{self, DiscoveredFile};
31use crate::suppress::{self, IssueKind, Suppression};
32
33pub fn find_duplicates(
42 root: &Path,
43 files: &[DiscoveredFile],
44 config: &DuplicatesConfig,
45) -> DuplicationReport {
46 let _span = tracing::info_span!("find_duplicates").entered();
47
48 let extra_ignores = build_ignore_set(&config.ignore);
50
51 let normalization =
53 fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
54
55 let strip_types = config.cross_language;
56 let skip_imports = config.ignore_imports;
57
58 tracing::debug!(
59 ignore_imports = skip_imports,
60 "duplication tokenization config"
61 );
62
63 let file_data: Vec<(
65 PathBuf,
66 Vec<normalize::HashedToken>,
67 tokenize::FileTokens,
68 Vec<Suppression>,
69 )> = files
70 .par_iter()
71 .filter_map(|file| {
72 let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
74 if let Some(ref ignores) = extra_ignores
75 && ignores.is_match(relative)
76 {
77 return None;
78 }
79
80 let source = std::fs::read_to_string(&file.path).ok()?;
82
83 let suppressions = suppress::parse_suppressions_from_source(&source);
85
86 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
88 return None;
89 }
90
91 let file_tokens = if strip_types {
93 tokenize_file_cross_language(&file.path, &source, true, skip_imports)
94 } else {
95 tokenize_file(&file.path, &source, skip_imports)
96 };
97 if file_tokens.tokens.is_empty() {
98 return None;
99 }
100
101 let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
103 if hashed.len() < config.min_tokens {
104 return None;
105 }
106
107 Some((file.path.clone(), hashed, file_tokens, suppressions))
108 })
109 .collect();
110
111 tracing::info!(
112 files = file_data.len(),
113 "tokenized files for duplication analysis"
114 );
115
116 let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
118 .iter()
119 .filter(|(_, _, _, supps)| !supps.is_empty())
120 .map(|(path, _, _, supps)| (path.clone(), supps.clone()))
121 .collect();
122
123 let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
125 file_data
126 .into_iter()
127 .map(|(path, hashed, tokens, _)| (path, hashed, tokens))
128 .collect();
129
130 let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
132 let mut report = detector.detect(detector_data);
133
134 if !suppressions_by_file.is_empty() {
136 apply_line_suppressions(&mut report, &suppressions_by_file);
137 }
138
139 report.clone_families = families::group_into_families(&report.clone_groups, root);
141
142 report.mirrored_directories =
144 families::detect_mirrored_directories(&report.clone_families, root);
145
146 report.sort();
149
150 report
151}
152
153#[expect(
155 clippy::cast_possible_truncation,
156 reason = "line numbers are bounded by source size"
157)]
158fn apply_line_suppressions(
159 report: &mut DuplicationReport,
160 suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
161) {
162 report.clone_groups.retain_mut(|group| {
163 group.instances.retain(|instance| {
164 if let Some(supps) = suppressions_by_file.get(&instance.file) {
165 for line in instance.start_line..=instance.end_line {
167 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
168 return false;
169 }
170 }
171 }
172 true
173 });
174 group.instances.len() >= 2
176 });
177}
178
179#[must_use]
183pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
184 let resolved = crate::default_config(root);
185 let files = discover::discover_files(&resolved);
186 find_duplicates(root, &files, config)
187}
188
189fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
191 if patterns.is_empty() {
192 return None;
193 }
194
195 let mut builder = GlobSetBuilder::new();
196 for pattern in patterns {
197 match Glob::new(pattern) {
198 Ok(glob) => {
199 builder.add(glob);
200 }
201 Err(e) => {
202 tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
203 }
204 }
205 }
206
207 builder.build().ok()
208}
209
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    // An empty file list must yield an empty, zeroed report rather than panic.
    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    // No patterns means no matcher at all (None), not an empty matcher.
    #[test]
    fn build_ignore_set_empty() {
        assert!(build_ignore_set(&[]).is_none());
    }

    // Valid globs compile and match only the paths they describe.
    #[test]
    fn build_ignore_set_valid_patterns() {
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    // End-to-end: two byte-identical files on disk must produce clone groups
    // and at least one clone family.
    #[test]
    fn find_duplicates_with_real_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // Fixture is long enough to clear min_tokens/min_lines below.
        let code = r#"
export function processData(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return "";
  }
  const parts = trimmed.split(",");
  const filtered = parts.filter(p => p.length > 0);
  const mapped = filtered.map(p => p.toUpperCase());
  return mapped.join(", ");
}

export function validateInput(data: string): boolean {
  if (data === null || data === undefined) {
    return false;
  }
  const cleaned = data.trim();
  if (cleaned.length < 3) {
    return false;
  }
  return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Lower thresholds so the small fixture still registers as a clone.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    // A `fallow-ignore-file` directive must remove the whole file from the
    // analysis, leaving its twin without a duplicate partner.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return "";
  }
  const parts = trimmed.split(",");
  const filtered = parts.filter(p => p.length > 0);
  const mapped = filtered.map(p => p.toUpperCase());
  return mapped.join(", ");
}
"#;
        // Same content, but prefixed with a file-wide suppression comment.
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}