1pub mod detect;
9pub mod families;
10pub mod normalize;
11pub mod token_types;
12mod token_visitor;
13pub mod tokenize;
14pub(crate) mod types;
15
16use rustc_hash::FxHashMap;
17use std::path::{Path, PathBuf};
18
19use globset::{Glob, GlobSet, GlobSetBuilder};
20use rayon::prelude::*;
21
22use detect::CloneDetector;
23use normalize::normalize_and_hash_resolved;
24use tokenize::{tokenize_file, tokenize_file_cross_language};
25pub use types::{
26 CloneFamily, CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport,
27 DuplicationStats, MirroredDirectory, RefactoringKind, RefactoringSuggestion,
28};
29
30use crate::discover::{self, DiscoveredFile};
31use crate::suppress::{self, IssueKind, Suppression};
32
33pub fn find_duplicates(
42 root: &Path,
43 files: &[DiscoveredFile],
44 config: &DuplicatesConfig,
45) -> DuplicationReport {
46 let _span = tracing::info_span!("find_duplicates").entered();
47
48 let extra_ignores = build_ignore_set(&config.ignore);
50
51 let normalization =
53 fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);
54
55 let strip_types = config.cross_language;
56
57 let file_data: Vec<(
59 PathBuf,
60 Vec<normalize::HashedToken>,
61 tokenize::FileTokens,
62 Vec<Suppression>,
63 )> = files
64 .par_iter()
65 .filter_map(|file| {
66 let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
68 if let Some(ref ignores) = extra_ignores
69 && ignores.is_match(relative)
70 {
71 return None;
72 }
73
74 let source = std::fs::read_to_string(&file.path).ok()?;
76
77 let suppressions = suppress::parse_suppressions_from_source(&source);
79
80 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
82 return None;
83 }
84
85 let file_tokens = if strip_types {
87 tokenize_file_cross_language(&file.path, &source, true)
88 } else {
89 tokenize_file(&file.path, &source)
90 };
91 if file_tokens.tokens.is_empty() {
92 return None;
93 }
94
95 let hashed = normalize_and_hash_resolved(&file_tokens.tokens, normalization);
97 if hashed.len() < config.min_tokens {
98 return None;
99 }
100
101 Some((file.path.clone(), hashed, file_tokens, suppressions))
102 })
103 .collect();
104
105 tracing::info!(
106 files = file_data.len(),
107 "tokenized files for duplication analysis"
108 );
109
110 let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
112 .iter()
113 .filter(|(_, _, _, supps)| !supps.is_empty())
114 .map(|(path, _, _, supps)| (path.clone(), supps.clone()))
115 .collect();
116
117 let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
119 file_data
120 .into_iter()
121 .map(|(path, hashed, tokens, _)| (path, hashed, tokens))
122 .collect();
123
124 let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
126 let mut report = detector.detect(detector_data);
127
128 if !suppressions_by_file.is_empty() {
130 apply_line_suppressions(&mut report, &suppressions_by_file);
131 }
132
133 report.clone_families = families::group_into_families(&report.clone_groups, root);
135
136 report.mirrored_directories =
138 families::detect_mirrored_directories(&report.clone_families, root);
139
140 report.sort();
143
144 report
145}
146
147#[expect(
149 clippy::cast_possible_truncation,
150 reason = "line numbers are bounded by source size"
151)]
152fn apply_line_suppressions(
153 report: &mut DuplicationReport,
154 suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
155) {
156 report.clone_groups.retain_mut(|group| {
157 group.instances.retain(|instance| {
158 if let Some(supps) = suppressions_by_file.get(&instance.file) {
159 for line in instance.start_line..=instance.end_line {
161 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
162 return false;
163 }
164 }
165 }
166 true
167 });
168 group.instances.len() >= 2
170 });
171}
172
173#[must_use]
177pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
178 let resolved = crate::default_config(root);
179 let files = discover::discover_files(&resolved);
180 find_duplicates(root, &files, config)
181}
182
183fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
185 if patterns.is_empty() {
186 return None;
187 }
188
189 let mut builder = GlobSetBuilder::new();
190 for pattern in patterns {
191 match Glob::new(pattern) {
192 Ok(glob) => {
193 builder.add(glob);
194 }
195 Err(e) => {
196 tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
197 }
198 }
199 }
200
201 builder.build().ok()
202}
203
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    // An empty file list must yield an empty, zeroed report.
    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    // No configured patterns means no glob set at all (caller fast-path).
    #[test]
    fn build_ignore_set_empty() {
        assert!(build_ignore_set(&[]).is_none());
    }

    // Valid patterns compile and match (only) the intended paths.
    #[test]
    fn build_ignore_set_valid_patterns() {
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    // End-to-end: two byte-identical TypeScript files on disk must produce at
    // least one clone group and be grouped into a clone family.
    #[test]
    fn find_duplicates_with_real_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return "";
  }
  const parts = trimmed.split(",");
  const filtered = parts.filter(p => p.length > 0);
  const mapped = filtered.map(p => p.toUpperCase());
  return mapped.join(", ");
}

export function validateInput(data: string): boolean {
  if (data === null || data === undefined) {
    return false;
  }
  const cleaned = data.trim();
  if (cleaned.length < 3) {
    return false;
  }
  return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        // package.json marks the temp dir as a project root for detection.
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Low thresholds so the small fixture clears the clone minimums.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    // A `fallow-ignore-file` marker must remove the whole file from analysis,
    // leaving its otherwise-identical twin with no clone partner.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return "";
  }
  const parts = trimmed.split(",");
  const filtered = parts.filter(p => p.length > 0);
  const mapped = filtered.map(p => p.toUpperCase());
  return mapped.join(", ");
}
"#;
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}