pub mod detect;
pub mod families;
pub mod normalize;
pub mod tokenize;
pub(crate) mod types;

use rustc_hash::FxHashMap;
use std::path::{Path, PathBuf};

use globset::{Glob, GlobSet, GlobSetBuilder};
use rayon::prelude::*;

use detect::CloneDetector;
use normalize::normalize_and_hash_resolved;
use tokenize::{tokenize_file, tokenize_file_cross_language};
pub use types::{
    CloneFamily, CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport,
    DuplicationStats, RefactoringKind, RefactoringSuggestion,
};

use crate::discover::{self, DiscoveredFile};
use crate::suppress::{self, IssueKind, Suppression};

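/// Runs clone detection over the given set of discovered files.
///
/// Files matching the `config.ignore` globs or carrying a file-wide
/// `code-duplication` suppression are skipped. The remaining files are
/// tokenized and normalized in parallel, fed to [`detect::CloneDetector`],
/// line-level suppressions are applied to the resulting groups, and the
/// surviving clone groups are grouped into clone families.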
pub fn find_duplicates(
    root: &Path,
    files: &[DiscoveredFile],
    config: &DuplicatesConfig,
) -> DuplicationReport {
    let _span = tracing::info_span!("find_duplicates").entered();

    let extra_ignores = build_ignore_set(&config.ignore);

    let normalization =
        fallow_config::ResolvedNormalization::resolve(config.mode, &config.normalization);

    let strip_types = config.cross_language;

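    // Tokenize, normalize, and hash each discovered file in parallel, skipping
    // ignored files, file-wide suppressions, and files that are empty or fall
    // below the minimum token count.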
    let file_data: Vec<(
        PathBuf,
        Vec<normalize::HashedToken>,
        tokenize::FileTokens,
        Vec<Suppression>,
    )> = files
        .par_iter()
        .filter_map(|file| {
            let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
            if let Some(ref ignores) = extra_ignores
                && ignores.is_match(relative)
            {
                return None;
            }

            let source = std::fs::read_to_string(&file.path).ok()?;

            let suppressions = suppress::parse_suppressions_from_source(&source);

            if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
                return None;
            }

            let file_tokens = if strip_types {
                tokenize_file_cross_language(&file.path, &source, true)
            } else {
                tokenize_file(&file.path, &source)
            };
            if file_tokens.tokens.is_empty() {
                return None;
            }

            let hashed = normalize_and_hash_resolved(&file_tokens.tokens, &normalization);
            if hashed.len() < config.min_tokens {
                return None;
            }

            Some((file.path.clone(), hashed, file_tokens, suppressions))
        })
        .collect();

    tracing::info!(
        files = file_data.len(),
        "tokenized files for duplication analysis"
    );

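    // Remember which files carry suppression comments so line-level
    // suppressions can be applied to the clone groups after detection.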
    let suppressions_by_file: FxHashMap<PathBuf, Vec<Suppression>> = file_data
        .iter()
        .filter(|(_, _, _, supps)| !supps.is_empty())
        .map(|(path, _, _, supps)| (path.clone(), supps.clone()))
        .collect();

    let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
        file_data
            .into_iter()
            .map(|(path, hashed, tokens, _)| (path, hashed, tokens))
            .collect();

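    // Detect clone groups over the hashed token streams.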
    let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
    let mut report = detector.detect(detector_data);

    if !suppressions_by_file.is_empty() {
        apply_line_suppressions(&mut report, &suppressions_by_file);
    }

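    // Group related clone groups into clone families.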
    report.clone_families = families::group_into_families(&report.clone_groups);

    report
}

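/// Removes clone instances whose line range overlaps a `code-duplication`
/// suppression and drops any group left with fewer than two instances.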
fn apply_line_suppressions(
    report: &mut DuplicationReport,
    suppressions_by_file: &FxHashMap<PathBuf, Vec<Suppression>>,
) {
    report.clone_groups.retain_mut(|group| {
        group.instances.retain(|instance| {
            if let Some(supps) = suppressions_by_file.get(&instance.file) {
                for line in instance.start_line..=instance.end_line {
                    if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
                        return false;
                    }
                }
            }
            true
        });
        group.instances.len() >= 2
    });
}

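/// Convenience wrapper that discovers files under `root` with the crate's
/// default configuration and then runs [`find_duplicates`] over them.
///
/// A minimal usage sketch (module paths assumed, so the example is not
/// compiled as a doc-test):
///
/// ```ignore
/// use std::path::Path;
///
/// let config = DuplicatesConfig::default();
/// let report = find_duplicates_in_project(Path::new("."), &config);
/// println!("clone groups found: {}", report.clone_groups.len());
/// ```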
pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
    let resolved = crate::default_config(root);
    let files = discover::discover_files(&resolved);
    find_duplicates(root, &files, config)
}

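/// Builds a [`GlobSet`] from the configured ignore patterns, warning about and
/// skipping any pattern that fails to parse. Returns `None` when no patterns
/// are configured.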
fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
    if patterns.is_empty() {
        return None;
    }

    let mut builder = GlobSetBuilder::new();
    for pattern in patterns {
        match Glob::new(pattern) {
            Ok(glob) => {
                builder.add(glob);
            }
            Err(e) => {
                tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
            }
        }
    }

    builder.build().ok()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    #[test]
    fn build_ignore_set_empty() {
        assert!(build_ignore_set(&[]).is_none());
    }

    #[test]
    fn build_ignore_set_valid_patterns() {
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    #[test]
    fn find_duplicates_with_real_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return "";
  }
  const parts = trimmed.split(",");
  const filtered = parts.filter(p => p.length > 0);
  const mapped = filtered.map(p => p.toUpperCase());
  return mapped.join(", ");
}

export function validateInput(data: string): boolean {
  if (data === null || data === undefined) {
    return false;
  }
  const cleaned = data.trim();
  if (cleaned.length < 3) {
    return false;
  }
  return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return "";
  }
  const parts = trimmed.split(",");
  const filtered = parts.filter(p => p.length > 0);
  const mapped = filtered.map(p => p.toUpperCase());
  return mapped.join(", ");
}
"#;
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}