1pub mod detect;
9pub mod families;
10pub mod normalize;
11pub mod tokenize;
12pub(crate) mod types;
13
14use std::collections::HashMap;
15use std::path::{Path, PathBuf};
16
17use globset::{Glob, GlobSet, GlobSetBuilder};
18use rayon::prelude::*;
19
20use detect::CloneDetector;
21use normalize::normalize_and_hash;
22use tokenize::tokenize_file;
23pub use types::{
24 CloneFamily, CloneGroup, CloneInstance, DetectionMode, DuplicatesConfig, DuplicationReport,
25 DuplicationStats, RefactoringKind, RefactoringSuggestion,
26};
27
28use crate::discover::{self, DiscoveredFile};
29use crate::suppress::{self, IssueKind, Suppression};
30
31pub fn find_duplicates(
40 root: &Path,
41 files: &[DiscoveredFile],
42 config: &DuplicatesConfig,
43) -> DuplicationReport {
44 let _span = tracing::info_span!("find_duplicates").entered();
45
46 let extra_ignores = build_ignore_set(&config.ignore);
48
49 let file_data: Vec<(
51 PathBuf,
52 Vec<normalize::HashedToken>,
53 tokenize::FileTokens,
54 Vec<Suppression>,
55 )> = files
56 .par_iter()
57 .filter_map(|file| {
58 let relative = file.path.strip_prefix(root).unwrap_or(&file.path);
60 if let Some(ref ignores) = extra_ignores
61 && ignores.is_match(relative)
62 {
63 return None;
64 }
65
66 let source = std::fs::read_to_string(&file.path).ok()?;
68
69 let suppressions = suppress::parse_suppressions_from_source(&source);
71
72 if suppress::is_file_suppressed(&suppressions, IssueKind::CodeDuplication) {
74 return None;
75 }
76
77 let file_tokens = tokenize_file(&file.path, &source);
79 if file_tokens.tokens.is_empty() {
80 return None;
81 }
82
83 let hashed = normalize_and_hash(&file_tokens.tokens, config.mode);
85 if hashed.len() < config.min_tokens {
86 return None;
87 }
88
89 Some((file.path.clone(), hashed, file_tokens, suppressions))
90 })
91 .collect();
92
93 tracing::info!(
94 files = file_data.len(),
95 "tokenized files for duplication analysis"
96 );
97
98 let suppressions_by_file: HashMap<PathBuf, Vec<Suppression>> = file_data
100 .iter()
101 .filter(|(_, _, _, supps)| !supps.is_empty())
102 .map(|(path, _, _, supps)| (path.clone(), supps.clone()))
103 .collect();
104
105 let detector_data: Vec<(PathBuf, Vec<normalize::HashedToken>, tokenize::FileTokens)> =
107 file_data
108 .into_iter()
109 .map(|(path, hashed, tokens, _)| (path, hashed, tokens))
110 .collect();
111
112 let detector = CloneDetector::new(config.min_tokens, config.min_lines, config.skip_local);
114 let mut report = detector.detect(detector_data);
115
116 if !suppressions_by_file.is_empty() {
118 apply_line_suppressions(&mut report, &suppressions_by_file);
119 }
120
121 report.clone_families = families::group_into_families(&report.clone_groups);
123
124 report
125}
126
127fn apply_line_suppressions(
129 report: &mut DuplicationReport,
130 suppressions_by_file: &HashMap<PathBuf, Vec<Suppression>>,
131) {
132 report.clone_groups.retain_mut(|group| {
133 group.instances.retain(|instance| {
134 if let Some(supps) = suppressions_by_file.get(&instance.file) {
135 for line in instance.start_line..=instance.end_line {
137 if suppress::is_suppressed(supps, line as u32, IssueKind::CodeDuplication) {
138 return false;
139 }
140 }
141 }
142 true
143 });
144 group.instances.len() >= 2
146 });
147}
148
149pub fn find_duplicates_in_project(root: &Path, config: &DuplicatesConfig) -> DuplicationReport {
153 let resolved = crate::default_config(root);
154 let files = discover::discover_files(&resolved);
155 find_duplicates(root, &files, config)
156}
157
158fn build_ignore_set(patterns: &[String]) -> Option<GlobSet> {
160 if patterns.is_empty() {
161 return None;
162 }
163
164 let mut builder = GlobSetBuilder::new();
165 for pattern in patterns {
166 match Glob::new(pattern) {
167 Ok(glob) => {
168 builder.add(glob);
169 }
170 Err(e) => {
171 tracing::warn!("Invalid duplication ignore pattern '{pattern}': {e}");
172 }
173 }
174 }
175
176 builder.build().ok()
177}
178
#[cfg(test)]
mod tests {
    use super::*;
    use crate::discover::FileId;

    // With no input files the report must be completely empty.
    #[test]
    fn find_duplicates_empty_files() {
        let config = DuplicatesConfig::default();
        let report = find_duplicates(Path::new("/tmp"), &[], &config);
        assert!(report.clone_groups.is_empty());
        assert!(report.clone_families.is_empty());
        assert_eq!(report.stats.total_files, 0);
    }

    // An empty pattern list yields no glob set at all (None, not an empty set).
    #[test]
    fn build_ignore_set_empty() {
        assert!(build_ignore_set(&[]).is_none());
    }

    // Valid patterns compile into a set that matches only the intended files.
    #[test]
    fn build_ignore_set_valid_patterns() {
        let set = build_ignore_set(&["**/*.test.ts".to_string(), "**/*.spec.ts".to_string()]);
        assert!(set.is_some());
        let set = set.unwrap();
        assert!(set.is_match("src/foo.test.ts"));
        assert!(set.is_match("src/bar.spec.ts"));
        assert!(!set.is_match("src/baz.ts"));
    }

    // End-to-end: two byte-identical TypeScript files on disk should produce
    // clone groups, per-file stats, and at least one clone family.
    #[test]
    fn find_duplicates_with_real_files() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        // Fixture must be long enough to clear min_tokens/min_lines below;
        // both files get the exact same content so a clone is guaranteed.
        let code = r#"
export function processData(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return "";
  }
  const parts = trimmed.split(",");
  const filtered = parts.filter(p => p.length > 0);
  const mapped = filtered.map(p => p.toUpperCase());
  return mapped.join(", ");
}

export function validateInput(data: string): boolean {
  if (data === null || data === undefined) {
    return false;
  }
  const cleaned = data.trim();
  if (cleaned.length < 3) {
    return false;
  }
  return true;
}
"#;

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("copy.ts"), code).expect("write copy");
        // package.json marks the temp dir as a project root for discovery.
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("copy.ts"),
                size_bytes: code.len() as u64,
            },
        ];

        // Thresholds lowered so the small fixture still registers as a clone.
        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            !report.clone_groups.is_empty(),
            "Should detect clones in identical files"
        );
        assert!(report.stats.files_with_clones >= 2);

        assert!(
            !report.clone_families.is_empty(),
            "Should group clones into families"
        );
    }

    // A file-wide `fallow-ignore-file code-duplication` directive must remove
    // the whole file from analysis, leaving no clone pair to report.
    #[test]
    fn file_wide_suppression_excludes_file() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let src_dir = dir.path().join("src");
        std::fs::create_dir_all(&src_dir).expect("create src dir");

        let code = r#"
export function processData(input: string): string {
  const trimmed = input.trim();
  if (trimmed.length === 0) {
    return "";
  }
  const parts = trimmed.split(",");
  const filtered = parts.filter(p => p.length > 0);
  const mapped = filtered.map(p => p.toUpperCase());
  return mapped.join(", ");
}
"#;
        // Same content as the original, but with the suppression directive on
        // the first line so the duplicate copy is excluded.
        let suppressed_code = format!("// fallow-ignore-file code-duplication\n{code}");

        std::fs::write(src_dir.join("original.ts"), code).expect("write original");
        std::fs::write(src_dir.join("suppressed.ts"), &suppressed_code).expect("write suppressed");
        std::fs::write(dir.path().join("package.json"), r#"{"name": "test"}"#)
            .expect("write package.json");

        let files = vec![
            DiscoveredFile {
                id: FileId(0),
                path: src_dir.join("original.ts"),
                size_bytes: code.len() as u64,
            },
            DiscoveredFile {
                id: FileId(1),
                path: src_dir.join("suppressed.ts"),
                size_bytes: suppressed_code.len() as u64,
            },
        ];

        let config = DuplicatesConfig {
            min_tokens: 10,
            min_lines: 2,
            ..DuplicatesConfig::default()
        };

        let report = find_duplicates(dir.path(), &files, &config);
        assert!(
            report.clone_groups.is_empty(),
            "File-wide suppression should exclude file from duplication analysis"
        );
    }
}
328}