1use std::collections::hash_map::DefaultHasher;
2use std::collections::HashMap;
3use std::hash::{Hash, Hasher};
4use std::path::{Path, PathBuf};
5
6use crate::analyzer::{CodeIssue, Severity};
7use crate::treesitter::engine::ParsedFile;
8
9type TokenSetPair = (
12 std::collections::HashSet<FToken>,
13 std::collections::HashSet<(FToken, FToken)>,
14);
15
16#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
18enum FToken {
19 Ident(u16),
20 Int,
21 Float,
22 Str,
23 Bool,
24 Type,
25 Kind(u16),
26}
27
28#[derive(Debug, Clone)]
30struct FuncFingerprint {
31 hash: u64,
32 name: String,
33 file: PathBuf,
34 line_start: usize,
35}
36
37fn fingerprint_function(
39 file: &ParsedFile,
40 node: tree_sitter::Node,
41) -> Option<(FuncFingerprint, Vec<FToken>)> {
42 let name = extract_func_name(file, node)?;
43 let line_start = node.start_position().row + 1;
44 let line_end = node.end_position().row + 1;
45 let line_count = line_end - line_start + 1;
46
47 if line_count < 8 || is_adapter_batch_helper(&name) {
48 return None;
49 }
50
51 let tokens = normalize_node(file, node);
52 if tokens.len() < 20 {
53 return None;
54 }
55
56 let mut hasher = DefaultHasher::new();
57 tokens.hash(&mut hasher);
58 let hash = hasher.finish();
59
60 Some((
61 FuncFingerprint {
62 hash,
63 name,
64 file: file.path.clone(),
65 line_start,
66 },
67 tokens,
68 ))
69}
70
71fn hash_text(text: &str) -> u16 {
72 let mut h = DefaultHasher::new();
73 text.hash(&mut h);
74 (h.finish() & u16::MAX as u64) as u16
75}
76
77fn is_adapter_batch_helper(name: &str) -> bool {
78 (name.starts_with("count_") || name.starts_with("extract_"))
79 && (name.ends_with("_from_batch") || name.ends_with("_from_batch_with"))
80}
81
82fn is_skippable_dup_path(path: &std::path::Path) -> bool {
83 let name = path.to_string_lossy();
84 name.contains("_test.")
85 || name.starts_with("example/")
86 || name.starts_with("examples/")
87 || name.starts_with("demo/")
88 || name.contains("/tests/")
89 || name.contains("/test/")
90 || name.contains("/example/")
91 || name.contains("/examples/")
92 || name.contains("/demo/")
93 || name.contains("/fixtures/")
94 || name.contains("/mocks/")
95 || name.contains("/benches/")
96}
97
98#[allow(clippy::only_used_in_recursion)]
99fn normalize_node(file: &ParsedFile, node: tree_sitter::Node) -> Vec<FToken> {
100 let kind = node.kind();
101 match kind {
102 "identifier"
103 | "field_identifier"
104 | "shorthand_property_identifier"
105 | "variable_name"
106 | "name" => {
107 let start = node.start_byte();
108 let end = node.end_byte();
109 let text = &file.content[start..end];
110 return vec![FToken::Ident(hash_text(text))];
111 }
112 "type_identifier" | "primitive_type" | "mutable_specifier" => return vec![FToken::Type],
113 "integer_literal" | "integer" | "number" => return vec![FToken::Int],
114 "float_literal" | "float" => return vec![FToken::Float],
115 "string_literal" | "string" | "interpreted_string_literal" | "char_literal" => {
116 return vec![FToken::Str];
117 }
118 "boolean" | "true" | "false" => return vec![FToken::Bool],
119 "let" | "const" | "mut" | "fn" | "def" | "function" | "fun" | "return" | "if" | "else"
120 | "for" | "while" | "match" | "use" | "mod" | "struct" | "enum" | "impl" | "trait"
121 | "pub" | "pub(crate)" | "unsafe" | "async" | "await" | "where" | "in" | "as"
122 | "static" | "extern" | "ref" | "type" => return vec![],
123 _ => {}
124 }
125
126 let mut tokens = vec![FToken::Kind(hash_text(kind))];
127 let mut cursor = node.walk();
128 for child in node.children(&mut cursor) {
129 tokens.extend(normalize_node(file, child));
130 }
131 tokens
132}
133
134fn extract_func_name(file: &ParsedFile, node: tree_sitter::Node) -> Option<String> {
136 let mut cursor = node.walk();
137 for child in node.named_children(&mut cursor) {
138 if child.kind() == "identifier" || child.kind() == "name" {
139 let start = child.start_byte();
140 let end = child.end_byte();
141 return Some(file.content[start..end].to_string());
142 }
143 }
144 None
146}
147
148const FN_NODE_KINDS: &[&str] = &[
150 "function_item", "function_definition", "function_declaration", "method_definition", "method_declaration", "method", ];
157
158fn build_token_bigrams(tokens: &[FToken]) -> std::collections::HashSet<(FToken, FToken)> {
159 tokens.windows(2).map(|w| (w[0], w[1])).collect()
160}
161
162fn find_functions_recursive(
164 file: &ParsedFile,
165 node: tree_sitter::Node,
166 fingerprints: &mut Vec<FuncFingerprint>,
167 token_sets: &mut Vec<TokenSetPair>,
168 processed: &mut usize,
169) {
170 let kind = node.kind();
171 if FN_NODE_KINDS.contains(&kind) {
172 if let Some((fp, tokens)) = fingerprint_function(file, node) {
173 let type_set: std::collections::HashSet<FToken> = tokens.iter().copied().collect();
174 let bigram_set = build_token_bigrams(&tokens);
175 fingerprints.push(fp);
176 token_sets.push((type_set, bigram_set));
177 *processed += 1;
178 }
179 }
180 let mut cursor = node.walk();
181 for child in node.children(&mut cursor) {
182 find_functions_recursive(file, child, fingerprints, token_sets, processed);
183 }
184}
185
186pub struct CrossFileDupDetector {
188 fingerprints: Vec<FuncFingerprint>,
189 token_sets: Vec<TokenSetPair>,
190 processed: usize,
191}
192
193impl Default for CrossFileDupDetector {
194 fn default() -> Self {
195 Self::new()
196 }
197}
198
199impl CrossFileDupDetector {
200 pub fn new() -> Self {
201 Self {
202 fingerprints: Vec::new(),
203 token_sets: Vec::new(),
204 processed: 0,
205 }
206 }
207
208 pub fn process_file(&mut self, file: &ParsedFile) {
210 if is_skippable_dup_path(&file.path) {
211 return;
212 }
213 let root = file.root_node();
214 find_functions_recursive(
215 file,
216 root,
217 &mut self.fingerprints,
218 &mut self.token_sets,
219 &mut self.processed,
220 );
221 }
222
223 pub fn find_duplicates(&self) -> Vec<CodeIssue> {
225 let mut issues = Vec::new();
226
227 let mut by_hash: HashMap<u64, Vec<&FuncFingerprint>> = HashMap::new();
229 for fp in &self.fingerprints {
230 by_hash.entry(fp.hash).or_default().push(fp);
231 }
232
233 for group in by_hash.values() {
234 if group.len() < 2 {
235 continue;
236 }
237 let unique_files: std::collections::HashSet<&Path> =
238 group.iter().map(|fp| fp.file.as_path()).collect();
239 let file_count = unique_files.len();
240 if file_count < 2 {
241 continue;
242 }
243
244 let sev = if group.len() > 10 {
245 Severity::Nuclear
246 } else if group.len() > 5 {
247 Severity::Spicy
248 } else {
249 Severity::Mild
250 };
251
252 let representative = group
253 .iter()
254 .min_by_key(|fp| (fp.file.as_path(), fp.line_start))
255 .copied()
256 .unwrap_or(group[0]);
257 issues.push(CodeIssue {
258 file_path: representative.file.clone(),
259 line: representative.line_start,
260 column: 1,
261 rule_name: "cross-file-duplication".to_string(),
262 message: format!(
263 "Function '{}' duplicated in {} files ({} occurrences)",
264 representative.name,
265 file_count,
266 group.len()
267 ),
268 severity: sev.clone(),
269 });
270 }
271 issues
272 }
273
274 pub fn find_near_duplicates(&self) -> Vec<CodeIssue> {
276 if self.fingerprints.len() < 2 {
277 return vec![];
278 }
279
280 let mut by_len: HashMap<usize, Vec<usize>> = HashMap::new();
281 for i in 0..self.fingerprints.len() {
282 let bucket = self.token_sets[i].1.len() / 5;
283 by_len.entry(bucket).or_default().push(i);
284 }
285
286 let mut reported = std::collections::HashSet::new();
287 let mut issues = Vec::new();
288
289 for indices in by_len.values() {
290 for pair in indices.windows(2) {
291 let (a, b) = (pair[0], pair[1]);
292 if self.fingerprints[a].file == self.fingerprints[b].file {
293 continue;
294 }
295 if self.fingerprints[a].hash == self.fingerprints[b].hash {
296 continue;
297 }
298
299 let (types_a, bigrams_a) = &self.token_sets[a];
300 let (types_b, bigrams_b) = &self.token_sets[b];
301
302 let bigram_inter = bigrams_a.intersection(bigrams_b).count();
303 let bigram_union = bigrams_a.union(bigrams_b).count();
304 if bigram_union == 0 {
305 continue;
306 }
307 let similarity = bigram_inter as f64 / bigram_union as f64;
308
309 let type_inter = types_a.intersection(types_b).count();
310 let type_union = types_a.union(types_b).count();
311 let type_ratio = if type_union > 0 {
312 type_inter as f64 / type_union as f64
313 } else {
314 0.0
315 };
316
317 if similarity >= 0.95
318 && bigrams_a.len() >= 20
319 && bigrams_b.len() >= 20
320 && type_ratio >= 0.85
321 {
322 for &idx in &[a, b] {
323 if reported.insert(idx) {
324 issues.push(CodeIssue {
325 file_path: self.fingerprints[idx].file.clone(),
326 line: self.fingerprints[idx].line_start,
327 column: 1,
328 rule_name: "near-duplicate".to_string(),
329 message: format!(
330 "Function '{}' is near-identical to '{}' ({:.0}% similar)",
331 self.fingerprints[a].name,
332 self.fingerprints[b].name,
333 similarity * 100.0,
334 ),
335 severity: Severity::Mild,
336 });
337 }
338 }
339 }
340 }
341 }
342
343 issues
344 }
345
346 pub fn stats(&self) -> (usize, usize) {
347 (self.fingerprints.len(), self.processed)
348 }
349
350 pub fn infection_spread(&self) -> HashMap<String, Vec<(String, usize, Vec<String>)>> {
353 let mut by_hash: HashMap<u64, Vec<&FuncFingerprint>> = HashMap::new();
354 for fp in &self.fingerprints {
355 by_hash.entry(fp.hash).or_default().push(fp);
356 }
357
358 let mut spread: HashMap<String, HashMap<String, (usize, Vec<String>)>> = HashMap::new();
360 for group in by_hash.values() {
361 let unique_files: std::collections::HashSet<&PathBuf> =
362 group.iter().map(|fp| &fp.file).collect();
363 if unique_files.len() < 2 {
364 continue;
365 }
366 let func_name = &group[0].name;
367
368 for a in &unique_files {
369 let a_str = a.to_string_lossy().to_string();
370 for b in &unique_files {
371 if a == b {
372 continue;
373 }
374 let b_str = b.to_string_lossy().to_string();
375 let entry = spread.entry(a_str.clone()).or_default();
376 let inner = entry.entry(b_str).or_insert((0, Vec::new()));
377 inner.0 += 1;
378 inner.1.push(func_name.clone());
379 }
380 }
381 }
382
383 spread
385 .into_iter()
386 .map(|(file, targets)| {
387 let mut sorted: Vec<_> = targets.into_iter().collect();
388 sorted.sort_by_key(|(_, (count, _))| std::cmp::Reverse(*count));
389 let result: Vec<_> = sorted
390 .into_iter()
391 .map(|(target, (count, funcs))| (target, count, funcs))
392 .collect();
393 (file, result)
394 })
395 .collect()
396 }
397}
398
399pub struct IntraFileDupDetector;
408
409impl IntraFileDupDetector {
410 fn is_skippable(path: &std::path::Path) -> bool {
412 is_skippable_dup_path(path)
413 }
414
415 fn is_substantive_line(line: &str) -> bool {
416 let trimmed = line.trim();
417 !trimmed.is_empty()
418 && !trimmed.starts_with("//")
419 && !trimmed.starts_with('#')
420 && !matches!(trimmed, "{" | "}" | ");" | "," | "else" | "} else {")
421 && !trimmed.starts_with("}")
422 && !trimmed.ends_with("{")
423 && !trimmed.ends_with("=>")
424 }
425
426 pub fn check(file: &ParsedFile) -> Vec<CodeIssue> {
427 if Self::is_skippable(&file.path) {
428 return vec![];
429 }
430 let lines: Vec<&str> = file.content.lines().collect();
431 if lines.len() < 10 {
432 return vec![];
433 }
434
435 let chunk_size = 5usize;
436 let mut seen: HashMap<u64, Vec<usize>> = HashMap::new();
437
438 for i in 0..lines.len().saturating_sub(chunk_size - 1) {
439 let chunk: Vec<&str> = lines[i..i + chunk_size]
440 .iter()
441 .map(|l| l.trim())
442 .filter(|l| Self::is_substantive_line(l))
443 .collect();
444 if chunk.len() < 4 {
445 continue;
446 }
447
448 let mut hasher = DefaultHasher::new();
449 for line in &chunk {
450 line.hash(&mut hasher);
451 }
452 let hash = hasher.finish();
453 seen.entry(hash).or_default().push(i);
454 }
455
456 let mut groups: Vec<(usize, usize)> = seen
457 .values()
458 .filter_map(|positions| {
459 if positions.len() < 2 {
460 return None;
461 }
462 let is_spaced = positions.windows(2).any(|w| w[1] - w[0] > chunk_size);
463 if is_spaced {
464 Some((positions[0], positions.len()))
465 } else {
466 None
467 }
468 })
469 .collect();
470 groups.sort_by_key(|(start, _)| *start);
471
472 let mut issues = Vec::new();
473 let mut last_reported_end = 0usize;
474 for (start, occurrences) in groups {
475 if start < last_reported_end {
476 continue;
477 }
478 last_reported_end = start + chunk_size;
479
480 issues.push(CodeIssue {
481 file_path: file.path.clone(),
482 line: start + 1,
483 column: 1,
484 rule_name: "code-duplication".to_string(),
485 message: format!(
486 "Duplicate code block ({} lines) found {} times starting at line {}",
487 chunk_size,
488 occurrences,
489 start + 1
490 ),
491 severity: Severity::Mild,
492 });
493 }
494 issues
495 }
496}
497
498#[cfg(test)]
499mod tests {
500 use super::*;
501 use crate::treesitter::engine::TreeSitterEngine;
502 use std::path::Path;
503 use std::sync::OnceLock;
504
505 fn shared_engine() -> &'static TreeSitterEngine {
506 static ENGINE: OnceLock<TreeSitterEngine> = OnceLock::new();
507 ENGINE.get_or_init(|| {
508 let engine = TreeSitterEngine::new();
509 engine.ensure_parser(crate::language::Language::Rust);
510 engine.ensure_parser(crate::language::Language::Python);
511 engine
512 })
513 }
514
515 fn parse_rust(code: &str) -> ParsedFile {
516 shared_engine()
517 .parse_file(Path::new("main.rs"), code)
518 .expect("Rust parse should succeed")
519 }
520
521 fn parse_rust_as(filename: &str, code: &str) -> ParsedFile {
522 shared_engine()
523 .parse_file(Path::new(filename), code)
524 .expect("Rust parse should succeed")
525 }
526
527 fn parse_python_as(filename: &str, code: &str) -> ParsedFile {
528 shared_engine()
529 .parse_file(Path::new(filename), code)
530 .expect("Python parse should succeed")
531 }
532
533 #[test]
534 fn test_find_function_nodes() {
535 let file =
536 parse_rust("fn compute(a: i32, b: i32) -> i32 {\n let r = a + b;\n r\n}\n");
537 let root = file.root_node();
538 assert_eq!(root.kind(), "source_file");
539 let mut c2 = root.walk();
540 let found_fn = c2.goto_first_child() && c2.node().kind() == "function_item";
541 assert!(found_fn, "Should find function_item node");
542 }
543
544 #[test]
545 fn test_cross_file_dup_detects_identical_functions() {
546 let mut detector = CrossFileDupDetector::new();
547 let code = r#"
548fn compute(a: i32, b: i32) -> i32 {
549 let base = a + b;
550 let doubled = base * 2;
551 let shifted = doubled + 10;
552 let checked = shifted - a;
553 let final_value = checked + b;
554 final_value
555}
556"#;
557 let a = parse_rust_as("file_a.rs", code);
558 let b = parse_rust_as("file_b.rs", code);
559 detector.process_file(&a);
560 detector.process_file(&b);
561 let issues = detector.find_duplicates();
562 assert!(!issues.is_empty(), "Should detect cross-file duplicate");
563 }
564
565 #[test]
566 fn test_cross_file_duplicate_group_reports_once() {
567 let mut detector = CrossFileDupDetector::new();
568 let code = r#"
569fn compute(a: i32, b: i32) -> i32 {
570 let base = a + b;
571 let doubled = base * 2;
572 let shifted = doubled + 10;
573 let checked = shifted - a;
574 let final_value = checked + b;
575 final_value
576}
577"#;
578 let a = parse_rust_as("file_a.rs", code);
579 let b = parse_rust_as("file_b.rs", code);
580 let c = parse_rust_as("file_c.rs", code);
581 detector.process_file(&a);
582 detector.process_file(&b);
583 detector.process_file(&c);
584 let issues = detector.find_duplicates();
585 assert_eq!(issues.len(), 1, "Duplicate group should report once");
586 assert!(
587 issues[0].message.contains("3 occurrences"),
588 "message should preserve occurrence count"
589 );
590 }
591
592 #[test]
593 fn test_cross_file_skip_trivial_functions() {
594 let mut detector = CrossFileDupDetector::new();
595 let f = parse_rust("fn noop() {}");
596 detector.process_file(&f);
597 let (count, _) = detector.stats();
598 assert_eq!(count, 0, "Trivial functions should be skipped");
599 }
600
601 #[test]
602 fn test_cross_file_python() {
603 let mut detector = CrossFileDupDetector::new();
604 let code = r#"
605def add(a, b):
606 result = a + b
607 doubled = result * 2
608 shifted = doubled + 10
609 checked = shifted - a
610 final_value = checked + b
611 print(final_value)
612 return final_value
613"#;
614 let a = parse_python_as("mod_a.py", code);
615 let b = parse_python_as("mod_b.py", code);
616 detector.process_file(&a);
617 detector.process_file(&b);
618 let issues = detector.find_duplicates();
619 assert!(!issues.is_empty(), "Should detect Python cross-file dup");
620 }
621
622 #[test]
623 fn test_cross_file_skips_adapter_batch_helpers() {
624 let mut detector = CrossFileDupDetector::new();
625 let code = r#"
626fn count_debug_from_batch<'a>(batch: &[Vec<QueryCapture<'a>>]) -> usize {
627 batch
628 .iter()
629 .filter(|m| m.iter().any(|c| c.name == "dp_method"))
630 .count()
631}
632"#;
633 let a = parse_rust_as("adapter_a.rs", code);
634 let b = parse_rust_as("adapter_b.rs", code);
635 detector.process_file(&a);
636 detector.process_file(&b);
637 let issues = detector.find_duplicates();
638 assert!(issues.is_empty(), "Adapter batch helpers should be skipped");
639 }
640
641 #[test]
642 fn test_cross_file_skips_demo_paths() {
643 let mut detector = CrossFileDupDetector::new();
644 let code = r#"
645fn compute(a: i32, b: i32) -> i32 {
646 let base = a + b;
647 let doubled = base * 2;
648 let shifted = doubled + 10;
649 let checked = shifted - a;
650 let final_value = checked + b;
651 final_value
652}
653"#;
654 let a = parse_rust_as("demo/file_a.rs", code);
655 let b = parse_rust_as("demo/file_b.rs", code);
656 detector.process_file(&a);
657 detector.process_file(&b);
658 let issues = detector.find_duplicates();
659 assert!(issues.is_empty(), "Demo paths should be skipped");
660 }
661
662 #[test]
663 fn test_intra_file_dup_detects_repeated_block() {
664 let file = parse_rust(
665 r#"
666fn main() {
667 let a = 1;
668 let b = 2;
669 let c = a + b;
670 let d = c * 2;
671 println!("block_end");
672 let x = 0;
673 let a = 1;
674 let b = 2;
675 let c = a + b;
676 let d = c * 2;
677 println!("block_end");
678}
679"#,
680 );
681 let issues = IntraFileDupDetector::check(&file);
682 assert!(!issues.is_empty(), "Should detect repeated code block");
683 assert!(issues.iter().any(|i| i.rule_name == "code-duplication"));
684 }
685
686 #[test]
687 fn test_intra_file_dup_merges_overlapping_chunks() {
688 let file = parse_rust(
689 r#"
690fn main() {
691 let a = 1;
692 let b = 2;
693 let c = 3;
694 let d = 4;
695 let e = 5;
696 let f = 6;
697 let marker = 0;
698 let a = 1;
699 let b = 2;
700 let c = 3;
701 let d = 4;
702 let e = 5;
703 let f = 6;
704}
705"#,
706 );
707 let issues = IntraFileDupDetector::check(&file);
708 assert_eq!(issues.len(), 1, "Overlapping duplicate chunks should merge");
709 }
710
711 #[test]
712 fn test_intra_file_dup_skips_example_paths() {
713 let file = parse_rust_as(
714 "example/demo.rs",
715 r#"
716fn main() {
717 let a = 1;
718 let b = 2;
719 let c = a + b;
720 let d = c * 2;
721 println!("block_end");
722 let x = 0;
723 let a = 1;
724 let b = 2;
725 let c = a + b;
726 let d = c * 2;
727 println!("block_end");
728}
729"#,
730 );
731 let issues = IntraFileDupDetector::check(&file);
732 assert!(issues.is_empty(), "Example paths should be skipped");
733 }
734
735 #[test]
736 fn test_intra_file_dup_ignores_structural_lines() {
737 let file = parse_rust(
738 r#"
739fn first(a: bool, b: bool, c: bool, d: bool) {
740 if a {
741 if b {
742 if c {
743 if d {
744 println!("one");
745 }
746 }
747 }
748 }
749}
750
751fn second(a: bool, b: bool, c: bool, d: bool) {
752 if a {
753 if b {
754 if c {
755 if d {
756 println!("two");
757 }
758 }
759 }
760 }
761}
762"#,
763 );
764 let issues = IntraFileDupDetector::check(&file);
765 assert!(
766 issues.is_empty(),
767 "Structural lines should not trigger duplication"
768 );
769 }
770
771 #[test]
772 fn test_intra_file_clean_code_no_false_positive() {
773 let file = parse_rust(
774 r#"
775fn add(a: i32, b: i32) -> i32 { a + b }
776fn sub(a: i32, b: i32) -> i32 { a - b }
777fn mul(a: i32, b: i32) -> i32 { a * b }
778"#,
779 );
780 let issues = IntraFileDupDetector::check(&file);
781 assert!(issues.is_empty(), "Different functions should not trigger");
782 }
783}