garbage_code_hunter/treesitter/
duplication.rs1use std::collections::hash_map::DefaultHasher;
2use std::collections::HashMap;
3use std::hash::{Hash, Hasher};
4use std::path::{Path, PathBuf};
5
6use crate::analyzer::{CodeIssue, Severity};
7use crate::treesitter::engine::ParsedFile;
8
9#[derive(Debug, Clone, PartialEq, Eq, Hash)]
13#[allow(dead_code)]
14enum FToken {
15 Ident,
16 Int,
17 Float,
18 Str,
19 Bool,
20 Type,
21}
22
23#[derive(Debug, Clone)]
25struct FuncFingerprint {
26 hash: u64,
27 tokens: Vec<FToken>,
28 name: String,
29 file: PathBuf,
30 line_start: usize,
31}
32
33fn fingerprint_function(file: &ParsedFile, node: tree_sitter::Node) -> Option<FuncFingerprint> {
35 let name = extract_func_name(file, node)?;
36 let line_start = node.start_position().row + 1;
37 let line_end = node.end_position().row + 1;
38 let line_count = line_end - line_start + 1;
39
40 if line_count < 5 {
42 return None;
43 }
44
45 let tokens = normalize_node(file, node);
46 if tokens.len() < 8 {
47 return None;
48 }
49
50 let mut hasher = DefaultHasher::new();
51 tokens.hash(&mut hasher);
52 let hash = hasher.finish();
53
54 Some(FuncFingerprint {
55 hash,
56 tokens,
57 name,
58 file: file.path.clone(),
59 line_start,
60 })
61}
62
63#[allow(clippy::only_used_in_recursion)]
67fn normalize_node(file: &ParsedFile, node: tree_sitter::Node) -> Vec<FToken> {
68 let kind = node.kind();
69 match kind {
70 "identifier"
71 | "field_identifier"
72 | "shorthand_property_identifier"
73 | "variable_name"
74 | "name" => return vec![FToken::Ident],
75 "type_identifier" | "primitive_type" | "mutable_specifier" => return vec![FToken::Type],
76 "integer_literal" | "integer" | "number" => return vec![FToken::Int],
77 "float_literal" | "float" => return vec![FToken::Float],
78 "string_literal" | "string" | "interpreted_string_literal" | "char_literal" => {
79 return vec![FToken::Str];
80 }
81 "boolean" | "true" | "false" => return vec![FToken::Bool],
82 "let" | "const" | "mut" | "fn" | "def" | "function" | "fun" | "return" | "if" | "else"
84 | "for" | "while" | "match" | "use" | "mod" | "struct" | "enum" | "impl" | "trait"
85 | "pub" | "pub(crate)" | "unsafe" | "async" | "await" | "where" | "in" | "as"
86 | "static" | "extern" | "ref" | "type" => return vec![],
87 _ => {}
88 }
89
90 let mut tokens = Vec::new();
91 let mut cursor = node.walk();
92 for child in node.named_children(&mut cursor) {
93 tokens.extend(normalize_node(file, child));
94 }
95 tokens
96}
97
98fn extract_func_name(file: &ParsedFile, node: tree_sitter::Node) -> Option<String> {
100 let mut cursor = node.walk();
101 for child in node.named_children(&mut cursor) {
102 if child.kind() == "identifier" || child.kind() == "name" {
103 let start = child.start_byte();
104 let end = child.end_byte();
105 return Some(file.content[start..end].to_string());
106 }
107 }
108 None
110}
111
112const FN_NODE_KINDS: &[&str] = &[
114 "function_item", "function_definition", "function_declaration", "method_definition", "method_declaration", "method", ];
121
122fn find_functions_recursive(
124 file: &ParsedFile,
125 node: tree_sitter::Node,
126 fingerprints: &mut Vec<FuncFingerprint>,
127 processed: &mut usize,
128) {
129 let kind = node.kind();
130 if FN_NODE_KINDS.contains(&kind) {
131 if let Some(fp) = fingerprint_function(file, node) {
132 fingerprints.push(fp);
133 *processed += 1;
134 }
135 }
136 let mut cursor = node.walk();
137 for child in node.children(&mut cursor) {
138 find_functions_recursive(file, child, fingerprints, processed);
139 }
140}
141
142pub struct CrossFileDupDetector {
144 fingerprints: Vec<FuncFingerprint>,
145 processed: usize,
146}
147
148impl Default for CrossFileDupDetector {
149 fn default() -> Self {
150 Self::new()
151 }
152}
153
154impl CrossFileDupDetector {
155 pub fn new() -> Self {
156 Self {
157 fingerprints: Vec::new(),
158 processed: 0,
159 }
160 }
161
162 pub fn process_file(&mut self, file: &ParsedFile) {
164 let root = file.root_node();
165 find_functions_recursive(file, root, &mut self.fingerprints, &mut self.processed);
166 }
167
168 pub fn find_duplicates(&self) -> Vec<CodeIssue> {
170 let mut issues = Vec::new();
171
172 let mut by_hash: HashMap<u64, Vec<&FuncFingerprint>> = HashMap::new();
174 for fp in &self.fingerprints {
175 by_hash.entry(fp.hash).or_default().push(fp);
176 }
177
178 for group in by_hash.values() {
179 if group.len() < 2 {
180 continue;
181 }
182 let unique_files: std::collections::HashSet<&Path> =
183 group.iter().map(|fp| fp.file.as_path()).collect();
184 let file_count = unique_files.len();
185 if file_count < 2 {
186 continue;
187 }
188
189 let sev = if group.len() > 10 {
190 Severity::Nuclear
191 } else if group.len() > 5 {
192 Severity::Spicy
193 } else {
194 Severity::Mild
195 };
196
197 for fp in group {
198 issues.push(CodeIssue {
199 file_path: fp.file.clone(),
200 line: fp.line_start,
201 column: 1,
202 rule_name: "cross-file-duplication".to_string(),
203 message: format!(
204 "Function '{}' duplicated in {} files ({} occurrences)",
205 fp.name,
206 file_count,
207 group.len()
208 ),
209 severity: sev.clone(),
210 });
211 }
212 }
213 issues
214 }
215
216 pub fn find_near_duplicates(&self) -> Vec<CodeIssue> {
218 let mut issues = Vec::new();
219 let fps: Vec<&FuncFingerprint> = self.fingerprints.iter().collect();
220 let mut seen: std::collections::HashSet<(usize, usize)> = std::collections::HashSet::new();
221
222 for i in 0..fps.len() {
223 for j in (i + 1)..fps.len() {
224 if seen.contains(&(i, j)) || seen.contains(&(j, i)) {
225 continue;
226 }
227 seen.insert((i, j));
228
229 let a = &fps[i].tokens;
230 let b = &fps[j].tokens;
231 let sim = jaccard_similarity(a, b);
232
233 if sim >= 0.80 && fps[i].file != fps[j].file {
234 issues.push(CodeIssue {
235 file_path: fps[i].file.clone(),
236 line: fps[i].line_start,
237 column: 1,
238 rule_name: "cross-file-near-duplicate".to_string(),
239 message: format!(
240 "Function '{}' is {:.0}% similar to '{}' in {} — consider refactoring",
241 fps[i].name,
242 sim * 100.0,
243 fps[j].name,
244 fps[j].file.display(),
245 ),
246 severity: Severity::Mild,
247 });
248 }
249 }
250 }
251 issues
252 }
253
254 pub fn stats(&self) -> (usize, usize) {
255 (self.fingerprints.len(), self.processed)
256 }
257}
258
259fn jaccard_similarity(a: &[FToken], b: &[FToken]) -> f64 {
261 use std::collections::HashSet;
262 if a.is_empty() && b.is_empty() {
263 return 1.0;
264 }
265 if a.is_empty() || b.is_empty() {
266 return 0.0;
267 }
268 let set_a: HashSet<&FToken> = a.iter().collect();
269 let set_b: HashSet<&FToken> = b.iter().collect();
270 let intersection = set_a.intersection(&set_b).count();
271 let union = set_a.union(&set_b).count();
272 if union == 0 {
273 0.0
274 } else {
275 intersection as f64 / union as f64
276 }
277}
278
279pub struct IntraFileDupDetector;
288
289impl IntraFileDupDetector {
290 pub fn check(file: &ParsedFile) -> Vec<CodeIssue> {
291 let lines: Vec<&str> = file.content.lines().collect();
292 if lines.len() < 10 {
293 return vec![];
294 }
295
296 let chunk_size = 5usize;
297 let mut seen: HashMap<u64, Vec<usize>> = HashMap::new();
298
299 for i in 0..lines.len().saturating_sub(chunk_size - 1) {
300 let chunk: Vec<&str> = lines[i..i + chunk_size]
301 .iter()
302 .map(|l| l.trim())
303 .filter(|l| !l.is_empty() && !l.starts_with("//") && !l.starts_with('#'))
304 .collect();
305 if chunk.len() < 3 {
306 continue;
307 }
308
309 let mut hasher = DefaultHasher::new();
310 for line in &chunk {
311 line.hash(&mut hasher);
312 }
313 let hash = hasher.finish();
314 seen.entry(hash).or_default().push(i);
315 }
316
317 let mut issues = Vec::new();
318 for positions in seen.values() {
319 if positions.len() < 2 {
320 continue;
321 }
322 let is_spaced = positions.windows(2).any(|w| w[1] - w[0] > chunk_size);
324 if !is_spaced {
325 continue;
326 }
327 let start = positions[0];
328 issues.push(CodeIssue {
329 file_path: file.path.clone(),
330 line: start + 1,
331 column: 1,
332 rule_name: "code-duplication".to_string(),
333 message: format!(
334 "Duplicate code block ({} lines) found {} times starting at line {}",
335 chunk_size,
336 positions.len(),
337 start + 1
338 ),
339 severity: Severity::Mild,
340 });
341 }
342 issues
343 }
344}
345
346#[cfg(test)]
347mod tests {
348 use super::*;
349 use crate::treesitter::TreeSitterEngine;
350
351 fn parse_rust_as(name: &str, code: &str) -> ParsedFile {
352 let engine = TreeSitterEngine::new();
353 engine
354 .parse_file(Path::new(name), code)
355 .expect("Should parse")
356 }
357
358 fn parse_rust(code: &str) -> ParsedFile {
359 parse_rust_as("default.rs", code)
360 }
361
362 fn parse_python_as(name: &str, code: &str) -> ParsedFile {
363 let engine = TreeSitterEngine::new();
364 engine
365 .parse_file(Path::new(name), code)
366 .expect("Should parse")
367 }
368
369 #[test]
370 fn test_find_function_nodes() {
371 let file =
372 parse_rust("fn compute(a: i32, b: i32) -> i32 {\n let r = a + b;\n r\n}\n");
373 let root = file.root_node();
374 assert_eq!(root.kind(), "source_file");
375 let mut c2 = root.walk();
376 let found_fn = c2.goto_first_child() && c2.node().kind() == "function_item";
377 assert!(found_fn, "Should find function_item node");
378 }
379
380 #[test]
381 fn test_cross_file_dup_detects_identical_functions() {
382 let mut detector = CrossFileDupDetector::new();
383 let a = parse_rust_as("file_a.rs", "fn compute(a: i32, b: i32) -> i32 {\n let r = a + b;\n let s = r * 2;\n s\n}\n");
384 let b = parse_rust_as("file_b.rs", "fn compute(x: i32, y: i32) -> i32 {\n let z = x + y;\n let w = z * 2;\n w\n}\n");
385 detector.process_file(&a);
386 detector.process_file(&b);
387 let issues = detector.find_duplicates();
388 assert!(!issues.is_empty(), "Should detect cross-file duplicate");
389 }
390
391 #[test]
392 fn test_cross_file_skip_trivial_functions() {
393 let mut detector = CrossFileDupDetector::new();
394 let f = parse_rust("fn noop() {}");
395 detector.process_file(&f);
396 let (count, _) = detector.stats();
397 assert_eq!(count, 0, "Trivial functions should be skipped");
398 }
399
400 #[test]
401 fn test_cross_file_python() {
402 let mut detector = CrossFileDupDetector::new();
403 let a = parse_python_as(
404 "mod_a.py",
405 "def add(a, b):\n result = a + b\n print(result)\n result += 1\n return result\n",
406 );
407 let b = parse_python_as(
408 "mod_b.py",
409 "def sum(x, y):\n total = x + y\n print(total)\n total += 1\n return total\n",
410 );
411 detector.process_file(&a);
412 detector.process_file(&b);
413 let issues = detector.find_duplicates();
414 assert!(!issues.is_empty(), "Should detect Python cross-file dup");
415 }
416
417 #[test]
418 fn test_intra_file_dup_detects_repeated_block() {
419 let file = parse_rust(
420 r#"
421fn main() {
422 let a = 1;
423 let b = 2;
424 let c = a + b;
425 let d = c * 2;
426 println!("block_end");
427 let x = 0;
428 let a = 1;
429 let b = 2;
430 let c = a + b;
431 let d = c * 2;
432 println!("block_end");
433}
434"#,
435 );
436 let issues = IntraFileDupDetector::check(&file);
437 assert!(!issues.is_empty(), "Should detect repeated code block");
438 assert!(issues.iter().any(|i| i.rule_name == "code-duplication"));
439 }
440
441 #[test]
442 fn test_intra_file_clean_code_no_false_positive() {
443 let file = parse_rust(
444 r#"
445fn add(a: i32, b: i32) -> i32 { a + b }
446fn sub(a: i32, b: i32) -> i32 { a - b }
447fn mul(a: i32, b: i32) -> i32 { a * b }
448"#,
449 );
450 let issues = IntraFileDupDetector::check(&file);
451 assert!(issues.is_empty(), "Different functions should not trigger");
452 }
453}