garbage_code_hunter/treesitter/
duplication.rs

1use std::collections::hash_map::DefaultHasher;
2use std::collections::HashMap;
3use std::hash::{Hash, Hasher};
4use std::path::{Path, PathBuf};
5
6use crate::analyzer::{CodeIssue, Severity};
7use crate::treesitter::engine::ParsedFile;
8
9// ─── Cross-file duplication ──────────────────────────────────────────────────
10
11/// Normalized token for function fingerprinting.
12#[derive(Debug, Clone, PartialEq, Eq, Hash)]
13#[allow(dead_code)]
14enum FToken {
15    Ident,
16    Int,
17    Float,
18    Str,
19    Bool,
20    Type,
21}
22
23/// Fingerprint of a function — hash + source location.
24#[derive(Debug, Clone)]
25struct FuncFingerprint {
26    hash: u64,
27    tokens: Vec<FToken>,
28    name: String,
29    file: PathBuf,
30    line_start: usize,
31}
32
33/// Build a normalized token sequence from a function node.
34fn fingerprint_function(file: &ParsedFile, node: tree_sitter::Node) -> Option<FuncFingerprint> {
35    let name = extract_func_name(file, node)?;
36    let line_start = node.start_position().row + 1;
37    let line_end = node.end_position().row + 1;
38    let line_count = line_end - line_start + 1;
39
40    // Skip small functions (likely trivial or boilerplate)
41    if line_count < 5 {
42        return None;
43    }
44
45    let tokens = normalize_node(file, node);
46    if tokens.len() < 8 {
47        return None;
48    }
49
50    let mut hasher = DefaultHasher::new();
51    tokens.hash(&mut hasher);
52    let hash = hasher.finish();
53
54    Some(FuncFingerprint {
55        hash,
56        tokens,
57        name,
58        file: file.path.clone(),
59        line_start,
60    })
61}
62
63/// Normalize a tree-sitter node into a Vec<FToken> for fingerprinting.
64/// Leaf nodes (identifiers, literals) map to generic tokens.
65/// Compound nodes recurse into their named children.
66#[allow(clippy::only_used_in_recursion)]
67fn normalize_node(file: &ParsedFile, node: tree_sitter::Node) -> Vec<FToken> {
68    let kind = node.kind();
69    match kind {
70        "identifier"
71        | "field_identifier"
72        | "shorthand_property_identifier"
73        | "variable_name"
74        | "name" => return vec![FToken::Ident],
75        "type_identifier" | "primitive_type" | "mutable_specifier" => return vec![FToken::Type],
76        "integer_literal" | "integer" | "number" => return vec![FToken::Int],
77        "float_literal" | "float" => return vec![FToken::Float],
78        "string_literal" | "string" | "interpreted_string_literal" | "char_literal" => {
79            return vec![FToken::Str];
80        }
81        "boolean" | "true" | "false" => return vec![FToken::Bool],
82        // Ignored: keywords, operators, punctuation
83        "let" | "const" | "mut" | "fn" | "def" | "function" | "fun" | "return" | "if" | "else"
84        | "for" | "while" | "match" | "use" | "mod" | "struct" | "enum" | "impl" | "trait"
85        | "pub" | "pub(crate)" | "unsafe" | "async" | "await" | "where" | "in" | "as"
86        | "static" | "extern" | "ref" | "type" => return vec![],
87        _ => {}
88    }
89
90    let mut tokens = Vec::new();
91    let mut cursor = node.walk();
92    for child in node.named_children(&mut cursor) {
93        tokens.extend(normalize_node(file, child));
94    }
95    tokens
96}
97
98/// Extract function name from a function node.
99fn extract_func_name(file: &ParsedFile, node: tree_sitter::Node) -> Option<String> {
100    let mut cursor = node.walk();
101    for child in node.named_children(&mut cursor) {
102        if child.kind() == "identifier" || child.kind() == "name" {
103            let start = child.start_byte();
104            let end = child.end_byte();
105            return Some(file.content[start..end].to_string());
106        }
107    }
108    // Anonymous / closure
109    None
110}
111
112/// Function-like node kinds across languages.
113const FN_NODE_KINDS: &[&str] = &[
114    "function_item",        // Rust
115    "function_definition",  // Python, C, C++
116    "function_declaration", // JavaScript, Go
117    "method_definition",    // JavaScript (class methods)
118    "method_declaration",   // Go, Java
119    "method",               // Ruby
120];
121
122/// Recursively find function-like nodes and fingerprint them.
123fn find_functions_recursive(
124    file: &ParsedFile,
125    node: tree_sitter::Node,
126    fingerprints: &mut Vec<FuncFingerprint>,
127    processed: &mut usize,
128) {
129    let kind = node.kind();
130    if FN_NODE_KINDS.contains(&kind) {
131        if let Some(fp) = fingerprint_function(file, node) {
132            fingerprints.push(fp);
133            *processed += 1;
134        }
135    }
136    let mut cursor = node.walk();
137    for child in node.children(&mut cursor) {
138        find_functions_recursive(file, child, fingerprints, processed);
139    }
140}
141
142/// Cross-file duplication analyzer using tree-sitter fingerprints.
143pub struct CrossFileDupDetector {
144    fingerprints: Vec<FuncFingerprint>,
145    processed: usize,
146}
147
148impl Default for CrossFileDupDetector {
149    fn default() -> Self {
150        Self::new()
151    }
152}
153
154impl CrossFileDupDetector {
155    pub fn new() -> Self {
156        Self {
157            fingerprints: Vec::new(),
158            processed: 0,
159        }
160    }
161
162    /// Process a parsed file, extracting function fingerprints.
163    pub fn process_file(&mut self, file: &ParsedFile) {
164        let root = file.root_node();
165        find_functions_recursive(file, root, &mut self.fingerprints, &mut self.processed);
166    }
167
168    /// Find duplicate functions across files.
169    pub fn find_duplicates(&self) -> Vec<CodeIssue> {
170        let mut issues = Vec::new();
171
172        // Group by hash
173        let mut by_hash: HashMap<u64, Vec<&FuncFingerprint>> = HashMap::new();
174        for fp in &self.fingerprints {
175            by_hash.entry(fp.hash).or_default().push(fp);
176        }
177
178        for group in by_hash.values() {
179            if group.len() < 2 {
180                continue;
181            }
182            let unique_files: std::collections::HashSet<&Path> =
183                group.iter().map(|fp| fp.file.as_path()).collect();
184            let file_count = unique_files.len();
185            if file_count < 2 {
186                continue;
187            }
188
189            let sev = if group.len() > 10 {
190                Severity::Nuclear
191            } else if group.len() > 5 {
192                Severity::Spicy
193            } else {
194                Severity::Mild
195            };
196
197            for fp in group {
198                issues.push(CodeIssue {
199                    file_path: fp.file.clone(),
200                    line: fp.line_start,
201                    column: 1,
202                    rule_name: "cross-file-duplication".to_string(),
203                    message: format!(
204                        "Function '{}' duplicated in {} files ({} occurrences)",
205                        fp.name,
206                        file_count,
207                        group.len()
208                    ),
209                    severity: sev.clone(),
210                });
211            }
212        }
213        issues
214    }
215
216    /// Find near-duplicate functions using Jaccard similarity on normalized tokens.
217    pub fn find_near_duplicates(&self) -> Vec<CodeIssue> {
218        let mut issues = Vec::new();
219        let fps: Vec<&FuncFingerprint> = self.fingerprints.iter().collect();
220        let mut seen: std::collections::HashSet<(usize, usize)> = std::collections::HashSet::new();
221
222        for i in 0..fps.len() {
223            for j in (i + 1)..fps.len() {
224                if seen.contains(&(i, j)) || seen.contains(&(j, i)) {
225                    continue;
226                }
227                seen.insert((i, j));
228
229                let a = &fps[i].tokens;
230                let b = &fps[j].tokens;
231                let sim = jaccard_similarity(a, b);
232
233                if sim >= 0.80 && fps[i].file != fps[j].file {
234                    issues.push(CodeIssue {
235                        file_path: fps[i].file.clone(),
236                        line: fps[i].line_start,
237                        column: 1,
238                        rule_name: "cross-file-near-duplicate".to_string(),
239                        message: format!(
240                            "Function '{}' is {:.0}% similar to '{}' in {} — consider refactoring",
241                            fps[i].name,
242                            sim * 100.0,
243                            fps[j].name,
244                            fps[j].file.display(),
245                        ),
246                        severity: Severity::Mild,
247                    });
248                }
249            }
250        }
251        issues
252    }
253
254    pub fn stats(&self) -> (usize, usize) {
255        (self.fingerprints.len(), self.processed)
256    }
257}
258
259/// Jaccard similarity between two token slices.
260fn jaccard_similarity(a: &[FToken], b: &[FToken]) -> f64 {
261    use std::collections::HashSet;
262    if a.is_empty() && b.is_empty() {
263        return 1.0;
264    }
265    if a.is_empty() || b.is_empty() {
266        return 0.0;
267    }
268    let set_a: HashSet<&FToken> = a.iter().collect();
269    let set_b: HashSet<&FToken> = b.iter().collect();
270    let intersection = set_a.intersection(&set_b).count();
271    let union = set_a.union(&set_b).count();
272    if union == 0 {
273        0.0
274    } else {
275        intersection as f64 / union as f64
276    }
277}
278
279// ─── Intra-file duplication ──────────────────────────────────────────────────
280
281/// Intra-file code duplication detector (chunk-based line matching).
282///
283/// Splits a file into N-line chunks, normalizes them (strips comments),
284/// and flags chunks that appear more than once.
285/// Chunk size of 5 and requirement of 2+ spaced-out occurrences
286/// reduces false positives from repetitive but unrelated code.
287pub struct IntraFileDupDetector;
288
289impl IntraFileDupDetector {
290    pub fn check(file: &ParsedFile) -> Vec<CodeIssue> {
291        let lines: Vec<&str> = file.content.lines().collect();
292        if lines.len() < 10 {
293            return vec![];
294        }
295
296        let chunk_size = 5usize;
297        let mut seen: HashMap<u64, Vec<usize>> = HashMap::new();
298
299        for i in 0..lines.len().saturating_sub(chunk_size - 1) {
300            let chunk: Vec<&str> = lines[i..i + chunk_size]
301                .iter()
302                .map(|l| l.trim())
303                .filter(|l| !l.is_empty() && !l.starts_with("//") && !l.starts_with('#'))
304                .collect();
305            if chunk.len() < 3 {
306                continue;
307            }
308
309            let mut hasher = DefaultHasher::new();
310            for line in &chunk {
311                line.hash(&mut hasher);
312            }
313            let hash = hasher.finish();
314            seen.entry(hash).or_default().push(i);
315        }
316
317        let mut issues = Vec::new();
318        for positions in seen.values() {
319            if positions.len() < 2 {
320                continue;
321            }
322            // Only flag if occurrences are well-separated (not adjacent)
323            let is_spaced = positions.windows(2).any(|w| w[1] - w[0] > chunk_size);
324            if !is_spaced {
325                continue;
326            }
327            let start = positions[0];
328            issues.push(CodeIssue {
329                file_path: file.path.clone(),
330                line: start + 1,
331                column: 1,
332                rule_name: "code-duplication".to_string(),
333                message: format!(
334                    "Duplicate code block ({} lines) found {} times starting at line {}",
335                    chunk_size,
336                    positions.len(),
337                    start + 1
338                ),
339                severity: Severity::Mild,
340            });
341        }
342        issues
343    }
344}
345
346#[cfg(test)]
347mod tests {
348    use super::*;
349    use crate::treesitter::TreeSitterEngine;
350
351    fn parse_rust_as(name: &str, code: &str) -> ParsedFile {
352        let engine = TreeSitterEngine::new();
353        engine
354            .parse_file(Path::new(name), code)
355            .expect("Should parse")
356    }
357
358    fn parse_rust(code: &str) -> ParsedFile {
359        parse_rust_as("default.rs", code)
360    }
361
362    fn parse_python_as(name: &str, code: &str) -> ParsedFile {
363        let engine = TreeSitterEngine::new();
364        engine
365            .parse_file(Path::new(name), code)
366            .expect("Should parse")
367    }
368
369    #[test]
370    fn test_find_function_nodes() {
371        let file =
372            parse_rust("fn compute(a: i32, b: i32) -> i32 {\n    let r = a + b;\n    r\n}\n");
373        let root = file.root_node();
374        assert_eq!(root.kind(), "source_file");
375        let mut c2 = root.walk();
376        let found_fn = c2.goto_first_child() && c2.node().kind() == "function_item";
377        assert!(found_fn, "Should find function_item node");
378    }
379
380    #[test]
381    fn test_cross_file_dup_detects_identical_functions() {
382        let mut detector = CrossFileDupDetector::new();
383        let a = parse_rust_as("file_a.rs", "fn compute(a: i32, b: i32) -> i32 {\n    let r = a + b;\n    let s = r * 2;\n    s\n}\n");
384        let b = parse_rust_as("file_b.rs", "fn compute(x: i32, y: i32) -> i32 {\n    let z = x + y;\n    let w = z * 2;\n    w\n}\n");
385        detector.process_file(&a);
386        detector.process_file(&b);
387        let issues = detector.find_duplicates();
388        assert!(!issues.is_empty(), "Should detect cross-file duplicate");
389    }
390
391    #[test]
392    fn test_cross_file_skip_trivial_functions() {
393        let mut detector = CrossFileDupDetector::new();
394        let f = parse_rust("fn noop() {}");
395        detector.process_file(&f);
396        let (count, _) = detector.stats();
397        assert_eq!(count, 0, "Trivial functions should be skipped");
398    }
399
400    #[test]
401    fn test_cross_file_python() {
402        let mut detector = CrossFileDupDetector::new();
403        let a = parse_python_as(
404            "mod_a.py",
405            "def add(a, b):\n    result = a + b\n    print(result)\n    result += 1\n    return result\n",
406        );
407        let b = parse_python_as(
408            "mod_b.py",
409            "def sum(x, y):\n    total = x + y\n    print(total)\n    total += 1\n    return total\n",
410        );
411        detector.process_file(&a);
412        detector.process_file(&b);
413        let issues = detector.find_duplicates();
414        assert!(!issues.is_empty(), "Should detect Python cross-file dup");
415    }
416
417    #[test]
418    fn test_intra_file_dup_detects_repeated_block() {
419        let file = parse_rust(
420            r#"
421fn main() {
422    let a = 1;
423    let b = 2;
424    let c = a + b;
425    let d = c * 2;
426    println!("block_end");
427    let x = 0;
428    let a = 1;
429    let b = 2;
430    let c = a + b;
431    let d = c * 2;
432    println!("block_end");
433}
434"#,
435        );
436        let issues = IntraFileDupDetector::check(&file);
437        assert!(!issues.is_empty(), "Should detect repeated code block");
438        assert!(issues.iter().any(|i| i.rule_name == "code-duplication"));
439    }
440
441    #[test]
442    fn test_intra_file_clean_code_no_false_positive() {
443        let file = parse_rust(
444            r#"
445fn add(a: i32, b: i32) -> i32 { a + b }
446fn sub(a: i32, b: i32) -> i32 { a - b }
447fn mul(a: i32, b: i32) -> i32 { a * b }
448"#,
449        );
450        let issues = IntraFileDupDetector::check(&file);
451        assert!(issues.is_empty(), "Different functions should not trigger");
452    }
453}
garbage_code_hunter/treesitter/duplication.rs

garbage_code_hunter/treesitter/
duplication.rs