use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use crate::analyzer::{CodeIssue, Severity};
use crate::treesitter::engine::ParsedFile;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[allow(dead_code)]
enum FToken {
Ident,
Int,
Float,
Str,
Bool,
Type,
}
#[derive(Debug, Clone)]
struct FuncFingerprint {
hash: u64,
tokens: Vec<FToken>,
name: String,
file: PathBuf,
line_start: usize,
}
fn fingerprint_function(file: &ParsedFile, node: tree_sitter::Node) -> Option<FuncFingerprint> {
let name = extract_func_name(file, node)?;
let line_start = node.start_position().row + 1;
let line_end = node.end_position().row + 1;
let line_count = line_end - line_start + 1;
if line_count < 5 {
return None;
}
let tokens = normalize_node(file, node);
if tokens.len() < 8 {
return None;
}
let mut hasher = DefaultHasher::new();
tokens.hash(&mut hasher);
let hash = hasher.finish();
Some(FuncFingerprint {
hash,
tokens,
name,
file: file.path.clone(),
line_start,
})
}
#[allow(clippy::only_used_in_recursion)]
fn normalize_node(file: &ParsedFile, node: tree_sitter::Node) -> Vec<FToken> {
let kind = node.kind();
match kind {
"identifier"
| "field_identifier"
| "shorthand_property_identifier"
| "variable_name"
| "name" => return vec![FToken::Ident],
"type_identifier" | "primitive_type" | "mutable_specifier" => return vec![FToken::Type],
"integer_literal" | "integer" | "number" => return vec![FToken::Int],
"float_literal" | "float" => return vec![FToken::Float],
"string_literal" | "string" | "interpreted_string_literal" | "char_literal" => {
return vec![FToken::Str];
}
"boolean" | "true" | "false" => return vec![FToken::Bool],
"let" | "const" | "mut" | "fn" | "def" | "function" | "fun" | "return" | "if" | "else"
| "for" | "while" | "match" | "use" | "mod" | "struct" | "enum" | "impl" | "trait"
| "pub" | "pub(crate)" | "unsafe" | "async" | "await" | "where" | "in" | "as"
| "static" | "extern" | "ref" | "type" => return vec![],
_ => {}
}
let mut tokens = Vec::new();
let mut cursor = node.walk();
for child in node.named_children(&mut cursor) {
tokens.extend(normalize_node(file, child));
}
tokens
}
fn extract_func_name(file: &ParsedFile, node: tree_sitter::Node) -> Option<String> {
let mut cursor = node.walk();
for child in node.named_children(&mut cursor) {
if child.kind() == "identifier" || child.kind() == "name" {
let start = child.start_byte();
let end = child.end_byte();
return Some(file.content[start..end].to_string());
}
}
None
}
const FN_NODE_KINDS: &[&str] = &[
"function_item", "function_definition", "function_declaration", "method_definition", "method_declaration", "method", ];
fn find_functions_recursive(
file: &ParsedFile,
node: tree_sitter::Node,
fingerprints: &mut Vec<FuncFingerprint>,
processed: &mut usize,
) {
let kind = node.kind();
if FN_NODE_KINDS.contains(&kind) {
if let Some(fp) = fingerprint_function(file, node) {
fingerprints.push(fp);
*processed += 1;
}
}
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
find_functions_recursive(file, child, fingerprints, processed);
}
}
pub struct CrossFileDupDetector {
fingerprints: Vec<FuncFingerprint>,
processed: usize,
}
impl Default for CrossFileDupDetector {
fn default() -> Self {
Self::new()
}
}
impl CrossFileDupDetector {
pub fn new() -> Self {
Self {
fingerprints: Vec::new(),
processed: 0,
}
}
pub fn process_file(&mut self, file: &ParsedFile) {
let root = file.root_node();
find_functions_recursive(file, root, &mut self.fingerprints, &mut self.processed);
}
pub fn find_duplicates(&self) -> Vec<CodeIssue> {
let mut issues = Vec::new();
let mut by_hash: HashMap<u64, Vec<&FuncFingerprint>> = HashMap::new();
for fp in &self.fingerprints {
by_hash.entry(fp.hash).or_default().push(fp);
}
for group in by_hash.values() {
if group.len() < 2 {
continue;
}
let unique_files: std::collections::HashSet<&Path> =
group.iter().map(|fp| fp.file.as_path()).collect();
let file_count = unique_files.len();
if file_count < 2 {
continue;
}
let sev = if group.len() > 10 {
Severity::Nuclear
} else if group.len() > 5 {
Severity::Spicy
} else {
Severity::Mild
};
for fp in group {
issues.push(CodeIssue {
file_path: fp.file.clone(),
line: fp.line_start,
column: 1,
rule_name: "cross-file-duplication".to_string(),
message: format!(
"Function '{}' duplicated in {} files ({} occurrences)",
fp.name,
file_count,
group.len()
),
severity: sev.clone(),
});
}
}
issues
}
pub fn find_near_duplicates(&self) -> Vec<CodeIssue> {
let mut issues = Vec::new();
let fps: Vec<&FuncFingerprint> = self.fingerprints.iter().collect();
let mut seen: std::collections::HashSet<(usize, usize)> = std::collections::HashSet::new();
for i in 0..fps.len() {
for j in (i + 1)..fps.len() {
if seen.contains(&(i, j)) || seen.contains(&(j, i)) {
continue;
}
seen.insert((i, j));
let a = &fps[i].tokens;
let b = &fps[j].tokens;
let sim = jaccard_similarity(a, b);
if sim >= 0.80 && fps[i].file != fps[j].file {
issues.push(CodeIssue {
file_path: fps[i].file.clone(),
line: fps[i].line_start,
column: 1,
rule_name: "cross-file-near-duplicate".to_string(),
message: format!(
"Function '{}' is {:.0}% similar to '{}' in {} — consider refactoring",
fps[i].name,
sim * 100.0,
fps[j].name,
fps[j].file.display(),
),
severity: Severity::Mild,
});
}
}
}
issues
}
pub fn stats(&self) -> (usize, usize) {
(self.fingerprints.len(), self.processed)
}
}
fn jaccard_similarity(a: &[FToken], b: &[FToken]) -> f64 {
use std::collections::HashSet;
if a.is_empty() && b.is_empty() {
return 1.0;
}
if a.is_empty() || b.is_empty() {
return 0.0;
}
let set_a: HashSet<&FToken> = a.iter().collect();
let set_b: HashSet<&FToken> = b.iter().collect();
let intersection = set_a.intersection(&set_b).count();
let union = set_a.union(&set_b).count();
if union == 0 {
0.0
} else {
intersection as f64 / union as f64
}
}
pub struct IntraFileDupDetector;
impl IntraFileDupDetector {
pub fn check(file: &ParsedFile) -> Vec<CodeIssue> {
let lines: Vec<&str> = file.content.lines().collect();
if lines.len() < 10 {
return vec![];
}
let chunk_size = 5usize;
let mut seen: HashMap<u64, Vec<usize>> = HashMap::new();
for i in 0..lines.len().saturating_sub(chunk_size - 1) {
let chunk: Vec<&str> = lines[i..i + chunk_size]
.iter()
.map(|l| l.trim())
.filter(|l| !l.is_empty() && !l.starts_with("//") && !l.starts_with('#'))
.collect();
if chunk.len() < 3 {
continue;
}
let mut hasher = DefaultHasher::new();
for line in &chunk {
line.hash(&mut hasher);
}
let hash = hasher.finish();
seen.entry(hash).or_default().push(i);
}
let mut issues = Vec::new();
for positions in seen.values() {
if positions.len() < 2 {
continue;
}
let is_spaced = positions.windows(2).any(|w| w[1] - w[0] > chunk_size);
if !is_spaced {
continue;
}
let start = positions[0];
issues.push(CodeIssue {
file_path: file.path.clone(),
line: start + 1,
column: 1,
rule_name: "code-duplication".to_string(),
message: format!(
"Duplicate code block ({} lines) found {} times starting at line {}",
chunk_size,
positions.len(),
start + 1
),
severity: Severity::Mild,
});
}
issues
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::treesitter::TreeSitterEngine;
fn parse_rust_as(name: &str, code: &str) -> ParsedFile {
let engine = TreeSitterEngine::new();
engine
.parse_file(Path::new(name), code)
.expect("Should parse")
}
fn parse_rust(code: &str) -> ParsedFile {
parse_rust_as("default.rs", code)
}
fn parse_python_as(name: &str, code: &str) -> ParsedFile {
let engine = TreeSitterEngine::new();
engine
.parse_file(Path::new(name), code)
.expect("Should parse")
}
#[test]
fn test_find_function_nodes() {
let file =
parse_rust("fn compute(a: i32, b: i32) -> i32 {\n let r = a + b;\n r\n}\n");
let root = file.root_node();
assert_eq!(root.kind(), "source_file");
let mut c2 = root.walk();
let found_fn = c2.goto_first_child() && c2.node().kind() == "function_item";
assert!(found_fn, "Should find function_item node");
}
#[test]
fn test_cross_file_dup_detects_identical_functions() {
let mut detector = CrossFileDupDetector::new();
let a = parse_rust_as("file_a.rs", "fn compute(a: i32, b: i32) -> i32 {\n let r = a + b;\n let s = r * 2;\n s\n}\n");
let b = parse_rust_as("file_b.rs", "fn compute(x: i32, y: i32) -> i32 {\n let z = x + y;\n let w = z * 2;\n w\n}\n");
detector.process_file(&a);
detector.process_file(&b);
let issues = detector.find_duplicates();
assert!(!issues.is_empty(), "Should detect cross-file duplicate");
}
#[test]
fn test_cross_file_skip_trivial_functions() {
let mut detector = CrossFileDupDetector::new();
let f = parse_rust("fn noop() {}");
detector.process_file(&f);
let (count, _) = detector.stats();
assert_eq!(count, 0, "Trivial functions should be skipped");
}
#[test]
fn test_cross_file_python() {
let mut detector = CrossFileDupDetector::new();
let a = parse_python_as(
"mod_a.py",
"def add(a, b):\n result = a + b\n print(result)\n result += 1\n return result\n",
);
let b = parse_python_as(
"mod_b.py",
"def sum(x, y):\n total = x + y\n print(total)\n total += 1\n return total\n",
);
detector.process_file(&a);
detector.process_file(&b);
let issues = detector.find_duplicates();
assert!(!issues.is_empty(), "Should detect Python cross-file dup");
}
#[test]
fn test_intra_file_dup_detects_repeated_block() {
let file = parse_rust(
r#"
fn main() {
let a = 1;
let b = 2;
let c = a + b;
let d = c * 2;
println!("block_end");
let x = 0;
let a = 1;
let b = 2;
let c = a + b;
let d = c * 2;
println!("block_end");
}
"#,
);
let issues = IntraFileDupDetector::check(&file);
assert!(!issues.is_empty(), "Should detect repeated code block");
assert!(issues.iter().any(|i| i.rule_name == "code-duplication"));
}
#[test]
fn test_intra_file_clean_code_no_false_positive() {
let file = parse_rust(
r#"
fn add(a: i32, b: i32) -> i32 { a + b }
fn sub(a: i32, b: i32) -> i32 { a - b }
fn mul(a: i32, b: i32) -> i32 { a * b }
"#,
);
let issues = IntraFileDupDetector::check(&file);
assert!(issues.is_empty(), "Different functions should not trigger");
}
}