use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::path::Path;
use serde::{Deserialize, Serialize};
use crate::graph::Node;
/// Classification of a detected clone pair.
///
/// `Copy` and `Hash` are derived in addition to the original set: the enum is
/// fieldless, so copies are free, and `Hash` lets it key maps/sets.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum CloneKind {
    /// The normalized function bodies are identical.
    Exact,
    /// The normalized bodies are similar above the configured threshold.
    Near,
}

impl CloneKind {
    /// Stable lowercase label for display/reporting.
    pub fn as_str(&self) -> &'static str {
        match self {
            CloneKind::Exact => "exact",
            CloneKind::Near => "near",
        }
    }
}
/// A single detected clone relationship between two function nodes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClonePair {
    // Graph node ids of the two participants.
    pub node_a_id: String,
    pub node_b_id: String,
    // Function names, for display.
    pub node_a_name: String,
    pub node_b_name: String,
    // Repository-relative source file paths.
    pub node_a_path: String,
    pub node_b_path: String,
    // Start line of each function (the detector treats these as 1-based).
    pub node_a_line: u32,
    pub node_b_line: u32,
    // 4-gram Jaccard similarity in [0, 1]; fixed at 1.0 for exact clones.
    pub similarity: f64,
    pub kind: CloneKind,
}
/// Canonicalizes source text for clone comparison: strips `//` and `/* */`
/// comments, collapses every string/char/template literal (`"…"`, `'…'`,
/// `` `…` ``) to a single `S`, every numeric literal to `N`, and each run of
/// whitespace to a single space; the result is trimmed at both ends.
///
/// Escaped characters inside literals are honored, so an escaped quote does
/// not terminate the literal. The scan is a single left-to-right pass over
/// the characters of `text`.
pub fn normalize_body(text: &str) -> String {
    let cs: Vec<char> = text.chars().collect();
    let mut out = String::with_capacity(text.len());
    let mut pos = 0usize;
    while pos < cs.len() {
        let c = cs[pos];
        let has_next = pos + 1 < cs.len();
        // Line comment: drop everything up to (but not including) the newline.
        if has_next && c == '/' && cs[pos + 1] == '/' {
            while pos < cs.len() && cs[pos] != '\n' {
                pos += 1;
            }
            continue;
        }
        // Block comment: drop through the terminating `*/` (or to end of input).
        if has_next && c == '/' && cs[pos + 1] == '*' {
            pos += 2;
            while pos + 1 < cs.len() && !(cs[pos] == '*' && cs[pos + 1] == '/') {
                pos += 1;
            }
            pos += 2;
            continue;
        }
        // Quoted literal: replace the whole thing with a single `S`.
        if matches!(c, '"' | '\'' | '`') {
            pos += 1;
            loop {
                if pos >= cs.len() {
                    break;
                }
                if cs[pos] == '\\' {
                    pos += 2; // skip the escaped character
                } else if cs[pos] == c {
                    pos += 1; // consume the closing quote
                    break;
                } else {
                    pos += 1;
                }
            }
            out.push('S');
            continue;
        }
        // Numeric literal (including leading-dot floats): replace with `N`.
        // The accepted tail chars cover separators and 0x/0o/0b prefixes.
        if c.is_ascii_digit() || (c == '.' && has_next && cs[pos + 1].is_ascii_digit()) {
            while pos < cs.len() && matches!(cs[pos], '0'..='9' | '.' | '_' | 'x' | 'o' | 'b') {
                pos += 1;
            }
            out.push('N');
            continue;
        }
        // Whitespace run collapses to exactly one space.
        if c.is_whitespace() {
            out.push(' ');
            while pos < cs.len() && cs[pos].is_whitespace() {
                pos += 1;
            }
            continue;
        }
        out.push(c);
        pos += 1;
    }
    out.trim().to_string()
}
/// Hashes the normalized form of `body_text` into a 64-bit fingerprint, so
/// bodies that differ only in comments, literal values, or whitespace
/// intentionally collide.
pub fn fingerprint(body_text: &str) -> u64 {
    let mut hasher = DefaultHasher::new();
    normalize_body(body_text).hash(&mut hasher);
    hasher.finish()
}
/// Jaccard similarity of the 4-byte n-gram sets of `a` and `b`, in `[0, 1]`.
///
/// Identical inputs always score 1.0 — including inputs shorter than one
/// n-gram, which would otherwise yield empty n-gram sets and a spurious 0.0.
/// If exactly one input is empty, the score is 0.0.
pub fn text_similarity(a: &str, b: &str) -> f64 {
    // Fix: identical strings are perfect matches even when they are too
    // short (< 4 bytes) to produce any 4-gram. Also covers ("", "").
    if a == b {
        return 1.0;
    }
    if a.is_empty() || b.is_empty() {
        return 0.0;
    }
    let ngrams_a: std::collections::HashSet<_> = a.as_bytes().windows(4).collect();
    let ngrams_b: std::collections::HashSet<_> = b.as_bytes().windows(4).collect();
    // Non-identical inputs shorter than 4 bytes have no n-grams to compare.
    if ngrams_a.is_empty() || ngrams_b.is_empty() {
        return 0.0;
    }
    let intersection = ngrams_a.intersection(&ngrams_b).count();
    // Both sets are non-empty here, so the union is at least 1.
    let union = ngrams_a.union(&ngrams_b).count();
    intersection as f64 / union as f64
}
pub fn detect_clones(
nodes: &[Node],
repo_path: &Path,
threshold: f64,
) -> anyhow::Result<Vec<ClonePair>> {
let fn_nodes: Vec<&Node> = nodes
.iter()
.filter(|n| n.kind == "Function" && !n.path.is_empty() && n.line_end >= n.line_start)
.collect();
if fn_nodes.len() < 2 {
return Ok(Vec::new());
}
let mut file_cache: std::collections::HashMap<String, Vec<String>> =
std::collections::HashMap::new();
let mut bodies: Vec<(usize, String)> = Vec::new();
for (idx, node) in fn_nodes.iter().enumerate() {
let file_lines = file_cache.entry(node.path.clone()).or_insert_with(|| {
let full_path = repo_path.join(&node.path);
std::fs::read_to_string(&full_path)
.unwrap_or_default()
.lines()
.map(|l| l.to_string())
.collect()
});
let start = (node.line_start as usize).saturating_sub(1);
let end = (node.line_end as usize).min(file_lines.len());
if start >= end {
bodies.push((idx, String::new()));
continue;
}
let body = file_lines[start..end].join("\n");
let normalized = normalize_body(&body);
bodies.push((idx, normalized));
}
let valid_bodies: Vec<(usize, &str)> = bodies
.iter()
.filter(|(_, b)| b.len() >= 4)
.map(|(idx, b)| (*idx, b.as_str()))
.collect();
let mut fp_map: std::collections::HashMap<u64, Vec<usize>> = std::collections::HashMap::new();
for (idx, body) in &valid_bodies {
let mut hasher = DefaultHasher::new();
body.hash(&mut hasher);
let fp = hasher.finish();
fp_map.entry(fp).or_default().push(*idx);
}
let mut pairs: Vec<ClonePair> = Vec::new();
let mut seen: std::collections::HashSet<(usize, usize)> = std::collections::HashSet::new();
for group in fp_map.values() {
if group.len() < 2 {
continue;
}
for i in 0..group.len() {
for j in (i + 1)..group.len() {
let idx_a = group[i];
let idx_b = group[j];
let key = (idx_a.min(idx_b), idx_a.max(idx_b));
if seen.insert(key) {
let node_a = fn_nodes[idx_a];
let node_b = fn_nodes[idx_b];
pairs.push(ClonePair {
node_a_id: node_a.id.clone(),
node_b_id: node_b.id.clone(),
node_a_name: node_a.name.clone(),
node_b_name: node_b.name.clone(),
node_a_path: node_a.path.clone(),
node_b_path: node_b.path.clone(),
node_a_line: node_a.line_start,
node_b_line: node_b.line_start,
similarity: 1.0,
kind: CloneKind::Exact,
});
}
}
}
}
if valid_bodies.len() <= 500 {
for i in 0..valid_bodies.len() {
for j in (i + 1)..valid_bodies.len() {
let idx_a = valid_bodies[i].0;
let idx_b = valid_bodies[j].0;
let key = (idx_a.min(idx_b), idx_a.max(idx_b));
if seen.contains(&key) {
continue; }
let body_a = valid_bodies[i].1;
let body_b = valid_bodies[j].1;
let sim = text_similarity(body_a, body_b);
if sim >= threshold {
seen.insert(key);
let node_a = fn_nodes[idx_a];
let node_b = fn_nodes[idx_b];
pairs.push(ClonePair {
node_a_id: node_a.id.clone(),
node_b_id: node_b.id.clone(),
node_a_name: node_a.name.clone(),
node_b_name: node_b.name.clone(),
node_a_path: node_a.path.clone(),
node_b_path: node_b.path.clone(),
node_a_line: node_a.line_start,
node_b_line: node_b.line_start,
similarity: sim,
kind: CloneKind::Near,
});
}
}
}
}
pairs.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
Ok(pairs)
}