use crate::graph::body_hash::BodyHash128;
use crate::graph::unified::concurrent::CodeGraph;
use crate::graph::unified::node::{NodeId, NodeKind};
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::str::FromStr;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateType {
Body,
Signature,
Struct,
}
impl DuplicateType {
#[must_use]
pub fn parse(value: &str) -> Option<Self> {
value.parse().ok()
}
}
impl FromStr for DuplicateType {
type Err = anyhow::Error;
fn from_str(value: &str) -> Result<Self, Self::Err> {
match value.trim().to_lowercase().as_str() {
"body" | "function" => Ok(Self::Body),
"signature" | "sig" => Ok(Self::Signature),
"struct" | "type" => Ok(Self::Struct),
_ => Err(anyhow::anyhow!("Unknown duplicate type: {value}")),
}
}
}
#[derive(Debug, Clone)]
pub struct DuplicateConfig {
pub threshold: f64,
#[allow(dead_code)]
pub max_results: usize,
pub is_exact_only: bool,
}
impl Default for DuplicateConfig {
fn default() -> Self {
Self {
threshold: 0.8,
max_results: 1000,
is_exact_only: false,
}
}
}
#[derive(Debug, Clone)]
pub struct DuplicateGroup {
pub hash: u64,
pub body_hash_128: Option<BodyHash128>,
pub node_ids: Vec<NodeId>,
}
fn compute_hash(graph: &CodeGraph, node_id: NodeId, dup_type: DuplicateType) -> Option<u64> {
let entry = graph.nodes().get(node_id)?;
let strings = graph.strings();
match dup_type {
DuplicateType::Body => {
if let Some(body_hash) = entry.body_hash {
return Some(body_hash.high ^ body_hash.low);
}
if let Some(sig_id) = entry.signature
&& let Some(sig) = strings.resolve(sig_id)
{
let mut hasher = std::collections::hash_map::DefaultHasher::new();
sig.hash(&mut hasher);
entry.kind.hash(&mut hasher);
return Some(hasher.finish());
}
let name = entry
.qualified_name
.and_then(|id| strings.resolve(id))
.or_else(|| strings.resolve(entry.name))?;
let mut hasher = std::collections::hash_map::DefaultHasher::new();
name.hash(&mut hasher);
entry.kind.hash(&mut hasher);
let lines = entry.end_line.saturating_sub(entry.start_line);
lines.hash(&mut hasher);
Some(hasher.finish())
}
DuplicateType::Signature => {
if let Some(sig_id) = entry.signature
&& let Some(sig) = strings.resolve(sig_id)
{
let mut hasher = std::collections::hash_map::DefaultHasher::new();
sig.hash(&mut hasher);
return Some(hasher.finish());
}
let name = strings.resolve(entry.name)?;
let mut hasher = std::collections::hash_map::DefaultHasher::new();
name.hash(&mut hasher);
entry.kind.hash(&mut hasher);
Some(hasher.finish())
}
DuplicateType::Struct => {
if !matches!(entry.kind, NodeKind::Struct | NodeKind::Class) {
return None;
}
let name = entry
.qualified_name
.and_then(|id| strings.resolve(id))
.or_else(|| strings.resolve(entry.name))?;
let mut hasher = std::collections::hash_map::DefaultHasher::new();
name.hash(&mut hasher);
entry.kind.hash(&mut hasher);
Some(hasher.finish())
}
}
}
#[must_use]
pub fn build_duplicate_groups_graph(
dup_type: DuplicateType,
graph: &CodeGraph,
config: &DuplicateConfig,
) -> Vec<DuplicateGroup> {
let mut hash_to_nodes: HashMap<u64, Vec<NodeId>> = HashMap::new();
let relevant_kinds: Vec<NodeKind> = match dup_type {
DuplicateType::Body | DuplicateType::Signature => {
vec![NodeKind::Function, NodeKind::Method]
}
DuplicateType::Struct => vec![NodeKind::Struct, NodeKind::Class],
};
for (node_id, entry) in graph.nodes().iter() {
if !relevant_kinds.contains(&entry.kind) {
continue;
}
if let Some(hash) = compute_hash(graph, node_id, dup_type) {
hash_to_nodes.entry(hash).or_default().push(node_id);
}
}
let mut groups: Vec<DuplicateGroup> = hash_to_nodes
.into_iter()
.filter(|(_, nodes)| {
if config.is_exact_only {
nodes.len() > 1
} else {
nodes.len() > 1
}
})
.map(|(hash, node_ids)| {
let body_hash_128 = if dup_type == DuplicateType::Body {
node_ids
.first()
.and_then(|&id| graph.nodes().get(id).and_then(|entry| entry.body_hash))
} else {
None
};
DuplicateGroup {
hash,
body_hash_128,
node_ids,
}
})
.collect();
groups.sort_by(|a, b| b.node_ids.len().cmp(&a.node_ids.len()));
groups.truncate(config.max_results);
groups
}
#[cfg(test)]
mod tests {
use super::*;
use crate::graph::unified::storage::arena::NodeEntry;
use std::path::Path;
fn create_test_graph_with_signatures(
nodes: &[(&str, NodeKind, Option<&str>)],
) -> (CodeGraph, Vec<NodeId>) {
let mut graph = CodeGraph::new();
let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
let mut node_ids = Vec::new();
for (name, kind, signature) in nodes {
let name_id = graph.strings_mut().intern(name).unwrap();
let mut entry = NodeEntry::new(*kind, name_id, file_id).with_qualified_name(name_id);
if let Some(sig) = signature {
let sig_id = graph.strings_mut().intern(sig).unwrap();
entry = entry.with_signature(sig_id);
}
let node_id = graph.nodes_mut().alloc(entry).unwrap();
node_ids.push(node_id);
}
(graph, node_ids)
}
fn create_test_graph_with_spans(
nodes: &[(&str, NodeKind, u32, u32)], ) -> (CodeGraph, Vec<NodeId>) {
let mut graph = CodeGraph::new();
let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
let mut node_ids = Vec::new();
for (name, kind, start_line, end_line) in nodes {
let name_id = graph.strings_mut().intern(name).unwrap();
let entry = NodeEntry::new(*kind, name_id, file_id)
.with_qualified_name(name_id)
.with_location(*start_line, 0, *end_line, 0);
let node_id = graph.nodes_mut().alloc(entry).unwrap();
node_ids.push(node_id);
}
(graph, node_ids)
}
#[test]
fn test_empty_graph() {
let graph = CodeGraph::new();
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
assert!(groups.is_empty());
}
#[test]
fn test_no_duplicates_unique_signatures() {
let nodes = [
("func_a", NodeKind::Function, Some("fn func_a() -> i32")),
("func_b", NodeKind::Function, Some("fn func_b() -> String")),
(
"func_c",
NodeKind::Function,
Some("fn func_c(x: i32) -> bool"),
),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert!(groups.is_empty(), "No duplicates should be found");
}
#[test]
fn test_signature_duplicates() {
let nodes = [
(
"func_a",
NodeKind::Function,
Some("fn process(x: i32) -> i32"),
),
(
"func_b",
NodeKind::Function,
Some("fn process(x: i32) -> i32"),
),
("func_c", NodeKind::Function, Some("fn other() -> String")),
];
let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 1, "Should find one duplicate group");
assert_eq!(
groups[0].node_ids.len(),
2,
"Group should have 2 duplicates"
);
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
}
#[test]
fn test_body_duplicates_with_signatures() {
let nodes = [
(
"func_a",
NodeKind::Function,
Some("fn compute(x: i32) -> i32"),
),
(
"func_b",
NodeKind::Function,
Some("fn compute(x: i32) -> i32"),
),
(
"func_c",
NodeKind::Method,
Some("fn compute(x: i32) -> i32"),
), ];
let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
assert_eq!(groups.len(), 1, "Should find one duplicate group");
assert_eq!(groups[0].node_ids.len(), 2);
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
}
#[test]
fn test_body_duplicates_fallback_to_name_and_span() {
let nodes = [
("helper", NodeKind::Function, 10, 20), ("helper", NodeKind::Function, 30, 40), ("other", NodeKind::Function, 50, 60), ];
let (graph, node_ids) = create_test_graph_with_spans(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
assert_eq!(groups.len(), 1, "Should find one duplicate group");
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
}
#[test]
fn test_struct_duplicates() {
let nodes = [
("MyStruct", NodeKind::Struct, None),
("MyStruct", NodeKind::Struct, None),
("MyStruct", NodeKind::Function, None), ("OtherStruct", NodeKind::Struct, None),
];
let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
assert_eq!(groups.len(), 1, "Should find one duplicate group");
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
assert!(!groups[0].node_ids.contains(&node_ids[2])); }
#[test]
fn test_class_duplicates() {
let nodes = [
("MyClass", NodeKind::Class, None),
("MyClass", NodeKind::Class, None),
("OtherClass", NodeKind::Class, None),
];
let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
assert_eq!(groups.len(), 1);
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
}
#[test]
fn test_methods_ignored_for_struct_duplicates() {
let nodes = [
("process", NodeKind::Method, Some("fn process()")),
("process", NodeKind::Method, Some("fn process()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
assert!(
groups.is_empty(),
"Methods should not be considered for struct duplicates"
);
}
#[test]
fn test_max_results_limit() {
let mut nodes = Vec::new();
for i in 0..10 {
let sig = format!("fn group{i}()");
nodes.push((format!("func_{i}_a"), NodeKind::Function, Some(sig.clone())));
nodes.push((format!("func_{i}_b"), NodeKind::Function, Some(sig)));
}
let nodes_ref: Vec<(&str, NodeKind, Option<&str>)> = nodes
.iter()
.map(|(name, kind, sig)| (name.as_str(), *kind, sig.as_deref()))
.collect();
let (graph, _) = create_test_graph_with_signatures(&nodes_ref);
let config = DuplicateConfig {
max_results: 3,
..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 3, "Should respect max_results limit");
}
#[test]
fn test_groups_sorted_by_size() {
let nodes = [
("large_a", NodeKind::Function, Some("fn large()")),
("large_b", NodeKind::Function, Some("fn large()")),
("large_c", NodeKind::Function, Some("fn large()")),
("small_a", NodeKind::Function, Some("fn small()")),
("small_b", NodeKind::Function, Some("fn small()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 2);
assert_eq!(groups[0].node_ids.len(), 3, "Largest group should be first");
assert_eq!(
groups[1].node_ids.len(),
2,
"Smaller group should be second"
);
}
#[test]
fn test_single_node_not_duplicate() {
let nodes = [
("unique_func", NodeKind::Function, Some("fn unique()")),
("other_func", NodeKind::Function, Some("fn different()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert!(
groups.is_empty(),
"Single nodes should not form duplicate groups"
);
}
#[test]
fn test_mixed_kinds_not_duplicates() {
let nodes = [
("process", NodeKind::Function, Some("fn process()")),
("process", NodeKind::Method, Some("fn process()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
assert!(
groups.is_empty(),
"Different kinds should not be duplicates for body type"
);
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(
groups.len(),
1,
"Same signature should be duplicates regardless of kind"
);
}
#[test]
fn test_multiple_duplicate_groups() {
let nodes = [
("func_a1", NodeKind::Function, Some("fn alpha()")),
("func_a2", NodeKind::Function, Some("fn alpha()")),
("func_b1", NodeKind::Function, Some("fn beta()")),
("func_b2", NodeKind::Function, Some("fn beta()")),
("unique", NodeKind::Function, Some("fn gamma()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 2, "Should find two duplicate groups");
}
#[test]
fn test_exact_only_config() {
let nodes = [
("func_a", NodeKind::Function, Some("fn exact()")),
("func_b", NodeKind::Function, Some("fn exact()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig {
is_exact_only: true,
..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 1);
}
}