use crate::graph::body_hash::BodyHash128;
use crate::graph::unified::concurrent::CodeGraph;
use crate::graph::unified::node::{NodeId, NodeKind};
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::str::FromStr;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateType {
Body,
Signature,
Struct,
}
impl DuplicateType {
#[must_use]
pub fn parse(value: &str) -> Option<Self> {
value.parse().ok()
}
}
impl FromStr for DuplicateType {
type Err = anyhow::Error;
fn from_str(value: &str) -> Result<Self, Self::Err> {
match value.trim().to_lowercase().as_str() {
"body" | "function" => Ok(Self::Body),
"signature" | "sig" => Ok(Self::Signature),
"struct" | "type" => Ok(Self::Struct),
_ => Err(anyhow::anyhow!("Unknown duplicate type: {value}")),
}
}
}
#[derive(Debug, Clone)]
pub struct DuplicateConfig {
pub threshold: f64,
#[allow(dead_code)]
pub max_results: usize,
pub is_exact_only: bool,
pub max_members_per_group: usize,
}
impl Default for DuplicateConfig {
fn default() -> Self {
Self {
threshold: 0.8,
max_results: 1000,
is_exact_only: false,
max_members_per_group: 10,
}
}
}
#[derive(Debug, Clone)]
pub struct DuplicateGroup {
pub hash: u64,
pub body_hash_128: Option<BodyHash128>,
pub node_ids: Vec<NodeId>,
pub total_members: usize,
pub members_truncated: bool,
}
fn compute_hash(graph: &CodeGraph, node_id: NodeId, dup_type: DuplicateType) -> Option<u64> {
let entry = graph.nodes().get(node_id)?;
if entry.is_unified_loser() {
return None;
}
let strings = graph.strings();
match dup_type {
DuplicateType::Body => {
if let Some(body_hash) = entry.body_hash {
return Some(body_hash.high ^ body_hash.low);
}
if let Some(sig_id) = entry.signature
&& let Some(sig) = strings.resolve(sig_id)
{
let mut hasher = std::collections::hash_map::DefaultHasher::new();
sig.hash(&mut hasher);
entry.kind.hash(&mut hasher);
return Some(hasher.finish());
}
let name = entry
.qualified_name
.and_then(|id| strings.resolve(id))
.or_else(|| strings.resolve(entry.name))?;
let mut hasher = std::collections::hash_map::DefaultHasher::new();
name.hash(&mut hasher);
entry.kind.hash(&mut hasher);
let lines = entry.end_line.saturating_sub(entry.start_line);
lines.hash(&mut hasher);
Some(hasher.finish())
}
DuplicateType::Signature => {
if let Some(sig_id) = entry.signature
&& let Some(sig) = strings.resolve(sig_id)
{
let mut hasher = std::collections::hash_map::DefaultHasher::new();
sig.hash(&mut hasher);
return Some(hasher.finish());
}
let name = strings.resolve(entry.name)?;
let mut hasher = std::collections::hash_map::DefaultHasher::new();
name.hash(&mut hasher);
entry.kind.hash(&mut hasher);
Some(hasher.finish())
}
DuplicateType::Struct => {
if !matches!(entry.kind, NodeKind::Struct | NodeKind::Class) {
return None;
}
let name = entry
.qualified_name
.and_then(|id| strings.resolve(id))
.or_else(|| strings.resolve(entry.name))?;
let mut hasher = std::collections::hash_map::DefaultHasher::new();
name.hash(&mut hasher);
entry.kind.hash(&mut hasher);
Some(hasher.finish())
}
}
}
#[must_use]
pub fn build_duplicate_groups_graph(
dup_type: DuplicateType,
graph: &CodeGraph,
config: &DuplicateConfig,
) -> Vec<DuplicateGroup> {
let mut hash_to_nodes: HashMap<u64, Vec<NodeId>> = HashMap::new();
let relevant_kinds: Vec<NodeKind> = match dup_type {
DuplicateType::Body | DuplicateType::Signature => {
vec![NodeKind::Function, NodeKind::Method]
}
DuplicateType::Struct => vec![NodeKind::Struct, NodeKind::Class],
};
for (node_id, entry) in graph.nodes().iter() {
if entry.is_unified_loser() {
continue;
}
if !relevant_kinds.contains(&entry.kind) {
continue;
}
if let Some(hash) = compute_hash(graph, node_id, dup_type) {
hash_to_nodes.entry(hash).or_default().push(node_id);
}
}
let mut groups: Vec<DuplicateGroup> = hash_to_nodes
.into_iter()
.filter(|(_, nodes)| {
if config.is_exact_only {
nodes.len() > 1
} else {
nodes.len() > 1
}
})
.map(|(hash, mut node_ids)| {
let body_hash_128 = if dup_type == DuplicateType::Body {
node_ids.first().and_then(|&id| {
graph.nodes().get(id).and_then(|entry| {
if entry.is_unified_loser() {
None
} else {
entry.body_hash
}
})
})
} else {
None
};
let strings = graph.strings();
let files = graph.files();
node_ids.sort_by(|&a, &b| {
let path_a = graph
.nodes()
.get(a)
.and_then(|e| files.resolve(e.file))
.map(|p| p.to_string_lossy().into_owned())
.unwrap_or_default();
let path_b = graph
.nodes()
.get(b)
.and_then(|e| files.resolve(e.file))
.map(|p| p.to_string_lossy().into_owned())
.unwrap_or_default();
let name_a = graph
.nodes()
.get(a)
.and_then(|e| strings.resolve(e.name))
.as_deref()
.map(str::to_owned)
.unwrap_or_default();
let name_b = graph
.nodes()
.get(b)
.and_then(|e| strings.resolve(e.name))
.as_deref()
.map(str::to_owned)
.unwrap_or_default();
path_a
.cmp(&path_b)
.then_with(|| name_a.cmp(&name_b))
.then_with(|| a.cmp(&b))
});
let total_members = node_ids.len();
let members_truncated =
config.max_members_per_group > 0 && node_ids.len() > config.max_members_per_group;
if members_truncated {
node_ids.truncate(config.max_members_per_group);
}
DuplicateGroup {
hash,
body_hash_128,
node_ids,
total_members,
members_truncated,
}
})
.collect();
groups.sort_by(|a, b| b.total_members.cmp(&a.total_members));
groups.truncate(config.max_results);
groups
}
#[cfg(test)]
mod tests {
use super::*;
use crate::graph::unified::storage::arena::NodeEntry;
use std::path::Path;
fn create_test_graph_with_signatures(
nodes: &[(&str, NodeKind, Option<&str>)],
) -> (CodeGraph, Vec<NodeId>) {
let mut graph = CodeGraph::new();
let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
let mut node_ids = Vec::new();
for (name, kind, signature) in nodes {
let name_id = graph.strings_mut().intern(name).unwrap();
let mut entry = NodeEntry::new(*kind, name_id, file_id).with_qualified_name(name_id);
if let Some(sig) = signature {
let sig_id = graph.strings_mut().intern(sig).unwrap();
entry = entry.with_signature(sig_id);
}
let node_id = graph.nodes_mut().alloc(entry).unwrap();
node_ids.push(node_id);
}
(graph, node_ids)
}
fn create_test_graph_with_spans(
nodes: &[(&str, NodeKind, u32, u32)], ) -> (CodeGraph, Vec<NodeId>) {
let mut graph = CodeGraph::new();
let file_id = graph.files_mut().register(Path::new("test.rs")).unwrap();
let mut node_ids = Vec::new();
for (name, kind, start_line, end_line) in nodes {
let name_id = graph.strings_mut().intern(name).unwrap();
let entry = NodeEntry::new(*kind, name_id, file_id)
.with_qualified_name(name_id)
.with_location(*start_line, 0, *end_line, 0);
let node_id = graph.nodes_mut().alloc(entry).unwrap();
node_ids.push(node_id);
}
(graph, node_ids)
}
#[test]
fn test_empty_graph() {
let graph = CodeGraph::new();
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
assert!(groups.is_empty());
}
#[test]
fn test_no_duplicates_unique_signatures() {
let nodes = [
("func_a", NodeKind::Function, Some("fn func_a() -> i32")),
("func_b", NodeKind::Function, Some("fn func_b() -> String")),
(
"func_c",
NodeKind::Function,
Some("fn func_c(x: i32) -> bool"),
),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert!(groups.is_empty(), "No duplicates should be found");
}
#[test]
fn test_signature_duplicates() {
let nodes = [
(
"func_a",
NodeKind::Function,
Some("fn process(x: i32) -> i32"),
),
(
"func_b",
NodeKind::Function,
Some("fn process(x: i32) -> i32"),
),
("func_c", NodeKind::Function, Some("fn other() -> String")),
];
let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 1, "Should find one duplicate group");
assert_eq!(
groups[0].node_ids.len(),
2,
"Group should have 2 duplicates"
);
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
}
#[test]
fn test_body_duplicates_with_signatures() {
let nodes = [
(
"func_a",
NodeKind::Function,
Some("fn compute(x: i32) -> i32"),
),
(
"func_b",
NodeKind::Function,
Some("fn compute(x: i32) -> i32"),
),
(
"func_c",
NodeKind::Method,
Some("fn compute(x: i32) -> i32"),
), ];
let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
assert_eq!(groups.len(), 1, "Should find one duplicate group");
assert_eq!(groups[0].node_ids.len(), 2);
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
}
#[test]
fn test_body_duplicates_fallback_to_name_and_span() {
let nodes = [
("helper", NodeKind::Function, 10, 20), ("helper", NodeKind::Function, 30, 40), ("other", NodeKind::Function, 50, 60), ];
let (graph, node_ids) = create_test_graph_with_spans(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
assert_eq!(groups.len(), 1, "Should find one duplicate group");
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
}
#[test]
fn test_struct_duplicates() {
let nodes = [
("MyStruct", NodeKind::Struct, None),
("MyStruct", NodeKind::Struct, None),
("MyStruct", NodeKind::Function, None), ("OtherStruct", NodeKind::Struct, None),
];
let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
assert_eq!(groups.len(), 1, "Should find one duplicate group");
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
assert!(!groups[0].node_ids.contains(&node_ids[2])); }
#[test]
fn test_class_duplicates() {
let nodes = [
("MyClass", NodeKind::Class, None),
("MyClass", NodeKind::Class, None),
("OtherClass", NodeKind::Class, None),
];
let (graph, node_ids) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
assert_eq!(groups.len(), 1);
assert!(groups[0].node_ids.contains(&node_ids[0]));
assert!(groups[0].node_ids.contains(&node_ids[1]));
}
#[test]
fn test_methods_ignored_for_struct_duplicates() {
let nodes = [
("process", NodeKind::Method, Some("fn process()")),
("process", NodeKind::Method, Some("fn process()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Struct, &graph, &config);
assert!(
groups.is_empty(),
"Methods should not be considered for struct duplicates"
);
}
#[test]
fn test_max_results_limit() {
let mut nodes = Vec::new();
for i in 0..10 {
let sig = format!("fn group{i}()");
nodes.push((format!("func_{i}_a"), NodeKind::Function, Some(sig.clone())));
nodes.push((format!("func_{i}_b"), NodeKind::Function, Some(sig)));
}
let nodes_ref: Vec<(&str, NodeKind, Option<&str>)> = nodes
.iter()
.map(|(name, kind, sig)| (name.as_str(), *kind, sig.as_deref()))
.collect();
let (graph, _) = create_test_graph_with_signatures(&nodes_ref);
let config = DuplicateConfig {
max_results: 3,
..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 3, "Should respect max_results limit");
}
#[test]
fn test_groups_sorted_by_size() {
let nodes = [
("large_a", NodeKind::Function, Some("fn large()")),
("large_b", NodeKind::Function, Some("fn large()")),
("large_c", NodeKind::Function, Some("fn large()")),
("small_a", NodeKind::Function, Some("fn small()")),
("small_b", NodeKind::Function, Some("fn small()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 2);
assert_eq!(groups[0].node_ids.len(), 3, "Largest group should be first");
assert_eq!(
groups[1].node_ids.len(),
2,
"Smaller group should be second"
);
}
#[test]
fn test_single_node_not_duplicate() {
let nodes = [
("unique_func", NodeKind::Function, Some("fn unique()")),
("other_func", NodeKind::Function, Some("fn different()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert!(
groups.is_empty(),
"Single nodes should not form duplicate groups"
);
}
#[test]
fn test_mixed_kinds_not_duplicates() {
let nodes = [
("process", NodeKind::Function, Some("fn process()")),
("process", NodeKind::Method, Some("fn process()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Body, &graph, &config);
assert!(
groups.is_empty(),
"Different kinds should not be duplicates for body type"
);
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(
groups.len(),
1,
"Same signature should be duplicates regardless of kind"
);
}
#[test]
fn test_multiple_duplicate_groups() {
let nodes = [
("func_a1", NodeKind::Function, Some("fn alpha()")),
("func_a2", NodeKind::Function, Some("fn alpha()")),
("func_b1", NodeKind::Function, Some("fn beta()")),
("func_b2", NodeKind::Function, Some("fn beta()")),
("unique", NodeKind::Function, Some("fn gamma()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig::default();
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 2, "Should find two duplicate groups");
}
#[test]
fn test_exact_only_config() {
let nodes = [
("func_a", NodeKind::Function, Some("fn exact()")),
("func_b", NodeKind::Function, Some("fn exact()")),
];
let (graph, _) = create_test_graph_with_signatures(&nodes);
let config = DuplicateConfig {
is_exact_only: true,
..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 1);
}
#[test]
fn duplicates_query_excludes_unified_losers() {
use crate::graph::body_hash::BodyHash128;
use crate::graph::unified::build::unification::merge_node_into;
let mut graph = CodeGraph::new();
let winner_name = graph.strings_mut().intern("shared_fn").unwrap();
let winner_qn = graph.strings_mut().intern("mod_a::shared_fn").unwrap();
let sig = graph.strings_mut().intern("fn shared_fn() -> ()").unwrap();
let body = BodyHash128 {
high: 0xDEAD_BEEF,
low: 0xCAFE_F00D,
};
let file_a = graph
.files_mut()
.register(Path::new("src/a.rs"))
.expect("register a");
let file_b = graph
.files_mut()
.register(Path::new("src/b.rs"))
.expect("register b");
let (winner_id, loser_id) = {
let arena = graph.nodes_mut();
let mut w = NodeEntry::new(NodeKind::Function, winner_name, file_a)
.with_location(10, 0, 20, 0)
.with_signature(sig)
.with_body_hash(body);
w.qualified_name = Some(winner_qn);
let w_id = arena.alloc(w).unwrap();
let mut l = NodeEntry::new(NodeKind::Function, winner_name, file_b)
.with_location(5, 0, 15, 0)
.with_signature(sig)
.with_body_hash(body);
l.qualified_name = Some(winner_qn);
let l_id = arena.alloc(l).unwrap();
(w_id, l_id)
};
graph.files_mut().record_node(file_a, winner_id);
graph.files_mut().record_node(file_b, loser_id);
merge_node_into(graph.nodes_mut(), loser_id, winner_id).unwrap();
let loser_entry = graph.nodes().get(loser_id).expect("loser present");
assert!(loser_entry.is_unified_loser());
assert!(loser_entry.body_hash.is_none(), "body_hash must be cleared");
assert!(loser_entry.signature.is_none(), "signature must be cleared");
graph.rebuild_indices();
let groups =
build_duplicate_groups_graph(DuplicateType::Body, &graph, &DuplicateConfig::default());
for group in &groups {
assert!(
!group.node_ids.contains(&loser_id),
"Unified loser leaked into `duplicates:body` group: {:?}",
group.node_ids
);
}
assert!(
groups.is_empty(),
"After filtering losers, the winner is alone and no duplicate group should \
be returned; got {} groups: {groups:?}",
groups.len(),
);
let sig_groups = build_duplicate_groups_graph(
DuplicateType::Signature,
&graph,
&DuplicateConfig::default(),
);
for group in &sig_groups {
assert!(
!group.node_ids.contains(&loser_id),
"Unified loser leaked into `duplicates:signature` group: {:?}",
group.node_ids
);
}
assert!(
sig_groups.is_empty(),
"After filtering losers, the winner is alone and no signature duplicate \
group should be returned; got {} groups",
sig_groups.len(),
);
}
fn create_large_group_graph(member_count: usize) -> (CodeGraph, Vec<NodeId>) {
let mut graph = CodeGraph::new();
let file_a = graph.files_mut().register(Path::new("src/a.rs")).unwrap();
let file_b = graph.files_mut().register(Path::new("src/b.rs")).unwrap();
let shared_sig = graph
.strings_mut()
.intern("fn duplicate_fn() -> i32")
.unwrap();
let mut node_ids = Vec::new();
for i in 0..member_count {
let name = format!("dup_fn_{i}");
let name_id = graph.strings_mut().intern(&name).unwrap();
let file_id = if i % 2 == 0 { file_a } else { file_b };
let entry = NodeEntry::new(NodeKind::Function, name_id, file_id)
.with_qualified_name(name_id)
.with_signature(shared_sig);
let node_id = graph.nodes_mut().alloc(entry).unwrap();
node_ids.push(node_id);
}
(graph, node_ids)
}
#[test]
fn test_per_group_cap_truncation() {
let (graph, _) = create_large_group_graph(10);
let config = DuplicateConfig {
max_members_per_group: 3,
..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 1, "Expected exactly one duplicate group");
let group = &groups[0];
assert_eq!(
group.total_members, 10,
"total_members must be pre-truncation count"
);
assert!(group.members_truncated, "members_truncated must be true");
assert_eq!(
group.node_ids.len(),
3,
"displayed node_ids must be capped at max_members_per_group"
);
}
#[test]
fn test_per_group_cap_zero_means_unlimited() {
let (graph, _) = create_large_group_graph(10);
let config = DuplicateConfig {
max_members_per_group: 0,
..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 1);
let group = &groups[0];
assert_eq!(group.total_members, 10);
assert!(
!group.members_truncated,
"members_truncated must be false when unlimited"
);
assert_eq!(
group.node_ids.len(),
10,
"All members must be returned when cap is 0"
);
}
#[test]
fn test_per_group_cap_no_truncation_when_below_cap() {
let (graph, _) = create_large_group_graph(5);
let config = DuplicateConfig {
max_members_per_group: 10,
..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 1);
let group = &groups[0];
assert_eq!(group.total_members, 5);
assert!(
!group.members_truncated,
"members_truncated must be false when count <= cap"
);
assert_eq!(
group.node_ids.len(),
5,
"All members must be present when below cap"
);
}
#[test]
fn test_per_group_cap_one_member_displayed() {
let (graph, _) = create_large_group_graph(5);
let config = DuplicateConfig {
max_members_per_group: 1,
..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(
groups.len(),
1,
"Group must not be filtered just because only 1 member is displayed"
);
let group = &groups[0];
assert_eq!(group.total_members, 5);
assert!(group.members_truncated);
assert_eq!(group.node_ids.len(), 1);
}
#[test]
fn test_per_group_deterministic_ordering() {
let mut graph = CodeGraph::new();
let file_a = graph.files_mut().register(Path::new("src/a.rs")).unwrap();
let file_b = graph.files_mut().register(Path::new("src/b.rs")).unwrap();
let shared_sig = graph.strings_mut().intern("fn stable_fn() -> i32").unwrap();
let names_and_files: &[(&str, _)] = &[
("z_func", file_b),
("m_func", file_b),
("a_func", file_b),
("z_func", file_a),
("a_func", file_a),
];
let mut inserted: Vec<(String, String, NodeId)> = Vec::new();
for (name, file_id) in names_and_files {
let name_id = graph.strings_mut().intern(name).unwrap();
let entry = NodeEntry::new(NodeKind::Function, name_id, *file_id)
.with_qualified_name(name_id)
.with_signature(shared_sig);
let node_id = graph.nodes_mut().alloc(entry).unwrap();
let path = if *file_id == file_a {
"src/a.rs".to_owned()
} else {
"src/b.rs".to_owned()
};
inserted.push((path, (*name).to_owned(), node_id));
}
let mut expected = inserted.clone();
expected.sort_by(|(pa, na, ia), (pb, nb, ib)| {
pa.cmp(pb).then_with(|| na.cmp(nb)).then_with(|| ia.cmp(ib))
});
let expected_ids: Vec<NodeId> = expected.iter().map(|(_, _, id)| *id).collect();
let config = DuplicateConfig {
max_members_per_group: 0, ..Default::default()
};
let groups = build_duplicate_groups_graph(DuplicateType::Signature, &graph, &config);
assert_eq!(groups.len(), 1, "Expected one duplicate group");
assert_eq!(
groups[0].node_ids, expected_ids,
"node_ids must be in deterministic (file_path, name, NodeId) order"
);
assert_eq!(groups[0].total_members, 5);
assert!(!groups[0].members_truncated);
}
}