use crate::error::{M1ndError, M1ndResult};
use crate::graph::Graph;
use crate::types::{FiniteF32, NodeId, NodeType};
use serde::Serialize;
use std::collections::HashMap;
#[derive(Clone, Debug)]
pub struct TwinConfig {
pub similarity_threshold: f32,
pub top_k: usize,
pub node_types: Vec<NodeType>,
pub scope: Option<String>,
pub use_edge_types: bool,
}
impl Default for TwinConfig {
fn default() -> Self {
Self {
similarity_threshold: 0.80,
top_k: 50,
node_types: vec![],
scope: None,
use_edge_types: true,
}
}
}
#[derive(Clone, Debug, Serialize)]
pub struct TwinPair {
pub node_a_id: String,
pub node_a_label: String,
pub node_b_id: String,
pub node_b_label: String,
pub similarity: f32,
pub shared_properties: Vec<String>,
}
#[derive(Clone, Debug)]
struct StructuralSignature {
node_id: NodeId,
ext_id: String,
label: String,
features: Vec<f32>,
}
#[derive(Clone, Debug, Serialize)]
pub struct TwinResult {
pub pairs: Vec<TwinPair>,
pub nodes_analyzed: usize,
pub signatures_computed: usize,
pub elapsed_ms: f64,
}
fn node_type_as_f32(nt: &NodeType) -> f32 {
match nt {
NodeType::File => 0.0,
NodeType::Directory => 1.0,
NodeType::Function => 2.0,
NodeType::Class => 3.0,
NodeType::Struct => 4.0,
NodeType::Enum => 5.0,
NodeType::Type => 6.0,
NodeType::Module => 7.0,
NodeType::Reference => 8.0,
NodeType::Concept => 9.0,
NodeType::Material => 10.0,
NodeType::Process => 11.0,
NodeType::Product => 12.0,
NodeType::Supplier => 13.0,
NodeType::Regulatory => 14.0,
NodeType::System => 15.0,
NodeType::Cost => 16.0,
NodeType::Custom(v) => 17.0 + *v as f32,
}
}
const BASE_FEATURES: usize = 8;
fn compute_signature(
graph: &Graph,
node: NodeId,
n: usize,
edge_type_vocab: &HashMap<String, usize>,
use_edge_types: bool,
) -> Vec<f32> {
let edge_vocab_size = if use_edge_types {
edge_type_vocab.len()
} else {
0
};
let total_features = BASE_FEATURES + edge_vocab_size * 2; let mut features = vec![0.0f32; total_features];
let idx = node.as_usize();
if idx >= n || !graph.finalized {
return features;
}
let out_range = graph.csr.out_range(node);
let out_degree = out_range.len() as f32;
features[0] = out_degree;
let in_range = graph.csr.in_range(node);
let in_degree = in_range.len() as f32;
features[1] = in_degree;
let total_degree = in_degree + out_degree;
features[2] = if total_degree > 0.0 {
in_degree / total_degree
} else {
0.5
};
features[3] = graph.nodes.pagerank[idx].get();
features[4] = node_type_as_f32(&graph.nodes.node_type[idx]);
features[5] = graph.nodes.change_frequency[idx].get();
features[6] = graph.nodes.tags[idx].len() as f32;
let mut type_counts: [f32; 8] = [0.0; 8]; for edge_idx in out_range.clone() {
let target = graph.csr.targets[edge_idx];
let tidx = target.as_usize();
if tidx < n {
let t = node_type_as_f32(&graph.nodes.node_type[tidx]) as usize;
if t < type_counts.len() {
type_counts[t] += 1.0;
}
}
}
let total: f32 = type_counts.iter().sum();
if total > 0.0 {
let entropy: f32 = type_counts
.iter()
.filter(|&&c| c > 0.0)
.map(|&c| {
let p = c / total;
-p * p.ln()
})
.sum();
features[7] = entropy;
}
if use_edge_types {
for edge_idx in graph.csr.out_range(node) {
let rel = graph.strings.resolve(graph.csr.relations[edge_idx]);
if let Some(&vocab_idx) = edge_type_vocab.get(rel) {
features[BASE_FEATURES + vocab_idx] += 1.0;
}
}
for edge_idx in graph.csr.in_range(node) {
let fwd_idx = graph.csr.rev_edge_idx[edge_idx];
let rel = graph
.strings
.resolve(graph.csr.relations[fwd_idx.as_usize()]);
if let Some(&vocab_idx) = edge_type_vocab.get(rel) {
features[BASE_FEATURES + edge_vocab_size + vocab_idx] += 1.0;
}
}
}
features
}
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm_a < f32::EPSILON || norm_b < f32::EPSILON {
return 0.0;
}
(dot / (norm_a * norm_b)).clamp(0.0, 1.0)
}
fn describe_shared(a: &[f32], b: &[f32]) -> Vec<String> {
let mut props = Vec::new();
if a.len() < BASE_FEATURES || b.len() < BASE_FEATURES {
return props;
}
if (a[0] - b[0]).abs() < 0.5 {
props.push(format!("out_degree≈{:.0}", a[0]));
}
if (a[1] - b[1]).abs() < 0.5 {
props.push(format!("in_degree≈{:.0}", a[1]));
}
if (a[4] - b[4]).abs() < 0.5 {
props.push("same_node_type".to_string());
}
if (a[6] - b[6]).abs() < 0.5 {
props.push(format!("tag_count≈{:.0}", a[6]));
}
props
}
pub fn find_twins(graph: &Graph, config: &TwinConfig) -> M1ndResult<TwinResult> {
let start = std::time::Instant::now();
let n = graph.num_nodes() as usize;
if n == 0 || !graph.finalized {
return Err(M1ndError::EmptyGraph);
}
let mut node_to_ext: Vec<String> = vec![String::new(); n];
for (interned, node_id) in &graph.id_to_node {
let idx = node_id.as_usize();
if idx < n {
node_to_ext[idx] = graph.strings.resolve(*interned).to_string();
}
}
let mut edge_type_vocab: HashMap<String, usize> = HashMap::new();
if config.use_edge_types {
for i in 0..n {
let nid = NodeId::new(i as u32);
for edge_idx in graph.csr.out_range(nid) {
let rel = graph
.strings
.resolve(graph.csr.relations[edge_idx])
.to_string();
let len = edge_type_vocab.len();
edge_type_vocab.entry(rel).or_insert(len);
}
}
}
let candidate_nodes: Vec<NodeId> = (0..n)
.filter(|&i| {
if !config.node_types.is_empty()
&& !config.node_types.contains(&graph.nodes.node_type[i])
{
return false;
}
if let Some(ref scope) = config.scope {
if !node_to_ext[i].contains(scope.as_str()) {
return false;
}
}
true
})
.map(|i| NodeId::new(i as u32))
.collect();
let signatures: Vec<StructuralSignature> = candidate_nodes
.iter()
.map(|&nid| {
let idx = nid.as_usize();
StructuralSignature {
node_id: nid,
ext_id: node_to_ext[idx].clone(),
label: graph.strings.resolve(graph.nodes.label[idx]).to_string(),
features: compute_signature(graph, nid, n, &edge_type_vocab, config.use_edge_types),
}
})
.collect();
let sigs_count = signatures.len();
let mut pairs: Vec<TwinPair> = Vec::new();
let budget = 100_000usize; let mut comparisons = 0usize;
for i in 0..signatures.len() {
if comparisons >= budget {
break;
}
for j in (i + 1)..signatures.len() {
comparisons += 1;
if comparisons >= budget {
break;
}
let sim = cosine_similarity(&signatures[i].features, &signatures[j].features);
if sim >= config.similarity_threshold {
pairs.push(TwinPair {
node_a_id: signatures[i].ext_id.clone(),
node_a_label: signatures[i].label.clone(),
node_b_id: signatures[j].ext_id.clone(),
node_b_label: signatures[j].label.clone(),
similarity: sim,
shared_properties: describe_shared(
&signatures[i].features,
&signatures[j].features,
),
});
}
}
}
pairs.sort_by(|a, b| {
b.similarity
.partial_cmp(&a.similarity)
.unwrap_or(std::cmp::Ordering::Equal)
});
pairs.truncate(config.top_k);
Ok(TwinResult {
pairs,
nodes_analyzed: candidate_nodes.len(),
signatures_computed: sigs_count,
elapsed_ms: start.elapsed().as_secs_f64() * 1000.0,
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::graph::*;
use crate::types::{EdgeDirection, FiniteF32};
fn build_twin_graph() -> Graph {
let mut g = Graph::new();
g.add_node(
"a_h",
"handler_a",
NodeType::Function,
&["handler"],
0.0,
0.5,
)
.unwrap();
g.add_node("a_p", "process_a", NodeType::Function, &["data"], 0.0, 0.3)
.unwrap();
g.add_node("a_o", "output_a", NodeType::Function, &["output"], 0.0, 0.2)
.unwrap();
g.add_node(
"b_h",
"handler_b",
NodeType::Function,
&["handler"],
0.0,
0.5,
)
.unwrap();
g.add_node("b_p", "process_b", NodeType::Function, &["data"], 0.0, 0.3)
.unwrap();
g.add_node("b_o", "output_b", NodeType::Function, &["output"], 0.0, 0.2)
.unwrap();
g.add_edge(
NodeId::new(0),
NodeId::new(1),
"calls",
FiniteF32::new(0.8),
EdgeDirection::Forward,
false,
FiniteF32::new(0.5),
)
.unwrap();
g.add_edge(
NodeId::new(1),
NodeId::new(2),
"calls",
FiniteF32::new(0.7),
EdgeDirection::Forward,
false,
FiniteF32::new(0.4),
)
.unwrap();
g.add_edge(
NodeId::new(3),
NodeId::new(4),
"calls",
FiniteF32::new(0.8),
EdgeDirection::Forward,
false,
FiniteF32::new(0.5),
)
.unwrap();
g.add_edge(
NodeId::new(4),
NodeId::new(5),
"calls",
FiniteF32::new(0.7),
EdgeDirection::Forward,
false,
FiniteF32::new(0.4),
)
.unwrap();
g.finalize().unwrap();
g
}
fn build_different_graph() -> Graph {
let mut g = Graph::new();
g.add_node("hub", "dispatcher", NodeType::Function, &["core"], 0.0, 0.9)
.unwrap();
g.add_node(
"leaf1",
"handler_1",
NodeType::Function,
&["handler"],
0.0,
0.1,
)
.unwrap();
g.add_node(
"leaf2",
"handler_2",
NodeType::Function,
&["handler"],
0.0,
0.1,
)
.unwrap();
g.add_node(
"leaf3",
"handler_3",
NodeType::Function,
&["handler"],
0.0,
0.1,
)
.unwrap();
g.add_edge(
NodeId::new(0),
NodeId::new(1),
"calls",
FiniteF32::new(0.8),
EdgeDirection::Forward,
false,
FiniteF32::new(0.5),
)
.unwrap();
g.add_edge(
NodeId::new(0),
NodeId::new(2),
"calls",
FiniteF32::new(0.8),
EdgeDirection::Forward,
false,
FiniteF32::new(0.5),
)
.unwrap();
g.add_edge(
NodeId::new(0),
NodeId::new(3),
"calls",
FiniteF32::new(0.8),
EdgeDirection::Forward,
false,
FiniteF32::new(0.5),
)
.unwrap();
g.finalize().unwrap();
g
}
#[test]
fn cosine_sim_identical() {
let a = vec![1.0, 2.0, 3.0];
assert!((cosine_similarity(&a, &a) - 1.0).abs() < 0.001);
}
#[test]
fn cosine_sim_orthogonal() {
let a = vec![1.0, 0.0];
let b = vec![0.0, 1.0];
assert!(cosine_similarity(&a, &b).abs() < 0.001);
}
#[test]
fn cosine_sim_zero_vectors() {
let a = vec![0.0, 0.0, 0.0];
assert!(cosine_similarity(&a, &a).abs() < 0.001);
}
#[test]
fn twins_found_in_symmetric_graph() {
let g = build_twin_graph();
let config = TwinConfig {
similarity_threshold: 0.85,
..TwinConfig::default()
};
let result = find_twins(&g, &config).unwrap();
assert!(result.nodes_analyzed == 6);
assert!(
!result.pairs.is_empty(),
"Should detect twin pairs, got none"
);
let has_handler_twin = result
.pairs
.iter()
.any(|p| (p.node_a_label.contains("handler") && p.node_b_label.contains("handler")));
assert!(
has_handler_twin,
"Should detect handler_a ↔ handler_b twin, pairs: {:?}",
result.pairs
);
}
#[test]
fn hub_not_twin_with_leaf() {
let g = build_different_graph();
let config = TwinConfig {
similarity_threshold: 0.90,
..TwinConfig::default()
};
let result = find_twins(&g, &config).unwrap();
let has_hub_leaf_twin = result
.pairs
.iter()
.any(|p| (p.node_a_label == "dispatcher" || p.node_b_label == "dispatcher"));
assert!(
!has_hub_leaf_twin,
"Hub should not be twin with leaf at 0.9 threshold"
);
}
#[test]
fn leaves_are_twins_of_each_other() {
let g = build_different_graph();
let config = TwinConfig {
similarity_threshold: 0.85,
..TwinConfig::default()
};
let result = find_twins(&g, &config).unwrap();
let leaf_pairs: Vec<_> = result
.pairs
.iter()
.filter(|p| {
p.node_a_label.starts_with("handler_") && p.node_b_label.starts_with("handler_")
})
.collect();
assert!(
leaf_pairs.len() >= 2,
"Should find at least 2 leaf twin pairs, found {}",
leaf_pairs.len()
);
}
#[test]
fn empty_graph_returns_error() {
let g = Graph::new();
let config = TwinConfig::default();
assert!(find_twins(&g, &config).is_err());
}
#[test]
fn scope_filter_limits_analysis() {
let g = build_twin_graph();
let config = TwinConfig {
scope: Some("nonexistent_scope".to_string()),
..TwinConfig::default()
};
let result = find_twins(&g, &config).unwrap();
assert!(
result.pairs.is_empty(),
"Scoped to nonexistent prefix should find no twins"
);
}
}