use std::collections::{HashMap, HashSet};
use anyhow::Result;
use infomap_rs::{Infomap, Network};
use crate::graph::{Edge, Graph, Node};
/// Weight `relation_weight` returns for `"calls"` edges.
pub const WEIGHT_CALLS: f64 = 1.0;
/// Weight `relation_weight` returns for `"imports"` edges.
pub const WEIGHT_IMPORTS: f64 = 0.8;
/// Weight `relation_weight` returns for `"type_reference"`, `"inherits"`,
/// `"implements"`, `"uses"` and `"overrides"` edges.
pub const WEIGHT_TYPE_REF: f64 = 0.5;
/// Weight `relation_weight` returns for `"defined_in"`, `"contains"` and
/// `"belongs_to"` edges.
pub const WEIGHT_STRUCTURAL: f64 = 0.2;
/// Weight `relation_weight` returns for `"depends_on"` edges.
pub const WEIGHT_DEPENDS_ON: f64 = 0.4;
/// Default for `ClusterConfig::co_citation_weight`.
pub const WEIGHT_CO_CITATION: f64 = 0.4;
/// Default for `ClusterConfig::co_citation_min_shared`.
pub const CO_CITATION_MIN_SHARED: usize = 2;
/// Default for `ClusterConfig::dir_colocation_weight`.
pub const WEIGHT_DIR_COLOCATION: f64 = 0.3;
/// Default for `ClusterConfig::symbol_similarity_weight`.
pub const WEIGHT_SYMBOL_SIMILARITY: f64 = 0.5;
/// Default for `ClusterConfig::symbol_min_shared_tokens`.
pub const SYMBOL_MIN_SHARED_TOKENS: usize = 2;
/// Default for `ClusterConfig::symbol_min_jaccard`.
pub const SYMBOL_MIN_JACCARD: f64 = 0.15;
/// Deprecated pairwise cap for co-location edges; see the deprecation note.
/// NOTE(review): no use of this constant is visible in this part of the file.
#[deprecated(note = "co-location is now isolation-gated; pairwise limit is unnecessary")]
pub const COLOCATION_PAIRWISE_LIMIT: usize = 80;
/// Returns the edge weight associated with a relation label.
///
/// Labels that are not listed here yield `0.0`.
pub fn relation_weight(relation: &str) -> f64 {
    // The only weight without a named module-level constant.
    const WEIGHT_TESTS_FOR: f64 = 0.3;

    match relation {
        "calls" => WEIGHT_CALLS,
        "imports" => WEIGHT_IMPORTS,
        "depends_on" => WEIGHT_DEPENDS_ON,
        "tests_for" => WEIGHT_TESTS_FOR,
        "type_reference" | "inherits" | "implements" | "uses" | "overrides" => WEIGHT_TYPE_REF,
        "defined_in" | "contains" | "belongs_to" => WEIGHT_STRUCTURAL,
        _ => 0.0,
    }
}
/// Tunable parameters for the clustering pipeline in this module.
#[derive(Debug, Clone)]
pub struct ClusterConfig {
/// Passed to Infomap as `tau` by `run_clustering`.
pub teleportation_rate: f64,
/// Number of Infomap trials (cast to `usize` when handed to Infomap).
pub num_trials: u32,
/// Flat-mode modules with fewer members than this are treated as orphans.
pub min_community_size: usize,
/// Selects hierarchical (tree) output instead of flat modules.
pub hierarchical: bool,
/// Weight for co-citation edges; defaults to `WEIGHT_CO_CITATION`.
pub co_citation_weight: f64,
/// Minimum shared citers for a co-citation edge; defaults to `CO_CITATION_MIN_SHARED`.
pub co_citation_min_shared: usize,
/// Weight for directory co-location edges; defaults to `WEIGHT_DIR_COLOCATION`.
pub dir_colocation_weight: f64,
/// Weight for symbol-similarity edges; defaults to `WEIGHT_SYMBOL_SIMILARITY`.
pub symbol_similarity_weight: f64,
/// Minimum shared tokens for a symbol-similarity edge; defaults to `SYMBOL_MIN_SHARED_TOKENS`.
pub symbol_min_shared_tokens: usize,
/// Minimum Jaccard score for a symbol-similarity edge; defaults to `SYMBOL_MIN_JACCARD`.
pub symbol_min_jaccard: f64,
/// Seed handed to Infomap.
pub seed: u64,
/// Optional size cap for clusters.
/// NOTE(review): not read in the visible part of this file — presumably
/// forwarded by the caller to `split_mega_clusters`; confirm.
pub max_cluster_size: Option<usize>,
/// NOTE(review): presumably the `threshold` argument of `identify_hubs`; confirm against caller.
pub hub_exclusion_threshold: f64,
/// NOTE(review): presumably the `min_degree` argument of `identify_hubs`; confirm against caller.
pub hub_min_degree: usize,
}
impl Default for ClusterConfig {
    /// Builds the baseline configuration; fields are listed in declaration order.
    fn default() -> Self {
        Self {
            // Infomap settings.
            teleportation_rate: 0.05,
            num_trials: 10,
            min_community_size: 2,
            hierarchical: false,
            // Synthetic-edge settings, taken from the module-level constants.
            co_citation_weight: WEIGHT_CO_CITATION,
            co_citation_min_shared: CO_CITATION_MIN_SHARED,
            dir_colocation_weight: WEIGHT_DIR_COLOCATION,
            symbol_similarity_weight: WEIGHT_SYMBOL_SIMILARITY,
            symbol_min_shared_tokens: SYMBOL_MIN_SHARED_TOKENS,
            symbol_min_jaccard: SYMBOL_MIN_JACCARD,
            // Reproducibility and post-processing.
            seed: 42,
            max_cluster_size: None,
            hub_exclusion_threshold: 0.05,
            hub_min_degree: 10,
        }
    }
}
/// A cluster of node ids as produced by the clustering functions in this module.
#[derive(Debug, Clone)]
pub struct RawCluster {
/// Cluster identifier; the builders in this file renumber it to the
/// cluster's position in the returned list.
pub id: usize,
/// Ids of the member nodes (cloned from the `idx_to_id` table).
pub member_ids: Vec<String>,
/// Flow reported by Infomap for the module; `0.0` for clusters that were
/// synthesized from directory grouping.
pub flow: f64,
/// Id of the parent cluster in hierarchical output, `None` otherwise.
pub parent: Option<usize>,
/// Ids of child clusters in hierarchical output; empty otherwise.
pub children: Vec<usize>,
}
/// Summary statistics about a clustering run.
#[derive(Debug, Clone, Default)]
pub struct ClusterMetrics {
/// Codelength reported by the Infomap result.
pub codelength: f64,
/// Number of communities; `run_clustering` stores the Infomap module count,
/// `map_to_components` and `post_process` store the component count.
pub num_communities: usize,
/// Number of nodes in the clustered network.
pub num_total: usize,
/// Nodes that initially fell into modules below the minimum community size.
pub orphan_count_raw: usize,
/// Orphans attached to an existing cluster via edge-weight affinity.
pub orphans_merged_by_affinity: usize,
/// Orphans that were handled by the directory-based fallback.
pub orphans_assigned_by_dir: usize,
/// Clusters with exactly one member after all merging.
pub singleton_clusters_final: usize,
/// NOTE(review): not written in the visible part of this file; presumably
/// filled by the caller after mega-cluster splitting — confirm.
pub clusters_split: usize,
/// NOTE(review): not written in the visible part of this file; confirm against caller.
pub sub_clusters_created: usize,
}
/// Graph-shaped output of clustering: component nodes, their edges, and metrics.
#[derive(Debug, Clone)]
pub struct ClusterResult {
/// One component node per cluster (see `map_to_components`).
pub nodes: Vec<Node>,
/// `"contains"` edges from components to members and to child components.
pub edges: Vec<Edge>,
/// Statistics describing the run.
pub metrics: ClusterMetrics,
}
impl ClusterResult {
pub fn empty() -> Self {
Self {
nodes: Vec::new(),
edges: Vec::new(),
metrics: ClusterMetrics {
codelength: 0.0,
num_communities: 0,
num_total: 0,
..Default::default()
},
}
}
}
/// Builds the file-level network used for clustering.
///
/// Every graph node that is a file (`node_type == "file"`, or
/// `node_type == "code"` with `node_kind == "File"`) becomes one network node.
/// Edges between any two graph nodes are projected onto the files that own
/// them and their relation weights (see `relation_weight`) are summed per
/// directed file pair. Edges with zero weight, with an unresolvable endpoint,
/// or within a single file are dropped.
///
/// Returns the network together with the index → node-id table.
pub fn build_network(graph: &Graph) -> (Network, Vec<String>) {
    let mut id_to_idx: HashMap<&str, usize> = HashMap::new();
    let mut idx_to_id: Vec<String> = Vec::new();
    let mut non_file_nodes = Vec::new();

    // Pass 1: index the file nodes and set the rest aside.
    for node in &graph.nodes {
        let is_file = node.node_type.as_deref() == Some("file")
            || (node.node_type.as_deref() == Some("code")
                && node.node_kind.as_deref() == Some("File"));
        if is_file {
            id_to_idx.insert(node.id.as_str(), idx_to_id.len());
            idx_to_id.push(node.id.clone());
        } else {
            non_file_nodes.push(node);
        }
    }

    // Pass 2: map every other node to its owning file, trying the `file_path`
    // field first and the `file_path` metadata entry second.
    let file_idx_for_path =
        |fp: &str| id_to_idx.get(format!("file:{}", fp).as_str()).copied();
    let mut node_to_file_idx: HashMap<&str, usize> = HashMap::new();
    for node in non_file_nodes {
        let owner = node
            .file_path
            .as_deref()
            .and_then(file_idx_for_path)
            .or_else(|| {
                node.metadata
                    .get("file_path")
                    .and_then(|v| v.as_str())
                    .and_then(file_idx_for_path)
            });
        if let Some(idx) = owner {
            node_to_file_idx.insert(node.id.as_str(), idx);
        }
    }

    // Resolve an edge endpoint to a file index (symbol mapping wins over a
    // direct file-id match, as before).
    let resolve = |id: &str| {
        node_to_file_idx
            .get(id)
            .or_else(|| id_to_idx.get(id))
            .copied()
    };

    let mut edge_weights: HashMap<(usize, usize), f64> = HashMap::new();
    for edge in &graph.edges {
        let w = relation_weight(&edge.relation);
        if w == 0.0 {
            continue;
        }
        if let (Some(f), Some(t)) = (resolve(edge.from.as_str()), resolve(edge.to.as_str())) {
            if f != t {
                *edge_weights.entry((f, t)).or_insert(0.0) += w;
            }
        }
    }

    let mut net = Network::new();
    for (idx, node_id) in idx_to_id.iter().enumerate() {
        net.add_node_name(idx, node_id);
    }

    // `HashMap` iteration order differs from run to run; insert edges in a
    // fixed (from, to) order so that network construction is reproducible.
    let mut weighted_edges: Vec<((usize, usize), f64)> = edge_weights.into_iter().collect();
    weighted_edges.sort_unstable_by_key(|&(pair, _)| pair);
    for ((from, to), total_weight) in weighted_edges {
        net.add_edge(from, to, total_weight);
    }
    (net, idx_to_id)
}
/// Adds symmetric "co-citation" edges between files that are referenced by
/// the same set of other files.
///
/// Two files get an edge (in both directions) when at least `min_shared`
/// distinct files reference both of them through an `imports`, `calls`,
/// `uses`, `type_reference` or `depends_on` relation. The edge weight is
/// `weight * shared`, capped at `max_edge_weight`.
///
/// Does nothing when `weight <= 0.0` or `idx_to_id` is empty.
pub fn add_co_citation_edges(
    net: &mut Network,
    graph: &Graph,
    idx_to_id: &[String],
    weight: f64,
    min_shared: usize,
    max_edge_weight: f64,
) {
    if weight <= 0.0 || idx_to_id.is_empty() {
        return;
    }

    let id_to_idx: HashMap<&str, usize> = idx_to_id
        .iter()
        .enumerate()
        .map(|(idx, id)| (id.as_str(), idx))
        .collect();

    // Map non-file nodes to their owning file (field first, metadata second).
    let file_idx_for_path =
        |fp: &str| id_to_idx.get(format!("file:{}", fp).as_str()).copied();
    let mut node_to_file_idx: HashMap<&str, usize> = HashMap::new();
    for node in &graph.nodes {
        let is_file = node.node_type.as_deref() == Some("file")
            || (node.node_type.as_deref() == Some("code")
                && node.node_kind.as_deref() == Some("File"));
        if is_file {
            continue;
        }
        let owner = node
            .file_path
            .as_deref()
            .and_then(file_idx_for_path)
            .or_else(|| {
                node.metadata
                    .get("file_path")
                    .and_then(|v| v.as_str())
                    .and_then(file_idx_for_path)
            });
        if let Some(idx) = owner {
            node_to_file_idx.insert(node.id.as_str(), idx);
        }
    }

    let resolve = |id: &str| {
        node_to_file_idx
            .get(id)
            .or_else(|| id_to_idx.get(id))
            .copied()
    };

    // target file -> set of files that reference it
    let mut imported_by: HashMap<usize, HashSet<usize>> = HashMap::new();
    for edge in &graph.edges {
        let is_import_like = matches!(
            edge.relation.as_str(),
            "imports" | "calls" | "uses" | "type_reference" | "depends_on"
        );
        if !is_import_like {
            continue;
        }
        if let (Some(from), Some(to)) = (resolve(edge.from.as_str()), resolve(edge.to.as_str())) {
            if from != to {
                imported_by.entry(to).or_default().insert(from);
            }
        }
    }

    // Sort the candidates: `HashMap` key order changes between runs, and a
    // fixed order keeps edge insertion reproducible.
    let mut candidates: Vec<usize> = imported_by.keys().copied().collect();
    if candidates.len() < 2 {
        return;
    }
    candidates.sort_unstable();

    let mut co_citation_edges = 0usize;
    for (i, &a) in candidates.iter().enumerate() {
        let citers_a = &imported_by[&a];
        for &b in &candidates[i + 1..] {
            let citers_b = &imported_by[&b];
            let shared = citers_a.intersection(citers_b).count();
            if shared >= min_shared {
                let edge_weight = (weight * shared as f64).min(max_edge_weight);
                net.add_edge(a, b, edge_weight);
                net.add_edge(b, a, edge_weight);
                co_citation_edges += 1;
            }
        }
    }

    if co_citation_edges > 0 {
        eprintln!(
            "🔗 Added {} co-citation edges ({} candidate files, min_shared={})",
            co_citation_edges,
            candidates.len(),
            min_shared,
        );
    }
}
pub fn add_dir_colocation_edges(net: &mut Network, idx_to_id: &[String], weight: f64) {
if weight <= 0.0 {
return;
}
let mut isolated: HashSet<usize> = HashSet::new();
for idx in 0..idx_to_id.len() {
if idx < net.num_nodes()
&& net.out_neighbors(idx).is_empty()
&& net.in_neighbors(idx).is_empty()
{
isolated.insert(idx);
}
}
if isolated.is_empty() {
return;
}
let mut dir_groups: HashMap<String, Vec<usize>> = HashMap::new();
for &idx in &isolated {
let node_id = &idx_to_id[idx];
let path = node_id.strip_prefix("file:").unwrap_or(node_id);
let dir = match path.rsplit_once('/') {
Some((parent, _)) if !parent.is_empty() => parent.to_string(),
_ => "root".to_string(),
};
dir_groups.entry(dir).or_default().push(idx);
}
let mut total_edges = 0usize;
for (_dir, files) in &dir_groups {
if files.len() < 2 {
continue; }
add_pairwise_edges(net, files, weight);
total_edges += files.len() * (files.len() - 1); }
if total_edges > 0 {
eprintln!(
"📂 Co-location: {} isolated files → {} edges across {} directories",
isolated.len(),
total_edges,
dir_groups.values().filter(|f| f.len() >= 2).count(),
);
}
}
/// Adds an edge in both directions between every unordered pair in `files`.
fn add_pairwise_edges(net: &mut Network, files: &[usize], weight: f64) {
    for (pos, &first) in files.iter().enumerate() {
        // Pair `first` with every later entry so each pair is visited once.
        for &second in &files[pos + 1..] {
            net.add_edge(first, second, weight);
            net.add_edge(second, first, weight);
        }
    }
}
/// Splits an identifier at camel-case boundaries.
///
/// A new word starts at an uppercase character that either follows a
/// non-uppercase character (`fooBar` → `foo`, `Bar`) or is followed by a
/// lowercase character (`HTTPServer` → `HTTP`, `Server`).
fn split_camel_case(s: &str) -> Vec<String> {
    let chars: Vec<char> = s.chars().collect();
    let mut words: Vec<String> = Vec::new();
    let mut word_start = 0usize;

    for i in 1..chars.len() {
        if !chars[i].is_uppercase() {
            continue;
        }
        let follows_non_upper = !chars[i - 1].is_uppercase();
        let precedes_lower = chars.get(i + 1).map_or(false, |next| next.is_lowercase());
        if follows_non_upper || precedes_lower {
            words.push(chars[word_start..i].iter().collect());
            word_start = i;
        }
    }

    // Flush the trailing word (absent only for empty input).
    if word_start < chars.len() {
        words.push(chars[word_start..].iter().collect());
    }
    words
}
/// Returns `true` when `word` is one of the generic tokens that are ignored
/// during symbol tokenization. The comparison is exact (case-sensitive).
fn is_stop_word(word: &str) -> bool {
    const STOP_WORDS: &[&str] = &[
        "get", "set", "is", "has", "on", "from", "to", "new", "create",
        "make", "with", "for", "the", "an", "default", "init", "handle",
        "process", "do", "run", "execute", "test", "spec", "mock", "stub",
        "should", "expect", "describe", "it", "before", "after",
        "use", "fn", "func", "function", "method", "class", "type",
        "value", "data", "item", "result", "response", "request",
        "index", "main", "app", "module", "export", "import",
        "self", "this", "super", "that", "then", "else", "if",
        "return", "async", "await", "try", "catch", "throw", "error",
        "null", "undefined", "true", "false", "none", "some",
        "add", "remove", "delete", "update", "check", "can", "will",
        "of", "in", "at", "by", "or", "and", "not", "all", "any",
    ];
    STOP_WORDS.contains(&word)
}
fn tokenize_symbol_name(name: &str) -> HashSet<String> {
let mut tokens = HashSet::new();
for part in name.split('_') {
for word in split_camel_case(part) {
let lower = word.to_lowercase();
if lower.len() >= 2 && !is_stop_word(&lower) {
tokens.insert(lower);
}
}
}
tokens
}
/// Adds symmetric edges between files whose symbol names share vocabulary.
///
/// Names of `Function`, `Class`, `Method` and `Module` nodes are tokenized
/// and pooled per owning file (resolved through the node's `file_path`).
/// Two files get an edge in both directions when they share at least
/// `min_shared` tokens and `shared / |union|` is at least `min_jaccard`;
/// the edge weight is `weight * jaccard`. Tokens that occur in more than 200
/// files (or in a single file) do not count towards `shared`.
///
/// Does nothing when `weight <= 0.0` or `idx_to_id` is empty.
pub fn add_symbol_similarity_edges(
    net: &mut Network,
    graph: &Graph,
    idx_to_id: &[String],
    weight: f64,
    min_shared: usize,
    min_jaccard: f64,
) {
    if weight <= 0.0 || idx_to_id.is_empty() {
        return;
    }

    let id_to_idx: HashMap<&str, usize> = idx_to_id
        .iter()
        .enumerate()
        .map(|(idx, id)| (id.as_str(), idx))
        .collect();

    // file index -> all tokens found in the names of its symbols
    let mut file_tokens: HashMap<usize, HashSet<String>> = HashMap::new();
    for node in &graph.nodes {
        let is_file = node.node_type.as_deref() == Some("file")
            || (node.node_type.as_deref() == Some("code")
                && node.node_kind.as_deref() == Some("File"));
        if is_file {
            continue;
        }
        let is_symbol = matches!(
            node.node_kind.as_deref(),
            Some("Function") | Some("Class") | Some("Method") | Some("Module")
        );
        if !is_symbol {
            continue;
        }
        let file_idx = node.file_path.as_ref().and_then(|fp| {
            let file_id = format!("file:{}", fp);
            id_to_idx.get(file_id.as_str()).copied()
        });
        if let Some(idx) = file_idx {
            file_tokens
                .entry(idx)
                .or_default()
                .extend(tokenize_symbol_name(&node.title));
        }
    }

    // token -> files that contain it
    let mut inverted_index: HashMap<&str, Vec<usize>> = HashMap::new();
    for (&file_idx, tokens) in &file_tokens {
        for token in tokens {
            inverted_index
                .entry(token.as_str())
                .or_default()
                .push(file_idx);
        }
    }

    // (smaller idx, larger idx) -> number of shared tokens
    let mut shared_counts: HashMap<(usize, usize), usize> = HashMap::new();
    for files in inverted_index.values() {
        // Skip tokens that cannot form a pair or are too common to be informative.
        if files.len() < 2 || files.len() > 200 {
            continue;
        }
        for (i, &first) in files.iter().enumerate() {
            for &second in &files[i + 1..] {
                let pair = if first < second {
                    (first, second)
                } else {
                    (second, first)
                };
                *shared_counts.entry(pair).or_insert(0) += 1;
            }
        }
    }

    // Insert edges in sorted pair order: `HashMap` iteration order changes
    // between runs, and a fixed order keeps the network reproducible.
    let mut qualifying: Vec<((usize, usize), usize)> = shared_counts
        .into_iter()
        .filter(|&(_, shared)| shared >= min_shared)
        .collect();
    qualifying.sort_unstable();

    let mut edges_added = 0usize;
    for ((a, b), shared) in qualifying {
        let tokens_a = &file_tokens[&a];
        let tokens_b = &file_tokens[&b];
        let union_size = tokens_a.union(tokens_b).count();
        if union_size == 0 {
            continue;
        }
        let jaccard = shared as f64 / union_size as f64;
        if jaccard < min_jaccard {
            continue;
        }
        let edge_weight = weight * jaccard;
        net.add_edge(a, b, edge_weight);
        net.add_edge(b, a, edge_weight);
        edges_added += 1;
    }

    if edges_added > 0 {
        eprintln!(
            "🏷️ Symbol similarity: {} files with symbols → {} edges (min_shared={}, min_jaccard={:.2})",
            file_tokens.len(),
            edges_added,
            min_shared,
            min_jaccard,
        );
    }
}
pub fn run_clustering(
net: &Network,
idx_to_id: &[String],
config: &ClusterConfig,
) -> (Vec<RawCluster>, ClusterMetrics) {
let result = Infomap::new(net)
.seed(config.seed)
.num_trials(config.num_trials as usize)
.hierarchical(config.hierarchical)
.tau(config.teleportation_rate)
.run();
let (clusters, diag) = if config.hierarchical {
let c = build_hierarchical_clusters(&result, idx_to_id);
let singleton_count = c.iter().filter(|cl| cl.member_ids.len() == 1).count();
(c, OrphanDiagnostics {
orphan_count_raw: 0,
merged_by_affinity: 0,
assigned_by_dir: 0,
singleton_clusters_final: singleton_count,
})
} else {
build_flat_clusters(&result, idx_to_id, net, config.min_community_size)
};
let metrics = ClusterMetrics {
codelength: result.codelength(),
num_communities: result.num_modules(),
num_total: net.num_nodes(),
orphan_count_raw: diag.orphan_count_raw,
orphans_merged_by_affinity: diag.merged_by_affinity,
orphans_assigned_by_dir: diag.assigned_by_dir,
singleton_clusters_final: diag.singleton_clusters_final,
..Default::default()
};
(clusters, metrics)
}
/// Splits clusters larger than `max_cluster_size` and renumbers the result.
///
/// Delegates to `split_mega_clusters_recursive` starting at depth 0 with a
/// depth limit of 3, then sets every cluster id to its position in the list.
pub fn split_mega_clusters(
    clusters: Vec<RawCluster>,
    net: &Network,
    idx_to_id: &[String],
    config: &ClusterConfig,
    max_cluster_size: usize,
) -> Vec<RawCluster> {
    const MAX_SPLIT_DEPTH: usize = 3;

    split_mega_clusters_recursive(
        clusters,
        net,
        idx_to_id,
        config,
        max_cluster_size,
        0,
        MAX_SPLIT_DEPTH,
    )
    .into_iter()
    .enumerate()
    .map(|(position, mut cluster)| {
        cluster.id = position;
        cluster
    })
    .collect()
}
/// Recursively re-clusters every cluster that has more than
/// `max_cluster_size` members.
///
/// For each oversized cluster the induced subgraph of `net` is extracted and
/// flat Infomap is run on it with a raised teleportation rate
/// (`config.teleportation_rate * 1.5`, capped at `0.15`). If that yields more
/// than one module the cluster is replaced by the modules; otherwise it is
/// kept unchanged. The pass repeats on the new list while anything was split,
/// up to `max_depth` levels.
///
/// Ids assigned to sub-clusters here are provisional; `split_mega_clusters`
/// renumbers the final list.
fn split_mega_clusters_recursive(
    clusters: Vec<RawCluster>,
    net: &Network,
    idx_to_id: &[String],
    config: &ClusterConfig,
    max_cluster_size: usize,
    depth: usize,
    max_depth: usize,
) -> Vec<RawCluster> {
    if depth >= max_depth {
        if depth > 0 {
            eprintln!(
                "⚠ Max recursion depth {} reached for mega-cluster splitting",
                max_depth
            );
        }
        return clusters;
    }

    // Nothing oversized: the list would be returned unchanged, so skip the
    // index construction below.
    if clusters
        .iter()
        .all(|c| c.member_ids.len() <= max_cluster_size)
    {
        return clusters;
    }

    // Loop-invariant lookups, built once instead of once per oversized cluster.
    let id_to_net_idx: HashMap<&str, usize> = idx_to_id
        .iter()
        .enumerate()
        .map(|(idx, nid)| (nid.as_str(), idx))
        .collect();
    let sub_tau = (config.teleportation_rate * 1.5).min(0.15);

    let mut result_clusters: Vec<RawCluster> = Vec::new();
    let mut any_split = false;

    for cluster in clusters {
        if cluster.member_ids.len() <= max_cluster_size {
            result_clusters.push(cluster);
            continue;
        }
        eprintln!(
            " 🔪 Splitting mega-cluster {} with {} files (max: {})",
            cluster.id,
            cluster.member_ids.len(),
            max_cluster_size
        );

        let member_net_indices: Vec<usize> = cluster
            .member_ids
            .iter()
            .filter_map(|mid| id_to_net_idx.get(mid.as_str()).copied())
            .collect();
        if member_net_indices.is_empty() {
            result_clusters.push(cluster);
            continue;
        }

        let subgraph = extract_subgraph(net, &member_net_indices);
        // Subgraph node `i` corresponds to `member_net_indices[i]`.
        let sub_idx_to_id: Vec<String> = member_net_indices
            .iter()
            .map(|&idx| idx_to_id[idx].clone())
            .collect();
        if subgraph.num_nodes() < 2 {
            result_clusters.push(cluster);
            continue;
        }

        eprintln!(
            " 📊 Subgraph: {} nodes, {} edges, tau={:.4}",
            subgraph.num_nodes(),
            subgraph.num_edges(),
            sub_tau,
        );
        // Sub-clustering is always flat, regardless of `config.hierarchical`.
        let sub_result = Infomap::new(&subgraph)
            .seed(config.seed)
            .num_trials(config.num_trials as usize)
            .hierarchical(false)
            .tau(sub_tau)
            .run();
        let sub_modules = sub_result.modules();
        if sub_modules.len() <= 1 {
            eprintln!(
                " ℹ Cluster {} is monolithic (1 sub-module), keeping as-is (edges: {})",
                cluster.id,
                subgraph.num_edges(),
            );
            result_clusters.push(cluster);
            continue;
        }

        any_split = true;
        eprintln!(
            " ✓ Split cluster {} into {} sub-clusters",
            cluster.id,
            sub_modules.len()
        );
        let base_id = result_clusters.len();
        for (sub_idx, module) in sub_modules.iter().enumerate() {
            let sub_member_ids: Vec<String> = module
                .nodes
                .iter()
                .map(|&idx| sub_idx_to_id[idx].clone())
                .collect();
            if !sub_member_ids.is_empty() {
                result_clusters.push(RawCluster {
                    id: base_id + sub_idx,
                    member_ids: sub_member_ids,
                    flow: module.flow,
                    parent: None,
                    children: Vec::new(),
                });
            }
        }
    }

    if any_split {
        // Freshly created sub-clusters may themselves still be oversized.
        split_mega_clusters_recursive(
            result_clusters,
            net,
            idx_to_id,
            config,
            max_cluster_size,
            depth + 1,
            max_depth,
        )
    } else {
        result_clusters
    }
}
fn extract_subgraph(net: &Network, node_indices: &[usize]) -> Network {
let mut subgraph = Network::new();
let mut old_to_new: HashMap<usize, usize> = HashMap::new();
for (new_idx, &old_idx) in node_indices.iter().enumerate() {
old_to_new.insert(old_idx, new_idx);
subgraph.add_node_name(new_idx, &format!("{}", old_idx));
}
let node_set: HashSet<usize> = node_indices.iter().copied().collect();
for &from_old in node_indices {
for &(to_old, weight) in net.out_neighbors(from_old) {
if node_set.contains(&to_old) {
let from_new = old_to_new[&from_old];
let to_new = old_to_new[&to_old];
subgraph.add_edge(from_new, to_new, weight);
}
}
}
subgraph
}
/// Splits oversized leaf clusters into one cluster per parent directory.
///
/// A cluster is split when it has no children and more than `max_size`
/// members, provided its members span at least two directories. The
/// resulting sub-clusters inherit the original cluster's `parent`, have
/// `flow` 0.0, and are emitted in directory-name order so the output is
/// reproducible. All other clusters are passed through.
///
/// Afterwards every cluster id is set to its position in the returned list,
/// and `parent` / `children` references are rewritten to the new ids (a split
/// child is replaced by all of its sub-clusters in its parent's `children`).
fn split_oversized_by_directory(clusters: Vec<RawCluster>, max_size: usize) -> Vec<RawCluster> {
    let mut result: Vec<RawCluster> = Vec::new();
    // original cluster id -> positions in `result` that now represent it
    let mut new_positions: HashMap<usize, Vec<usize>> = HashMap::new();

    for cluster in clusters {
        let oversized_leaf = cluster.children.is_empty() && cluster.member_ids.len() > max_size;
        if !oversized_leaf {
            new_positions
                .entry(cluster.id)
                .or_default()
                .push(result.len());
            result.push(cluster);
            continue;
        }

        eprintln!(
            " 📁 Splitting oversized leaf cluster {} ({} files) by directory (max: {})",
            cluster.id,
            cluster.member_ids.len(),
            max_size,
        );
        let mut dir_groups: HashMap<String, Vec<String>> = HashMap::new();
        for member_id in &cluster.member_ids {
            dir_groups
                .entry(extract_parent_dir(member_id))
                .or_default()
                .push(member_id.clone());
        }
        if dir_groups.len() <= 1 {
            eprintln!(
                " ℹ All files in same directory, keeping cluster {} as-is",
                cluster.id,
            );
            new_positions
                .entry(cluster.id)
                .or_default()
                .push(result.len());
            result.push(cluster);
            continue;
        }

        // `HashMap` iteration order is arbitrary; sort for stable output.
        let mut groups: Vec<(String, Vec<String>)> = dir_groups.into_iter().collect();
        groups.sort_by(|a, b| a.0.cmp(&b.0));
        let sub_count = groups.len();
        for (_dir, members) in groups {
            new_positions
                .entry(cluster.id)
                .or_default()
                .push(result.len());
            result.push(RawCluster {
                id: result.len(),
                member_ids: members,
                flow: 0.0,
                parent: cluster.parent,
                children: Vec::new(),
            });
        }
        eprintln!(
            " ✓ Split into {} directory-based sub-clusters",
            sub_count,
        );
    }

    // Renumber, and keep the hierarchy consistent with the new ids.
    for (new_id, c) in result.iter_mut().enumerate() {
        c.id = new_id;
        if let Some(old_parent) = c.parent {
            // Parents are never split (they have children), so they map 1:1.
            if let Some(&mapped) = new_positions.get(&old_parent).and_then(|v| v.first()) {
                c.parent = Some(mapped);
            }
        }
        if !c.children.is_empty() {
            let remapped: Vec<usize> = c
                .children
                .iter()
                .flat_map(|old_child| match new_positions.get(old_child) {
                    Some(positions) => positions.clone(),
                    // Unknown child id: leave the reference untouched.
                    None => vec![*old_child],
                })
                .collect();
            c.children = remapped;
        }
    }
    result
}
/// Counters describing how orphan nodes were handled while building clusters.
#[derive(Debug, Clone, Default)]
struct OrphanDiagnostics {
/// Nodes whose Infomap module was smaller than the minimum community size.
orphan_count_raw: usize,
/// Orphans attached to a cluster because of edge weight towards it.
merged_by_affinity: usize,
/// Orphans that went through the directory-based fallback.
assigned_by_dir: usize,
/// Clusters left with exactly one member.
singleton_clusters_final: usize,
}
/// Returns the parent directory of a node id, ignoring a leading `file:`.
///
/// Yields `"root"` when the path contains no `/` or when the text before the
/// last `/` is empty.
fn extract_parent_dir(node_id: &str) -> String {
    let path = match node_id.strip_prefix("file:") {
        Some(rest) => rest,
        None => node_id,
    };
    path.rfind('/')
        .map(|slash| &path[..slash])
        .filter(|parent| !parent.is_empty())
        .unwrap_or("root")
        .to_string()
}
/// Scores how much of a leading path two `/`-separated strings share.
///
/// Each leading segment that is identical in both inputs contributes its
/// byte length plus one; counting stops at the first differing segment.
fn common_prefix_len(a: &str, b: &str) -> usize {
    a.split('/')
        .zip(b.split('/'))
        .take_while(|(left, right)| left == right)
        .map(|(segment, _)| segment.len() + 1)
        .sum()
}
/// Turns flat Infomap modules into clusters and finds a home for orphans.
///
/// Modules with at least `min_community_size` members become clusters; the
/// nodes of smaller modules are orphans and are handled as follows:
///
/// 1. **Affinity** – repeatedly (at most 100 rounds) attach each orphan to the
///    cluster with the largest total edge weight (in + out) towards it. Ties
///    go to the lowest cluster index so the outcome is reproducible, and
///    incomparable (NaN) weights no longer cause a panic.
/// 2. **Directory grouping** – remaining orphans are grouped by parent
///    directory (visited in directory-name order); groups reaching
///    `min_community_size` become clusters.
/// 3. **Nearest directory** – whatever is left joins the cluster containing
///    the member with the longest shared directory prefix, or becomes a
///    singleton cluster when nothing matches.
///
/// When no module reaches `min_community_size`, all orphans are grouped by
/// directory without a size requirement.
///
/// Cluster ids equal list positions on return. The diagnostics report how
/// many nodes went through each stage.
fn build_flat_clusters(
    result: &infomap_rs::InfomapResult,
    idx_to_id: &[String],
    net: &Network,
    min_community_size: usize,
) -> (Vec<RawCluster>, OrphanDiagnostics) {
    /// Groups ids by parent directory; groups are returned sorted by
    /// directory name, members keep their input order.
    fn group_by_dir(ids: Vec<String>) -> Vec<(String, Vec<String>)> {
        let mut groups: HashMap<String, Vec<String>> = HashMap::new();
        for id in ids {
            groups.entry(extract_parent_dir(&id)).or_default().push(id);
        }
        let mut sorted: Vec<(String, Vec<String>)> = groups.into_iter().collect();
        sorted.sort_by(|a, b| a.0.cmp(&b.0));
        sorted
    }

    let mut diag = OrphanDiagnostics::default();
    let mut clusters: Vec<RawCluster> = Vec::new();
    let mut orphan_nodes: Vec<(usize, String)> = Vec::new();

    for module in result.modules() {
        let member_ids: Vec<String> = module
            .nodes
            .iter()
            .map(|&idx| idx_to_id[idx].clone())
            .collect();
        if member_ids.len() < min_community_size {
            orphan_nodes.extend(module.nodes.iter().copied().zip(member_ids));
        } else {
            clusters.push(RawCluster {
                id: clusters.len(),
                member_ids,
                flow: module.flow,
                parent: None,
                children: Vec::new(),
            });
        }
    }
    diag.orphan_count_raw = orphan_nodes.len();

    if !orphan_nodes.is_empty() && clusters.is_empty() {
        // No regular cluster exists: fall back to pure directory grouping.
        let ids: Vec<String> = orphan_nodes.into_iter().map(|(_, id)| id).collect();
        diag.assigned_by_dir += ids.len();
        for (_dir, group) in group_by_dir(ids) {
            clusters.push(RawCluster {
                id: clusters.len(),
                member_ids: group,
                flow: 0.0,
                parent: None,
                children: Vec::new(),
            });
        }
    } else if !orphan_nodes.is_empty() {
        // network index -> position of the cluster that currently owns it
        let id_to_net_idx: HashMap<&str, usize> = idx_to_id
            .iter()
            .enumerate()
            .map(|(idx, nid)| (nid.as_str(), idx))
            .collect();
        let mut net_idx_to_cluster: HashMap<usize, usize> = HashMap::new();
        for (ci, cluster) in clusters.iter().enumerate() {
            for mid in &cluster.member_ids {
                if let Some(&net_idx) = id_to_net_idx.get(mid.as_str()) {
                    net_idx_to_cluster.insert(net_idx, ci);
                }
            }
        }

        // Stage 1: affinity merging. A merged orphan can make its own
        // neighbors mergeable, hence the repeated rounds.
        const MAX_ITERATIONS: usize = 100;
        let mut unassigned: Vec<(usize, String)> = orphan_nodes;
        for _ in 0..MAX_ITERATIONS {
            let mut merged_any = false;
            let mut still_unassigned: Vec<(usize, String)> = Vec::new();
            for (node_idx, node_id) in unassigned {
                let mut cluster_weights: HashMap<usize, f64> = HashMap::new();
                for &(neighbor_idx, w) in net.out_neighbors(node_idx) {
                    if let Some(&ci) = net_idx_to_cluster.get(&neighbor_idx) {
                        *cluster_weights.entry(ci).or_insert(0.0) += w;
                    }
                }
                for &(neighbor_idx, w) in net.in_neighbors(node_idx) {
                    if let Some(&ci) = net_idx_to_cluster.get(&neighbor_idx) {
                        *cluster_weights.entry(ci).or_insert(0.0) += w;
                    }
                }
                // Heaviest cluster wins; equal (or incomparable) weights are
                // resolved in favor of the lowest cluster index.
                let best = cluster_weights
                    .iter()
                    .max_by(|a, b| {
                        a.1.partial_cmp(b.1)
                            .unwrap_or(std::cmp::Ordering::Equal)
                            .then_with(|| b.0.cmp(a.0))
                    })
                    .map(|(&ci, _)| ci);
                match best {
                    Some(best_ci) => {
                        clusters[best_ci].member_ids.push(node_id);
                        net_idx_to_cluster.insert(node_idx, best_ci);
                        merged_any = true;
                        diag.merged_by_affinity += 1;
                    }
                    None => still_unassigned.push((node_idx, node_id)),
                }
            }
            unassigned = still_unassigned;
            if !merged_any || unassigned.is_empty() {
                break;
            }
        }

        if !unassigned.is_empty() {
            // Stage 2: directory grouping.
            let ids: Vec<String> = unassigned.into_iter().map(|(_, id)| id).collect();
            diag.assigned_by_dir += ids.len();
            let mut leftovers: Vec<String> = Vec::new();
            for (_dir, group) in group_by_dir(ids) {
                if group.len() >= min_community_size {
                    clusters.push(RawCluster {
                        id: clusters.len(),
                        member_ids: group,
                        flow: 0.0,
                        parent: None,
                        children: Vec::new(),
                    });
                } else {
                    leftovers.extend(group);
                }
            }

            // Stage 3: nearest directory. `clusters` is non-empty in this
            // branch, so every leftover is compared against existing members.
            for orphan_id in leftovers {
                let orphan_dir = extract_parent_dir(&orphan_id);
                let mut best_cluster: Option<usize> = None;
                let mut best_prefix_len: usize = 0;
                for (ci, cluster) in clusters.iter().enumerate() {
                    for member in &cluster.member_ids {
                        let member_dir = extract_parent_dir(member);
                        let prefix_len = common_prefix_len(&orphan_dir, &member_dir);
                        // Strict `>` keeps the first cluster on ties.
                        if prefix_len > best_prefix_len {
                            best_prefix_len = prefix_len;
                            best_cluster = Some(ci);
                        }
                    }
                }
                match best_cluster {
                    Some(ci) => clusters[ci].member_ids.push(orphan_id),
                    None => clusters.push(RawCluster {
                        id: clusters.len(),
                        member_ids: vec![orphan_id],
                        flow: 0.0,
                        parent: None,
                        children: Vec::new(),
                    }),
                }
            }
        }
    }

    for (i, c) in clusters.iter_mut().enumerate() {
        c.id = i;
    }
    diag.singleton_clusters_final = clusters
        .iter()
        .filter(|c| c.member_ids.len() == 1)
        .count();
    (clusters, diag)
}
/// Converts an Infomap result into clusters that mirror its module tree.
///
/// When the result exposes a tree, one cluster is created per tree node via
/// `build_tree_clusters`; otherwise each flat module becomes a parentless
/// cluster.
fn build_hierarchical_clusters(
    result: &infomap_rs::InfomapResult,
    idx_to_id: &[String],
) -> Vec<RawCluster> {
    let mut clusters: Vec<RawCluster> = Vec::new();
    match result.tree() {
        Some(tree) => {
            let mut next_id: usize = 0;
            build_tree_clusters(tree, idx_to_id, &mut clusters, &mut next_id, None, String::new());
        }
        None => {
            for module in result.modules() {
                let id = clusters.len();
                let member_ids: Vec<String> = module
                    .nodes
                    .iter()
                    .map(|&idx| idx_to_id[idx].clone())
                    .collect();
                clusters.push(RawCluster {
                    id,
                    member_ids,
                    flow: module.flow,
                    parent: None,
                    children: Vec::new(),
                });
            }
        }
    }
    clusters
}
fn build_tree_clusters(
tree_node: &infomap_rs::TreeNode,
idx_to_id: &[String],
clusters: &mut Vec<RawCluster>,
counter: &mut usize,
parent_idx: Option<usize>,
_path: String,
) {
let my_idx = *counter;
*counter += 1;
let member_ids: Vec<String> = if let Some(ref nodes) = tree_node.nodes {
nodes.iter().map(|&idx| idx_to_id[idx].clone()).collect()
} else {
Vec::new()
};
clusters.push(RawCluster {
id: my_idx,
member_ids,
flow: tree_node.flow,
parent: parent_idx,
children: Vec::new(),
});
if let Some(pidx) = parent_idx {
clusters[pidx].children.push(my_idx);
}
if let Some(ref children) = tree_node.children {
for (ci, child) in children.iter().enumerate() {
let child_path = if _path.is_empty() {
format!("{}", ci)
} else {
format!("{}.{}", _path, ci)
};
build_tree_clusters(child, idx_to_id, clusters, counter, Some(my_idx), child_path);
}
}
}
/// Converts raw clusters into component nodes and `"contains"` edges.
///
/// For each cluster one node with id `infer:component:<n>` is produced, where
/// `<n>` is the cluster's dotted tree path in hierarchical input and the
/// cluster id otherwise. The node title is derived from the members' file
/// paths via `auto_name`; `flow` and `size` (plus `total_size` for clusters
/// with children) are stored as metadata. Edges link every component to its
/// members and, in hierarchical input, to its child components.
///
/// Clusters that have children but no direct members are titled from their
/// children's titles. These are processed deepest-first so that a child's
/// final title is known before its parent is named.
///
/// The returned metrics only carry the number of clusters.
pub fn map_to_components(clusters: &[RawCluster], graph: &Graph) -> ClusterResult {
    let node_map: HashMap<&str, &Node> = graph
        .nodes
        .iter()
        .map(|n| (n.id.as_str(), n))
        .collect();
    let mut nodes: Vec<Node> = Vec::new();
    let mut edges: Vec<Edge> = Vec::new();

    let is_hierarchical = clusters
        .iter()
        .any(|c| c.parent.is_some() || !c.children.is_empty());
    // Empty for flat input, so the id fallback below always applies there.
    let dot_paths: HashMap<usize, String> = if is_hierarchical {
        build_dot_paths(clusters)
    } else {
        HashMap::new()
    };
    let component_id_for = |cluster_id: usize| -> String {
        match dot_paths.get(&cluster_id) {
            Some(path) => format!("infer:component:{}", path),
            None => format!("infer:component:{}", cluster_id),
        }
    };
    let infer_meta = serde_json::json!({"source": "infer"});

    for cluster in clusters {
        let component_id = component_id_for(cluster.id);

        // Paths of members known to the graph; prefer the node's own
        // `file_path`, fall back to the id without its `file:` prefix.
        let file_paths: Vec<&str> = cluster
            .member_ids
            .iter()
            .filter_map(|mid| {
                node_map.get(mid.as_str()).and_then(|n| {
                    n.file_path
                        .as_deref()
                        .or_else(|| mid.strip_prefix("file:"))
                })
            })
            .collect();
        let title = if !file_paths.is_empty() {
            auto_name(&file_paths)
        } else if is_hierarchical && !cluster.children.is_empty() {
            // Placeholder; replaced below once child titles are final.
            "component".to_string()
        } else {
            auto_name(&[])
        };

        let mut node = Node::new(&component_id, &title);
        node.node_type = Some("component".into());
        node.source = Some("infer".into());
        node.metadata
            .insert("flow".into(), serde_json::json!(cluster.flow));
        node.metadata
            .insert("size".into(), serde_json::json!(cluster.member_ids.len()));
        if is_hierarchical && !cluster.children.is_empty() {
            let total = count_total_descendants(cluster, clusters);
            node.metadata
                .insert("total_size".into(), serde_json::json!(total));
        }
        nodes.push(node);

        for mid in &cluster.member_ids {
            let mut edge = Edge::new(&component_id, mid, "contains");
            edge.metadata = Some(infer_meta.clone());
            edges.push(edge);
        }
        if is_hierarchical {
            for &child_id in &cluster.children {
                let child_component_id = component_id_for(child_id);
                let mut edge = Edge::new(&component_id, &child_component_id, "contains");
                edge.metadata = Some(infer_meta.clone());
                edges.push(edge);
            }
        }
    }

    if is_hierarchical {
        // `nodes[i]` was created from `clusters[i]`.
        let cluster_id_to_node_idx: HashMap<usize, usize> = clusters
            .iter()
            .enumerate()
            .map(|(i, c)| (c.id, i))
            .collect();

        // Depth = number of separators in the dotted path; clusters without a
        // path count as depth 0.
        let depth_of = |cluster_id: usize| {
            dot_paths
                .get(&cluster_id)
                .map_or(0, |path| path.matches('.').count())
        };
        let mut internal: Vec<usize> = clusters
            .iter()
            .enumerate()
            .filter(|(_, c)| c.member_ids.is_empty() && !c.children.is_empty())
            .map(|(pos, _)| pos)
            .collect();
        // Deepest first: children always have a longer path than their parent.
        internal.sort_by_key(|&pos| std::cmp::Reverse(depth_of(clusters[pos].id)));

        for pos in internal {
            let cluster = &clusters[pos];
            let child_titles: Vec<&str> = cluster
                .children
                .iter()
                .filter_map(|child_id| {
                    cluster_id_to_node_idx
                        .get(child_id)
                        .map(|&idx| nodes[idx].title.as_str())
                })
                .collect();
            if child_titles.is_empty() {
                continue;
            }
            let new_title = auto_name_hierarchical(&child_titles);
            if let Some(&node_idx) = cluster_id_to_node_idx.get(&cluster.id) {
                nodes[node_idx].title = new_title;
            }
        }
    }

    let metrics = ClusterMetrics {
        codelength: 0.0,
        num_communities: clusters.len(),
        num_total: 0,
        ..Default::default()
    };
    ClusterResult {
        nodes,
        edges,
        metrics,
    }
}
/// Assigns every cluster reachable from a root a dotted position path.
///
/// Roots (clusters without a parent) are numbered `0`, `1`, … in list order;
/// descendants receive `<parent path>.<child position>`.
fn build_dot_paths(clusters: &[RawCluster]) -> HashMap<usize, String> {
    let mut paths: HashMap<usize, String> = HashMap::new();
    let mut root_rank = 0usize;
    for cluster in clusters {
        if cluster.parent.is_some() {
            continue;
        }
        let root_path = root_rank.to_string();
        root_rank += 1;
        paths.insert(cluster.id, root_path.clone());
        assign_child_paths(clusters, cluster.id, &root_path, &mut paths);
    }
    paths
}
/// Records `<parent_path>.<position>` for each child of `parent_id`, then
/// recurses into the children. Does nothing if `parent_id` is not in `clusters`.
fn assign_child_paths(
    clusters: &[RawCluster],
    parent_id: usize,
    parent_path: &str,
    paths: &mut HashMap<usize, String>,
) {
    let parent = match clusters.iter().find(|c| c.id == parent_id) {
        Some(found) => found,
        None => return,
    };
    for (rank, child_id) in parent.children.iter().copied().enumerate() {
        let path = format!("{}.{}", parent_path, rank);
        paths.insert(child_id, path.clone());
        assign_child_paths(clusters, child_id, &path, paths);
    }
}
/// Derives a short component name from the paths of its files.
///
/// * no paths → `"component"`;
/// * one path → its parent directory name, or the file stem when the path has
///   no directory part;
/// * several paths with a common leading segment run → the deepest shared
///   segment (or the one above it when the deepest looks like a file name,
///   i.e. contains a `.`);
/// * otherwise → the directory segment that occurs most often across the
///   paths; on equal counts the lexicographically smallest one is chosen so
///   the name does not vary between runs;
/// * paths without any directory segment → `component-<hash % 10000>`.
pub fn auto_name(file_paths: &[&str]) -> String {
    if file_paths.is_empty() {
        return "component".to_string();
    }
    if file_paths.len() == 1 {
        let parts: Vec<&str> = file_paths[0].split('/').collect();
        if parts.len() > 1 {
            return parts[parts.len() - 2].to_string();
        }
        return parts[0]
            .rsplit_once('.')
            .map(|(stem, _)| stem)
            .unwrap_or(parts[0])
            .to_string();
    }

    let split_paths: Vec<Vec<&str>> = file_paths
        .iter()
        .map(|p| p.split('/').collect::<Vec<_>>())
        .collect();

    // Length of the segment run shared by all paths.
    let min_len = split_paths.iter().map(|p| p.len()).min().unwrap_or(0);
    let prefix_len = (0..min_len)
        .take_while(|&i| {
            let first = split_paths[0][i];
            split_paths.iter().all(|p| p[i] == first)
        })
        .count();
    if prefix_len > 0 {
        let deepest = split_paths[0][prefix_len - 1];
        // A dotted segment is most likely a file name; prefer its directory.
        if deepest.contains('.') && prefix_len > 1 {
            return split_paths[0][prefix_len - 2].to_string();
        }
        return deepest.to_string();
    }

    // Count directory segments (everything except the last segment of a path).
    let mut freq: HashMap<&str, usize> = HashMap::new();
    for parts in &split_paths {
        for &part in parts.iter().take(parts.len().saturating_sub(1)) {
            *freq.entry(part).or_insert(0) += 1;
        }
    }
    // Highest count wins; ties are broken by name because `HashMap` iteration
    // order is arbitrary and would otherwise make the result unstable.
    let most_common = freq.iter().max_by(|(dir_a, count_a), (dir_b, count_b)| {
        count_a.cmp(count_b).then_with(|| dir_b.cmp(dir_a))
    });
    if let Some((&dir, _)) = most_common {
        return dir.to_string();
    }

    let hash: u64 = file_paths.iter().fold(0u64, |acc, p| {
        acc.wrapping_add(
            p.bytes()
                .fold(0u64, |h, b| h.wrapping_mul(31).wrapping_add(b as u64)),
        )
    });
    format!("component-{}", hash % 10000)
}
/// Derives a title for a parent component from its children's titles.
///
/// * no titles → `"group"`;
/// * one title → `"<title>-group"`;
/// * titles sharing leading tokens (split on `-`, `_`, `/`) → those tokens
///   joined with `-`;
/// * otherwise → the first three titles joined with `+`, followed by `+…`
///   when there are more than three.
pub fn auto_name_hierarchical(child_titles: &[&str]) -> String {
    match child_titles {
        [] => return "group".to_string(),
        [only] => return format!("{}-group", only),
        _ => {}
    }

    let tokenized: Vec<Vec<&str>> = child_titles
        .iter()
        .map(|title| {
            title
                .split(|c: char| c == '-' || c == '_' || c == '/')
                .collect()
        })
        .collect();

    // Count the leading tokens of the first title that all others repeat.
    let (first, rest) = (&tokenized[0], &tokenized[1..]);
    let shared = first
        .iter()
        .enumerate()
        .take_while(|&(i, token)| rest.iter().all(|other| other.get(i) == Some(token)))
        .count();
    if shared > 0 {
        return first[..shared].join("-");
    }

    let preview = child_titles
        .iter()
        .take(3)
        .copied()
        .collect::<Vec<&str>>()
        .join("+");
    if child_titles.len() > 3 {
        format!("{}+…", preview)
    } else {
        preview
    }
}
/// Counts the members of `cluster` plus those of all clusters below it.
///
/// Child ids that do not match any entry of `all_clusters` contribute nothing.
fn count_total_descendants(cluster: &RawCluster, all_clusters: &[RawCluster]) -> usize {
    let nested: usize = cluster
        .children
        .iter()
        .filter_map(|child_id| all_clusters.iter().find(|c| c.id == *child_id))
        .map(|child| count_total_descendants(child, all_clusters))
        .sum();
    cluster.member_ids.len() + nested
}
/// Picks a configuration based on the number of files.
///
/// | files       | `min_community_size` | `hierarchical` |
/// |-------------|----------------------|----------------|
/// | 0 – 49      | 2                    | no             |
/// | 50 – 499    | 3                    | yes            |
/// | 500 – 1999  | 5                    | yes            |
/// | 2000 and up | 8                    | yes            |
///
/// All other fields keep their defaults.
pub fn auto_config(file_count: usize) -> ClusterConfig {
    let mut config = ClusterConfig::default();
    if file_count < 50 {
        config.min_community_size = 2;
        config.hierarchical = false;
    } else {
        config.hierarchical = true;
        config.min_community_size = if file_count < 500 {
            3
        } else if file_count < 2000 {
            5
        } else {
            8
        };
    }
    config
}
pub fn auto_config_with_network(file_count: usize, net: &Network) -> ClusterConfig {
let mut config = auto_config(file_count);
let num_nodes = net.num_nodes();
if num_nodes > 0 {
let avg_degree = net.num_edges() as f64 / num_nodes as f64;
config.teleportation_rate = if avg_degree < 3.0 {
0.01
} else if avg_degree <= 20.0 {
0.05
} else {
0.10
};
}
config
}
/// Finalizes a cluster result in place.
///
/// Makes duplicate component titles unique, warns on stderr when the largest
/// component holds more than half of the files (only for graphs with more
/// than 10 files), and stores the component count in the metrics.
/// `_config` is currently unused.
pub fn post_process(result: &mut ClusterResult, graph: &Graph, _config: &ClusterConfig) {
    deduplicate_names(&mut result.nodes);

    let mut file_count = 0usize;
    for n in &graph.nodes {
        let kind = n.node_type.as_deref();
        let is_file =
            kind == Some("file") || (kind == Some("code") && n.node_kind.as_deref() == Some("File"));
        if is_file {
            file_count += 1;
        }
    }

    // Largest "size" metadata value among the components (0 when none has one).
    let mut largest: u64 = 0;
    for n in &result.nodes {
        if let Some(size) = n.metadata.get("size").and_then(|v| v.as_u64()) {
            largest = largest.max(size);
        }
    }
    let max_size = largest as usize;

    if file_count > 10 && max_size > file_count / 2 {
        eprintln!(
            "⚠ Largest component has {} files ({}% of total) — may need manual review",
            max_size,
            max_size * 100 / file_count
        );
    }

    result.metrics.num_communities = result.nodes.len();
}
fn deduplicate_names(nodes: &mut [Node]) {
use std::collections::HashMap as DedupMap;
let mut title_groups: DedupMap<String, Vec<usize>> = DedupMap::new();
for (i, node) in nodes.iter().enumerate() {
title_groups.entry(node.title.clone()).or_default().push(i);
}
for (title, indices) in &title_groups {
if indices.len() <= 1 {
continue;
}
for &idx in indices {
let node = &nodes[idx];
if let Some(num_str) = node.id.strip_prefix("infer:component:") {
nodes[idx].title = format!("{}-{}", title, num_str);
}
}
}
}
pub fn identify_hubs(
graph: &Graph,
idx_to_id: &[String],
threshold: f64,
min_degree: usize,
) -> HashSet<usize> {
if threshold <= 0.0 || idx_to_id.is_empty() {
return HashSet::new();
}
let mut id_to_idx: HashMap<&str, usize> = HashMap::new();
for (idx, id) in idx_to_id.iter().enumerate() {
id_to_idx.insert(id.as_str(), idx);
}
let mut node_to_file_idx: HashMap<&str, usize> = HashMap::new();
for node in &graph.nodes {
let is_file = node.node_type.as_deref() == Some("file")
|| (node.node_type.as_deref() == Some("code")
&& node.node_kind.as_deref() == Some("File"));
if is_file {
continue;
}
if let Some(ref fp) = node.file_path {
let file_id = format!("file:{}", fp);
if let Some(&idx) = id_to_idx.get(file_id.as_str()) {
node_to_file_idx.insert(&node.id, idx);
continue;
}
}
if let Some(fp_val) = node.metadata.get("file_path") {
if let Some(fp) = fp_val.as_str() {
let file_id = format!("file:{}", fp);
if let Some(&idx) = id_to_idx.get(file_id.as_str()) {
node_to_file_idx.insert(&node.id, idx);
}
}
}
}
let mut in_degree: HashMap<usize, usize> = HashMap::new();
for edge in &graph.edges {
let is_import_like = matches!(
edge.relation.as_str(),
"imports" | "calls" | "uses" | "type_reference" | "depends_on"
);
if !is_import_like {
continue;
}
let from_idx = node_to_file_idx
.get(edge.from.as_str())
.or_else(|| id_to_idx.get(edge.from.as_str()))
.copied();
let to_idx = node_to_file_idx
.get(edge.to.as_str())
.or_else(|| id_to_idx.get(edge.to.as_str()))
.copied();
if let (Some(from), Some(to)) = (from_idx, to_idx) {
if from != to {
*in_degree.entry(to).or_insert(0) += 1;
}
}
}
let total_files = idx_to_id.len();
let cutoff = (threshold * total_files as f64).ceil().max(min_degree as f64) as usize;
let mut hub_indices: HashSet<usize> = HashSet::new();
let mut hub_info: Vec<(usize, &str, usize)> = Vec::new();
for (&idx, °ree) in &in_degree {
if degree > cutoff {
hub_indices.insert(idx);
hub_info.push((idx, &idx_to_id[idx], degree));
}
}
if !hub_indices.is_empty() {
hub_info.sort_by(|a, b| b.2.cmp(&a.2));
let top5: Vec<String> = hub_info
.iter()
.take(5)
.map(|(_, id, deg)| format!("{}({})", id, deg))
.collect();
eprintln!(
"🔌 Hub exclusion: {} files excluded (threshold: {:.0}%, cutoff: {}), top hubs: [{}]",
hub_indices.len(),
threshold * 100.0,
cutoff,
top5.join(", "),
);
}
hub_indices
}
/// Builds a copy of `net` without the nodes listed in `hub_indices`.
///
/// Surviving nodes are renumbered densely in their original order, and only
/// edges whose two endpoints both survive are copied (weights unchanged).
///
/// # Returns
/// `(new_network, new_idx_to_id, excluded_ids)` where `new_idx_to_id` is the
/// index table for the new network and `excluded_ids` holds the ids of the
/// removed hubs. Before/after node and edge counts are printed to stderr.
pub fn exclude_hubs_from_network(
    net: &Network,
    idx_to_id: &[String],
    hub_indices: &HashSet<usize>,
) -> (Network, Vec<String>, Vec<String>) {
    // remap[old] is Some(new) for surviving nodes and None for hubs.
    let mut remap: Vec<Option<usize>> = vec![None; idx_to_id.len()];
    let mut kept_ids: Vec<String> = Vec::new();
    let mut excluded_ids: Vec<String> = Vec::new();
    for (old_idx, id) in idx_to_id.iter().enumerate() {
        if hub_indices.contains(&old_idx) {
            excluded_ids.push(id.clone());
            continue;
        }
        remap[old_idx] = Some(kept_ids.len());
        kept_ids.push(id.clone());
    }

    let mut pruned = Network::new();
    for (new_idx, id) in kept_ids.iter().enumerate() {
        pruned.add_node_name(new_idx, id);
    }

    for (old_from, slot) in remap.iter().enumerate() {
        let new_from = match slot {
            Some(mapped) => *mapped,
            None => continue,
        };
        for &(old_to, weight) in net.out_neighbors(old_from) {
            // Hub targets (and any index outside the table) have no mapping.
            if let Some(new_to) = remap.get(old_to).copied().flatten() {
                pruned.add_edge(new_from, new_to, weight);
            }
        }
    }

    eprintln!(
        "🔌 Network after hub exclusion: {} nodes (was {}), {} edges (was {})",
        pruned.num_nodes(),
        net.num_nodes(),
        pruned.num_edges(),
        net.num_edges(),
    );
    (pruned, kept_ids, excluded_ids)
}
/// Creates the synthetic "infrastructure" component that owns all excluded hubs.
///
/// Returns the component node (id `infer:component:infrastructure`, type
/// `component`, source `infer`, metadata `flow = 0.0`, `size = number of hubs`,
/// `hub_excluded = true`) together with one `contains` edge from the component
/// to every id in `excluded_ids`. `_graph` is currently unused.
fn create_infra_component(
    excluded_ids: &[String],
    _graph: &Graph,
) -> (Node, Vec<Edge>) {
    let component_id = "infer:component:infrastructure";
    let title = "Infrastructure & Shared Utilities";
    let hub_total = excluded_ids.len();

    let mut node = Node::new(component_id, title);
    node.node_type = Some("component".into());
    node.source = Some("infer".into());
    let metadata_entries = vec![
        ("flow", serde_json::json!(0.0)),
        ("size", serde_json::json!(hub_total)),
        ("hub_excluded", serde_json::json!(true)),
    ];
    for (key, value) in metadata_entries {
        node.metadata.insert(key.into(), value);
    }

    let infer_meta = serde_json::json!({"source": "infer"});
    let mut edges: Vec<Edge> = Vec::with_capacity(hub_total);
    for member_id in excluded_ids {
        let mut membership = Edge::new(component_id, member_id, "contains");
        membership.metadata = Some(infer_meta.clone());
        edges.push(membership);
    }

    eprintln!(
        "🏗️ Infrastructure component: {} hub files → '{}'",
        hub_total,
        title,
    );
    (node, edges)
}
pub fn cluster(graph: &Graph, config: &ClusterConfig) -> Result<ClusterResult> {
let (net, idx_to_id) = build_network(graph);
let (mut net, idx_to_id, excluded_hub_ids) = if config.hub_exclusion_threshold > 0.0 {
let hub_indices = identify_hubs(graph, &idx_to_id, config.hub_exclusion_threshold, config.hub_min_degree);
if hub_indices.is_empty() {
(net, idx_to_id, Vec::new())
} else {
exclude_hubs_from_network(&net, &idx_to_id, &hub_indices)
}
} else {
(net, idx_to_id, Vec::new())
};
add_co_citation_edges(
&mut net,
graph,
&idx_to_id,
config.co_citation_weight,
config.co_citation_min_shared,
2.0, );
add_symbol_similarity_edges(
&mut net,
graph,
&idx_to_id,
config.symbol_similarity_weight,
config.symbol_min_shared_tokens,
config.symbol_min_jaccard,
);
add_dir_colocation_edges(&mut net, &idx_to_id, config.dir_colocation_weight);
if net.num_nodes() < 2 {
return Ok(ClusterResult::empty());
}
tracing::info!(
"Infomap input: {} nodes, {} edges (num_trials={})",
net.num_nodes(),
net.num_edges(),
config.num_trials,
);
let t0 = std::time::Instant::now();
let (clusters, metrics) = run_clustering(&net, &idx_to_id, config);
tracing::info!("Infomap completed in {:.1}s", t0.elapsed().as_secs_f64());
let total_files = idx_to_id.len();
let max_size = config.max_cluster_size.unwrap_or_else(|| {
let auto = ((total_files as f64).ln() * 6.0).ceil() as usize;
auto.clamp(15, 60)
});
let clusters = if config.hierarchical {
split_oversized_by_directory(clusters, max_size)
} else {
let after_infomap = split_mega_clusters(clusters, &net, &idx_to_id, config, max_size);
split_oversized_by_directory(after_infomap, max_size)
};
let mut result = map_to_components(&clusters, graph);
result.metrics = metrics;
if !excluded_hub_ids.is_empty() {
let (infra_node, infra_edges) = create_infra_component(&excluded_hub_ids, graph);
result.nodes.push(infra_node);
result.edges.extend(infra_edges);
result.metrics.num_communities += 1;
}
post_process(&mut result, graph, config);
Ok(result)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::graph::{Edge, Graph, Node};
// Test helper: a node with id `file:<path>`, title `<path>`, type "file" and
// `file_path` set to `<path>`.
fn make_file_node(path: &str) -> Node {
    let id = format!("file:{}", path);
    let mut node = Node::new(&id, path);
    node.file_path = Some(path.into());
    node.node_type = Some("file".into());
    node
}
// Test helper: a node of type "function" whose id doubles as its title and
// which is located in `file_path`.
fn make_fn_node(id: &str, file_path: &str) -> Node {
    let mut node = Node::new(id, id);
    node.file_path = Some(file_path.into());
    node.node_type = Some("function".into());
    node
}
fn default_config() -> ClusterConfig {
ClusterConfig {
seed: 42,
num_trials: 10,
min_community_size: 2,
..Default::default()
}
}
// Test helper: six file nodes forming two disjoint cliques (`src/auth/*` and
// `src/db/*`). Inside each clique every ordered pair is linked by a `calls`
// edge; there are no edges between the cliques.
fn two_community_graph() -> Graph {
    let group_a = ["src/auth/login.rs", "src/auth/logout.rs", "src/auth/session.rs"];
    let group_b = ["src/db/pool.rs", "src/db/query.rs", "src/db/migrate.rs"];
    let groups = [&group_a, &group_b];

    let mut graph = Graph::default();
    // All nodes first (auth, then db), then all edges in the same group order.
    for group in groups.iter() {
        for path in group.iter() {
            graph.nodes.push(make_file_node(path));
        }
    }
    for group in groups.iter() {
        for (i, from) in group.iter().enumerate() {
            for (j, to) in group.iter().enumerate() {
                if i == j {
                    continue;
                }
                graph.edges.push(Edge::new(
                    &format!("file:{}", from),
                    &format!("file:{}", to),
                    "calls",
                ));
            }
        }
    }
    graph
}
// Every known relation maps to its weight constant; unknown or empty
// relations map to 0.0.
#[test]
fn test_relation_weight() {
    let expectations = [
        ("calls", WEIGHT_CALLS),
        ("imports", WEIGHT_IMPORTS),
        ("type_reference", WEIGHT_TYPE_REF),
        ("inherits", WEIGHT_TYPE_REF),
        ("implements", WEIGHT_TYPE_REF),
        ("uses", WEIGHT_TYPE_REF),
        ("defined_in", WEIGHT_STRUCTURAL),
        ("contains", WEIGHT_STRUCTURAL),
        ("belongs_to", WEIGHT_STRUCTURAL),
        ("depends_on", WEIGHT_DEPENDS_ON),
        ("overrides", WEIGHT_TYPE_REF),
        ("tests_for", 0.3),
        ("foobar", 0.0),
        ("", 0.0),
    ];
    for &(relation, expected) in expectations.iter() {
        assert_eq!(relation_weight(relation), expected);
    }
}
// Three files chained a -calls-> b -imports-> c produce a 3-node, 2-edge network.
#[test]
fn test_build_network_basic() {
    let mut graph = Graph::default();
    for path in ["a.rs", "b.rs", "c.rs"].iter() {
        graph.nodes.push(make_file_node(path));
    }
    let links = [
        ("file:a.rs", "file:b.rs", "calls"),
        ("file:b.rs", "file:c.rs", "imports"),
    ];
    for &(from, to, relation) in links.iter() {
        graph.edges.push(Edge::new(from, to, relation));
    }

    let (net, idx_to_id) = build_network(&graph);
    assert_eq!(net.num_nodes(), 3);
    assert_eq!(idx_to_id.len(), 3);
    assert_eq!(net.num_edges(), 2);
}
// A `calls` edge must end up heavier in the network than an `imports` edge
// leaving the same source file.
#[test]
fn test_build_network_weight_differentiation() {
    let mut graph = Graph::default();
    for path in ["x.rs", "y.rs", "z.rs"].iter() {
        graph.nodes.push(make_file_node(path));
    }
    graph.edges.push(Edge::new("file:x.rs", "file:y.rs", "calls"));
    graph.edges.push(Edge::new("file:x.rs", "file:z.rs", "imports"));

    let (net, idx_to_id) = build_network(&graph);
    let index_of = |wanted: &str| idx_to_id.iter().position(|id| id == wanted).unwrap();
    let x = index_of("file:x.rs");
    let y = index_of("file:y.rs");
    let z = index_of("file:z.rs");

    let out = net.out_neighbors(x);
    let weight_to = |target: usize| {
        out.iter()
            .find(|&&(t, _)| t == target)
            .map(|&(_, w)| w)
            .unwrap()
    };
    let weight_xy = weight_to(y);
    let weight_xz = weight_to(z);
    assert!(
        weight_xy > weight_xz,
        "calls weight ({}) should be > imports weight ({})",
        weight_xy,
        weight_xz
    );
}
// Of one self-loop (a -> a) and one regular edge (a -> b), only one edge is
// expected in the built network.
#[test]
fn test_build_network_skips_self_loops() {
    let mut graph = Graph::default();
    for path in ["a.rs", "b.rs"].iter() {
        graph.nodes.push(make_file_node(path));
    }
    let self_loop = Edge::new("file:a.rs", "file:a.rs", "calls");
    let regular = Edge::new("file:a.rs", "file:b.rs", "calls");
    graph.edges.push(self_loop);
    graph.edges.push(regular);

    let (net, _) = build_network(&graph);
    assert_eq!(net.num_edges(), 1);
}
// A call between two function nodes must surface as a single edge between
// the files that contain them; function nodes themselves are not network nodes.
#[test]
fn test_build_network_maps_functions_to_files() {
    let mut graph = Graph::default();
    for path in ["src/main.rs", "src/lib.rs"].iter() {
        graph.nodes.push(make_file_node(path));
    }
    graph.nodes.push(make_fn_node("fn:do_stuff", "src/main.rs"));
    graph.nodes.push(make_fn_node("fn:helper", "src/lib.rs"));
    graph.edges.push(Edge::new("fn:do_stuff", "fn:helper", "calls"));

    let (net, idx_to_id) = build_network(&graph);
    assert_eq!(net.num_nodes(), 2);
    assert_eq!(idx_to_id.len(), 2);
    assert_eq!(net.num_edges(), 1);

    let index_of = |wanted: &str| idx_to_id.iter().position(|id| id == wanted).unwrap();
    let main_idx = index_of("file:src/main.rs");
    let lib_idx = index_of("file:src/lib.rs");
    let outgoing = net.out_neighbors(main_idx);
    assert_eq!(outgoing.len(), 1);
    assert_eq!(outgoing[0].0, lib_idx);
}
// Two disjoint cliques must be reported as exactly two communities and two
// component nodes.
#[test]
fn test_cluster_two_communities() {
    let result = cluster(&two_community_graph(), &default_config()).unwrap();
    let found = result.metrics.num_communities;
    assert_eq!(found, 2, "Expected 2 communities, got {}", found);
    assert_eq!(result.nodes.len(), 2);
}
// Four files linked by `calls` edges in every direction (a complete directed
// graph), clustered with min_community_size = 1, must yield one community.
#[test]
fn test_cluster_single_community() {
let mut g = Graph::default();
let files = ["a.rs", "b.rs", "c.rs", "d.rs"];
for f in &files {
g.nodes.push(make_file_node(f));
}
// Every ordered pair (i, j) with i != j gets a `calls` edge.
for i in 0..files.len() {
for j in 0..files.len() {
if i != j {
g.edges.push(Edge::new(
&format!("file:{}", files[i]),
&format!("file:{}", files[j]),
"calls",
));
}
}
}
let config = ClusterConfig {
seed: 42,
num_trials: 10,
min_community_size: 1,
..Default::default()
};
let result = cluster(&g, &config).unwrap();
assert_eq!(
result.metrics.num_communities, 1,
"Fully connected graph should yield 1 community, got {}",
result.metrics.num_communities
);
}
// Clustering an empty graph succeeds and yields no nodes, no edges and
// zeroed metrics.
#[test]
fn test_cluster_empty_graph() {
    let empty = Graph::default();
    let outcome = cluster(&empty, &default_config()).unwrap();
    assert!(outcome.nodes.is_empty());
    assert!(outcome.edges.is_empty());
    let metrics = &outcome.metrics;
    assert_eq!(metrics.codelength, 0.0);
    assert_eq!(metrics.num_communities, 0);
    assert_eq!(metrics.num_total, 0);
}
// A four-file clique under src/core plus one extra file (src/misc/loner.rs)
// that has a single `depends_on` edge into the clique. With
// min_community_size = 2 the result must still contain one `contains` edge
// per file, i.e. five in total.
#[test]
fn test_cluster_min_community_size() {
let mut g = Graph::default();
let core = ["src/core/a.rs", "src/core/b.rs", "src/core/c.rs", "src/core/d.rs"];
for f in &core {
g.nodes.push(make_file_node(f));
}
g.nodes.push(make_file_node("src/misc/loner.rs"));
// Complete directed `calls` graph over the core files.
for i in 0..core.len() {
for j in 0..core.len() {
if i != j {
g.edges.push(Edge::new(
&format!("file:{}", core[i]),
&format!("file:{}", core[j]),
"calls",
));
}
}
}
// The loner's only connection to the rest of the graph.
g.edges.push(Edge::new("file:src/misc/loner.rs", "file:src/core/a.rs", "depends_on"));
let config = ClusterConfig {
seed: 42,
num_trials: 10,
min_community_size: 2,
..Default::default()
};
let result = cluster(&g, &config).unwrap();
// Counts every `contains` edge in the result.
let total_members: usize = result
.edges
.iter()
.filter(|e| e.relation == "contains")
.count();
assert_eq!(total_members, 5, "All 5 nodes should be assigned to some cluster");
}
// In hierarchical mode the result must contain component -> component
// `contains` edges and more than two component nodes for the two-clique graph.
#[test]
fn test_cluster_hierarchical() {
    let mut config = ClusterConfig::default();
    config.seed = 42;
    config.num_trials = 10;
    config.min_community_size = 1;
    config.hierarchical = true;

    let result = cluster(&two_community_graph(), &config).unwrap();

    let component_ids: std::collections::HashSet<&str> =
        result.nodes.iter().map(|n| n.id.as_str()).collect();
    let is_component = |id: &str| component_ids.contains(id);
    let parent_child_total = result
        .edges
        .iter()
        .filter(|e| e.relation == "contains")
        .filter(|e| is_component(e.from.as_str()) && is_component(e.to.as_str()))
        .count();

    assert!(
        parent_child_total > 0,
        "Hierarchical clustering should produce parent→child component edges"
    );
    assert!(
        result.nodes.len() > 2,
        "Hierarchical mode should produce more than 2 component nodes, got {}",
        result.nodes.len()
    );
}
// Two files in src/auth are expected to be named "auth".
#[test]
fn test_auto_name_common_prefix() {
    let members = ["src/auth/login.rs", "src/auth/logout.rs"];
    assert_eq!(auto_name(&members), "auth");
}
// With two files in src/db and one in lib/utils, the name must be either
// "src" or "db".
#[test]
fn test_auto_name_mixed_dirs() {
    let paths = ["src/db/pool.rs", "src/db/query.rs", "lib/utils/helper.rs"];
    let name = auto_name(&paths);
    let acceptable = ["src", "db"];
    assert!(
        acceptable.iter().any(|candidate| name == *candidate),
        "Expected most frequent directory, got '{}'",
        name
    );
}
// Every component node must be typed "component", sourced from "infer" and
// carry both `flow` and `size` metadata.
#[test]
fn test_component_node_schema() {
    let result = cluster(&two_community_graph(), &default_config()).unwrap();
    assert!(!result.nodes.is_empty(), "Should have component nodes");

    for component in result.nodes.iter() {
        assert_eq!(
            component.node_type.as_deref(),
            Some("component"),
            "Component node should have node_type='component'"
        );
        assert_eq!(
            component.source.as_deref(),
            Some("infer"),
            "Component node should have source='infer'"
        );
        for key in ["flow", "size"].iter() {
            assert!(
                component.metadata.contains_key(*key),
                "Component node metadata should contain '{}'",
                key
            );
        }
    }
}
// `contains` edges must point from a component to its member: whenever the
// target is a file the source must be a component, and a file must never be
// the source of a `contains` edge.
#[test]
fn test_contains_edge_direction() {
    let graph = two_community_graph();
    let result = cluster(&graph, &default_config()).unwrap();

    let component_ids: std::collections::HashSet<&str> =
        result.nodes.iter().map(|n| n.id.as_str()).collect();
    let file_ids: std::collections::HashSet<&str> = graph
        .nodes
        .iter()
        .filter(|n| n.node_type.as_deref() == Some("file"))
        .map(|n| n.id.as_str())
        .collect();
    let contains_edges: Vec<&Edge> = result
        .edges
        .iter()
        .filter(|e| e.relation == "contains")
        .collect();

    // Pass 1: edges that target a file.
    for edge in contains_edges
        .iter()
        .filter(|e| file_ids.contains(e.to.as_str()))
    {
        assert!(
            component_ids.contains(edge.from.as_str()),
            "'contains' edge 'from' ({}) should be a component node",
            edge.from
        );
        assert!(
            !component_ids.contains(edge.to.as_str()),
            "'contains' edge 'to' ({}) should NOT be a component node (it should be a file)",
            edge.to
        );
    }

    // Pass 2: no `contains` edge may originate from a file.
    for edge in contains_edges.iter() {
        assert!(
            !file_ids.contains(edge.from.as_str()),
            "'contains' edge should not have a file as 'from': {} → {}",
            edge.from,
            edge.to
        );
    }
}
// For the two-clique graph the metrics must report a positive codelength,
// two communities and six nodes in total.
#[test]
fn test_metrics_output() {
    let result = cluster(&two_community_graph(), &default_config()).unwrap();
    let metrics = &result.metrics;
    assert!(
        metrics.codelength > 0.0,
        "Codelength should be > 0, got {}",
        metrics.codelength
    );
    assert_eq!(
        metrics.num_communities, 2,
        "num_communities should be 2, got {}",
        metrics.num_communities
    );
    assert_eq!(
        metrics.num_total, 6,
        "num_total should be 6 (all file nodes), got {}",
        metrics.num_total
    );
}
// Two runs with the same seed must agree on node count, codelength and the
// full set of `contains` (component, member) pairs.
#[test]
fn test_deterministic_with_seed() {
    let graph = two_community_graph();
    let mut config = ClusterConfig::default();
    config.seed = 123;
    config.num_trials = 10;
    config.min_community_size = 2;

    let first = cluster(&graph, &config).unwrap();
    let second = cluster(&graph, &config).unwrap();

    assert_eq!(first.nodes.len(), second.nodes.len());
    assert!(
        (first.metrics.codelength - second.metrics.codelength).abs() < f64::EPSILON,
        "Codelength should be identical: {} vs {}",
        first.metrics.codelength,
        second.metrics.codelength
    );

    // Sorted (from, to) pairs of all `contains` edges of a run.
    let membership = |run: &ClusterResult| {
        let mut pairs: Vec<(String, String)> = run
            .edges
            .iter()
            .filter(|e| e.relation == "contains")
            .map(|e| (e.from.clone(), e.to.clone()))
            .collect();
        pairs.sort();
        pairs
    };
    assert_eq!(
        membership(&first),
        membership(&second),
        "Deterministic: same seed should produce identical clustering"
    );
}
// A three-file clique plus src/utils/types.rs, which has only incoming
// `imports` edges (from a.rs and b.rs) and no outgoing edges. The test
// expects all four files to be assigned and no component smaller than 2,
// i.e. the file must be merged via its incoming neighbours.
#[test]
fn test_orphan_reassignment_uses_in_neighbors() {
let mut g = Graph::default();
let cluster_a = ["src/core/a.rs", "src/core/b.rs", "src/core/c.rs"];
for p in &cluster_a {
g.nodes.push(make_file_node(p));
}
// Complete directed `calls` graph over the core files.
for i in 0..cluster_a.len() {
for j in 0..cluster_a.len() {
if i != j {
g.edges.push(Edge::new(
&format!("file:{}", cluster_a[i]),
&format!("file:{}", cluster_a[j]),
"calls",
));
}
}
}
// types.rs is only ever a target, never a source.
g.nodes.push(make_file_node("src/utils/types.rs"));
g.edges.push(Edge::new("file:src/core/a.rs", "file:src/utils/types.rs", "imports"));
g.edges.push(Edge::new("file:src/core/b.rs", "file:src/utils/types.rs", "imports"));
let config = ClusterConfig {
seed: 42,
num_trials: 10,
min_community_size: 2,
..Default::default()
};
let result = cluster(&g, &config).unwrap();
// Count `contains` edges, skipping any whose source id starts with "file:"
// or whose target id starts with "infer:".
let total_members: usize = result
.edges
.iter()
.filter(|e| e.relation == "contains")
.filter(|e| !e.from.starts_with("file:") && !e.to.starts_with("infer:"))
.count();
assert_eq!(total_members, 4, "All 4 nodes should be assigned");
for node in &result.nodes {
// A missing or non-integer `size` is treated as 0 and fails the check.
let size = node.metadata.get("size").and_then(|v| v.as_u64()).unwrap_or(0);
assert!(
size >= 2,
"Component '{}' has size {} — orphan with incoming edges should have merged",
node.title, size
);
}
}
#[test]
fn test_orphan_reassignment_aggregate_weight() {
let mut g = Graph::default();
let cluster_a = ["src/web/handler.rs", "src/web/router.rs", "src/web/middleware.rs"];
for p in &cluster_a {
g.nodes.push(make_file_node(p));
}
for i in 0..cluster_a.len() {
for j in 0..cluster_a.len() {
if i != j {
g.edges.push(Edge::new(
&format!("file:{}", cluster_a[i]),
&format!("file:{}", cluster_a[j]),
"calls",
));
}
}
}
let cluster_b = ["src/db/pool.rs", "src/db/query.rs", "src/db/migrate.rs"];
for p in &cluster_b {
g.nodes.push(make_file_node(p));
}
for i in 0..cluster_b.len() {
for j in 0..cluster_b.len() {
if i != j {
g.edges.push(Edge::new(
&format!("file:{}", cluster_b[i]),
&format!("file:{}", cluster_b[j]),
"calls",
));
}
}
}
g.nodes.push(make_file_node("src/shared/config.rs"));
g.edges.push(Edge::new("file:src/shared/config.rs", "file:src/web/handler.rs", "depends_on"));
g.edges.push(Edge::new("file:src/shared/config.rs", "file:src/web/router.rs", "depends_on"));
g.edges.push(Edge::new("file:src/shared/config.rs", "file:src/web/middleware.rs", "depends_on"));
g.edges.push(Edge::new("file:src/shared/config.rs", "file:src/db/pool.rs", "calls"));
let config = ClusterConfig {
seed: 42,
num_trials: 10,
min_community_size: 2,
..Default::default()
};
let result = cluster(&g, &config).unwrap();
let orphan_id = "file:src/shared/config.rs";
let orphan_cluster = result
.edges
.iter()
.find(|e| e.relation == "contains" && e.to == orphan_id)
.map(|e| e.from.clone());
assert!(orphan_cluster.is_some(), "Orphan should be assigned to a cluster");
let web_cluster = result
.edges
.iter()
.find(|e| e.relation == "contains" && e.to == "file:src/web/handler.rs")
.map(|e| e.from.clone());
assert_eq!(
orphan_cluster, web_cluster,
"Orphan should be in cluster A (aggregate depends_on 1.2 > calls 1.0)"
);
}
// A three-file clique under src/core plus three files under src/config that
// have no edges at all. The test expects the three config files to share a
// single parent component and no component to have fewer than 2 members.
#[test]
fn test_orphan_directory_fallback() {
let mut g = Graph::default();
let cluster_a = ["src/core/a.rs", "src/core/b.rs", "src/core/c.rs"];
for p in &cluster_a {
g.nodes.push(make_file_node(p));
}
// Complete directed `calls` graph over the core files.
for i in 0..cluster_a.len() {
for j in 0..cluster_a.len() {
if i != j {
g.edges.push(Edge::new(
&format!("file:{}", cluster_a[i]),
&format!("file:{}", cluster_a[j]),
"calls",
));
}
}
}
// Edge-less files that only share a directory.
g.nodes.push(make_file_node("src/config/base.rs"));
g.nodes.push(make_file_node("src/config/env.rs"));
g.nodes.push(make_file_node("src/config/defaults.rs"));
let config = ClusterConfig {
seed: 42,
num_trials: 10,
min_community_size: 2,
..Default::default()
};
let result = cluster(&g, &config).unwrap();
let config_files = ["file:src/config/base.rs", "file:src/config/env.rs", "file:src/config/defaults.rs"];
// Distinct components that contain at least one of the config files.
let config_clusters: std::collections::HashSet<&str> = result
.edges
.iter()
.filter(|e| e.relation == "contains" && config_files.contains(&e.to.as_str()))
.map(|e| e.from.as_str())
.collect();
assert_eq!(
config_clusters.len(), 1,
"All 3 config files should be in ONE directory-based cluster, not {} clusters",
config_clusters.len()
);
for node in &result.nodes {
// A missing or non-integer `size` is treated as 0 and fails the check.
let size = node.metadata.get("size").and_then(|v| v.as_u64()).unwrap_or(0);
assert!(
size >= 2,
"Component '{}' has size {} — should not have singletons",
node.title, size
);
}
}
// A three-file clique under src/lib plus a two-step chain hanging off it:
// adapter.rs -calls-> bridge.rs -imports-> x.rs. adapter.rs has no direct
// link to the clique. The test expects all five files to be assigned and
// both chain files to have a parent component.
#[test]
fn test_orphan_iterative_propagation() {
let mut g = Graph::default();
let cluster_a = ["src/lib/x.rs", "src/lib/y.rs", "src/lib/z.rs"];
for p in &cluster_a {
g.nodes.push(make_file_node(p));
}
// Complete directed `calls` graph over the lib files.
for i in 0..cluster_a.len() {
for j in 0..cluster_a.len() {
if i != j {
g.edges.push(Edge::new(
&format!("file:{}", cluster_a[i]),
&format!("file:{}", cluster_a[j]),
"calls",
));
}
}
}
// bridge.rs touches the clique directly; adapter.rs only touches bridge.rs.
g.nodes.push(make_file_node("src/ext/bridge.rs"));
g.edges.push(Edge::new("file:src/ext/bridge.rs", "file:src/lib/x.rs", "imports"));
g.nodes.push(make_file_node("src/ext/adapter.rs"));
g.edges.push(Edge::new("file:src/ext/adapter.rs", "file:src/ext/bridge.rs", "calls"));
let config = ClusterConfig {
seed: 42,
num_trials: 10,
min_community_size: 2,
..Default::default()
};
let result = cluster(&g, &config).unwrap();
// Count `contains` edges, skipping any whose source id starts with "file:"
// or whose target id starts with "infer:".
let total_members: usize = result
.edges
.iter()
.filter(|e| e.relation == "contains")
.filter(|e| !e.from.starts_with("file:") && !e.to.starts_with("infer:"))
.count();
assert_eq!(total_members, 5, "All 5 nodes should be assigned to clusters");
let bridge_cluster = result
.edges
.iter()
.find(|e| e.relation == "contains" && e.to == "file:src/ext/bridge.rs")
.map(|e| e.from.clone());
let adapter_cluster = result
.edges
.iter()
.find(|e| e.relation == "contains" && e.to == "file:src/ext/adapter.rs")
.map(|e| e.from.clone());
// Only presence is checked; the two files need not share a component.
assert!(bridge_cluster.is_some(), "bridge.rs should be assigned");
assert!(adapter_cluster.is_some(), "adapter.rs should be assigned");
}
// The parent directory is derived with or without the `file:` prefix;
// a file with no directory component maps to "root".
#[test]
fn test_extract_parent_dir() {
    let cases = [
        ("file:src/auth/login.rs", "src/auth"),
        ("file:src/main.rs", "src"),
        ("file:lib.rs", "root"),
        ("src/utils/helper.rs", "src/utils"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(extract_parent_dir(input), expected);
    }
}
// The default configuration uses a teleportation rate of 0.05 and 10 trials.
#[test]
fn test_default_teleportation_rate() {
    let defaults = ClusterConfig::default();
    let rate = defaults.teleportation_rate;
    assert!(
        (rate - 0.05).abs() < f64::EPSILON,
        "Default teleportation rate should be 0.05, got {}",
        rate
    );
    assert_eq!(defaults.num_trials, 10, "Default num_trials should be 10");
}
// Ten nodes joined by five disjoint edges (0-1, 2-3, ..., 8-9): the
// edges-per-node ratio is below 3, so the rate must be 0.01.
#[test]
fn test_auto_config_with_network_sparse() {
    let mut net = Network::new();
    for i in 0..10 {
        net.add_node_name(i, &format!("node_{}", i));
    }
    for from in (0..10).step_by(2) {
        net.add_edge(from, from + 1, 1.0);
    }

    let rate = auto_config_with_network(10, &net).teleportation_rate;
    assert!(
        (rate - 0.01).abs() < f64::EPSILON,
        "Sparse graph (avg degree < 3) should get τ=0.01, got {}",
        rate
    );
}
// Five nodes with an edge for every ordered pair: the test expects the
// mid-range teleportation rate of 0.05.
#[test]
fn test_auto_config_with_network_dense() {
    let mut net = Network::new();
    for i in 0..5 {
        net.add_node_name(i, &format!("node_{}", i));
    }
    for from in 0..5 {
        for to in (0..5).filter(|&to| to != from) {
            net.add_edge(from, to, 1.0);
        }
    }

    let rate = auto_config_with_network(5, &net).teleportation_rate;
    assert!(
        (rate - 0.05).abs() < f64::EPSILON,
        "Normal density graph (avg degree 3-20) should get τ=0.05, got {}",
        rate
    );
}
// Three nodes whose triangle 0 -> 1 -> 2 -> 0 is inserted 30 times: the test
// expects the high teleportation rate of 0.10.
// NOTE(review): this relies on repeated `add_edge` calls each raising
// `num_edges()` — confirm against the Network implementation.
#[test]
fn test_auto_config_with_network_very_dense() {
    let mut net = Network::new();
    for i in 0..3 {
        net.add_node_name(i, &format!("node_{}", i));
    }
    let triangle = [(0, 1), (1, 2), (2, 0)];
    for _ in 0..30 {
        for &(from, to) in triangle.iter() {
            net.add_edge(from, to, 1.0);
        }
    }

    let rate = auto_config_with_network(3, &net).teleportation_rate;
    assert!(
        (rate - 0.10).abs() < f64::EPSILON,
        "Very dense graph (avg degree > 20) should get τ=0.10, got {}",
        rate
    );
}
// Three edge-less files in src/models and one in src/utils: co-location must
// add six directed edges in total.
#[test]
fn test_dir_colocation_edges_basic() {
    let mut graph = Graph::default();
    let paths = [
        "src/models/user.rs",
        "src/models/post.rs",
        "src/models/comment.rs",
        "src/utils/helper.rs",
    ];
    for path in paths.iter() {
        graph.nodes.push(make_file_node(path));
    }

    let (mut net, idx_to_id) = build_network(&graph);
    assert_eq!(net.num_edges(), 0, "Should have no edges before co-location");

    add_dir_colocation_edges(&mut net, &idx_to_id, 0.3);
    assert_eq!(
        net.num_edges(), 6,
        "3 files in same dir should produce 6 directed co-location edges"
    );
}
// A three-file clique under src/core plus three edge-less files under
// src/config. After the full pipeline no component may have fewer than two
// members; the assertion message attributes this to co-location edges.
#[test]
fn test_dir_colocation_improves_clustering() {
let mut g = Graph::default();
let cluster_a = ["src/core/a.rs", "src/core/b.rs", "src/core/c.rs"];
for p in &cluster_a {
g.nodes.push(make_file_node(p));
}
// Complete directed `calls` graph over the core files.
for i in 0..cluster_a.len() {
for j in 0..cluster_a.len() {
if i != j {
g.edges.push(Edge::new(
&format!("file:{}", cluster_a[i]),
&format!("file:{}", cluster_a[j]),
"calls",
));
}
}
}
// Files with no edges; they only share a directory.
g.nodes.push(make_file_node("src/config/base.rs"));
g.nodes.push(make_file_node("src/config/env.rs"));
g.nodes.push(make_file_node("src/config/defaults.rs"));
let config = ClusterConfig {
seed: 42,
num_trials: 10,
min_community_size: 2,
..Default::default()
};
let result = cluster(&g, &config).unwrap();
for node in &result.nodes {
// A missing or non-integer `size` is treated as 0 and fails the check.
let size = node.metadata.get("size").and_then(|v| v.as_u64()).unwrap_or(0);
assert!(
size >= 2,
"Component '{}' has size {} — co-location should prevent singletons",
node.title, size
);
}
}
// A co-location weight of 0.0 must not add any edges, even for files that
// share a directory.
#[test]
fn test_dir_colocation_disabled() {
    let mut graph = Graph::default();
    for path in ["src/models/a.rs", "src/models/b.rs"].iter() {
        graph.nodes.push(make_file_node(path));
    }

    let (mut net, idx_to_id) = build_network(&graph);
    assert_eq!(net.num_edges(), 0);

    add_dir_colocation_edges(&mut net, &idx_to_id, 0.0);
    assert_eq!(net.num_edges(), 0, "Weight 0.0 should add no edges");
}
// Diagnostic test for a "sink" file: src/utils/errors.ts is imported by seven
// files spread over three groups (2 auth, 4 commands, 1 ui) and imports
// nothing itself. The test builds the network by hand (build_network +
// co-location edges only), runs the clustering step, prints the orphan
// metrics and asserts that errors.ts ends up in a cluster with more than one
// member. Whether it joined the commands cluster is printed, not asserted.
#[test]
fn test_sink_file_diagnostics() {
let mut g = Graph::default();
for name in &["auth/login.ts", "auth/register.ts", "auth/session.ts"] {
g.nodes.push(make_file_node(&format!("src/{}", name)));
}
for name in &[
"commands/run.ts",
"commands/build.ts",
"commands/test.ts",
"commands/deploy.ts",
] {
g.nodes.push(make_file_node(&format!("src/{}", name)));
}
for name in &["ui/render.ts", "ui/layout.ts", "ui/theme.ts"] {
g.nodes.push(make_file_node(&format!("src/{}", name)));
}
g.nodes.push(make_file_node("src/utils/errors.ts"));
// Within each group only pairs with i < j are linked, so every pair gets a
// single `imports` edge from the earlier to the later file.
let cluster_a = vec!["src/auth/login.ts", "src/auth/register.ts", "src/auth/session.ts"];
for i in 0..cluster_a.len() {
for j in (i + 1)..cluster_a.len() {
g.edges.push(Edge::new(
&format!("file:{}", cluster_a[i]),
&format!("file:{}", cluster_a[j]),
"imports",
));
}
}
let cluster_b = vec![
"src/commands/run.ts",
"src/commands/build.ts",
"src/commands/test.ts",
"src/commands/deploy.ts",
];
for i in 0..cluster_b.len() {
for j in (i + 1)..cluster_b.len() {
g.edges.push(Edge::new(
&format!("file:{}", cluster_b[i]),
&format!("file:{}", cluster_b[j]),
"imports",
));
}
}
let cluster_c = vec!["src/ui/render.ts", "src/ui/layout.ts", "src/ui/theme.ts"];
for i in 0..cluster_c.len() {
for j in (i + 1)..cluster_c.len() {
g.edges.push(Edge::new(
&format!("file:{}", cluster_c[i]),
&format!("file:{}", cluster_c[j]),
"imports",
));
}
}
// Importers of errors.ts: two auth files ...
for src in &["src/auth/login.ts", "src/auth/register.ts"] {
g.edges.push(Edge::new(
&format!("file:{}", src),
"file:src/utils/errors.ts",
"imports",
));
}
// ... all four commands files ...
for src in &[
"src/commands/run.ts",
"src/commands/build.ts",
"src/commands/test.ts",
"src/commands/deploy.ts",
] {
g.edges.push(Edge::new(
&format!("file:{}", src),
"file:src/utils/errors.ts",
"imports",
));
}
// ... and one ui file.
g.edges.push(Edge::new(
"file:src/ui/render.ts",
"file:src/utils/errors.ts",
"imports",
));
let config = ClusterConfig {
min_community_size: 2,
..Default::default()
};
// Only co-location edges are added here; the co-citation and symbol
// similarity steps of `cluster` are not applied in this test.
let (mut net, idx_to_id) = build_network(&g);
add_dir_colocation_edges(&mut net, &idx_to_id, config.dir_colocation_weight);
let (clusters, metrics) = run_clustering(&net, &idx_to_id, &config);
eprintln!("=== Sink File Diagnostic Test ===");
eprintln!("Total nodes: {}", metrics.num_total);
eprintln!("Infomap communities: {}", metrics.num_communities);
eprintln!("Orphans (raw): {}", metrics.orphan_count_raw);
eprintln!("Merged by affinity: {}", metrics.orphans_merged_by_affinity);
eprintln!("Assigned by dir: {}", metrics.orphans_assigned_by_dir);
eprintln!("Singleton clusters (final): {}", metrics.singleton_clusters_final);
eprintln!("Total clusters: {}", clusters.len());
for (i, c) in clusters.iter().enumerate() {
eprintln!(" Cluster {}: {} members: {:?}", i, c.member_ids.len(), c.member_ids);
}
let errors_cluster = clusters
.iter()
.find(|c| c.member_ids.iter().any(|m| m.contains("errors.ts")));
assert!(
errors_cluster.is_some(),
"errors.ts should be in a cluster"
);
let errors_cluster = errors_cluster.unwrap();
assert!(
errors_cluster.member_ids.len() > 1,
"errors.ts should be merged with its importers, not be a singleton. Cluster: {:?}",
errors_cluster.member_ids
);
// Informational only: reported on stderr, never asserted.
let joined_commands = errors_cluster
.member_ids
.iter()
.any(|m| m.contains("commands"));
eprintln!(
"errors.ts joined commands cluster: {} (expected: true, since 4/7 importers are commands)",
joined_commands
);
}
// Diagnostic test for files without any edges: a three-file `imports` chain
// under src/core plus five edge-less files (three in src/tools, one each in
// src/fixtures and src/generated). The clustering step is run on the network
// with co-location edges and its metrics are printed to stderr.
// This test contains no assertions; it only fails if the pipeline panics.
#[test]
fn test_truly_isolated_files() {
let mut g = Graph::default();
for name in &["core/engine.ts", "core/parser.ts", "core/lexer.ts"] {
g.nodes.push(make_file_node(&format!("src/{}", name)));
}
g.edges.push(Edge::new("file:src/core/engine.ts", "file:src/core/parser.ts", "imports"));
g.edges.push(Edge::new("file:src/core/engine.ts", "file:src/core/lexer.ts", "imports"));
g.edges.push(Edge::new("file:src/core/parser.ts", "file:src/core/lexer.ts", "imports"));
// Files with no edges: three share src/tools, the other two are alone in
// their directories.
g.nodes.push(make_file_node("src/tools/bash.ts"));
g.nodes.push(make_file_node("src/tools/glob.ts"));
g.nodes.push(make_file_node("src/tools/grep.ts"));
g.nodes.push(make_file_node("src/fixtures/sample.ts"));
g.nodes.push(make_file_node("src/generated/schema.ts"));
let config = ClusterConfig {
min_community_size: 2,
..Default::default()
};
let (mut net, idx_to_id) = build_network(&g);
add_dir_colocation_edges(&mut net, &idx_to_id, config.dir_colocation_weight);
let (clusters, metrics) = run_clustering(&net, &idx_to_id, &config);
eprintln!("=== Truly Isolated Files Test ===");
eprintln!("Total nodes: {}", metrics.num_total);
eprintln!("Infomap communities: {}", metrics.num_communities);
eprintln!("Orphans (raw): {}", metrics.orphan_count_raw);
eprintln!("Merged by affinity: {}", metrics.orphans_merged_by_affinity);
eprintln!("Assigned by dir: {}", metrics.orphans_assigned_by_dir);
eprintln!("Singleton clusters (final): {}", metrics.singleton_clusters_final);
eprintln!("Total clusters: {}", clusters.len());
for (i, c) in clusters.iter().enumerate() {
eprintln!(" Cluster {}: {} members: {:?}", i, c.member_ids.len(), c.member_ids);
}
// First cluster that has any member whose id contains "tools/".
let tools_cluster = clusters
.iter()
.find(|c| c.member_ids.iter().any(|m| m.contains("tools/")));
if let Some(tc) = tools_cluster {
eprintln!("Tools cluster size: {} (expected: 3 from dir colocation)", tc.member_ids.len());
}
let singleton_count = metrics.singleton_clusters_final;
eprintln!("Singletons: {} (these are truly isolated files)", singleton_count);
}
/// Splitting must not lose files from a fully connected 30-file graph.
///
/// Every file `calls` every other file. After clustering, the clusters are passed to
/// `split_mega_clusters_recursive` with a size cap derived from the config (never
/// below 20) and a maximum depth of 3. The only assertion is that the member counts
/// of the resulting clusters sum to the original file count.
#[test]
fn test_split_mega_cluster() {
    let file_count = 30;
    let files: Vec<String> = (0..file_count)
        .map(|i| format!("src/big/file{}.rs", i))
        .collect();

    let mut g = Graph::default();
    for path in &files {
        g.nodes.push(make_file_node(path));
    }
    // Complete directed graph: one `calls` edge for every ordered pair of distinct files.
    for (i, src) in files.iter().enumerate() {
        for (j, dst) in files.iter().enumerate() {
            if i == j {
                continue;
            }
            g.edges.push(Edge::new(
                &format!("file:{}", src),
                &format!("file:{}", dst),
                "calls",
            ));
        }
    }

    let config = ClusterConfig {
        seed: 42,
        num_trials: 10,
        min_community_size: 1,
        max_cluster_size: Some(10),
        ..Default::default()
    };
    let (mut net, idx_to_id) = build_network(&g);
    add_dir_colocation_edges(&mut net, &idx_to_id, config.dir_colocation_weight);
    let (clusters, _metrics) = run_clustering(&net, &idx_to_id, &config);

    // Counted before `clusters` is moved into the splitter; used for the log line only.
    let big_before = clusters.iter().filter(|c| c.member_ids.len() > 10).count();

    // Configured cap, or a fallback derived from the file total, floored at 20.
    let requested_cap = match config.max_cluster_size {
        Some(cap) => cap,
        None => idx_to_id.len().max(100) / 5,
    };
    let max_size = requested_cap.max(20);

    let split_clusters =
        split_mega_clusters_recursive(clusters, &net, &idx_to_id, &config, max_size, 0, 3);

    let total_members: usize = split_clusters.iter().map(|c| c.member_ids.len()).sum();
    assert_eq!(
        total_members, file_count,
        "All {} files should be present after split, got {}",
        file_count, total_members
    );
    eprintln!(
        "test_split_mega_cluster: big_before={}, clusters_after={}, max_size={}",
        big_before,
        split_clusters.len(),
        max_size
    );
}
/// Clusters that already fit the size cap must pass through the splitter unchanged.
///
/// Two groups of five files (`src/alpha`, `src/beta`) are each fully connected by
/// `calls` edges, with no edges between the groups. With a cap of 10 the number of
/// clusters after `split_mega_clusters_recursive` must equal the number before.
#[test]
fn test_split_preserves_small_clusters() {
    let make_group = |dir: &str, prefix: &str| -> Vec<String> {
        (0..5)
            .map(|i| format!("src/{}/{}{}.rs", dir, prefix, i))
            .collect()
    };
    let group_a = make_group("alpha", "a");
    let group_b = make_group("beta", "b");

    let mut g = Graph::default();
    for path in group_a.iter().chain(group_b.iter()) {
        g.nodes.push(make_file_node(path));
    }
    // Fully connect each group internally; alpha's edges are added before beta's.
    for group in [&group_a, &group_b].iter() {
        for (i, src) in group.iter().enumerate() {
            for (j, dst) in group.iter().enumerate() {
                if i == j {
                    continue;
                }
                g.edges.push(Edge::new(
                    &format!("file:{}", src),
                    &format!("file:{}", dst),
                    "calls",
                ));
            }
        }
    }

    let config = ClusterConfig {
        seed: 42,
        num_trials: 10,
        min_community_size: 1,
        max_cluster_size: Some(10),
        ..Default::default()
    };
    let (mut net, idx_to_id) = build_network(&g);
    add_dir_colocation_edges(&mut net, &idx_to_id, config.dir_colocation_weight);
    let (clusters, _metrics) = run_clustering(&net, &idx_to_id, &config);

    let clusters_before = clusters.len();
    let split_clusters =
        split_mega_clusters_recursive(clusters, &net, &idx_to_id, &config, 10, 0, 3);
    let clusters_after = split_clusters.len();

    assert_eq!(
        clusters_after, clusters_before,
        "Small clusters should not be split: before={}, after={}",
        clusters_before, clusters_after
    );
}
/// Splitting a fully connected 15-file graph must keep every file.
///
/// All files `call` each other, the cap passed to `split_mega_clusters_recursive`
/// is 10 and the maximum depth is 3. The test asserts only that the member counts
/// of the returned clusters sum to 15, then prints the resulting cluster sizes.
#[test]
fn test_split_stops_on_monolith() {
    let file_count = 15;
    let files: Vec<String> = (0..file_count)
        .map(|i| format!("src/mono/m{}.rs", i))
        .collect();

    let mut g = Graph::default();
    for path in &files {
        g.nodes.push(make_file_node(path));
    }
    // One `calls` edge for every ordered pair of distinct files.
    for (i, src) in files.iter().enumerate() {
        for (j, dst) in files.iter().enumerate() {
            if i == j {
                continue;
            }
            g.edges.push(Edge::new(
                &format!("file:{}", src),
                &format!("file:{}", dst),
                "calls",
            ));
        }
    }

    let config = ClusterConfig {
        seed: 42,
        num_trials: 10,
        min_community_size: 1,
        max_cluster_size: Some(10),
        ..Default::default()
    };
    let (mut net, idx_to_id) = build_network(&g);
    add_dir_colocation_edges(&mut net, &idx_to_id, config.dir_colocation_weight);
    let (clusters, _metrics) = run_clustering(&net, &idx_to_id, &config);
    let split_clusters =
        split_mega_clusters_recursive(clusters, &net, &idx_to_id, &config, 10, 0, 3);

    let sizes: Vec<usize> = split_clusters.iter().map(|c| c.member_ids.len()).collect();
    let total_members: usize = sizes.iter().sum();
    assert_eq!(
        total_members, file_count,
        "All {} files should be present, got {}",
        file_count, total_members
    );
    eprintln!(
        "test_split_stops_on_monolith: {} clusters, sizes: {:?}",
        split_clusters.len(),
        sizes
    );
}
/// With `max_depth = 0` the splitter must hand the clusters back unchanged.
///
/// A fully connected 30-file graph is clustered, then passed to
/// `split_mega_clusters_recursive` with a cap of 10, a current depth of 0 and a
/// maximum depth of 0. The per-cluster member counts before and after must be
/// identical, in the same order.
#[test]
fn test_split_max_depth() {
    let file_count = 30;
    let files: Vec<String> = (0..file_count)
        .map(|i| format!("src/deep/d{}.rs", i))
        .collect();

    let mut g = Graph::default();
    for path in &files {
        g.nodes.push(make_file_node(path));
    }
    // One `calls` edge for every ordered pair of distinct files.
    for (i, src) in files.iter().enumerate() {
        for (j, dst) in files.iter().enumerate() {
            if i != j {
                g.edges.push(Edge::new(
                    &format!("file:{}", src),
                    &format!("file:{}", dst),
                    "calls",
                ));
            }
        }
    }

    let config = ClusterConfig {
        seed: 42,
        num_trials: 10,
        min_community_size: 1,
        max_cluster_size: Some(10),
        ..Default::default()
    };
    let (mut net, idx_to_id) = build_network(&g);
    add_dir_colocation_edges(&mut net, &idx_to_id, config.dir_colocation_weight);
    let (clusters, _metrics) = run_clustering(&net, &idx_to_id, &config);

    // Record the sizes first so the cluster list can be moved into the splitter
    // without deep-cloning every cluster.
    let original_sizes: Vec<usize> = clusters.iter().map(|c| c.member_ids.len()).collect();

    let no_split = split_mega_clusters_recursive(clusters, &net, &idx_to_id, &config, 10, 0, 0);
    let no_split_sizes: Vec<usize> = no_split.iter().map(|c| c.member_ids.len()).collect();

    assert_eq!(
        original_sizes, no_split_sizes,
        "max_depth=0 should produce identical clusters: original={:?}, got={:?}",
        original_sizes, no_split_sizes
    );
}
/// Pins the `hierarchical` flag and `min_community_size` that `auto_config`
/// returns for 49, 50, 500 and 2000 as input.
#[test]
fn test_auto_config_hierarchical_threshold() {
    // (input, expected `hierarchical`, expected `min_community_size`)
    let expectations = [
        (49, false, 2),
        (50, true, 3),
        (500, true, 5),
        (2000, true, 8),
    ];

    for &(input, want_hierarchical, want_min_size) in expectations.iter() {
        let config = auto_config(input);
        assert!(
            config.hierarchical == want_hierarchical,
            "auto_config({}) should return hierarchical={}",
            input,
            want_hierarchical
        );
        assert_eq!(config.min_community_size, want_min_size);
    }
}
/// In hierarchical mode `cluster` must report zero split clusters.
///
/// Builds 65 files spread over directories `src/mod0` .. `src/mod6` (ten per
/// directory), where each file `calls` the next three files in a ring. With
/// `hierarchical: true` and `max_cluster_size: Some(10)` the result must have
/// `metrics.clusters_split == 0` and still contain at least one node.
#[test]
fn test_hierarchical_skips_mega_split() {
    let file_count = 65;
    let files: Vec<String> = (0..file_count)
        .map(|i| format!("src/mod{}/f{}.rs", i / 10, i))
        .collect();

    let mut g = Graph::default();
    for path in &files {
        g.nodes.push(make_file_node(path));
    }
    // Ring topology: file i calls files i+1, i+2 and i+3 (wrapping around).
    for (i, src) in files.iter().enumerate() {
        for offset in 1..=3 {
            let dst = &files[(i + offset) % file_count];
            g.edges.push(Edge::new(
                &format!("file:{}", src),
                &format!("file:{}", dst),
                "calls",
            ));
        }
    }

    let config = ClusterConfig {
        seed: 42,
        num_trials: 10,
        min_community_size: 2,
        hierarchical: true,
        max_cluster_size: Some(10),
        ..Default::default()
    };
    let result = cluster(&g, &config).unwrap();

    let split_count = result.metrics.clusters_split;
    assert_eq!(
        split_count, 0,
        "Hierarchical mode should skip split_mega_clusters (clusters_split should be 0, got {})",
        split_count
    );
    assert!(
        !result.nodes.is_empty(),
        "Hierarchical clustering should still produce component nodes"
    );
}
/// `split_oversized_by_directory` with a cap of 4 must split a six-file leaf
/// cluster by directory and leave a two-file cluster alone.
///
/// Expected outcome: three clusters in total, all eight files still present, one
/// cluster of three `auth` files, one of three `/db/` files, and cluster ids equal
/// to their position in the returned list.
#[test]
fn test_split_oversized_by_directory() {
    let to_ids = |paths: &[&str]| -> Vec<String> { paths.iter().map(|p| p.to_string()).collect() };

    let oversized = RawCluster {
        id: 0,
        member_ids: to_ids(&[
            "file:src/auth/login.rs",
            "file:src/auth/logout.rs",
            "file:src/auth/session.rs",
            "file:src/db/pool.rs",
            "file:src/db/query.rs",
            "file:src/db/migrate.rs",
        ]),
        flow: 1.0,
        parent: None,
        children: vec![],
    };
    let small = RawCluster {
        id: 1,
        member_ids: to_ids(&["file:src/api/handler.rs", "file:src/api/router.rs"]),
        flow: 0.5,
        parent: None,
        children: vec![],
    };

    let result = split_oversized_by_directory(vec![oversized, small], 4);
    assert_eq!(
        result.len(),
        3,
        "Expected 3 clusters (2 from split + 1 preserved), got {}",
        result.len()
    );

    // No file may be lost by the split.
    let mut all_members: HashSet<String> = HashSet::new();
    for found in &result {
        for member in &found.member_ids {
            all_members.insert(member.clone());
        }
    }
    assert_eq!(all_members.len(), 8, "All 8 files should be preserved");
    for expected in [
        "file:src/auth/login.rs",
        "file:src/db/pool.rs",
        "file:src/api/handler.rs",
    ]
    .iter()
    {
        assert!(all_members.contains(*expected));
    }

    // Each directory ends up in its own three-member cluster.
    let auth_cluster = result
        .iter()
        .find(|c| c.member_ids.iter().any(|m| m.contains("auth")))
        .expect("Should have an auth cluster");
    let db_cluster = result
        .iter()
        .find(|c| c.member_ids.iter().any(|m| m.contains("/db/")))
        .expect("Should have a db cluster");
    assert_eq!(auth_cluster.member_ids.len(), 3);
    assert_eq!(db_cluster.member_ids.len(), 3);

    for (position, found) in result.iter().enumerate() {
        assert_eq!(found.id, position, "Cluster IDs should be sequential");
    }
}
/// A cluster that has children must not be split, even when it exceeds the cap.
///
/// The single input cluster holds ten files and lists children `[1, 2]`; with a
/// cap of 4 the result must still be exactly one cluster.
#[test]
fn test_split_oversized_preserves_non_leaf() {
    let member_ids: Vec<String> = (0..10).map(|i| format!("file:src/a/f{}.rs", i)).collect();
    let parent = RawCluster {
        id: 0,
        member_ids,
        flow: 1.0,
        parent: None,
        children: vec![1, 2],
    };

    let result = split_oversized_by_directory(vec![parent], 4);
    let cluster_count = result.len();
    assert_eq!(
        cluster_count, 1,
        "Non-leaf cluster should not be split, got {} clusters",
        cluster_count
    );
}
/// Files imported together by at least two importers get a co-citation edge in
/// both directions.
///
/// `f1` imports `a, b`; `f2` imports `a, b, c`; `f3` imports `d, e`. After
/// `add_co_citation_edges(.., 0.4, 2, 2.0)` the edge count must grow and `a` and
/// `b` must each list the other as an out-neighbour.
#[test]
fn test_co_citation_basic() {
    let mut g = Graph::default();
    for path in [
        "utils/a.ts",
        "utils/b.ts",
        "utils/c.ts",
        "utils/d.ts",
        "utils/e.ts",
        "features/f1.ts",
        "features/f2.ts",
        "features/f3.ts",
    ]
    .iter()
    {
        g.nodes.push(make_file_node(path));
    }

    // (importing feature, imported util)
    let imports = [
        ("f1", "a"),
        ("f1", "b"),
        ("f2", "a"),
        ("f2", "b"),
        ("f2", "c"),
        ("f3", "d"),
        ("f3", "e"),
    ];
    for (feature, util) in imports.iter() {
        g.edges.push(Edge::new(
            &format!("file:features/{}.ts", feature),
            &format!("file:utils/{}.ts", util),
            "imports",
        ));
    }

    let (mut net, idx_to_id) = build_network(&g);
    let edges_before = net.num_edges();
    add_co_citation_edges(&mut net, &g, &idx_to_id, 0.4, 2, 2.0);
    let edges_after = net.num_edges();
    assert!(
        edges_after > edges_before,
        "co-citation should add edges: before={}, after={}",
        edges_before,
        edges_after
    );

    let index_of = |wanted: &str| idx_to_id.iter().position(|id| id == wanted).unwrap();
    let a_idx = index_of("file:utils/a.ts");
    let b_idx = index_of("file:utils/b.ts");

    let a_reaches_b = net.out_neighbors(a_idx).iter().any(|&(t, _)| t == b_idx);
    assert!(a_reaches_b, "a should have co-citation edge to b");

    let b_reaches_a = net.out_neighbors(b_idx).iter().any(|&(t, _)| t == a_idx);
    assert!(
        b_reaches_a,
        "b should have co-citation edge to a (bidirectional)"
    );
}
/// The minimum-shared-importers argument gates co-citation edges.
///
/// A single feature file imports both utils. With a minimum of 2 the edge count
/// must stay unchanged; calling again on the same network with a minimum of 1
/// must increase it.
#[test]
fn test_co_citation_min_shared_threshold() {
    let mut g = Graph::default();
    for path in ["utils/a.ts", "utils/b.ts", "features/f1.ts"].iter() {
        g.nodes.push(make_file_node(path));
    }
    for util in ["a", "b"].iter() {
        g.edges.push(Edge::new(
            "file:features/f1.ts",
            &format!("file:utils/{}.ts", util),
            "imports",
        ));
    }

    let (mut net, idx_to_id) = build_network(&g);
    let edges_before = net.num_edges();

    // One shared importer is below a minimum of two.
    add_co_citation_edges(&mut net, &g, &idx_to_id, 0.4, 2, 2.0);
    assert_eq!(
        net.num_edges(),
        edges_before,
        "should not add edges when below min_shared"
    );

    // Lowering the minimum to one lets the same pair through.
    add_co_citation_edges(&mut net, &g, &idx_to_id, 0.4, 1, 2.0);
    assert!(
        net.num_edges() > edges_before,
        "should add edges when min_shared=1"
    );
}
/// The co-citation edge weight between two utils shared by ten importers must
/// equal the cap of 2.0 passed as the last argument (within 0.001).
#[test]
fn test_co_citation_weight_cap() {
    let mut g = Graph::default();
    g.nodes.push(make_file_node("utils/a.ts"));
    g.nodes.push(make_file_node("utils/b.ts"));

    // Ten feature files, each importing both utils.
    for i in 0..10 {
        let feature_path = format!("features/f{}.ts", i);
        let feature_id = format!("file:{}", feature_path);
        g.nodes.push(make_file_node(&feature_path));
        for util_id in ["file:utils/a.ts", "file:utils/b.ts"].iter() {
            g.edges.push(Edge::new(&feature_id, util_id, "imports"));
        }
    }

    let (mut net, idx_to_id) = build_network(&g);
    add_co_citation_edges(&mut net, &g, &idx_to_id, 0.4, 2, 2.0);

    let index_of = |wanted: &str| idx_to_id.iter().position(|id| id == wanted).unwrap();
    let a_idx = index_of("file:utils/a.ts");
    let b_idx = index_of("file:utils/b.ts");

    let weight = net
        .out_neighbors(a_idx)
        .iter()
        .find_map(|&(t, w)| if t == b_idx { Some(w) } else { None })
        .expect("should have co-citation edge");
    assert!(
        (weight - 2.0).abs() < 0.001,
        "weight should be capped at 2.0, got {}",
        weight
    );
}
/// A co-citation weight of `0.0` must leave the network's edge count unchanged,
/// even though two features import the same pair of utils.
#[test]
fn test_co_citation_disabled_when_zero_weight() {
    let mut g = Graph::default();
    for path in [
        "utils/a.ts",
        "utils/b.ts",
        "features/f1.ts",
        "features/f2.ts",
    ]
    .iter()
    {
        g.nodes.push(make_file_node(path));
    }
    for feature in ["f1", "f2"].iter() {
        for util in ["a", "b"].iter() {
            g.edges.push(Edge::new(
                &format!("file:features/{}.ts", feature),
                &format!("file:utils/{}.ts", util),
                "imports",
            ));
        }
    }

    let (mut net, idx_to_id) = build_network(&g);
    let edges_before = net.num_edges();
    add_co_citation_edges(&mut net, &g, &idx_to_id, 0.0, 2, 2.0);
    assert_eq!(net.num_edges(), edges_before);
}
/// Hooks that live in one directory but are imported by different feature areas
/// must not all land in a single component.
///
/// Five auth hooks are imported by four `features/auth` pages and five UI hooks by
/// four `features/ui` pages. Directory co-location is disabled (weight `0.0`).
/// `cluster` must yield at least three nodes, and no node may be the target of
/// `belongs_to` edges from all ten hooks.
#[test]
fn test_co_citation_splits_utils_cluster() {
    let auth_utils = [
        "hooks/useAuth.ts",
        "hooks/useSession.ts",
        "hooks/usePermissions.ts",
        "hooks/useAuthCallback.ts",
        "hooks/useToken.ts",
    ];
    let ui_utils = [
        "hooks/useTheme.ts",
        "hooks/useModal.ts",
        "hooks/useToast.ts",
        "hooks/useAnimation.ts",
        "hooks/useMediaQuery.ts",
    ];

    let mut g = Graph::default();
    for hook in auth_utils.iter().chain(ui_utils.iter()) {
        g.nodes.push(make_file_node(hook));
    }
    // Four pages per feature area, each importing every hook of its own area.
    for (area, hooks) in [("auth", &auth_utils), ("ui", &ui_utils)].iter() {
        for i in 0..4 {
            let page = format!("features/{}/page{}.ts", area, i);
            g.nodes.push(make_file_node(&page));
            for hook in hooks.iter() {
                g.edges.push(Edge::new(
                    &format!("file:{}", page),
                    &format!("file:{}", hook),
                    "imports",
                ));
            }
        }
    }

    let config = ClusterConfig {
        co_citation_weight: 0.4,
        co_citation_min_shared: 2,
        dir_colocation_weight: 0.0,
        seed: 42,
        num_trials: 10,
        min_community_size: 2,
        max_cluster_size: Some(8),
        ..Default::default()
    };
    let result = cluster(&g, &config).unwrap();
    assert!(
        result.nodes.len() >= 3,
        "co-citation should produce >= 3 components (auth-hooks, ui-hooks, features), got {}",
        result.nodes.len()
    );

    let mut all_hooks: HashSet<String> = HashSet::new();
    for hook in auth_utils.iter().chain(ui_utils.iter()) {
        all_hooks.insert(format!("file:{}", hook));
    }

    for component in &result.nodes {
        // Number of hook files attached to this component through `belongs_to`.
        let hooks_in_component = result
            .edges
            .iter()
            .filter(|e| e.to == component.id && e.relation == "belongs_to")
            .filter(|e| all_hooks.contains(&e.from))
            .count();
        assert!(
            hooks_in_component < all_hooks.len(),
            "component {} contains all {} hooks — co-citation didn't split them",
            component.id,
            all_hooks.len()
        );
    }
}
/// Directory co-location edges must only be added between files that have no
/// other edges.
///
/// Seven files form an import chain (six edges) and three files in the same
/// directory have no edges. After `add_dir_colocation_edges` with weight 0.3
/// exactly six edges must have been added, and every chained file must still
/// have at most one outgoing and one incoming edge.
#[test]
fn test_colocation_isolation_gating() {
    let connected_id = |i: usize| format!("file:src/utils/connected{}.ts", i);

    let mut g = Graph::default();
    for i in 0..7 {
        g.nodes
            .push(make_file_node(&format!("src/utils/connected{}.ts", i)));
    }
    // Import chain: connected0 -> connected1 -> ... -> connected6.
    for (from, to) in (0..6).zip(1..7) {
        g.edges
            .push(Edge::new(&connected_id(from), &connected_id(to), "imports"));
    }
    for i in 0..3 {
        g.nodes
            .push(make_file_node(&format!("src/utils/orphan{}.ts", i)));
    }

    let (mut net, idx_to_id) = build_network(&g);
    let edges_before = net.num_edges();
    assert_eq!(
        edges_before, 6,
        "import chain should produce 6 edges, got {}",
        edges_before
    );

    add_dir_colocation_edges(&mut net, &idx_to_id, 0.3);
    let colocation_edges_added = net.num_edges() - edges_before;
    assert_eq!(
        colocation_edges_added, 6,
        "Should add 6 co-location edges (3 isolated files × C(3,2) × 2 directions), got {}",
        colocation_edges_added
    );

    // The chained files must not have picked up any extra neighbours.
    for i in 0..7 {
        let file_id = connected_id(i);
        let idx = idx_to_id.iter().position(|id| id == &file_id).unwrap();
        let out_count = net.out_neighbors(idx).len();
        let in_count = net.in_neighbors(idx).len();
        assert!(
            out_count <= 1 && in_count <= 1,
            "connected file {} should have ≤1 out + ≤1 in edges, got out={} in={}",
            i,
            out_count,
            in_count
        );
    }
}
/// When every file in a directory already has edges, `add_dir_colocation_edges`
/// must add none.
///
/// Fifty files in one directory import each other in a ring, so each file has an
/// edge before co-location runs. The edge count must be the same afterwards.
#[test]
fn test_colocation_skips_when_all_connected() {
    const FILES: usize = 50;
    let path_of = |i: usize| format!("src/big_flat_dir/f{}.ts", i);

    let mut g = Graph::default();
    for i in 0..FILES {
        g.nodes.push(make_file_node(&path_of(i)));
    }
    // Ring of imports: f0 -> f1 -> ... -> f49 -> f0.
    for i in 0..FILES {
        let from = format!("file:{}", path_of(i));
        let to = format!("file:{}", path_of((i + 1) % FILES));
        g.edges.push(Edge::new(&from, &to, "imports"));
    }

    let (mut net, idx_to_id) = build_network(&g);
    let edges_before = net.num_edges();
    add_dir_colocation_edges(&mut net, &idx_to_id, 0.3);

    assert_eq!(
        net.num_edges(),
        edges_before,
        "all files have edges → co-location should add ZERO new edges, but added {} (was {}, now {})",
        net.num_edges() - edges_before,
        edges_before,
        net.num_edges(),
    );
}
/// `contains` edges must not produce co-citation edges.
///
/// Two feature files each have `contains` edges to the same two utils. After
/// `add_co_citation_edges(.., 0.4, 2, 2.0)` the edge count must be unchanged.
#[test]
fn test_co_citation_only_import_like_edges() {
    let mut g = Graph::default();
    for path in [
        "utils/a.ts",
        "utils/b.ts",
        "features/f1.ts",
        "features/f2.ts",
    ]
    .iter()
    {
        g.nodes.push(make_file_node(path));
    }
    for feature in ["f1", "f2"].iter() {
        for util in ["a", "b"].iter() {
            g.edges.push(Edge::new(
                &format!("file:features/{}.ts", feature),
                &format!("file:utils/{}.ts", util),
                "contains",
            ));
        }
    }

    let (mut net, idx_to_id) = build_network(&g);
    let edges_before = net.num_edges();
    add_co_citation_edges(&mut net, &g, &idx_to_id, 0.4, 2, 2.0);
    assert_eq!(
        net.num_edges(),
        edges_before,
        "structural edges should not create co-citation"
    );
}
/// Pins the pieces `split_camel_case` returns for a set of identifiers,
/// including acronym runs, a single word, a single letter and the empty string.
#[test]
fn test_split_camel_case() {
    // (identifier, expected pieces in order)
    let cases: Vec<(&str, Vec<&str>)> = vec![
        ("getAuthToken", vec!["get", "Auth", "Token"]),
        ("getOAuthToken", vec!["get", "O", "Auth", "Token"]),
        (
            "AwsAuthStatusManager",
            vec!["Aws", "Auth", "Status", "Manager"],
        ),
        ("HTMLParser", vec!["HTML", "Parser"]),
        ("simple", vec!["simple"]),
        ("URL", vec!["URL"]),
        ("", vec![]),
        ("a", vec!["a"]),
        ("parseJSON", vec!["parse", "JSON"]),
        ("XMLHttpRequest", vec!["XML", "Http", "Request"]),
    ];

    for (identifier, expected) in cases {
        assert_eq!(split_camel_case(identifier), expected);
    }
}
/// Pins which tokens `tokenize_symbol_name` keeps and drops for camelCase and
/// snake_case names, and that `getDefaultValue` and `aB` yield no tokens.
#[test]
fn test_tokenize_symbol_name() {
    // camelCase: "auth" and "token" are kept; "get" and "o" are not.
    let tokens = tokenize_symbol_name("getOAuthToken");
    for kept in ["auth", "token"].iter() {
        assert!(tokens.contains(*kept));
    }
    for dropped in ["get", "o"].iter() {
        assert!(!tokens.contains(*dropped));
    }

    // snake_case: every part is kept.
    let tokens = tokenize_symbol_name("parse_auth_token");
    for kept in ["parse", "auth", "token"].iter() {
        assert!(tokens.contains(*kept));
    }

    // PascalCase: every part is kept, lower-cased.
    let tokens = tokenize_symbol_name("AwsAuthStatusManager");
    for kept in ["aws", "auth", "status", "manager"].iter() {
        assert!(tokens.contains(*kept));
    }

    // Names for which nothing is kept.
    for name in ["getDefaultValue", "aB"].iter() {
        let tokens = tokenize_symbol_name(name);
        assert!(tokens.is_empty());
    }
}
/// Pins a sample of words `is_stop_word` must accept and a sample it must reject.
#[test]
fn test_is_stop_word() {
    let stop_words = ["get", "set", "create", "default", "value"];
    let domain_words = ["auth", "oauth", "token", "parser", "manager"];

    for word in stop_words.iter() {
        assert!(is_stop_word(word));
    }
    for word in domain_words.iter() {
        assert!(!is_stop_word(word));
    }
}
/// Files whose function names share vocabulary get a symbol-similarity edge;
/// files with unrelated vocabulary do not.
///
/// Two auth files and two format files each hold two function nodes. The network
/// is built from the four file nodes only and starts with no edges. After
/// `add_symbol_similarity_edges(.., 0.5, 2, 0.15)`, login must link to token,
/// formatDate must link to formatCurrency, and login must link to neither
/// format file.
#[test]
fn test_symbol_similarity_edges_basic() {
    let mut g = Graph::default();
    for path in [
        "src/auth/login.ts",
        "src/auth/token.ts",
        "src/utils/formatDate.ts",
        "src/utils/formatCurrency.ts",
    ]
    .iter()
    {
        g.nodes.push(make_file_node(path));
    }

    // (function name, file that declares it)
    let functions = [
        ("validateAuthToken", "src/auth/login.ts"),
        ("refreshAuthCredential", "src/auth/login.ts"),
        ("storeAuthToken", "src/auth/token.ts"),
        ("revokeAuthCredential", "src/auth/token.ts"),
        ("formatLocalDate", "src/utils/formatDate.ts"),
        ("parseDateString", "src/utils/formatDate.ts"),
        ("formatLocalCurrency", "src/utils/formatCurrency.ts"),
        ("parseCurrencyString", "src/utils/formatCurrency.ts"),
    ];
    for &(symbol, path) in functions.iter() {
        let mut n = Node::new(&format!("func:{}", symbol), symbol);
        n.node_type = Some("code".into());
        n.node_kind = Some("Function".into());
        n.file_path = Some(path.into());
        g.nodes.push(n);
    }

    let (mut net, idx_to_id) = build_network(&g);
    assert_eq!(net.num_nodes(), 4);
    assert_eq!(net.num_edges(), 0);
    add_symbol_similarity_edges(&mut net, &g, &idx_to_id, 0.5, 2, 0.15);

    let index_containing =
        |needle: &str| idx_to_id.iter().position(|id| id.contains(needle)).unwrap();
    let linked = |from: usize, to: usize| net.out_neighbors(from).iter().any(|&(t, _)| t == to);

    let login_idx = index_containing("login");
    let token_idx = index_containing("token.ts");
    assert!(
        linked(login_idx, token_idx),
        "Auth files should be connected by symbol similarity"
    );

    let date_idx = index_containing("formatDate");
    let currency_idx = index_containing("formatCurrency");
    assert!(
        linked(date_idx, currency_idx),
        "Format files should be connected by symbol similarity"
    );

    assert!(
        !linked(login_idx, date_idx),
        "Auth should not connect to format (date)"
    );
    assert!(
        !linked(login_idx, currency_idx),
        "Auth should not connect to format (currency)"
    );
}
/// The minimum-shared-tokens argument of `add_symbol_similarity_edges` gates
/// edge creation.
///
/// `a.ts` declares `authHandler` and `b.ts` declares `authValidator`. With
/// arguments `(0.5, 2, 0.15)` no edge may be created; on a fresh network with
/// `(0.5, 1, 0.0)` at least one edge must appear.
#[test]
fn test_symbol_similarity_threshold_enforcement() {
    let mut g = Graph::default();
    g.nodes.push(make_file_node("a.ts"));
    g.nodes.push(make_file_node("b.ts"));
    for &(symbol, path) in [("authHandler", "a.ts"), ("authValidator", "b.ts")].iter() {
        let mut n = Node::new(&format!("func:{}", symbol), symbol);
        n.node_type = Some("code".into());
        n.node_kind = Some("Function".into());
        n.file_path = Some(path.into());
        g.nodes.push(n);
    }

    // Strict settings: min_shared = 2.
    let (mut strict_net, strict_ids) = build_network(&g);
    add_symbol_similarity_edges(&mut strict_net, &g, &strict_ids, 0.5, 2, 0.15);
    assert_eq!(
        strict_net.num_edges(),
        0,
        "Single shared token should not create edge with min_shared=2"
    );

    // Relaxed settings on a fresh network: min_shared = 1, Jaccard floor 0.0.
    let (mut relaxed_net, relaxed_ids) = build_network(&g);
    add_symbol_similarity_edges(&mut relaxed_net, &g, &relaxed_ids, 0.5, 1, 0.0);
    assert!(
        relaxed_net.num_edges() > 0,
        "Single shared token should create edge with min_shared=1"
    );
}
/// The symbol-similarity edge weight must be positive and no larger than the
/// base weight passed in.
///
/// `a.ts` and `b.ts` each declare two auth-related functions. After
/// `add_symbol_similarity_edges` with a base weight of 0.5, the first
/// out-neighbour entry of `a.ts` must have a weight in `(0.0, 0.5]`.
#[test]
fn test_symbol_similarity_weight_scaling() {
    let mut g = Graph::default();
    g.nodes.push(make_file_node("a.ts"));
    g.nodes.push(make_file_node("b.ts"));

    // (function name, file that declares it)
    let functions = [
        ("validateAuthToken", "a.ts"),
        ("authCredentialStore", "a.ts"),
        ("refreshAuthToken", "b.ts"),
        ("authSessionStore", "b.ts"),
    ];
    for &(symbol, path) in functions.iter() {
        let mut n = Node::new(&format!("func:{}", symbol), symbol);
        n.node_type = Some("code".into());
        n.node_kind = Some("Function".into());
        n.file_path = Some(path.into());
        g.nodes.push(n);
    }

    let (mut net, idx_to_id) = build_network(&g);
    let base_weight = 0.5;
    add_symbol_similarity_edges(&mut net, &g, &idx_to_id, base_weight, 2, 0.0);

    let a_idx = idx_to_id
        .iter()
        .position(|id| id.contains("a.ts"))
        .unwrap();
    let edge_weight = net
        .out_neighbors(a_idx)
        .iter()
        .next()
        .map(|&(_, w)| w)
        .expect("Should have symbol similarity edge");
    assert!(edge_weight > 0.0);
    assert!(edge_weight <= base_weight);
}
/// Two file nodes with no symbol nodes attached must get no symbol-similarity
/// edges.
#[test]
fn test_symbol_similarity_empty_files() {
    let mut g = Graph::default();
    for path in ["a.ts", "b.ts"].iter() {
        g.nodes.push(make_file_node(path));
    }

    let (mut net, idx_to_id) = build_network(&g);
    add_symbol_similarity_edges(&mut net, &g, &idx_to_id, 0.5, 2, 0.15);

    assert_eq!(net.num_edges(), 0);
}
/// A symbol-similarity weight of `0.0` must add no edges, even with the most
/// permissive thresholds (`min_shared = 1`, Jaccard floor `0.0`).
#[test]
fn test_symbol_similarity_disabled() {
    let mut g = Graph::default();
    g.nodes.push(make_file_node("a.ts"));
    g.nodes.push(make_file_node("b.ts"));
    for &(symbol, path) in [("authLogin", "a.ts"), ("authLogout", "b.ts")].iter() {
        let mut n = Node::new(&format!("func:{}", symbol), symbol);
        n.node_type = Some("code".into());
        n.node_kind = Some("Function".into());
        n.file_path = Some(path.into());
        g.nodes.push(n);
    }

    let (mut net, idx_to_id) = build_network(&g);
    add_symbol_similarity_edges(&mut net, &g, &idx_to_id, 0.0, 1, 0.0);

    assert_eq!(net.num_edges(), 0, "Weight 0.0 should add no edges");
}
/// A file imported by ten others must be the only hub `identify_hubs` reports.
///
/// `src/hub.ts` is imported by `consumer0` .. `consumer9`. With arguments
/// `(0.05, 1)` the returned set must contain the index of `hub.ts` and nothing
/// else.
#[test]
fn test_identify_hubs_basic() {
    let mut g = Graph::default();
    g.nodes.push(make_file_node("src/hub.ts"));
    for i in 0..10 {
        let consumer = format!("src/consumer{}.ts", i);
        g.nodes.push(make_file_node(&consumer));
        g.edges.push(Edge::new(
            &format!("file:{}", consumer),
            "file:src/hub.ts",
            "imports",
        ));
    }

    let (_, idx_to_id) = build_network(&g);
    let hubs = identify_hubs(&g, &idx_to_id, 0.05, 1);
    assert!(
        !hubs.is_empty(),
        "hub.ts with in_degree=10 should be identified as a hub"
    );

    let hub_idx = idx_to_id
        .iter()
        .position(|id| id == "file:src/hub.ts")
        .unwrap();
    assert!(
        hubs.contains(&hub_idx),
        "hub.ts index should be in the hub set"
    );
    assert_eq!(hubs.len(), 1, "Only hub.ts should be a hub");
}
/// The threshold argument of `identify_hubs` decides whether a moderately
/// imported file counts as a hub.
///
/// In a 20-file graph, `src/shared.ts` is imported by five files. A threshold of
/// 0.05 must yield a non-empty hub set; a threshold of 0.5 must yield none.
#[test]
fn test_identify_hubs_threshold_sensitivity() {
    let mut g = Graph::default();
    g.nodes.push(make_file_node("src/shared.ts"));
    for i in 0..19 {
        g.nodes.push(make_file_node(&format!("src/f{}.ts", i)));
        // Only the first five files import the shared one.
        if i < 5 {
            g.edges.push(Edge::new(
                &format!("file:src/f{}.ts", i),
                "file:src/shared.ts",
                "imports",
            ));
        }
    }

    let (_, idx_to_id) = build_network(&g);

    let hubs_low = identify_hubs(&g, &idx_to_id, 0.05, 1);
    assert!(!hubs_low.is_empty(), "Low threshold should catch shared.ts");

    let hubs_high = identify_hubs(&g, &idx_to_id, 0.5, 1);
    assert!(
        hubs_high.is_empty(),
        "High threshold should not catch shared.ts with in_degree=5"
    );
}
/// With hub exclusion enabled, a file imported by everything ends up in a
/// dedicated infrastructure component.
///
/// Two four-file domains (`src/auth`, `src/db`) are each fully connected by
/// `calls` edges, and all eight files import `src/utils/ink.ts`. `cluster` must
/// report at least three communities and produce a node
/// `infer:component:infrastructure`, titled "Infrastructure & Shared Utilities",
/// with exactly one `contains` edge pointing at `ink.ts`.
#[test]
fn test_hub_exclusion_produces_cleaner_clusters() {
    let domain_a = [
        "src/auth/login.ts",
        "src/auth/logout.ts",
        "src/auth/session.ts",
        "src/auth/token.ts",
    ];
    let domain_b = [
        "src/db/pool.ts",
        "src/db/query.ts",
        "src/db/migrate.ts",
        "src/db/schema.ts",
    ];

    let mut g = Graph::default();
    // Each domain: add its files, then a `calls` edge for every ordered pair.
    for domain in [&domain_a, &domain_b].iter() {
        for path in domain.iter() {
            g.nodes.push(make_file_node(path));
        }
        for (i, src) in domain.iter().enumerate() {
            for (j, dst) in domain.iter().enumerate() {
                if i == j {
                    continue;
                }
                g.edges.push(Edge::new(
                    &format!("file:{}", src),
                    &format!("file:{}", dst),
                    "calls",
                ));
            }
        }
    }

    // The shared file that every domain file imports.
    g.nodes.push(make_file_node("src/utils/ink.ts"));
    for path in domain_a.iter().chain(domain_b.iter()) {
        g.edges.push(Edge::new(
            &format!("file:{}", path),
            "file:src/utils/ink.ts",
            "imports",
        ));
    }

    let config_with = ClusterConfig {
        seed: 42,
        num_trials: 10,
        min_community_size: 2,
        hub_exclusion_threshold: 0.05,
        hub_min_degree: 5,
        ..Default::default()
    };
    let result_with = cluster(&g, &config_with).unwrap();
    assert!(
        result_with.metrics.num_communities >= 3,
        "Hub exclusion should produce at least 3 components (2 domains + 1 infra), got {}",
        result_with.metrics.num_communities,
    );

    let infra_node = result_with
        .nodes
        .iter()
        .find(|n| n.id == "infer:component:infrastructure")
        .expect("Infrastructure component should exist");
    assert_eq!(infra_node.title, "Infrastructure & Shared Utilities");

    let infra_edges: Vec<&Edge> = result_with
        .edges
        .iter()
        .filter(|e| e.from == "infer:component:infrastructure" && e.relation == "contains")
        .collect();
    assert_eq!(
        infra_edges.len(),
        1,
        "Infrastructure should contain exactly 1 hub file"
    );
    assert_eq!(infra_edges[0].to, "file:src/utils/ink.ts");
}
/// With `hub_exclusion_threshold: 0.0`, `cluster` must not emit the
/// `infer:component:infrastructure` node, even though `src/hub.ts` is imported
/// by ten files.
#[test]
fn test_hub_exclusion_disabled() {
    let mut g = Graph::default();
    g.nodes.push(make_file_node("src/hub.ts"));
    for i in 0..10 {
        let importer = format!("src/f{}.ts", i);
        g.nodes.push(make_file_node(&importer));
        g.edges.push(Edge::new(
            &format!("file:{}", importer),
            "file:src/hub.ts",
            "imports",
        ));
    }

    let config = ClusterConfig {
        seed: 42,
        num_trials: 10,
        min_community_size: 1,
        hub_exclusion_threshold: 0.0,
        ..Default::default()
    };
    let result = cluster(&g, &config).unwrap();

    let has_infra = result
        .nodes
        .iter()
        .any(|n| n.id == "infer:component:infrastructure");
    assert!(
        !has_infra,
        "Hub exclusion disabled should produce no infrastructure component"
    );
}
/// Pins the node and edges `create_infra_component` returns for two excluded
/// file ids and an empty graph.
///
/// The node must carry the infrastructure id and title, type `component`, source
/// `infer`, and metadata `size = 2` and `hub_excluded = true`. There must be two
/// `contains` edges from that node, one to each excluded file.
#[test]
fn test_infra_component_created() {
    let excluded_ids: Vec<String> = ["file:src/utils/ink.ts", "file:src/utils/debug.ts"]
        .iter()
        .map(|id| id.to_string())
        .collect();
    let g = Graph::default();

    let (node, edges) = create_infra_component(&excluded_ids, &g);

    // Identity and classification of the component node.
    assert_eq!(node.id, "infer:component:infrastructure");
    assert_eq!(node.title, "Infrastructure & Shared Utilities");
    assert_eq!(node.node_type.as_deref(), Some("component"));
    assert_eq!(node.source.as_deref(), Some("infer"));

    // Metadata attached to the component node.
    let size = node.metadata.get("size").and_then(|v| v.as_u64());
    assert_eq!(size, Some(2));
    let hub_excluded = node.metadata.get("hub_excluded").and_then(|v| v.as_bool());
    assert_eq!(hub_excluded, Some(true));

    // One `contains` edge per excluded file, all leaving the component node.
    assert_eq!(edges.len(), 2);
    for edge in &edges {
        assert_eq!(edge.from, "infer:component:infrastructure");
        assert_eq!(edge.relation, "contains");
    }
    assert!(edges.iter().any(|e| e.to == "file:src/utils/ink.ts"));
    assert!(edges.iter().any(|e| e.to == "file:src/utils/debug.ts"));
}
}