use std::collections::{BTreeMap, BTreeSet, HashMap};
use rusqlite::Connection;
use serde::Serialize;
use crate::query::repo_brief::{
self, FileBriefRow, RepoBriefMemoryCounts, enrich_memory_counts, enrich_symbol_kinds,
};
const MAX_PATH_BUCKET_FILES: usize = 120;
const MAX_COTOUCH_COMMIT_FILES: usize = 40;
const MAX_CLUSTER_PATHS: usize = 8;
const MIN_EDGE_SCORE: f64 = 0.34;
#[derive(Debug, Clone, Copy)]
pub struct RepoClustersOptions {
pub limit: u32,
pub include_generated: bool,
pub include_memories: bool,
pub min_cluster_size: u32,
}
impl Default for RepoClustersOptions {
fn default() -> Self {
Self { limit: 10, include_generated: false, include_memories: true, min_cluster_size: 2 }
}
}
#[derive(Debug, Serialize)]
pub struct RepoClustersReport {
pub summary: RepoClustersSummary,
pub clusters: Vec<RepoCluster>,
pub warnings: Vec<String>,
}
#[derive(Debug, Serialize)]
pub struct RepoClustersSummary {
pub total_files: u64,
pub clustered_files: u64,
pub singleton_files: u64,
pub returned_clusters: u64,
pub generated_files: u64,
pub git_commits: u64,
pub git_file_changes: u64,
pub graph_edges: u64,
pub repo_memories: RepoBriefMemoryCounts,
pub scoring_note: String,
}
#[derive(Debug, Serialize)]
pub struct RepoCluster {
pub cluster_id: String,
pub name: String,
pub category: String,
pub confidence: String,
pub score: f64,
pub metrics: RepoClusterMetrics,
pub representative_paths: Vec<String>,
pub why: Vec<String>,
pub next_tools: Vec<RepoClusterNextTool>,
}
#[derive(Debug, Default, Clone, Serialize)]
pub struct RepoClusterMetrics {
pub file_count: u64,
pub total_lines: u64,
pub total_chunks: u64,
pub total_symbols: u64,
pub fan_in: u64,
pub fan_out: u64,
pub commit_touch_count: u64,
pub recent_touch_count: u64,
pub additions: u64,
pub deletions: u64,
pub co_touch_edges: u64,
pub graph_edges: u64,
pub languages: BTreeMap<String, u64>,
pub kinds: BTreeMap<String, u64>,
pub memories: RepoBriefMemoryCounts,
}
#[derive(Debug, Serialize)]
pub struct RepoClusterNextTool {
pub tool: String,
pub reason: String,
pub args: BTreeMap<String, serde_json::Value>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
struct PathPair(usize, usize);
impl PathPair {
fn new(left: usize, right: usize) -> Self {
if left <= right { Self(left, right) } else { Self(right, left) }
}
}
#[derive(Debug, Default, Clone)]
struct PairEvidence {
co_touch_count: u64,
graph_edge_count: u64,
path_candidate: bool,
}
#[derive(Debug, Clone)]
struct ScoredPair {
pair: PathPair,
score: f64,
}
#[derive(Debug, Clone)]
struct ClusterBuild {
rows: Vec<FileBriefRow>,
avg_edge_score: f64,
co_touch_edges: u64,
graph_edges: u64,
}
pub fn repo_clusters(
conn: &Connection,
options: RepoClustersOptions,
) -> anyhow::Result<RepoClustersReport> {
let summary = repo_brief::summary_counts(conn)?;
let mut warnings = Vec::new();
if summary.git_commits == 0 {
warnings.push("git history is not indexed; co-touch clustering is unavailable".to_string());
}
if summary.graph_edges == 0 {
warnings.push(
"graph edges are not indexed; graph-neighborhood clustering is unavailable".to_string(),
);
}
let mut rows = repo_brief::file_rows(conn, options.include_generated)?;
if options.include_memories {
enrich_memory_counts(conn, &mut rows)?;
}
enrich_symbol_kinds(conn, &mut rows)?;
let co_touch_pairs = co_touch_pairs(conn, options.include_generated, &rows)?;
let graph_pairs = graph_pairs(conn, options.include_generated, &rows)?;
let clusters = cluster_rows(&rows, &co_touch_pairs, &graph_pairs, options.min_cluster_size);
let clustered_files = clusters.iter().map(|cluster| cluster.rows.len() as u64).sum::<u64>();
let mut clusters = clusters.into_iter().map(cluster_for).collect::<Vec<_>>();
clusters.sort_by(|a, b| {
b.score
.total_cmp(&a.score)
.then_with(|| b.metrics.file_count.cmp(&a.metrics.file_count))
.then_with(|| a.name.cmp(&b.name))
});
for (index, cluster) in clusters.iter_mut().enumerate() {
cluster.cluster_id = format!("cluster_{:03}", index + 1);
}
let limit = usize::try_from(options.limit.max(1)).unwrap_or(usize::MAX);
clusters.truncate(limit);
let singleton_files = rows.len() as u64 - clustered_files;
Ok(RepoClustersReport {
summary: RepoClustersSummary {
total_files: summary.total_files,
clustered_files,
singleton_files,
returned_clusters: clusters.len() as u64,
generated_files: summary.generated_files,
git_commits: summary.git_commits,
git_file_changes: summary.git_file_changes,
graph_edges: summary.graph_edges,
repo_memories: summary.memories,
scoring_note:
"cheap sparse ownership clustering: path proximity, direct graph edges, bounded git co-touches, churn, and metadata"
.to_string(),
},
clusters,
warnings,
})
}
fn cluster_rows(
rows: &[FileBriefRow],
co_touch_pairs: &BTreeMap<PathPair, u64>,
graph_pairs: &BTreeMap<PathPair, u64>,
min_cluster_size: u32,
) -> Vec<ClusterBuild> {
if rows.is_empty() {
return Vec::new();
}
let candidates = candidate_pairs(rows, co_touch_pairs, graph_pairs);
let mut parent = UnionFind::new(rows.len());
let mut accepted_pairs = Vec::new();
for pair in candidates {
if pair.score >= MIN_EDGE_SCORE {
parent.union(pair.pair.0, pair.pair.1);
accepted_pairs.push(pair);
}
}
let mut groups = BTreeMap::<usize, Vec<usize>>::new();
for index in 0..rows.len() {
groups.entry(parent.find(index)).or_default().push(index);
}
let min_cluster_size = usize::try_from(min_cluster_size.max(2)).unwrap_or(usize::MAX);
let mut clusters = Vec::new();
for indexes in groups.values().filter(|indexes| indexes.len() >= min_cluster_size) {
let index_set = indexes.iter().copied().collect::<BTreeSet<_>>();
let mut edge_score_sum = 0.0;
let mut edge_count = 0_u64;
let mut co_touch_edges = 0_u64;
let mut graph_edges = 0_u64;
for pair in &accepted_pairs {
if index_set.contains(&pair.pair.0) && index_set.contains(&pair.pair.1) {
edge_score_sum += pair.score;
edge_count += 1;
co_touch_edges += co_touch_pairs.get(&pair.pair).copied().unwrap_or(0);
graph_edges += graph_pairs.get(&pair.pair).copied().unwrap_or(0);
}
}
let avg_edge_score = if edge_count == 0 { 0.0 } else { edge_score_sum / edge_count as f64 };
clusters.push(ClusterBuild {
rows: indexes.iter().map(|index| rows[*index].clone()).collect(),
avg_edge_score,
co_touch_edges,
graph_edges,
});
}
clusters
}
fn candidate_pairs(
rows: &[FileBriefRow],
co_touch_pairs: &BTreeMap<PathPair, u64>,
graph_pairs: &BTreeMap<PathPair, u64>,
) -> Vec<ScoredPair> {
let mut evidence = BTreeMap::<PathPair, PairEvidence>::new();
for (pair, count) in co_touch_pairs {
evidence.entry(*pair).or_default().co_touch_count = *count;
}
for (pair, count) in graph_pairs {
evidence.entry(*pair).or_default().graph_edge_count = *count;
}
for pair in path_candidate_pairs(rows) {
evidence.entry(pair).or_default().path_candidate = true;
}
evidence
.into_iter()
.filter_map(|(pair, evidence)| {
let score = pair_score(&rows[pair.0], &rows[pair.1], &evidence);
(score > 0.0).then_some(ScoredPair { pair, score })
})
.collect()
}
fn pair_score(left: &FileBriefRow, right: &FileBriefRow, evidence: &PairEvidence) -> f64 {
let path = path_similarity(&left.path, &right.path);
let same_bucket = path_bucket(&left.path) == path_bucket(&right.path);
if !same_bucket {
return 0.0;
}
let co_touch = capped(evidence.co_touch_count as f64 / 4.0);
let graph = capped(evidence.graph_edge_count as f64 / 6.0);
let same_language = if left.language == right.language { 1.0 } else { 0.0 };
let same_kind = if left.kind == right.kind { 1.0 } else { 0.0 };
let churn = churn_similarity(left, right);
let path_weight = if evidence.path_candidate || same_bucket { 0.43 } else { 0.25 };
capped(
path * path_weight
+ co_touch * 0.24
+ graph * 0.22
+ same_language * 0.06
+ same_kind * 0.03
+ churn * 0.02,
)
}
fn co_touch_pairs(
conn: &Connection,
include_generated: bool,
rows: &[FileBriefRow],
) -> anyhow::Result<BTreeMap<PathPair, u64>> {
let path_index = path_index(rows);
let mut stmt = conn.prepare(
"
SELECT git_file_changes.commit_hash, git_file_changes.path
FROM git_file_changes
JOIN files ON files.path = git_file_changes.path
WHERE (?1 OR files.generated = 0)
ORDER BY git_file_changes.commit_hash, git_file_changes.path
",
)?;
let mut query_rows = stmt.query([include_generated])?;
let mut pairs = BTreeMap::<PathPair, u64>::new();
let mut current_commit = String::new();
let mut paths = Vec::new();
while let Some(row) = query_rows.next()? {
let commit = row.get::<_, String>(0)?;
let path = row.get::<_, String>(1)?;
if current_commit.is_empty() {
current_commit = commit.clone();
}
if commit != current_commit {
add_co_touch_pairs(&path_index, &paths, &mut pairs);
paths.clear();
current_commit = commit;
}
paths.push(path);
}
add_co_touch_pairs(&path_index, &paths, &mut pairs);
Ok(pairs)
}
fn add_co_touch_pairs(
path_index: &HashMap<&str, usize>,
paths: &[String],
pairs: &mut BTreeMap<PathPair, u64>,
) {
if paths.len() < 2 || paths.len() > MAX_COTOUCH_COMMIT_FILES {
return;
}
let mut indexes =
paths.iter().filter_map(|path| path_index.get(path.as_str()).copied()).collect::<Vec<_>>();
indexes.sort_unstable();
indexes.dedup();
for left in 0..indexes.len() {
for right in (left + 1)..indexes.len() {
*pairs.entry(PathPair::new(indexes[left], indexes[right])).or_default() += 1;
}
}
}
fn graph_pairs(
conn: &Connection,
include_generated: bool,
rows: &[FileBriefRow],
) -> anyhow::Result<BTreeMap<PathPair, u64>> {
let path_index = path_index(rows);
let bucket_by_path = rows
.iter()
.map(|row| (row.path.as_str(), path_bucket(&row.path)))
.collect::<HashMap<_, _>>();
let mut stmt = conn.prepare(
"
SELECT source_files.path, target_files.path
FROM edges
JOIN files source_files ON source_files.id = edges.source_file_id
JOIN symbols target_symbols ON target_symbols.id = edges.to_symbol_id
JOIN files target_files ON target_files.id = target_symbols.file_id
WHERE source_files.path != target_files.path
AND (?1 OR (source_files.generated = 0 AND target_files.generated = 0))
",
)?;
let mut pairs = BTreeMap::<PathPair, u64>::new();
let rows = stmt.query_map([include_generated], |row| {
Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
})?;
for row in rows {
let (left, right) = row?;
let (Some(left_bucket), Some(right_bucket)) =
(bucket_by_path.get(left.as_str()), bucket_by_path.get(right.as_str()))
else {
continue;
};
if left_bucket != right_bucket {
continue;
}
let (Some(left), Some(right)) =
(path_index.get(left.as_str()), path_index.get(right.as_str()))
else {
continue;
};
*pairs.entry(PathPair::new(*left, *right)).or_default() += 1;
}
Ok(pairs)
}
fn path_candidate_pairs(rows: &[FileBriefRow]) -> Vec<PathPair> {
let buckets = indexed_path_buckets(rows);
let mut pairs = Vec::new();
for indexes in buckets.values() {
if indexes.len() < 2 || indexes.len() > MAX_PATH_BUCKET_FILES {
continue;
}
for left in 0..indexes.len() {
for right in (left + 1)..indexes.len() {
pairs.push(PathPair::new(indexes[left], indexes[right]));
}
}
}
pairs
}
fn cluster_for(cluster: ClusterBuild) -> RepoCluster {
let mut metrics = RepoClusterMetrics::default();
let mut rows = cluster.rows;
rows.sort_by(|a, b| {
file_representative_score(b)
.total_cmp(&file_representative_score(a))
.then_with(|| a.path.cmp(&b.path))
});
for row in &rows {
metrics.file_count += 1;
metrics.total_lines += row.line_count;
metrics.total_chunks += row.chunk_count;
metrics.total_symbols += row.symbol_count;
metrics.fan_in += row.fan_in;
metrics.fan_out += row.fan_out;
metrics.commit_touch_count += row.commit_touch_count;
metrics.recent_touch_count += row.recent_touch_count;
metrics.additions += row.additions;
metrics.deletions += row.deletions;
*metrics.languages.entry(row.language.clone()).or_default() += 1;
*metrics.kinds.entry(row.kind.clone()).or_default() += 1;
metrics.memories.active += row.memories.active;
metrics.memories.stale += row.memories.stale;
metrics.memories.obsolete += row.memories.obsolete;
metrics.memories.other += row.memories.other;
}
metrics.co_touch_edges = cluster.co_touch_edges;
metrics.graph_edges = cluster.graph_edges;
let representative_paths =
rows.iter().take(MAX_CLUSTER_PATHS).map(|row| row.path.clone()).collect::<Vec<_>>();
let name = cluster_name(&rows);
let score = cluster_score(&metrics, cluster.avg_edge_score);
let category = cluster_category(&metrics).to_string();
let confidence = confidence_for(&metrics, cluster.avg_edge_score).to_string();
let why = why_for(&metrics, cluster.avg_edge_score, &name);
let next_tools = next_tools_for(representative_paths.first());
RepoCluster {
cluster_id: String::new(),
name,
category,
confidence,
score,
metrics,
representative_paths,
why,
next_tools,
}
}
fn file_representative_score(row: &FileBriefRow) -> f64 {
capped((row.fan_in + row.fan_out) as f64 / 50.0) * 0.40
+ capped(row.symbol_count as f64 / 40.0) * 0.25
+ capped(row.commit_touch_count as f64 / 25.0) * 0.25
+ capped(row.line_count as f64 / 800.0) * 0.10
}
fn cluster_score(metrics: &RepoClusterMetrics, avg_edge_score: f64) -> f64 {
capped(
avg_edge_score * 0.35
+ capped(metrics.file_count as f64 / 12.0) * 0.12
+ capped((metrics.fan_in + metrics.fan_out) as f64 / 100.0) * 0.18
+ capped(metrics.commit_touch_count as f64 / 60.0) * 0.15
+ capped(metrics.total_symbols as f64 / 120.0) * 0.12
+ capped((metrics.co_touch_edges + metrics.graph_edges) as f64 / 20.0) * 0.08,
)
}
fn cluster_category(metrics: &RepoClusterMetrics) -> &'static str {
if metrics.co_touch_edges > 0 && metrics.graph_edges > 0 {
"graph_and_churn_ownership"
} else if metrics.co_touch_edges > 0 {
"co_touch_ownership"
} else if metrics.graph_edges > 0 {
"graph_neighborhood"
} else {
"path_proximity"
}
}
fn confidence_for(metrics: &RepoClusterMetrics, avg_edge_score: f64) -> &'static str {
if (metrics.co_touch_edges > 0 && metrics.graph_edges > 0) || avg_edge_score >= 0.55 {
"high"
} else if metrics.co_touch_edges > 0 || metrics.graph_edges > 0 || avg_edge_score >= 0.42 {
"medium"
} else {
"low"
}
}
fn why_for(metrics: &RepoClusterMetrics, avg_edge_score: f64, name: &str) -> Vec<String> {
let mut why = vec![format!("shared ownership bucket: {name}")];
if metrics.graph_edges > 0 {
why.push(format!(
"graph neighborhood: {} direct indexed cross-file edges",
metrics.graph_edges
));
}
if metrics.co_touch_edges > 0 {
why.push(format!(
"co-touch history: {} bounded same-commit file-pair hits",
metrics.co_touch_edges
));
}
if metrics.commit_touch_count > 0 {
why.push(format!(
"churn: {} total commit touches, {} recent",
metrics.commit_touch_count, metrics.recent_touch_count
));
}
if metrics.total_symbols > 0 {
why.push(format!(
"source shape: {} files, {} symbols, {} lines",
metrics.file_count, metrics.total_symbols, metrics.total_lines
));
}
why.push(format!("average accepted edge score: {:.3}", avg_edge_score));
why
}
fn next_tools_for(path: Option<&String>) -> Vec<RepoClusterNextTool> {
let Some(path) = path else {
return Vec::new();
};
vec![
next_tool(
"repo_brief",
"compare cluster representatives against spine/churn/god-module rankings",
[("mode", "refactor_candidates".to_string())],
),
next_tool(
"impact_surface",
"inspect graph, git, papertrail, and memories for the representative path",
[("query", path.clone())],
),
next_tool(
"git_history_for_path",
"inspect co-touch and churn history",
[("path", path.clone())],
),
]
}
fn next_tool<const N: usize>(
tool: &str,
reason: &str,
args: [(&str, String); N],
) -> RepoClusterNextTool {
RepoClusterNextTool {
tool: tool.to_string(),
reason: reason.to_string(),
args: args
.into_iter()
.map(|(key, value)| (key.to_string(), serde_json::Value::String(value)))
.collect(),
}
}
fn cluster_name(rows: &[FileBriefRow]) -> String {
let paths = rows.iter().map(|row| row.path.as_str()).collect::<Vec<_>>();
let common = common_parent(&paths);
if !common.is_empty() {
return common;
}
let dominant_language = dominant_key(rows.iter().map(|row| row.language.as_str()));
format!("{dominant_language} files")
}
fn common_parent(paths: &[&str]) -> String {
let mut iter = paths.iter();
let Some(first) = iter.next() else {
return String::new();
};
let mut common = parent_components(first);
for path in iter {
let components = parent_components(path);
let keep =
common.iter().zip(components.iter()).take_while(|(left, right)| left == right).count();
common.truncate(keep);
}
common.join("/")
}
fn dominant_key<'a>(values: impl Iterator<Item = &'a str>) -> String {
let mut counts = BTreeMap::<&str, u64>::new();
for value in values {
*counts.entry(value).or_default() += 1;
}
counts
.into_iter()
.max_by(|(left_key, left_count), (right_key, right_count)| {
left_count.cmp(right_count).then_with(|| right_key.cmp(left_key))
})
.map(|(value, _)| value.to_string())
.unwrap_or_else(|| "source".to_string())
}
fn path_similarity(left: &str, right: &str) -> f64 {
let left_components = path_components(left);
let right_components = path_components(right);
let common_prefix = left_components
.iter()
.zip(right_components.iter())
.take_while(|(left, right)| left == right)
.count();
let prefix = if left_components.is_empty() && right_components.is_empty() {
0.0
} else {
common_prefix as f64 / left_components.len().max(right_components.len()) as f64
};
let left_tokens = path_tokens(left);
let right_tokens = path_tokens(right);
let union = left_tokens.union(&right_tokens).count();
let token_overlap = if union == 0 {
0.0
} else {
left_tokens.intersection(&right_tokens).count() as f64 / union as f64
};
prefix * 0.65 + token_overlap * 0.35
}
fn churn_similarity(left: &FileBriefRow, right: &FileBriefRow) -> f64 {
let left_churn = left.additions.saturating_add(left.deletions);
let right_churn = right.additions.saturating_add(right.deletions);
let max_churn = left_churn.max(right_churn);
let churn = if max_churn == 0 {
0.0
} else {
1.0 - (left_churn.abs_diff(right_churn) as f64 / max_churn as f64)
};
let max_touches = left.commit_touch_count.max(right.commit_touch_count);
let touches = if max_touches == 0 {
0.0
} else {
1.0 - (left.commit_touch_count.abs_diff(right.commit_touch_count) as f64
/ max_touches as f64)
};
(churn + touches) * 0.5
}
fn path_bucket(path: &str) -> String {
let mut components = parent_components(path);
components.truncate(5);
match components.as_slice() {
[] => ".".to_string(),
[one] => one.clone(),
_ => components.join("/"),
}
}
fn indexed_path_buckets(rows: &[FileBriefRow]) -> BTreeMap<String, Vec<usize>> {
let mut buckets = BTreeMap::<String, Vec<usize>>::new();
for (index, row) in rows.iter().enumerate() {
buckets.entry(path_bucket(&row.path)).or_default().push(index);
}
buckets
}
fn parent_components(path: &str) -> Vec<String> {
let mut components = path_components(path);
components.pop();
components
}
fn path_components(path: &str) -> Vec<String> {
path.split('/').filter(|part| !part.is_empty()).map(str::to_string).collect()
}
fn path_tokens(path: &str) -> BTreeSet<String> {
path.split(|ch: char| !ch.is_ascii_alphanumeric())
.filter(|part| part.len() > 1)
.map(|part| part.to_ascii_lowercase())
.collect()
}
fn path_index(rows: &[FileBriefRow]) -> HashMap<&str, usize> {
rows.iter().enumerate().map(|(index, row)| (row.path.as_str(), index)).collect()
}
fn capped(value: f64) -> f64 {
value.clamp(0.0, 1.0)
}
#[derive(Debug)]
struct UnionFind {
parent: Vec<usize>,
rank: Vec<u8>,
}
impl UnionFind {
fn new(len: usize) -> Self {
Self { parent: (0..len).collect(), rank: vec![0; len] }
}
fn find(&mut self, value: usize) -> usize {
if self.parent[value] != value {
self.parent[value] = self.find(self.parent[value]);
}
self.parent[value]
}
fn union(&mut self, left: usize, right: usize) {
let left_root = self.find(left);
let right_root = self.find(right);
if left_root == right_root {
return;
}
if self.rank[left_root] < self.rank[right_root] {
self.parent[left_root] = right_root;
} else if self.rank[left_root] > self.rank[right_root] {
self.parent[right_root] = left_root;
} else {
self.parent[right_root] = left_root;
self.rank[left_root] += 1;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn path_similarity_prefers_nearby_files() {
let nearby = path_similarity("src/actors/sync/actor.rs", "src/actors/sync/msg.rs");
let distant = path_similarity("src/actors/sync/actor.rs", "apps/mobile/src/App.tsx");
assert!(nearby > distant, "nearby={nearby} distant={distant}");
}
#[test]
fn cluster_rows_uses_cotouch_edges() {
let rows =
vec![row("src/sync/actor.rs"), row("src/sync/msg.rs"), row("apps/mobile/App.tsx")];
let co_touch_pairs = BTreeMap::from([(PathPair::new(0, 1), 3)]);
let graph_pairs = BTreeMap::new();
let clusters = cluster_rows(&rows, &co_touch_pairs, &graph_pairs, 2);
assert_eq!(clusters.len(), 1);
assert_eq!(clusters[0].rows.len(), 2);
assert_eq!(clusters[0].co_touch_edges, 3);
}
fn row(path: &str) -> FileBriefRow {
FileBriefRow {
path: path.to_string(),
language: "rust".to_string(),
kind: "source".to_string(),
generated: false,
line_count: 10,
chunk_count: 1,
symbol_count: 1,
fan_in: 0,
fan_out: 0,
commit_touch_count: 1,
recent_touch_count: 0,
additions: 10,
deletions: 0,
github_ref_count: 0,
symbol_kinds: BTreeMap::new(),
memories: RepoBriefMemoryCounts::default(),
}
}
}