1use std::collections::{BTreeMap, BTreeSet, HashMap};
2
3use rusqlite::Connection;
4use serde::Serialize;
5
6use crate::query::repo_brief::{
7 self, FileBriefRow, RepoBriefMemoryCounts, enrich_memory_counts, enrich_symbol_kinds,
8};
9
10const MAX_PATH_BUCKET_FILES: usize = 120;
11const MAX_COTOUCH_COMMIT_FILES: usize = 40;
12const MAX_CLUSTER_PATHS: usize = 8;
13const MIN_EDGE_SCORE: f64 = 0.34;
14
15#[derive(Debug, Clone, Copy)]
16pub struct RepoClustersOptions {
17 pub limit: u32,
18 pub include_generated: bool,
19 pub include_memories: bool,
20 pub min_cluster_size: u32,
21}
22
23impl Default for RepoClustersOptions {
24 fn default() -> Self {
25 Self { limit: 10, include_generated: false, include_memories: true, min_cluster_size: 2 }
26 }
27}
28
29#[derive(Debug, Serialize)]
30pub struct RepoClustersReport {
31 pub summary: RepoClustersSummary,
32 pub clusters: Vec<RepoCluster>,
33 pub warnings: Vec<String>,
34}
35
36#[derive(Debug, Serialize)]
37pub struct RepoClustersSummary {
38 pub total_files: u64,
39 pub clustered_files: u64,
40 pub singleton_files: u64,
41 pub returned_clusters: u64,
42 pub generated_files: u64,
43 pub git_commits: u64,
44 pub git_file_changes: u64,
45 pub graph_edges: u64,
46 pub repo_memories: RepoBriefMemoryCounts,
47 pub scoring_note: String,
48}
49
50#[derive(Debug, Serialize)]
51pub struct RepoCluster {
52 pub cluster_id: String,
53 pub name: String,
54 pub category: String,
55 pub confidence: String,
56 pub score: f64,
57 pub metrics: RepoClusterMetrics,
58 pub representative_paths: Vec<String>,
59 pub why: Vec<String>,
60 pub next_tools: Vec<RepoClusterNextTool>,
61}
62
63#[derive(Debug, Default, Clone, Serialize)]
64pub struct RepoClusterMetrics {
65 pub file_count: u64,
66 pub total_lines: u64,
67 pub total_chunks: u64,
68 pub total_symbols: u64,
69 pub fan_in: u64,
70 pub fan_out: u64,
71 pub commit_touch_count: u64,
72 pub recent_touch_count: u64,
73 pub additions: u64,
74 pub deletions: u64,
75 pub co_touch_edges: u64,
76 pub graph_edges: u64,
77 pub languages: BTreeMap<String, u64>,
78 pub kinds: BTreeMap<String, u64>,
79 pub memories: RepoBriefMemoryCounts,
80}
81
82#[derive(Debug, Serialize)]
83pub struct RepoClusterNextTool {
84 pub tool: String,
85 pub reason: String,
86 pub args: BTreeMap<String, serde_json::Value>,
87}
88
89#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
90struct PathPair(usize, usize);
91
92impl PathPair {
93 fn new(left: usize, right: usize) -> Self {
94 if left <= right { Self(left, right) } else { Self(right, left) }
95 }
96}
97
98#[derive(Debug, Default, Clone)]
99struct PairEvidence {
100 co_touch_count: u64,
101 graph_edge_count: u64,
102 path_candidate: bool,
103}
104
105#[derive(Debug, Clone)]
106struct ScoredPair {
107 pair: PathPair,
108 score: f64,
109}
110
111#[derive(Debug, Clone)]
112struct ClusterBuild {
113 rows: Vec<FileBriefRow>,
114 avg_edge_score: f64,
115 co_touch_edges: u64,
116 graph_edges: u64,
117}
118
119pub fn repo_clusters(
120 conn: &Connection,
121 options: RepoClustersOptions,
122) -> anyhow::Result<RepoClustersReport> {
123 let summary = repo_brief::summary_counts(conn)?;
124 let mut warnings = Vec::new();
125 if summary.git_commits == 0 {
126 warnings.push("git history is not indexed; co-touch clustering is unavailable".to_string());
127 }
128 if summary.graph_edges == 0 {
129 warnings.push(
130 "graph edges are not indexed; graph-neighborhood clustering is unavailable".to_string(),
131 );
132 }
133
134 let mut rows = repo_brief::file_rows(conn, options.include_generated)?;
135 if options.include_memories {
136 enrich_memory_counts(conn, &mut rows)?;
137 }
138 enrich_symbol_kinds(conn, &mut rows)?;
139
140 let co_touch_pairs = co_touch_pairs(conn, options.include_generated, &rows)?;
141 let graph_pairs = graph_pairs(conn, options.include_generated, &rows)?;
142 let clusters = cluster_rows(&rows, &co_touch_pairs, &graph_pairs, options.min_cluster_size);
143 let clustered_files = clusters.iter().map(|cluster| cluster.rows.len() as u64).sum::<u64>();
144 let mut clusters = clusters.into_iter().map(cluster_for).collect::<Vec<_>>();
145 clusters.sort_by(|a, b| {
146 b.score
147 .total_cmp(&a.score)
148 .then_with(|| b.metrics.file_count.cmp(&a.metrics.file_count))
149 .then_with(|| a.name.cmp(&b.name))
150 });
151 for (index, cluster) in clusters.iter_mut().enumerate() {
152 cluster.cluster_id = format!("cluster_{:03}", index + 1);
153 }
154
155 let limit = usize::try_from(options.limit.max(1)).unwrap_or(usize::MAX);
156 clusters.truncate(limit);
157 let singleton_files = rows.len() as u64 - clustered_files;
158
159 Ok(RepoClustersReport {
160 summary: RepoClustersSummary {
161 total_files: summary.total_files,
162 clustered_files,
163 singleton_files,
164 returned_clusters: clusters.len() as u64,
165 generated_files: summary.generated_files,
166 git_commits: summary.git_commits,
167 git_file_changes: summary.git_file_changes,
168 graph_edges: summary.graph_edges,
169 repo_memories: summary.memories,
170 scoring_note:
171 "cheap sparse ownership clustering: path proximity, direct graph edges, bounded git co-touches, churn, and metadata"
172 .to_string(),
173 },
174 clusters,
175 warnings,
176 })
177}
178
179fn cluster_rows(
180 rows: &[FileBriefRow],
181 co_touch_pairs: &BTreeMap<PathPair, u64>,
182 graph_pairs: &BTreeMap<PathPair, u64>,
183 min_cluster_size: u32,
184) -> Vec<ClusterBuild> {
185 if rows.is_empty() {
186 return Vec::new();
187 }
188
189 let candidates = candidate_pairs(rows, co_touch_pairs, graph_pairs);
190 let mut parent = UnionFind::new(rows.len());
191 let mut accepted_pairs = Vec::new();
192 for pair in candidates {
193 if pair.score >= MIN_EDGE_SCORE {
194 parent.union(pair.pair.0, pair.pair.1);
195 accepted_pairs.push(pair);
196 }
197 }
198
199 let mut groups = BTreeMap::<usize, Vec<usize>>::new();
200 for index in 0..rows.len() {
201 groups.entry(parent.find(index)).or_default().push(index);
202 }
203
204 let min_cluster_size = usize::try_from(min_cluster_size.max(2)).unwrap_or(usize::MAX);
205 let mut clusters = Vec::new();
206 for indexes in groups.values().filter(|indexes| indexes.len() >= min_cluster_size) {
207 let index_set = indexes.iter().copied().collect::<BTreeSet<_>>();
208 let mut edge_score_sum = 0.0;
209 let mut edge_count = 0_u64;
210 let mut co_touch_edges = 0_u64;
211 let mut graph_edges = 0_u64;
212 for pair in &accepted_pairs {
213 if index_set.contains(&pair.pair.0) && index_set.contains(&pair.pair.1) {
214 edge_score_sum += pair.score;
215 edge_count += 1;
216 co_touch_edges += co_touch_pairs.get(&pair.pair).copied().unwrap_or(0);
217 graph_edges += graph_pairs.get(&pair.pair).copied().unwrap_or(0);
218 }
219 }
220 let avg_edge_score = if edge_count == 0 { 0.0 } else { edge_score_sum / edge_count as f64 };
221 clusters.push(ClusterBuild {
222 rows: indexes.iter().map(|index| rows[*index].clone()).collect(),
223 avg_edge_score,
224 co_touch_edges,
225 graph_edges,
226 });
227 }
228 clusters
229}
230
231fn candidate_pairs(
232 rows: &[FileBriefRow],
233 co_touch_pairs: &BTreeMap<PathPair, u64>,
234 graph_pairs: &BTreeMap<PathPair, u64>,
235) -> Vec<ScoredPair> {
236 let mut evidence = BTreeMap::<PathPair, PairEvidence>::new();
237 for (pair, count) in co_touch_pairs {
238 evidence.entry(*pair).or_default().co_touch_count = *count;
239 }
240 for (pair, count) in graph_pairs {
241 evidence.entry(*pair).or_default().graph_edge_count = *count;
242 }
243 for pair in path_candidate_pairs(rows) {
244 evidence.entry(pair).or_default().path_candidate = true;
245 }
246
247 evidence
248 .into_iter()
249 .filter_map(|(pair, evidence)| {
250 let score = pair_score(&rows[pair.0], &rows[pair.1], &evidence);
251 (score > 0.0).then_some(ScoredPair { pair, score })
252 })
253 .collect()
254}
255
256fn pair_score(left: &FileBriefRow, right: &FileBriefRow, evidence: &PairEvidence) -> f64 {
257 let path = path_similarity(&left.path, &right.path);
258 let same_bucket = path_bucket(&left.path) == path_bucket(&right.path);
259 if !same_bucket {
260 return 0.0;
261 }
262 let co_touch = capped(evidence.co_touch_count as f64 / 4.0);
263 let graph = capped(evidence.graph_edge_count as f64 / 6.0);
264 let same_language = if left.language == right.language { 1.0 } else { 0.0 };
265 let same_kind = if left.kind == right.kind { 1.0 } else { 0.0 };
266 let churn = churn_similarity(left, right);
267 let path_weight = if evidence.path_candidate || same_bucket { 0.43 } else { 0.25 };
268 capped(
269 path * path_weight
270 + co_touch * 0.24
271 + graph * 0.22
272 + same_language * 0.06
273 + same_kind * 0.03
274 + churn * 0.02,
275 )
276}
277
278fn co_touch_pairs(
279 conn: &Connection,
280 include_generated: bool,
281 rows: &[FileBriefRow],
282) -> anyhow::Result<BTreeMap<PathPair, u64>> {
283 let path_index = path_index(rows);
284 let mut stmt = conn.prepare(
285 "
286 SELECT git_file_changes.commit_hash, git_file_changes.path
287 FROM git_file_changes
288 JOIN files ON files.path = git_file_changes.path
289 WHERE (?1 OR files.generated = 0)
290 ORDER BY git_file_changes.commit_hash, git_file_changes.path
291 ",
292 )?;
293 let mut query_rows = stmt.query([include_generated])?;
294 let mut pairs = BTreeMap::<PathPair, u64>::new();
295 let mut current_commit = String::new();
296 let mut paths = Vec::new();
297 while let Some(row) = query_rows.next()? {
298 let commit = row.get::<_, String>(0)?;
299 let path = row.get::<_, String>(1)?;
300 if current_commit.is_empty() {
301 current_commit = commit.clone();
302 }
303 if commit != current_commit {
304 add_co_touch_pairs(&path_index, &paths, &mut pairs);
305 paths.clear();
306 current_commit = commit;
307 }
308 paths.push(path);
309 }
310 add_co_touch_pairs(&path_index, &paths, &mut pairs);
311 Ok(pairs)
312}
313
314fn add_co_touch_pairs(
315 path_index: &HashMap<&str, usize>,
316 paths: &[String],
317 pairs: &mut BTreeMap<PathPair, u64>,
318) {
319 if paths.len() < 2 || paths.len() > MAX_COTOUCH_COMMIT_FILES {
320 return;
321 }
322 let mut indexes =
323 paths.iter().filter_map(|path| path_index.get(path.as_str()).copied()).collect::<Vec<_>>();
324 indexes.sort_unstable();
325 indexes.dedup();
326 for left in 0..indexes.len() {
327 for right in (left + 1)..indexes.len() {
328 *pairs.entry(PathPair::new(indexes[left], indexes[right])).or_default() += 1;
329 }
330 }
331}
332
333fn graph_pairs(
334 conn: &Connection,
335 include_generated: bool,
336 rows: &[FileBriefRow],
337) -> anyhow::Result<BTreeMap<PathPair, u64>> {
338 let path_index = path_index(rows);
339 let bucket_by_path = rows
340 .iter()
341 .map(|row| (row.path.as_str(), path_bucket(&row.path)))
342 .collect::<HashMap<_, _>>();
343 let mut stmt = conn.prepare(
344 "
345 SELECT source_files.path, target_files.path
346 FROM edges
347 JOIN files source_files ON source_files.id = edges.source_file_id
348 JOIN symbols target_symbols ON target_symbols.id = edges.to_symbol_id
349 JOIN files target_files ON target_files.id = target_symbols.file_id
350 WHERE source_files.path != target_files.path
351 AND (?1 OR (source_files.generated = 0 AND target_files.generated = 0))
352 ",
353 )?;
354 let mut pairs = BTreeMap::<PathPair, u64>::new();
355 let rows = stmt.query_map([include_generated], |row| {
356 Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
357 })?;
358 for row in rows {
359 let (left, right) = row?;
360 let (Some(left_bucket), Some(right_bucket)) =
361 (bucket_by_path.get(left.as_str()), bucket_by_path.get(right.as_str()))
362 else {
363 continue;
364 };
365 if left_bucket != right_bucket {
366 continue;
367 }
368 let (Some(left), Some(right)) =
369 (path_index.get(left.as_str()), path_index.get(right.as_str()))
370 else {
371 continue;
372 };
373 *pairs.entry(PathPair::new(*left, *right)).or_default() += 1;
374 }
375 Ok(pairs)
376}
377
378fn path_candidate_pairs(rows: &[FileBriefRow]) -> Vec<PathPair> {
379 let buckets = indexed_path_buckets(rows);
380 let mut pairs = Vec::new();
381 for indexes in buckets.values() {
382 if indexes.len() < 2 || indexes.len() > MAX_PATH_BUCKET_FILES {
383 continue;
384 }
385 for left in 0..indexes.len() {
386 for right in (left + 1)..indexes.len() {
387 pairs.push(PathPair::new(indexes[left], indexes[right]));
388 }
389 }
390 }
391 pairs
392}
393
394fn cluster_for(cluster: ClusterBuild) -> RepoCluster {
395 let mut metrics = RepoClusterMetrics::default();
396 let mut rows = cluster.rows;
397 rows.sort_by(|a, b| {
398 file_representative_score(b)
399 .total_cmp(&file_representative_score(a))
400 .then_with(|| a.path.cmp(&b.path))
401 });
402
403 for row in &rows {
404 metrics.file_count += 1;
405 metrics.total_lines += row.line_count;
406 metrics.total_chunks += row.chunk_count;
407 metrics.total_symbols += row.symbol_count;
408 metrics.fan_in += row.fan_in;
409 metrics.fan_out += row.fan_out;
410 metrics.commit_touch_count += row.commit_touch_count;
411 metrics.recent_touch_count += row.recent_touch_count;
412 metrics.additions += row.additions;
413 metrics.deletions += row.deletions;
414 *metrics.languages.entry(row.language.clone()).or_default() += 1;
415 *metrics.kinds.entry(row.kind.clone()).or_default() += 1;
416 metrics.memories.active += row.memories.active;
417 metrics.memories.stale += row.memories.stale;
418 metrics.memories.obsolete += row.memories.obsolete;
419 metrics.memories.other += row.memories.other;
420 }
421 metrics.co_touch_edges = cluster.co_touch_edges;
422 metrics.graph_edges = cluster.graph_edges;
423
424 let representative_paths =
425 rows.iter().take(MAX_CLUSTER_PATHS).map(|row| row.path.clone()).collect::<Vec<_>>();
426 let name = cluster_name(&rows);
427 let score = cluster_score(&metrics, cluster.avg_edge_score);
428 let category = cluster_category(&metrics).to_string();
429 let confidence = confidence_for(&metrics, cluster.avg_edge_score).to_string();
430 let why = why_for(&metrics, cluster.avg_edge_score, &name);
431 let next_tools = next_tools_for(representative_paths.first());
432
433 RepoCluster {
434 cluster_id: String::new(),
435 name,
436 category,
437 confidence,
438 score,
439 metrics,
440 representative_paths,
441 why,
442 next_tools,
443 }
444}
445
446fn file_representative_score(row: &FileBriefRow) -> f64 {
447 capped((row.fan_in + row.fan_out) as f64 / 50.0) * 0.40
448 + capped(row.symbol_count as f64 / 40.0) * 0.25
449 + capped(row.commit_touch_count as f64 / 25.0) * 0.25
450 + capped(row.line_count as f64 / 800.0) * 0.10
451}
452
453fn cluster_score(metrics: &RepoClusterMetrics, avg_edge_score: f64) -> f64 {
454 capped(
455 avg_edge_score * 0.35
456 + capped(metrics.file_count as f64 / 12.0) * 0.12
457 + capped((metrics.fan_in + metrics.fan_out) as f64 / 100.0) * 0.18
458 + capped(metrics.commit_touch_count as f64 / 60.0) * 0.15
459 + capped(metrics.total_symbols as f64 / 120.0) * 0.12
460 + capped((metrics.co_touch_edges + metrics.graph_edges) as f64 / 20.0) * 0.08,
461 )
462}
463
464fn cluster_category(metrics: &RepoClusterMetrics) -> &'static str {
465 if metrics.co_touch_edges > 0 && metrics.graph_edges > 0 {
466 "graph_and_churn_ownership"
467 } else if metrics.co_touch_edges > 0 {
468 "co_touch_ownership"
469 } else if metrics.graph_edges > 0 {
470 "graph_neighborhood"
471 } else {
472 "path_proximity"
473 }
474}
475
476fn confidence_for(metrics: &RepoClusterMetrics, avg_edge_score: f64) -> &'static str {
477 if (metrics.co_touch_edges > 0 && metrics.graph_edges > 0) || avg_edge_score >= 0.55 {
478 "high"
479 } else if metrics.co_touch_edges > 0 || metrics.graph_edges > 0 || avg_edge_score >= 0.42 {
480 "medium"
481 } else {
482 "low"
483 }
484}
485
486fn why_for(metrics: &RepoClusterMetrics, avg_edge_score: f64, name: &str) -> Vec<String> {
487 let mut why = vec![format!("shared ownership bucket: {name}")];
488 if metrics.graph_edges > 0 {
489 why.push(format!(
490 "graph neighborhood: {} direct indexed cross-file edges",
491 metrics.graph_edges
492 ));
493 }
494 if metrics.co_touch_edges > 0 {
495 why.push(format!(
496 "co-touch history: {} bounded same-commit file-pair hits",
497 metrics.co_touch_edges
498 ));
499 }
500 if metrics.commit_touch_count > 0 {
501 why.push(format!(
502 "churn: {} total commit touches, {} recent",
503 metrics.commit_touch_count, metrics.recent_touch_count
504 ));
505 }
506 if metrics.total_symbols > 0 {
507 why.push(format!(
508 "source shape: {} files, {} symbols, {} lines",
509 metrics.file_count, metrics.total_symbols, metrics.total_lines
510 ));
511 }
512 why.push(format!("average accepted edge score: {:.3}", avg_edge_score));
513 why
514}
515
516fn next_tools_for(path: Option<&String>) -> Vec<RepoClusterNextTool> {
517 let Some(path) = path else {
518 return Vec::new();
519 };
520 vec![
521 next_tool(
522 "repo_brief",
523 "compare cluster representatives against spine/churn/god-module rankings",
524 [("mode", "refactor_candidates".to_string())],
525 ),
526 next_tool(
527 "impact_surface",
528 "inspect graph, git, papertrail, and memories for the representative path",
529 [("query", path.clone())],
530 ),
531 next_tool(
532 "git_history_for_path",
533 "inspect co-touch and churn history",
534 [("path", path.clone())],
535 ),
536 ]
537}
538
539fn next_tool<const N: usize>(
540 tool: &str,
541 reason: &str,
542 args: [(&str, String); N],
543) -> RepoClusterNextTool {
544 RepoClusterNextTool {
545 tool: tool.to_string(),
546 reason: reason.to_string(),
547 args: args
548 .into_iter()
549 .map(|(key, value)| (key.to_string(), serde_json::Value::String(value)))
550 .collect(),
551 }
552}
553
554fn cluster_name(rows: &[FileBriefRow]) -> String {
555 let paths = rows.iter().map(|row| row.path.as_str()).collect::<Vec<_>>();
556 let common = common_parent(&paths);
557 if !common.is_empty() {
558 return common;
559 }
560 let dominant_language = dominant_key(rows.iter().map(|row| row.language.as_str()));
561 format!("{dominant_language} files")
562}
563
564fn common_parent(paths: &[&str]) -> String {
565 let mut iter = paths.iter();
566 let Some(first) = iter.next() else {
567 return String::new();
568 };
569 let mut common = parent_components(first);
570 for path in iter {
571 let components = parent_components(path);
572 let keep =
573 common.iter().zip(components.iter()).take_while(|(left, right)| left == right).count();
574 common.truncate(keep);
575 }
576 common.join("/")
577}
578
579fn dominant_key<'a>(values: impl Iterator<Item = &'a str>) -> String {
580 let mut counts = BTreeMap::<&str, u64>::new();
581 for value in values {
582 *counts.entry(value).or_default() += 1;
583 }
584 counts
585 .into_iter()
586 .max_by(|(left_key, left_count), (right_key, right_count)| {
587 left_count.cmp(right_count).then_with(|| right_key.cmp(left_key))
588 })
589 .map(|(value, _)| value.to_string())
590 .unwrap_or_else(|| "source".to_string())
591}
592
593fn path_similarity(left: &str, right: &str) -> f64 {
594 let left_components = path_components(left);
595 let right_components = path_components(right);
596 let common_prefix = left_components
597 .iter()
598 .zip(right_components.iter())
599 .take_while(|(left, right)| left == right)
600 .count();
601 let prefix = if left_components.is_empty() && right_components.is_empty() {
602 0.0
603 } else {
604 common_prefix as f64 / left_components.len().max(right_components.len()) as f64
605 };
606 let left_tokens = path_tokens(left);
607 let right_tokens = path_tokens(right);
608 let union = left_tokens.union(&right_tokens).count();
609 let token_overlap = if union == 0 {
610 0.0
611 } else {
612 left_tokens.intersection(&right_tokens).count() as f64 / union as f64
613 };
614 prefix * 0.65 + token_overlap * 0.35
615}
616
617fn churn_similarity(left: &FileBriefRow, right: &FileBriefRow) -> f64 {
618 let left_churn = left.additions.saturating_add(left.deletions);
619 let right_churn = right.additions.saturating_add(right.deletions);
620 let max_churn = left_churn.max(right_churn);
621 let churn = if max_churn == 0 {
622 0.0
623 } else {
624 1.0 - (left_churn.abs_diff(right_churn) as f64 / max_churn as f64)
625 };
626 let max_touches = left.commit_touch_count.max(right.commit_touch_count);
627 let touches = if max_touches == 0 {
628 0.0
629 } else {
630 1.0 - (left.commit_touch_count.abs_diff(right.commit_touch_count) as f64
631 / max_touches as f64)
632 };
633 (churn + touches) * 0.5
634}
635
636fn path_bucket(path: &str) -> String {
637 let mut components = parent_components(path);
638 components.truncate(5);
639 match components.as_slice() {
640 [] => ".".to_string(),
641 [one] => one.clone(),
642 _ => components.join("/"),
643 }
644}
645
646fn indexed_path_buckets(rows: &[FileBriefRow]) -> BTreeMap<String, Vec<usize>> {
647 let mut buckets = BTreeMap::<String, Vec<usize>>::new();
648 for (index, row) in rows.iter().enumerate() {
649 buckets.entry(path_bucket(&row.path)).or_default().push(index);
650 }
651 buckets
652}
653
654fn parent_components(path: &str) -> Vec<String> {
655 let mut components = path_components(path);
656 components.pop();
657 components
658}
659
660fn path_components(path: &str) -> Vec<String> {
661 path.split('/').filter(|part| !part.is_empty()).map(str::to_string).collect()
662}
663
664fn path_tokens(path: &str) -> BTreeSet<String> {
665 path.split(|ch: char| !ch.is_ascii_alphanumeric())
666 .filter(|part| part.len() > 1)
667 .map(|part| part.to_ascii_lowercase())
668 .collect()
669}
670
671fn path_index(rows: &[FileBriefRow]) -> HashMap<&str, usize> {
672 rows.iter().enumerate().map(|(index, row)| (row.path.as_str(), index)).collect()
673}
674
675fn capped(value: f64) -> f64 {
676 value.clamp(0.0, 1.0)
677}
678
679#[derive(Debug)]
680struct UnionFind {
681 parent: Vec<usize>,
682 rank: Vec<u8>,
683}
684
685impl UnionFind {
686 fn new(len: usize) -> Self {
687 Self { parent: (0..len).collect(), rank: vec![0; len] }
688 }
689
690 fn find(&mut self, value: usize) -> usize {
691 if self.parent[value] != value {
692 self.parent[value] = self.find(self.parent[value]);
693 }
694 self.parent[value]
695 }
696
697 fn union(&mut self, left: usize, right: usize) {
698 let left_root = self.find(left);
699 let right_root = self.find(right);
700 if left_root == right_root {
701 return;
702 }
703 if self.rank[left_root] < self.rank[right_root] {
704 self.parent[left_root] = right_root;
705 } else if self.rank[left_root] > self.rank[right_root] {
706 self.parent[right_root] = left_root;
707 } else {
708 self.parent[right_root] = left_root;
709 self.rank[left_root] += 1;
710 }
711 }
712}
713
714#[cfg(test)]
715mod tests {
716 use super::*;
717
718 #[test]
719 fn path_similarity_prefers_nearby_files() {
720 let nearby = path_similarity("src/actors/sync/actor.rs", "src/actors/sync/msg.rs");
721 let distant = path_similarity("src/actors/sync/actor.rs", "apps/mobile/src/App.tsx");
722 assert!(nearby > distant, "nearby={nearby} distant={distant}");
723 }
724
725 #[test]
726 fn cluster_rows_uses_cotouch_edges() {
727 let rows =
728 vec![row("src/sync/actor.rs"), row("src/sync/msg.rs"), row("apps/mobile/App.tsx")];
729 let co_touch_pairs = BTreeMap::from([(PathPair::new(0, 1), 3)]);
730 let graph_pairs = BTreeMap::new();
731
732 let clusters = cluster_rows(&rows, &co_touch_pairs, &graph_pairs, 2);
733
734 assert_eq!(clusters.len(), 1);
735 assert_eq!(clusters[0].rows.len(), 2);
736 assert_eq!(clusters[0].co_touch_edges, 3);
737 }
738
739 fn row(path: &str) -> FileBriefRow {
740 FileBriefRow {
741 path: path.to_string(),
742 language: "rust".to_string(),
743 kind: "source".to_string(),
744 generated: false,
745 line_count: 10,
746 chunk_count: 1,
747 symbol_count: 1,
748 fan_in: 0,
749 fan_out: 0,
750 commit_touch_count: 1,
751 recent_touch_count: 0,
752 additions: 10,
753 deletions: 0,
754 github_ref_count: 0,
755 symbol_kinds: BTreeMap::new(),
756 memories: RepoBriefMemoryCounts::default(),
757 }
758 }
759}