1use std::collections::{HashMap, HashSet};
9
10use serde::{Deserialize, Serialize};
11
12pub const MIN_COCHANGE_COUNT: u32 = 5;
16
17#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
19pub struct Cluster {
20 pub id: String,
22 pub label: String,
24 pub members: Vec<String>,
26 pub cohesion: f32,
28 pub centroid: String,
30 pub size: u32,
32}
33
34#[derive(Debug, Clone, Default, Serialize, Deserialize)]
36pub struct ClusterIndex {
37 pub clusters: Vec<Cluster>,
39 pub total: u32,
41 pub clustered_files: u32,
43 pub isolated_files: u32,
45}
46
47impl ClusterIndex {
48 pub fn compute(co_change_pairs: &[(String, String, u32)], total_files: usize) -> Self {
56 let strong_pairs: Vec<&(String, String, u32)> = co_change_pairs
58 .iter()
59 .filter(|(_, _, count)| *count >= MIN_COCHANGE_COUNT)
60 .collect();
61
62 if strong_pairs.is_empty() {
63 return ClusterIndex {
64 clusters: vec![],
65 total: 0,
66 clustered_files: 0,
67 isolated_files: total_files as u32,
68 };
69 }
70
71 let mut all_nodes: HashSet<&str> = HashSet::new();
73 for (a, b, _) in &strong_pairs {
74 all_nodes.insert(a.as_str());
75 all_nodes.insert(b.as_str());
76 }
77
78 let node_list: Vec<&str> = all_nodes.into_iter().collect();
80 let node_to_idx: HashMap<&str, usize> =
81 node_list.iter().enumerate().map(|(i, &n)| (n, i)).collect();
82
83 let mut parent: Vec<usize> = (0..node_list.len()).collect();
85 let mut rank: Vec<usize> = vec![0; node_list.len()];
86
87 fn find(parent: &mut [usize], x: usize) -> usize {
88 if parent[x] != x {
89 parent[x] = find(parent, parent[x]);
90 }
91 parent[x]
92 }
93
94 fn union(parent: &mut [usize], rank: &mut [usize], a: usize, b: usize) {
95 let ra = find(parent, a);
96 let rb = find(parent, b);
97 if ra == rb {
98 return;
99 }
100 if rank[ra] < rank[rb] {
101 parent[ra] = rb;
102 } else if rank[ra] > rank[rb] {
103 parent[rb] = ra;
104 } else {
105 parent[rb] = ra;
106 rank[ra] += 1;
107 }
108 }
109
110 for (a, b, _) in &strong_pairs {
111 let ia = node_to_idx[a.as_str()];
112 let ib = node_to_idx[b.as_str()];
113 union(&mut parent, &mut rank, ia, ib);
114 }
115
116 let mut components: HashMap<usize, Vec<&str>> = HashMap::new();
118 for (i, &node) in node_list.iter().enumerate() {
119 let root = find(&mut parent, i);
120 components.entry(root).or_default().push(node);
121 }
122
123 let edge_set: HashSet<(&str, &str)> = strong_pairs
125 .iter()
126 .map(|(a, b, _)| (a.as_str(), b.as_str()))
127 .collect();
128
129 let mut clusters: Vec<Cluster> = components
131 .into_values()
132 .filter(|members| members.len() >= 2)
133 .map(|mut members| {
134 members.sort();
135 let member_set: HashSet<&str> = members.iter().copied().collect();
136 let n = members.len();
137
138 let mut intra_edges = 0u32;
140 let mut degree: HashMap<&str, u32> = HashMap::new();
141 for &(a, b) in &edge_set {
142 if member_set.contains(a) && member_set.contains(b) {
143 intra_edges += 1;
144 *degree.entry(a).or_default() += 1;
145 *degree.entry(b).or_default() += 1;
146 }
147 }
148
149 let max_edges = (n * (n - 1) / 2) as f32;
151 let cohesion = if max_edges > 0.0 {
152 intra_edges as f32 / max_edges
153 } else {
154 0.0
155 };
156
157 let centroid = members
159 .iter()
160 .max_by_key(|&&m| degree.get(m).copied().unwrap_or(0))
161 .copied()
162 .unwrap_or(members[0]);
163
164 let label = compute_label(&members, centroid);
165 let id = stem(centroid);
166 let centroid_owned = centroid.to_string();
167
168 Cluster {
169 id,
170 label,
171 members: members.into_iter().map(String::from).collect(),
172 cohesion,
173 centroid: centroid_owned,
174 size: n as u32,
175 }
176 })
177 .collect();
178
179 clusters.sort_by(|a, b| b.size.cmp(&a.size).then_with(|| a.label.cmp(&b.label)));
181
182 {
186 let mut label_counts: HashMap<String, u32> = HashMap::new();
187 for cluster in &clusters {
188 *label_counts.entry(cluster.label.clone()).or_insert(0) += 1;
189 }
190
191 let mut seen: HashMap<String, u32> = HashMap::new();
192 for cluster in clusters.iter_mut() {
193 let total = *label_counts.get(&cluster.label).unwrap_or(&0);
194 if total > 1 {
195 let count = seen.entry(cluster.label.clone()).or_insert(0);
196 if *count > 0 {
197 cluster.label = format!("{} ({})", cluster.label, stem(&cluster.centroid));
198 }
199 *count += 1;
200 }
201 }
202 }
203
204 let clustered_files: u32 = clusters.iter().map(|c| c.size).sum();
205 let total = clusters.len() as u32;
206 let isolated_files = total_files.saturating_sub(clustered_files as usize) as u32;
207
208 ClusterIndex {
209 clusters,
210 total,
211 clustered_files,
212 isolated_files,
213 }
214 }
215
216 pub fn cluster_for(&self, file_path: &str) -> Option<&Cluster> {
218 self.clusters
219 .iter()
220 .find(|c| c.members.iter().any(|m| m == file_path))
221 }
222}
223
224fn compute_label(members: &[&str], centroid: &str) -> String {
231 if members.len() < 2 {
232 return stem(centroid);
233 }
234
235 let segments: Vec<Vec<&str>> = members
237 .iter()
238 .map(|m| m.split('/').collect::<Vec<_>>())
239 .collect();
240
241 let min_len = segments.iter().map(|s| s.len()).min().unwrap_or(0);
243 let mut prefix_len = 0;
244 for i in 0..min_len {
245 if segments.iter().all(|s| s[i] == segments[0][i]) {
246 prefix_len = i + 1;
247 } else {
248 break;
249 }
250 }
251
252 if prefix_len >= 2 {
254 segments[0][prefix_len - 1].to_string()
255 } else {
256 stem(centroid)
257 }
258}
259
260fn stem(path: &str) -> String {
262 std::path::Path::new(path)
263 .file_stem()
264 .and_then(|s| s.to_str())
265 .unwrap_or(path)
266 .to_string()
267}
268
269#[cfg(test)]
272mod tests {
273 use super::*;
274
275 fn pairs(data: &[(&str, &str, u32)]) -> Vec<(String, String, u32)> {
276 data.iter()
277 .map(|(a, b, c)| (a.to_string(), b.to_string(), *c))
278 .collect()
279 }
280
281 #[test]
282 fn empty_pairs_produce_empty_index() {
283 let idx = ClusterIndex::compute(&[], 10);
284 assert_eq!(idx.total, 0);
285 assert!(idx.clusters.is_empty());
286 assert_eq!(idx.isolated_files, 10);
287 assert_eq!(idx.clustered_files, 0);
288 }
289
290 #[test]
291 fn pairs_below_threshold_produce_no_clusters() {
292 let p = pairs(&[("src/a.rs", "src/b.rs", 3)]); let idx = ClusterIndex::compute(&p, 5);
294 assert_eq!(idx.total, 0);
295 assert!(idx.clusters.is_empty());
296 }
297
298 #[test]
299 fn triangle_forms_one_cluster_with_full_cohesion() {
300 let p = pairs(&[
301 ("src/a.rs", "src/b.rs", 10),
302 ("src/b.rs", "src/c.rs", 8),
303 ("src/a.rs", "src/c.rs", 7),
304 ]);
305 let idx = ClusterIndex::compute(&p, 5);
306 assert_eq!(idx.total, 1);
307 assert_eq!(idx.clusters[0].size, 3);
308 assert!(
309 (idx.clusters[0].cohesion - 1.0).abs() < f32::EPSILON,
310 "triangle should have cohesion 1.0, got {}",
311 idx.clusters[0].cohesion
312 );
313 }
314
315 #[test]
316 fn two_disjoint_pairs_form_two_clusters() {
317 let p = pairs(&[("src/a.rs", "src/b.rs", 10), ("src/c.rs", "src/d.rs", 8)]);
318 let idx = ClusterIndex::compute(&p, 10);
319 assert_eq!(idx.total, 2);
320 assert_eq!(idx.clusters[0].size, 2);
321 assert_eq!(idx.clusters[1].size, 2);
322 assert_eq!(idx.clustered_files, 4);
323 assert_eq!(idx.isolated_files, 6);
324 }
325
326 #[test]
327 fn chain_of_four_forms_one_cluster_with_partial_cohesion() {
328 let p = pairs(&[
330 ("src/a.rs", "src/b.rs", 10),
331 ("src/b.rs", "src/c.rs", 8),
332 ("src/c.rs", "src/d.rs", 7),
333 ]);
334 let idx = ClusterIndex::compute(&p, 4);
335 assert_eq!(idx.total, 1);
336 assert_eq!(idx.clusters[0].size, 4);
337 let expected_cohesion = 3.0 / 6.0; assert!(
339 (idx.clusters[0].cohesion - expected_cohesion).abs() < f32::EPSILON,
340 "chain of 4 should have cohesion 0.5, got {}",
341 idx.clusters[0].cohesion
342 );
343 }
344
345 #[test]
346 fn edge_below_min_count_excluded() {
347 let p = pairs(&[
348 ("src/a.rs", "src/b.rs", 10), ("src/b.rs", "src/c.rs", 3), ]);
351 let idx = ClusterIndex::compute(&p, 5);
352 assert_eq!(idx.total, 1);
354 assert_eq!(idx.clusters[0].size, 2);
355 assert!(idx.clusters[0].members.contains(&"src/a.rs".to_string()));
356 assert!(idx.clusters[0].members.contains(&"src/b.rs".to_string()));
357 }
358
359 #[test]
360 fn shared_directory_prefix_produces_label() {
361 let p = pairs(&[
362 ("src/auth/session.rs", "src/auth/tokens.rs", 10),
363 ("src/auth/tokens.rs", "src/auth/middleware.rs", 8),
364 ]);
365 let idx = ClusterIndex::compute(&p, 5);
366 assert_eq!(idx.clusters[0].label, "auth");
367 }
368
369 #[test]
370 fn no_shared_prefix_uses_centroid_stem() {
371 let p = pairs(&[
372 ("src/store/record.rs", "src/mcp/tools.rs", 10),
373 ("src/mcp/tools.rs", "src/cli/init.rs", 8),
374 ]);
375 let idx = ClusterIndex::compute(&p, 5);
376 assert_eq!(idx.clusters[0].label, "tools");
379 }
380
381 #[test]
382 fn singleton_not_in_any_cluster() {
383 let p = pairs(&[("src/a.rs", "src/b.rs", 10)]);
384 let idx = ClusterIndex::compute(&p, 5);
385 assert!(idx.cluster_for("src/c.rs").is_none());
386 assert_eq!(idx.isolated_files, 3); }
388
389 #[test]
390 fn centroid_is_highest_degree_member() {
391 let p = pairs(&[
393 ("src/a.rs", "src/b.rs", 10),
394 ("src/a.rs", "src/c.rs", 8),
395 ("src/a.rs", "src/d.rs", 7),
396 ]);
397 let idx = ClusterIndex::compute(&p, 4);
398 assert_eq!(idx.clusters[0].centroid, "src/a.rs");
399 }
400
401 #[test]
402 fn cohesion_triangle_is_one() {
403 let p = pairs(&[
404 ("src/x.rs", "src/y.rs", 10),
405 ("src/y.rs", "src/z.rs", 10),
406 ("src/x.rs", "src/z.rs", 10),
407 ]);
408 let idx = ClusterIndex::compute(&p, 3);
409 assert!((idx.clusters[0].cohesion - 1.0).abs() < f32::EPSILON);
411 }
412
413 #[test]
414 fn cohesion_chain_of_four_is_half() {
415 let p = pairs(&[
416 ("src/a.rs", "src/b.rs", 10),
417 ("src/b.rs", "src/c.rs", 10),
418 ("src/c.rs", "src/d.rs", 10),
419 ]);
420 let idx = ClusterIndex::compute(&p, 4);
421 assert!((idx.clusters[0].cohesion - 0.5).abs() < f32::EPSILON);
423 }
424
425 #[test]
426 fn cluster_for_returns_correct_cluster() {
427 let p = pairs(&[("src/a.rs", "src/b.rs", 10), ("src/c.rs", "src/d.rs", 8)]);
428 let idx = ClusterIndex::compute(&p, 4);
429 let c = idx.cluster_for("src/a.rs").unwrap();
430 assert!(c.members.contains(&"src/a.rs".to_string()));
431 assert!(c.members.contains(&"src/b.rs".to_string()));
432
433 let c2 = idx.cluster_for("src/d.rs").unwrap();
434 assert!(c2.members.contains(&"src/c.rs".to_string()));
435 }
436
437 #[test]
438 fn clusters_sorted_by_size_descending() {
439 let p = pairs(&[
440 ("src/a.rs", "src/b.rs", 10),
442 ("src/b.rs", "src/c.rs", 8),
443 ("src/x.rs", "src/y.rs", 7),
445 ]);
446 let idx = ClusterIndex::compute(&p, 10);
447 assert_eq!(idx.clusters[0].size, 3);
448 assert_eq!(idx.clusters[1].size, 2);
449 }
450
451 #[test]
452 fn serde_roundtrip() {
453 let p = pairs(&[("src/a.rs", "src/b.rs", 10)]);
454 let idx = ClusterIndex::compute(&p, 5);
455 let json = serde_json::to_string(&idx).unwrap();
456 let back: ClusterIndex = serde_json::from_str(&json).unwrap();
457 assert_eq!(idx.clusters.len(), back.clusters.len());
458 assert_eq!(idx.total, back.total);
459 }
460
461 #[test]
464 fn label_disambiguation_two_clusters_same_prefix() {
465 let p = pairs(&[
468 ("src/cli/init.rs", "src/cli/explain.rs", 10),
470 ("src/cli/explain.rs", "src/cli/review.rs", 8),
471 ("src/cli/stats.rs", "src/cli/status.rs", 12),
473 ]);
474 let idx = ClusterIndex::compute(&p, 10);
475 assert_eq!(idx.total, 2);
476 assert_eq!(idx.clusters[0].label, "cli");
478 assert_eq!(idx.clusters[0].size, 3);
479 assert!(
481 idx.clusters[1].label.starts_with("cli ("),
482 "second cluster should be disambiguated, got: {}",
483 idx.clusters[1].label
484 );
485 }
486
487 #[test]
488 fn label_disambiguation_three_clusters_same_prefix() {
489 let p = pairs(&[
490 ("src/cli/init.rs", "src/cli/explain.rs", 10),
492 ("src/cli/explain.rs", "src/cli/review.rs", 8),
493 ("src/cli/stats.rs", "src/cli/status.rs", 12),
495 ("src/cli/gaps.rs", "src/cli/stale.rs", 7),
497 ]);
498 let idx = ClusterIndex::compute(&p, 10);
499 assert_eq!(idx.total, 3);
500 assert_eq!(idx.clusters[0].label, "cli");
501 assert!(idx.clusters[1].label.starts_with("cli ("));
503 assert!(idx.clusters[2].label.starts_with("cli ("));
504 assert_ne!(idx.clusters[1].label, idx.clusters[2].label);
506 }
507
508 #[test]
509 fn label_no_collision_stays_clean() {
510 let p = pairs(&[
511 ("src/cli/init.rs", "src/cli/explain.rs", 10),
512 ("src/analysis/parser.rs", "src/analysis/walker.rs", 8),
513 ]);
514 let idx = ClusterIndex::compute(&p, 10);
515 assert_eq!(idx.total, 2);
516 let labels: Vec<&str> = idx.clusters.iter().map(|c| c.label.as_str()).collect();
518 assert!(labels.contains(&"cli"));
519 assert!(labels.contains(&"analysis"));
520 }
521
522 #[test]
523 fn label_disambiguation_preserves_cluster_id() {
524 let p = pairs(&[
525 ("src/cli/init.rs", "src/cli/explain.rs", 10),
526 ("src/cli/stats.rs", "src/cli/status.rs", 12),
527 ]);
528 let idx = ClusterIndex::compute(&p, 10);
529 for c in &idx.clusters {
531 assert!(
532 !c.id.contains(' ') && !c.id.contains('('),
533 "cluster id should not be disambiguated: {}",
534 c.id
535 );
536 }
537 }
538
539 #[test]
540 fn label_disambiguation_handles_weird_centroid_names() {
541 let p = pairs(&[
543 ("src/cli/Makefile", "src/cli/Dockerfile", 10),
544 ("src/cli/init.rs", "src/cli/explain.rs", 8),
545 ]);
546 let idx = ClusterIndex::compute(&p, 10);
547 for c in &idx.clusters {
549 assert!(!c.label.is_empty(), "label must not be empty");
550 }
551 }
552}