Skip to main content

mati_core/analysis/
clusters.rs

1//! Co-change cluster detection from git history.
2//!
3//! Discovers logical modules by running connected-components over co-change
4//! pairs. Files that frequently change together form clusters, regardless of
5//! directory structure. Uses union-find over filtered pairs — simpler and more
6//! transparent than petgraph's SCC algorithms for undirected, weighted edges.
7
8use std::collections::{HashMap, HashSet};
9
10use serde::{Deserialize, Serialize};
11
12/// Minimum raw co-change count for a pair to count toward cluster formation.
13/// Below this, pairs are treated as noise even if they passed the 0.70
14/// correlation threshold in `mine_git_history`.
15pub const MIN_COCHANGE_COUNT: u32 = 5;
16
17/// A co-change cluster discovered from git history.
18#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
19pub struct Cluster {
20    /// Stable identifier derived from the centroid file name.
21    pub id: String,
22    /// Human-readable label from shared directory prefix or centroid stem.
23    pub label: String,
24    /// File paths (not keys) belonging to this cluster. Sorted.
25    pub members: Vec<String>,
26    /// Graph density: edges_in_cluster / max_possible_edges. Range 0.0–1.0.
27    pub cohesion: f32,
28    /// File with highest intra-cluster degree.
29    pub centroid: String,
30    /// Number of members.
31    pub size: u32,
32}
33
34/// Full cluster index for a repository. Cached under key `cluster:index`.
35#[derive(Debug, Clone, Default, Serialize, Deserialize)]
36pub struct ClusterIndex {
37    /// Clusters of size >= 2, sorted by size descending.
38    pub clusters: Vec<Cluster>,
39    /// Total cluster count.
40    pub total: u32,
41    /// Files belonging to at least one cluster.
42    pub clustered_files: u32,
43    /// Files with no co-change neighbors meeting the threshold.
44    pub isolated_files: u32,
45}
46
47impl ClusterIndex {
48    /// Compute cluster index from co-change pairs (with raw counts).
49    ///
50    /// `co_change_pairs` are `(path_a, path_b, count)` with `a < b`, already
51    /// filtered by the 0.70 correlation threshold. This function applies the
52    /// additional `MIN_COCHANGE_COUNT` filter for cluster formation.
53    ///
54    /// `total_files` is the total file count for computing `isolated_files`.
55    pub fn compute(co_change_pairs: &[(String, String, u32)], total_files: usize) -> Self {
56        // Filter pairs by minimum count.
57        let strong_pairs: Vec<&(String, String, u32)> = co_change_pairs
58            .iter()
59            .filter(|(_, _, count)| *count >= MIN_COCHANGE_COUNT)
60            .collect();
61
62        if strong_pairs.is_empty() {
63            return ClusterIndex {
64                clusters: vec![],
65                total: 0,
66                clustered_files: 0,
67                isolated_files: total_files as u32,
68            };
69        }
70
71        // Collect all nodes that participate in strong pairs.
72        let mut all_nodes: HashSet<&str> = HashSet::new();
73        for (a, b, _) in &strong_pairs {
74            all_nodes.insert(a.as_str());
75            all_nodes.insert(b.as_str());
76        }
77
78        // Assign indices to nodes for union-find.
79        let node_list: Vec<&str> = all_nodes.into_iter().collect();
80        let node_to_idx: HashMap<&str, usize> =
81            node_list.iter().enumerate().map(|(i, &n)| (n, i)).collect();
82
83        // Union-find.
84        let mut parent: Vec<usize> = (0..node_list.len()).collect();
85        let mut rank: Vec<usize> = vec![0; node_list.len()];
86
87        fn find(parent: &mut [usize], x: usize) -> usize {
88            if parent[x] != x {
89                parent[x] = find(parent, parent[x]);
90            }
91            parent[x]
92        }
93
94        fn union(parent: &mut [usize], rank: &mut [usize], a: usize, b: usize) {
95            let ra = find(parent, a);
96            let rb = find(parent, b);
97            if ra == rb {
98                return;
99            }
100            if rank[ra] < rank[rb] {
101                parent[ra] = rb;
102            } else if rank[ra] > rank[rb] {
103                parent[rb] = ra;
104            } else {
105                parent[rb] = ra;
106                rank[ra] += 1;
107            }
108        }
109
110        for (a, b, _) in &strong_pairs {
111            let ia = node_to_idx[a.as_str()];
112            let ib = node_to_idx[b.as_str()];
113            union(&mut parent, &mut rank, ia, ib);
114        }
115
116        // Group nodes by component root.
117        let mut components: HashMap<usize, Vec<&str>> = HashMap::new();
118        for (i, &node) in node_list.iter().enumerate() {
119            let root = find(&mut parent, i);
120            components.entry(root).or_default().push(node);
121        }
122
123        // Build edge set for cohesion and degree computation.
124        let edge_set: HashSet<(&str, &str)> = strong_pairs
125            .iter()
126            .map(|(a, b, _)| (a.as_str(), b.as_str()))
127            .collect();
128
129        // Build clusters from components with size >= 2.
130        let mut clusters: Vec<Cluster> = components
131            .into_values()
132            .filter(|members| members.len() >= 2)
133            .map(|mut members| {
134                members.sort();
135                let member_set: HashSet<&str> = members.iter().copied().collect();
136                let n = members.len();
137
138                // Count intra-cluster edges.
139                let mut intra_edges = 0u32;
140                let mut degree: HashMap<&str, u32> = HashMap::new();
141                for &(a, b) in &edge_set {
142                    if member_set.contains(a) && member_set.contains(b) {
143                        intra_edges += 1;
144                        *degree.entry(a).or_default() += 1;
145                        *degree.entry(b).or_default() += 1;
146                    }
147                }
148
149                // Cohesion = edges / max_possible_edges (undirected).
150                let max_edges = (n * (n - 1) / 2) as f32;
151                let cohesion = if max_edges > 0.0 {
152                    intra_edges as f32 / max_edges
153                } else {
154                    0.0
155                };
156
157                // Centroid = member with highest degree.
158                let centroid = members
159                    .iter()
160                    .max_by_key(|&&m| degree.get(m).copied().unwrap_or(0))
161                    .copied()
162                    .unwrap_or(members[0]);
163
164                let label = compute_label(&members, centroid);
165                let id = stem(centroid);
166                let centroid_owned = centroid.to_string();
167
168                Cluster {
169                    id,
170                    label,
171                    members: members.into_iter().map(String::from).collect(),
172                    cohesion,
173                    centroid: centroid_owned,
174                    size: n as u32,
175                }
176            })
177            .collect();
178
179        // Sort by size descending, then by label for stability.
180        clusters.sort_by(|a, b| b.size.cmp(&a.size).then_with(|| a.label.cmp(&b.label)));
181
182        // Disambiguate duplicate labels. If two or more clusters share a
183        // directory-based label, suffix each except the first (largest) with
184        // its centroid's filename stem. The largest keeps the clean label.
185        {
186            let mut label_counts: HashMap<String, u32> = HashMap::new();
187            for cluster in &clusters {
188                *label_counts.entry(cluster.label.clone()).or_insert(0) += 1;
189            }
190
191            let mut seen: HashMap<String, u32> = HashMap::new();
192            for cluster in clusters.iter_mut() {
193                let total = *label_counts.get(&cluster.label).unwrap_or(&0);
194                if total > 1 {
195                    let count = seen.entry(cluster.label.clone()).or_insert(0);
196                    if *count > 0 {
197                        cluster.label = format!("{} ({})", cluster.label, stem(&cluster.centroid));
198                    }
199                    *count += 1;
200                }
201            }
202        }
203
204        let clustered_files: u32 = clusters.iter().map(|c| c.size).sum();
205        let total = clusters.len() as u32;
206        let isolated_files = total_files.saturating_sub(clustered_files as usize) as u32;
207
208        ClusterIndex {
209            clusters,
210            total,
211            clustered_files,
212            isolated_files,
213        }
214    }
215
216    /// Return the cluster containing the given file path, if any.
217    pub fn cluster_for(&self, file_path: &str) -> Option<&Cluster> {
218        self.clusters
219            .iter()
220            .find(|c| c.members.iter().any(|m| m == file_path))
221    }
222}
223
224/// Compute a human-readable label for a cluster.
225///
226/// Priority:
227/// 1. Shared directory prefix of >= 2 segments → last segment
228/// 2. Single shared root segment → centroid stem
229/// 3. No shared prefix → centroid stem
230fn compute_label(members: &[&str], centroid: &str) -> String {
231    if members.len() < 2 {
232        return stem(centroid);
233    }
234
235    // Split each member into path segments.
236    let segments: Vec<Vec<&str>> = members
237        .iter()
238        .map(|m| m.split('/').collect::<Vec<_>>())
239        .collect();
240
241    // Find common prefix length.
242    let min_len = segments.iter().map(|s| s.len()).min().unwrap_or(0);
243    let mut prefix_len = 0;
244    for i in 0..min_len {
245        if segments.iter().all(|s| s[i] == segments[0][i]) {
246            prefix_len = i + 1;
247        } else {
248            break;
249        }
250    }
251
252    // Need at least 2 common segments for a meaningful prefix label.
253    if prefix_len >= 2 {
254        segments[0][prefix_len - 1].to_string()
255    } else {
256        stem(centroid)
257    }
258}
259
260/// Extract filename stem (without extension) from a path.
261fn stem(path: &str) -> String {
262    std::path::Path::new(path)
263        .file_stem()
264        .and_then(|s| s.to_str())
265        .unwrap_or(path)
266        .to_string()
267}
268
269// ── Tests ───────────────────────────────────────────────────────────────────
270
271#[cfg(test)]
272mod tests {
273    use super::*;
274
275    fn pairs(data: &[(&str, &str, u32)]) -> Vec<(String, String, u32)> {
276        data.iter()
277            .map(|(a, b, c)| (a.to_string(), b.to_string(), *c))
278            .collect()
279    }
280
281    #[test]
282    fn empty_pairs_produce_empty_index() {
283        let idx = ClusterIndex::compute(&[], 10);
284        assert_eq!(idx.total, 0);
285        assert!(idx.clusters.is_empty());
286        assert_eq!(idx.isolated_files, 10);
287        assert_eq!(idx.clustered_files, 0);
288    }
289
290    #[test]
291    fn pairs_below_threshold_produce_no_clusters() {
292        let p = pairs(&[("src/a.rs", "src/b.rs", 3)]); // count=3 < MIN_COCHANGE_COUNT=5
293        let idx = ClusterIndex::compute(&p, 5);
294        assert_eq!(idx.total, 0);
295        assert!(idx.clusters.is_empty());
296    }
297
298    #[test]
299    fn triangle_forms_one_cluster_with_full_cohesion() {
300        let p = pairs(&[
301            ("src/a.rs", "src/b.rs", 10),
302            ("src/b.rs", "src/c.rs", 8),
303            ("src/a.rs", "src/c.rs", 7),
304        ]);
305        let idx = ClusterIndex::compute(&p, 5);
306        assert_eq!(idx.total, 1);
307        assert_eq!(idx.clusters[0].size, 3);
308        assert!(
309            (idx.clusters[0].cohesion - 1.0).abs() < f32::EPSILON,
310            "triangle should have cohesion 1.0, got {}",
311            idx.clusters[0].cohesion
312        );
313    }
314
315    #[test]
316    fn two_disjoint_pairs_form_two_clusters() {
317        let p = pairs(&[("src/a.rs", "src/b.rs", 10), ("src/c.rs", "src/d.rs", 8)]);
318        let idx = ClusterIndex::compute(&p, 10);
319        assert_eq!(idx.total, 2);
320        assert_eq!(idx.clusters[0].size, 2);
321        assert_eq!(idx.clusters[1].size, 2);
322        assert_eq!(idx.clustered_files, 4);
323        assert_eq!(idx.isolated_files, 6);
324    }
325
326    #[test]
327    fn chain_of_four_forms_one_cluster_with_partial_cohesion() {
328        // A-B, B-C, C-D → 1 component of 4 nodes, 3 edges, max=6
329        let p = pairs(&[
330            ("src/a.rs", "src/b.rs", 10),
331            ("src/b.rs", "src/c.rs", 8),
332            ("src/c.rs", "src/d.rs", 7),
333        ]);
334        let idx = ClusterIndex::compute(&p, 4);
335        assert_eq!(idx.total, 1);
336        assert_eq!(idx.clusters[0].size, 4);
337        let expected_cohesion = 3.0 / 6.0; // 3 edges out of max 6
338        assert!(
339            (idx.clusters[0].cohesion - expected_cohesion).abs() < f32::EPSILON,
340            "chain of 4 should have cohesion 0.5, got {}",
341            idx.clusters[0].cohesion
342        );
343    }
344
345    #[test]
346    fn edge_below_min_count_excluded() {
347        let p = pairs(&[
348            ("src/a.rs", "src/b.rs", 10), // above threshold
349            ("src/b.rs", "src/c.rs", 3),  // below MIN_COCHANGE_COUNT
350        ]);
351        let idx = ClusterIndex::compute(&p, 5);
352        // Only A-B qualifies → 1 cluster of size 2, C is isolated
353        assert_eq!(idx.total, 1);
354        assert_eq!(idx.clusters[0].size, 2);
355        assert!(idx.clusters[0].members.contains(&"src/a.rs".to_string()));
356        assert!(idx.clusters[0].members.contains(&"src/b.rs".to_string()));
357    }
358
359    #[test]
360    fn shared_directory_prefix_produces_label() {
361        let p = pairs(&[
362            ("src/auth/session.rs", "src/auth/tokens.rs", 10),
363            ("src/auth/tokens.rs", "src/auth/middleware.rs", 8),
364        ]);
365        let idx = ClusterIndex::compute(&p, 5);
366        assert_eq!(idx.clusters[0].label, "auth");
367    }
368
369    #[test]
370    fn no_shared_prefix_uses_centroid_stem() {
371        let p = pairs(&[
372            ("src/store/record.rs", "src/mcp/tools.rs", 10),
373            ("src/mcp/tools.rs", "src/cli/init.rs", 8),
374        ]);
375        let idx = ClusterIndex::compute(&p, 5);
376        // No 2-segment common prefix → label is centroid stem
377        // tools.rs has degree 2 (connected to both), so it's the centroid
378        assert_eq!(idx.clusters[0].label, "tools");
379    }
380
381    #[test]
382    fn singleton_not_in_any_cluster() {
383        let p = pairs(&[("src/a.rs", "src/b.rs", 10)]);
384        let idx = ClusterIndex::compute(&p, 5);
385        assert!(idx.cluster_for("src/c.rs").is_none());
386        assert_eq!(idx.isolated_files, 3); // 5 total - 2 clustered = 3
387    }
388
389    #[test]
390    fn centroid_is_highest_degree_member() {
391        // a connects to b, c, d → degree 3. Others have degree 1.
392        let p = pairs(&[
393            ("src/a.rs", "src/b.rs", 10),
394            ("src/a.rs", "src/c.rs", 8),
395            ("src/a.rs", "src/d.rs", 7),
396        ]);
397        let idx = ClusterIndex::compute(&p, 4);
398        assert_eq!(idx.clusters[0].centroid, "src/a.rs");
399    }
400
401    #[test]
402    fn cohesion_triangle_is_one() {
403        let p = pairs(&[
404            ("src/x.rs", "src/y.rs", 10),
405            ("src/y.rs", "src/z.rs", 10),
406            ("src/x.rs", "src/z.rs", 10),
407        ]);
408        let idx = ClusterIndex::compute(&p, 3);
409        // 3 edges, max = 3*(3-1)/2 = 3 → cohesion = 1.0
410        assert!((idx.clusters[0].cohesion - 1.0).abs() < f32::EPSILON);
411    }
412
413    #[test]
414    fn cohesion_chain_of_four_is_half() {
415        let p = pairs(&[
416            ("src/a.rs", "src/b.rs", 10),
417            ("src/b.rs", "src/c.rs", 10),
418            ("src/c.rs", "src/d.rs", 10),
419        ]);
420        let idx = ClusterIndex::compute(&p, 4);
421        // 3 edges, max = 4*3/2 = 6 → cohesion = 0.5
422        assert!((idx.clusters[0].cohesion - 0.5).abs() < f32::EPSILON);
423    }
424
425    #[test]
426    fn cluster_for_returns_correct_cluster() {
427        let p = pairs(&[("src/a.rs", "src/b.rs", 10), ("src/c.rs", "src/d.rs", 8)]);
428        let idx = ClusterIndex::compute(&p, 4);
429        let c = idx.cluster_for("src/a.rs").unwrap();
430        assert!(c.members.contains(&"src/a.rs".to_string()));
431        assert!(c.members.contains(&"src/b.rs".to_string()));
432
433        let c2 = idx.cluster_for("src/d.rs").unwrap();
434        assert!(c2.members.contains(&"src/c.rs".to_string()));
435    }
436
437    #[test]
438    fn clusters_sorted_by_size_descending() {
439        let p = pairs(&[
440            // Cluster 1: 3 files
441            ("src/a.rs", "src/b.rs", 10),
442            ("src/b.rs", "src/c.rs", 8),
443            // Cluster 2: 2 files
444            ("src/x.rs", "src/y.rs", 7),
445        ]);
446        let idx = ClusterIndex::compute(&p, 10);
447        assert_eq!(idx.clusters[0].size, 3);
448        assert_eq!(idx.clusters[1].size, 2);
449    }
450
451    #[test]
452    fn serde_roundtrip() {
453        let p = pairs(&[("src/a.rs", "src/b.rs", 10)]);
454        let idx = ClusterIndex::compute(&p, 5);
455        let json = serde_json::to_string(&idx).unwrap();
456        let back: ClusterIndex = serde_json::from_str(&json).unwrap();
457        assert_eq!(idx.clusters.len(), back.clusters.len());
458        assert_eq!(idx.total, back.total);
459    }
460
461    // ── Label disambiguation tests ──────────────────────────────────────────
462
463    #[test]
464    fn label_disambiguation_two_clusters_same_prefix() {
465        // Two disconnected clusters in src/cli/ — both get label "cli".
466        // The larger one keeps "cli", the smaller becomes "cli (centroid_stem)".
467        let p = pairs(&[
468            // Cluster 1: 3 files in cli/
469            ("src/cli/init.rs", "src/cli/explain.rs", 10),
470            ("src/cli/explain.rs", "src/cli/review.rs", 8),
471            // Cluster 2: 2 files in cli/ (disconnected from cluster 1)
472            ("src/cli/stats.rs", "src/cli/status.rs", 12),
473        ]);
474        let idx = ClusterIndex::compute(&p, 10);
475        assert_eq!(idx.total, 2);
476        // First (larger) keeps clean label
477        assert_eq!(idx.clusters[0].label, "cli");
478        assert_eq!(idx.clusters[0].size, 3);
479        // Second gets disambiguated
480        assert!(
481            idx.clusters[1].label.starts_with("cli ("),
482            "second cluster should be disambiguated, got: {}",
483            idx.clusters[1].label
484        );
485    }
486
487    #[test]
488    fn label_disambiguation_three_clusters_same_prefix() {
489        let p = pairs(&[
490            // Cluster 1: 3 files
491            ("src/cli/init.rs", "src/cli/explain.rs", 10),
492            ("src/cli/explain.rs", "src/cli/review.rs", 8),
493            // Cluster 2: 2 files
494            ("src/cli/stats.rs", "src/cli/status.rs", 12),
495            // Cluster 3: 2 files
496            ("src/cli/gaps.rs", "src/cli/stale.rs", 7),
497        ]);
498        let idx = ClusterIndex::compute(&p, 10);
499        assert_eq!(idx.total, 3);
500        assert_eq!(idx.clusters[0].label, "cli");
501        // Both second and third are disambiguated
502        assert!(idx.clusters[1].label.starts_with("cli ("));
503        assert!(idx.clusters[2].label.starts_with("cli ("));
504        // And they're distinct from each other
505        assert_ne!(idx.clusters[1].label, idx.clusters[2].label);
506    }
507
508    #[test]
509    fn label_no_collision_stays_clean() {
510        let p = pairs(&[
511            ("src/cli/init.rs", "src/cli/explain.rs", 10),
512            ("src/analysis/parser.rs", "src/analysis/walker.rs", 8),
513        ]);
514        let idx = ClusterIndex::compute(&p, 10);
515        assert_eq!(idx.total, 2);
516        // No collisions — both keep clean labels
517        let labels: Vec<&str> = idx.clusters.iter().map(|c| c.label.as_str()).collect();
518        assert!(labels.contains(&"cli"));
519        assert!(labels.contains(&"analysis"));
520    }
521
522    #[test]
523    fn label_disambiguation_preserves_cluster_id() {
524        let p = pairs(&[
525            ("src/cli/init.rs", "src/cli/explain.rs", 10),
526            ("src/cli/stats.rs", "src/cli/status.rs", 12),
527        ]);
528        let idx = ClusterIndex::compute(&p, 10);
529        // IDs are derived from centroids, not labels — must remain stable
530        for c in &idx.clusters {
531            assert!(
532                !c.id.contains(' ') && !c.id.contains('('),
533                "cluster id should not be disambiguated: {}",
534                c.id
535            );
536        }
537    }
538
539    #[test]
540    fn label_disambiguation_handles_weird_centroid_names() {
541        // Files without extension or with unusual names
542        let p = pairs(&[
543            ("src/cli/Makefile", "src/cli/Dockerfile", 10),
544            ("src/cli/init.rs", "src/cli/explain.rs", 8),
545        ]);
546        let idx = ClusterIndex::compute(&p, 10);
547        // Should not panic, and both clusters have labels
548        for c in &idx.clusters {
549            assert!(!c.label.is_empty(), "label must not be empty");
550        }
551    }
552}