Skip to main content

cha_core/graph/
module_graph.rs

1use std::collections::{BTreeMap, HashMap, HashSet};
2
3/// A group of tightly-coupled files inferred from the import graph.
4#[derive(Debug, Clone)]
5pub struct Module {
6    pub name: String,
7    pub files: Vec<String>,
8    /// LCOM4: number of connected components in internal import graph.
9    /// 1 = perfectly cohesive, >1 = should be split.
10    pub lcom4: usize,
11    /// TCC: fraction of file pairs that are directly/indirectly connected.
12    /// Range [0, 1], higher is better.
13    pub tcc: f64,
14    /// Cohesion: average ratio of internal connections to total connections.
15    pub cohesion: f64,
16}
17
18/// Infer modules from file-level import edges.
19///
20/// Algorithm:
21/// 1. Auto-detect directory depth (elbow method) or use provided depth
22/// 2. Group files by directory at that depth
23/// 3. For each group with LCOM4 > 1: recursively split into subdirectories
24/// 4. For large groups with LCOM4 = 1 but ICR < 0.30: split one level
25pub fn infer_modules(
26    file_imports: &[(String, String)],
27    all_files: &[String],
28    depth: Option<usize>,
29) -> Vec<Module> {
30    let adj = build_undirected_adj(file_imports, all_files);
31    let depth = depth.unwrap_or_else(|| auto_detect_depth(all_files));
32    let groups = group_by_directory(all_files, depth);
33
34    let mut modules = Vec::new();
35    for (dir, files) in groups {
36        adaptive_split(&dir, &files, &adj, &mut modules);
37    }
38
39    // Compute metrics for each module
40    for m in &mut modules {
41        let mset: HashSet<&str> = m.files.iter().map(|s| s.as_str()).collect();
42        m.lcom4 = compute_lcom4(&mset, &adj);
43        m.tcc = compute_tcc(&mset, &adj);
44        m.cohesion = compute_cohesion(&mset, &adj);
45    }
46    modules
47}
48
49// ── Auto depth detection ──
50
51fn auto_detect_depth(files: &[String]) -> usize {
52    let max_d = files
53        .iter()
54        .map(|f| f.matches('/').count().saturating_sub(1)) // exclude filename
55        .max()
56        .unwrap_or(0);
57    if max_d == 0 {
58        return 1;
59    }
60    let counts: Vec<usize> = (1..=max_d)
61        .map(|d| group_by_directory(files, d).len())
62        .collect();
63    // Find depth with largest relative growth in next level
64    let mut best = 0;
65    let mut max_ratio = 0.0_f64;
66    for i in 0..counts.len().saturating_sub(1) {
67        if counts[i] > 0 {
68            let ratio = (counts[i + 1] as f64 - counts[i] as f64) / counts[i] as f64;
69            if ratio > max_ratio {
70                max_ratio = ratio;
71                best = i;
72            }
73        }
74    }
75    best + 1 // depths are 1-indexed
76}
77
78fn group_by_directory(files: &[String], depth: usize) -> BTreeMap<String, Vec<String>> {
79    let mut groups: BTreeMap<String, Vec<String>> = BTreeMap::new();
80    for f in files {
81        let parts: Vec<&str> = f.split('/').collect();
82        let dir_parts = &parts[..parts.len().saturating_sub(1)]; // exclude filename
83        let key = if dir_parts.len() >= depth {
84            dir_parts[..depth].join("/")
85        } else if dir_parts.is_empty() {
86            "(root)".to_string()
87        } else {
88            dir_parts.join("/")
89        };
90        groups.entry(key).or_default().push(f.clone());
91    }
92    groups
93}
94
95// ── Adaptive splitting ──
96
97fn adaptive_split(
98    dir: &str,
99    files: &[String],
100    adj: &HashMap<String, HashSet<String>>,
101    out: &mut Vec<Module>,
102) {
103    if files.len() < 3 {
104        out.push(make_module(dir, files));
105        return;
106    }
107
108    let mset: HashSet<&str> = files.iter().map(|s| s.as_str()).collect();
109
110    if try_lcom4_split(dir, files, &mset, adj, out) {
111        return;
112    }
113    if try_icr_split(dir, files, adj, out) {
114        return;
115    }
116    out.push(make_module(dir, files));
117}
118
119fn try_lcom4_split(
120    dir: &str,
121    files: &[String],
122    mset: &HashSet<&str>,
123    adj: &HashMap<String, HashSet<String>>,
124    out: &mut Vec<Module>,
125) -> bool {
126    if compute_lcom4(mset, adj) <= 1 {
127        return false;
128    }
129    let subs = group_by_next_level(dir, files);
130    if subs.len() <= 1 {
131        return false;
132    }
133    for (sd, sf) in &subs {
134        adaptive_split(sd, sf, adj, out);
135    }
136    true
137}
138
139fn try_icr_split(
140    dir: &str,
141    files: &[String],
142    adj: &HashMap<String, HashSet<String>>,
143    out: &mut Vec<Module>,
144) -> bool {
145    if files.len() <= 30 {
146        return false;
147    }
148    let subs = group_by_next_level(dir, files);
149    if subs.len() <= 1 || inter_child_ratio(&subs, adj) >= 0.30 {
150        return false;
151    }
152    for (sd, sf) in &subs {
153        adaptive_split(sd, sf, adj, out);
154    }
155    true
156}
157
158fn group_by_next_level(dir: &str, files: &[String]) -> BTreeMap<String, Vec<String>> {
159    let mut subs: BTreeMap<String, Vec<String>> = BTreeMap::new();
160    let prefix = if dir == "(root)" { "" } else { dir };
161    for f in files {
162        let rest = if prefix.is_empty() {
163            f.as_str()
164        } else {
165            f.strip_prefix(prefix)
166                .and_then(|s| s.strip_prefix('/'))
167                .unwrap_or(f)
168        };
169        let parts: Vec<&str> = rest.split('/').collect();
170        let key = if parts.len() > 1 {
171            if prefix.is_empty() {
172                parts[0].to_string()
173            } else {
174                format!("{prefix}/{}", parts[0])
175            }
176        } else {
177            // Direct file in this directory
178            format!("{dir}/*")
179        };
180        subs.entry(key).or_default().push(f.clone());
181    }
182    subs
183}
184
185fn make_module(dir: &str, files: &[String]) -> Module {
186    Module {
187        name: dir.to_string(),
188        files: files.to_vec(),
189        lcom4: 0,
190        tcc: 0.0,
191        cohesion: 0.0,
192    }
193}
194
195// ── Metrics ──
196
197fn build_undirected_adj(
198    edges: &[(String, String)],
199    all_files: &[String],
200) -> HashMap<String, HashSet<String>> {
201    let file_set: HashSet<&str> = all_files.iter().map(|s| s.as_str()).collect();
202    let mut adj: HashMap<String, HashSet<String>> = HashMap::new();
203    for f in all_files {
204        adj.entry(f.clone()).or_default();
205    }
206    for (s, d) in edges {
207        if file_set.contains(s.as_str()) && file_set.contains(d.as_str()) {
208            adj.entry(s.clone()).or_default().insert(d.clone());
209            adj.entry(d.clone()).or_default().insert(s.clone());
210        }
211    }
212    adj
213}
214
215fn compute_lcom4(members: &HashSet<&str>, adj: &HashMap<String, HashSet<String>>) -> usize {
216    let mut visited: HashSet<&str> = HashSet::new();
217    let mut count = 0;
218    for &f in members {
219        if visited.contains(f) {
220            continue;
221        }
222        count += 1;
223        bfs_visit(f, members, adj, &mut visited);
224    }
225    count
226}
227
228fn bfs_visit<'a>(
229    start: &'a str,
230    members: &HashSet<&str>,
231    adj: &'a HashMap<String, HashSet<String>>,
232    visited: &mut HashSet<&'a str>,
233) {
234    let mut stack = vec![start];
235    while let Some(cur) = stack.pop() {
236        if !visited.insert(cur) {
237            continue;
238        }
239        let Some(neighbors) = adj.get(cur) else {
240            continue;
241        };
242        for n in neighbors {
243            if members.contains(n.as_str()) && !visited.contains(n.as_str()) {
244                stack.push(n);
245            }
246        }
247    }
248}
249
250fn compute_tcc(members: &HashSet<&str>, adj: &HashMap<String, HashSet<String>>) -> f64 {
251    let n = members.len();
252    if n < 2 {
253        return 1.0;
254    }
255    // For large modules, skip TCC (O(n²))
256    if n > 200 {
257        return -1.0;
258    }
259    let list: Vec<&str> = members.iter().copied().collect();
260    // file_targets[f] = set of internal neighbors
261    let targets: HashMap<&str, HashSet<&str>> = list
262        .iter()
263        .map(|&f| {
264            let t: HashSet<&str> = adj
265                .get(f)
266                .map(|ns| {
267                    ns.iter()
268                        .filter(|n| members.contains(n.as_str()))
269                        .map(|n| n.as_str())
270                        .collect()
271                })
272                .unwrap_or_default();
273            (f, t)
274        })
275        .collect();
276
277    let mut connected = 0usize;
278    for i in 0..n {
279        for j in (i + 1)..n {
280            let a = list[i];
281            let b = list[j];
282            // Connected if: direct edge, or share a common neighbor
283            if targets[a].contains(b)
284                || targets[b].contains(a)
285                || !targets[a].is_disjoint(&targets[b])
286            {
287                connected += 1;
288            }
289        }
290    }
291    connected as f64 / (n * (n - 1) / 2) as f64
292}
293
294fn compute_cohesion(members: &HashSet<&str>, adj: &HashMap<String, HashSet<String>>) -> f64 {
295    let mut sum = 0.0;
296    let mut count = 0;
297    for &f in members {
298        if let Some(neighbors) = adj.get(f) {
299            let total = neighbors.len();
300            if total == 0 {
301                continue;
302            }
303            let internal = neighbors
304                .iter()
305                .filter(|n| members.contains(n.as_str()))
306                .count();
307            sum += internal as f64 / total as f64;
308            count += 1;
309        }
310    }
311    if count > 0 { sum / count as f64 } else { 0.0 }
312}
313
314fn inter_child_ratio(
315    subs: &BTreeMap<String, Vec<String>>,
316    adj: &HashMap<String, HashSet<String>>,
317) -> f64 {
318    let file_to_sub: HashMap<&str, &str> = subs
319        .iter()
320        .flat_map(|(sd, files)| files.iter().map(move |f| (f.as_str(), sd.as_str())))
321        .collect();
322    let all_files: HashSet<&str> = file_to_sub.keys().copied().collect();
323    let (mut intra, mut inter) = (0usize, 0usize);
324    for &f in &all_files {
325        let Some(neighbors) = adj.get(f) else {
326            continue;
327        };
328        let f_sub = file_to_sub[f];
329        for n in neighbors {
330            let ns = n.as_str();
331            if !all_files.contains(ns) {
332                continue;
333            }
334            if f_sub == file_to_sub[ns] {
335                intra += 1;
336            } else {
337                inter += 1;
338            }
339        }
340    }
341    let total = intra + inter;
342    if total > 0 {
343        inter as f64 / total as f64
344    } else {
345        1.0
346    }
347}
348
349#[cfg(test)]
350mod tests {
351    use super::*;
352
353    #[test]
354    fn auto_depth_simple() {
355        let files: Vec<String> = vec![
356            "src/core/a.rs",
357            "src/core/b.rs",
358            "src/util/c.rs",
359            "src/util/d.rs",
360            "src/util/sub/e.rs",
361        ]
362        .into_iter()
363        .map(String::from)
364        .collect();
365        let d = auto_detect_depth(&files);
366        assert!(d >= 1 && d <= 2);
367    }
368
369    #[test]
370    fn lcom4_two_components() {
371        let files: Vec<String> = vec!["a.rs", "b.rs", "c.rs", "d.rs"]
372            .into_iter()
373            .map(String::from)
374            .collect();
375        // a↔b, c↔d (two components)
376        let edges = vec![
377            ("a.rs".into(), "b.rs".into()),
378            ("c.rs".into(), "d.rs".into()),
379        ];
380        let modules = infer_modules(&edges, &files, Some(1));
381        // Should detect LCOM4 > 1 at root level
382        assert!(modules.iter().any(|m| m.lcom4 >= 1));
383    }
384
385    #[test]
386    fn directory_grouping() {
387        let files: Vec<String> = vec!["src/core/x.rs", "src/core/y.rs", "src/util/z.rs"]
388            .into_iter()
389            .map(String::from)
390            .collect();
391        let modules = infer_modules(&[], &files, Some(1));
392        let core_mod = modules
393            .iter()
394            .find(|m| m.files.contains(&"src/core/x.rs".to_string()));
395        assert!(core_mod.is_some());
396        assert!(
397            core_mod
398                .unwrap()
399                .files
400                .contains(&"src/core/y.rs".to_string())
401        );
402    }
403}