Skip to main content

mollify_graph/
lib.rs

1//! # mollify-graph
2//!
3//! Discovers Python modules, assigns **path-sorted stable FileIds** (ADR-004
4//! analog), builds the internal import graph, computes **reachability** from
5//! entry points, and answers symbol-usage queries. Pure structure — the
6//! `mollify-core` crate turns these into [`mollify_parse`]-backed findings.
7
8use camino::{Utf8Path, Utf8PathBuf};
9use mollify_parse::{Import, ParsedModule, PyParser};
10use rayon::prelude::*;
11use rustc_hash::{FxHashMap, FxHashSet};
12
13/// Stable, path-sorted module identifier.
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
15pub struct FileId(pub u32);
16
17/// One module node in the graph.
18#[derive(Debug, Clone)]
19pub struct ModuleInfo {
20    pub id: FileId,
21    pub path: Utf8PathBuf,
22    /// Dotted module name relative to its source root (e.g. `pkg.sub.mod`).
23    pub dotted: String,
24    pub parsed: ParsedModule,
25    /// True if this module is an analysis root (entry point).
26    pub is_entry: bool,
27}
28
29/// An import that looks first-party/relative but resolves to no module.
30#[derive(Debug, Clone)]
31pub struct UnresolvedImport {
32    /// The importing module's file path.
33    pub importer: Utf8PathBuf,
34    /// The import as written (`.sub.thing` for relative, `pkg.mod` for absolute).
35    pub display: String,
36    pub line: u32,
37    /// True for a relative import (`from . import x`) — these must be internal.
38    pub relative: bool,
39}
40
41/// The whole project graph.
42pub struct ModuleGraph {
43    pub modules: Vec<ModuleInfo>,
44    by_dotted: FxHashMap<String, FileId>,
45    /// Resolved internal import edges: importer → imported.
46    edges: Vec<(FileId, FileId)>,
47    /// For each imported module, the set of symbol names pulled in by importers,
48    /// keyed by the imported module's FileId.
49    imported_symbols: FxHashMap<FileId, FxHashSet<String>>,
50    reachable: FxHashSet<FileId>,
51    /// True if any module in the project has a dynamic dispatch/import sink.
52    pub global_dynamic: bool,
53}
54
55/// Walk `root` for `*.py` and `*.ipynb` files, honoring `.gitignore`.
56/// Deterministic order.
57pub fn discover_python_files(root: &Utf8Path) -> Vec<Utf8PathBuf> {
58    let mut out = Vec::new();
59    for entry in ignore::WalkBuilder::new(root)
60        .hidden(false)
61        .build()
62        .flatten()
63    {
64        let p = entry.path();
65        if p.extension().is_some_and(|e| e == "py" || e == "ipynb") {
66            if let Ok(u) = Utf8PathBuf::from_path_buf(p.to_path_buf()) {
67                out.push(u);
68            }
69        }
70    }
71    out.sort();
72    out
73}
74
75/// Read a module's source. For `.ipynb`, extract and concatenate code cells into
76/// one Python source (line numbers are relative to that concatenation — a
77/// documented v1 simplification). Jupyter magics/shell-escapes are skipped.
78pub fn read_source(path: &Utf8Path) -> Option<String> {
79    let raw = std::fs::read_to_string(path).ok()?;
80    if path.extension() != Some("ipynb") {
81        return Some(raw);
82    }
83    let nb: serde_json::Value = serde_json::from_str(&raw).ok()?;
84    let cells = nb.get("cells")?.as_array()?;
85    let mut src = String::new();
86    for cell in cells {
87        if cell.get("cell_type").and_then(|t| t.as_str()) != Some("code") {
88            continue;
89        }
90        match cell.get("source") {
91            Some(serde_json::Value::Array(lines)) => {
92                for l in lines {
93                    if let Some(s) = l.as_str() {
94                        let t = s.trim_start();
95                        if t.starts_with('%') || t.starts_with('!') {
96                            continue;
97                        }
98                        src.push_str(s);
99                    }
100                }
101                src.push('\n');
102            }
103            Some(serde_json::Value::String(s)) => {
104                src.push_str(s);
105                src.push('\n');
106            }
107            _ => {}
108        }
109    }
110    Some(src)
111}
112
113/// Compute a module's dotted name relative to a source root. `src/` is treated
114/// as a source root if present; otherwise the project root is used.
115fn dotted_name(root: &Utf8Path, path: &Utf8Path) -> String {
116    let rel = path.strip_prefix(root).unwrap_or(path);
117    let mut rel = rel.to_owned();
118    // src-layout: drop a leading `src/` segment.
119    if rel.starts_with("src") {
120        if let Ok(stripped) = rel.strip_prefix("src") {
121            rel = stripped.to_owned();
122        }
123    }
124    let no_ext = rel.as_str().strip_suffix(".py").unwrap_or(rel.as_str());
125    let no_init = no_ext.strip_suffix("/__init__").unwrap_or(no_ext);
126    no_init.replace('/', ".").trim_matches('.').to_string()
127}
128
129fn is_entry(path: &Utf8Path) -> bool {
130    let name = path.file_name().unwrap_or("");
131    name == "__main__.py"
132        || name == "conftest.py"
133        || name == "setup.py"
134        || name == "__init__.py" // package surface is a public root
135        || name.starts_with("test_")
136        || name.ends_with("_test.py")
137        || path.extension() == Some("ipynb")
138}
139
140impl ModuleGraph {
141    /// Parse all files (in parallel) and build the graph.
142    pub fn build(root: &Utf8Path, files: &[Utf8PathBuf]) -> Self {
143        // Parse in parallel; each rayon task gets its own parser.
144        let parsed: Vec<(Utf8PathBuf, ParsedModule)> = files
145            .par_iter()
146            .filter_map(|p| {
147                let src = read_source(p)?;
148                let mut parser = PyParser::new().ok()?;
149                let pm = parser.parse(p, &src).ok()?;
150                Some((p.clone(), pm))
151            })
152            .collect();
153
154        // Stable FileIds by sorted path (already sorted from discovery, but be safe).
155        let mut parsed = parsed;
156        parsed.sort_by(|a, b| a.0.cmp(&b.0));
157
158        let mut modules = Vec::with_capacity(parsed.len());
159        let mut by_dotted = FxHashMap::default();
160        let mut global_dynamic = false;
161        for (i, (path, pm)) in parsed.into_iter().enumerate() {
162            let id = FileId(i as u32);
163            let dotted = dotted_name(root, &path);
164            global_dynamic |= pm.has_dynamic_sink;
165            by_dotted.entry(dotted.clone()).or_insert(id);
166            modules.push(ModuleInfo {
167                id,
168                is_entry: is_entry(&path),
169                path,
170                dotted,
171                parsed: pm,
172            });
173        }
174
175        let mut g = ModuleGraph {
176            modules,
177            by_dotted,
178            edges: Vec::new(),
179            imported_symbols: FxHashMap::default(),
180            reachable: FxHashSet::default(),
181            global_dynamic,
182        };
183        g.resolve_edges();
184        g.compute_reachability();
185        g
186    }
187
188    fn module(&self, id: FileId) -> &ModuleInfo {
189        &self.modules[id.0 as usize]
190    }
191
192    /// Resolve each import to an internal module if possible, recording edges
193    /// and which symbol names each importer pulls from the target.
194    fn resolve_edges(&mut self) {
195        let mut edges = Vec::new();
196        let mut imported_symbols: FxHashMap<FileId, FxHashSet<String>> = FxHashMap::default();
197
198        for m in &self.modules {
199            for imp in &m.parsed.imports {
200                let target_dotted = if imp.relative_dots > 0 {
201                    resolve_relative(&m.dotted, imp.relative_dots, &imp.module)
202                } else {
203                    imp.module.clone()
204                };
205                // Try the full dotted path, then progressively shorter prefixes
206                // (handles `from pkg.mod import name` where `pkg.mod` is a module
207                // and `name` is a symbol, vs `import pkg.mod`).
208                if let Some(&tid) = self.lookup(&target_dotted) {
209                    edges.push((m.id, tid));
210                    let set = imported_symbols.entry(tid).or_default();
211                    for n in &imp.names {
212                        set.insert(n.clone());
213                    }
214                    if imp.is_star {
215                        set.insert("*".into());
216                    }
217                } else if !imp.names.is_empty() {
218                    // `from pkg import submod` where submod is itself a module.
219                    for n in &imp.names {
220                        let candidate = format!("{target_dotted}.{n}");
221                        if let Some(&tid) = self.lookup(&candidate) {
222                            edges.push((m.id, tid));
223                        }
224                    }
225                }
226            }
227        }
228        edges.sort();
229        edges.dedup();
230        self.edges = edges;
231        self.imported_symbols = imported_symbols;
232    }
233
234    fn lookup(&self, dotted: &str) -> Option<&FileId> {
235        self.by_dotted.get(dotted)
236    }
237
238    /// True if an import target resolves to an internal module — directly, or as
239    /// `from pkg import submodule` where `pkg.submodule` is a module.
240    fn import_resolves(&self, target: &str, imp: &Import) -> bool {
241        if self.lookup(target).is_some() {
242            return true;
243        }
244        imp.names
245            .iter()
246            .any(|n| self.lookup(&format!("{target}.{n}")).is_some())
247    }
248
249    /// Imports that *look* internal but resolve to no module in the project:
250    /// every relative import that fails to resolve, plus absolute imports under a
251    /// first-party top-level package that fail to resolve. These are typically a
252    /// typo or a broken refactor — distinct from third-party `missing-dependency`.
253    pub fn unresolved_imports(&self) -> Vec<UnresolvedImport> {
254        // First-party top-level package names (the first segment of any module).
255        let mut first_party: FxHashSet<&str> = FxHashSet::default();
256        for k in self.by_dotted.keys() {
257            if let Some(top) = k.split('.').next() {
258                first_party.insert(top);
259            }
260        }
261        let mut out = Vec::new();
262        for m in &self.modules {
263            for imp in &m.parsed.imports {
264                let relative = imp.relative_dots > 0;
265                let target = if relative {
266                    resolve_relative(&m.dotted, imp.relative_dots, &imp.module)
267                } else {
268                    imp.module.clone()
269                };
270                if target.is_empty() || self.import_resolves(&target, imp) {
271                    continue;
272                }
273                if !relative {
274                    let top = target.split('.').next().unwrap_or(&target);
275                    if !first_party.contains(top) {
276                        continue; // third-party → handled by dependency hygiene
277                    }
278                }
279                let display = if relative {
280                    format!("{}{}", ".".repeat(imp.relative_dots as usize), imp.module)
281                } else {
282                    imp.module.clone()
283                };
284                out.push(UnresolvedImport {
285                    importer: m.path.clone(),
286                    display,
287                    line: imp.line,
288                    relative,
289                });
290            }
291        }
292        out.sort_by(|a, b| {
293            a.importer
294                .cmp(&b.importer)
295                .then(a.line.cmp(&b.line))
296                .then(a.display.cmp(&b.display))
297        });
298        out.dedup_by(|a, b| a.importer == b.importer && a.line == b.line && a.display == b.display);
299        out
300    }
301
302    /// BFS mark-reachable from all entry modules over import edges.
303    fn compute_reachability(&mut self) {
304        let mut adj: FxHashMap<FileId, Vec<FileId>> = FxHashMap::default();
305        for (a, b) in &self.edges {
306            adj.entry(*a).or_default().push(*b);
307        }
308        let mut queue: Vec<FileId> = self
309            .modules
310            .iter()
311            .filter(|m| m.is_entry)
312            .map(|m| m.id)
313            .collect();
314        let mut seen: FxHashSet<FileId> = queue.iter().copied().collect();
315        while let Some(id) = queue.pop() {
316            if let Some(neighbors) = adj.get(&id) {
317                for &n in neighbors {
318                    if seen.insert(n) {
319                        queue.push(n);
320                    }
321                }
322            }
323        }
324        self.reachable = seen;
325    }
326
327    /// Files that are neither entries nor reachable from any entry.
328    pub fn unused_files(&self) -> Vec<&ModuleInfo> {
329        self.modules
330            .iter()
331            .filter(|m| !m.is_entry && !self.reachable.contains(&m.id))
332            .collect()
333    }
334
335    /// Whether a symbol defined in `module` is referenced internally or by any
336    /// importer of that module. `defs_named` is how many top-level defs share
337    /// the name (to discount the definition site in the internal count).
338    pub fn symbol_used(&self, module: FileId, name: &str, defs_named: u32) -> bool {
339        let m = self.module(module);
340        // Internal use. With scope/binding resolution, a top-level symbol is used
341        // iff some free `Name` load resolves to it (module_used) — precise: it
342        // ignores shadowing function-locals and attribute accesses. In modules
343        // with a dynamic sink (getattr/eval/importlib) we can't trust static
344        // resolution, so fall back to the conservative token-frequency count.
345        let internal = if m.parsed.has_dynamic_sink {
346            m.parsed.name_counts.get(name).copied().unwrap_or(0) > defs_named
347        } else {
348            m.parsed
349                .module_used
350                .binary_search_by(|s| s.as_str().cmp(name))
351                .is_ok()
352        };
353        if internal {
354            return true;
355        }
356        // Imported by name from this module (covers `from m import name`).
357        if let Some(set) = self.imported_symbols.get(&module) {
358            if set.contains(name) || set.contains("*") {
359                return true;
360            }
361        }
362        // Cross-module: any module that imports `module` references `name`.
363        let importers: Vec<FileId> = self
364            .edges
365            .iter()
366            .filter(|(_, b)| *b == module)
367            .map(|(a, _)| *a)
368            .collect();
369        for imp in importers {
370            let im = self.module(imp);
371            if im.parsed.name_counts.contains_key(name) {
372                return true;
373            }
374        }
375        false
376    }
377
378    pub fn edge_count(&self) -> usize {
379        self.edges.len()
380    }
381
382    /// Internal import edges as (importer dotted, imported dotted) pairs.
383    /// Used by the architecture-boundary engine.
384    pub fn import_edges(&self) -> Vec<(&str, &str)> {
385        self.edges
386            .iter()
387            .map(|(a, b)| {
388                (
389                    self.modules[a.0 as usize].dotted.as_str(),
390                    self.modules[b.0 as usize].dotted.as_str(),
391                )
392            })
393            .collect()
394    }
395
396    /// The path of a module by its dotted name (first match), for findings.
397    pub fn path_of_dotted(&self, dotted: &str) -> Option<&Utf8Path> {
398        self.modules
399            .iter()
400            .find(|m| m.dotted == dotted)
401            .map(|m| m.path.as_path())
402    }
403
404    /// Find import cycles: strongly-connected components of size > 1, plus
405    /// self-loops. Tarjan's algorithm; results are deterministic (each cycle's
406    /// members sorted, and the list sorted). Cross-module circular imports.
407    pub fn find_cycles(&self) -> Vec<Vec<FileId>> {
408        let n = self.modules.len();
409        let mut adj: Vec<Vec<usize>> = vec![Vec::new(); n];
410        let mut self_loops: Vec<usize> = Vec::new();
411        for (a, b) in &self.edges {
412            if a == b {
413                self_loops.push(a.0 as usize);
414            } else {
415                adj[a.0 as usize].push(b.0 as usize);
416            }
417        }
418
419        // Iterative Tarjan to avoid stack overflow on large graphs.
420        let mut index = vec![u32::MAX; n];
421        let mut low = vec![0u32; n];
422        let mut on_stack = vec![false; n];
423        let mut stack: Vec<usize> = Vec::new();
424        let mut idx: u32 = 0;
425        let mut out: Vec<Vec<FileId>> = Vec::new();
426
427        // Explicit DFS frame: (node, next child position).
428        for start in 0..n {
429            if index[start] != u32::MAX {
430                continue;
431            }
432            let mut call: Vec<(usize, usize)> = vec![(start, 0)];
433            while let Some(&(v, ci)) = call.last() {
434                if ci == 0 {
435                    index[v] = idx;
436                    low[v] = idx;
437                    idx += 1;
438                    stack.push(v);
439                    on_stack[v] = true;
440                }
441                if ci < adj[v].len() {
442                    let w = adj[v][ci];
443                    call.last_mut().unwrap().1 += 1;
444                    if index[w] == u32::MAX {
445                        call.push((w, 0));
446                    } else if on_stack[w] {
447                        low[v] = low[v].min(index[w]);
448                    }
449                } else {
450                    if low[v] == index[v] {
451                        let mut comp = Vec::new();
452                        loop {
453                            let w = stack.pop().unwrap();
454                            on_stack[w] = false;
455                            comp.push(FileId(w as u32));
456                            if w == v {
457                                break;
458                            }
459                        }
460                        if comp.len() > 1 {
461                            comp.sort();
462                            out.push(comp);
463                        }
464                    }
465                    call.pop();
466                    if let Some(&(parent, _)) = call.last() {
467                        low[parent] = low[parent].min(low[v]);
468                    }
469                }
470            }
471        }
472
473        for s in self_loops {
474            out.push(vec![FileId(s as u32)]);
475        }
476        out.sort();
477        out
478    }
479}
480
481/// Resolve a relative import (`from ..pkg import x` inside `a.b.c`) to a dotted
482/// module name. `dots`=1 means the current package.
483fn resolve_relative(importer_dotted: &str, dots: u8, module: &str) -> String {
484    let parts: Vec<&str> = importer_dotted.split('.').collect();
485    // The importer's package = drop the module's own last segment.
486    // For `a.b.c`, package is `a.b`; one extra dot goes up one more level.
487    let keep = parts.len().saturating_sub(dots as usize);
488    let base = parts[..keep].join(".");
489    match (base.is_empty(), module.is_empty()) {
490        (true, _) => module.to_string(),
491        (false, true) => base,
492        (false, false) => format!("{base}.{module}"),
493    }
494}
495
496#[cfg(test)]
497mod tests {
498    use super::*;
499
500    fn write(dir: &Utf8Path, rel: &str, src: &str) {
501        let p = dir.join(rel);
502        std::fs::create_dir_all(p.parent().unwrap()).unwrap();
503        std::fs::write(p, src).unwrap();
504    }
505
506    fn temp(tag: &str) -> Utf8PathBuf {
507        let base =
508            std::env::temp_dir().join(format!("mollify-graph-test-{}-{tag}", std::process::id()));
509        let _ = std::fs::remove_dir_all(&base);
510        Utf8PathBuf::from_path_buf(base).unwrap()
511    }
512
513    #[test]
514    fn dotted_name_handles_init_and_src() {
515        let root = Utf8Path::new("/proj");
516        assert_eq!(
517            dotted_name(root, Utf8Path::new("/proj/pkg/mod.py")),
518            "pkg.mod"
519        );
520        assert_eq!(
521            dotted_name(root, Utf8Path::new("/proj/pkg/__init__.py")),
522            "pkg"
523        );
524        assert_eq!(dotted_name(root, Utf8Path::new("/proj/src/a/b.py")), "a.b");
525    }
526
527    #[test]
528    fn relative_import_resolution() {
529        assert_eq!(resolve_relative("a.b.c", 1, "d"), "a.b.d");
530        assert_eq!(resolve_relative("a.b.c", 2, "d"), "a.d");
531        assert_eq!(resolve_relative("a.b.c", 1, ""), "a.b");
532    }
533
534    #[test]
535    fn unused_file_detected() {
536        let d = temp("unused");
537        write(&d, "__main__.py", "from used import helper\nhelper()\n");
538        write(&d, "used.py", "def helper():\n    return 1\n");
539        write(&d, "orphan.py", "def never():\n    return 2\n");
540        let files = discover_python_files(&d);
541        let g = ModuleGraph::build(&d, &files);
542        let unused: Vec<_> = g.unused_files().iter().map(|m| m.dotted.clone()).collect();
543        assert!(unused.contains(&"orphan".to_string()), "got {unused:?}");
544        assert!(!unused.contains(&"used".to_string()));
545        std::fs::remove_dir_all(&d).ok();
546    }
547
548    #[test]
549    fn reads_notebook_code_cells() {
550        let d = temp("nb");
551        let nb = r##"{"cells":[{"cell_type":"markdown","source":["title"]},{"cell_type":"code","source":["def nb_fn(x):\n","    return x\n"]}]}"##;
552        write(&d, "analysis.ipynb", nb);
553        let files = discover_python_files(&d);
554        assert!(files.iter().any(|f| f.as_str().ends_with("analysis.ipynb")));
555        let g = ModuleGraph::build(&d, &files);
556        let nbmod = g
557            .modules
558            .iter()
559            .find(|m| m.path.as_str().ends_with("analysis.ipynb"))
560            .unwrap();
561        assert!(
562            nbmod.parsed.definitions.iter().any(|x| x.name == "nb_fn"),
563            "notebook code not parsed"
564        );
565        std::fs::remove_dir_all(&d).ok();
566    }
567
568    #[test]
569    fn detects_import_cycle() {
570        let d = temp("cycle");
571        write(
572            &d,
573            "__init__.py",
574            "import a
575import b
576",
577        );
578        write(
579            &d,
580            "a.py",
581            "import b
582",
583        );
584        write(
585            &d,
586            "b.py",
587            "import a
588",
589        );
590        let files = discover_python_files(&d);
591        let g = ModuleGraph::build(&d, &files);
592        let cycles = g.find_cycles();
593        assert!(
594            cycles.iter().any(|c| c.len() == 2),
595            "expected a 2-cycle, got {cycles:?}"
596        );
597        std::fs::remove_dir_all(&d).ok();
598    }
599
600    #[test]
601    fn symbol_use_cross_module() {
602        let d = temp("symuse");
603        write(&d, "__main__.py", "from lib import used_fn\nused_fn()\n");
604        write(
605            &d,
606            "lib.py",
607            "def used_fn():\n    return 1\n\ndef dead_fn():\n    return 2\n",
608        );
609        let files = discover_python_files(&d);
610        let g = ModuleGraph::build(&d, &files);
611        let lib = g.modules.iter().find(|m| m.dotted == "lib").unwrap().id;
612        assert!(g.symbol_used(lib, "used_fn", 1));
613        assert!(!g.symbol_used(lib, "dead_fn", 1));
614        std::fs::remove_dir_all(&d).ok();
615    }
616}