1use camino::{Utf8Path, Utf8PathBuf};
9use mollify_parse::{Import, ParsedModule, PyParser};
10use rayon::prelude::*;
11use rustc_hash::{FxHashMap, FxHashSet};
12
13#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
15pub struct FileId(pub u32);
16
17#[derive(Debug, Clone)]
19pub struct ModuleInfo {
20 pub id: FileId,
21 pub path: Utf8PathBuf,
22 pub dotted: String,
24 pub parsed: ParsedModule,
25 pub is_entry: bool,
27}
28
29#[derive(Debug, Clone)]
31pub struct UnresolvedImport {
32 pub importer: Utf8PathBuf,
34 pub display: String,
36 pub line: u32,
37 pub relative: bool,
39}
40
41pub struct ModuleGraph {
43 pub modules: Vec<ModuleInfo>,
44 by_dotted: FxHashMap<String, FileId>,
45 edges: Vec<(FileId, FileId)>,
47 imported_symbols: FxHashMap<FileId, FxHashSet<String>>,
50 reachable: FxHashSet<FileId>,
51 pub global_dynamic: bool,
53}
54
55const DEFAULT_EXCLUDE_DIRS: &[&str] = &[
60 ".bzr",
61 ".direnv",
62 ".eggs",
63 ".git",
64 ".hg",
65 ".svn",
66 ".ipynb_checkpoints",
67 ".mypy_cache",
68 ".nox",
69 ".pyenv",
70 ".pytest_cache",
71 ".pytype",
72 ".ruff_cache",
73 ".tox",
74 ".venv",
75 "__pycache__",
76 "__pypackages__",
77 "_build",
78 "buck-out",
79 "build",
80 "dist",
81 "env",
82 "node_modules",
83 "site-packages",
84 "venv",
85];
86
87fn is_excluded_dir(entry: &ignore::DirEntry, extra_excludes: &[String]) -> bool {
92 if !entry.file_type().is_some_and(|t| t.is_dir()) {
93 return false;
94 }
95 let Some(name) = entry.file_name().to_str() else {
96 return false;
97 };
98 if DEFAULT_EXCLUDE_DIRS.contains(&name) || extra_excludes.iter().any(|e| e == name) {
99 return true;
100 }
101 entry.path().join("pyvenv.cfg").is_file()
102}
103
104pub fn discover_python_files(root: &Utf8Path) -> Vec<Utf8PathBuf> {
108 discover_python_files_excluding(root, &[])
109}
110
111pub fn discover_python_files_excluding(
115 root: &Utf8Path,
116 extra_excludes: &[String],
117) -> Vec<Utf8PathBuf> {
118 let extra = extra_excludes.to_vec();
119 let mut out = Vec::new();
120 for entry in ignore::WalkBuilder::new(root)
121 .hidden(false)
122 .filter_entry(move |e| !is_excluded_dir(e, &extra))
123 .build()
124 .flatten()
125 {
126 let p = entry.path();
127 if p.extension().is_some_and(|e| e == "py" || e == "ipynb") {
128 if let Ok(u) = Utf8PathBuf::from_path_buf(p.to_path_buf()) {
129 out.push(u);
130 }
131 }
132 }
133 out.sort();
134 out
135}
136
137pub fn read_source(path: &Utf8Path) -> Option<String> {
141 let raw = std::fs::read_to_string(path).ok()?;
142 if path.extension() != Some("ipynb") {
143 return Some(raw);
144 }
145 let nb: serde_json::Value = serde_json::from_str(&raw).ok()?;
146 let cells = nb.get("cells")?.as_array()?;
147 let mut src = String::new();
148 for cell in cells {
149 if cell.get("cell_type").and_then(|t| t.as_str()) != Some("code") {
150 continue;
151 }
152 match cell.get("source") {
153 Some(serde_json::Value::Array(lines)) => {
154 for l in lines {
155 if let Some(s) = l.as_str() {
156 let t = s.trim_start();
157 if t.starts_with('%') || t.starts_with('!') {
158 continue;
159 }
160 src.push_str(s);
161 }
162 }
163 src.push('\n');
164 }
165 Some(serde_json::Value::String(s)) => {
166 src.push_str(s);
167 src.push('\n');
168 }
169 _ => {}
170 }
171 }
172 Some(src)
173}
174
175fn dotted_name(root: &Utf8Path, path: &Utf8Path) -> String {
178 let rel = path.strip_prefix(root).unwrap_or(path);
179 let mut rel = rel.to_owned();
180 if rel.starts_with("src") {
182 if let Ok(stripped) = rel.strip_prefix("src") {
183 rel = stripped.to_owned();
184 }
185 }
186 let no_ext = rel.as_str().strip_suffix(".py").unwrap_or(rel.as_str());
187 let no_init = no_ext.strip_suffix("/__init__").unwrap_or(no_ext);
188 no_init.replace('/', ".").trim_matches('.').to_string()
189}
190
191fn is_entry(path: &Utf8Path) -> bool {
192 let name = path.file_name().unwrap_or("");
193 name == "__main__.py"
194 || name == "conftest.py"
195 || name == "setup.py"
196 || name == "__init__.py" || name.starts_with("test_")
198 || name.ends_with("_test.py")
199 || path.extension() == Some("ipynb")
200}
201
202impl ModuleGraph {
203 pub fn build(root: &Utf8Path, files: &[Utf8PathBuf]) -> Self {
205 let parsed: Vec<(Utf8PathBuf, ParsedModule)> = files
207 .par_iter()
208 .filter_map(|p| {
209 let src = read_source(p)?;
210 let mut parser = PyParser::new().ok()?;
211 let pm = parser.parse(p, &src).ok()?;
212 Some((p.clone(), pm))
213 })
214 .collect();
215
216 let mut parsed = parsed;
218 parsed.sort_by(|a, b| a.0.cmp(&b.0));
219
220 let mut modules = Vec::with_capacity(parsed.len());
221 let mut by_dotted = FxHashMap::default();
222 let mut global_dynamic = false;
223 for (i, (path, pm)) in parsed.into_iter().enumerate() {
224 let id = FileId(i as u32);
225 let dotted = dotted_name(root, &path);
226 global_dynamic |= pm.has_dynamic_sink;
227 by_dotted.entry(dotted.clone()).or_insert(id);
228 modules.push(ModuleInfo {
229 id,
230 is_entry: is_entry(&path),
231 path,
232 dotted,
233 parsed: pm,
234 });
235 }
236
237 let mut g = ModuleGraph {
238 modules,
239 by_dotted,
240 edges: Vec::new(),
241 imported_symbols: FxHashMap::default(),
242 reachable: FxHashSet::default(),
243 global_dynamic,
244 };
245 g.resolve_edges();
246 g.compute_reachability();
247 g
248 }
249
250 fn module(&self, id: FileId) -> &ModuleInfo {
251 &self.modules[id.0 as usize]
252 }
253
254 fn resolve_edges(&mut self) {
257 let mut edges = Vec::new();
258 let mut imported_symbols: FxHashMap<FileId, FxHashSet<String>> = FxHashMap::default();
259
260 for m in &self.modules {
261 for imp in &m.parsed.imports {
262 let target_dotted = if imp.relative_dots > 0 {
263 resolve_relative(&m.dotted, imp.relative_dots, &imp.module)
264 } else {
265 imp.module.clone()
266 };
267 if let Some(&tid) = self.lookup(&target_dotted) {
271 edges.push((m.id, tid));
272 let set = imported_symbols.entry(tid).or_default();
273 for n in &imp.names {
274 set.insert(n.clone());
275 }
276 if imp.is_star {
277 set.insert("*".into());
278 }
279 } else if !imp.names.is_empty() {
280 for n in &imp.names {
282 let candidate = format!("{target_dotted}.{n}");
283 if let Some(&tid) = self.lookup(&candidate) {
284 edges.push((m.id, tid));
285 }
286 }
287 }
288 }
289 }
290 edges.sort();
291 edges.dedup();
292 self.edges = edges;
293 self.imported_symbols = imported_symbols;
294 }
295
296 fn lookup(&self, dotted: &str) -> Option<&FileId> {
297 self.by_dotted.get(dotted)
298 }
299
300 fn import_resolves(&self, target: &str, imp: &Import) -> bool {
303 if self.lookup(target).is_some() {
304 return true;
305 }
306 imp.names
307 .iter()
308 .any(|n| self.lookup(&format!("{target}.{n}")).is_some())
309 }
310
311 pub fn unresolved_imports(&self) -> Vec<UnresolvedImport> {
316 let mut first_party: FxHashSet<&str> = FxHashSet::default();
318 for k in self.by_dotted.keys() {
319 if let Some(top) = k.split('.').next() {
320 first_party.insert(top);
321 }
322 }
323 let mut out = Vec::new();
324 for m in &self.modules {
325 for imp in &m.parsed.imports {
326 let relative = imp.relative_dots > 0;
327 let target = if relative {
328 resolve_relative(&m.dotted, imp.relative_dots, &imp.module)
329 } else {
330 imp.module.clone()
331 };
332 if target.is_empty() || self.import_resolves(&target, imp) {
333 continue;
334 }
335 if !relative {
336 let top = target.split('.').next().unwrap_or(&target);
337 if !first_party.contains(top) {
338 continue; }
340 }
341 let display = if relative {
342 format!("{}{}", ".".repeat(imp.relative_dots as usize), imp.module)
343 } else {
344 imp.module.clone()
345 };
346 out.push(UnresolvedImport {
347 importer: m.path.clone(),
348 display,
349 line: imp.line,
350 relative,
351 });
352 }
353 }
354 out.sort_by(|a, b| {
355 a.importer
356 .cmp(&b.importer)
357 .then(a.line.cmp(&b.line))
358 .then(a.display.cmp(&b.display))
359 });
360 out.dedup_by(|a, b| a.importer == b.importer && a.line == b.line && a.display == b.display);
361 out
362 }
363
364 fn compute_reachability(&mut self) {
366 let mut adj: FxHashMap<FileId, Vec<FileId>> = FxHashMap::default();
367 for (a, b) in &self.edges {
368 adj.entry(*a).or_default().push(*b);
369 }
370 let mut queue: Vec<FileId> = self
371 .modules
372 .iter()
373 .filter(|m| m.is_entry)
374 .map(|m| m.id)
375 .collect();
376 let mut seen: FxHashSet<FileId> = queue.iter().copied().collect();
377 while let Some(id) = queue.pop() {
378 if let Some(neighbors) = adj.get(&id) {
379 for &n in neighbors {
380 if seen.insert(n) {
381 queue.push(n);
382 }
383 }
384 }
385 }
386 self.reachable = seen;
387 }
388
389 pub fn unused_files(&self) -> Vec<&ModuleInfo> {
391 self.modules
392 .iter()
393 .filter(|m| !m.is_entry && !self.reachable.contains(&m.id))
394 .collect()
395 }
396
397 pub fn symbol_used(&self, module: FileId, name: &str, defs_named: u32) -> bool {
401 let m = self.module(module);
402 let internal = if m.parsed.has_dynamic_sink {
408 m.parsed.name_counts.get(name).copied().unwrap_or(0) > defs_named
409 } else {
410 m.parsed
411 .module_used
412 .binary_search_by(|s| s.as_str().cmp(name))
413 .is_ok()
414 };
415 if internal {
416 return true;
417 }
418 if let Some(set) = self.imported_symbols.get(&module) {
420 if set.contains(name) || set.contains("*") {
421 return true;
422 }
423 }
424 let importers: Vec<FileId> = self
426 .edges
427 .iter()
428 .filter(|(_, b)| *b == module)
429 .map(|(a, _)| *a)
430 .collect();
431 for imp in importers {
432 let im = self.module(imp);
433 if im.parsed.name_counts.contains_key(name) {
434 return true;
435 }
436 }
437 false
438 }
439
440 pub fn edge_count(&self) -> usize {
441 self.edges.len()
442 }
443
444 pub fn import_edges(&self) -> Vec<(&str, &str)> {
447 self.edges
448 .iter()
449 .map(|(a, b)| {
450 (
451 self.modules[a.0 as usize].dotted.as_str(),
452 self.modules[b.0 as usize].dotted.as_str(),
453 )
454 })
455 .collect()
456 }
457
458 pub fn path_of_dotted(&self, dotted: &str) -> Option<&Utf8Path> {
460 self.modules
461 .iter()
462 .find(|m| m.dotted == dotted)
463 .map(|m| m.path.as_path())
464 }
465
466 pub fn find_cycles(&self) -> Vec<Vec<FileId>> {
470 let n = self.modules.len();
471 let mut adj: Vec<Vec<usize>> = vec![Vec::new(); n];
472 let mut self_loops: Vec<usize> = Vec::new();
473 for (a, b) in &self.edges {
474 if a == b {
475 self_loops.push(a.0 as usize);
476 } else {
477 adj[a.0 as usize].push(b.0 as usize);
478 }
479 }
480
481 let mut index = vec![u32::MAX; n];
483 let mut low = vec![0u32; n];
484 let mut on_stack = vec![false; n];
485 let mut stack: Vec<usize> = Vec::new();
486 let mut idx: u32 = 0;
487 let mut out: Vec<Vec<FileId>> = Vec::new();
488
489 for start in 0..n {
491 if index[start] != u32::MAX {
492 continue;
493 }
494 let mut call: Vec<(usize, usize)> = vec![(start, 0)];
495 while let Some(&(v, ci)) = call.last() {
496 if ci == 0 {
497 index[v] = idx;
498 low[v] = idx;
499 idx += 1;
500 stack.push(v);
501 on_stack[v] = true;
502 }
503 if ci < adj[v].len() {
504 let w = adj[v][ci];
505 call.last_mut().unwrap().1 += 1;
506 if index[w] == u32::MAX {
507 call.push((w, 0));
508 } else if on_stack[w] {
509 low[v] = low[v].min(index[w]);
510 }
511 } else {
512 if low[v] == index[v] {
513 let mut comp = Vec::new();
514 loop {
515 let w = stack.pop().unwrap();
516 on_stack[w] = false;
517 comp.push(FileId(w as u32));
518 if w == v {
519 break;
520 }
521 }
522 if comp.len() > 1 {
523 comp.sort();
524 out.push(comp);
525 }
526 }
527 call.pop();
528 if let Some(&(parent, _)) = call.last() {
529 low[parent] = low[parent].min(low[v]);
530 }
531 }
532 }
533 }
534
535 for s in self_loops {
536 out.push(vec![FileId(s as u32)]);
537 }
538 out.sort();
539 out
540 }
541}
542
543fn resolve_relative(importer_dotted: &str, dots: u8, module: &str) -> String {
546 let parts: Vec<&str> = importer_dotted.split('.').collect();
547 let keep = parts.len().saturating_sub(dots as usize);
550 let base = parts[..keep].join(".");
551 match (base.is_empty(), module.is_empty()) {
552 (true, _) => module.to_string(),
553 (false, true) => base,
554 (false, false) => format!("{base}.{module}"),
555 }
556}
557
558#[cfg(test)]
559mod tests {
560 use super::*;
561
562 fn write(dir: &Utf8Path, rel: &str, src: &str) {
563 let p = dir.join(rel);
564 std::fs::create_dir_all(p.parent().unwrap()).unwrap();
565 std::fs::write(p, src).unwrap();
566 }
567
568 fn temp(tag: &str) -> Utf8PathBuf {
569 let base =
570 std::env::temp_dir().join(format!("mollify-graph-test-{}-{tag}", std::process::id()));
571 let _ = std::fs::remove_dir_all(&base);
572 Utf8PathBuf::from_path_buf(base).unwrap()
573 }
574
575 #[test]
576 fn dotted_name_handles_init_and_src() {
577 let root = Utf8Path::new("/proj");
578 assert_eq!(
579 dotted_name(root, Utf8Path::new("/proj/pkg/mod.py")),
580 "pkg.mod"
581 );
582 assert_eq!(
583 dotted_name(root, Utf8Path::new("/proj/pkg/__init__.py")),
584 "pkg"
585 );
586 assert_eq!(dotted_name(root, Utf8Path::new("/proj/src/a/b.py")), "a.b");
587 }
588
589 #[test]
590 fn relative_import_resolution() {
591 assert_eq!(resolve_relative("a.b.c", 1, "d"), "a.b.d");
592 assert_eq!(resolve_relative("a.b.c", 2, "d"), "a.d");
593 assert_eq!(resolve_relative("a.b.c", 1, ""), "a.b");
594 }
595
596 #[test]
597 fn unused_file_detected() {
598 let d = temp("unused");
599 write(&d, "__main__.py", "from used import helper\nhelper()\n");
600 write(&d, "used.py", "def helper():\n return 1\n");
601 write(&d, "orphan.py", "def never():\n return 2\n");
602 let files = discover_python_files(&d);
603 let g = ModuleGraph::build(&d, &files);
604 let unused: Vec<_> = g.unused_files().iter().map(|m| m.dotted.clone()).collect();
605 assert!(unused.contains(&"orphan".to_string()), "got {unused:?}");
606 assert!(!unused.contains(&"used".to_string()));
607 std::fs::remove_dir_all(&d).ok();
608 }
609
610 #[test]
611 fn reads_notebook_code_cells() {
612 let d = temp("nb");
613 let nb = r##"{"cells":[{"cell_type":"markdown","source":["title"]},{"cell_type":"code","source":["def nb_fn(x):\n"," return x\n"]}]}"##;
614 write(&d, "analysis.ipynb", nb);
615 let files = discover_python_files(&d);
616 assert!(files.iter().any(|f| f.as_str().ends_with("analysis.ipynb")));
617 let g = ModuleGraph::build(&d, &files);
618 let nbmod = g
619 .modules
620 .iter()
621 .find(|m| m.path.as_str().ends_with("analysis.ipynb"))
622 .unwrap();
623 assert!(
624 nbmod.parsed.definitions.iter().any(|x| x.name == "nb_fn"),
625 "notebook code not parsed"
626 );
627 std::fs::remove_dir_all(&d).ok();
628 }
629
630 #[test]
631 fn detects_import_cycle() {
632 let d = temp("cycle");
633 write(
634 &d,
635 "__init__.py",
636 "import a
637import b
638",
639 );
640 write(
641 &d,
642 "a.py",
643 "import b
644",
645 );
646 write(
647 &d,
648 "b.py",
649 "import a
650",
651 );
652 let files = discover_python_files(&d);
653 let g = ModuleGraph::build(&d, &files);
654 let cycles = g.find_cycles();
655 assert!(
656 cycles.iter().any(|c| c.len() == 2),
657 "expected a 2-cycle, got {cycles:?}"
658 );
659 std::fs::remove_dir_all(&d).ok();
660 }
661
662 #[test]
663 fn symbol_use_cross_module() {
664 let d = temp("symuse");
665 write(&d, "__main__.py", "from lib import used_fn\nused_fn()\n");
666 write(
667 &d,
668 "lib.py",
669 "def used_fn():\n return 1\n\ndef dead_fn():\n return 2\n",
670 );
671 let files = discover_python_files(&d);
672 let g = ModuleGraph::build(&d, &files);
673 let lib = g.modules.iter().find(|m| m.dotted == "lib").unwrap().id;
674 assert!(g.symbol_used(lib, "used_fn", 1));
675 assert!(!g.symbol_used(lib, "dead_fn", 1));
676 std::fs::remove_dir_all(&d).ok();
677 }
678
679 #[test]
680 fn discovery_skips_venvs_vcs_and_caches() {
681 let d = temp("excluded-dirs");
682 write(&d, "src/app.py", "def main():\n return 1\n");
683 write(
684 &d,
685 ".venv/lib/python3.12/site-packages/somepkg/__init__.py",
686 "def pkg_fn():\n return 1\n",
687 );
688 write(&d, ".git/hooks/pre-commit.py", "raise SystemExit(0)\n");
689 write(&d, "__pycache__/app.cpython-312.py", "stale_cache = 1\n");
690 write(&d, "node_modules/foo/bar.py", "bar = 1\n");
691 write(&d, "myenv/pyvenv.cfg", "home = /usr/bin\n");
693 write(&d, "myenv/lib/site.py", "site_fn = 1\n");
694
695 let files = discover_python_files(&d);
696 let rel: Vec<String> = files
697 .iter()
698 .map(|f| f.strip_prefix(&d).unwrap().to_string())
699 .collect();
700 assert_eq!(rel, vec!["src/app.py".to_string()], "got {rel:?}");
701 std::fs::remove_dir_all(&d).ok();
702 }
703
704 #[test]
705 fn discovery_honors_extra_excludes() {
706 let d = temp("extra-excludes");
707 write(&d, "src/app.py", "def main():\n return 1\n");
708 write(&d, "vendor/thirdparty.py", "x = 1\n");
709
710 let files = discover_python_files_excluding(&d, &["vendor".to_string()]);
711 let rel: Vec<String> = files
712 .iter()
713 .map(|f| f.strip_prefix(&d).unwrap().to_string())
714 .collect();
715 assert_eq!(rel, vec!["src/app.py".to_string()], "got {rel:?}");
716 std::fs::remove_dir_all(&d).ok();
717 }
718}