1use camino::{Utf8Path, Utf8PathBuf};
9use mollify_parse::{Import, ParsedModule, PyParser};
10use rayon::prelude::*;
11use rustc_hash::{FxHashMap, FxHashSet};
12
13#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
15pub struct FileId(pub u32);
16
17#[derive(Debug, Clone)]
19pub struct ModuleInfo {
20 pub id: FileId,
21 pub path: Utf8PathBuf,
22 pub dotted: String,
24 pub parsed: ParsedModule,
25 pub is_entry: bool,
27}
28
29#[derive(Debug, Clone)]
31pub struct UnresolvedImport {
32 pub importer: Utf8PathBuf,
34 pub display: String,
36 pub line: u32,
37 pub relative: bool,
39}
40
41pub struct ModuleGraph {
43 pub modules: Vec<ModuleInfo>,
44 by_dotted: FxHashMap<String, FileId>,
45 edges: Vec<(FileId, FileId)>,
47 imported_symbols: FxHashMap<FileId, FxHashSet<String>>,
50 reachable: FxHashSet<FileId>,
51 pub global_dynamic: bool,
53}
54
55pub fn discover_python_files(root: &Utf8Path) -> Vec<Utf8PathBuf> {
58 let mut out = Vec::new();
59 for entry in ignore::WalkBuilder::new(root)
60 .hidden(false)
61 .build()
62 .flatten()
63 {
64 let p = entry.path();
65 if p.extension().is_some_and(|e| e == "py" || e == "ipynb") {
66 if let Ok(u) = Utf8PathBuf::from_path_buf(p.to_path_buf()) {
67 out.push(u);
68 }
69 }
70 }
71 out.sort();
72 out
73}
74
75pub fn read_source(path: &Utf8Path) -> Option<String> {
79 let raw = std::fs::read_to_string(path).ok()?;
80 if path.extension() != Some("ipynb") {
81 return Some(raw);
82 }
83 let nb: serde_json::Value = serde_json::from_str(&raw).ok()?;
84 let cells = nb.get("cells")?.as_array()?;
85 let mut src = String::new();
86 for cell in cells {
87 if cell.get("cell_type").and_then(|t| t.as_str()) != Some("code") {
88 continue;
89 }
90 match cell.get("source") {
91 Some(serde_json::Value::Array(lines)) => {
92 for l in lines {
93 if let Some(s) = l.as_str() {
94 let t = s.trim_start();
95 if t.starts_with('%') || t.starts_with('!') {
96 continue;
97 }
98 src.push_str(s);
99 }
100 }
101 src.push('\n');
102 }
103 Some(serde_json::Value::String(s)) => {
104 src.push_str(s);
105 src.push('\n');
106 }
107 _ => {}
108 }
109 }
110 Some(src)
111}
112
113fn dotted_name(root: &Utf8Path, path: &Utf8Path) -> String {
116 let rel = path.strip_prefix(root).unwrap_or(path);
117 let mut rel = rel.to_owned();
118 if rel.starts_with("src") {
120 if let Ok(stripped) = rel.strip_prefix("src") {
121 rel = stripped.to_owned();
122 }
123 }
124 let no_ext = rel.as_str().strip_suffix(".py").unwrap_or(rel.as_str());
125 let no_init = no_ext.strip_suffix("/__init__").unwrap_or(no_ext);
126 no_init.replace('/', ".").trim_matches('.').to_string()
127}
128
129fn is_entry(path: &Utf8Path) -> bool {
130 let name = path.file_name().unwrap_or("");
131 name == "__main__.py"
132 || name == "conftest.py"
133 || name == "setup.py"
134 || name == "__init__.py" || name.starts_with("test_")
136 || name.ends_with("_test.py")
137 || path.extension() == Some("ipynb")
138}
139
140impl ModuleGraph {
141 pub fn build(root: &Utf8Path, files: &[Utf8PathBuf]) -> Self {
143 let parsed: Vec<(Utf8PathBuf, ParsedModule)> = files
145 .par_iter()
146 .filter_map(|p| {
147 let src = read_source(p)?;
148 let mut parser = PyParser::new().ok()?;
149 let pm = parser.parse(p, &src).ok()?;
150 Some((p.clone(), pm))
151 })
152 .collect();
153
154 let mut parsed = parsed;
156 parsed.sort_by(|a, b| a.0.cmp(&b.0));
157
158 let mut modules = Vec::with_capacity(parsed.len());
159 let mut by_dotted = FxHashMap::default();
160 let mut global_dynamic = false;
161 for (i, (path, pm)) in parsed.into_iter().enumerate() {
162 let id = FileId(i as u32);
163 let dotted = dotted_name(root, &path);
164 global_dynamic |= pm.has_dynamic_sink;
165 by_dotted.entry(dotted.clone()).or_insert(id);
166 modules.push(ModuleInfo {
167 id,
168 is_entry: is_entry(&path),
169 path,
170 dotted,
171 parsed: pm,
172 });
173 }
174
175 let mut g = ModuleGraph {
176 modules,
177 by_dotted,
178 edges: Vec::new(),
179 imported_symbols: FxHashMap::default(),
180 reachable: FxHashSet::default(),
181 global_dynamic,
182 };
183 g.resolve_edges();
184 g.compute_reachability();
185 g
186 }
187
188 fn module(&self, id: FileId) -> &ModuleInfo {
189 &self.modules[id.0 as usize]
190 }
191
192 fn resolve_edges(&mut self) {
195 let mut edges = Vec::new();
196 let mut imported_symbols: FxHashMap<FileId, FxHashSet<String>> = FxHashMap::default();
197
198 for m in &self.modules {
199 for imp in &m.parsed.imports {
200 let target_dotted = if imp.relative_dots > 0 {
201 resolve_relative(&m.dotted, imp.relative_dots, &imp.module)
202 } else {
203 imp.module.clone()
204 };
205 if let Some(&tid) = self.lookup(&target_dotted) {
209 edges.push((m.id, tid));
210 let set = imported_symbols.entry(tid).or_default();
211 for n in &imp.names {
212 set.insert(n.clone());
213 }
214 if imp.is_star {
215 set.insert("*".into());
216 }
217 } else if !imp.names.is_empty() {
218 for n in &imp.names {
220 let candidate = format!("{target_dotted}.{n}");
221 if let Some(&tid) = self.lookup(&candidate) {
222 edges.push((m.id, tid));
223 }
224 }
225 }
226 }
227 }
228 edges.sort();
229 edges.dedup();
230 self.edges = edges;
231 self.imported_symbols = imported_symbols;
232 }
233
234 fn lookup(&self, dotted: &str) -> Option<&FileId> {
235 self.by_dotted.get(dotted)
236 }
237
238 fn import_resolves(&self, target: &str, imp: &Import) -> bool {
241 if self.lookup(target).is_some() {
242 return true;
243 }
244 imp.names
245 .iter()
246 .any(|n| self.lookup(&format!("{target}.{n}")).is_some())
247 }
248
249 pub fn unresolved_imports(&self) -> Vec<UnresolvedImport> {
254 let mut first_party: FxHashSet<&str> = FxHashSet::default();
256 for k in self.by_dotted.keys() {
257 if let Some(top) = k.split('.').next() {
258 first_party.insert(top);
259 }
260 }
261 let mut out = Vec::new();
262 for m in &self.modules {
263 for imp in &m.parsed.imports {
264 let relative = imp.relative_dots > 0;
265 let target = if relative {
266 resolve_relative(&m.dotted, imp.relative_dots, &imp.module)
267 } else {
268 imp.module.clone()
269 };
270 if target.is_empty() || self.import_resolves(&target, imp) {
271 continue;
272 }
273 if !relative {
274 let top = target.split('.').next().unwrap_or(&target);
275 if !first_party.contains(top) {
276 continue; }
278 }
279 let display = if relative {
280 format!("{}{}", ".".repeat(imp.relative_dots as usize), imp.module)
281 } else {
282 imp.module.clone()
283 };
284 out.push(UnresolvedImport {
285 importer: m.path.clone(),
286 display,
287 line: imp.line,
288 relative,
289 });
290 }
291 }
292 out.sort_by(|a, b| {
293 a.importer
294 .cmp(&b.importer)
295 .then(a.line.cmp(&b.line))
296 .then(a.display.cmp(&b.display))
297 });
298 out.dedup_by(|a, b| a.importer == b.importer && a.line == b.line && a.display == b.display);
299 out
300 }
301
302 fn compute_reachability(&mut self) {
304 let mut adj: FxHashMap<FileId, Vec<FileId>> = FxHashMap::default();
305 for (a, b) in &self.edges {
306 adj.entry(*a).or_default().push(*b);
307 }
308 let mut queue: Vec<FileId> = self
309 .modules
310 .iter()
311 .filter(|m| m.is_entry)
312 .map(|m| m.id)
313 .collect();
314 let mut seen: FxHashSet<FileId> = queue.iter().copied().collect();
315 while let Some(id) = queue.pop() {
316 if let Some(neighbors) = adj.get(&id) {
317 for &n in neighbors {
318 if seen.insert(n) {
319 queue.push(n);
320 }
321 }
322 }
323 }
324 self.reachable = seen;
325 }
326
327 pub fn unused_files(&self) -> Vec<&ModuleInfo> {
329 self.modules
330 .iter()
331 .filter(|m| !m.is_entry && !self.reachable.contains(&m.id))
332 .collect()
333 }
334
335 pub fn symbol_used(&self, module: FileId, name: &str, defs_named: u32) -> bool {
339 let m = self.module(module);
340 let internal = if m.parsed.has_dynamic_sink {
346 m.parsed.name_counts.get(name).copied().unwrap_or(0) > defs_named
347 } else {
348 m.parsed
349 .module_used
350 .binary_search_by(|s| s.as_str().cmp(name))
351 .is_ok()
352 };
353 if internal {
354 return true;
355 }
356 if let Some(set) = self.imported_symbols.get(&module) {
358 if set.contains(name) || set.contains("*") {
359 return true;
360 }
361 }
362 let importers: Vec<FileId> = self
364 .edges
365 .iter()
366 .filter(|(_, b)| *b == module)
367 .map(|(a, _)| *a)
368 .collect();
369 for imp in importers {
370 let im = self.module(imp);
371 if im.parsed.name_counts.contains_key(name) {
372 return true;
373 }
374 }
375 false
376 }
377
378 pub fn edge_count(&self) -> usize {
379 self.edges.len()
380 }
381
382 pub fn import_edges(&self) -> Vec<(&str, &str)> {
385 self.edges
386 .iter()
387 .map(|(a, b)| {
388 (
389 self.modules[a.0 as usize].dotted.as_str(),
390 self.modules[b.0 as usize].dotted.as_str(),
391 )
392 })
393 .collect()
394 }
395
396 pub fn path_of_dotted(&self, dotted: &str) -> Option<&Utf8Path> {
398 self.modules
399 .iter()
400 .find(|m| m.dotted == dotted)
401 .map(|m| m.path.as_path())
402 }
403
404 pub fn find_cycles(&self) -> Vec<Vec<FileId>> {
408 let n = self.modules.len();
409 let mut adj: Vec<Vec<usize>> = vec![Vec::new(); n];
410 let mut self_loops: Vec<usize> = Vec::new();
411 for (a, b) in &self.edges {
412 if a == b {
413 self_loops.push(a.0 as usize);
414 } else {
415 adj[a.0 as usize].push(b.0 as usize);
416 }
417 }
418
419 let mut index = vec![u32::MAX; n];
421 let mut low = vec![0u32; n];
422 let mut on_stack = vec![false; n];
423 let mut stack: Vec<usize> = Vec::new();
424 let mut idx: u32 = 0;
425 let mut out: Vec<Vec<FileId>> = Vec::new();
426
427 for start in 0..n {
429 if index[start] != u32::MAX {
430 continue;
431 }
432 let mut call: Vec<(usize, usize)> = vec![(start, 0)];
433 while let Some(&(v, ci)) = call.last() {
434 if ci == 0 {
435 index[v] = idx;
436 low[v] = idx;
437 idx += 1;
438 stack.push(v);
439 on_stack[v] = true;
440 }
441 if ci < adj[v].len() {
442 let w = adj[v][ci];
443 call.last_mut().unwrap().1 += 1;
444 if index[w] == u32::MAX {
445 call.push((w, 0));
446 } else if on_stack[w] {
447 low[v] = low[v].min(index[w]);
448 }
449 } else {
450 if low[v] == index[v] {
451 let mut comp = Vec::new();
452 loop {
453 let w = stack.pop().unwrap();
454 on_stack[w] = false;
455 comp.push(FileId(w as u32));
456 if w == v {
457 break;
458 }
459 }
460 if comp.len() > 1 {
461 comp.sort();
462 out.push(comp);
463 }
464 }
465 call.pop();
466 if let Some(&(parent, _)) = call.last() {
467 low[parent] = low[parent].min(low[v]);
468 }
469 }
470 }
471 }
472
473 for s in self_loops {
474 out.push(vec![FileId(s as u32)]);
475 }
476 out.sort();
477 out
478 }
479}
480
481fn resolve_relative(importer_dotted: &str, dots: u8, module: &str) -> String {
484 let parts: Vec<&str> = importer_dotted.split('.').collect();
485 let keep = parts.len().saturating_sub(dots as usize);
488 let base = parts[..keep].join(".");
489 match (base.is_empty(), module.is_empty()) {
490 (true, _) => module.to_string(),
491 (false, true) => base,
492 (false, false) => format!("{base}.{module}"),
493 }
494}
495
496#[cfg(test)]
497mod tests {
498 use super::*;
499
500 fn write(dir: &Utf8Path, rel: &str, src: &str) {
501 let p = dir.join(rel);
502 std::fs::create_dir_all(p.parent().unwrap()).unwrap();
503 std::fs::write(p, src).unwrap();
504 }
505
506 fn temp(tag: &str) -> Utf8PathBuf {
507 let base =
508 std::env::temp_dir().join(format!("mollify-graph-test-{}-{tag}", std::process::id()));
509 let _ = std::fs::remove_dir_all(&base);
510 Utf8PathBuf::from_path_buf(base).unwrap()
511 }
512
513 #[test]
514 fn dotted_name_handles_init_and_src() {
515 let root = Utf8Path::new("/proj");
516 assert_eq!(
517 dotted_name(root, Utf8Path::new("/proj/pkg/mod.py")),
518 "pkg.mod"
519 );
520 assert_eq!(
521 dotted_name(root, Utf8Path::new("/proj/pkg/__init__.py")),
522 "pkg"
523 );
524 assert_eq!(dotted_name(root, Utf8Path::new("/proj/src/a/b.py")), "a.b");
525 }
526
527 #[test]
528 fn relative_import_resolution() {
529 assert_eq!(resolve_relative("a.b.c", 1, "d"), "a.b.d");
530 assert_eq!(resolve_relative("a.b.c", 2, "d"), "a.d");
531 assert_eq!(resolve_relative("a.b.c", 1, ""), "a.b");
532 }
533
534 #[test]
535 fn unused_file_detected() {
536 let d = temp("unused");
537 write(&d, "__main__.py", "from used import helper\nhelper()\n");
538 write(&d, "used.py", "def helper():\n return 1\n");
539 write(&d, "orphan.py", "def never():\n return 2\n");
540 let files = discover_python_files(&d);
541 let g = ModuleGraph::build(&d, &files);
542 let unused: Vec<_> = g.unused_files().iter().map(|m| m.dotted.clone()).collect();
543 assert!(unused.contains(&"orphan".to_string()), "got {unused:?}");
544 assert!(!unused.contains(&"used".to_string()));
545 std::fs::remove_dir_all(&d).ok();
546 }
547
548 #[test]
549 fn reads_notebook_code_cells() {
550 let d = temp("nb");
551 let nb = r##"{"cells":[{"cell_type":"markdown","source":["title"]},{"cell_type":"code","source":["def nb_fn(x):\n"," return x\n"]}]}"##;
552 write(&d, "analysis.ipynb", nb);
553 let files = discover_python_files(&d);
554 assert!(files.iter().any(|f| f.as_str().ends_with("analysis.ipynb")));
555 let g = ModuleGraph::build(&d, &files);
556 let nbmod = g
557 .modules
558 .iter()
559 .find(|m| m.path.as_str().ends_with("analysis.ipynb"))
560 .unwrap();
561 assert!(
562 nbmod.parsed.definitions.iter().any(|x| x.name == "nb_fn"),
563 "notebook code not parsed"
564 );
565 std::fs::remove_dir_all(&d).ok();
566 }
567
568 #[test]
569 fn detects_import_cycle() {
570 let d = temp("cycle");
571 write(
572 &d,
573 "__init__.py",
574 "import a
575import b
576",
577 );
578 write(
579 &d,
580 "a.py",
581 "import b
582",
583 );
584 write(
585 &d,
586 "b.py",
587 "import a
588",
589 );
590 let files = discover_python_files(&d);
591 let g = ModuleGraph::build(&d, &files);
592 let cycles = g.find_cycles();
593 assert!(
594 cycles.iter().any(|c| c.len() == 2),
595 "expected a 2-cycle, got {cycles:?}"
596 );
597 std::fs::remove_dir_all(&d).ok();
598 }
599
600 #[test]
601 fn symbol_use_cross_module() {
602 let d = temp("symuse");
603 write(&d, "__main__.py", "from lib import used_fn\nused_fn()\n");
604 write(
605 &d,
606 "lib.py",
607 "def used_fn():\n return 1\n\ndef dead_fn():\n return 2\n",
608 );
609 let files = discover_python_files(&d);
610 let g = ModuleGraph::build(&d, &files);
611 let lib = g.modules.iter().find(|m| m.dotted == "lib").unwrap().id;
612 assert!(g.symbol_used(lib, "used_fn", 1));
613 assert!(!g.symbol_used(lib, "dead_fn", 1));
614 std::fs::remove_dir_all(&d).ok();
615 }
616}