thread_flow/incremental/extractors/python.rs
1// SPDX-FileCopyrightText: 2025 Knitli Inc. <knitli@knit.li>
2// SPDX-License-Identifier: AGPL-3.0-or-later
3
4//! Python dependency extractor using tree-sitter queries.
5//!
6//! Extracts `import` and `from ... import` statements from Python source files,
7//! producing [`ImportInfo`] records for the dependency graph. Supports:
8//!
9//! - Absolute imports: `import os`, `import os.path`
10//! - From imports: `from os import path`, `from os.path import join, exists`
11//! - Relative imports: `from .utils import helper`, `from ..core import Engine`
12//! - Wildcard imports: `from module import *`
13//! - Aliased imports: `import numpy as np`, `from os import path as ospath`
14//!
15//! # Examples
16//!
17//! ```rust,ignore
18//! use thread_flow::incremental::extractors::python::PythonDependencyExtractor;
19//! use std::path::Path;
20//!
21//! let extractor = PythonDependencyExtractor::new();
22//! let source = "import os\nfrom pathlib import Path";
23//! let imports = extractor.extract_imports(source, Path::new("main.py")).unwrap();
24//! assert_eq!(imports.len(), 2);
25//! ```
26//!
27//! # Performance
28//!
29//! Target: <5ms per file extraction. Tree-sitter parses the full AST and a
30//! single recursive walk collects all import nodes, avoiding repeated traversals.
31
32use std::path::{Path, PathBuf};
33use thiserror::Error;
34
35/// Errors that can occur during import extraction.
36#[derive(Debug, Error)]
37pub enum ExtractionError {
38 /// The source code could not be parsed by tree-sitter.
39 #[error("failed to parse source: {0}")]
40 ParseError(String),
41
42 /// A tree-sitter query failed to compile.
43 #[error("invalid tree-sitter query: {0}")]
44 QueryError(String),
45
46 /// Module path resolution failed.
47 #[error("cannot resolve module path '{module}' from '{source_file}': {reason}")]
48 ResolutionError {
49 /// The module path that could not be resolved.
50 module: String,
51 /// The source file containing the import.
52 source_file: PathBuf,
53 /// Explanation of why resolution failed.
54 reason: String,
55 },
56}
57
58/// Information extracted from a single Python import statement.
59///
60/// Represents the parsed form of either `import X` or `from X import Y`
61/// statements. The coordinator (Task 3.5) converts these into
62/// [`DependencyEdge`](crate::incremental::types::DependencyEdge) entries.
63#[derive(Debug, Clone, PartialEq, Eq)]
64pub struct ImportInfo {
65 /// The module path, with leading dots stripped for relative imports.
66 ///
67 /// For `import os.path` this is `"os.path"`.
68 /// For `from .utils import helper` this is `"utils"` (dots conveyed by `relative_level`).
69 /// For `from . import x` (no module name), this is `""`.
70 pub module_path: String,
71
72 /// Specific symbols imported from the module.
73 ///
74 /// Empty for bare `import` statements (e.g., `import os`).
75 /// Contains `["join", "exists"]` for `from os.path import join, exists`.
76 pub symbols: Vec<String>,
77
78 /// Whether this is a wildcard import (`from module import *`).
79 pub is_wildcard: bool,
80
81 /// The relative import depth.
82 ///
83 /// `0` for absolute imports, `1` for `.`, `2` for `..`, etc.
84 pub relative_level: usize,
85
86 /// Aliases for imported names.
87 ///
88 /// Maps original name to alias. For `import numpy as np`, contains
89 /// `[("numpy", "np")]`. For `from os import path as ospath`, contains
90 /// `[("path", "ospath")]`.
91 pub aliases: Vec<(String, String)>,
92}
93
94/// Extracts Python import dependencies using tree-sitter AST walking.
95///
96/// Uses tree-sitter's Python grammar to parse import statements without
97/// executing the Python code. Thread-safe and reusable across files.
98///
99/// # Architecture
100///
101/// The extractor operates in two phases:
102/// 1. **Parse**: Tree-sitter parses the source into an AST
103/// 2. **Walk**: Recursive traversal matches `import_statement` and
104/// `import_from_statement` nodes, extracting structured data
105///
106/// Module path resolution (converting `"os.path"` to a filesystem path)
107/// is a separate concern handled by [`resolve_module_path`](Self::resolve_module_path).
108pub struct PythonDependencyExtractor {
109 _private: (),
110}
111
112impl PythonDependencyExtractor {
113 /// Creates a new Python dependency extractor.
114 pub fn new() -> Self {
115 Self { _private: () }
116 }
117
118 /// Extracts all import statements from Python source code.
119 ///
120 /// Parses the source with tree-sitter and walks the AST to find both
121 /// `import_statement` and `import_from_statement` nodes. Imports inside
122 /// function bodies, try/except blocks, and other nested scopes are
123 /// included.
124 ///
125 /// # Arguments
126 ///
127 /// * `source` - Python source code to analyze.
128 /// * `_file_path` - Path of the source file (reserved for future error context).
129 ///
130 /// # Returns
131 ///
132 /// A vector of [`ImportInfo`] records. Bare `import os, sys` statements
133 /// produce one `ImportInfo` per module.
134 ///
135 /// # Errors
136 ///
137 /// Returns [`ExtractionError::ParseError`] if tree-sitter cannot parse
138 /// the source.
139 pub fn extract_imports(
140 &self,
141 source: &str,
142 _file_path: &Path,
143 ) -> Result<Vec<ImportInfo>, ExtractionError> {
144 if source.is_empty() {
145 return Ok(Vec::new());
146 }
147
148 let language = thread_language::parsers::language_python();
149 let mut parser = tree_sitter::Parser::new();
150 parser
151 .set_language(&language)
152 .map_err(|e| ExtractionError::ParseError(e.to_string()))?;
153
154 let tree = parser
155 .parse(source, None)
156 .ok_or_else(|| ExtractionError::ParseError("tree-sitter returned None".into()))?;
157
158 let root = tree.root_node();
159 let mut imports = Vec::new();
160 let src = source.as_bytes();
161
162 Self::walk_imports(root, src, &mut imports);
163
164 Ok(imports)
165 }
166
167 /// Recursively walk the AST collecting import nodes.
168 ///
169 /// Descends into all nodes (including function bodies, try/except blocks)
170 /// to capture conditional and lazy imports.
171 fn walk_imports(node: tree_sitter::Node<'_>, source: &[u8], imports: &mut Vec<ImportInfo>) {
172 match node.kind() {
173 "import_statement" => {
174 Self::extract_import_statement(node, source, imports);
175 return;
176 }
177 "import_from_statement" => {
178 Self::extract_import_from_statement(node, source, imports);
179 return;
180 }
181 _ => {}
182 }
183
184 let mut cursor = node.walk();
185 for child in node.children(&mut cursor) {
186 Self::walk_imports(child, source, imports);
187 }
188 }
189
190 /// Extract from a bare `import` statement.
191 ///
192 /// Handles:
193 /// - `import os` (single module)
194 /// - `import os.path` (dotted module)
195 /// - `import os, sys` (multiple modules produce multiple [`ImportInfo`]s)
196 /// - `import numpy as np` (aliased)
197 fn extract_import_statement(
198 node: tree_sitter::Node<'_>,
199 source: &[u8],
200 imports: &mut Vec<ImportInfo>,
201 ) {
202 let mut cursor = node.walk();
203 for child in node.children(&mut cursor) {
204 match child.kind() {
205 "dotted_name" => {
206 if let Ok(name) = child.utf8_text(source) {
207 imports.push(ImportInfo {
208 module_path: name.to_string(),
209 symbols: Vec::new(),
210 is_wildcard: false,
211 relative_level: 0,
212 aliases: Vec::new(),
213 });
214 }
215 }
216 "aliased_import" => {
217 if let Some(info) = Self::parse_bare_aliased_import(child, source) {
218 imports.push(info);
219 }
220 }
221 _ => {}
222 }
223 }
224 }
225
226 /// Parse an `aliased_import` node inside a bare `import` statement.
227 ///
228 /// For `import numpy as np`, returns module_path="numpy" with alias ("numpy","np").
229 fn parse_bare_aliased_import(node: tree_sitter::Node<'_>, source: &[u8]) -> Option<ImportInfo> {
230 let name_node = node.child_by_field_name("name")?;
231 let alias_node = node.child_by_field_name("alias")?;
232
233 let name = name_node.utf8_text(source).ok()?;
234 let alias = alias_node.utf8_text(source).ok()?;
235
236 Some(ImportInfo {
237 module_path: name.to_string(),
238 symbols: Vec::new(),
239 is_wildcard: false,
240 relative_level: 0,
241 aliases: vec![(name.to_string(), alias.to_string())],
242 })
243 }
244
245 /// Extract from a `from ... import` statement.
246 ///
247 /// Handles all `from` import variants including relative imports,
248 /// wildcard imports, aliased symbols, and parenthesized import lists.
249 fn extract_import_from_statement(
250 node: tree_sitter::Node<'_>,
251 source: &[u8],
252 imports: &mut Vec<ImportInfo>,
253 ) {
254 let mut module_path = String::new();
255 let mut relative_level: usize = 0;
256 let mut symbols: Vec<String> = Vec::new();
257 let mut is_wildcard = false;
258 let mut aliases: Vec<(String, String)> = Vec::new();
259
260 // Track whether we have seen the module name already (before 'import' keyword).
261 // The first dotted_name child is the module; subsequent ones are imported symbols.
262 let mut module_name_found = false;
263
264 let mut cursor = node.walk();
265 for child in node.children(&mut cursor) {
266 match child.kind() {
267 // Relative import: contains import_prefix (dots) + optional dotted_name
268 "relative_import" => {
269 let mut rc = child.walk();
270 for rchild in child.children(&mut rc) {
271 match rchild.kind() {
272 "import_prefix" => {
273 if let Ok(prefix) = rchild.utf8_text(source) {
274 relative_level = prefix.chars().filter(|&c| c == '.').count();
275 }
276 }
277 "dotted_name" => {
278 if let Ok(name) = rchild.utf8_text(source) {
279 module_path = name.to_string();
280 }
281 }
282 _ => {}
283 }
284 }
285 module_name_found = true;
286 }
287 // Absolute module name (first dotted_name before 'import' keyword)
288 // or a bare symbol in the import list (dotted_name after 'import')
289 "dotted_name" => {
290 if !module_name_found {
291 if let Ok(name) = child.utf8_text(source) {
292 module_path = name.to_string();
293 }
294 module_name_found = true;
295 } else {
296 // Imported symbol name
297 if let Ok(name) = child.utf8_text(source) {
298 symbols.push(name.to_string());
299 }
300 }
301 }
302 "wildcard_import" => {
303 is_wildcard = true;
304 }
305 "aliased_import" => {
306 if let Some((sym, al)) = Self::parse_from_aliased_symbol(child, source) {
307 symbols.push(sym.clone());
308 aliases.push((sym, al));
309 }
310 }
311 _ => {}
312 }
313 }
314
315 imports.push(ImportInfo {
316 module_path,
317 symbols,
318 is_wildcard,
319 relative_level,
320 aliases,
321 });
322 }
323
324 /// Parse an aliased import symbol inside a from-import.
325 ///
326 /// For `path as ospath` inside `from os import path as ospath`,
327 /// returns `("path", "ospath")`.
328 fn parse_from_aliased_symbol(
329 node: tree_sitter::Node<'_>,
330 source: &[u8],
331 ) -> Option<(String, String)> {
332 let name_node = node.child_by_field_name("name")?;
333 let alias_node = node.child_by_field_name("alias")?;
334
335 let name = name_node.utf8_text(source).ok()?.to_string();
336 let alias = alias_node.utf8_text(source).ok()?.to_string();
337
338 Some((name, alias))
339 }
340
341 /// Resolves a Python module path to a filesystem path.
342 ///
343 /// For absolute imports (`relative_level == 0`), converts dots to path
344 /// separators and appends `.py`. For relative imports, navigates up from
345 /// the source file's directory according to the dot count.
346 ///
347 /// # Arguments
348 ///
349 /// * `source_file` - The file containing the import statement.
350 /// * `module_path` - The dotted module path (e.g., `"os.path"`, `"utils"`),
351 /// with leading dots already stripped (conveyed via `relative_level`).
352 /// * `relative_level` - The relative import depth (0 for absolute).
353 ///
354 /// # Returns
355 ///
356 /// The resolved filesystem path to the target module.
357 ///
358 /// # Errors
359 ///
360 /// Returns [`ExtractionError::ResolutionError`] if the source file has no
361 /// parent directory, or relative navigation exceeds the filesystem root.
362 pub fn resolve_module_path(
363 &self,
364 source_file: &Path,
365 module_path: &str,
366 relative_level: usize,
367 ) -> Result<PathBuf, ExtractionError> {
368 if relative_level == 0 {
369 // Absolute import: dots become path separators
370 let fs_path = module_path.replace('.', "/");
371 return Ok(PathBuf::from(format!("{fs_path}.py")));
372 }
373
374 // Relative import: navigate up from source file's parent directory
375 let source_dir = source_file
376 .parent()
377 .ok_or_else(|| ExtractionError::ResolutionError {
378 module: module_path.to_string(),
379 source_file: source_file.to_path_buf(),
380 reason: "source file has no parent directory".into(),
381 })?;
382
383 // Level 1 (`.`) stays in the same directory.
384 // Level 2 (`..`) goes one directory up, etc.
385 let mut base = source_dir.to_path_buf();
386 for _ in 1..relative_level {
387 base = base.parent().map(Path::to_path_buf).ok_or_else(|| {
388 ExtractionError::ResolutionError {
389 module: module_path.to_string(),
390 source_file: source_file.to_path_buf(),
391 reason: format!(
392 "cannot navigate {} levels up from {}",
393 relative_level,
394 source_dir.display()
395 ),
396 }
397 })?;
398 }
399
400 if module_path.is_empty() {
401 // `from . import X` resolves to the package __init__.py
402 return Ok(base.join("__init__.py"));
403 }
404
405 let fs_path = module_path.replace('.', "/");
406 Ok(base.join(format!("{fs_path}.py")))
407 }
408
409 /// Extract [`DependencyEdge`] values from a Python source file.
410 ///
411 /// Combines import extraction with path resolution to produce edges
412 /// suitable for the incremental dependency graph. Only resolvable
413 /// relative imports produce edges; absolute imports and unresolvable
414 /// paths are silently skipped.
415 ///
416 /// # Errors
417 ///
418 /// Returns an error if the source file cannot be parsed.
419 pub fn extract_dependency_edges(
420 &self,
421 source: &str,
422 file_path: &Path,
423 ) -> Result<Vec<super::super::types::DependencyEdge>, ExtractionError> {
424 let imports = self.extract_imports(source, file_path)?;
425 let mut edges = Vec::new();
426
427 for import in &imports {
428 // Only create edges for resolvable module paths
429 // External packages and unresolvable paths are silently skipped per design spec
430 if let Ok(resolved) =
431 self.resolve_module_path(file_path, &import.module_path, import.relative_level)
432 {
433 edges.push(super::super::types::DependencyEdge::new(
434 file_path.to_path_buf(),
435 resolved,
436 super::super::types::DependencyType::Import,
437 ));
438 }
439 }
440
441 Ok(edges)
442 }
443}
444
445impl Default for PythonDependencyExtractor {
446 fn default() -> Self {
447 Self::new()
448 }
449}