Skip to main content

thread_flow/incremental/extractors/
python.rs

1// SPDX-FileCopyrightText: 2025 Knitli Inc. <knitli@knit.li>
2// SPDX-License-Identifier: AGPL-3.0-or-later
3
4//! Python dependency extractor using tree-sitter queries.
5//!
6//! Extracts `import` and `from ... import` statements from Python source files,
7//! producing [`ImportInfo`] records for the dependency graph. Supports:
8//!
9//! - Absolute imports: `import os`, `import os.path`
10//! - From imports: `from os import path`, `from os.path import join, exists`
11//! - Relative imports: `from .utils import helper`, `from ..core import Engine`
12//! - Wildcard imports: `from module import *`
13//! - Aliased imports: `import numpy as np`, `from os import path as ospath`
14//!
15//! # Examples
16//!
17//! ```rust,ignore
18//! use thread_flow::incremental::extractors::python::PythonDependencyExtractor;
19//! use std::path::Path;
20//!
21//! let extractor = PythonDependencyExtractor::new();
22//! let source = "import os\nfrom pathlib import Path";
23//! let imports = extractor.extract_imports(source, Path::new("main.py")).unwrap();
24//! assert_eq!(imports.len(), 2);
25//! ```
26//!
27//! # Performance
28//!
29//! Target: <5ms per file extraction. Tree-sitter parses the full AST and a
30//! single recursive walk collects all import nodes, avoiding repeated traversals.
31
32use std::path::{Path, PathBuf};
33use thiserror::Error;
34
35/// Errors that can occur during import extraction.
36#[derive(Debug, Error)]
37pub enum ExtractionError {
38    /// The source code could not be parsed by tree-sitter.
39    #[error("failed to parse source: {0}")]
40    ParseError(String),
41
42    /// A tree-sitter query failed to compile.
43    #[error("invalid tree-sitter query: {0}")]
44    QueryError(String),
45
46    /// Module path resolution failed.
47    #[error("cannot resolve module path '{module}' from '{source_file}': {reason}")]
48    ResolutionError {
49        /// The module path that could not be resolved.
50        module: String,
51        /// The source file containing the import.
52        source_file: PathBuf,
53        /// Explanation of why resolution failed.
54        reason: String,
55    },
56}
57
58/// Information extracted from a single Python import statement.
59///
60/// Represents the parsed form of either `import X` or `from X import Y`
61/// statements. The coordinator (Task 3.5) converts these into
62/// [`DependencyEdge`](crate::incremental::types::DependencyEdge) entries.
63#[derive(Debug, Clone, PartialEq, Eq)]
64pub struct ImportInfo {
65    /// The module path, with leading dots stripped for relative imports.
66    ///
67    /// For `import os.path` this is `"os.path"`.
68    /// For `from .utils import helper` this is `"utils"` (dots conveyed by `relative_level`).
69    /// For `from . import x` (no module name), this is `""`.
70    pub module_path: String,
71
72    /// Specific symbols imported from the module.
73    ///
74    /// Empty for bare `import` statements (e.g., `import os`).
75    /// Contains `["join", "exists"]` for `from os.path import join, exists`.
76    pub symbols: Vec<String>,
77
78    /// Whether this is a wildcard import (`from module import *`).
79    pub is_wildcard: bool,
80
81    /// The relative import depth.
82    ///
83    /// `0` for absolute imports, `1` for `.`, `2` for `..`, etc.
84    pub relative_level: usize,
85
86    /// Aliases for imported names.
87    ///
88    /// Maps original name to alias. For `import numpy as np`, contains
89    /// `[("numpy", "np")]`. For `from os import path as ospath`, contains
90    /// `[("path", "ospath")]`.
91    pub aliases: Vec<(String, String)>,
92}
93
94/// Extracts Python import dependencies using tree-sitter AST walking.
95///
96/// Uses tree-sitter's Python grammar to parse import statements without
97/// executing the Python code. Thread-safe and reusable across files.
98///
99/// # Architecture
100///
101/// The extractor operates in two phases:
102/// 1. **Parse**: Tree-sitter parses the source into an AST
103/// 2. **Walk**: Recursive traversal matches `import_statement` and
104///    `import_from_statement` nodes, extracting structured data
105///
106/// Module path resolution (converting `"os.path"` to a filesystem path)
107/// is a separate concern handled by [`resolve_module_path`](Self::resolve_module_path).
108pub struct PythonDependencyExtractor {
109    _private: (),
110}
111
112impl PythonDependencyExtractor {
113    /// Creates a new Python dependency extractor.
114    pub fn new() -> Self {
115        Self { _private: () }
116    }
117
118    /// Extracts all import statements from Python source code.
119    ///
120    /// Parses the source with tree-sitter and walks the AST to find both
121    /// `import_statement` and `import_from_statement` nodes. Imports inside
122    /// function bodies, try/except blocks, and other nested scopes are
123    /// included.
124    ///
125    /// # Arguments
126    ///
127    /// * `source` - Python source code to analyze.
128    /// * `_file_path` - Path of the source file (reserved for future error context).
129    ///
130    /// # Returns
131    ///
132    /// A vector of [`ImportInfo`] records. Bare `import os, sys` statements
133    /// produce one `ImportInfo` per module.
134    ///
135    /// # Errors
136    ///
137    /// Returns [`ExtractionError::ParseError`] if tree-sitter cannot parse
138    /// the source.
139    pub fn extract_imports(
140        &self,
141        source: &str,
142        _file_path: &Path,
143    ) -> Result<Vec<ImportInfo>, ExtractionError> {
144        if source.is_empty() {
145            return Ok(Vec::new());
146        }
147
148        let language = thread_language::parsers::language_python();
149        let mut parser = tree_sitter::Parser::new();
150        parser
151            .set_language(&language)
152            .map_err(|e| ExtractionError::ParseError(e.to_string()))?;
153
154        let tree = parser
155            .parse(source, None)
156            .ok_or_else(|| ExtractionError::ParseError("tree-sitter returned None".into()))?;
157
158        let root = tree.root_node();
159        let mut imports = Vec::new();
160        let src = source.as_bytes();
161
162        Self::walk_imports(root, src, &mut imports);
163
164        Ok(imports)
165    }
166
167    /// Recursively walk the AST collecting import nodes.
168    ///
169    /// Descends into all nodes (including function bodies, try/except blocks)
170    /// to capture conditional and lazy imports.
171    fn walk_imports(node: tree_sitter::Node<'_>, source: &[u8], imports: &mut Vec<ImportInfo>) {
172        match node.kind() {
173            "import_statement" => {
174                Self::extract_import_statement(node, source, imports);
175                return;
176            }
177            "import_from_statement" => {
178                Self::extract_import_from_statement(node, source, imports);
179                return;
180            }
181            _ => {}
182        }
183
184        let mut cursor = node.walk();
185        for child in node.children(&mut cursor) {
186            Self::walk_imports(child, source, imports);
187        }
188    }
189
190    /// Extract from a bare `import` statement.
191    ///
192    /// Handles:
193    /// - `import os` (single module)
194    /// - `import os.path` (dotted module)
195    /// - `import os, sys` (multiple modules produce multiple [`ImportInfo`]s)
196    /// - `import numpy as np` (aliased)
197    fn extract_import_statement(
198        node: tree_sitter::Node<'_>,
199        source: &[u8],
200        imports: &mut Vec<ImportInfo>,
201    ) {
202        let mut cursor = node.walk();
203        for child in node.children(&mut cursor) {
204            match child.kind() {
205                "dotted_name" => {
206                    if let Ok(name) = child.utf8_text(source) {
207                        imports.push(ImportInfo {
208                            module_path: name.to_string(),
209                            symbols: Vec::new(),
210                            is_wildcard: false,
211                            relative_level: 0,
212                            aliases: Vec::new(),
213                        });
214                    }
215                }
216                "aliased_import" => {
217                    if let Some(info) = Self::parse_bare_aliased_import(child, source) {
218                        imports.push(info);
219                    }
220                }
221                _ => {}
222            }
223        }
224    }
225
226    /// Parse an `aliased_import` node inside a bare `import` statement.
227    ///
228    /// For `import numpy as np`, returns module_path="numpy" with alias ("numpy","np").
229    fn parse_bare_aliased_import(node: tree_sitter::Node<'_>, source: &[u8]) -> Option<ImportInfo> {
230        let name_node = node.child_by_field_name("name")?;
231        let alias_node = node.child_by_field_name("alias")?;
232
233        let name = name_node.utf8_text(source).ok()?;
234        let alias = alias_node.utf8_text(source).ok()?;
235
236        Some(ImportInfo {
237            module_path: name.to_string(),
238            symbols: Vec::new(),
239            is_wildcard: false,
240            relative_level: 0,
241            aliases: vec![(name.to_string(), alias.to_string())],
242        })
243    }
244
245    /// Extract from a `from ... import` statement.
246    ///
247    /// Handles all `from` import variants including relative imports,
248    /// wildcard imports, aliased symbols, and parenthesized import lists.
249    fn extract_import_from_statement(
250        node: tree_sitter::Node<'_>,
251        source: &[u8],
252        imports: &mut Vec<ImportInfo>,
253    ) {
254        let mut module_path = String::new();
255        let mut relative_level: usize = 0;
256        let mut symbols: Vec<String> = Vec::new();
257        let mut is_wildcard = false;
258        let mut aliases: Vec<(String, String)> = Vec::new();
259
260        // Track whether we have seen the module name already (before 'import' keyword).
261        // The first dotted_name child is the module; subsequent ones are imported symbols.
262        let mut module_name_found = false;
263
264        let mut cursor = node.walk();
265        for child in node.children(&mut cursor) {
266            match child.kind() {
267                // Relative import: contains import_prefix (dots) + optional dotted_name
268                "relative_import" => {
269                    let mut rc = child.walk();
270                    for rchild in child.children(&mut rc) {
271                        match rchild.kind() {
272                            "import_prefix" => {
273                                if let Ok(prefix) = rchild.utf8_text(source) {
274                                    relative_level = prefix.chars().filter(|&c| c == '.').count();
275                                }
276                            }
277                            "dotted_name" => {
278                                if let Ok(name) = rchild.utf8_text(source) {
279                                    module_path = name.to_string();
280                                }
281                            }
282                            _ => {}
283                        }
284                    }
285                    module_name_found = true;
286                }
287                // Absolute module name (first dotted_name before 'import' keyword)
288                // or a bare symbol in the import list (dotted_name after 'import')
289                "dotted_name" => {
290                    if !module_name_found {
291                        if let Ok(name) = child.utf8_text(source) {
292                            module_path = name.to_string();
293                        }
294                        module_name_found = true;
295                    } else {
296                        // Imported symbol name
297                        if let Ok(name) = child.utf8_text(source) {
298                            symbols.push(name.to_string());
299                        }
300                    }
301                }
302                "wildcard_import" => {
303                    is_wildcard = true;
304                }
305                "aliased_import" => {
306                    if let Some((sym, al)) = Self::parse_from_aliased_symbol(child, source) {
307                        symbols.push(sym.clone());
308                        aliases.push((sym, al));
309                    }
310                }
311                _ => {}
312            }
313        }
314
315        imports.push(ImportInfo {
316            module_path,
317            symbols,
318            is_wildcard,
319            relative_level,
320            aliases,
321        });
322    }
323
324    /// Parse an aliased import symbol inside a from-import.
325    ///
326    /// For `path as ospath` inside `from os import path as ospath`,
327    /// returns `("path", "ospath")`.
328    fn parse_from_aliased_symbol(
329        node: tree_sitter::Node<'_>,
330        source: &[u8],
331    ) -> Option<(String, String)> {
332        let name_node = node.child_by_field_name("name")?;
333        let alias_node = node.child_by_field_name("alias")?;
334
335        let name = name_node.utf8_text(source).ok()?.to_string();
336        let alias = alias_node.utf8_text(source).ok()?.to_string();
337
338        Some((name, alias))
339    }
340
341    /// Resolves a Python module path to a filesystem path.
342    ///
343    /// For absolute imports (`relative_level == 0`), converts dots to path
344    /// separators and appends `.py`. For relative imports, navigates up from
345    /// the source file's directory according to the dot count.
346    ///
347    /// # Arguments
348    ///
349    /// * `source_file` - The file containing the import statement.
350    /// * `module_path` - The dotted module path (e.g., `"os.path"`, `"utils"`),
351    ///   with leading dots already stripped (conveyed via `relative_level`).
352    /// * `relative_level` - The relative import depth (0 for absolute).
353    ///
354    /// # Returns
355    ///
356    /// The resolved filesystem path to the target module.
357    ///
358    /// # Errors
359    ///
360    /// Returns [`ExtractionError::ResolutionError`] if the source file has no
361    /// parent directory, or relative navigation exceeds the filesystem root.
362    pub fn resolve_module_path(
363        &self,
364        source_file: &Path,
365        module_path: &str,
366        relative_level: usize,
367    ) -> Result<PathBuf, ExtractionError> {
368        if relative_level == 0 {
369            // Absolute import: dots become path separators
370            let fs_path = module_path.replace('.', "/");
371            return Ok(PathBuf::from(format!("{fs_path}.py")));
372        }
373
374        // Relative import: navigate up from source file's parent directory
375        let source_dir = source_file
376            .parent()
377            .ok_or_else(|| ExtractionError::ResolutionError {
378                module: module_path.to_string(),
379                source_file: source_file.to_path_buf(),
380                reason: "source file has no parent directory".into(),
381            })?;
382
383        // Level 1 (`.`) stays in the same directory.
384        // Level 2 (`..`) goes one directory up, etc.
385        let mut base = source_dir.to_path_buf();
386        for _ in 1..relative_level {
387            base = base.parent().map(Path::to_path_buf).ok_or_else(|| {
388                ExtractionError::ResolutionError {
389                    module: module_path.to_string(),
390                    source_file: source_file.to_path_buf(),
391                    reason: format!(
392                        "cannot navigate {} levels up from {}",
393                        relative_level,
394                        source_dir.display()
395                    ),
396                }
397            })?;
398        }
399
400        if module_path.is_empty() {
401            // `from . import X` resolves to the package __init__.py
402            return Ok(base.join("__init__.py"));
403        }
404
405        let fs_path = module_path.replace('.', "/");
406        Ok(base.join(format!("{fs_path}.py")))
407    }
408
409    /// Extract [`DependencyEdge`] values from a Python source file.
410    ///
411    /// Combines import extraction with path resolution to produce edges
412    /// suitable for the incremental dependency graph. Only resolvable
413    /// relative imports produce edges; absolute imports and unresolvable
414    /// paths are silently skipped.
415    ///
416    /// # Errors
417    ///
418    /// Returns an error if the source file cannot be parsed.
419    pub fn extract_dependency_edges(
420        &self,
421        source: &str,
422        file_path: &Path,
423    ) -> Result<Vec<super::super::types::DependencyEdge>, ExtractionError> {
424        let imports = self.extract_imports(source, file_path)?;
425        let mut edges = Vec::new();
426
427        for import in &imports {
428            // Only create edges for resolvable module paths
429            // External packages and unresolvable paths are silently skipped per design spec
430            if let Ok(resolved) =
431                self.resolve_module_path(file_path, &import.module_path, import.relative_level)
432            {
433                edges.push(super::super::types::DependencyEdge::new(
434                    file_path.to_path_buf(),
435                    resolved,
436                    super::super::types::DependencyType::Import,
437                ));
438            }
439        }
440
441        Ok(edges)
442    }
443}
444
445impl Default for PythonDependencyExtractor {
446    fn default() -> Self {
447        Self::new()
448    }
449}