Skip to main content

portalis_ingest/
project_parser.rs

//! Project-Level Parser for Multi-File Python Projects
//!
//! Handles parsing entire Python projects with multiple modules and dependencies.
4
5use crate::{enhanced_parser::EnhancedParser, PythonAst};
6use portalis_core::{Error, Result};
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use serde::{Deserialize, Serialize};
10
11/// Represents a complete Python project
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct PythonProject {
14    pub root_path: PathBuf,
15    pub modules: HashMap<String, PythonModule>,
16    pub dependency_graph: DependencyGraph,
17}
18
19/// Represents a single Python module/file
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct PythonModule {
22    pub name: String,
23    pub path: PathBuf,
24    pub ast: PythonAst,
25    pub imports: Vec<ImportStatement>,
26}
27
28/// Import statement information
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct ImportStatement {
31    pub module: String,
32    pub items: Vec<String>,
33    pub alias: Option<String>,
34    pub is_relative: bool,
35}
36
37/// Dependency graph for module ordering
38#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct DependencyGraph {
40    pub nodes: HashMap<String, ModuleNode>,
41    pub edges: Vec<(String, String)>,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
45pub struct ModuleNode {
46    pub name: String,
47    pub dependencies: Vec<String>,
48    pub dependents: Vec<String>,
49}
50
51/// Project parser implementation
52pub struct ProjectParser {
53    parser: EnhancedParser,
54}
55
56impl ProjectParser {
57    pub fn new() -> Self {
58        Self {
59            parser: EnhancedParser::new(),
60        }
61    }
62
63    /// Parse an entire Python project from a root directory
64    pub fn parse_project(&self, root_path: &Path) -> Result<PythonProject> {
65        let mut modules = HashMap::new();
66
67        // Discover all Python files
68        let python_files = self.discover_python_files(root_path)?;
69
70        // Parse each file
71        for file_path in python_files {
72            let module_name = self.path_to_module_name(root_path, &file_path)?;
73            let source = std::fs::read_to_string(&file_path)?;
74
75            let ast = self.parser.parse(&source)?;
76            let imports = self.extract_imports(&ast);
77
78            modules.insert(
79                module_name.clone(),
80                PythonModule {
81                    name: module_name,
82                    path: file_path,
83                    ast,
84                    imports,
85                },
86            );
87        }
88
89        // Build dependency graph
90        let dependency_graph = self.build_dependency_graph(&modules)?;
91
92        Ok(PythonProject {
93            root_path: root_path.to_path_buf(),
94            modules,
95            dependency_graph,
96        })
97    }
98
99    /// Discover all Python files in a directory tree
100    fn discover_python_files(&self, root: &Path) -> Result<Vec<PathBuf>> {
101        let mut files = Vec::new();
102
103        if !root.exists() {
104            return Err(Error::Parse(format!("Path does not exist: {:?}", root)));
105        }
106
107        self.walk_directory(root, &mut files)?;
108
109        Ok(files)
110    }
111
112    fn walk_directory(&self, dir: &Path, files: &mut Vec<PathBuf>) -> Result<()> {
113        if !dir.is_dir() {
114            return Ok(());
115        }
116
117        let entries = std::fs::read_dir(dir)?;
118
119        for entry in entries {
120            let entry = entry?;
121            let path = entry.path();
122
123            if path.is_dir() {
124                // Skip common directories
125                if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
126                    if name.starts_with('.') || name == "__pycache__" || name == "venv" {
127                        continue;
128                    }
129                }
130                self.walk_directory(&path, files)?;
131            } else if path.extension().and_then(|s| s.to_str()) == Some("py") {
132                files.push(path);
133            }
134        }
135
136        Ok(())
137    }
138
139    /// Convert file path to module name
140    fn path_to_module_name(&self, root: &Path, file_path: &Path) -> Result<String> {
141        let relative = file_path
142            .strip_prefix(root)
143            .map_err(|_| Error::Parse("Invalid path".into()))?;
144
145        let mut parts: Vec<String> = relative
146            .iter()
147            .map(|s| s.to_string_lossy().to_string())
148            .collect();
149
150        // Remove .py extension from last part
151        if let Some(last) = parts.last_mut() {
152            if last.ends_with(".py") {
153                *last = last.trim_end_matches(".py").to_string();
154            }
155            // __init__.py becomes the parent module
156            if *last == "__init__" {
157                parts.pop();
158            }
159        }
160
161        Ok(parts.join("."))
162    }
163
164    /// Extract import statements from AST
165    fn extract_imports(&self, ast: &PythonAst) -> Vec<ImportStatement> {
166        ast.imports
167            .iter()
168            .map(|imp| ImportStatement {
169                module: imp.module.clone(),
170                items: imp.items.clone(),
171                alias: imp.alias.clone(),
172                is_relative: imp.module.starts_with('.'),
173            })
174            .collect()
175    }
176
177    /// Build dependency graph from modules
178    fn build_dependency_graph(
179        &self,
180        modules: &HashMap<String, PythonModule>,
181    ) -> Result<DependencyGraph> {
182        let mut nodes = HashMap::new();
183        let mut edges = Vec::new();
184
185        // Initialize nodes
186        for module_name in modules.keys() {
187            nodes.insert(
188                module_name.clone(),
189                ModuleNode {
190                    name: module_name.clone(),
191                    dependencies: Vec::new(),
192                    dependents: Vec::new(),
193                },
194            );
195        }
196
197        // Build edges
198        for (module_name, module) in modules {
199            for import in &module.imports {
200                // Only track internal dependencies
201                if modules.contains_key(&import.module) {
202                    edges.push((module_name.clone(), import.module.clone()));
203
204                    // Update dependency lists
205                    if let Some(node) = nodes.get_mut(module_name) {
206                        node.dependencies.push(import.module.clone());
207                    }
208                    if let Some(dep_node) = nodes.get_mut(&import.module) {
209                        dep_node.dependents.push(module_name.clone());
210                    }
211                }
212            }
213        }
214
215        Ok(DependencyGraph { nodes, edges })
216    }
217
218    /// Get modules in topological order (dependencies first)
219    pub fn topological_sort(&self, graph: &DependencyGraph) -> Result<Vec<String>> {
220        let mut result = Vec::new();
221        let mut visited = HashMap::new();
222        let mut temp_mark = HashMap::new();
223
224        for node_name in graph.nodes.keys() {
225            if !visited.contains_key(node_name) {
226                self.visit(
227                    node_name,
228                    &graph.nodes,
229                    &mut visited,
230                    &mut temp_mark,
231                    &mut result,
232                )?;
233            }
234        }
235
236        // DFS post-order already gives us the correct topological order (dependencies first)
237        Ok(result)
238    }
239
240    fn visit(
241        &self,
242        node: &str,
243        nodes: &HashMap<String, ModuleNode>,
244        visited: &mut HashMap<String, bool>,
245        temp_mark: &mut HashMap<String, bool>,
246        result: &mut Vec<String>,
247    ) -> Result<()> {
248        if temp_mark.get(node).copied().unwrap_or(false) {
249            return Err(Error::Parse(format!("Circular dependency detected: {}", node)));
250        }
251
252        if visited.get(node).copied().unwrap_or(false) {
253            return Ok(());
254        }
255
256        temp_mark.insert(node.to_string(), true);
257
258        if let Some(module_node) = nodes.get(node) {
259            for dep in &module_node.dependencies {
260                self.visit(dep, nodes, visited, temp_mark, result)?;
261            }
262        }
263
264        temp_mark.insert(node.to_string(), false);
265        visited.insert(node.to_string(), true);
266        result.push(node.to_string());
267
268        Ok(())
269    }
270}
271
272impl Default for ProjectParser {
273    fn default() -> Self {
274        Self::new()
275    }
276}
277
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a module fixture with an empty AST and the given absolute imports.
    fn module_fixture(name: &str, imported: &[&str]) -> PythonModule {
        PythonModule {
            name: name.to_string(),
            path: PathBuf::from(format!("{}.py", name)),
            ast: PythonAst {
                functions: vec![],
                classes: vec![],
                imports: vec![],
            },
            imports: imported
                .iter()
                .map(|module| ImportStatement {
                    module: (*module).to_string(),
                    items: vec![],
                    alias: None,
                    is_relative: false,
                })
                .collect(),
        }
    }

    /// Builds a graph node fixture from plain string slices.
    fn node_fixture(name: &str, dependencies: &[&str], dependents: &[&str]) -> ModuleNode {
        ModuleNode {
            name: name.to_string(),
            dependencies: dependencies.iter().map(|d| (*d).to_string()).collect(),
            dependents: dependents.iter().map(|d| (*d).to_string()).collect(),
        }
    }

    #[test]
    fn test_module_name_conversion() {
        let parser = ProjectParser::new();
        let root = Path::new("/project");
        let file = Path::new("/project/math/utils.py");

        let module_name = parser.path_to_module_name(root, file).unwrap();
        assert_eq!(module_name, "math.utils");
    }

    #[test]
    fn test_init_module_name() {
        let parser = ProjectParser::new();
        let root = Path::new("/project");
        let file = Path::new("/project/math/__init__.py");

        let module_name = parser.path_to_module_name(root, file).unwrap();
        assert_eq!(module_name, "math");
    }

    #[test]
    fn test_module_name_outside_root_is_rejected() {
        let parser = ProjectParser::new();
        let root = Path::new("/project");
        let file = Path::new("/elsewhere/utils.py");

        assert!(parser.path_to_module_name(root, file).is_err());
    }

    #[test]
    fn test_dependency_graph_simple() {
        let parser = ProjectParser::new();
        let mut modules = HashMap::new();

        // module_a imports module_b; module_b imports nothing.
        modules.insert(
            "module_a".to_string(),
            module_fixture("module_a", &["module_b"]),
        );
        modules.insert("module_b".to_string(), module_fixture("module_b", &[]));

        let graph = parser.build_dependency_graph(&modules).unwrap();

        assert_eq!(graph.nodes.len(), 2);
        assert_eq!(graph.edges.len(), 1);
        assert_eq!(graph.edges[0], ("module_a".to_string(), "module_b".to_string()));

        // Both ends of the edge are mirrored in the node lists.
        assert_eq!(
            graph.nodes["module_a"].dependencies,
            vec!["module_b".to_string()]
        );
        assert_eq!(
            graph.nodes["module_b"].dependents,
            vec!["module_a".to_string()]
        );
    }

    #[test]
    fn test_dependency_graph_ignores_external_imports() {
        let parser = ProjectParser::new();
        let mut modules = HashMap::new();
        modules.insert("module_a".to_string(), module_fixture("module_a", &["os"]));

        let graph = parser.build_dependency_graph(&modules).unwrap();

        assert_eq!(graph.nodes.len(), 1);
        assert!(graph.edges.is_empty());
        assert!(graph.nodes["module_a"].dependencies.is_empty());
    }

    #[test]
    fn test_topological_sort() {
        let parser = ProjectParser::new();

        // c depends on b, b depends on a.
        let mut nodes = HashMap::new();
        nodes.insert("a".to_string(), node_fixture("a", &[], &["b"]));
        nodes.insert("b".to_string(), node_fixture("b", &["a"], &["c"]));
        nodes.insert("c".to_string(), node_fixture("c", &["b"], &[]));

        let graph = DependencyGraph {
            nodes,
            edges: vec![
                ("b".to_string(), "a".to_string()),
                ("c".to_string(), "b".to_string()),
            ],
        };

        let sorted = parser.topological_sort(&graph).unwrap();
        assert_eq!(sorted.len(), 3);

        // a must come before b, and b before c.
        let a_pos = sorted.iter().position(|s| s == "a").unwrap();
        let b_pos = sorted.iter().position(|s| s == "b").unwrap();
        let c_pos = sorted.iter().position(|s| s == "c").unwrap();

        assert!(a_pos < b_pos);
        assert!(b_pos < c_pos);
    }

    #[test]
    fn test_topological_sort_detects_cycle() {
        let parser = ProjectParser::new();

        // a and b import each other.
        let mut nodes = HashMap::new();
        nodes.insert("a".to_string(), node_fixture("a", &["b"], &["b"]));
        nodes.insert("b".to_string(), node_fixture("b", &["a"], &["a"]));

        let graph = DependencyGraph {
            nodes,
            edges: vec![
                ("a".to_string(), "b".to_string()),
                ("b".to_string(), "a".to_string()),
            ],
        };

        assert!(parser.topological_sort(&graph).is_err());
    }

    #[test]
    fn test_parse_project_missing_root() {
        let parser = ProjectParser::new();
        let missing = Path::new("/definitely/not/a/real/portalis/root");

        assert!(parser.parse_project(missing).is_err());
    }
}