Skip to main content

agentic_codebase/parse/
parser.rs

1//! Main parser orchestrator.
2//!
3//! Delegates to language-specific parsers based on file extension.
4//! Collects files via the `ignore` crate (respects .gitignore),
5//! runs tree-sitter, and calls language extractors.
6
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::time::Instant;
10
11use crate::types::{AcbError, AcbResult, Language};
12
13use super::go::GoParser;
14use super::python::PythonParser;
15use super::rust::RustParser;
16use super::treesitter::parse_with_language;
17use super::typescript::TypeScriptParser;
18use super::{LanguageParser, ParseFileError, RawCodeUnit, Severity};
19
20/// Options controlling what and how to parse.
21#[derive(Debug, Clone)]
22pub struct ParseOptions {
23    /// Languages to include (empty = all supported).
24    pub languages: Vec<Language>,
25    /// Glob patterns to exclude.
26    pub exclude: Vec<String>,
27    /// Include test files.
28    pub include_tests: bool,
29    /// Maximum file size to parse (bytes).
30    pub max_file_size: usize,
31}
32
33impl Default for ParseOptions {
34    fn default() -> Self {
35        Self {
36            languages: vec![],
37            exclude: vec![
38                "**/node_modules/**".into(),
39                "**/target/**".into(),
40                "**/.git/**".into(),
41                "**/__pycache__/**".into(),
42                "**/venv/**".into(),
43                "**/.venv/**".into(),
44                "**/dist/**".into(),
45                "**/build/**".into(),
46            ],
47            include_tests: true,
48            max_file_size: 10 * 1024 * 1024, // 10MB
49        }
50    }
51}
52
53/// Result of parsing a directory or set of files.
54#[derive(Debug)]
55pub struct ParseResult {
56    /// All extracted code units.
57    pub units: Vec<RawCodeUnit>,
58    /// Errors and warnings encountered.
59    pub errors: Vec<ParseFileError>,
60    /// Aggregate statistics.
61    pub stats: ParseStats,
62}
63
64/// Aggregate statistics from a parse run.
65#[derive(Debug, Clone)]
66pub struct ParseStats {
67    /// Number of files successfully parsed.
68    pub files_parsed: usize,
69    /// Number of files skipped (excluded, too large, unknown lang).
70    pub files_skipped: usize,
71    /// Number of files that errored during parsing.
72    pub files_errored: usize,
73    /// Total source lines across all parsed files.
74    pub total_lines: usize,
75    /// Total parse time in milliseconds.
76    pub parse_time_ms: u64,
77    /// Files parsed per language.
78    pub by_language: HashMap<Language, usize>,
79}
80
81/// Main parser that orchestrates multi-language parsing.
82pub struct Parser {
83    /// Language-specific parsers, keyed by Language.
84    parsers: HashMap<Language, Box<dyn LanguageParser>>,
85}
86
87impl Parser {
88    /// Create a new parser with all supported language parsers.
89    pub fn new() -> Self {
90        let mut parsers: HashMap<Language, Box<dyn LanguageParser>> = HashMap::new();
91        parsers.insert(Language::Python, Box::new(PythonParser::new()));
92        parsers.insert(Language::Rust, Box::new(RustParser::new()));
93        parsers.insert(Language::TypeScript, Box::new(TypeScriptParser::new()));
94        parsers.insert(Language::JavaScript, Box::new(TypeScriptParser::new()));
95        parsers.insert(Language::Go, Box::new(GoParser::new()));
96        Self { parsers }
97    }
98
99    /// Parse a single file given its path and content.
100    pub fn parse_file(&self, path: &Path, content: &str) -> AcbResult<Vec<RawCodeUnit>> {
101        let lang = Language::from_path(path);
102        if lang == Language::Unknown {
103            return Err(AcbError::ParseError {
104                path: path.to_path_buf(),
105                message: "Unknown language".into(),
106            });
107        }
108
109        let parser = self
110            .parsers
111            .get(&lang)
112            .ok_or_else(|| AcbError::ParseError {
113                path: path.to_path_buf(),
114                message: format!("No parser for language: {}", lang),
115            })?;
116
117        // For TSX files, use the TSX language
118        let ts_lang = if matches!(
119            path.extension().and_then(|e| e.to_str()),
120            Some("tsx") | Some("jsx")
121        ) {
122            tree_sitter_typescript::language_tsx()
123        } else {
124            lang.tree_sitter_language()
125                .ok_or_else(|| AcbError::ParseError {
126                    path: path.to_path_buf(),
127                    message: format!("No tree-sitter grammar for: {}", lang),
128                })?
129        };
130
131        let tree = parse_with_language(content, ts_lang)?;
132        parser.extract_units(&tree, content, path)
133    }
134
135    /// Parse all matching files in a directory tree.
136    pub fn parse_directory(&self, root: &Path, options: &ParseOptions) -> AcbResult<ParseResult> {
137        let start = Instant::now();
138
139        let files = self.collect_files(root, options)?;
140
141        let mut all_units = Vec::new();
142        let mut all_errors = Vec::new();
143        let mut files_parsed = 0usize;
144        let mut files_skipped = 0usize;
145        let mut files_errored = 0usize;
146        let mut total_lines = 0usize;
147        let mut by_language: HashMap<Language, usize> = HashMap::new();
148
149        for file_path in &files {
150            let content = match std::fs::read_to_string(file_path) {
151                Ok(c) => c,
152                Err(e) => {
153                    all_errors.push(ParseFileError {
154                        path: file_path.clone(),
155                        span: None,
156                        message: format!("Could not read file: {}", e),
157                        severity: Severity::Error,
158                    });
159                    files_errored += 1;
160                    continue;
161                }
162            };
163
164            // Check file size
165            if content.len() > options.max_file_size {
166                files_skipped += 1;
167                continue;
168            }
169
170            let lang = Language::from_path(file_path);
171            if lang == Language::Unknown {
172                files_skipped += 1;
173                continue;
174            }
175
176            // Check test file filtering
177            if !options.include_tests {
178                if let Some(parser) = self.parsers.get(&lang) {
179                    if parser.is_test_file(file_path, &content) {
180                        files_skipped += 1;
181                        continue;
182                    }
183                }
184            }
185
186            match self.parse_file(file_path, &content) {
187                Ok(units) => {
188                    total_lines += content.lines().count();
189                    *by_language.entry(lang).or_insert(0) += 1;
190                    all_units.extend(units);
191                    files_parsed += 1;
192                }
193                Err(e) => {
194                    all_errors.push(ParseFileError {
195                        path: file_path.clone(),
196                        span: None,
197                        message: format!("{}", e),
198                        severity: Severity::Error,
199                    });
200                    files_errored += 1;
201                }
202            }
203        }
204
205        let elapsed = start.elapsed();
206
207        Ok(ParseResult {
208            units: all_units,
209            errors: all_errors,
210            stats: ParseStats {
211                files_parsed,
212                files_skipped,
213                files_errored,
214                total_lines,
215                parse_time_ms: elapsed.as_millis() as u64,
216                by_language,
217            },
218        })
219    }
220
221    /// Returns true if a file should be parsed based on language filters.
222    pub fn should_parse(&self, path: &Path) -> bool {
223        let lang = Language::from_path(path);
224        lang != Language::Unknown && self.parsers.contains_key(&lang)
225    }
226
227    /// Collect files to parse from a directory tree using the `ignore` crate.
228    fn collect_files(&self, root: &Path, options: &ParseOptions) -> AcbResult<Vec<PathBuf>> {
229        use ignore::WalkBuilder;
230
231        let mut files = Vec::new();
232
233        let walker = WalkBuilder::new(root).hidden(true).git_ignore(true).build();
234
235        for entry in walker {
236            let entry = match entry {
237                Ok(e) => e,
238                Err(_) => continue,
239            };
240            let path = entry.path();
241
242            if !path.is_file() {
243                continue;
244            }
245
246            let lang = Language::from_path(path);
247            if lang == Language::Unknown {
248                continue;
249            }
250
251            // Check language filter
252            if !options.languages.is_empty() && !options.languages.contains(&lang) {
253                continue;
254            }
255
256            // Check exclude patterns
257            if self.is_excluded(path, &options.exclude) {
258                continue;
259            }
260
261            files.push(path.to_path_buf());
262        }
263
264        Ok(files)
265    }
266
267    /// Check if a path matches any exclude patterns.
268    fn is_excluded(&self, path: &Path, excludes: &[String]) -> bool {
269        let path_str = path.to_string_lossy();
270        for pattern in excludes {
271            // Simple glob matching: check if any component matches
272            let pattern_str = pattern.replace("**", "");
273            let pattern_str = pattern_str.trim_matches('/');
274            if !pattern_str.is_empty() && path_str.contains(pattern_str) {
275                return true;
276            }
277        }
278        false
279    }
280}
281
282impl Default for Parser {
283    fn default() -> Self {
284        Self::new()
285    }
286}