Skip to main content

agentic_codebase/parse/
parser.rs

1//! Main parser orchestrator.
2//!
3//! Delegates to language-specific parsers based on file extension.
4//! Collects files via the `ignore` crate (respects .gitignore),
5//! runs tree-sitter, and calls language extractors.
6
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::time::Instant;
10
11use crate::types::{AcbError, AcbResult, Language};
12
13use super::go::GoParser;
14use super::python::PythonParser;
15use super::rust::RustParser;
16use super::treesitter::parse_with_language;
17use super::typescript::TypeScriptParser;
18use super::{LanguageParser, ParseFileError, RawCodeUnit, Severity};
19
/// Options controlling what and how to parse.
///
/// Passed by reference to [`Parser::parse_directory`]; the [`Default`]
/// implementation supplies a ready-to-use configuration.
#[derive(Debug, Clone)]
pub struct ParseOptions {
    /// Languages to include (empty = all supported).
    pub languages: Vec<Language>,
    /// Glob patterns to exclude. A file whose path matches any pattern is
    /// not parsed and is counted as skipped by exclude pattern.
    pub exclude: Vec<String>,
    /// Include test files. When `false`, files that the language parser's
    /// `is_test_file` check identifies as tests are skipped.
    pub include_tests: bool,
    /// Maximum file size to parse (bytes). Larger files are skipped and
    /// counted as too large rather than reported as errors.
    pub max_file_size: usize,
}
32
33impl Default for ParseOptions {
34    fn default() -> Self {
35        Self {
36            languages: vec![],
37            exclude: vec![
38                "**/node_modules/**".into(),
39                "**/target/**".into(),
40                "**/.git/**".into(),
41                "**/__pycache__/**".into(),
42                "**/venv/**".into(),
43                "**/.venv/**".into(),
44                "**/dist/**".into(),
45                "**/build/**".into(),
46            ],
47            include_tests: true,
48            max_file_size: 10 * 1024 * 1024, // 10MB
49        }
50    }
51}
52
/// Result of parsing a directory or set of files.
///
/// Per-file failures do not abort a run; they are collected in `errors`
/// alongside whatever `units` were successfully extracted.
#[derive(Debug)]
pub struct ParseResult {
    /// All extracted code units, accumulated across every parsed file.
    pub units: Vec<RawCodeUnit>,
    /// Errors and warnings encountered (one entry per failing file).
    pub errors: Vec<ParseFileError>,
    /// Aggregate statistics for the run.
    pub stats: ParseStats,
}
63
/// Aggregate statistics from a parse run.
#[derive(Debug, Clone)]
pub struct ParseStats {
    /// Number of files successfully parsed.
    pub files_parsed: usize,
    /// Number of files skipped for any recorded reason (unknown language,
    /// language filter, exclude pattern, too large, test file). Populated
    /// from `coverage.total_skipped()`.
    pub files_skipped: usize,
    /// Number of files that errored, either while being read from disk or
    /// during parsing/extraction.
    pub files_errored: usize,
    /// Total source lines across all successfully parsed files.
    pub total_lines: usize,
    /// Total parse time in milliseconds (wall clock, including the file
    /// collection walk).
    pub parse_time_ms: u64,
    /// Files successfully parsed per language.
    pub by_language: HashMap<Language, usize>,
    /// Detailed ingestion/skip accounting for auditability.
    pub coverage: ParseCoverageStats,
}
82
/// Fine-grained ingestion counters: how many files were encountered, how
/// many were dropped and for which reason, and how many failed.
#[derive(Debug, Clone, Default)]
pub struct ParseCoverageStats {
    /// Regular files yielded by the directory walker, before any filtering.
    pub files_seen: usize,
    /// Files that survived the walker-side filters and were queued for parsing.
    pub files_candidate: usize,
    /// Skipped: no language could be resolved for the path.
    pub skipped_unknown_language: usize,
    /// Skipped: language not in the caller's explicit language filter.
    pub skipped_language_filter: usize,
    /// Skipped: path matched one of the configured exclude patterns.
    pub skipped_excluded_pattern: usize,
    /// Skipped: file exceeded the configured size limit.
    pub skipped_too_large: usize,
    /// Skipped: identified as a test file while test files were disabled.
    pub skipped_test_file: usize,
    /// Files that could not be read from disk.
    pub read_errors: usize,
    /// Files whose parser/extractor run returned an error.
    pub parse_errors: usize,
}

impl ParseCoverageStats {
    /// Sum of all `skipped_*` counters.
    ///
    /// Error counters (`read_errors`, `parse_errors`) are deliberately not
    /// part of this total: an errored file was attempted, not skipped.
    pub fn total_skipped(&self) -> usize {
        [
            self.skipped_unknown_language,
            self.skipped_language_filter,
            self.skipped_excluded_pattern,
            self.skipped_too_large,
            self.skipped_test_file,
        ]
        .iter()
        .sum()
    }
}
116
/// Outcome of the file-collection walk performed by `Parser::collect_files`.
struct CollectFilesResult {
    /// Paths that passed every walker-side filter and should be parsed.
    files: Vec<PathBuf>,
    /// Counters accumulated while walking and filtering.
    coverage: ParseCoverageStats,
}
121
/// Main parser that orchestrates multi-language parsing.
///
/// Holds one boxed [`LanguageParser`] per supported [`Language`] and
/// dispatches to it based on the language resolved from a file's path.
pub struct Parser {
    /// Language-specific parsers, keyed by Language.
    parsers: HashMap<Language, Box<dyn LanguageParser>>,
}
127
128impl Parser {
129    /// Create a new parser with all supported language parsers.
130    pub fn new() -> Self {
131        let mut parsers: HashMap<Language, Box<dyn LanguageParser>> = HashMap::new();
132        parsers.insert(Language::Python, Box::new(PythonParser::new()));
133        parsers.insert(Language::Rust, Box::new(RustParser::new()));
134        parsers.insert(Language::TypeScript, Box::new(TypeScriptParser::new()));
135        parsers.insert(Language::JavaScript, Box::new(TypeScriptParser::new()));
136        parsers.insert(Language::Go, Box::new(GoParser::new()));
137        Self { parsers }
138    }
139
140    /// Parse a single file given its path and content.
141    pub fn parse_file(&self, path: &Path, content: &str) -> AcbResult<Vec<RawCodeUnit>> {
142        let lang = Language::from_path(path);
143        if lang == Language::Unknown {
144            return Err(AcbError::ParseError {
145                path: path.to_path_buf(),
146                message: "Unknown language".into(),
147            });
148        }
149
150        let parser = self
151            .parsers
152            .get(&lang)
153            .ok_or_else(|| AcbError::ParseError {
154                path: path.to_path_buf(),
155                message: format!("No parser for language: {}", lang),
156            })?;
157
158        // For TSX files, use the TSX language
159        let ts_lang = if matches!(
160            path.extension().and_then(|e| e.to_str()),
161            Some("tsx") | Some("jsx")
162        ) {
163            tree_sitter_typescript::language_tsx()
164        } else {
165            lang.tree_sitter_language()
166                .ok_or_else(|| AcbError::ParseError {
167                    path: path.to_path_buf(),
168                    message: format!("No tree-sitter grammar for: {}", lang),
169                })?
170        };
171
172        let tree = parse_with_language(content, ts_lang)?;
173        parser.extract_units(&tree, content, path)
174    }
175
176    /// Parse all matching files in a directory tree.
177    pub fn parse_directory(&self, root: &Path, options: &ParseOptions) -> AcbResult<ParseResult> {
178        let start = Instant::now();
179
180        let collected = self.collect_files(root, options)?;
181        let files = collected.files;
182
183        let mut all_units = Vec::new();
184        let mut all_errors = Vec::new();
185        let mut files_parsed = 0usize;
186        let mut files_errored = 0usize;
187        let mut total_lines = 0usize;
188        let mut by_language: HashMap<Language, usize> = HashMap::new();
189        let mut coverage = collected.coverage;
190
191        for file_path in &files {
192            let content = match std::fs::read_to_string(file_path) {
193                Ok(c) => c,
194                Err(e) => {
195                    all_errors.push(ParseFileError {
196                        path: file_path.clone(),
197                        span: None,
198                        message: format!("Could not read file: {}", e),
199                        severity: Severity::Error,
200                    });
201                    files_errored += 1;
202                    coverage.read_errors += 1;
203                    continue;
204                }
205            };
206
207            // Check file size
208            if content.len() > options.max_file_size {
209                coverage.skipped_too_large += 1;
210                continue;
211            }
212
213            let lang = Language::from_path(file_path);
214            if lang == Language::Unknown {
215                coverage.skipped_unknown_language += 1;
216                continue;
217            }
218
219            // Check test file filtering
220            if !options.include_tests {
221                if let Some(parser) = self.parsers.get(&lang) {
222                    if parser.is_test_file(file_path, &content) {
223                        coverage.skipped_test_file += 1;
224                        continue;
225                    }
226                }
227            }
228
229            match self.parse_file(file_path, &content) {
230                Ok(units) => {
231                    total_lines += content.lines().count();
232                    *by_language.entry(lang).or_insert(0) += 1;
233                    all_units.extend(units);
234                    files_parsed += 1;
235                }
236                Err(e) => {
237                    all_errors.push(ParseFileError {
238                        path: file_path.clone(),
239                        span: None,
240                        message: format!("{}", e),
241                        severity: Severity::Error,
242                    });
243                    files_errored += 1;
244                    coverage.parse_errors += 1;
245                }
246            }
247        }
248
249        let elapsed = start.elapsed();
250        let files_skipped = coverage.total_skipped();
251
252        Ok(ParseResult {
253            units: all_units,
254            errors: all_errors,
255            stats: ParseStats {
256                files_parsed,
257                files_skipped,
258                files_errored,
259                total_lines,
260                parse_time_ms: elapsed.as_millis() as u64,
261                by_language,
262                coverage,
263            },
264        })
265    }
266
267    /// Returns true if a file should be parsed based on language filters.
268    pub fn should_parse(&self, path: &Path) -> bool {
269        let lang = Language::from_path(path);
270        lang != Language::Unknown && self.parsers.contains_key(&lang)
271    }
272
273    /// Collect files to parse from a directory tree using the `ignore` crate.
274    fn collect_files(&self, root: &Path, options: &ParseOptions) -> AcbResult<CollectFilesResult> {
275        use ignore::WalkBuilder;
276
277        let mut files = Vec::new();
278        let mut coverage = ParseCoverageStats::default();
279
280        let walker = WalkBuilder::new(root).hidden(true).git_ignore(true).build();
281
282        for entry in walker {
283            let entry = match entry {
284                Ok(e) => e,
285                Err(_) => continue,
286            };
287            let path = entry.path();
288
289            if !path.is_file() {
290                continue;
291            }
292            coverage.files_seen += 1;
293
294            let lang = Language::from_path(path);
295            if lang == Language::Unknown {
296                coverage.skipped_unknown_language += 1;
297                continue;
298            }
299
300            // Check language filter
301            if !options.languages.is_empty() && !options.languages.contains(&lang) {
302                coverage.skipped_language_filter += 1;
303                continue;
304            }
305
306            // Check exclude patterns
307            if self.is_excluded(path, &options.exclude) {
308                coverage.skipped_excluded_pattern += 1;
309                continue;
310            }
311
312            files.push(path.to_path_buf());
313        }
314        coverage.files_candidate = files.len();
315
316        Ok(CollectFilesResult { files, coverage })
317    }
318
319    /// Check if a path matches any exclude patterns.
320    fn is_excluded(&self, path: &Path, excludes: &[String]) -> bool {
321        let path_str = path.to_string_lossy();
322        for pattern in excludes {
323            // Simple glob matching: check if any component matches
324            let pattern_str = pattern.replace("**", "");
325            let pattern_str = pattern_str.trim_matches('/');
326            if !pattern_str.is_empty() && path_str.contains(pattern_str) {
327                return true;
328            }
329        }
330        false
331    }
332}
333
334impl Default for Parser {
335    fn default() -> Self {
336        Self::new()
337    }
338}