Skip to main content

agentic_codebase/parse/
parser.rs

1//! Main parser orchestrator.
2//!
3//! Delegates to language-specific parsers based on file extension.
4//! Collects files via the `ignore` crate (respects .gitignore),
5//! runs tree-sitter, and calls language extractors.
6
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::time::Instant;
10
11use crate::types::{AcbError, AcbResult, Language};
12
13use super::cpp::CppParser;
14use super::go::GoParser;
15use super::python::PythonParser;
16use super::rust::RustParser;
17use super::treesitter::parse_with_language;
18use super::typescript::TypeScriptParser;
19use super::{LanguageParser, ParseFileError, RawCodeUnit, Severity};
20
21/// Options controlling what and how to parse.
22#[derive(Debug, Clone)]
23pub struct ParseOptions {
24    /// Languages to include (empty = all supported).
25    pub languages: Vec<Language>,
26    /// Glob patterns to exclude.
27    pub exclude: Vec<String>,
28    /// Include test files.
29    pub include_tests: bool,
30    /// Maximum file size to parse (bytes).
31    pub max_file_size: usize,
32}
33
34impl Default for ParseOptions {
35    fn default() -> Self {
36        Self {
37            languages: vec![],
38            exclude: vec![
39                "**/node_modules/**".into(),
40                "**/target/**".into(),
41                "**/.git/**".into(),
42                "**/__pycache__/**".into(),
43                "**/venv/**".into(),
44                "**/.venv/**".into(),
45                "**/dist/**".into(),
46                "**/build/**".into(),
47            ],
48            include_tests: true,
49            max_file_size: 10 * 1024 * 1024, // 10MB
50        }
51    }
52}
53
54/// Result of parsing a directory or set of files.
55#[derive(Debug)]
56pub struct ParseResult {
57    /// All extracted code units.
58    pub units: Vec<RawCodeUnit>,
59    /// Errors and warnings encountered.
60    pub errors: Vec<ParseFileError>,
61    /// Aggregate statistics.
62    pub stats: ParseStats,
63}
64
65/// Aggregate statistics from a parse run.
66#[derive(Debug, Clone)]
67pub struct ParseStats {
68    /// Number of files successfully parsed.
69    pub files_parsed: usize,
70    /// Number of files skipped (excluded, too large, unknown lang).
71    pub files_skipped: usize,
72    /// Number of files that errored during parsing.
73    pub files_errored: usize,
74    /// Total source lines across all parsed files.
75    pub total_lines: usize,
76    /// Total parse time in milliseconds.
77    pub parse_time_ms: u64,
78    /// Files parsed per language.
79    pub by_language: HashMap<Language, usize>,
80    /// Detailed ingestion/skip accounting for auditability.
81    pub coverage: ParseCoverageStats,
82}
83
/// Per-reason counters recording what happened to every file the walker saw.
#[derive(Clone, Debug, Default)]
pub struct ParseCoverageStats {
    /// Files encountered by the filesystem walker.
    pub files_seen: usize,
    /// Files that survived filtering and were handed to the parser.
    pub files_candidate: usize,
    /// Skipped: no language could be resolved for the file.
    pub skipped_unknown_language: usize,
    /// Skipped: the language was not in the explicit language filter.
    pub skipped_language_filter: usize,
    /// Skipped: the path matched a configured exclude pattern.
    pub skipped_excluded_pattern: usize,
    /// Skipped: the file was larger than the configured size limit.
    pub skipped_too_large: usize,
    /// Skipped: the file is a test file and tests were disabled.
    pub skipped_test_file: usize,
    /// Files that could not be read from disk.
    pub read_errors: usize,
    /// Files whose parsing or unit extraction failed.
    pub parse_errors: usize,
}
106
107impl ParseCoverageStats {
108    /// Total number of files skipped for known reasons.
109    pub fn total_skipped(&self) -> usize {
110        self.skipped_unknown_language
111            + self.skipped_language_filter
112            + self.skipped_excluded_pattern
113            + self.skipped_too_large
114            + self.skipped_test_file
115    }
116}
117
118struct CollectFilesResult {
119    files: Vec<PathBuf>,
120    coverage: ParseCoverageStats,
121}
122
123/// Main parser that orchestrates multi-language parsing.
124pub struct Parser {
125    /// Language-specific parsers, keyed by Language.
126    parsers: HashMap<Language, Box<dyn LanguageParser>>,
127}
128
129impl Parser {
130    /// Create a new parser with all supported language parsers.
131    pub fn new() -> Self {
132        let mut parsers: HashMap<Language, Box<dyn LanguageParser>> = HashMap::new();
133        parsers.insert(Language::Python, Box::new(PythonParser::new()));
134        parsers.insert(Language::Rust, Box::new(RustParser::new()));
135        parsers.insert(Language::TypeScript, Box::new(TypeScriptParser::new()));
136        parsers.insert(Language::JavaScript, Box::new(TypeScriptParser::new()));
137        parsers.insert(Language::Go, Box::new(GoParser::new()));
138        parsers.insert(Language::Cpp, Box::new(CppParser::new()));
139        Self { parsers }
140    }
141
142    /// Parse a single file given its path and content.
143    pub fn parse_file(&self, path: &Path, content: &str) -> AcbResult<Vec<RawCodeUnit>> {
144        let lang = Language::from_path(path);
145        if lang == Language::Unknown {
146            return Err(AcbError::ParseError {
147                path: path.to_path_buf(),
148                message: "Unknown language".into(),
149            });
150        }
151
152        let parser = self
153            .parsers
154            .get(&lang)
155            .ok_or_else(|| AcbError::ParseError {
156                path: path.to_path_buf(),
157                message: format!("No parser for language: {}", lang),
158            })?;
159
160        // For TSX files, use the TSX language
161        let ts_lang = if matches!(
162            path.extension().and_then(|e| e.to_str()),
163            Some("tsx") | Some("jsx")
164        ) {
165            tree_sitter_typescript::language_tsx()
166        } else {
167            lang.tree_sitter_language()
168                .ok_or_else(|| AcbError::ParseError {
169                    path: path.to_path_buf(),
170                    message: format!("No tree-sitter grammar for: {}", lang),
171                })?
172        };
173
174        let tree = parse_with_language(content, ts_lang)?;
175        parser.extract_units(&tree, content, path)
176    }
177
178    /// Parse all matching files in a directory tree.
179    pub fn parse_directory(&self, root: &Path, options: &ParseOptions) -> AcbResult<ParseResult> {
180        let start = Instant::now();
181
182        let collected = self.collect_files(root, options)?;
183        let files = collected.files;
184
185        let mut all_units = Vec::new();
186        let mut all_errors = Vec::new();
187        let mut files_parsed = 0usize;
188        let mut files_errored = 0usize;
189        let mut total_lines = 0usize;
190        let mut by_language: HashMap<Language, usize> = HashMap::new();
191        let mut coverage = collected.coverage;
192
193        for file_path in &files {
194            let content = match std::fs::read_to_string(file_path) {
195                Ok(c) => c,
196                Err(e) => {
197                    all_errors.push(ParseFileError {
198                        path: file_path.clone(),
199                        span: None,
200                        message: format!("Could not read file: {}", e),
201                        severity: Severity::Error,
202                    });
203                    files_errored += 1;
204                    coverage.read_errors += 1;
205                    continue;
206                }
207            };
208
209            // Check file size
210            if content.len() > options.max_file_size {
211                coverage.skipped_too_large += 1;
212                continue;
213            }
214
215            let lang = Language::from_path(file_path);
216            if lang == Language::Unknown {
217                coverage.skipped_unknown_language += 1;
218                continue;
219            }
220
221            // Check test file filtering
222            if !options.include_tests {
223                if let Some(parser) = self.parsers.get(&lang) {
224                    if parser.is_test_file(file_path, &content) {
225                        coverage.skipped_test_file += 1;
226                        continue;
227                    }
228                }
229            }
230
231            match self.parse_file(file_path, &content) {
232                Ok(units) => {
233                    total_lines += content.lines().count();
234                    *by_language.entry(lang).or_insert(0) += 1;
235                    all_units.extend(units);
236                    files_parsed += 1;
237                }
238                Err(e) => {
239                    all_errors.push(ParseFileError {
240                        path: file_path.clone(),
241                        span: None,
242                        message: format!("{}", e),
243                        severity: Severity::Error,
244                    });
245                    files_errored += 1;
246                    coverage.parse_errors += 1;
247                }
248            }
249        }
250
251        let elapsed = start.elapsed();
252        let files_skipped = coverage.total_skipped();
253
254        Ok(ParseResult {
255            units: all_units,
256            errors: all_errors,
257            stats: ParseStats {
258                files_parsed,
259                files_skipped,
260                files_errored,
261                total_lines,
262                parse_time_ms: elapsed.as_millis() as u64,
263                by_language,
264                coverage,
265            },
266        })
267    }
268
269    /// Returns true if a file should be parsed based on language filters.
270    pub fn should_parse(&self, path: &Path) -> bool {
271        let lang = Language::from_path(path);
272        lang != Language::Unknown && self.parsers.contains_key(&lang)
273    }
274
275    /// Collect files to parse from a directory tree using the `ignore` crate.
276    fn collect_files(&self, root: &Path, options: &ParseOptions) -> AcbResult<CollectFilesResult> {
277        use ignore::WalkBuilder;
278
279        let mut files = Vec::new();
280        let mut coverage = ParseCoverageStats::default();
281
282        let walker = WalkBuilder::new(root).hidden(true).git_ignore(true).build();
283
284        for entry in walker {
285            let entry = match entry {
286                Ok(e) => e,
287                Err(_) => continue,
288            };
289            let path = entry.path();
290
291            if !path.is_file() {
292                continue;
293            }
294            coverage.files_seen += 1;
295
296            let lang = Language::from_path(path);
297            if lang == Language::Unknown {
298                coverage.skipped_unknown_language += 1;
299                continue;
300            }
301
302            // Check language filter
303            if !options.languages.is_empty() && !options.languages.contains(&lang) {
304                coverage.skipped_language_filter += 1;
305                continue;
306            }
307
308            // Check exclude patterns
309            if self.is_excluded(path, &options.exclude) {
310                coverage.skipped_excluded_pattern += 1;
311                continue;
312            }
313
314            files.push(path.to_path_buf());
315        }
316        coverage.files_candidate = files.len();
317
318        Ok(CollectFilesResult { files, coverage })
319    }
320
321    /// Check if a path matches any exclude patterns.
322    fn is_excluded(&self, path: &Path, excludes: &[String]) -> bool {
323        let path_str = path.to_string_lossy();
324        for pattern in excludes {
325            // Simple glob matching: check if any component matches
326            let pattern_str = pattern.replace("**", "");
327            let pattern_str = pattern_str.trim_matches('/');
328            if !pattern_str.is_empty() && path_str.contains(pattern_str) {
329                return true;
330            }
331        }
332        false
333    }
334}
335
336impl Default for Parser {
337    fn default() -> Self {
338        Self::new()
339    }
340}