Skip to main content

agentic_codebase/parse/
parser.rs

1//! Main parser orchestrator.
2//!
3//! Delegates to language-specific parsers based on file extension.
4//! Collects files via the `ignore` crate (respects .gitignore),
5//! runs tree-sitter, and calls language extractors.
6
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::time::Instant;
10
11use crate::types::{AcbError, AcbResult, Language};
12
13use super::cpp::CppParser;
14use super::csharp::CSharpParser;
15use super::go::GoParser;
16use super::java::JavaParser;
17use super::python::PythonParser;
18use super::rust::RustParser;
19use super::treesitter::parse_with_language;
20use super::typescript::TypeScriptParser;
21use super::{LanguageParser, ParseFileError, RawCodeUnit, Severity};
22
23/// Options controlling what and how to parse.
24#[derive(Debug, Clone)]
25pub struct ParseOptions {
26    /// Languages to include (empty = all supported).
27    pub languages: Vec<Language>,
28    /// Glob patterns to exclude.
29    pub exclude: Vec<String>,
30    /// Include test files.
31    pub include_tests: bool,
32    /// Maximum file size to parse (bytes).
33    pub max_file_size: usize,
34}
35
36impl Default for ParseOptions {
37    fn default() -> Self {
38        Self {
39            languages: vec![],
40            exclude: vec![
41                "**/node_modules/**".into(),
42                "**/target/**".into(),
43                "**/.git/**".into(),
44                "**/__pycache__/**".into(),
45                "**/venv/**".into(),
46                "**/.venv/**".into(),
47                "**/dist/**".into(),
48                "**/build/**".into(),
49            ],
50            include_tests: true,
51            max_file_size: 10 * 1024 * 1024, // 10MB
52        }
53    }
54}
55
56/// Result of parsing a directory or set of files.
57#[derive(Debug)]
58pub struct ParseResult {
59    /// All extracted code units.
60    pub units: Vec<RawCodeUnit>,
61    /// Errors and warnings encountered.
62    pub errors: Vec<ParseFileError>,
63    /// Aggregate statistics.
64    pub stats: ParseStats,
65}
66
67/// Aggregate statistics from a parse run.
68#[derive(Debug, Clone)]
69pub struct ParseStats {
70    /// Number of files successfully parsed.
71    pub files_parsed: usize,
72    /// Number of files skipped (excluded, too large, unknown lang).
73    pub files_skipped: usize,
74    /// Number of files that errored during parsing.
75    pub files_errored: usize,
76    /// Total source lines across all parsed files.
77    pub total_lines: usize,
78    /// Total parse time in milliseconds.
79    pub parse_time_ms: u64,
80    /// Files parsed per language.
81    pub by_language: HashMap<Language, usize>,
82    /// Detailed ingestion/skip accounting for auditability.
83    pub coverage: ParseCoverageStats,
84}
85
/// Fine-grained counters describing what happened to every file the walker
/// produced: how many were seen, kept, skipped (and why), or failed.
#[derive(Clone, Debug, Default)]
pub struct ParseCoverageStats {
    /// Regular files yielded by the filesystem walker.
    pub files_seen: usize,
    /// Files that survived filtering and were handed to the parse loop.
    pub files_candidate: usize,
    /// Files skipped because no language could be determined for them.
    pub skipped_unknown_language: usize,
    /// Files skipped because their language was not in the requested set.
    pub skipped_language_filter: usize,
    /// Files skipped because their path matched a configured exclude pattern.
    pub skipped_excluded_pattern: usize,
    /// Files skipped because they were larger than the configured limit.
    pub skipped_too_large: usize,
    /// Files skipped because test files were disabled for the run.
    pub skipped_test_file: usize,
    /// Files that could not be read from disk.
    pub read_errors: usize,
    /// Files for which the parser or extractor returned an error.
    pub parse_errors: usize,
    /// Count of unsupported files per lower-cased extension
    /// (e.g. {"xml": 42, "txt": 10}).
    pub unsupported_extensions: HashMap<String, usize>,
}

impl ParseCoverageStats {
    /// Sum of the five `skipped_*` counters.
    ///
    /// Read and parse errors are not skips and are deliberately left out.
    pub fn total_skipped(&self) -> usize {
        [
            self.skipped_unknown_language,
            self.skipped_language_filter,
            self.skipped_excluded_pattern,
            self.skipped_too_large,
            self.skipped_test_file,
        ]
        .iter()
        .sum()
    }
}
121
122struct CollectFilesResult {
123    files: Vec<PathBuf>,
124    coverage: ParseCoverageStats,
125}
126
127/// Main parser that orchestrates multi-language parsing.
128pub struct Parser {
129    /// Language-specific parsers, keyed by Language.
130    parsers: HashMap<Language, Box<dyn LanguageParser>>,
131}
132
133impl Parser {
134    /// Create a new parser with all supported language parsers.
135    pub fn new() -> Self {
136        let mut parsers: HashMap<Language, Box<dyn LanguageParser>> = HashMap::new();
137        parsers.insert(Language::Python, Box::new(PythonParser::new()));
138        parsers.insert(Language::Rust, Box::new(RustParser::new()));
139        parsers.insert(Language::TypeScript, Box::new(TypeScriptParser::new()));
140        parsers.insert(Language::JavaScript, Box::new(TypeScriptParser::new()));
141        parsers.insert(Language::Go, Box::new(GoParser::new()));
142        parsers.insert(Language::Cpp, Box::new(CppParser::new()));
143        parsers.insert(Language::Java, Box::new(JavaParser::new()));
144        parsers.insert(Language::CSharp, Box::new(CSharpParser::new()));
145        Self { parsers }
146    }
147
148    /// Parse a single file given its path and content.
149    pub fn parse_file(&self, path: &Path, content: &str) -> AcbResult<Vec<RawCodeUnit>> {
150        let lang = Language::from_path(path);
151        if lang == Language::Unknown {
152            return Err(AcbError::ParseError {
153                path: path.to_path_buf(),
154                message: "Unknown language".into(),
155            });
156        }
157
158        let parser = self
159            .parsers
160            .get(&lang)
161            .ok_or_else(|| AcbError::ParseError {
162                path: path.to_path_buf(),
163                message: format!("No parser for language: {}", lang),
164            })?;
165
166        // For TSX files, use the TSX language
167        let ts_lang = if matches!(
168            path.extension().and_then(|e| e.to_str()),
169            Some("tsx") | Some("jsx")
170        ) {
171            tree_sitter_typescript::language_tsx()
172        } else {
173            lang.tree_sitter_language()
174                .ok_or_else(|| AcbError::ParseError {
175                    path: path.to_path_buf(),
176                    message: format!("No tree-sitter grammar for: {}", lang),
177                })?
178        };
179
180        let tree = parse_with_language(content, ts_lang)?;
181        parser.extract_units(&tree, content, path)
182    }
183
184    /// Parse all matching files in a directory tree.
185    pub fn parse_directory(&self, root: &Path, options: &ParseOptions) -> AcbResult<ParseResult> {
186        let start = Instant::now();
187
188        let collected = self.collect_files(root, options)?;
189        let files = collected.files;
190
191        let mut all_units = Vec::new();
192        let mut all_errors = Vec::new();
193        let mut files_parsed = 0usize;
194        let mut files_errored = 0usize;
195        let mut total_lines = 0usize;
196        let mut by_language: HashMap<Language, usize> = HashMap::new();
197        let mut coverage = collected.coverage;
198
199        for file_path in &files {
200            let content = match std::fs::read_to_string(file_path) {
201                Ok(c) => c,
202                Err(e) => {
203                    all_errors.push(ParseFileError {
204                        path: file_path.clone(),
205                        span: None,
206                        message: format!("Could not read file: {}", e),
207                        severity: Severity::Error,
208                    });
209                    files_errored += 1;
210                    coverage.read_errors += 1;
211                    continue;
212                }
213            };
214
215            // Check file size
216            if content.len() > options.max_file_size {
217                coverage.skipped_too_large += 1;
218                continue;
219            }
220
221            let lang = Language::from_path(file_path);
222            if lang == Language::Unknown {
223                coverage.skipped_unknown_language += 1;
224                continue;
225            }
226
227            // Check test file filtering
228            if !options.include_tests {
229                if let Some(parser) = self.parsers.get(&lang) {
230                    if parser.is_test_file(file_path, &content) {
231                        coverage.skipped_test_file += 1;
232                        continue;
233                    }
234                }
235            }
236
237            match self.parse_file(file_path, &content) {
238                Ok(units) => {
239                    total_lines += content.lines().count();
240                    *by_language.entry(lang).or_insert(0) += 1;
241                    all_units.extend(units);
242                    files_parsed += 1;
243                }
244                Err(e) => {
245                    all_errors.push(ParseFileError {
246                        path: file_path.clone(),
247                        span: None,
248                        message: format!("{}", e),
249                        severity: Severity::Error,
250                    });
251                    files_errored += 1;
252                    coverage.parse_errors += 1;
253                }
254            }
255        }
256
257        let elapsed = start.elapsed();
258        let files_skipped = coverage.total_skipped();
259
260        Ok(ParseResult {
261            units: all_units,
262            errors: all_errors,
263            stats: ParseStats {
264                files_parsed,
265                files_skipped,
266                files_errored,
267                total_lines,
268                parse_time_ms: elapsed.as_millis() as u64,
269                by_language,
270                coverage,
271            },
272        })
273    }
274
275    /// Returns true if a file should be parsed based on language filters.
276    pub fn should_parse(&self, path: &Path) -> bool {
277        let lang = Language::from_path(path);
278        lang != Language::Unknown && self.parsers.contains_key(&lang)
279    }
280
281    /// Collect files to parse from a directory tree using the `ignore` crate.
282    fn collect_files(&self, root: &Path, options: &ParseOptions) -> AcbResult<CollectFilesResult> {
283        use ignore::WalkBuilder;
284
285        let mut files = Vec::new();
286        let mut coverage = ParseCoverageStats::default();
287
288        let walker = WalkBuilder::new(root).hidden(true).git_ignore(true).build();
289
290        for entry in walker {
291            let entry = match entry {
292                Ok(e) => e,
293                Err(_) => continue,
294            };
295            let path = entry.path();
296
297            if !path.is_file() {
298                continue;
299            }
300            coverage.files_seen += 1;
301
302            let lang = Language::from_path(path);
303            if lang == Language::Unknown {
304                coverage.skipped_unknown_language += 1;
305                if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
306                    *coverage
307                        .unsupported_extensions
308                        .entry(ext.to_lowercase())
309                        .or_insert(0) += 1;
310                }
311                continue;
312            }
313
314            // Check language filter
315            if !options.languages.is_empty() && !options.languages.contains(&lang) {
316                coverage.skipped_language_filter += 1;
317                continue;
318            }
319
320            // Check exclude patterns
321            if self.is_excluded(path, &options.exclude) {
322                coverage.skipped_excluded_pattern += 1;
323                continue;
324            }
325
326            files.push(path.to_path_buf());
327        }
328        coverage.files_candidate = files.len();
329
330        Ok(CollectFilesResult { files, coverage })
331    }
332
333    /// Check if a path matches any exclude patterns.
334    fn is_excluded(&self, path: &Path, excludes: &[String]) -> bool {
335        let path_str = path.to_string_lossy();
336        for pattern in excludes {
337            // Simple glob matching: check if any component matches
338            let pattern_str = pattern.replace("**", "");
339            let pattern_str = pattern_str.trim_matches('/');
340            if !pattern_str.is_empty() && path_str.contains(pattern_str) {
341                return true;
342            }
343        }
344        false
345    }
346}
347
348impl Default for Parser {
349    fn default() -> Self {
350        Self::new()
351    }
352}