Skip to main content

rma_parser/
lib.rs

//! Tree-sitter-based polyglot parser for the Rust Monorepo Analyzer.
//!
//! This crate provides high-performance parallel parsing of source code
//! using tree-sitter grammars for multiple languages.
5
6pub mod languages;
7pub mod walker;
8
9use anyhow::{Context, Result};
10use rayon::prelude::*;
11use rma_common::{Language, RmaConfig, RmaError, SourceLocation};
12use serde::{Deserialize, Serialize};
13use std::path::{Path, PathBuf};
14use std::sync::Arc;
15use tracing::{debug, info, instrument, warn};
16use tree_sitter::{Node, Parser, Tree};
17
18/// A parsed source file with its AST
19#[derive(Debug)]
20pub struct ParsedFile {
21    pub path: PathBuf,
22    pub language: Language,
23    pub content: String,
24    pub tree: Tree,
25    pub parse_errors: Vec<ParseError>,
26}
27
28/// A parsing error
29#[derive(Debug, Clone, Serialize, Deserialize)]
30pub struct ParseError {
31    pub location: SourceLocation,
32    pub message: String,
33}
34
35/// Statistics from a parsing operation
36#[derive(Debug, Clone, Default, Serialize, Deserialize)]
37pub struct ParseStats {
38    pub files_parsed: usize,
39    pub files_failed: usize,
40    pub files_skipped: usize,
41    pub total_bytes: usize,
42    pub parse_errors: usize,
43}
44
45/// The main parser engine
46pub struct ParserEngine {
47    config: Arc<RmaConfig>,
48}
49
50impl ParserEngine {
51    /// Create a new parser engine with the given configuration
52    pub fn new(config: RmaConfig) -> Self {
53        Self {
54            config: Arc::new(config),
55        }
56    }
57
58    /// Parse a single file
59    #[instrument(skip(self, content), fields(path = %path.display()))]
60    pub fn parse_file(&self, path: &Path, content: &str) -> Result<ParsedFile> {
61        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
62
63        let language = Language::from_extension(ext);
64
65        if language == Language::Unknown {
66            return Err(RmaError::UnsupportedLanguage(ext.to_string()).into());
67        }
68
69        let mut parser = Parser::new();
70        let ts_language = languages::get_language(language)?;
71        parser.set_language(&ts_language)?;
72
73        let tree = parser
74            .parse(content, None)
75            .context("Failed to parse file")?;
76
77        let parse_errors = collect_parse_errors(&tree, path, content);
78
79        debug!(
80            "Parsed {} ({}) - {} errors",
81            path.display(),
82            language,
83            parse_errors.len()
84        );
85
86        Ok(ParsedFile {
87            path: path.to_path_buf(),
88            language,
89            content: content.to_string(),
90            tree,
91            parse_errors,
92        })
93    }
94
95    /// Parse all supported files in a directory tree in parallel
96    #[instrument(skip(self))]
97    pub fn parse_directory(&self, root: &Path) -> Result<(Vec<ParsedFile>, ParseStats)> {
98        info!("Starting parallel parse of {}", root.display());
99
100        let files = walker::collect_files(root, &self.config)?;
101        info!("Found {} files to parse", files.len());
102
103        let results: Vec<_> = files
104            .par_iter()
105            .filter_map(|path| match std::fs::read_to_string(path) {
106                Ok(content) => {
107                    if content.len() > self.config.max_file_size {
108                        warn!("Skipping large file: {}", path.display());
109                        return None;
110                    }
111                    Some((path.clone(), content))
112                }
113                Err(e) => {
114                    warn!("Failed to read {}: {}", path.display(), e);
115                    None
116                }
117            })
118            .map(|(path, content)| {
119                let result = self.parse_file(&path, &content);
120                (path, result, content.len())
121            })
122            .collect();
123
124        let mut parsed_files = Vec::new();
125        let mut stats = ParseStats::default();
126
127        for (path, result, bytes) in results {
128            match result {
129                Ok(parsed) => {
130                    stats.parse_errors += parsed.parse_errors.len();
131                    stats.total_bytes += bytes;
132                    stats.files_parsed += 1;
133                    parsed_files.push(parsed);
134                }
135                Err(e) => {
136                    if e.downcast_ref::<RmaError>()
137                        .map(|e| matches!(e, RmaError::UnsupportedLanguage(_)))
138                        .unwrap_or(false)
139                    {
140                        stats.files_skipped += 1;
141                    } else {
142                        debug!("Failed to parse {}: {}", path.display(), e);
143                        stats.files_failed += 1;
144                    }
145                }
146            }
147        }
148
149        info!(
150            "Parsing complete: {} parsed, {} failed, {} skipped",
151            stats.files_parsed, stats.files_failed, stats.files_skipped
152        );
153
154        Ok((parsed_files, stats))
155    }
156}
157
158/// Collect parse errors from a tree-sitter tree
159fn collect_parse_errors(tree: &Tree, path: &Path, content: &str) -> Vec<ParseError> {
160    let mut errors = Vec::new();
161    let mut cursor = tree.walk();
162
163    collect_errors_recursive(&mut cursor, path, content, &mut errors);
164
165    errors
166}
167
168fn collect_errors_recursive(
169    cursor: &mut tree_sitter::TreeCursor,
170    path: &Path,
171    _content: &str,
172    errors: &mut Vec<ParseError>,
173) {
174    let node = cursor.node();
175
176    if node.is_error() || node.is_missing() {
177        let start = node.start_position();
178        let end = node.end_position();
179
180        errors.push(ParseError {
181            location: SourceLocation::new(
182                path.to_path_buf(),
183                start.row + 1,
184                start.column + 1,
185                end.row + 1,
186                end.column + 1,
187            ),
188            message: if node.is_missing() {
189                format!("Missing {}", node.kind())
190            } else {
191                "Syntax error".to_string()
192            },
193        });
194    }
195
196    if cursor.goto_first_child() {
197        loop {
198            collect_errors_recursive(cursor, path, _content, errors);
199            if !cursor.goto_next_sibling() {
200                break;
201            }
202        }
203        cursor.goto_parent();
204    }
205}
206
207/// Helper trait for AST traversal
208pub trait AstVisitor {
209    fn visit_node(&mut self, node: Node, content: &str);
210}
211
212/// Traverse an AST with a visitor
213pub fn traverse_ast<V: AstVisitor>(tree: &Tree, content: &str, visitor: &mut V) {
214    let mut cursor = tree.walk();
215    traverse_recursive(&mut cursor, content, visitor);
216}
217
218fn traverse_recursive<V: AstVisitor>(
219    cursor: &mut tree_sitter::TreeCursor,
220    content: &str,
221    visitor: &mut V,
222) {
223    let node = cursor.node();
224    visitor.visit_node(node, content);
225
226    if cursor.goto_first_child() {
227        loop {
228            traverse_recursive(cursor, content, visitor);
229            if !cursor.goto_next_sibling() {
230                break;
231            }
232        }
233        cursor.goto_parent();
234    }
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `source` as though it lived at `file_name`, using an engine built
    /// from the default configuration.
    fn parse_with_default_engine(file_name: &str, source: &str) -> Result<ParsedFile> {
        let engine = ParserEngine::new(RmaConfig::default());
        engine.parse_file(Path::new(file_name), source)
    }

    #[test]
    fn test_parse_rust_file() {
        let source = r#"
fn main() {
    println!("Hello, world!");
}
"#;
        let outcome = parse_with_default_engine("test.rs", source);
        assert!(outcome.is_ok());
        let file = outcome.unwrap();
        assert_eq!(file.language, Language::Rust);
        assert!(file.parse_errors.is_empty());
    }

    #[test]
    fn test_parse_python_file() {
        let source = r#"
def hello():
    print("Hello, world!")

if __name__ == "__main__":
    hello()
"#;
        let outcome = parse_with_default_engine("test.py", source);
        assert!(outcome.is_ok());
        let file = outcome.unwrap();
        assert_eq!(file.language, Language::Python);
    }

    #[test]
    fn test_parse_javascript_file() {
        let source = r#"
function hello() {
    console.log("Hello, world!");
}
hello();
"#;
        let outcome = parse_with_default_engine("test.js", source);
        assert!(outcome.is_ok());
        let file = outcome.unwrap();
        assert_eq!(file.language, Language::JavaScript);
    }

    #[test]
    fn test_unsupported_language() {
        // ".xyz" is not expected to map to any supported language.
        let outcome = parse_with_default_engine("test.xyz", "content");
        assert!(outcome.is_err());
    }
}