Skip to main content

arbor_core/
parser.rs

1//! Parser module - the heart of code analysis.
2//!
3//! This module wraps Tree-sitter and provides a clean API for parsing
4//! source files into CodeNodes. Language detection is automatic based
5//! on file extension.
6
7use crate::error::{ParseError, Result};
8use crate::languages::{get_parser, LanguageParser};
9use crate::node::CodeNode;
10use std::fs;
11use std::path::Path;
12
13/// Parses a source file and extracts all code nodes.
14///
15/// This is the main entry point for parsing. It handles:
16/// - Reading the file from disk
17/// - Detecting the language from the extension
18/// - Parsing with Tree-sitter
19/// - Extracting meaningful code entities
20///
21/// # Example
22///
23/// ```no_run
24/// use arbor_core::parse_file;
25/// use std::path::Path;
26///
27/// let nodes = parse_file(Path::new("src/lib.rs")).unwrap();
28/// println!("Found {} nodes", nodes.len());
29/// ```
30pub fn parse_file(path: &Path) -> Result<Vec<CodeNode>> {
31    // Read the source file
32    let source = fs::read_to_string(path).map_err(|e| ParseError::io(path, e))?;
33
34    if source.is_empty() {
35        // Empty __init__.py files are valid Python module indicators
36        if path
37            .file_name()
38            .map(|n| n == "__init__.py")
39            .unwrap_or(false)
40        {
41            return Ok(vec![]); // Return empty nodes, not an error
42        }
43        return Err(ParseError::EmptyFile(path.to_path_buf()));
44    }
45
46    // Get the appropriate parser for this file type
47    let parser =
48        detect_language(path).ok_or_else(|| ParseError::UnsupportedLanguage(path.to_path_buf()))?;
49
50    // Use the file path as a string for node IDs
51    let file_path = path.to_string_lossy().to_string();
52
53    parse_source(&source, &file_path, parser.as_ref())
54}
55
56/// Parses source code directly (useful for testing or in-memory content).
57///
58/// You need to provide a language parser explicitly since there's no
59/// file extension to detect from.
60pub fn parse_source(
61    source: &str,
62    file_path: &str,
63    lang_parser: &dyn LanguageParser,
64) -> Result<Vec<CodeNode>> {
65    // Create and configure Tree-sitter parser
66    let mut parser = tree_sitter::Parser::new();
67    parser
68        .set_language(&lang_parser.language())
69        .map_err(|e| ParseError::ParserError(format!("Failed to set language: {}", e)))?;
70
71    // Parse the source
72    let tree = parser
73        .parse(source, None)
74        .ok_or_else(|| ParseError::ParserError("Tree-sitter returned no tree".into()))?;
75
76    // Extract nodes using the language-specific extractor
77    let nodes = lang_parser.extract_nodes(&tree, source, file_path);
78
79    Ok(nodes)
80}
81
82/// Detects the programming language from a file path.
83///
84/// Returns None if we don't support the file's extension.
85pub fn detect_language(path: &Path) -> Option<Box<dyn LanguageParser>> {
86    let extension = path.extension()?.to_str()?;
87    get_parser(extension)
88}
89
#[cfg(test)]
mod tests {
    use super::*;
    use crate::node::NodeKind;

    /// Reports whether `nodes` holds an entry with the given name and kind.
    fn has_node(nodes: &[CodeNode], name: &str, kind: NodeKind) -> bool {
        nodes
            .iter()
            .any(|node| node.name == name && node.kind == kind)
    }

    /// Known extensions resolve to a parser; unknown ones do not.
    #[test]
    fn test_detect_language() {
        let is_supported = |file_name: &str| detect_language(Path::new(file_name)).is_some();

        assert!(is_supported("foo.rs"));
        assert!(is_supported("bar.ts"));
        assert!(is_supported("baz.py"));
        assert!(!is_supported("unknown.xyz"));
    }

    /// A Rust snippet yields its function and struct as nodes.
    #[test]
    fn test_parse_rust_source() {
        let rust_snippet = r#"
            fn hello_world() {
                println!("Hello!");
            }

            pub struct User {
                name: String,
            }
        "#;

        let rust_parser = get_parser("rs").unwrap();
        let extracted = parse_source(rust_snippet, "test.rs", rust_parser.as_ref()).unwrap();

        // At minimum the function and the struct must be present.
        assert!(has_node(&extracted, "hello_world", NodeKind::Function));
        assert!(has_node(&extracted, "User", NodeKind::Struct));
    }

    /// A TypeScript snippet yields its exported function and class as nodes.
    #[test]
    fn test_parse_typescript_source() {
        let ts_snippet = r#"
            export function greet(name: string): string {
                return `Hello, ${name}!`;
            }

            export class UserService {
                validate() {}
            }
        "#;

        let ts_parser = get_parser("ts").unwrap();
        let extracted = parse_source(ts_snippet, "test.ts", ts_parser.as_ref()).unwrap();

        assert!(has_node(&extracted, "greet", NodeKind::Function));
        assert!(has_node(&extracted, "UserService", NodeKind::Class));
    }
}