Skip to main content

arbor_core/
parser.rs

1//! Parser module - the heart of code analysis.
2//!
3//! This module wraps Tree-sitter and provides a clean API for parsing
4//! source files into CodeNodes. Language detection is automatic based
5//! on file extension.
6
7use crate::error::{ParseError, Result};
8use crate::fallback_parser;
9use crate::languages::{get_parser, LanguageParser};
10use crate::node::CodeNode;
11use std::fs;
12use std::path::Path;
13
14/// Parses a source file and extracts all code nodes.
15///
16/// This is the main entry point for parsing. It handles:
17/// - Reading the file from disk
18/// - Detecting the language from the extension
19/// - Parsing with Tree-sitter
20/// - Extracting meaningful code entities
21///
22/// # Example
23///
24/// ```no_run
25/// use arbor_core::parse_file;
26/// use std::path::Path;
27///
28/// let nodes = parse_file(Path::new("src/lib.rs")).unwrap();
29/// println!("Found {} nodes", nodes.len());
30/// ```
31pub fn parse_file(path: &Path) -> Result<Vec<CodeNode>> {
32    // Read the source file
33    let source = fs::read_to_string(path).map_err(|e| ParseError::io(path, e))?;
34
35    if source.is_empty() {
36        // Empty __init__.py files are valid Python module indicators
37        if path
38            .file_name()
39            .map(|n| n == "__init__.py")
40            .unwrap_or(false)
41        {
42            return Ok(vec![]); // Return empty nodes, not an error
43        }
44        return Err(ParseError::EmptyFile(path.to_path_buf()));
45    }
46
47    let extension = path
48        .extension()
49        .and_then(|e| e.to_str())
50        .unwrap_or_default();
51
52    // Get the appropriate parser for this file type (tree-sitter path first)
53    let parser = detect_language(path);
54
55    if parser.is_none() {
56        if fallback_parser::is_fallback_supported_extension(extension) {
57            let file_path = path.to_string_lossy().to_string();
58            return Ok(fallback_parser::parse_fallback_source(
59                &source, &file_path, extension,
60            ));
61        }
62        return Err(ParseError::UnsupportedLanguage(path.to_path_buf()));
63    }
64
65    let parser = parser.unwrap();
66
67    // Use the file path as a string for node IDs
68    let file_path = path.to_string_lossy().to_string();
69
70    parse_source(&source, &file_path, parser.as_ref())
71}
72
73/// Parses source code directly (useful for testing or in-memory content).
74///
75/// You need to provide a language parser explicitly since there's no
76/// file extension to detect from.
77pub fn parse_source(
78    source: &str,
79    file_path: &str,
80    lang_parser: &dyn LanguageParser,
81) -> Result<Vec<CodeNode>> {
82    // Create and configure Tree-sitter parser
83    let mut parser = tree_sitter::Parser::new();
84    parser
85        .set_language(&lang_parser.language())
86        .map_err(|e| ParseError::ParserError(format!("Failed to set language: {}", e)))?;
87
88    // Parse the source
89    let tree = parser
90        .parse(source, None)
91        .ok_or_else(|| ParseError::ParserError("Tree-sitter returned no tree".into()))?;
92
93    // Extract nodes using the language-specific extractor
94    let nodes = lang_parser.extract_nodes(&tree, source, file_path);
95
96    Ok(nodes)
97}
98
99/// Detects the programming language from a file path.
100///
101/// Returns None if we don't support the file's extension.
102pub fn detect_language(path: &Path) -> Option<Box<dyn LanguageParser>> {
103    let extension = path.extension()?.to_str()?;
104    get_parser(extension)
105}
106
#[cfg(test)]
mod tests {
    use super::*;
    use crate::node::NodeKind;

    /// Known extensions resolve to a parser; unknown ones do not.
    #[test]
    fn test_detect_language() {
        for supported in ["foo.rs", "bar.ts", "baz.py"] {
            assert!(detect_language(Path::new(supported)).is_some());
        }
        assert!(detect_language(Path::new("unknown.xyz")).is_none());
    }

    /// A `.kt` file written to a temp dir is parsed via `parse_file`.
    #[test]
    fn test_parse_fallback_language_source() {
        let source = r#"
            fun calculateTax(price: Double): Double = price * 0.18
            class TaxService
        "#;

        let tmp = tempfile::tempdir().unwrap();
        let kotlin_file = tmp.path().join("billing.kt");
        std::fs::write(&kotlin_file, source).unwrap();

        let nodes = parse_file(&kotlin_file).unwrap();
        for expected in ["calculateTax", "TaxService"] {
            assert!(nodes.iter().any(|n| n.name == expected));
        }
    }

    /// Rust source yields both the function and the struct.
    #[test]
    fn test_parse_rust_source() {
        let source = r#"
            fn hello_world() {
                println!("Hello!");
            }

            pub struct User {
                name: String,
            }
        "#;

        let rust_parser = get_parser("rs").unwrap();
        let nodes = parse_source(source, "test.rs", rust_parser.as_ref()).unwrap();

        // Should find at least the function and struct
        let found_fn = nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Function)
            .any(|n| n.name == "hello_world");
        let found_struct = nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Struct)
            .any(|n| n.name == "User");

        assert!(found_fn);
        assert!(found_struct);
    }

    /// TypeScript source yields both the exported function and the class.
    #[test]
    fn test_parse_typescript_source() {
        let source = r#"
            export function greet(name: string): string {
                return `Hello, ${name}!`;
            }

            export class UserService {
                validate() {}
            }
        "#;

        let ts_parser = get_parser("ts").unwrap();
        let nodes = parse_source(source, "test.ts", ts_parser.as_ref()).unwrap();

        let found_fn = nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Function)
            .any(|n| n.name == "greet");
        let found_class = nodes
            .iter()
            .filter(|n| n.kind == NodeKind::Class)
            .any(|n| n.name == "UserService");

        assert!(found_fn);
        assert!(found_class);
    }
}