sigil_parser/
tree_sitter_support.rs

//! Tree-sitter integration for Sigil
//!
//! Provides syntax parsing capabilities for multiple programming languages
//! using tree-sitter grammars. This enables Samael and other AI agents to
//! perform real syntax analysis on source code.
6
7use std::cell::RefCell;
8use std::collections::HashMap;
9use std::rc::Rc;
10use tree_sitter::{Language, Parser, Tree, Node};
11
/// Languages that can be parsed through a tree-sitter grammar.
///
/// Each variant maps to exactly one grammar; see `TSLanguage::get_language`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TSLanguage {
    /// Rust source code.
    Rust,
    /// Python source code.
    Python,
    /// JavaScript source code.
    JavaScript,
    /// TypeScript source code (the plain TypeScript grammar).
    TypeScript,
    /// TypeScript parsed with the TSX grammar.
    TypeScriptTsx,
    /// Go source code.
    Go,
    /// C source code.
    C,
    /// C++ source code.
    Cpp,
    /// Java source code.
    Java,
    /// JSON documents.
    Json,
    /// CSS stylesheets.
    Css,
    /// Bash / shell scripts.
    Bash,
}
28
29impl TSLanguage {
30    /// Get the tree-sitter Language for this enum variant
31    pub fn get_language(&self) -> Language {
32        match self {
33            TSLanguage::Rust => tree_sitter_rust::language(),
34            TSLanguage::Python => tree_sitter_python::language(),
35            TSLanguage::JavaScript => tree_sitter_javascript::language(),
36            TSLanguage::TypeScript => tree_sitter_typescript::language_typescript(),
37            TSLanguage::TypeScriptTsx => tree_sitter_typescript::language_tsx(),
38            TSLanguage::Go => tree_sitter_go::language(),
39            TSLanguage::C => tree_sitter_c::language(),
40            TSLanguage::Cpp => tree_sitter_cpp::language(),
41            TSLanguage::Java => tree_sitter_java::language(),
42            TSLanguage::Json => tree_sitter_json::language(),
43            TSLanguage::Css => tree_sitter_css::language(),
44            TSLanguage::Bash => tree_sitter_bash::language(),
45        }
46    }
47
48    /// Parse a string to get the language enum
49    /// Handles formats like "Rust", "rust", "Language::Rust", etc.
50    pub fn from_str(s: &str) -> Option<Self> {
51        // Handle enum-style strings like "Language::Rust"
52        let name = s.rsplit("::").next().unwrap_or(s);
53
54        match name.to_lowercase().as_str() {
55            "rust" | "rs" | "sigil" => Some(TSLanguage::Rust),
56            "python" | "py" => Some(TSLanguage::Python),
57            "javascript" | "js" => Some(TSLanguage::JavaScript),
58            "typescript" | "ts" => Some(TSLanguage::TypeScript),
59            "tsx" | "typescripttsx" => Some(TSLanguage::TypeScriptTsx),
60            "go" | "golang" => Some(TSLanguage::Go),
61            "c" => Some(TSLanguage::C),
62            "cpp" | "c++" | "cxx" => Some(TSLanguage::Cpp),
63            "java" => Some(TSLanguage::Java),
64            "json" => Some(TSLanguage::Json),
65            "css" => Some(TSLanguage::Css),
66            "bash" | "sh" | "shell" => Some(TSLanguage::Bash),
67            // Languages not yet supported - return None
68            "html" | "htm" => None,  // tree-sitter-html uses incompatible version
69            "kotlin" | "kt" => None,
70            "yaml" | "yml" => None,
71            "toml" => None,
72            "sql" => None,
73            "markdown" | "md" => None,
74            _ => None,
75        }
76    }
77
78    /// Get the canonical name for this language
79    pub fn name(&self) -> &'static str {
80        match self {
81            TSLanguage::Rust => "Rust",
82            TSLanguage::Python => "Python",
83            TSLanguage::JavaScript => "JavaScript",
84            TSLanguage::TypeScript => "TypeScript",
85            TSLanguage::TypeScriptTsx => "TypeScriptTsx",
86            TSLanguage::Go => "Go",
87            TSLanguage::C => "C",
88            TSLanguage::Cpp => "Cpp",
89            TSLanguage::Java => "Java",
90            TSLanguage::Json => "Json",
91            TSLanguage::Css => "Css",
92            TSLanguage::Bash => "Bash",
93        }
94    }
95}
96
97/// Tree-sitter parser wrapper
98pub struct TSParser {
99    parser: Parser,
100    language: TSLanguage,
101}
102
103impl TSParser {
104    /// Create a new parser for the given language
105    pub fn new(language: TSLanguage) -> Result<Self, String> {
106        let mut parser = Parser::new();
107        parser.set_language(language.get_language())
108            .map_err(|e| format!("Failed to set language: {:?}", e))?;
109
110        Ok(TSParser { parser, language })
111    }
112
113    /// Parse source code and return a tree
114    pub fn parse(&mut self, source: &str) -> Result<TSTree, String> {
115        self.parser.parse(source, None)
116            .map(|tree| TSTree {
117                tree,
118                source: source.to_string(),
119                language: self.language,
120            })
121            .ok_or_else(|| "Failed to parse source code".to_string())
122    }
123
124    /// Get the language this parser is configured for
125    pub fn language(&self) -> TSLanguage {
126        self.language
127    }
128}
129
130/// Wrapper for a parsed syntax tree
131pub struct TSTree {
132    tree: Tree,
133    source: String,
134    language: TSLanguage,
135}
136
137impl TSTree {
138    /// Get the root node of the tree
139    pub fn root_node(&self) -> Node {
140        self.tree.root_node()
141    }
142
143    /// Get the source code that was parsed
144    pub fn source(&self) -> &str {
145        &self.source
146    }
147
148    /// Get the language of this tree
149    pub fn language(&self) -> TSLanguage {
150        self.language
151    }
152}
153
154/// Parse source code with a given language
155pub fn parse_source(language_str: &str, source: &str) -> Result<TSTree, String> {
156    let language = TSLanguage::from_str(language_str)
157        .ok_or_else(|| format!("Unsupported language: {}", language_str))?;
158
159    let mut parser = TSParser::new(language)?;
160    parser.parse(source)
161}
162
163/// Convert a tree-sitter Node to interpreter Value fields
164/// Returns a HashMap suitable for creating a SyntaxNode struct
165pub fn node_to_value(node: &Node) -> HashMap<String, crate::interpreter::Value> {
166    use crate::interpreter::Value;
167
168    let mut fields = HashMap::new();
169
170    // Basic node info
171    fields.insert("kind".to_string(), Value::String(Rc::new(node.kind().to_string())));
172    fields.insert("is_named".to_string(), Value::Bool(node.is_named()));
173    fields.insert("is_error".to_string(), Value::Bool(node.is_error()));
174    fields.insert("is_missing".to_string(), Value::Bool(node.is_missing()));
175
176    // Position info
177    let start = node.start_position();
178    let end = node.end_position();
179
180    let mut start_fields = HashMap::new();
181    start_fields.insert("row".to_string(), Value::Int(start.row as i64));
182    start_fields.insert("column".to_string(), Value::Int(start.column as i64));
183    fields.insert("start".to_string(), Value::Struct {
184        name: "Position".to_string(),
185        fields: Rc::new(RefCell::new(start_fields)),
186    });
187
188    let mut end_fields = HashMap::new();
189    end_fields.insert("row".to_string(), Value::Int(end.row as i64));
190    end_fields.insert("column".to_string(), Value::Int(end.column as i64));
191    fields.insert("end".to_string(), Value::Struct {
192        name: "Position".to_string(),
193        fields: Rc::new(RefCell::new(end_fields)),
194    });
195
196    // Byte range
197    fields.insert("start_byte".to_string(), Value::Int(node.start_byte() as i64));
198    fields.insert("end_byte".to_string(), Value::Int(node.end_byte() as i64));
199
200    // Child count
201    fields.insert("child_count".to_string(), Value::Int(node.child_count() as i64));
202    fields.insert("named_child_count".to_string(), Value::Int(node.named_child_count() as i64));
203
204    // Children (recursively converted)
205    let children: Vec<Value> = (0..node.child_count())
206        .filter_map(|i| node.child(i))
207        .map(|child| Value::Struct {
208            name: "SyntaxNode".to_string(),
209            fields: Rc::new(RefCell::new(node_to_value(&child))),
210        })
211        .collect();
212    fields.insert("children".to_string(), Value::Array(Rc::new(RefCell::new(children))));
213
214    // Named children only
215    let named_children: Vec<Value> = (0..node.named_child_count())
216        .filter_map(|i| node.named_child(i))
217        .map(|child| Value::Struct {
218            name: "SyntaxNode".to_string(),
219            fields: Rc::new(RefCell::new(node_to_value(&child))),
220        })
221        .collect();
222    fields.insert("named_children".to_string(), Value::Array(Rc::new(RefCell::new(named_children))));
223
224    fields
225}
226
227/// Get the text content of a node from source
228pub fn node_text<'a>(node: &Node, source: &'a str) -> &'a str {
229    &source[node.start_byte()..node.end_byte()]
230}
231
/// Returns the names of all supported languages, in a fixed order.
pub fn supported_languages() -> Vec<&'static str> {
    const NAMES: [&str; 12] = [
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "TypeScriptTsx",
        "Go",
        "C",
        "Cpp",
        "Java",
        "Json",
        "Css",
        "Bash",
    ];

    NAMES.to_vec()
}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `source` as `language`, panicking if parsing fails.
    fn parse_ok(language: &str, source: &str) -> TSTree {
        parse_source(language, source).unwrap()
    }

    #[test]
    fn test_parse_rust() {
        let tree = parse_ok(
            "rust",
            r#"
fn main() {
    println!("Hello, world!");
}
"#,
        );
        let root = tree.root_node();
        assert_eq!(root.kind(), "source_file");
        assert!(root.child_count() > 0);
    }

    #[test]
    fn test_parse_python() {
        let tree = parse_ok(
            "python",
            r#"
def greet(name):
    print(f"Hello, {name}!")

greet("world")
"#,
        );
        assert_eq!(tree.root_node().kind(), "module");
    }

    #[test]
    fn test_parse_javascript() {
        let tree = parse_ok(
            "javascript",
            r#"
function greet(name) {
    console.log(`Hello, ${name}!`);
}
greet("world");
"#,
        );
        assert_eq!(tree.root_node().kind(), "program");
    }

    #[test]
    fn test_language_from_str() {
        // (input, expected resolution)
        let cases = [
            ("rust", Some(TSLanguage::Rust)),
            ("Rust", Some(TSLanguage::Rust)),
            ("Language::Rust", Some(TSLanguage::Rust)),
            ("python", Some(TSLanguage::Python)),
            ("py", Some(TSLanguage::Python)),
            ("unknown", None),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(TSLanguage::from_str(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_unsupported_languages() {
        for name in ["kotlin", "yaml", "toml"].iter() {
            assert!(TSLanguage::from_str(name).is_none(), "input: {:?}", name);
        }
    }
}
299}