sigil_parser/
tree_sitter_support.rs

1//! Tree-sitter integration for Sigil
2//!
3//! Provides syntax parsing capabilities for multiple programming languages
4//! using tree-sitter grammars. This enables Samael and other AI agents to
5//! perform real syntax analysis on source code.
6
7use std::cell::RefCell;
8use std::collections::HashMap;
9use std::rc::Rc;
10use tree_sitter::{Language, Node, Parser, Tree};
11
/// Languages that can be parsed through tree-sitter.
///
/// This is a small `Copy` tag: it can be compared, hashed and passed by value.
/// Each variant corresponds to one grammar returned by `get_language`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TSLanguage {
    /// Rust (grammar from `tree_sitter_rust`).
    Rust,
    /// Python (grammar from `tree_sitter_python`).
    Python,
    /// JavaScript (grammar from `tree_sitter_javascript`).
    JavaScript,
    /// TypeScript without TSX (the `language_typescript` grammar).
    TypeScript,
    /// TypeScript with TSX (the `language_tsx` grammar).
    TypeScriptTsx,
    /// Go (grammar from `tree_sitter_go`).
    Go,
    /// C (grammar from `tree_sitter_c`).
    C,
    /// C++ (grammar from `tree_sitter_cpp`).
    Cpp,
    /// Java (grammar from `tree_sitter_java`).
    Java,
    /// JSON (grammar from `tree_sitter_json`).
    Json,
    /// CSS (grammar from `tree_sitter_css`).
    Css,
    /// Bash / POSIX-style shell (grammar from `tree_sitter_bash`).
    Bash,
}
28
29impl TSLanguage {
30    /// Get the tree-sitter Language for this enum variant
31    pub fn get_language(&self) -> Language {
32        match self {
33            TSLanguage::Rust => tree_sitter_rust::language(),
34            TSLanguage::Python => tree_sitter_python::language(),
35            TSLanguage::JavaScript => tree_sitter_javascript::language(),
36            TSLanguage::TypeScript => tree_sitter_typescript::language_typescript(),
37            TSLanguage::TypeScriptTsx => tree_sitter_typescript::language_tsx(),
38            TSLanguage::Go => tree_sitter_go::language(),
39            TSLanguage::C => tree_sitter_c::language(),
40            TSLanguage::Cpp => tree_sitter_cpp::language(),
41            TSLanguage::Java => tree_sitter_java::language(),
42            TSLanguage::Json => tree_sitter_json::language(),
43            TSLanguage::Css => tree_sitter_css::language(),
44            TSLanguage::Bash => tree_sitter_bash::language(),
45        }
46    }
47
48    /// Parse a string to get the language enum
49    /// Handles formats like "Rust", "rust", "Language::Rust", etc.
50    pub fn from_str(s: &str) -> Option<Self> {
51        // Handle enum-style strings like "Language::Rust"
52        let name = s.rsplit("::").next().unwrap_or(s);
53
54        match name.to_lowercase().as_str() {
55            "rust" | "rs" | "sigil" => Some(TSLanguage::Rust),
56            "python" | "py" => Some(TSLanguage::Python),
57            "javascript" | "js" => Some(TSLanguage::JavaScript),
58            "typescript" | "ts" => Some(TSLanguage::TypeScript),
59            "tsx" | "typescripttsx" => Some(TSLanguage::TypeScriptTsx),
60            "go" | "golang" => Some(TSLanguage::Go),
61            "c" => Some(TSLanguage::C),
62            "cpp" | "c++" | "cxx" => Some(TSLanguage::Cpp),
63            "java" => Some(TSLanguage::Java),
64            "json" => Some(TSLanguage::Json),
65            "css" => Some(TSLanguage::Css),
66            "bash" | "sh" | "shell" => Some(TSLanguage::Bash),
67            // Languages not yet supported - return None
68            "html" | "htm" => None, // tree-sitter-html uses incompatible version
69            "kotlin" | "kt" => None,
70            "yaml" | "yml" => None,
71            "toml" => None,
72            "sql" => None,
73            "markdown" | "md" => None,
74            _ => None,
75        }
76    }
77
78    /// Get the canonical name for this language
79    pub fn name(&self) -> &'static str {
80        match self {
81            TSLanguage::Rust => "Rust",
82            TSLanguage::Python => "Python",
83            TSLanguage::JavaScript => "JavaScript",
84            TSLanguage::TypeScript => "TypeScript",
85            TSLanguage::TypeScriptTsx => "TypeScriptTsx",
86            TSLanguage::Go => "Go",
87            TSLanguage::C => "C",
88            TSLanguage::Cpp => "Cpp",
89            TSLanguage::Java => "Java",
90            TSLanguage::Json => "Json",
91            TSLanguage::Css => "Css",
92            TSLanguage::Bash => "Bash",
93        }
94    }
95}
96
97/// Tree-sitter parser wrapper
98pub struct TSParser {
99    parser: Parser,
100    language: TSLanguage,
101}
102
103impl TSParser {
104    /// Create a new parser for the given language
105    pub fn new(language: TSLanguage) -> Result<Self, String> {
106        let mut parser = Parser::new();
107        parser
108            .set_language(language.get_language())
109            .map_err(|e| format!("Failed to set language: {:?}", e))?;
110
111        Ok(TSParser { parser, language })
112    }
113
114    /// Parse source code and return a tree
115    pub fn parse(&mut self, source: &str) -> Result<TSTree, String> {
116        self.parser
117            .parse(source, None)
118            .map(|tree| TSTree {
119                tree,
120                source: source.to_string(),
121                language: self.language,
122            })
123            .ok_or_else(|| "Failed to parse source code".to_string())
124    }
125
126    /// Get the language this parser is configured for
127    pub fn language(&self) -> TSLanguage {
128        self.language
129    }
130}
131
132/// Wrapper for a parsed syntax tree
133pub struct TSTree {
134    tree: Tree,
135    source: String,
136    language: TSLanguage,
137}
138
139impl TSTree {
140    /// Get the root node of the tree
141    pub fn root_node(&self) -> Node {
142        self.tree.root_node()
143    }
144
145    /// Get the source code that was parsed
146    pub fn source(&self) -> &str {
147        &self.source
148    }
149
150    /// Get the language of this tree
151    pub fn language(&self) -> TSLanguage {
152        self.language
153    }
154}
155
156/// Parse source code with a given language
157pub fn parse_source(language_str: &str, source: &str) -> Result<TSTree, String> {
158    let language = TSLanguage::from_str(language_str)
159        .ok_or_else(|| format!("Unsupported language: {}", language_str))?;
160
161    let mut parser = TSParser::new(language)?;
162    parser.parse(source)
163}
164
165/// Convert a tree-sitter Node to interpreter Value fields
166/// Returns a HashMap suitable for creating a SyntaxNode struct
167pub fn node_to_value(node: &Node) -> HashMap<String, crate::interpreter::Value> {
168    use crate::interpreter::Value;
169
170    let mut fields = HashMap::new();
171
172    // Basic node info
173    fields.insert(
174        "kind".to_string(),
175        Value::String(Rc::new(node.kind().to_string())),
176    );
177    fields.insert("is_named".to_string(), Value::Bool(node.is_named()));
178    fields.insert("is_error".to_string(), Value::Bool(node.is_error()));
179    fields.insert("is_missing".to_string(), Value::Bool(node.is_missing()));
180
181    // Position info
182    let start = node.start_position();
183    let end = node.end_position();
184
185    let mut start_fields = HashMap::new();
186    start_fields.insert("row".to_string(), Value::Int(start.row as i64));
187    start_fields.insert("column".to_string(), Value::Int(start.column as i64));
188    fields.insert(
189        "start".to_string(),
190        Value::Struct {
191            name: "Position".to_string(),
192            fields: Rc::new(RefCell::new(start_fields)),
193        },
194    );
195
196    let mut end_fields = HashMap::new();
197    end_fields.insert("row".to_string(), Value::Int(end.row as i64));
198    end_fields.insert("column".to_string(), Value::Int(end.column as i64));
199    fields.insert(
200        "end".to_string(),
201        Value::Struct {
202            name: "Position".to_string(),
203            fields: Rc::new(RefCell::new(end_fields)),
204        },
205    );
206
207    // Byte range
208    fields.insert(
209        "start_byte".to_string(),
210        Value::Int(node.start_byte() as i64),
211    );
212    fields.insert("end_byte".to_string(), Value::Int(node.end_byte() as i64));
213
214    // Child count
215    fields.insert(
216        "child_count".to_string(),
217        Value::Int(node.child_count() as i64),
218    );
219    fields.insert(
220        "named_child_count".to_string(),
221        Value::Int(node.named_child_count() as i64),
222    );
223
224    // Children (recursively converted)
225    let children: Vec<Value> = (0..node.child_count())
226        .filter_map(|i| node.child(i))
227        .map(|child| Value::Struct {
228            name: "SyntaxNode".to_string(),
229            fields: Rc::new(RefCell::new(node_to_value(&child))),
230        })
231        .collect();
232    fields.insert(
233        "children".to_string(),
234        Value::Array(Rc::new(RefCell::new(children))),
235    );
236
237    // Named children only
238    let named_children: Vec<Value> = (0..node.named_child_count())
239        .filter_map(|i| node.named_child(i))
240        .map(|child| Value::Struct {
241            name: "SyntaxNode".to_string(),
242            fields: Rc::new(RefCell::new(node_to_value(&child))),
243        })
244        .collect();
245    fields.insert(
246        "named_children".to_string(),
247        Value::Array(Rc::new(RefCell::new(named_children))),
248    );
249
250    fields
251}
252
253/// Get the text content of a node from source
254pub fn node_text<'a>(node: &Node, source: &'a str) -> &'a str {
255    &source[node.start_byte()..node.end_byte()]
256}
257
/// Names of every supported language, in declaration order.
///
/// The strings are the canonical names (`"Rust"`, `"Cpp"`, `"Json"`, ...).
pub fn supported_languages() -> Vec<&'static str> {
    const NAMES: [&str; 12] = [
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "TypeScriptTsx",
        "Go",
        "C",
        "Cpp",
        "Java",
        "Json",
        "Css",
        "Bash",
    ];
    NAMES.to_vec()
}
275
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `source` as `language` and return the kind of the root node.
    fn root_kind(language: &str, source: &str) -> String {
        let tree = parse_source(language, source).unwrap();
        tree.root_node().kind().to_string()
    }

    #[test]
    fn test_parse_rust() {
        let source = r#"
fn main() {
    println!("Hello, world!");
}
"#;
        // Checked inline (not via `root_kind`) because the child count is
        // asserted as well.
        let tree = parse_source("rust", source).unwrap();
        let root = tree.root_node();
        assert_eq!(root.kind(), "source_file");
        assert!(root.child_count() > 0);
    }

    #[test]
    fn test_parse_python() {
        let source = r#"
def greet(name):
    print(f"Hello, {name}!")

greet("world")
"#;
        assert_eq!(root_kind("python", source), "module");
    }

    #[test]
    fn test_parse_javascript() {
        let source = r#"
function greet(name) {
    console.log(`Hello, ${name}!`);
}
greet("world");
"#;
        assert_eq!(root_kind("javascript", source), "program");
    }

    #[test]
    fn test_language_from_str() {
        // (input, expected resolution)
        let cases = [
            ("rust", Some(TSLanguage::Rust)),
            ("Rust", Some(TSLanguage::Rust)),
            ("Language::Rust", Some(TSLanguage::Rust)),
            ("python", Some(TSLanguage::Python)),
            ("py", Some(TSLanguage::Python)),
            ("unknown", None),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(TSLanguage::from_str(input), *expected);
        }
    }

    #[test]
    fn test_unsupported_languages() {
        for name in ["kotlin", "yaml", "toml"].iter() {
            assert!(TSLanguage::from_str(name).is_none());
        }
    }
}
338}