Skip to main content

merge_engine/
parser.rs

1//! Tree-sitter CST parser integration.
2//!
3//! Following LASTMERGE (Duarte, Borba, Cavalcanti — 2025), we parse source code
4//! into concrete syntax trees using Tree-sitter. Unlike abstract syntax trees,
5//! CSTs preserve all syntactic elements (whitespace, punctuation, comments),
6//! enabling faithful source reconstruction.
7//!
8//! The parser maps Tree-sitter's node representation into our CstNode types,
9//! classifying nodes as Leaf (terminal), Constructed (fixed-arity non-terminal),
10//! or List (variable-length non-terminal with ordering semantics).
11
12use std::sync::atomic::{AtomicUsize, Ordering};
13
14use crate::types::{CstNode, Language, ListOrdering};
15
16static NEXT_ID: AtomicUsize = AtomicUsize::new(1);
17
18fn fresh_id() -> usize {
19    NEXT_ID.fetch_add(1, Ordering::Relaxed)
20}
21
22/// Reset the ID counter (for testing determinism).
23pub fn reset_ids() {
24    NEXT_ID.store(1, Ordering::Relaxed);
25}
26
27/// Parse source code into a CstNode tree using tree-sitter.
28pub fn parse_to_cst(source: &str, lang: Language) -> Result<CstNode, ParseError> {
29    let ts_lang = get_tree_sitter_language(lang)?;
30    let mut parser = tree_sitter::Parser::new();
31    parser
32        .set_language(&ts_lang)
33        .map_err(|e| ParseError::LanguageError(e.to_string()))?;
34
35    let tree = parser.parse(source, None).ok_or(ParseError::ParseFailed)?;
36
37    let root = tree.root_node();
38    Ok(ts_node_to_cst(&root, source.as_bytes()))
39}
40
41/// Recursively convert a tree-sitter node to our CstNode representation.
42fn ts_node_to_cst(node: &tree_sitter::Node, source: &[u8]) -> CstNode {
43    let kind = node.kind().to_string();
44    let id = fresh_id();
45
46    if node.child_count() == 0 {
47        // Terminal / leaf node
48        let value = node.utf8_text(source).unwrap_or("").to_string();
49        return CstNode::Leaf { id, kind, value };
50    }
51
52    // Collect children (including anonymous nodes for whitespace/punctuation fidelity)
53    let children: Vec<CstNode> = (0..node.child_count())
54        .filter_map(|i| node.child(i))
55        .map(|child| ts_node_to_cst(&child, source))
56        .collect();
57
58    // Classify as List or Constructed based on node kind.
59    // Per LASTMERGE: list nodes have variable-length children of the same kind.
60    let ordering = classify_ordering(&kind);
61
62    if is_list_node(&kind) || children.len() > 3 {
63        CstNode::List {
64            id,
65            kind,
66            ordering,
67            children,
68        }
69    } else {
70        CstNode::Constructed { id, kind, children }
71    }
72}
73
74/// Determine if a node kind represents an unordered collection.
75/// Per LASTMERGE: import blocks and class member lists are unordered because
76/// their children can be permuted without affecting semantics.
77fn classify_ordering(kind: &str) -> ListOrdering {
78    match kind {
79        // Import / use declarations — order doesn't matter
80        "use_declaration_list"
81        | "import_list"
82        | "import_statement"
83        | "imports"
84        | "import_header" => ListOrdering::Unordered,
85        // Class/struct member lists — order usually doesn't matter for semantics
86        "class_body" | "enum_body" | "interface_body" | "declaration_list" | "companion_object"
87        | "enum_class_body" | "object_declaration" => ListOrdering::Unordered,
88        // TOML tables — key ordering within a table doesn't affect semantics
89        "table" | "inline_table" | "document" | "table_array_element" => ListOrdering::Unordered,
90        // YAML mappings — key ordering doesn't affect semantics
91        "block_mapping" | "flow_mapping" => ListOrdering::Unordered,
92        // Everything else is ordered by default
93        _ => ListOrdering::Ordered,
94    }
95}
96
97/// Heuristic: nodes that typically hold lists of children.
98fn is_list_node(kind: &str) -> bool {
99    kind.contains("block")
100        || kind.contains("body")
101        || kind.contains("list")
102        || kind.contains("statements")
103        || kind.contains("arguments")
104        || kind.contains("parameters")
105        || kind.ends_with("_list")
106        || kind == "program"
107        || kind == "source_file"
108        || kind == "module"
109        || kind == "translation_unit"
110        // TOML
111        || kind == "document"
112        || kind == "table"
113        || kind == "array"
114        || kind == "inline_table"
115        // YAML
116        || kind == "stream"
117        || kind == "block_node"
118        || kind == "block_mapping"
119        || kind == "block_sequence"
120        || kind == "flow_mapping"
121        || kind == "flow_sequence"
122}
123
124/// Get the tree-sitter Language object for a given language.
125fn get_tree_sitter_language(lang: Language) -> Result<tree_sitter::Language, ParseError> {
126    let lang_ref = match lang {
127        Language::Rust => tree_sitter_rust::LANGUAGE,
128        Language::JavaScript => tree_sitter_javascript::LANGUAGE,
129        Language::TypeScript => tree_sitter_typescript::LANGUAGE_TYPESCRIPT,
130        Language::Python => tree_sitter_python::LANGUAGE,
131        Language::Java => tree_sitter_java::LANGUAGE,
132        Language::Go => tree_sitter_go::LANGUAGE,
133        Language::C => tree_sitter_c::LANGUAGE,
134        Language::Cpp => tree_sitter_cpp::LANGUAGE,
135        Language::Kotlin => tree_sitter_kotlin_ng::LANGUAGE,
136        Language::Toml => tree_sitter_toml_ng::LANGUAGE,
137        Language::Yaml => tree_sitter_yaml::LANGUAGE,
138    };
139    Ok(lang_ref.into())
140}
141
142#[derive(Debug)]
143pub enum ParseError {
144    LanguageError(String),
145    ParseFailed,
146}
147
148impl std::fmt::Display for ParseError {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        match self {
151            ParseError::LanguageError(s) => write!(f, "language error: {}", s),
152            ParseError::ParseFailed => write!(f, "parse failed"),
153        }
154    }
155}
156
157impl std::error::Error for ParseError {}
158
159#[cfg(test)]
160mod tests {
161    use super::*;
162
163    #[test]
164    fn test_parse_rust() {
165        let src = "fn main() { let x = 1; }";
166        let tree = parse_to_cst(src, Language::Rust).unwrap();
167        assert_eq!(tree.kind(), "source_file");
168        assert!(!tree.children().is_empty());
169    }
170
171    #[test]
172    fn test_parse_javascript() {
173        let src = "function foo() { return 42; }";
174        let tree = parse_to_cst(src, Language::JavaScript).unwrap();
175        assert_eq!(tree.kind(), "program");
176        assert!(!tree.children().is_empty());
177    }
178
179    #[test]
180    fn test_parse_kotlin() {
181        let src = "fun main() { val x = 1 }";
182        let tree = parse_to_cst(src, Language::Kotlin).unwrap();
183        assert_eq!(tree.kind(), "source_file");
184        assert!(!tree.children().is_empty());
185        let reconstructed = tree.to_source();
186        assert!(reconstructed.contains("fun"));
187        assert!(reconstructed.contains("val"));
188    }
189
190    #[test]
191    fn test_parse_toml() {
192        let src = "[package]\nname = \"merge-engine\"\nversion = \"0.1.0\"\n";
193        let tree = parse_to_cst(src, Language::Toml).unwrap();
194        assert_eq!(tree.kind(), "document");
195        assert!(!tree.children().is_empty());
196        let reconstructed = tree.to_source();
197        assert!(reconstructed.contains("package"));
198        assert!(reconstructed.contains("name"));
199    }
200
201    #[test]
202    fn test_parse_yaml() {
203        let src = "name: test\non:\n  push:\n    branches: [main]\njobs:\n  build:\n    runs-on: ubuntu-latest\n";
204        let tree = parse_to_cst(src, Language::Yaml).unwrap();
205        assert_eq!(tree.kind(), "stream");
206        let reconstructed = tree.to_source();
207        assert!(reconstructed.contains("ubuntu-latest"));
208        assert!(reconstructed.contains("branches"));
209    }
210
211    #[test]
212    fn test_leaf_reconstruction() {
213        let src = "let x = 1;";
214        let tree = parse_to_cst(src, Language::JavaScript).unwrap();
215        // Leaves should reconstruct back to original source
216        let reconstructed = tree.to_source();
217        assert!(reconstructed.contains("let"));
218        assert!(reconstructed.contains("x"));
219        assert!(reconstructed.contains("1"));
220    }
221}