panproto_parse/languages/common.rs
1//! Common language parser implementation shared by all tree-sitter-based parsers.
2//!
3//! Since the generic [`AstWalker`](crate::walker::AstWalker) handles all languages
4//! uniformly (the node kind IS the vertex kind, the field name IS the edge kind),
5//! per-language parsers are thin wrappers that provide:
6//!
7//! 1. The tree-sitter Language object
8//! 2. The embedded `NODE_TYPES` JSON
9//! 3. Language-specific [`WalkerConfig`](crate::walker::WalkerConfig) overrides
10//! 4. File extension mapping
11
12use panproto_schema::{Protocol, Schema};
13
14use crate::error::ParseError;
15use crate::registry::AstParser;
16use crate::theory_extract::{ExtractedTheoryMeta, extract_theory_from_node_types};
17use crate::walker::{AstWalker, WalkerConfig};
18
/// A generic language parser built from a tree-sitter grammar.
///
/// This struct is the shared implementation behind all 10 language parsers.
/// Each language constructs one with its specific grammar, node types, and config.
/// All fields are set once in [`LanguageParser::from_language`] and read-only after.
pub struct LanguageParser {
    /// The protocol name (e.g. `"typescript"`, `"python"`); exposed via
    /// `AstParser::protocol_name`.
    protocol_name: String,
    /// File extensions this language handles; exposed via
    /// `AstParser::supported_extensions`.
    extensions: Vec<&'static str>,
    /// The resolved tree-sitter language, handed to each fresh
    /// `tree_sitter::Parser` in `parse`.
    language: tree_sitter::Language,
    /// Theory metadata auto-derived from the grammar's `NODE_TYPES` JSON.
    theory_meta: ExtractedTheoryMeta,
    /// The panproto protocol definition (used for `SchemaBuilder` validation).
    protocol: Protocol,
    /// Per-language walker configuration; cloned into every `AstWalker`.
    walker_config: WalkerConfig,
}
37
38impl LanguageParser {
39 /// Create a new language parser from a pre-constructed [`Language`](tree_sitter::Language).
40 ///
41 /// # Errors
42 ///
43 /// Returns [`ParseError`] if theory extraction from `node_types_json` fails.
44 pub fn from_language(
45 protocol_name: &str,
46 extensions: Vec<&'static str>,
47 language: tree_sitter::Language,
48 node_types_json: &[u8],
49 walker_config: WalkerConfig,
50 ) -> Result<Self, ParseError> {
51 let theory_name = format!("Th{}FullAST", capitalize_first(protocol_name));
52 let theory_meta = extract_theory_from_node_types(&theory_name, node_types_json)?;
53 let protocol = build_full_ast_protocol(protocol_name, &theory_name);
54
55 Ok(Self {
56 protocol_name: protocol_name.to_owned(),
57 extensions,
58 language,
59 theory_meta,
60 protocol,
61 walker_config,
62 })
63 }
64}
65
66impl AstParser for LanguageParser {
67 fn protocol_name(&self) -> &str {
68 &self.protocol_name
69 }
70
71 fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError> {
72 let mut parser = tree_sitter::Parser::new();
73 parser
74 .set_language(&self.language)
75 .map_err(|e| ParseError::TreeSitterParse {
76 path: format!("{file_path}: set_language failed: {e}"),
77 })?;
78
79 let tree = parser
80 .parse(source, None)
81 .ok_or_else(|| ParseError::TreeSitterParse {
82 path: format!("{file_path}: parse returned None (timeout or cancellation)"),
83 })?;
84
85 let walker = AstWalker::new(
86 source,
87 &self.theory_meta,
88 &self.protocol,
89 self.walker_config.clone(),
90 );
91
92 walker.walk(&tree, file_path)
93 }
94
95 fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
96 // Reconstruct source text from the schema's structural information.
97 //
98 // The walker stores two types of text constraints:
99 // 1. `literal-value` on leaf nodes: the source text of identifiers, literals, etc.
100 // 2. `interstitial-N` on parent nodes: the text between named children, which
101 // contains keywords, punctuation, whitespace, and comments.
102 //
103 // The emitter walks the schema tree depth-first, interleaving interstitial text
104 // with child emissions to reconstruct the full source.
105 emit_from_schema(schema, &self.protocol_name)
106 }
107
108 fn supported_extensions(&self) -> &[&str] {
109 &self.extensions
110 }
111
112 fn theory_meta(&self) -> &ExtractedTheoryMeta {
113 &self.theory_meta
114 }
115}
116
117/// Reconstruct source text from a schema using interstitial text and leaf literals.
118///
119/// The walker stores two types of text data:
120/// - `literal-value` on leaf nodes: identifiers, literals, keywords that are named nodes
121/// - `interstitial-N` on parent nodes: text between named children (keywords, punctuation,
122/// whitespace, comments from anonymous/unnamed tokens)
123///
124/// The emitter reconstructs source by collecting ALL text fragments (both interstitials
125/// and leaf literals) and sorting them by their byte position in the original source.
126/// This produces exact round-trip fidelity: `emit(parse(source))` = `source`.
127fn emit_from_schema(schema: &Schema, protocol: &str) -> Result<Vec<u8>, ParseError> {
128 // Collect all text fragments with their byte positions.
129 // Each fragment is (start_byte, text).
130 let mut fragments: Vec<(usize, String)> = Vec::new();
131
132 for name in schema.vertices.keys() {
133 if let Some(constraints) = schema.constraints.get(name) {
134 // Get start-byte for this vertex.
135 let start_byte = constraints
136 .iter()
137 .find(|c| c.sort.as_ref() == "start-byte")
138 .and_then(|c| c.value.parse::<usize>().ok());
139
140 // Collect literal-value from leaf nodes.
141 let literal = constraints
142 .iter()
143 .find(|c| c.sort.as_ref() == "literal-value")
144 .map(|c| c.value.clone());
145
146 if let (Some(start), Some(text)) = (start_byte, literal) {
147 fragments.push((start, text));
148 }
149
150 // Collect interstitial text fragments.
151 // Each interstitial has a byte position derived from its parent and index.
152 for c in constraints {
153 let sort_str = c.sort.as_ref();
154 if sort_str.starts_with("interstitial-") {
155 // The interstitial's position is encoded in a companion constraint.
156 // We stored interstitial-N-start-byte alongside interstitial-N.
157 let pos_sort = format!("{sort_str}-start-byte");
158 let pos = constraints
159 .iter()
160 .find(|c2| c2.sort.as_ref() == pos_sort.as_str())
161 .and_then(|c2| c2.value.parse::<usize>().ok());
162
163 if let Some(p) = pos {
164 fragments.push((p, c.value.clone()));
165 }
166 }
167 }
168 }
169 }
170
171 if fragments.is_empty() {
172 return Err(ParseError::EmitFailed {
173 protocol: protocol.to_owned(),
174 reason: "schema has no text fragments".to_owned(),
175 });
176 }
177
178 // Sort by byte position and concatenate.
179 fragments.sort_by_key(|(pos, _)| *pos);
180
181 // Deduplicate overlapping fragments (parent interstitials may overlap with
182 // child literals). Keep the first fragment at each position.
183 let mut output = Vec::new();
184 let mut cursor = 0;
185
186 for (pos, text) in &fragments {
187 if *pos >= cursor {
188 output.extend_from_slice(text.as_bytes());
189 cursor = pos + text.len();
190 }
191 }
192
193 Ok(output)
194}
195
196/// Build the standard Protocol for a full-AST language parser.
197///
198/// Shared by `LanguageParser::new` and `LanguageParser::from_language`
199/// to avoid duplicating the constraint sorts and flag definitions.
200fn build_full_ast_protocol(protocol_name: &str, theory_name: &str) -> Protocol {
201 Protocol {
202 name: protocol_name.into(),
203 schema_theory: theory_name.into(),
204 instance_theory: format!("{theory_name}Instance"),
205 obj_kinds: vec![],
206 edge_rules: vec![],
207 constraint_sorts: vec![
208 "literal-value".into(),
209 "literal-type".into(),
210 "operator".into(),
211 "visibility".into(),
212 "mutability".into(),
213 "async".into(),
214 "static".into(),
215 "generator".into(),
216 "comment".into(),
217 "indent".into(),
218 "trailing-comma".into(),
219 "semicolon".into(),
220 "blank-lines-before".into(),
221 "start-byte".into(),
222 "end-byte".into(),
223 ],
224 has_order: true,
225 has_coproducts: false,
226 has_recursion: true,
227 has_causal: false,
228 nominal_identity: false,
229 has_defaults: false,
230 has_coercions: false,
231 has_mergers: false,
232 has_policies: false,
233 }
234}
235
/// Capitalize the first letter of a string.
///
/// Unicode-aware: the first char is uppercased via [`char::to_uppercase`]
/// (which may expand to multiple chars); the remainder is copied verbatim.
fn capitalize_first(s: &str) -> String {
    match s.chars().next() {
        None => String::new(),
        Some(first) => {
            let mut out: String = first.to_uppercase().collect();
            out.push_str(&s[first.len_utf8()..]);
            out
        }
    }
}