panproto_parse/languages/common.rs
1//! Common language parser implementation shared by all tree-sitter-based parsers.
2//!
3//! Since the generic [`AstWalker`](crate::walker::AstWalker) handles all languages
4//! uniformly (the node kind IS the vertex kind, the field name IS the edge kind),
5//! per-language parsers are thin wrappers that provide:
6//!
7//! 1. The tree-sitter Language object
8//! 2. The embedded `NODE_TYPES` JSON
9//! 3. Language-specific [`WalkerConfig`](crate::walker::WalkerConfig) overrides
10//! 4. File extension mapping
11
12use std::sync::{Mutex, OnceLock};
13
14use panproto_schema::{Protocol, Schema};
15
16use crate::emit_pretty::{FormatPolicy, Grammar as EmitGrammar, emit_pretty as emit_pretty_inner};
17use crate::error::ParseError;
18use crate::registry::AstParser;
19use crate::scope_detector::ScopeDetector;
20use crate::theory_extract::{ExtractedTheoryMeta, extract_theory_from_node_types};
21use crate::walker::{AstWalker, WalkerConfig};
22
23/// A generic language parser built from a tree-sitter grammar.
24///
25/// This struct is the shared implementation behind all language parsers.
26/// Each language constructs one with its specific grammar, node types,
27/// tags query, and config.
28pub struct LanguageParser {
29 /// The protocol name (e.g. `"typescript"`, `"python"`).
30 protocol_name: String,
31 /// File extensions this language handles.
32 extensions: Vec<&'static str>,
33 /// The resolved tree-sitter language.
34 language: tree_sitter::Language,
35 /// The grammar's bundled `tags.scm`, if any (for named-scope detection).
36 tags_query: Option<&'static str>,
37 /// Project-level tags-query override (concatenated in front of
38 /// `tags_query` when constructing the [`ScopeDetector`]).
39 project_tags_override: Option<String>,
40 /// The auto-derived theory metadata.
41 theory_meta: ExtractedTheoryMeta,
42 /// The panproto protocol definition (used for `SchemaBuilder` validation).
43 protocol: Protocol,
44 /// Per-language walker configuration.
45 walker_config: WalkerConfig,
46 /// A reusable [`ScopeDetector`] for this language.
47 ///
48 /// Held behind a `Mutex` because `parse()` on [`AstParser`] takes `&self`
49 /// but the detector's `TagsContext` (and internal `QueryCursor`) need
50 /// `&mut` access during a tags query run. A single parser instance is
51 /// typically used serially; contention here is rare.
52 scope_detector: Mutex<ScopeDetector>,
53 /// Raw `grammar.json` bytes for the de-novo emit walker. `None`
54 /// when the upstream grammar does not ship `grammar.json` and
55 /// `tools/fetch-grammar-json.py` could not regenerate one.
56 grammar_json: Option<&'static [u8]>,
57 /// Lazily-parsed grammar. Populated on first call to `emit_pretty`.
58 grammar_cache: OnceLock<Result<EmitGrammar, ParseError>>,
59}
60
61impl LanguageParser {
62 /// Create a new language parser from a pre-constructed [`Language`](tree_sitter::Language).
63 ///
64 /// `tags_query` is the grammar's `queries/tags.scm` content, usually
65 /// sourced from [`panproto_grammars::Grammar::tags_query`]; pass `None`
66 /// if the grammar does not ship one.
67 ///
68 /// # Errors
69 ///
70 /// Returns [`ParseError`] if theory extraction from `node_types_json`
71 /// fails, or if the grammar's tags query fails to compile.
72 pub fn from_language(
73 protocol_name: &str,
74 extensions: Vec<&'static str>,
75 language: tree_sitter::Language,
76 node_types_json: &[u8],
77 tags_query: Option<&'static str>,
78 walker_config: WalkerConfig,
79 ) -> Result<Self, ParseError> {
80 Self::from_language_with_grammar_json(
81 protocol_name,
82 extensions,
83 language,
84 node_types_json,
85 tags_query,
86 walker_config,
87 None,
88 )
89 }
90
91 /// Construct a `LanguageParser` with vendored `grammar.json` bytes
92 /// for de-novo emission via [`AstParser::emit_pretty`].
93 ///
94 /// `grammar_json` should come from
95 /// [`panproto_grammars::Grammar::grammar_json`]; pass `None` to
96 /// signal that the language has no production-rule table available.
97 /// Without it, `emit_pretty` returns
98 /// [`ParseError::EmitFailed`] with a `grammar.json missing` reason.
99 ///
100 /// # Errors
101 ///
102 /// Returns [`ParseError`] if theory extraction from
103 /// `node_types_json` fails or if the tags query rejects compilation.
104 pub fn from_language_with_grammar_json(
105 protocol_name: &str,
106 extensions: Vec<&'static str>,
107 language: tree_sitter::Language,
108 node_types_json: &[u8],
109 tags_query: Option<&'static str>,
110 walker_config: WalkerConfig,
111 grammar_json: Option<&'static [u8]>,
112 ) -> Result<Self, ParseError> {
113 let theory_name = format!("Th{}FullAST", capitalize_first(protocol_name));
114 let theory_meta = extract_theory_from_node_types(&theory_name, node_types_json)?;
115 let protocol = build_full_ast_protocol(protocol_name, &theory_name);
116 let scope_detector = ScopeDetector::new(&language, tags_query, None)?;
117
118 Ok(Self {
119 protocol_name: protocol_name.to_owned(),
120 extensions,
121 language,
122 tags_query,
123 project_tags_override: None,
124 theory_meta,
125 protocol,
126 walker_config,
127 scope_detector: Mutex::new(scope_detector),
128 grammar_json,
129 grammar_cache: OnceLock::new(),
130 })
131 }
132
133 /// Install a project-level tags-query override.
134 ///
135 /// The override string is concatenated in front of the grammar's
136 /// bundled `tags.scm` when the detector is rebuilt. Tree-sitter unions
137 /// all patterns, so overrides augment the defaults without replacing
138 /// them. Pass `None` to clear an existing override.
139 ///
140 /// Typical source: `panproto.toml`'s `[parse.tags.<lang>] path = "..."`.
141 ///
142 /// # Errors
143 ///
144 /// Returns [`ParseError::ScopeQueryCompile`] if the combined query
145 /// fails to compile against this language.
146 pub fn set_tags_override(&mut self, override_query: Option<String>) -> Result<(), ParseError> {
147 let detector =
148 ScopeDetector::new(&self.language, self.tags_query, override_query.as_deref())?;
149 self.project_tags_override = override_query;
150 if let Ok(mut guard) = self.scope_detector.lock() {
151 *guard = detector;
152 }
153 Ok(())
154 }
155}
156
157impl AstParser for LanguageParser {
158 fn protocol_name(&self) -> &str {
159 &self.protocol_name
160 }
161
162 fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError> {
163 let mut parser = tree_sitter::Parser::new();
164 parser
165 .set_language(&self.language)
166 .map_err(|e| ParseError::TreeSitterParse {
167 path: format!("{file_path}: set_language failed: {e}"),
168 })?;
169
170 let tree = parser
171 .parse(source, None)
172 .ok_or_else(|| ParseError::TreeSitterParse {
173 path: format!("{file_path}: parse returned None (timeout or cancellation)"),
174 })?;
175
176 // Build the walker (which runs the tags query once via the
177 // detector) inside the guard scope, then drop the guard before
178 // walking the tree. The scope map is copied into the walker, so
179 // the detector lock is no longer needed past that point.
180 let walker = {
181 let mut detector_guard =
182 self.scope_detector
183 .lock()
184 .map_err(|_| ParseError::SchemaConstruction {
185 reason: "scope-detector mutex poisoned".to_owned(),
186 })?;
187 AstWalker::new(
188 source,
189 &self.theory_meta,
190 &self.protocol,
191 self.walker_config.clone(),
192 Some(&mut *detector_guard),
193 )
194 };
195
196 walker.walk(&tree, file_path)
197 }
198
199 fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
200 // Reconstruct source text from the schema's structural information.
201 //
202 // The walker stores two types of text constraints:
203 // 1. `literal-value` on leaf nodes: the source text of identifiers, literals, etc.
204 // 2. `interstitial-N` on parent nodes: the text between named children, which
205 // contains keywords, punctuation, whitespace, and comments.
206 //
207 // The emitter walks the schema tree depth-first, interleaving interstitial text
208 // with child emissions to reconstruct the full source.
209 emit_from_schema(schema, &self.protocol_name)
210 }
211
212 fn supported_extensions(&self) -> &[&str] {
213 &self.extensions
214 }
215
216 fn theory_meta(&self) -> &ExtractedTheoryMeta {
217 &self.theory_meta
218 }
219
220 fn emit_pretty_with_policy(
221 &self,
222 schema: &Schema,
223 policy: &FormatPolicy,
224 ) -> Result<Vec<u8>, ParseError> {
225 let bytes = self.grammar_json.ok_or_else(|| ParseError::EmitFailed {
226 protocol: self.protocol_name.clone(),
227 reason: "grammar.json not vendored for this protocol; \
228 run tools/fetch-grammar-json.py to populate it"
229 .to_owned(),
230 })?;
231 let cached = self
232 .grammar_cache
233 .get_or_init(|| EmitGrammar::from_bytes(&self.protocol_name, bytes));
234 let grammar = match cached {
235 Ok(g) => g,
236 Err(e) => {
237 return Err(ParseError::EmitFailed {
238 protocol: self.protocol_name.clone(),
239 reason: format!("grammar.json parse failed: {e}"),
240 });
241 }
242 };
243 emit_pretty_inner(&self.protocol_name, schema, grammar, policy)
244 }
245}
246
247/// Reconstruct source text from a schema using interstitial text and leaf literals.
248///
249/// The walker stores two types of text data:
250/// - `literal-value` on leaf nodes: identifiers, literals, keywords that are named nodes
251/// - `interstitial-N` on parent nodes: text between named children (keywords, punctuation,
252/// whitespace, comments from anonymous/unnamed tokens)
253///
254/// The emitter reconstructs source by collecting ALL text fragments (both interstitials
255/// and leaf literals) and sorting them by their byte position in the original source.
256/// This produces exact round-trip fidelity: `emit(parse(source))` = `source`.
257fn emit_from_schema(schema: &Schema, protocol: &str) -> Result<Vec<u8>, ParseError> {
258 // Collect all text fragments with their byte positions.
259 // Each fragment is (start_byte, text).
260 let mut fragments: Vec<(usize, String)> = Vec::new();
261
262 for name in schema.vertices.keys() {
263 if let Some(constraints) = schema.constraints.get(name) {
264 // Get start-byte for this vertex.
265 let start_byte = constraints
266 .iter()
267 .find(|c| c.sort.as_ref() == "start-byte")
268 .and_then(|c| c.value.parse::<usize>().ok());
269
270 // Collect literal-value from leaf nodes.
271 let literal = constraints
272 .iter()
273 .find(|c| c.sort.as_ref() == "literal-value")
274 .map(|c| c.value.clone());
275
276 if let (Some(start), Some(text)) = (start_byte, literal) {
277 fragments.push((start, text));
278 }
279
280 // Collect interstitial text fragments.
281 // Each interstitial has a byte position derived from its parent and index.
282 for c in constraints {
283 let sort_str = c.sort.as_ref();
284 if sort_str.starts_with("interstitial-") {
285 // The interstitial's position is encoded in a companion constraint.
286 // We stored interstitial-N-start-byte alongside interstitial-N.
287 let pos_sort = format!("{sort_str}-start-byte");
288 let pos = constraints
289 .iter()
290 .find(|c2| c2.sort.as_ref() == pos_sort.as_str())
291 .and_then(|c2| c2.value.parse::<usize>().ok());
292
293 if let Some(p) = pos {
294 fragments.push((p, c.value.clone()));
295 }
296 }
297 }
298 }
299 }
300
301 if fragments.is_empty() {
302 return Err(ParseError::EmitFailed {
303 protocol: protocol.to_owned(),
304 reason: "schema has no text fragments".to_owned(),
305 });
306 }
307
308 // Sort by byte position and concatenate.
309 fragments.sort_by_key(|(pos, _)| *pos);
310
311 // Deduplicate overlapping fragments (parent interstitials may overlap with
312 // child literals). Keep the first fragment at each position.
313 let mut output = Vec::new();
314 let mut cursor = 0;
315
316 for (pos, text) in &fragments {
317 if *pos >= cursor {
318 output.extend_from_slice(text.as_bytes());
319 cursor = pos + text.len();
320 }
321 }
322
323 Ok(output)
324}
325
326/// Build the standard Protocol for a full-AST language parser.
327///
328/// Shared by `LanguageParser::new` and `LanguageParser::from_language`
329/// to avoid duplicating the constraint sorts and flag definitions.
330fn build_full_ast_protocol(protocol_name: &str, theory_name: &str) -> Protocol {
331 Protocol {
332 name: protocol_name.into(),
333 schema_theory: theory_name.into(),
334 instance_theory: format!("{theory_name}Instance"),
335 schema_composition: None,
336 instance_composition: None,
337 obj_kinds: vec![],
338 edge_rules: vec![],
339 constraint_sorts: vec![
340 "literal-value".into(),
341 "literal-type".into(),
342 "operator".into(),
343 "visibility".into(),
344 "mutability".into(),
345 "async".into(),
346 "static".into(),
347 "generator".into(),
348 "comment".into(),
349 "indent".into(),
350 "trailing-comma".into(),
351 "semicolon".into(),
352 "blank-lines-before".into(),
353 "start-byte".into(),
354 "end-byte".into(),
355 ],
356 has_order: true,
357 has_coproducts: false,
358 has_recursion: true,
359 has_causal: false,
360 nominal_identity: false,
361 has_defaults: false,
362 has_coercions: false,
363 has_mergers: false,
364 has_policies: false,
365 }
366}
367
/// Return `s` with its first character upper-cased.
///
/// The remainder of the string is copied unchanged; an empty input yields
/// an empty `String`. Upper-casing follows `char::to_uppercase`, so one
/// character may expand to several (e.g. `ß` becomes `SS`).
fn capitalize_first(s: &str) -> String {
    let mut rest = s.chars();
    match rest.next() {
        None => String::new(),
        Some(first) => {
            let mut out: String = first.to_uppercase().collect();
            out.push_str(rest.as_str());
            out
        }
    }
}