Skip to main content

panproto_parse/languages/
common.rs

1//! Common language parser implementation shared by all tree-sitter-based parsers.
2//!
3//! Since the generic [`AstWalker`](crate::walker::AstWalker) handles all languages
4//! uniformly (the node kind IS the vertex kind, the field name IS the edge kind),
5//! per-language parsers are thin wrappers that provide:
6//!
7//! 1. The tree-sitter Language object
8//! 2. The embedded `NODE_TYPES` JSON
9//! 3. Language-specific [`WalkerConfig`](crate::walker::WalkerConfig) overrides
10//! 4. File extension mapping
11
12use std::sync::{Arc, Mutex, OnceLock};
13
14use panproto_schema::{Protocol, Schema};
15
16use crate::emit_pretty::{FormatPolicy, Grammar as EmitGrammar, emit_pretty as emit_pretty_inner};
17use crate::error::ParseError;
18use crate::registry::AstParser;
19use crate::scope_detector::ScopeDetector;
20use crate::theory_extract::{ExtractedTheoryMeta, extract_theory_from_node_types};
21use crate::walker::{AstWalker, WalkerConfig};
22
/// A generic language parser built from a tree-sitter grammar.
///
/// This struct is the shared implementation behind all language parsers.
/// Each language constructs one with its specific grammar, node types,
/// tags query, and config. All fields are private; the parser is driven
/// through its inherent constructors and the [`AstParser`] trait.
pub struct LanguageParser {
    /// The protocol name (e.g. `"typescript"`, `"python"`).
    ///
    /// Also reported in emit errors and used to look up the grammar
    /// cassette at construction time.
    protocol_name: String,
    /// File extensions this language handles.
    extensions: Vec<&'static str>,
    /// The resolved tree-sitter language.
    language: tree_sitter::Language,
    /// The grammar's bundled `tags.scm`, if any (for named-scope detection).
    ///
    /// Kept so that `set_tags_override` can rebuild the detector from the
    /// bundled query plus the project override.
    tags_query: Option<&'static str>,
    /// Project-level tags-query override (concatenated in front of
    /// `tags_query` when constructing the [`ScopeDetector`]).
    project_tags_override: Option<String>,
    /// The auto-derived theory metadata.
    theory_meta: ExtractedTheoryMeta,
    /// The panproto protocol definition (used for `SchemaBuilder` validation).
    protocol: Protocol,
    /// Per-language walker configuration (cloned into each walker on `parse`).
    walker_config: WalkerConfig,
    /// A reusable [`ScopeDetector`] for this language.
    ///
    /// Held behind a `Mutex` because `parse()` on [`AstParser`] takes `&self`
    /// but the detector's `TagsContext` (and internal `QueryCursor`) need
    /// `&mut` access during a tags query run. A single parser instance is
    /// typically used serially; contention here is rare.
    scope_detector: Mutex<ScopeDetector>,
    /// Raw `grammar.json` bytes for the de-novo emit walker. `None`
    /// when the upstream grammar does not ship `grammar.json` and
    /// `tools/fetch-grammar-json.py` could not regenerate one.
    grammar_json: Option<&'static [u8]>,
    /// Raw `node-types.json` bytes for augmenting the Grammar's subtype
    /// closure with parser-produced child kinds not in grammar.json.
    ///
    /// The constructors in this file always populate it with a copy of
    /// the `node_types_json` argument.
    node_types_json_for_emit: Option<Vec<u8>>,
    /// Lazily-parsed grammar. Populated on first call to `emit_pretty`.
    ///
    /// The cached value is the parse *outcome*: a failed parse is stored
    /// too, so it is reported again on later calls rather than retried.
    grammar_cache: OnceLock<Result<EmitGrammar, ParseError>>,
    /// Per-grammar defaults for opaque external scanner tokens.
    cassette: Arc<dyn super::cassettes::GrammarCassette>,
}
65
66impl LanguageParser {
67    /// Create a new language parser from a pre-constructed [`Language`](tree_sitter::Language).
68    ///
69    /// `tags_query` is the grammar's `queries/tags.scm` content, usually
70    /// sourced from [`panproto_grammars::Grammar::tags_query`]; pass `None`
71    /// if the grammar does not ship one.
72    ///
73    /// # Errors
74    ///
75    /// Returns [`ParseError`] if theory extraction from `node_types_json`
76    /// fails, or if the grammar's tags query fails to compile.
77    pub fn from_language(
78        protocol_name: &str,
79        extensions: Vec<&'static str>,
80        language: tree_sitter::Language,
81        node_types_json: &[u8],
82        tags_query: Option<&'static str>,
83        walker_config: WalkerConfig,
84    ) -> Result<Self, ParseError> {
85        Self::from_language_with_grammar_json(
86            protocol_name,
87            extensions,
88            language,
89            node_types_json,
90            tags_query,
91            walker_config,
92            None,
93        )
94    }
95
96    /// Construct a `LanguageParser` with vendored `grammar.json` bytes
97    /// for de-novo emission via [`AstParser::emit_pretty`].
98    ///
99    /// `grammar_json` should come from
100    /// [`panproto_grammars::Grammar::grammar_json`]; pass `None` to
101    /// signal that the language has no production-rule table available.
102    /// Without it, `emit_pretty` returns
103    /// [`ParseError::EmitFailed`] with a `grammar.json missing` reason.
104    ///
105    /// # Errors
106    ///
107    /// Returns [`ParseError`] if theory extraction from
108    /// `node_types_json` fails or if the tags query rejects compilation.
109    pub fn from_language_with_grammar_json(
110        protocol_name: &str,
111        extensions: Vec<&'static str>,
112        language: tree_sitter::Language,
113        node_types_json: &[u8],
114        tags_query: Option<&'static str>,
115        walker_config: WalkerConfig,
116        grammar_json: Option<&'static [u8]>,
117    ) -> Result<Self, ParseError> {
118        let theory_name = format!("Th{}FullAST", capitalize_first(protocol_name));
119        let theory_meta = extract_theory_from_node_types(&theory_name, node_types_json)?;
120        let protocol = build_full_ast_protocol(protocol_name, &theory_name);
121        // Named-scope detection is a best-effort secondary feature. Some
122        // vendored `tags.scm` files use capture names outside the
123        // tree-sitter-tags vocabulary (e.g. C#'s `@module`, AL's helper
124        // `@_test_attr`), which `TagsConfiguration` rejects. A grammar
125        // must still register for parse/emit in that case, so fall back
126        // to a no-op detector (which `(None, None)` constructs and cannot
127        // fail) rather than dropping the whole grammar.
128        let scope_detector = ScopeDetector::new(&language, tags_query, None)
129            .or_else(|_| ScopeDetector::new(&language, None, None))?;
130
131        Ok(Self {
132            protocol_name: protocol_name.to_owned(),
133            extensions,
134            language,
135            tags_query,
136            project_tags_override: None,
137            theory_meta,
138            protocol,
139            walker_config,
140            scope_detector: Mutex::new(scope_detector),
141            grammar_json,
142            node_types_json_for_emit: Some(node_types_json.to_vec()),
143            grammar_cache: OnceLock::new(),
144            cassette: super::cassettes::cassette_for(protocol_name),
145        })
146    }
147
148    /// Install a project-level tags-query override.
149    ///
150    /// The override string is concatenated in front of the grammar's
151    /// bundled `tags.scm` when the detector is rebuilt. Tree-sitter unions
152    /// all patterns, so overrides augment the defaults without replacing
153    /// them. Pass `None` to clear an existing override.
154    ///
155    /// Typical source: `panproto.toml`'s `[parse.tags.<lang>] path = "..."`.
156    ///
157    /// # Errors
158    ///
159    /// Returns [`ParseError::ScopeQueryCompile`] if the combined query
160    /// fails to compile against this language.
161    pub fn set_tags_override(&mut self, override_query: Option<String>) -> Result<(), ParseError> {
162        let detector =
163            ScopeDetector::new(&self.language, self.tags_query, override_query.as_deref())?;
164        self.project_tags_override = override_query;
165        if let Ok(mut guard) = self.scope_detector.lock() {
166            *guard = detector;
167        }
168        Ok(())
169    }
170}
171
172impl AstParser for LanguageParser {
173    fn protocol_name(&self) -> &str {
174        &self.protocol_name
175    }
176
177    fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError> {
178        let mut parser = tree_sitter::Parser::new();
179        parser
180            .set_language(&self.language)
181            .map_err(|e| ParseError::TreeSitterParse {
182                path: format!("{file_path}: set_language failed: {e}"),
183            })?;
184
185        let tree = parser
186            .parse(source, None)
187            .ok_or_else(|| ParseError::TreeSitterParse {
188                path: format!("{file_path}: parse returned None (timeout or cancellation)"),
189            })?;
190
191        // Build the walker (which runs the tags query once via the
192        // detector) inside the guard scope, then drop the guard before
193        // walking the tree. The scope map is copied into the walker, so
194        // the detector lock is no longer needed past that point.
195        let walker = {
196            let mut detector_guard =
197                self.scope_detector
198                    .lock()
199                    .map_err(|_| ParseError::SchemaConstruction {
200                        reason: "scope-detector mutex poisoned".to_owned(),
201                    })?;
202            AstWalker::new(
203                source,
204                &self.theory_meta,
205                &self.protocol,
206                self.walker_config.clone(),
207                Some(&mut *detector_guard),
208            )
209        };
210
211        walker.walk(&tree, file_path)
212    }
213
214    fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
215        // The put-direction of the parse/emit dependent optic, dispatched
216        // on whether the layout complement is present:
217        //
218        // * **Complement present** (a parsed / CST schema, or one edited
219        //   in place by `panproto-io`'s `UnifiedCodec`): replay the layout
220        //   fibre. `emit_from_schema` reconstructs bytes from the
221        //   `start-byte` / `interstitial-N` / `literal-value` constraints
222        //   sorted by source position — byte-faithful by construction.
223        // * **Complement absent** (a by-construction / transpiled abstract
224        //   schema that never carried a parse trace): there is nothing to
225        //   replay, so fall back to the canonical section — the grammar
226        //   walk in `emit_pretty` under the default `FormatPolicy`.
227        //
228        // This makes `emit` total over both worlds: the historical
229        // reconstruction flow (replay) and the canonical de-novo flow are
230        // the two branches of one review. Before, the abstract case
231        // errored with "schema has no text fragments".
232        if has_layout_complement(schema) {
233            emit_from_schema(schema, &self.protocol_name)
234        } else {
235            self.emit_pretty_with_policy(schema, &FormatPolicy::default())
236        }
237    }
238
239    fn supported_extensions(&self) -> &[&str] {
240        &self.extensions
241    }
242
    /// The theory metadata extracted from the grammar's node types when
    /// this parser was constructed.
    fn theory_meta(&self) -> &ExtractedTheoryMeta {
        &self.theory_meta
    }
246
247    fn emit_pretty_with_policy(
248        &self,
249        schema: &Schema,
250        policy: &FormatPolicy,
251    ) -> Result<Vec<u8>, ParseError> {
252        let bytes = self.grammar_json.ok_or_else(|| ParseError::EmitFailed {
253            protocol: self.protocol_name.clone(),
254            reason: "grammar.json not vendored for this protocol; \
255                     run tools/fetch-grammar-json.py to populate it"
256                .to_owned(),
257        })?;
258        let nt = self.node_types_json_for_emit.as_deref();
259        let cached = self.grammar_cache.get_or_init(|| {
260            EmitGrammar::from_bytes_with_node_types(&self.protocol_name, bytes, nt)
261        });
262        let grammar = match cached {
263            Ok(g) => g,
264            Err(e) => {
265                return Err(ParseError::EmitFailed {
266                    protocol: self.protocol_name.clone(),
267                    reason: format!("grammar.json parse failed: {e}"),
268                });
269            }
270        };
271        emit_pretty_inner(
272            &self.protocol_name,
273            schema,
274            grammar,
275            policy,
276            Some(&*self.cassette),
277        )
278    }
279}
280
281/// Does `schema` carry the layout complement that `emit_from_schema`
282/// replays? True iff some vertex records a `start-byte` anchor (every
283/// parsed vertex has one; a by-construction / transpiled schema has
284/// none). This is the dependent-optic dispatch in [`LanguageParser::emit`]:
285/// present ⇒ replay the fibre, absent ⇒ canonical section.
286fn has_layout_complement(schema: &Schema) -> bool {
287    schema
288        .constraints
289        .values()
290        .any(|cs| cs.iter().any(|c| c.sort.as_ref() == "start-byte"))
291}
292
293/// Reconstruct source text from a schema using interstitial text and leaf literals.
294///
295/// The walker stores two types of text data:
296/// - `literal-value` on leaf nodes: identifiers, literals, keywords that are named nodes
297/// - `interstitial-N` on parent nodes: text between named children (keywords, punctuation,
298///   whitespace, comments from anonymous/unnamed tokens)
299///
300/// The emitter reconstructs source by collecting ALL text fragments (both interstitials
301/// and leaf literals) and sorting them by their byte position in the original source.
302/// This produces exact round-trip fidelity: `emit(parse(source))` = `source`.
303fn emit_from_schema(schema: &Schema, protocol: &str) -> Result<Vec<u8>, ParseError> {
304    // Collect all text fragments with their byte positions.
305    // Each fragment is (start_byte, text).
306    let mut fragments: Vec<(usize, String)> = Vec::new();
307
308    for name in schema.vertices.keys() {
309        if let Some(constraints) = schema.constraints.get(name) {
310            // Get start-byte for this vertex.
311            let start_byte = constraints
312                .iter()
313                .find(|c| c.sort.as_ref() == "start-byte")
314                .and_then(|c| c.value.parse::<usize>().ok());
315
316            // Collect literal-value from leaf nodes.
317            let literal = constraints
318                .iter()
319                .find(|c| c.sort.as_ref() == "literal-value")
320                .map(|c| c.value.clone());
321
322            if let (Some(start), Some(text)) = (start_byte, literal) {
323                fragments.push((start, text));
324            }
325
326            // Collect interstitial text fragments.
327            // Each interstitial has a byte position derived from its parent and index.
328            for c in constraints {
329                let sort_str = c.sort.as_ref();
330                if sort_str.starts_with("interstitial-") {
331                    // The interstitial's position is encoded in a companion constraint.
332                    // We stored interstitial-N-start-byte alongside interstitial-N.
333                    let pos_sort = format!("{sort_str}-start-byte");
334                    let pos = constraints
335                        .iter()
336                        .find(|c2| c2.sort.as_ref() == pos_sort.as_str())
337                        .and_then(|c2| c2.value.parse::<usize>().ok());
338
339                    if let Some(p) = pos {
340                        fragments.push((p, c.value.clone()));
341                    }
342                }
343            }
344        }
345    }
346
347    if fragments.is_empty() {
348        return Err(ParseError::EmitFailed {
349            protocol: protocol.to_owned(),
350            reason: "schema has no text fragments".to_owned(),
351        });
352    }
353
354    // Sort by byte position and concatenate.
355    fragments.sort_by_key(|(pos, _)| *pos);
356
357    // Deduplicate overlapping fragments (parent interstitials may overlap with
358    // child literals). Keep the first fragment at each position.
359    let mut output = Vec::new();
360    let mut cursor = 0;
361
362    for (pos, text) in &fragments {
363        if *pos >= cursor {
364            output.extend_from_slice(text.as_bytes());
365            cursor = pos + text.len();
366        }
367    }
368
369    Ok(output)
370}
371
372/// Build the standard Protocol for a full-AST language parser.
373///
374/// Shared by `LanguageParser::new` and `LanguageParser::from_language`
375/// to avoid duplicating the constraint sorts and flag definitions.
376fn build_full_ast_protocol(protocol_name: &str, theory_name: &str) -> Protocol {
377    Protocol {
378        name: protocol_name.into(),
379        schema_theory: theory_name.into(),
380        instance_theory: format!("{theory_name}Instance"),
381        schema_composition: None,
382        instance_composition: None,
383        obj_kinds: vec![],
384        edge_rules: vec![],
385        constraint_sorts: vec![
386            "literal-value".into(),
387            "literal-type".into(),
388            "operator".into(),
389            "visibility".into(),
390            "mutability".into(),
391            "async".into(),
392            "static".into(),
393            "generator".into(),
394            "comment".into(),
395            "indent".into(),
396            "trailing-comma".into(),
397            "semicolon".into(),
398            "blank-lines-before".into(),
399            "start-byte".into(),
400            "end-byte".into(),
401        ],
402        has_order: true,
403        has_coproducts: false,
404        has_recursion: true,
405        has_causal: false,
406        nominal_identity: false,
407        has_defaults: false,
408        has_coercions: false,
409        has_mergers: false,
410        has_policies: false,
411    }
412}
413
/// Return `s` with its first character upper-cased and the rest unchanged.
///
/// An empty input yields an empty string. The first character's
/// upper-case form may be more than one character long, in which case
/// all of it is kept.
fn capitalize_first(s: &str) -> String {
    let mut rest = s.chars();
    match rest.next() {
        None => String::new(),
        Some(first) => {
            let mut out: String = first.to_uppercase().collect();
            out.push_str(rest.as_str());
            out
        }
    }
}