Skip to main content

panproto_parse/languages/
common.rs

//! Common language parser implementation shared by all tree-sitter-based parsers.
//!
//! Since the generic [`AstWalker`](crate::walker::AstWalker) handles all languages
//! uniformly (the node kind IS the vertex kind, the field name IS the edge kind),
//! per-language parsers are thin wrappers that provide:
//!
//! 1. The tree-sitter Language object
//! 2. The embedded `NODE_TYPES` JSON
//! 3. Language-specific [`WalkerConfig`](crate::walker::WalkerConfig) overrides
//! 4. File extension mapping
11
12use std::sync::{Mutex, OnceLock};
13
14use panproto_schema::{Protocol, Schema};
15
16use crate::emit_pretty::{FormatPolicy, Grammar as EmitGrammar, emit_pretty as emit_pretty_inner};
17use crate::error::ParseError;
18use crate::registry::AstParser;
19use crate::scope_detector::ScopeDetector;
20use crate::theory_extract::{ExtractedTheoryMeta, extract_theory_from_node_types};
21use crate::walker::{AstWalker, WalkerConfig};
22
23/// A generic language parser built from a tree-sitter grammar.
24///
25/// This struct is the shared implementation behind all language parsers.
26/// Each language constructs one with its specific grammar, node types,
27/// tags query, and config.
28pub struct LanguageParser {
29    /// The protocol name (e.g. `"typescript"`, `"python"`).
30    protocol_name: String,
31    /// File extensions this language handles.
32    extensions: Vec<&'static str>,
33    /// The resolved tree-sitter language.
34    language: tree_sitter::Language,
35    /// The grammar's bundled `tags.scm`, if any (for named-scope detection).
36    tags_query: Option<&'static str>,
37    /// Project-level tags-query override (concatenated in front of
38    /// `tags_query` when constructing the [`ScopeDetector`]).
39    project_tags_override: Option<String>,
40    /// The auto-derived theory metadata.
41    theory_meta: ExtractedTheoryMeta,
42    /// The panproto protocol definition (used for `SchemaBuilder` validation).
43    protocol: Protocol,
44    /// Per-language walker configuration.
45    walker_config: WalkerConfig,
46    /// A reusable [`ScopeDetector`] for this language.
47    ///
48    /// Held behind a `Mutex` because `parse()` on [`AstParser`] takes `&self`
49    /// but the detector's `TagsContext` (and internal `QueryCursor`) need
50    /// `&mut` access during a tags query run. A single parser instance is
51    /// typically used serially; contention here is rare.
52    scope_detector: Mutex<ScopeDetector>,
53    /// Raw `grammar.json` bytes for the de-novo emit walker. `None`
54    /// when the upstream grammar does not ship `grammar.json` and
55    /// `tools/fetch-grammar-json.py` could not regenerate one.
56    grammar_json: Option<&'static [u8]>,
57    /// Raw `node-types.json` bytes for augmenting the Grammar's subtype
58    /// closure with parser-produced child kinds not in grammar.json.
59    node_types_json_for_emit: Option<Vec<u8>>,
60    /// Lazily-parsed grammar. Populated on first call to `emit_pretty`.
61    grammar_cache: OnceLock<Result<EmitGrammar, ParseError>>,
62}
63
64impl LanguageParser {
65    /// Create a new language parser from a pre-constructed [`Language`](tree_sitter::Language).
66    ///
67    /// `tags_query` is the grammar's `queries/tags.scm` content, usually
68    /// sourced from [`panproto_grammars::Grammar::tags_query`]; pass `None`
69    /// if the grammar does not ship one.
70    ///
71    /// # Errors
72    ///
73    /// Returns [`ParseError`] if theory extraction from `node_types_json`
74    /// fails, or if the grammar's tags query fails to compile.
75    pub fn from_language(
76        protocol_name: &str,
77        extensions: Vec<&'static str>,
78        language: tree_sitter::Language,
79        node_types_json: &[u8],
80        tags_query: Option<&'static str>,
81        walker_config: WalkerConfig,
82    ) -> Result<Self, ParseError> {
83        Self::from_language_with_grammar_json(
84            protocol_name,
85            extensions,
86            language,
87            node_types_json,
88            tags_query,
89            walker_config,
90            None,
91        )
92    }
93
94    /// Construct a `LanguageParser` with vendored `grammar.json` bytes
95    /// for de-novo emission via [`AstParser::emit_pretty`].
96    ///
97    /// `grammar_json` should come from
98    /// [`panproto_grammars::Grammar::grammar_json`]; pass `None` to
99    /// signal that the language has no production-rule table available.
100    /// Without it, `emit_pretty` returns
101    /// [`ParseError::EmitFailed`] with a `grammar.json missing` reason.
102    ///
103    /// # Errors
104    ///
105    /// Returns [`ParseError`] if theory extraction from
106    /// `node_types_json` fails or if the tags query rejects compilation.
107    pub fn from_language_with_grammar_json(
108        protocol_name: &str,
109        extensions: Vec<&'static str>,
110        language: tree_sitter::Language,
111        node_types_json: &[u8],
112        tags_query: Option<&'static str>,
113        walker_config: WalkerConfig,
114        grammar_json: Option<&'static [u8]>,
115    ) -> Result<Self, ParseError> {
116        let theory_name = format!("Th{}FullAST", capitalize_first(protocol_name));
117        let theory_meta = extract_theory_from_node_types(&theory_name, node_types_json)?;
118        let protocol = build_full_ast_protocol(protocol_name, &theory_name);
119        let scope_detector = ScopeDetector::new(&language, tags_query, None)?;
120
121        Ok(Self {
122            protocol_name: protocol_name.to_owned(),
123            extensions,
124            language,
125            tags_query,
126            project_tags_override: None,
127            theory_meta,
128            protocol,
129            walker_config,
130            scope_detector: Mutex::new(scope_detector),
131            grammar_json,
132            node_types_json_for_emit: Some(node_types_json.to_vec()),
133            grammar_cache: OnceLock::new(),
134        })
135    }
136
137    /// Install a project-level tags-query override.
138    ///
139    /// The override string is concatenated in front of the grammar's
140    /// bundled `tags.scm` when the detector is rebuilt. Tree-sitter unions
141    /// all patterns, so overrides augment the defaults without replacing
142    /// them. Pass `None` to clear an existing override.
143    ///
144    /// Typical source: `panproto.toml`'s `[parse.tags.<lang>] path = "..."`.
145    ///
146    /// # Errors
147    ///
148    /// Returns [`ParseError::ScopeQueryCompile`] if the combined query
149    /// fails to compile against this language.
150    pub fn set_tags_override(&mut self, override_query: Option<String>) -> Result<(), ParseError> {
151        let detector =
152            ScopeDetector::new(&self.language, self.tags_query, override_query.as_deref())?;
153        self.project_tags_override = override_query;
154        if let Ok(mut guard) = self.scope_detector.lock() {
155            *guard = detector;
156        }
157        Ok(())
158    }
159}
160
161impl AstParser for LanguageParser {
162    fn protocol_name(&self) -> &str {
163        &self.protocol_name
164    }
165
166    fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError> {
167        let mut parser = tree_sitter::Parser::new();
168        parser
169            .set_language(&self.language)
170            .map_err(|e| ParseError::TreeSitterParse {
171                path: format!("{file_path}: set_language failed: {e}"),
172            })?;
173
174        let tree = parser
175            .parse(source, None)
176            .ok_or_else(|| ParseError::TreeSitterParse {
177                path: format!("{file_path}: parse returned None (timeout or cancellation)"),
178            })?;
179
180        // Build the walker (which runs the tags query once via the
181        // detector) inside the guard scope, then drop the guard before
182        // walking the tree. The scope map is copied into the walker, so
183        // the detector lock is no longer needed past that point.
184        let walker = {
185            let mut detector_guard =
186                self.scope_detector
187                    .lock()
188                    .map_err(|_| ParseError::SchemaConstruction {
189                        reason: "scope-detector mutex poisoned".to_owned(),
190                    })?;
191            AstWalker::new(
192                source,
193                &self.theory_meta,
194                &self.protocol,
195                self.walker_config.clone(),
196                Some(&mut *detector_guard),
197            )
198        };
199
200        walker.walk(&tree, file_path)
201    }
202
203    fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
204        // Reconstruct source text from the schema's structural information.
205        //
206        // The walker stores two types of text constraints:
207        // 1. `literal-value` on leaf nodes: the source text of identifiers, literals, etc.
208        // 2. `interstitial-N` on parent nodes: the text between named children, which
209        //    contains keywords, punctuation, whitespace, and comments.
210        //
211        // The emitter walks the schema tree depth-first, interleaving interstitial text
212        // with child emissions to reconstruct the full source.
213        emit_from_schema(schema, &self.protocol_name)
214    }
215
216    fn supported_extensions(&self) -> &[&str] {
217        &self.extensions
218    }
219
220    fn theory_meta(&self) -> &ExtractedTheoryMeta {
221        &self.theory_meta
222    }
223
224    fn emit_pretty_with_policy(
225        &self,
226        schema: &Schema,
227        policy: &FormatPolicy,
228    ) -> Result<Vec<u8>, ParseError> {
229        let bytes = self.grammar_json.ok_or_else(|| ParseError::EmitFailed {
230            protocol: self.protocol_name.clone(),
231            reason: "grammar.json not vendored for this protocol; \
232                     run tools/fetch-grammar-json.py to populate it"
233                .to_owned(),
234        })?;
235        let nt = self.node_types_json_for_emit.as_deref();
236        let cached = self.grammar_cache.get_or_init(|| {
237            EmitGrammar::from_bytes_with_node_types(&self.protocol_name, bytes, nt)
238        });
239        let grammar = match cached {
240            Ok(g) => g,
241            Err(e) => {
242                return Err(ParseError::EmitFailed {
243                    protocol: self.protocol_name.clone(),
244                    reason: format!("grammar.json parse failed: {e}"),
245                });
246            }
247        };
248        emit_pretty_inner(&self.protocol_name, schema, grammar, policy)
249    }
250}
251
252/// Reconstruct source text from a schema using interstitial text and leaf literals.
253///
254/// The walker stores two types of text data:
255/// - `literal-value` on leaf nodes: identifiers, literals, keywords that are named nodes
256/// - `interstitial-N` on parent nodes: text between named children (keywords, punctuation,
257///   whitespace, comments from anonymous/unnamed tokens)
258///
259/// The emitter reconstructs source by collecting ALL text fragments (both interstitials
260/// and leaf literals) and sorting them by their byte position in the original source.
261/// This produces exact round-trip fidelity: `emit(parse(source))` = `source`.
262fn emit_from_schema(schema: &Schema, protocol: &str) -> Result<Vec<u8>, ParseError> {
263    // Collect all text fragments with their byte positions.
264    // Each fragment is (start_byte, text).
265    let mut fragments: Vec<(usize, String)> = Vec::new();
266
267    for name in schema.vertices.keys() {
268        if let Some(constraints) = schema.constraints.get(name) {
269            // Get start-byte for this vertex.
270            let start_byte = constraints
271                .iter()
272                .find(|c| c.sort.as_ref() == "start-byte")
273                .and_then(|c| c.value.parse::<usize>().ok());
274
275            // Collect literal-value from leaf nodes.
276            let literal = constraints
277                .iter()
278                .find(|c| c.sort.as_ref() == "literal-value")
279                .map(|c| c.value.clone());
280
281            if let (Some(start), Some(text)) = (start_byte, literal) {
282                fragments.push((start, text));
283            }
284
285            // Collect interstitial text fragments.
286            // Each interstitial has a byte position derived from its parent and index.
287            for c in constraints {
288                let sort_str = c.sort.as_ref();
289                if sort_str.starts_with("interstitial-") {
290                    // The interstitial's position is encoded in a companion constraint.
291                    // We stored interstitial-N-start-byte alongside interstitial-N.
292                    let pos_sort = format!("{sort_str}-start-byte");
293                    let pos = constraints
294                        .iter()
295                        .find(|c2| c2.sort.as_ref() == pos_sort.as_str())
296                        .and_then(|c2| c2.value.parse::<usize>().ok());
297
298                    if let Some(p) = pos {
299                        fragments.push((p, c.value.clone()));
300                    }
301                }
302            }
303        }
304    }
305
306    if fragments.is_empty() {
307        return Err(ParseError::EmitFailed {
308            protocol: protocol.to_owned(),
309            reason: "schema has no text fragments".to_owned(),
310        });
311    }
312
313    // Sort by byte position and concatenate.
314    fragments.sort_by_key(|(pos, _)| *pos);
315
316    // Deduplicate overlapping fragments (parent interstitials may overlap with
317    // child literals). Keep the first fragment at each position.
318    let mut output = Vec::new();
319    let mut cursor = 0;
320
321    for (pos, text) in &fragments {
322        if *pos >= cursor {
323            output.extend_from_slice(text.as_bytes());
324            cursor = pos + text.len();
325        }
326    }
327
328    Ok(output)
329}
330
331/// Build the standard Protocol for a full-AST language parser.
332///
333/// Shared by `LanguageParser::new` and `LanguageParser::from_language`
334/// to avoid duplicating the constraint sorts and flag definitions.
335fn build_full_ast_protocol(protocol_name: &str, theory_name: &str) -> Protocol {
336    Protocol {
337        name: protocol_name.into(),
338        schema_theory: theory_name.into(),
339        instance_theory: format!("{theory_name}Instance"),
340        schema_composition: None,
341        instance_composition: None,
342        obj_kinds: vec![],
343        edge_rules: vec![],
344        constraint_sorts: vec![
345            "literal-value".into(),
346            "literal-type".into(),
347            "operator".into(),
348            "visibility".into(),
349            "mutability".into(),
350            "async".into(),
351            "static".into(),
352            "generator".into(),
353            "comment".into(),
354            "indent".into(),
355            "trailing-comma".into(),
356            "semicolon".into(),
357            "blank-lines-before".into(),
358            "start-byte".into(),
359            "end-byte".into(),
360        ],
361        has_order: true,
362        has_coproducts: false,
363        has_recursion: true,
364        has_causal: false,
365        nominal_identity: false,
366        has_defaults: false,
367        has_coercions: false,
368        has_mergers: false,
369        has_policies: false,
370    }
371}
372
/// Return `s` with its first character upper-cased and the remainder unchanged.
///
/// Upper-casing follows `char::to_uppercase`, so the first character may
/// expand to several (`'ß'` becomes `"SS"`). An empty input yields an empty
/// string.
fn capitalize_first(s: &str) -> String {
    match s.chars().next() {
        None => String::new(),
        Some(first) => {
            // Everything after the first character, sliced on its UTF-8 boundary.
            let rest = &s[first.len_utf8()..];
            let mut out = String::with_capacity(s.len());
            out.extend(first.to_uppercase());
            out.push_str(rest);
            out
        }
    }
}