Skip to main content

panproto_parse/
registry.rs

1//! Parser registry mapping protocol names to full-AST parser implementations.
2
3use std::path::Path;
4use std::sync::Arc;
5
6use panproto_schema::{AbstractSchema, DecoratedSchema, Schema};
7use rustc_hash::FxHashMap;
8
9use crate::error::ParseError;
10use crate::layout_policy::LayoutPolicy;
11use crate::theory_extract::ExtractedTheoryMeta;
12
13/// A full-AST parser and emitter for a specific programming language.
14///
15/// Each implementation wraps a tree-sitter grammar and its auto-derived theory,
16/// providing parse (source → Schema) and emit (Schema → source) operations.
17pub trait AstParser: Send + Sync {
18    /// The panproto protocol name (e.g. `"typescript"`, `"python"`).
19    fn protocol_name(&self) -> &str;
20
21    /// Parse source code into a full-AST [`Schema`].
22    ///
23    /// # Errors
24    ///
25    /// Returns [`ParseError`] if tree-sitter parsing fails or schema construction fails.
26    fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError>;
27
28    /// Emit a [`Schema`] back to source code bytes.
29    ///
30    /// The emitter walks the schema graph top-down, using formatting constraints
31    /// (comment, indent, blank-lines-before) to reproduce the original formatting.
32    ///
33    /// # Errors
34    ///
35    /// Returns [`ParseError::EmitFailed`] if emission fails.
36    fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError>;
37
38    /// File extensions this parser handles (e.g. `["ts", "tsx"]`).
39    fn supported_extensions(&self) -> &[&str];
40
41    /// The auto-derived theory metadata for this language.
42    fn theory_meta(&self) -> &ExtractedTheoryMeta;
43
44    /// Render a by-construction [`Schema`] (one with no parse-recovered
45    /// byte positions or interstitials) to source bytes.
46    ///
47    /// Unlike [`emit`](Self::emit), which reconstructs source from
48    /// byte-position fragments stored on the schema during `parse`,
49    /// `emit_pretty` walks tree-sitter `grammar.json` production rules
50    /// to render schemas built from scratch via `SchemaBuilder`.
51    ///
52    /// # Errors
53    ///
54    /// Returns [`ParseError::EmitFailed`] when the language has no
55    /// vendored `grammar.json`, when a vertex's kind is not a grammar
56    /// rule, or when a required field has no corresponding schema edge.
57    fn emit_pretty(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
58        self.emit_pretty_with_policy(schema, &crate::emit_pretty::FormatPolicy::default())
59    }
60
61    /// Render a by-construction [`Schema`] under a caller-supplied
62    /// [`FormatPolicy`](crate::emit_pretty::FormatPolicy).
63    ///
64    /// The policy governs every configurable aspect of the rendered
65    /// output: separator between glued tokens, newline byte sequence,
66    /// indent width, line-break and indent-open/close token sets. The
67    /// default policy (used by [`emit_pretty`](Self::emit_pretty))
68    /// targets syntactic validity with ASCII conventions; callers
69    /// supplying their own policy can pin idiomatic formatting.
70    ///
71    /// # Errors
72    ///
73    /// Returns [`ParseError::EmitFailed`] when the language has no
74    /// vendored `grammar.json`, when a vertex's kind is not a grammar
75    /// rule, or when a required field has no corresponding schema edge.
76    fn emit_pretty_with_policy(
77        &self,
78        schema: &Schema,
79        policy: &crate::emit_pretty::FormatPolicy,
80    ) -> Result<Vec<u8>, ParseError> {
81        let _ = (schema, policy);
82        Err(ParseError::EmitFailed {
83            protocol: self.protocol_name().to_owned(),
84            reason: format!(
85                "emit_pretty_with_policy not implemented for protocol '{}'",
86                self.protocol_name()
87            ),
88        })
89    }
90}
91
92/// Registry of all full-AST parsers, keyed by protocol name.
93///
94/// Provides language detection by file extension and dispatches parse/emit
95/// operations to the appropriate language parser.
96pub struct ParserRegistry {
97    /// Parsers keyed by protocol name.
98    ///
99    /// Held by `Arc` (not `Box`) so the same handle can be shared with
100    /// the layout-enrichment registry without re-wrapping at every
101    /// lookup. Registration installs both: the parser into `parsers`
102    /// and a thin adapter into the lens crate's enrichment registry.
103    parsers: FxHashMap<String, Arc<dyn AstParser>>,
104    /// Extension → protocol name mapping.
105    extension_map: FxHashMap<String, String>,
106}
107
108impl ParserRegistry {
109    /// Create a new registry populated with all enabled language parsers.
110    ///
111    /// With the `grammars` feature (default), this populates the registry from
112    /// `panproto-grammars`, which provides up to 259 tree-sitter languages.
113    /// Without the `grammars` feature, this returns an empty registry; call
114    /// [`register`](Self::register) to add parsers manually using individual
115    /// grammar crates.
116    #[must_use]
117    pub fn new() -> Self {
118        let mut registry = Self {
119            parsers: FxHashMap::default(),
120            extension_map: FxHashMap::default(),
121        };
122
123        #[cfg(feature = "grammars")]
124        for grammar in panproto_grammars::grammars() {
125            let config = crate::languages::walker_configs::walker_config_for(grammar.name);
126            match crate::languages::common::LanguageParser::from_language_with_grammar_json(
127                grammar.name,
128                grammar.extensions.to_vec(),
129                grammar.language,
130                grammar.node_types,
131                grammar.tags_query,
132                config,
133                grammar.grammar_json,
134            ) {
135                Ok(p) => registry.register(Box::new(p)),
136                Err(err) => {
137                    let _ = err;
138                    #[cfg(debug_assertions)]
139                    eprintln!(
140                        "warning: grammar '{}' theory extraction failed: {err}",
141                        grammar.name
142                    );
143                }
144            }
145        }
146
147        registry
148    }
149
150    /// Register a parser implementation.
151    ///
152    /// In addition to keying the parser by its protocol name, this
153    /// installs a [`LayoutEnricher`](panproto_lens::enrichment_registry::LayoutEnricher)
154    /// adapter into the global enrichment registry so that a
155    /// `parse_emit_protolens(protocol, …)` instantiation finds a
156    /// synthesis driver without any further wiring.
157    pub fn register(&mut self, parser: Box<dyn AstParser>) {
158        let name = parser.protocol_name().to_owned();
159        for ext in parser.supported_extensions() {
160            self.extension_map.insert((*ext).to_owned(), name.clone());
161        }
162        let arc: Arc<dyn AstParser> = Arc::from(parser);
163        crate::decorate::register_layout_enricher(Arc::clone(&arc));
164        self.parsers.insert(name, arc);
165    }
166
167    /// Register a tree-sitter language as a full-AST parser.
168    ///
169    /// Used by `panproto-grammars-*` companion crates that ship grammars
170    /// outside the default `panproto-grammars` build. The byte-slice
171    /// arguments must outlive this registry; the canonical pattern is
172    /// for the companion to bake the data into `&'static` rodata at
173    /// compile time and pass references that are valid for the process
174    /// lifetime.
175    ///
176    /// `walker_config` is looked up by `name` from the bundled per-language
177    /// configuration table. Languages without a tailored configuration
178    /// fall back to the default walker config.
179    ///
180    /// # Errors
181    ///
182    /// Returns [`ParseError`] if theory extraction from `node_types_json`
183    /// fails or if the tags query rejects compilation.
184    pub fn register_external_grammar(
185        &mut self,
186        name: &'static str,
187        extensions: Vec<&'static str>,
188        language: tree_sitter::Language,
189        node_types_json: &'static [u8],
190        tags_query: Option<&'static str>,
191        grammar_json: Option<&'static [u8]>,
192    ) -> Result<(), crate::error::ParseError> {
193        let config = crate::languages::walker_configs::walker_config_for(name);
194        let parser = crate::languages::common::LanguageParser::from_language_with_grammar_json(
195            name,
196            extensions,
197            language,
198            node_types_json,
199            tags_query,
200            config,
201            grammar_json,
202        )?;
203        self.register(Box::new(parser));
204        Ok(())
205    }
206
207    /// Owned-data variant of [`register_external_grammar`](Self::register_external_grammar).
208    ///
209    /// Accepts `String` / `Vec<u8>` rather than `&'static` references. The
210    /// caller is presumed not to have process-lifetime rodata available
211    /// (typical dev-time use: bytes read from disk via the Python binding's
212    /// override hook). To match the trait's `'static` lifetime requirement
213    /// the inputs are leaked into the heap; the leak is one-time per
214    /// override.
215    ///
216    /// This is the registration primitive for grammar-author workflows
217    /// where a grammar's `parser.c` / `grammar.json` / `node-types.json`
218    /// are evolving outside the panproto release cadence. Production
219    /// builds should continue to use [`register_external_grammar`](Self::register_external_grammar) with
220    /// `'static` data baked into the binary at compile time.
221    ///
222    /// # Errors
223    ///
224    /// Returns [`ParseError`] if theory extraction or tags-query
225    /// compilation fails.
226    pub fn register_external_grammar_owned(
227        &mut self,
228        name: String,
229        extensions: Vec<String>,
230        language: tree_sitter::Language,
231        node_types_json: Vec<u8>,
232        tags_query: Option<String>,
233        grammar_json: Option<Vec<u8>>,
234    ) -> Result<(), crate::error::ParseError> {
235        let name_static: &'static str = Box::leak(name.into_boxed_str());
236        let extensions_static: Vec<&'static str> = extensions
237            .into_iter()
238            .map(|s| Box::leak(s.into_boxed_str()) as &'static str)
239            .collect();
240        let node_types_static: &'static [u8] = Box::leak(node_types_json.into_boxed_slice());
241        let tags_query_static: Option<&'static str> =
242            tags_query.map(|s| Box::leak(s.into_boxed_str()) as &'static str);
243        let grammar_json_static: Option<&'static [u8]> =
244            grammar_json.map(|v| Box::leak(v.into_boxed_slice()) as &'static [u8]);
245
246        self.register_external_grammar(
247            name_static,
248            extensions_static,
249            language,
250            node_types_static,
251            tags_query_static,
252            grammar_json_static,
253        )
254    }
255
256    /// Remove a registration by protocol name.
257    ///
258    /// Drops the parser and any extension mappings that pointed at it.
259    /// Returns `true` if a parser was removed, `false` if no such
260    /// registration existed. Primarily intended for grammar-author
261    /// workflows where a registered grammar is being replaced by a
262    /// freshly-compiled version mid-process.
263    pub fn unregister(&mut self, name: &str) -> bool {
264        let removed = self.parsers.remove(name).is_some();
265        if removed {
266            self.extension_map.retain(|_, v| v != name);
267        }
268        removed
269    }
270
271    /// Override a registered grammar with new owned data.
272    ///
273    /// Equivalent to [`unregister`](Self::unregister) followed by
274    /// [`register_external_grammar_owned`](Self::register_external_grammar_owned),
275    /// and intended for the same grammar-author dev workflow. Any
276    /// extension mappings previously bound to `name` are replaced by
277    /// the new `extensions`.
278    ///
279    /// # Errors
280    ///
281    /// Returns [`ParseError`] if theory extraction or tags-query
282    /// compilation fails on the new grammar; in that case the prior
283    /// registration is already gone.
284    pub fn override_grammar(
285        &mut self,
286        name: String,
287        extensions: Vec<String>,
288        language: tree_sitter::Language,
289        node_types_json: Vec<u8>,
290        tags_query: Option<String>,
291        grammar_json: Option<Vec<u8>>,
292    ) -> Result<(), crate::error::ParseError> {
293        self.unregister(&name);
294        self.register_external_grammar_owned(
295            name,
296            extensions,
297            language,
298            node_types_json,
299            tags_query,
300            grammar_json,
301        )
302    }
303
304    /// Detect the language protocol for a file path by its extension.
305    ///
306    /// Returns `None` if the extension is not recognized (caller should
307    /// fall back to the `raw_file` protocol).
308    #[must_use]
309    pub fn detect_language(&self, path: &Path) -> Option<&str> {
310        path.extension()
311            .and_then(|ext| ext.to_str())
312            .and_then(|ext| self.extension_map.get(ext))
313            .map(String::as_str)
314    }
315
316    /// Parse a file by detecting its language from the file path.
317    ///
318    /// # Errors
319    ///
320    /// Returns [`ParseError::UnknownLanguage`] if the file extension is not recognized.
321    /// Returns other [`ParseError`] variants if parsing fails.
322    pub fn parse_file(&self, path: &Path, content: &[u8]) -> Result<Schema, ParseError> {
323        let protocol = self
324            .detect_language(path)
325            .ok_or_else(|| ParseError::UnknownLanguage {
326                extension: path
327                    .extension()
328                    .and_then(|e| e.to_str())
329                    .unwrap_or("")
330                    .to_owned(),
331            })?;
332
333        self.parse_with_protocol(protocol, content, &path.display().to_string())
334    }
335
336    /// Parse source code with a specific protocol name.
337    ///
338    /// # Errors
339    ///
340    /// Returns [`ParseError::UnknownLanguage`] if the protocol is not registered.
341    pub fn parse_with_protocol(
342        &self,
343        protocol: &str,
344        content: &[u8],
345        file_path: &str,
346    ) -> Result<Schema, ParseError> {
347        let parser = self
348            .parsers
349            .get(protocol)
350            .ok_or_else(|| ParseError::UnknownLanguage {
351                extension: protocol.to_owned(),
352            })?;
353
354        parser.parse(content, file_path)
355    }
356
357    /// Emit a schema back to source code bytes using the specified protocol.
358    ///
359    /// # Errors
360    ///
361    /// Returns [`ParseError::UnknownLanguage`] if the protocol is not registered.
362    pub fn emit_with_protocol(
363        &self,
364        protocol: &str,
365        schema: &Schema,
366    ) -> Result<Vec<u8>, ParseError> {
367        let parser = self
368            .parsers
369            .get(protocol)
370            .ok_or_else(|| ParseError::UnknownLanguage {
371                extension: protocol.to_owned(),
372            })?;
373
374        parser.emit(schema)
375    }
376
377    /// Render a by-construction schema using the named protocol.
378    ///
379    /// # Errors
380    ///
381    /// Returns [`ParseError::UnknownLanguage`] if the protocol is not
382    /// registered, or [`ParseError::EmitFailed`] from the underlying
383    /// parser's `emit_pretty`.
384    pub fn emit_pretty_with_protocol(
385        &self,
386        protocol: &str,
387        schema: &Schema,
388    ) -> Result<Vec<u8>, ParseError> {
389        let parser = self
390            .parsers
391            .get(protocol)
392            .ok_or_else(|| ParseError::UnknownLanguage {
393                extension: protocol.to_owned(),
394            })?;
395
396        parser.emit_pretty(schema)
397    }
398
399    /// Decorate an [`AbstractSchema`] with the layout enrichment
400    /// fibre required by `emit_pretty_with_protocol` and friends.
401    ///
402    /// This is the put-direction of the parse / decorate / emit lens
403    /// at `protocol`. The implementation routes through the same
404    /// grammar walker as `emit_pretty` followed by `parse`, so the
405    /// resulting [`DecoratedSchema`] carries a complete layout fibre
406    /// recovered by the parse-side walker — `start-byte`, `end-byte`,
407    /// every `interstitial-N`, `chose-alt-fingerprint`, and
408    /// `chose-alt-child-kinds`.
409    ///
410    /// The section law holds up to kind- and edge-multiset
411    /// equivalence: `forget_layout(decorate(a)) ≅ a` modulo vertex-id
412    /// renaming. Grammars where parsing consolidates tokens that the
413    /// emitter rendered as separate sequences (e.g. lilypond's `c'4`
414    /// re-parses to a single note) do not preserve a one-to-one
415    /// vertex correspondence, so the result's vertex IDs are always
416    /// freshly minted by the parser.
417    ///
418    /// # Errors
419    ///
420    /// Returns [`ParseError::UnknownLanguage`] when `protocol` is not
421    /// registered, [`ParseError::SchemaConstruction`] when the
422    /// abstract schema was built for a different protocol than
423    /// `protocol`, [`ParseError::EmitFailed`] when the grammar walker
424    /// cannot render the abstract schema (missing `grammar.json`,
425    /// vertex kind not a rule), or any other parser error if the
426    /// re-parse step rejects the canonical bytes (a regression in the
427    /// parse/emit pipeline, not a user bug).
428    pub fn decorate(
429        &self,
430        protocol: &str,
431        abstract_schema: &AbstractSchema,
432        policy: &LayoutPolicy,
433    ) -> Result<DecoratedSchema, ParseError> {
434        let parser = self
435            .parsers
436            .get(protocol)
437            .ok_or_else(|| ParseError::UnknownLanguage {
438                extension: protocol.to_owned(),
439            })?;
440        // `decorate_with_parser` enforces the protocol-match invariant
441        // between the parser and the abstract schema, so no extra guard
442        // is needed here.
443        crate::decorate::decorate_with_parser(parser.as_ref(), abstract_schema, policy)
444    }
445
446    /// Render an [`AbstractSchema`] to canonical source bytes under
447    /// `policy`.
448    ///
449    /// Implementation note: this is exactly the first emit step of
450    /// [`decorate`](Self::decorate) — `decorate` then re-parses to
451    /// recover the layout fibre, but if all the caller wants is the
452    /// bytes, the re-parse is wasted work. Going through
453    /// `emit_pretty_with_policy` directly preserves every field of
454    /// `policy` in the output (`separator`, `newline`, `indent_width`,
455    /// `line_break_after`, `indent_open` / `indent_close`).
456    ///
457    /// # Errors
458    ///
459    /// See [`decorate`](Self::decorate).
460    pub fn pretty_with_protocol(
461        &self,
462        protocol: &str,
463        abstract_schema: &AbstractSchema,
464        policy: &LayoutPolicy,
465    ) -> Result<Vec<u8>, ParseError> {
466        let parser = self
467            .parsers
468            .get(protocol)
469            .ok_or_else(|| ParseError::UnknownLanguage {
470                extension: protocol.to_owned(),
471            })?;
472        check_protocol_match(
473            protocol,
474            abstract_schema.as_schema(),
475            "pretty_with_protocol",
476        )?;
477        parser.emit_pretty_with_policy(abstract_schema.as_schema(), policy)
478    }
479
480    /// Return the canonical [`Protolens`](panproto_lens::Protolens)
481    /// describing the parse / decorate / emit relationship at
482    /// `protocol`.
483    ///
484    /// The protolens encodes the schema-level structure of the
485    /// relationship: source-side strips the layout enrichment fibre,
486    /// target-side adds it via the registered
487    /// [`LayoutEnricher`](panproto_lens::enrichment_registry::LayoutEnricher).
488    /// It composes with the rest of the `panproto-lens` protolens
489    /// algebra for chain-law reasoning. The operational entry points
490    /// for running the relationship on real schemas are
491    /// [`decorate`](Self::decorate),
492    /// [`pretty_with_protocol`](Self::pretty_with_protocol), and
493    /// [`emit_pretty_with_protocol`](Self::emit_pretty_with_protocol).
494    ///
495    /// # Errors
496    ///
497    /// Returns [`ParseError::UnknownLanguage`] when `protocol` is not
498    /// registered.
499    pub fn parse_emit_protolens(
500        &self,
501        protocol: &str,
502        policy: &LayoutPolicy,
503    ) -> Result<panproto_lens::Protolens, ParseError> {
504        if !self.parsers.contains_key(protocol) {
505            return Err(ParseError::UnknownLanguage {
506                extension: protocol.to_owned(),
507            });
508        }
509        Ok(crate::parse_emit_protolens::parse_emit_protolens(
510            protocol, policy,
511        ))
512    }
513
514    /// Get the theory metadata for a specific protocol.
515    #[must_use]
516    pub fn theory_meta(&self, protocol: &str) -> Option<&ExtractedTheoryMeta> {
517        self.parsers.get(protocol).map(|p| p.theory_meta())
518    }
519
520    /// List all registered protocol names.
521    pub fn protocol_names(&self) -> impl Iterator<Item = &str> {
522        self.parsers.keys().map(String::as_str)
523    }
524
525    /// O(1) lookup: is a parser already registered for `protocol`?
526    ///
527    /// Useful for dedup at the registration boundary. The umbrella
528    /// `panproto-grammars-all` companion pack overlaps with both the
529    /// built-in core grammars and every per-group pack; callers can
530    /// short-circuit before re-registering rather than scanning
531    /// `protocol_names()` linearly.
532    #[must_use]
533    pub fn has_parser(&self, protocol: &str) -> bool {
534        self.parsers.contains_key(protocol)
535    }
536
537    /// Get the number of registered parsers.
538    #[must_use]
539    pub fn len(&self) -> usize {
540        self.parsers.len()
541    }
542
543    /// Check if the registry is empty.
544    #[must_use]
545    pub fn is_empty(&self) -> bool {
546        self.parsers.is_empty()
547    }
548}
549
550impl Default for ParserRegistry {
551    fn default() -> Self {
552        Self::new()
553    }
554}
555
556/// Guard against running parser-tied operations on a schema built
557/// for a different protocol. Catches the user-visible error of
558/// passing (say) a JSON schema to a Python parser before the
559/// underlying grammar walker would surface it as an opaque rule
560/// mismatch.
561fn check_protocol_match(
562    expected: &str,
563    schema: &Schema,
564    operation: &'static str,
565) -> Result<(), ParseError> {
566    if schema.protocol == expected {
567        Ok(())
568    } else {
569        Err(ParseError::SchemaConstruction {
570            reason: format!(
571                "{operation}: protocol mismatch — registry called with '{expected}' but \
572                 schema carries protocol '{}'",
573                schema.protocol,
574            ),
575        })
576    }
577}