// panproto_parse/registry.rs

//! Parser registry mapping protocol names to full-AST parser implementations.
2
3use std::path::Path;
4
5use panproto_schema::Schema;
6use rustc_hash::FxHashMap;
7
8use crate::error::ParseError;
9use crate::theory_extract::ExtractedTheoryMeta;
10
11/// A full-AST parser and emitter for a specific programming language.
12///
13/// Each implementation wraps a tree-sitter grammar and its auto-derived theory,
14/// providing parse (source → Schema) and emit (Schema → source) operations.
15pub trait AstParser: Send + Sync {
16    /// The panproto protocol name (e.g. `"typescript"`, `"python"`).
17    fn protocol_name(&self) -> &str;
18
19    /// Parse source code into a full-AST [`Schema`].
20    ///
21    /// # Errors
22    ///
23    /// Returns [`ParseError`] if tree-sitter parsing fails or schema construction fails.
24    fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError>;
25
26    /// Emit a [`Schema`] back to source code bytes.
27    ///
28    /// The emitter walks the schema graph top-down, using formatting constraints
29    /// (comment, indent, blank-lines-before) to reproduce the original formatting.
30    ///
31    /// # Errors
32    ///
33    /// Returns [`ParseError::EmitFailed`] if emission fails.
34    fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError>;
35
36    /// File extensions this parser handles (e.g. `["ts", "tsx"]`).
37    fn supported_extensions(&self) -> &[&str];
38
39    /// The auto-derived theory metadata for this language.
40    fn theory_meta(&self) -> &ExtractedTheoryMeta;
41
42    /// Render a by-construction [`Schema`] (one with no parse-recovered
43    /// byte positions or interstitials) to source bytes.
44    ///
45    /// Unlike [`emit`](Self::emit), which reconstructs source from
46    /// byte-position fragments stored on the schema during `parse`,
47    /// `emit_pretty` walks tree-sitter `grammar.json` production rules
48    /// to render schemas built from scratch via `SchemaBuilder`.
49    ///
50    /// # Errors
51    ///
52    /// Returns [`ParseError::EmitFailed`] when the language has no
53    /// vendored `grammar.json`, when a vertex's kind is not a grammar
54    /// rule, or when a required field has no corresponding schema edge.
55    fn emit_pretty(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
56        let _ = schema;
57        Err(ParseError::EmitFailed {
58            protocol: self.protocol_name().to_owned(),
59            reason: format!(
60                "emit_pretty not implemented for protocol '{}'",
61                self.protocol_name()
62            ),
63        })
64    }
65}
66
67/// Registry of all full-AST parsers, keyed by protocol name.
68///
69/// Provides language detection by file extension and dispatches parse/emit
70/// operations to the appropriate language parser.
71pub struct ParserRegistry {
72    /// Parsers keyed by protocol name.
73    parsers: FxHashMap<String, Box<dyn AstParser>>,
74    /// Extension → protocol name mapping.
75    extension_map: FxHashMap<String, String>,
76}
77
78impl ParserRegistry {
79    /// Create a new registry populated with all enabled language parsers.
80    ///
81    /// With the `grammars` feature (default), this populates the registry from
82    /// `panproto-grammars`, which provides up to 259 tree-sitter languages.
83    /// Without the `grammars` feature, this returns an empty registry; call
84    /// [`register`](Self::register) to add parsers manually using individual
85    /// grammar crates.
86    #[must_use]
87    pub fn new() -> Self {
88        let mut registry = Self {
89            parsers: FxHashMap::default(),
90            extension_map: FxHashMap::default(),
91        };
92
93        #[cfg(feature = "grammars")]
94        for grammar in panproto_grammars::grammars() {
95            let config = crate::languages::walker_configs::walker_config_for(grammar.name);
96            match crate::languages::common::LanguageParser::from_language_with_grammar_json(
97                grammar.name,
98                grammar.extensions.to_vec(),
99                grammar.language,
100                grammar.node_types,
101                grammar.tags_query,
102                config,
103                grammar.grammar_json,
104            ) {
105                Ok(p) => registry.register(Box::new(p)),
106                Err(err) => {
107                    let _ = err;
108                    #[cfg(debug_assertions)]
109                    eprintln!(
110                        "warning: grammar '{}' theory extraction failed: {err}",
111                        grammar.name
112                    );
113                }
114            }
115        }
116
117        registry
118    }
119
120    /// Register a parser implementation.
121    pub fn register(&mut self, parser: Box<dyn AstParser>) {
122        let name = parser.protocol_name().to_owned();
123        for ext in parser.supported_extensions() {
124            self.extension_map.insert((*ext).to_owned(), name.clone());
125        }
126        self.parsers.insert(name, parser);
127    }
128
129    /// Register a tree-sitter language as a full-AST parser.
130    ///
131    /// Used by `panproto-grammars-*` companion crates that ship grammars
132    /// outside the default `panproto-grammars` build. The byte-slice
133    /// arguments must outlive this registry; the canonical pattern is
134    /// for the companion to bake the data into `&'static` rodata at
135    /// compile time and pass references that are valid for the process
136    /// lifetime.
137    ///
138    /// `walker_config` is looked up by `name` from the bundled per-language
139    /// configuration table. Languages without a tailored configuration
140    /// fall back to the default walker config.
141    ///
142    /// # Errors
143    ///
144    /// Returns [`ParseError`] if theory extraction from `node_types_json`
145    /// fails or if the tags query rejects compilation.
146    pub fn register_external_grammar(
147        &mut self,
148        name: &'static str,
149        extensions: Vec<&'static str>,
150        language: tree_sitter::Language,
151        node_types_json: &'static [u8],
152        tags_query: Option<&'static str>,
153        grammar_json: Option<&'static [u8]>,
154    ) -> Result<(), crate::error::ParseError> {
155        let config = crate::languages::walker_configs::walker_config_for(name);
156        let parser = crate::languages::common::LanguageParser::from_language_with_grammar_json(
157            name,
158            extensions,
159            language,
160            node_types_json,
161            tags_query,
162            config,
163            grammar_json,
164        )?;
165        self.register(Box::new(parser));
166        Ok(())
167    }
168
169    /// Owned-data variant of [`register_external_grammar`](Self::register_external_grammar).
170    ///
171    /// Accepts `String` / `Vec<u8>` rather than `&'static` references. The
172    /// caller is presumed not to have process-lifetime rodata available
173    /// (typical dev-time use: bytes read from disk via the Python binding's
174    /// override hook). To match the trait's `'static` lifetime requirement
175    /// the inputs are leaked into the heap; the leak is one-time per
176    /// override.
177    ///
178    /// This is the registration primitive for grammar-author workflows
179    /// where a grammar's `parser.c` / `grammar.json` / `node-types.json`
180    /// are evolving outside the panproto release cadence. Production
181    /// builds should continue to use [`register_external_grammar`](Self::register_external_grammar) with
182    /// `'static` data baked into the binary at compile time.
183    ///
184    /// # Errors
185    ///
186    /// Returns [`ParseError`] if theory extraction or tags-query
187    /// compilation fails.
188    pub fn register_external_grammar_owned(
189        &mut self,
190        name: String,
191        extensions: Vec<String>,
192        language: tree_sitter::Language,
193        node_types_json: Vec<u8>,
194        tags_query: Option<String>,
195        grammar_json: Option<Vec<u8>>,
196    ) -> Result<(), crate::error::ParseError> {
197        let name_static: &'static str = Box::leak(name.into_boxed_str());
198        let extensions_static: Vec<&'static str> = extensions
199            .into_iter()
200            .map(|s| Box::leak(s.into_boxed_str()) as &'static str)
201            .collect();
202        let node_types_static: &'static [u8] = Box::leak(node_types_json.into_boxed_slice());
203        let tags_query_static: Option<&'static str> =
204            tags_query.map(|s| Box::leak(s.into_boxed_str()) as &'static str);
205        let grammar_json_static: Option<&'static [u8]> =
206            grammar_json.map(|v| Box::leak(v.into_boxed_slice()) as &'static [u8]);
207
208        self.register_external_grammar(
209            name_static,
210            extensions_static,
211            language,
212            node_types_static,
213            tags_query_static,
214            grammar_json_static,
215        )
216    }
217
218    /// Remove a registration by protocol name.
219    ///
220    /// Drops the parser and any extension mappings that pointed at it.
221    /// Returns `true` if a parser was removed, `false` if no such
222    /// registration existed. Primarily intended for grammar-author
223    /// workflows where a registered grammar is being replaced by a
224    /// freshly-compiled version mid-process.
225    pub fn unregister(&mut self, name: &str) -> bool {
226        let removed = self.parsers.remove(name).is_some();
227        if removed {
228            self.extension_map.retain(|_, v| v != name);
229        }
230        removed
231    }
232
233    /// Override a registered grammar with new owned data.
234    ///
235    /// Equivalent to [`unregister`](Self::unregister) followed by
236    /// [`register_external_grammar_owned`](Self::register_external_grammar_owned),
237    /// and intended for the same grammar-author dev workflow. Any
238    /// extension mappings previously bound to `name` are replaced by
239    /// the new `extensions`.
240    ///
241    /// # Errors
242    ///
243    /// Returns [`ParseError`] if theory extraction or tags-query
244    /// compilation fails on the new grammar; in that case the prior
245    /// registration is already gone.
246    pub fn override_grammar(
247        &mut self,
248        name: String,
249        extensions: Vec<String>,
250        language: tree_sitter::Language,
251        node_types_json: Vec<u8>,
252        tags_query: Option<String>,
253        grammar_json: Option<Vec<u8>>,
254    ) -> Result<(), crate::error::ParseError> {
255        self.unregister(&name);
256        self.register_external_grammar_owned(
257            name,
258            extensions,
259            language,
260            node_types_json,
261            tags_query,
262            grammar_json,
263        )
264    }
265
266    /// Detect the language protocol for a file path by its extension.
267    ///
268    /// Returns `None` if the extension is not recognized (caller should
269    /// fall back to the `raw_file` protocol).
270    #[must_use]
271    pub fn detect_language(&self, path: &Path) -> Option<&str> {
272        path.extension()
273            .and_then(|ext| ext.to_str())
274            .and_then(|ext| self.extension_map.get(ext))
275            .map(String::as_str)
276    }
277
278    /// Parse a file by detecting its language from the file path.
279    ///
280    /// # Errors
281    ///
282    /// Returns [`ParseError::UnknownLanguage`] if the file extension is not recognized.
283    /// Returns other [`ParseError`] variants if parsing fails.
284    pub fn parse_file(&self, path: &Path, content: &[u8]) -> Result<Schema, ParseError> {
285        let protocol = self
286            .detect_language(path)
287            .ok_or_else(|| ParseError::UnknownLanguage {
288                extension: path
289                    .extension()
290                    .and_then(|e| e.to_str())
291                    .unwrap_or("")
292                    .to_owned(),
293            })?;
294
295        self.parse_with_protocol(protocol, content, &path.display().to_string())
296    }
297
298    /// Parse source code with a specific protocol name.
299    ///
300    /// # Errors
301    ///
302    /// Returns [`ParseError::UnknownLanguage`] if the protocol is not registered.
303    pub fn parse_with_protocol(
304        &self,
305        protocol: &str,
306        content: &[u8],
307        file_path: &str,
308    ) -> Result<Schema, ParseError> {
309        let parser = self
310            .parsers
311            .get(protocol)
312            .ok_or_else(|| ParseError::UnknownLanguage {
313                extension: protocol.to_owned(),
314            })?;
315
316        parser.parse(content, file_path)
317    }
318
319    /// Emit a schema back to source code bytes using the specified protocol.
320    ///
321    /// # Errors
322    ///
323    /// Returns [`ParseError::UnknownLanguage`] if the protocol is not registered.
324    pub fn emit_with_protocol(
325        &self,
326        protocol: &str,
327        schema: &Schema,
328    ) -> Result<Vec<u8>, ParseError> {
329        let parser = self
330            .parsers
331            .get(protocol)
332            .ok_or_else(|| ParseError::UnknownLanguage {
333                extension: protocol.to_owned(),
334            })?;
335
336        parser.emit(schema)
337    }
338
339    /// Render a by-construction schema using the named protocol.
340    ///
341    /// # Errors
342    ///
343    /// Returns [`ParseError::UnknownLanguage`] if the protocol is not
344    /// registered, or [`ParseError::EmitFailed`] from the underlying
345    /// parser's `emit_pretty`.
346    pub fn emit_pretty_with_protocol(
347        &self,
348        protocol: &str,
349        schema: &Schema,
350    ) -> Result<Vec<u8>, ParseError> {
351        let parser = self
352            .parsers
353            .get(protocol)
354            .ok_or_else(|| ParseError::UnknownLanguage {
355                extension: protocol.to_owned(),
356            })?;
357
358        parser.emit_pretty(schema)
359    }
360
361    /// Get the theory metadata for a specific protocol.
362    #[must_use]
363    pub fn theory_meta(&self, protocol: &str) -> Option<&ExtractedTheoryMeta> {
364        self.parsers.get(protocol).map(|p| p.theory_meta())
365    }
366
367    /// List all registered protocol names.
368    pub fn protocol_names(&self) -> impl Iterator<Item = &str> {
369        self.parsers.keys().map(String::as_str)
370    }
371
372    /// O(1) lookup: is a parser already registered for `protocol`?
373    ///
374    /// Useful for dedup at the registration boundary. The umbrella
375    /// `panproto-grammars-all` companion pack overlaps with both the
376    /// built-in core grammars and every per-group pack; callers can
377    /// short-circuit before re-registering rather than scanning
378    /// `protocol_names()` linearly.
379    #[must_use]
380    pub fn has_parser(&self, protocol: &str) -> bool {
381        self.parsers.contains_key(protocol)
382    }
383
384    /// Get the number of registered parsers.
385    #[must_use]
386    pub fn len(&self) -> usize {
387        self.parsers.len()
388    }
389
390    /// Check if the registry is empty.
391    #[must_use]
392    pub fn is_empty(&self) -> bool {
393        self.parsers.is_empty()
394    }
395}
396
397impl Default for ParserRegistry {
398    fn default() -> Self {
399        Self::new()
400    }
401}