panproto_parse/registry.rs
1//! Parser registry mapping protocol names to full-AST parser implementations.
2
3use std::path::Path;
4
5use panproto_schema::Schema;
6use rustc_hash::FxHashMap;
7
8use crate::error::ParseError;
9use crate::theory_extract::ExtractedTheoryMeta;
10
11/// A full-AST parser and emitter for a specific programming language.
12///
13/// Each implementation wraps a tree-sitter grammar and its auto-derived theory,
14/// providing parse (source → Schema) and emit (Schema → source) operations.
15pub trait AstParser: Send + Sync {
16 /// The panproto protocol name (e.g. `"typescript"`, `"python"`).
17 fn protocol_name(&self) -> &str;
18
19 /// Parse source code into a full-AST [`Schema`].
20 ///
21 /// # Errors
22 ///
23 /// Returns [`ParseError`] if tree-sitter parsing fails or schema construction fails.
24 fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError>;
25
26 /// Emit a [`Schema`] back to source code bytes.
27 ///
28 /// The emitter walks the schema graph top-down, using formatting constraints
29 /// (comment, indent, blank-lines-before) to reproduce the original formatting.
30 ///
31 /// # Errors
32 ///
33 /// Returns [`ParseError::EmitFailed`] if emission fails.
34 fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError>;
35
36 /// File extensions this parser handles (e.g. `["ts", "tsx"]`).
37 fn supported_extensions(&self) -> &[&str];
38
39 /// The auto-derived theory metadata for this language.
40 fn theory_meta(&self) -> &ExtractedTheoryMeta;
41
42 /// Render a by-construction [`Schema`] (one with no parse-recovered
43 /// byte positions or interstitials) to source bytes.
44 ///
45 /// Unlike [`emit`](Self::emit), which reconstructs source from
46 /// byte-position fragments stored on the schema during `parse`,
47 /// `emit_pretty` walks tree-sitter `grammar.json` production rules
48 /// to render schemas built from scratch via `SchemaBuilder`.
49 ///
50 /// # Errors
51 ///
52 /// Returns [`ParseError::EmitFailed`] when the language has no
53 /// vendored `grammar.json`, when a vertex's kind is not a grammar
54 /// rule, or when a required field has no corresponding schema edge.
55 fn emit_pretty(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
56 let _ = schema;
57 Err(ParseError::EmitFailed {
58 protocol: self.protocol_name().to_owned(),
59 reason: format!(
60 "emit_pretty not implemented for protocol '{}'",
61 self.protocol_name()
62 ),
63 })
64 }
65}
66
67/// Registry of all full-AST parsers, keyed by protocol name.
68///
69/// Provides language detection by file extension and dispatches parse/emit
70/// operations to the appropriate language parser.
71pub struct ParserRegistry {
72 /// Parsers keyed by protocol name.
73 parsers: FxHashMap<String, Box<dyn AstParser>>,
74 /// Extension → protocol name mapping.
75 extension_map: FxHashMap<String, String>,
76}
77
78impl ParserRegistry {
79 /// Create a new registry populated with all enabled language parsers.
80 ///
81 /// With the `grammars` feature (default), this populates the registry from
82 /// `panproto-grammars`, which provides up to 259 tree-sitter languages.
83 /// Without the `grammars` feature, this returns an empty registry; call
84 /// [`register`](Self::register) to add parsers manually using individual
85 /// grammar crates.
86 #[must_use]
87 pub fn new() -> Self {
88 let mut registry = Self {
89 parsers: FxHashMap::default(),
90 extension_map: FxHashMap::default(),
91 };
92
93 #[cfg(feature = "grammars")]
94 for grammar in panproto_grammars::grammars() {
95 let config = crate::languages::walker_configs::walker_config_for(grammar.name);
96 match crate::languages::common::LanguageParser::from_language_with_grammar_json(
97 grammar.name,
98 grammar.extensions.to_vec(),
99 grammar.language,
100 grammar.node_types,
101 grammar.tags_query,
102 config,
103 grammar.grammar_json,
104 ) {
105 Ok(p) => registry.register(Box::new(p)),
106 Err(err) => {
107 let _ = err;
108 #[cfg(debug_assertions)]
109 eprintln!(
110 "warning: grammar '{}' theory extraction failed: {err}",
111 grammar.name
112 );
113 }
114 }
115 }
116
117 registry
118 }
119
120 /// Register a parser implementation.
121 pub fn register(&mut self, parser: Box<dyn AstParser>) {
122 let name = parser.protocol_name().to_owned();
123 for ext in parser.supported_extensions() {
124 self.extension_map.insert((*ext).to_owned(), name.clone());
125 }
126 self.parsers.insert(name, parser);
127 }
128
129 /// Register a tree-sitter language as a full-AST parser.
130 ///
131 /// Used by `panproto-grammars-*` companion crates that ship grammars
132 /// outside the default `panproto-grammars` build. The byte-slice
133 /// arguments must outlive this registry; the canonical pattern is
134 /// for the companion to bake the data into `&'static` rodata at
135 /// compile time and pass references that are valid for the process
136 /// lifetime.
137 ///
138 /// `walker_config` is looked up by `name` from the bundled per-language
139 /// configuration table. Languages without a tailored configuration
140 /// fall back to the default walker config.
141 ///
142 /// # Errors
143 ///
144 /// Returns [`ParseError`] if theory extraction from `node_types_json`
145 /// fails or if the tags query rejects compilation.
146 pub fn register_external_grammar(
147 &mut self,
148 name: &'static str,
149 extensions: Vec<&'static str>,
150 language: tree_sitter::Language,
151 node_types_json: &'static [u8],
152 tags_query: Option<&'static str>,
153 grammar_json: Option<&'static [u8]>,
154 ) -> Result<(), crate::error::ParseError> {
155 let config = crate::languages::walker_configs::walker_config_for(name);
156 let parser = crate::languages::common::LanguageParser::from_language_with_grammar_json(
157 name,
158 extensions,
159 language,
160 node_types_json,
161 tags_query,
162 config,
163 grammar_json,
164 )?;
165 self.register(Box::new(parser));
166 Ok(())
167 }
168
169 /// Owned-data variant of [`register_external_grammar`](Self::register_external_grammar).
170 ///
171 /// Accepts `String` / `Vec<u8>` rather than `&'static` references. The
172 /// caller is presumed not to have process-lifetime rodata available
173 /// (typical dev-time use: bytes read from disk via the Python binding's
174 /// override hook). To match the trait's `'static` lifetime requirement
175 /// the inputs are leaked into the heap; the leak is one-time per
176 /// override.
177 ///
178 /// This is the registration primitive for grammar-author workflows
179 /// where a grammar's `parser.c` / `grammar.json` / `node-types.json`
180 /// are evolving outside the panproto release cadence. Production
181 /// builds should continue to use [`register_external_grammar`](Self::register_external_grammar) with
182 /// `'static` data baked into the binary at compile time.
183 ///
184 /// # Errors
185 ///
186 /// Returns [`ParseError`] if theory extraction or tags-query
187 /// compilation fails.
188 pub fn register_external_grammar_owned(
189 &mut self,
190 name: String,
191 extensions: Vec<String>,
192 language: tree_sitter::Language,
193 node_types_json: Vec<u8>,
194 tags_query: Option<String>,
195 grammar_json: Option<Vec<u8>>,
196 ) -> Result<(), crate::error::ParseError> {
197 let name_static: &'static str = Box::leak(name.into_boxed_str());
198 let extensions_static: Vec<&'static str> = extensions
199 .into_iter()
200 .map(|s| Box::leak(s.into_boxed_str()) as &'static str)
201 .collect();
202 let node_types_static: &'static [u8] = Box::leak(node_types_json.into_boxed_slice());
203 let tags_query_static: Option<&'static str> =
204 tags_query.map(|s| Box::leak(s.into_boxed_str()) as &'static str);
205 let grammar_json_static: Option<&'static [u8]> =
206 grammar_json.map(|v| Box::leak(v.into_boxed_slice()) as &'static [u8]);
207
208 self.register_external_grammar(
209 name_static,
210 extensions_static,
211 language,
212 node_types_static,
213 tags_query_static,
214 grammar_json_static,
215 )
216 }
217
218 /// Remove a registration by protocol name.
219 ///
220 /// Drops the parser and any extension mappings that pointed at it.
221 /// Returns `true` if a parser was removed, `false` if no such
222 /// registration existed. Primarily intended for grammar-author
223 /// workflows where a registered grammar is being replaced by a
224 /// freshly-compiled version mid-process.
225 pub fn unregister(&mut self, name: &str) -> bool {
226 let removed = self.parsers.remove(name).is_some();
227 if removed {
228 self.extension_map.retain(|_, v| v != name);
229 }
230 removed
231 }
232
233 /// Override a registered grammar with new owned data.
234 ///
235 /// Equivalent to [`unregister`](Self::unregister) followed by
236 /// [`register_external_grammar_owned`](Self::register_external_grammar_owned),
237 /// and intended for the same grammar-author dev workflow. Any
238 /// extension mappings previously bound to `name` are replaced by
239 /// the new `extensions`.
240 ///
241 /// # Errors
242 ///
243 /// Returns [`ParseError`] if theory extraction or tags-query
244 /// compilation fails on the new grammar; in that case the prior
245 /// registration is already gone.
246 pub fn override_grammar(
247 &mut self,
248 name: String,
249 extensions: Vec<String>,
250 language: tree_sitter::Language,
251 node_types_json: Vec<u8>,
252 tags_query: Option<String>,
253 grammar_json: Option<Vec<u8>>,
254 ) -> Result<(), crate::error::ParseError> {
255 self.unregister(&name);
256 self.register_external_grammar_owned(
257 name,
258 extensions,
259 language,
260 node_types_json,
261 tags_query,
262 grammar_json,
263 )
264 }
265
266 /// Detect the language protocol for a file path by its extension.
267 ///
268 /// Returns `None` if the extension is not recognized (caller should
269 /// fall back to the `raw_file` protocol).
270 #[must_use]
271 pub fn detect_language(&self, path: &Path) -> Option<&str> {
272 path.extension()
273 .and_then(|ext| ext.to_str())
274 .and_then(|ext| self.extension_map.get(ext))
275 .map(String::as_str)
276 }
277
278 /// Parse a file by detecting its language from the file path.
279 ///
280 /// # Errors
281 ///
282 /// Returns [`ParseError::UnknownLanguage`] if the file extension is not recognized.
283 /// Returns other [`ParseError`] variants if parsing fails.
284 pub fn parse_file(&self, path: &Path, content: &[u8]) -> Result<Schema, ParseError> {
285 let protocol = self
286 .detect_language(path)
287 .ok_or_else(|| ParseError::UnknownLanguage {
288 extension: path
289 .extension()
290 .and_then(|e| e.to_str())
291 .unwrap_or("")
292 .to_owned(),
293 })?;
294
295 self.parse_with_protocol(protocol, content, &path.display().to_string())
296 }
297
298 /// Parse source code with a specific protocol name.
299 ///
300 /// # Errors
301 ///
302 /// Returns [`ParseError::UnknownLanguage`] if the protocol is not registered.
303 pub fn parse_with_protocol(
304 &self,
305 protocol: &str,
306 content: &[u8],
307 file_path: &str,
308 ) -> Result<Schema, ParseError> {
309 let parser = self
310 .parsers
311 .get(protocol)
312 .ok_or_else(|| ParseError::UnknownLanguage {
313 extension: protocol.to_owned(),
314 })?;
315
316 parser.parse(content, file_path)
317 }
318
319 /// Emit a schema back to source code bytes using the specified protocol.
320 ///
321 /// # Errors
322 ///
323 /// Returns [`ParseError::UnknownLanguage`] if the protocol is not registered.
324 pub fn emit_with_protocol(
325 &self,
326 protocol: &str,
327 schema: &Schema,
328 ) -> Result<Vec<u8>, ParseError> {
329 let parser = self
330 .parsers
331 .get(protocol)
332 .ok_or_else(|| ParseError::UnknownLanguage {
333 extension: protocol.to_owned(),
334 })?;
335
336 parser.emit(schema)
337 }
338
339 /// Render a by-construction schema using the named protocol.
340 ///
341 /// # Errors
342 ///
343 /// Returns [`ParseError::UnknownLanguage`] if the protocol is not
344 /// registered, or [`ParseError::EmitFailed`] from the underlying
345 /// parser's `emit_pretty`.
346 pub fn emit_pretty_with_protocol(
347 &self,
348 protocol: &str,
349 schema: &Schema,
350 ) -> Result<Vec<u8>, ParseError> {
351 let parser = self
352 .parsers
353 .get(protocol)
354 .ok_or_else(|| ParseError::UnknownLanguage {
355 extension: protocol.to_owned(),
356 })?;
357
358 parser.emit_pretty(schema)
359 }
360
361 /// Get the theory metadata for a specific protocol.
362 #[must_use]
363 pub fn theory_meta(&self, protocol: &str) -> Option<&ExtractedTheoryMeta> {
364 self.parsers.get(protocol).map(|p| p.theory_meta())
365 }
366
367 /// List all registered protocol names.
368 pub fn protocol_names(&self) -> impl Iterator<Item = &str> {
369 self.parsers.keys().map(String::as_str)
370 }
371
372 /// O(1) lookup: is a parser already registered for `protocol`?
373 ///
374 /// Useful for dedup at the registration boundary. The umbrella
375 /// `panproto-grammars-all` companion pack overlaps with both the
376 /// built-in core grammars and every per-group pack; callers can
377 /// short-circuit before re-registering rather than scanning
378 /// `protocol_names()` linearly.
379 #[must_use]
380 pub fn has_parser(&self, protocol: &str) -> bool {
381 self.parsers.contains_key(protocol)
382 }
383
384 /// Get the number of registered parsers.
385 #[must_use]
386 pub fn len(&self) -> usize {
387 self.parsers.len()
388 }
389
390 /// Check if the registry is empty.
391 #[must_use]
392 pub fn is_empty(&self) -> bool {
393 self.parsers.is_empty()
394 }
395}
396
397impl Default for ParserRegistry {
398 fn default() -> Self {
399 Self::new()
400 }
401}