panproto_parse/registry.rs
1//! Parser registry mapping protocol names to full-AST parser implementations.
2
3use std::path::Path;
4use std::sync::Arc;
5
6use panproto_schema::{AbstractSchema, DecoratedSchema, Schema};
7use rustc_hash::FxHashMap;
8
9use crate::error::ParseError;
10use crate::layout_policy::LayoutPolicy;
11use crate::theory_extract::ExtractedTheoryMeta;
12
13/// A full-AST parser and emitter for a specific programming language.
14///
15/// Each implementation wraps a tree-sitter grammar and its auto-derived theory,
16/// providing parse (source → Schema) and emit (Schema → source) operations.
17pub trait AstParser: Send + Sync {
18 /// The panproto protocol name (e.g. `"typescript"`, `"python"`).
19 fn protocol_name(&self) -> &str;
20
21 /// Parse source code into a full-AST [`Schema`].
22 ///
23 /// # Errors
24 ///
25 /// Returns [`ParseError`] if tree-sitter parsing fails or schema construction fails.
26 fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError>;
27
28 /// Emit a [`Schema`] back to source code bytes.
29 ///
30 /// The emitter walks the schema graph top-down, using formatting constraints
31 /// (comment, indent, blank-lines-before) to reproduce the original formatting.
32 ///
33 /// # Errors
34 ///
35 /// Returns [`ParseError::EmitFailed`] if emission fails.
36 fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError>;
37
38 /// File extensions this parser handles (e.g. `["ts", "tsx"]`).
39 fn supported_extensions(&self) -> &[&str];
40
41 /// The auto-derived theory metadata for this language.
42 fn theory_meta(&self) -> &ExtractedTheoryMeta;
43
44 /// Render a by-construction [`Schema`] (one with no parse-recovered
45 /// byte positions or interstitials) to source bytes.
46 ///
47 /// Unlike [`emit`](Self::emit), which reconstructs source from
48 /// byte-position fragments stored on the schema during `parse`,
49 /// `emit_pretty` walks tree-sitter `grammar.json` production rules
50 /// to render schemas built from scratch via `SchemaBuilder`.
51 ///
52 /// # Errors
53 ///
54 /// Returns [`ParseError::EmitFailed`] when the language has no
55 /// vendored `grammar.json`, when a vertex's kind is not a grammar
56 /// rule, or when a required field has no corresponding schema edge.
57 fn emit_pretty(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
58 self.emit_pretty_with_policy(schema, &crate::emit_pretty::FormatPolicy::default())
59 }
60
61 /// Render a by-construction [`Schema`] under a caller-supplied
62 /// [`FormatPolicy`](crate::emit_pretty::FormatPolicy).
63 ///
64 /// The policy governs every configurable aspect of the rendered
65 /// output: separator between glued tokens, newline byte sequence,
66 /// indent width, line-break and indent-open/close token sets. The
67 /// default policy (used by [`emit_pretty`](Self::emit_pretty))
68 /// targets syntactic validity with ASCII conventions; callers
69 /// supplying their own policy can pin idiomatic formatting.
70 ///
71 /// # Errors
72 ///
73 /// Returns [`ParseError::EmitFailed`] when the language has no
74 /// vendored `grammar.json`, when a vertex's kind is not a grammar
75 /// rule, or when a required field has no corresponding schema edge.
76 fn emit_pretty_with_policy(
77 &self,
78 schema: &Schema,
79 policy: &crate::emit_pretty::FormatPolicy,
80 ) -> Result<Vec<u8>, ParseError> {
81 let _ = (schema, policy);
82 Err(ParseError::EmitFailed {
83 protocol: self.protocol_name().to_owned(),
84 reason: format!(
85 "emit_pretty_with_policy not implemented for protocol '{}'",
86 self.protocol_name()
87 ),
88 })
89 }
90}
91
92/// Registry of all full-AST parsers, keyed by protocol name.
93///
94/// Provides language detection by file extension and dispatches parse/emit
95/// operations to the appropriate language parser.
96pub struct ParserRegistry {
97 /// Parsers keyed by protocol name.
98 ///
99 /// Held by `Arc` (not `Box`) so the same handle can be shared with
100 /// the layout-enrichment registry without re-wrapping at every
101 /// lookup. Registration installs both: the parser into `parsers`
102 /// and a thin adapter into the lens crate's enrichment registry.
103 parsers: FxHashMap<String, Arc<dyn AstParser>>,
104 /// Extension → protocol name mapping.
105 extension_map: FxHashMap<String, String>,
106}
107
108impl ParserRegistry {
109 /// Create a new registry populated with all enabled language parsers.
110 ///
111 /// With the `grammars` feature (default), this populates the registry from
112 /// `panproto-grammars`, which provides up to 259 tree-sitter languages.
113 /// Without the `grammars` feature, this returns an empty registry; call
114 /// [`register`](Self::register) to add parsers manually using individual
115 /// grammar crates.
116 #[must_use]
117 pub fn new() -> Self {
118 let mut registry = Self {
119 parsers: FxHashMap::default(),
120 extension_map: FxHashMap::default(),
121 };
122
123 #[cfg(feature = "grammars")]
124 for grammar in panproto_grammars::grammars() {
125 let config = crate::languages::walker_configs::walker_config_for(grammar.name);
126 match crate::languages::common::LanguageParser::from_language_with_grammar_json(
127 grammar.name,
128 grammar.extensions.to_vec(),
129 grammar.language,
130 grammar.node_types,
131 grammar.tags_query,
132 config,
133 grammar.grammar_json,
134 ) {
135 Ok(p) => registry.register(Box::new(p)),
136 Err(err) => {
137 let _ = err;
138 #[cfg(debug_assertions)]
139 eprintln!(
140 "warning: grammar '{}' theory extraction failed: {err}",
141 grammar.name
142 );
143 }
144 }
145 }
146
147 registry
148 }
149
150 /// Register a parser implementation.
151 ///
152 /// In addition to keying the parser by its protocol name, this
153 /// installs a [`LayoutEnricher`](panproto_lens::enrichment_registry::LayoutEnricher)
154 /// adapter into the global enrichment registry so that a
155 /// `parse_emit_protolens(protocol, …)` instantiation finds a
156 /// synthesis driver without any further wiring.
157 pub fn register(&mut self, parser: Box<dyn AstParser>) {
158 let name = parser.protocol_name().to_owned();
159 for ext in parser.supported_extensions() {
160 self.extension_map.insert((*ext).to_owned(), name.clone());
161 }
162 let arc: Arc<dyn AstParser> = Arc::from(parser);
163 crate::decorate::register_layout_enricher(Arc::clone(&arc));
164 self.parsers.insert(name, arc);
165 }
166
167 /// Register a tree-sitter language as a full-AST parser.
168 ///
169 /// Used by `panproto-grammars-*` companion crates that ship grammars
170 /// outside the default `panproto-grammars` build. The byte-slice
171 /// arguments must outlive this registry; the canonical pattern is
172 /// for the companion to bake the data into `&'static` rodata at
173 /// compile time and pass references that are valid for the process
174 /// lifetime.
175 ///
176 /// `walker_config` is looked up by `name` from the bundled per-language
177 /// configuration table. Languages without a tailored configuration
178 /// fall back to the default walker config.
179 ///
180 /// # Errors
181 ///
182 /// Returns [`ParseError`] if theory extraction from `node_types_json`
183 /// fails or if the tags query rejects compilation.
184 pub fn register_external_grammar(
185 &mut self,
186 name: &'static str,
187 extensions: Vec<&'static str>,
188 language: tree_sitter::Language,
189 node_types_json: &'static [u8],
190 tags_query: Option<&'static str>,
191 grammar_json: Option<&'static [u8]>,
192 ) -> Result<(), crate::error::ParseError> {
193 let config = crate::languages::walker_configs::walker_config_for(name);
194 let parser = crate::languages::common::LanguageParser::from_language_with_grammar_json(
195 name,
196 extensions,
197 language,
198 node_types_json,
199 tags_query,
200 config,
201 grammar_json,
202 )?;
203 self.register(Box::new(parser));
204 Ok(())
205 }
206
207 /// Owned-data variant of [`register_external_grammar`](Self::register_external_grammar).
208 ///
209 /// Accepts `String` / `Vec<u8>` rather than `&'static` references. The
210 /// caller is presumed not to have process-lifetime rodata available
211 /// (typical dev-time use: bytes read from disk via the Python binding's
212 /// override hook). To match the trait's `'static` lifetime requirement
213 /// the inputs are leaked into the heap; the leak is one-time per
214 /// override.
215 ///
216 /// This is the registration primitive for grammar-author workflows
217 /// where a grammar's `parser.c` / `grammar.json` / `node-types.json`
218 /// are evolving outside the panproto release cadence. Production
219 /// builds should continue to use [`register_external_grammar`](Self::register_external_grammar) with
220 /// `'static` data baked into the binary at compile time.
221 ///
222 /// # Errors
223 ///
224 /// Returns [`ParseError`] if theory extraction or tags-query
225 /// compilation fails.
226 pub fn register_external_grammar_owned(
227 &mut self,
228 name: String,
229 extensions: Vec<String>,
230 language: tree_sitter::Language,
231 node_types_json: Vec<u8>,
232 tags_query: Option<String>,
233 grammar_json: Option<Vec<u8>>,
234 ) -> Result<(), crate::error::ParseError> {
235 let name_static: &'static str = Box::leak(name.into_boxed_str());
236 let extensions_static: Vec<&'static str> = extensions
237 .into_iter()
238 .map(|s| Box::leak(s.into_boxed_str()) as &'static str)
239 .collect();
240 let node_types_static: &'static [u8] = Box::leak(node_types_json.into_boxed_slice());
241 let tags_query_static: Option<&'static str> =
242 tags_query.map(|s| Box::leak(s.into_boxed_str()) as &'static str);
243 let grammar_json_static: Option<&'static [u8]> =
244 grammar_json.map(|v| Box::leak(v.into_boxed_slice()) as &'static [u8]);
245
246 self.register_external_grammar(
247 name_static,
248 extensions_static,
249 language,
250 node_types_static,
251 tags_query_static,
252 grammar_json_static,
253 )
254 }
255
256 /// Remove a registration by protocol name.
257 ///
258 /// Drops the parser and any extension mappings that pointed at it.
259 /// Returns `true` if a parser was removed, `false` if no such
260 /// registration existed. Primarily intended for grammar-author
261 /// workflows where a registered grammar is being replaced by a
262 /// freshly-compiled version mid-process.
263 pub fn unregister(&mut self, name: &str) -> bool {
264 let removed = self.parsers.remove(name).is_some();
265 if removed {
266 self.extension_map.retain(|_, v| v != name);
267 }
268 removed
269 }
270
271 /// Override a registered grammar with new owned data.
272 ///
273 /// Equivalent to [`unregister`](Self::unregister) followed by
274 /// [`register_external_grammar_owned`](Self::register_external_grammar_owned),
275 /// and intended for the same grammar-author dev workflow. Any
276 /// extension mappings previously bound to `name` are replaced by
277 /// the new `extensions`.
278 ///
279 /// # Errors
280 ///
281 /// Returns [`ParseError`] if theory extraction or tags-query
282 /// compilation fails on the new grammar; in that case the prior
283 /// registration is already gone.
284 pub fn override_grammar(
285 &mut self,
286 name: String,
287 extensions: Vec<String>,
288 language: tree_sitter::Language,
289 node_types_json: Vec<u8>,
290 tags_query: Option<String>,
291 grammar_json: Option<Vec<u8>>,
292 ) -> Result<(), crate::error::ParseError> {
293 self.unregister(&name);
294 self.register_external_grammar_owned(
295 name,
296 extensions,
297 language,
298 node_types_json,
299 tags_query,
300 grammar_json,
301 )
302 }
303
304 /// Detect the language protocol for a file path by its extension.
305 ///
306 /// Returns `None` if the extension is not recognized (caller should
307 /// fall back to the `raw_file` protocol).
308 #[must_use]
309 pub fn detect_language(&self, path: &Path) -> Option<&str> {
310 path.extension()
311 .and_then(|ext| ext.to_str())
312 .and_then(|ext| self.extension_map.get(ext))
313 .map(String::as_str)
314 }
315
316 /// Parse a file by detecting its language from the file path.
317 ///
318 /// # Errors
319 ///
320 /// Returns [`ParseError::UnknownLanguage`] if the file extension is not recognized.
321 /// Returns other [`ParseError`] variants if parsing fails.
322 pub fn parse_file(&self, path: &Path, content: &[u8]) -> Result<Schema, ParseError> {
323 let protocol = self
324 .detect_language(path)
325 .ok_or_else(|| ParseError::UnknownLanguage {
326 extension: path
327 .extension()
328 .and_then(|e| e.to_str())
329 .unwrap_or("")
330 .to_owned(),
331 })?;
332
333 self.parse_with_protocol(protocol, content, &path.display().to_string())
334 }
335
336 /// Parse source code with a specific protocol name.
337 ///
338 /// # Errors
339 ///
340 /// Returns [`ParseError::UnknownLanguage`] if the protocol is not registered.
341 pub fn parse_with_protocol(
342 &self,
343 protocol: &str,
344 content: &[u8],
345 file_path: &str,
346 ) -> Result<Schema, ParseError> {
347 let parser = self
348 .parsers
349 .get(protocol)
350 .ok_or_else(|| ParseError::UnknownLanguage {
351 extension: protocol.to_owned(),
352 })?;
353
354 parser.parse(content, file_path)
355 }
356
357 /// Emit a schema back to source code bytes using the specified protocol.
358 ///
359 /// # Errors
360 ///
361 /// Returns [`ParseError::UnknownLanguage`] if the protocol is not registered.
362 pub fn emit_with_protocol(
363 &self,
364 protocol: &str,
365 schema: &Schema,
366 ) -> Result<Vec<u8>, ParseError> {
367 let parser = self
368 .parsers
369 .get(protocol)
370 .ok_or_else(|| ParseError::UnknownLanguage {
371 extension: protocol.to_owned(),
372 })?;
373
374 parser.emit(schema)
375 }
376
377 /// Render a by-construction schema using the named protocol.
378 ///
379 /// # Errors
380 ///
381 /// Returns [`ParseError::UnknownLanguage`] if the protocol is not
382 /// registered, or [`ParseError::EmitFailed`] from the underlying
383 /// parser's `emit_pretty`.
384 pub fn emit_pretty_with_protocol(
385 &self,
386 protocol: &str,
387 schema: &Schema,
388 ) -> Result<Vec<u8>, ParseError> {
389 let parser = self
390 .parsers
391 .get(protocol)
392 .ok_or_else(|| ParseError::UnknownLanguage {
393 extension: protocol.to_owned(),
394 })?;
395
396 parser.emit_pretty(schema)
397 }
398
399 /// Decorate an [`AbstractSchema`] with the layout enrichment
400 /// fibre required by `emit_pretty_with_protocol` and friends.
401 ///
402 /// This is the put-direction of the parse / decorate / emit lens
403 /// at `protocol`. The implementation routes through the same
404 /// grammar walker as `emit_pretty` followed by `parse`, so the
405 /// resulting [`DecoratedSchema`] carries a complete layout fibre
406 /// recovered by the parse-side walker — `start-byte`, `end-byte`,
407 /// every `interstitial-N`, `chose-alt-fingerprint`, and
408 /// `chose-alt-child-kinds`.
409 ///
410 /// The section law holds up to kind- and edge-multiset
411 /// equivalence: `forget_layout(decorate(a)) ≅ a` modulo vertex-id
412 /// renaming. Grammars where parsing consolidates tokens that the
413 /// emitter rendered as separate sequences (e.g. lilypond's `c'4`
414 /// re-parses to a single note) do not preserve a one-to-one
415 /// vertex correspondence, so the result's vertex IDs are always
416 /// freshly minted by the parser.
417 ///
418 /// # Errors
419 ///
420 /// Returns [`ParseError::UnknownLanguage`] when `protocol` is not
421 /// registered, [`ParseError::SchemaConstruction`] when the
422 /// abstract schema was built for a different protocol than
423 /// `protocol`, [`ParseError::EmitFailed`] when the grammar walker
424 /// cannot render the abstract schema (missing `grammar.json`,
425 /// vertex kind not a rule), or any other parser error if the
426 /// re-parse step rejects the canonical bytes (a regression in the
427 /// parse/emit pipeline, not a user bug).
428 pub fn decorate(
429 &self,
430 protocol: &str,
431 abstract_schema: &AbstractSchema,
432 policy: &LayoutPolicy,
433 ) -> Result<DecoratedSchema, ParseError> {
434 let parser = self
435 .parsers
436 .get(protocol)
437 .ok_or_else(|| ParseError::UnknownLanguage {
438 extension: protocol.to_owned(),
439 })?;
440 // `decorate_with_parser` enforces the protocol-match invariant
441 // between the parser and the abstract schema, so no extra guard
442 // is needed here.
443 crate::decorate::decorate_with_parser(parser.as_ref(), abstract_schema, policy)
444 }
445
446 /// Render an [`AbstractSchema`] to canonical source bytes under
447 /// `policy`.
448 ///
449 /// Implementation note: this is exactly the first emit step of
450 /// [`decorate`](Self::decorate) — `decorate` then re-parses to
451 /// recover the layout fibre, but if all the caller wants is the
452 /// bytes, the re-parse is wasted work. Going through
453 /// `emit_pretty_with_policy` directly preserves every field of
454 /// `policy` in the output (`separator`, `newline`, `indent_width`,
455 /// `line_break_after`, `indent_open` / `indent_close`).
456 ///
457 /// # Errors
458 ///
459 /// See [`decorate`](Self::decorate).
460 pub fn pretty_with_protocol(
461 &self,
462 protocol: &str,
463 abstract_schema: &AbstractSchema,
464 policy: &LayoutPolicy,
465 ) -> Result<Vec<u8>, ParseError> {
466 let parser = self
467 .parsers
468 .get(protocol)
469 .ok_or_else(|| ParseError::UnknownLanguage {
470 extension: protocol.to_owned(),
471 })?;
472 check_protocol_match(
473 protocol,
474 abstract_schema.as_schema(),
475 "pretty_with_protocol",
476 )?;
477 parser.emit_pretty_with_policy(abstract_schema.as_schema(), policy)
478 }
479
480 /// Return the canonical [`Protolens`](panproto_lens::Protolens)
481 /// describing the parse / decorate / emit relationship at
482 /// `protocol`.
483 ///
484 /// The protolens encodes the schema-level structure of the
485 /// relationship: source-side strips the layout enrichment fibre,
486 /// target-side adds it via the registered
487 /// [`LayoutEnricher`](panproto_lens::enrichment_registry::LayoutEnricher).
488 /// It composes with the rest of the `panproto-lens` protolens
489 /// algebra for chain-law reasoning. The operational entry points
490 /// for running the relationship on real schemas are
491 /// [`decorate`](Self::decorate),
492 /// [`pretty_with_protocol`](Self::pretty_with_protocol), and
493 /// [`emit_pretty_with_protocol`](Self::emit_pretty_with_protocol).
494 ///
495 /// # Errors
496 ///
497 /// Returns [`ParseError::UnknownLanguage`] when `protocol` is not
498 /// registered.
499 pub fn parse_emit_protolens(
500 &self,
501 protocol: &str,
502 policy: &LayoutPolicy,
503 ) -> Result<panproto_lens::Protolens, ParseError> {
504 if !self.parsers.contains_key(protocol) {
505 return Err(ParseError::UnknownLanguage {
506 extension: protocol.to_owned(),
507 });
508 }
509 Ok(crate::parse_emit_protolens::parse_emit_protolens(
510 protocol, policy,
511 ))
512 }
513
514 /// Get the theory metadata for a specific protocol.
515 #[must_use]
516 pub fn theory_meta(&self, protocol: &str) -> Option<&ExtractedTheoryMeta> {
517 self.parsers.get(protocol).map(|p| p.theory_meta())
518 }
519
520 /// List all registered protocol names.
521 pub fn protocol_names(&self) -> impl Iterator<Item = &str> {
522 self.parsers.keys().map(String::as_str)
523 }
524
525 /// O(1) lookup: is a parser already registered for `protocol`?
526 ///
527 /// Useful for dedup at the registration boundary. The umbrella
528 /// `panproto-grammars-all` companion pack overlaps with both the
529 /// built-in core grammars and every per-group pack; callers can
530 /// short-circuit before re-registering rather than scanning
531 /// `protocol_names()` linearly.
532 #[must_use]
533 pub fn has_parser(&self, protocol: &str) -> bool {
534 self.parsers.contains_key(protocol)
535 }
536
537 /// Get the number of registered parsers.
538 #[must_use]
539 pub fn len(&self) -> usize {
540 self.parsers.len()
541 }
542
543 /// Check if the registry is empty.
544 #[must_use]
545 pub fn is_empty(&self) -> bool {
546 self.parsers.is_empty()
547 }
548}
549
550impl Default for ParserRegistry {
551 fn default() -> Self {
552 Self::new()
553 }
554}
555
556/// Guard against running parser-tied operations on a schema built
557/// for a different protocol. Catches the user-visible error of
558/// passing (say) a JSON schema to a Python parser before the
559/// underlying grammar walker would surface it as an opaque rule
560/// mismatch.
561fn check_protocol_match(
562 expected: &str,
563 schema: &Schema,
564 operation: &'static str,
565) -> Result<(), ParseError> {
566 if schema.protocol == expected {
567 Ok(())
568 } else {
569 Err(ParseError::SchemaConstruction {
570 reason: format!(
571 "{operation}: protocol mismatch — registry called with '{expected}' but \
572 schema carries protocol '{}'",
573 schema.protocol,
574 ),
575 })
576 }
577}