panproto_parse/emit_pretty.rs
1#![allow(
2 clippy::module_name_repetitions,
3 clippy::too_many_lines,
4 clippy::too_many_arguments,
5 clippy::map_unwrap_or,
6 clippy::option_if_let_else,
7 clippy::elidable_lifetime_names,
8 clippy::items_after_statements,
9 clippy::needless_pass_by_value,
10 clippy::single_match_else,
11 clippy::manual_let_else,
12 clippy::match_same_arms,
13 clippy::missing_const_for_fn,
14 clippy::single_char_pattern,
15 clippy::naive_bytecount,
16 clippy::expect_used,
17 clippy::redundant_pub_crate,
18 clippy::used_underscore_binding,
19 clippy::redundant_field_names,
20 clippy::struct_field_names,
21 clippy::redundant_else,
22 clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//! matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//! actual child kind, or whose terminals appear in the rendered
46//! prefix; falls back to the first non-`BLANK` alternative when no
47//! alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//! child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//! constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//! (identifiers, numbers, quoted strings). The walker emits a
54//! `literal-value` constraint when present and otherwise falls
55//! back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//! handled transparently (the inner content is emitted; the
58//! wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91#[non_exhaustive]
92pub enum Production {
93 /// Concatenation of productions.
94 #[serde(rename = "SEQ")]
95 Seq {
96 /// Ordered members; each is emitted in turn.
97 members: Vec<Self>,
98 },
99 /// Alternation between productions.
100 #[serde(rename = "CHOICE")]
101 Choice {
102 /// Alternatives; the walker picks one based on the schema's
103 /// children and constraints.
104 members: Vec<Self>,
105 },
106 /// Zero-or-more repetition.
107 #[serde(rename = "REPEAT")]
108 Repeat {
109 /// The repeated body.
110 content: Box<Self>,
111 },
112 /// One-or-more repetition.
113 #[serde(rename = "REPEAT1")]
114 Repeat1 {
115 /// The repeated body.
116 content: Box<Self>,
117 },
118 /// Optional inclusion (zero or one).
119 ///
120 /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
121 /// BLANK }`, but recent generator versions also emit explicit
122 /// `OPTIONAL` nodes; both shapes are accepted.
123 #[serde(rename = "OPTIONAL")]
124 Optional {
125 /// The optional body.
126 content: Box<Self>,
127 },
128 /// Reference to another rule by name.
129 #[serde(rename = "SYMBOL")]
130 Symbol {
131 /// Name of the referenced rule (matches a vertex kind on the
132 /// schema side).
133 name: String,
134 },
135 /// Literal token bytes.
136 #[serde(rename = "STRING")]
137 String {
138 /// The literal token. Emitted verbatim.
139 value: String,
140 },
141 /// Regex-matched terminal.
142 ///
143 /// At parse time this matches arbitrary bytes; at emit time the
144 /// walker substitutes a `literal-value` constraint when present
145 /// and falls back to a placeholder otherwise.
146 #[serde(rename = "PATTERN")]
147 Pattern {
148 /// The original regex.
149 value: String,
150 },
151 /// The empty production. Emits nothing.
152 #[serde(rename = "BLANK")]
153 Blank,
154 /// Named field over a content production.
155 ///
156 /// The field `name` matches an edge kind on the schema side; the
157 /// walker resolves the corresponding child vertex and recurses
158 /// into `content` with that child as context.
159 #[serde(rename = "FIELD")]
160 Field {
161 /// Field name (matches edge kind).
162 name: String,
163 /// The contents of the field.
164 content: Box<Self>,
165 },
166 /// An aliased production.
167 ///
168 /// `value` records the parser-visible kind; the walker emits
169 /// `content` and ignores the alias rename.
170 #[serde(rename = "ALIAS")]
171 Alias {
172 /// The aliased content.
173 content: Box<Self>,
174 /// Whether the alias is a named node.
175 #[serde(default)]
176 named: bool,
177 /// The alias's surface name.
178 #[serde(default)]
179 value: String,
180 },
181 /// A token wrapper.
182 ///
183 /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
184 /// lexical token; the walker emits the inner content unchanged.
185 #[serde(rename = "TOKEN")]
186 Token {
187 /// The wrapped content.
188 content: Box<Self>,
189 },
190 /// An immediate-token wrapper (no preceding whitespace).
191 ///
192 /// Treated like [`Production::Token`] for emit purposes.
193 #[serde(rename = "IMMEDIATE_TOKEN")]
194 ImmediateToken {
195 /// The wrapped content.
196 content: Box<Self>,
197 },
198 /// Precedence wrapper.
199 #[serde(rename = "PREC")]
200 Prec {
201 /// Precedence value (numeric or string). Ignored at emit time.
202 #[allow(dead_code)]
203 value: serde_json::Value,
204 /// The wrapped content.
205 content: Box<Self>,
206 },
207 /// Left-associative precedence wrapper.
208 #[serde(rename = "PREC_LEFT")]
209 PrecLeft {
210 /// Precedence value. Ignored at emit time.
211 #[allow(dead_code)]
212 value: serde_json::Value,
213 /// The wrapped content.
214 content: Box<Self>,
215 },
216 /// Right-associative precedence wrapper.
217 #[serde(rename = "PREC_RIGHT")]
218 PrecRight {
219 /// Precedence value. Ignored at emit time.
220 #[allow(dead_code)]
221 value: serde_json::Value,
222 /// The wrapped content.
223 content: Box<Self>,
224 },
225 /// Dynamic precedence wrapper.
226 #[serde(rename = "PREC_DYNAMIC")]
227 PrecDynamic {
228 /// Precedence value. Ignored at emit time.
229 #[allow(dead_code)]
230 value: serde_json::Value,
231 /// The wrapped content.
232 content: Box<Self>,
233 },
234 /// Reserved-word wrapper (tree-sitter ≥ 0.25).
235 ///
236 /// Tree-sitter's `RESERVED` rule marks an inner production as a
237 /// reserved-word context: the parser excludes the listed identifiers
238 /// from being treated as the inner symbol. The `context_name`
239 /// metadata names the reserved-word set; the emitter does not need
240 /// it (we are walking schema → bytes, not enforcing reserved-word
241 /// constraints), so we emit the inner content unchanged, the same
242 /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
243 #[serde(rename = "RESERVED")]
244 Reserved {
245 /// The wrapped content.
246 content: Box<Self>,
247 /// Name of the reserved-word context. Ignored at emit time.
248 #[allow(dead_code)]
249 #[serde(default)]
250 context_name: String,
251 },
252}
253
254/// A grammar's production-rule table, deserialized from `grammar.json`.
255///
256/// Only the fields the emitter consumes are decoded; precedences,
257/// conflicts, externals, and other parser-only metadata are ignored.
258#[derive(Debug, Clone, Deserialize)]
259#[non_exhaustive]
260pub struct Grammar {
261 /// Grammar name (e.g. `"rust"`, `"typescript"`).
262 #[allow(dead_code)]
263 pub name: String,
264 /// Map from rule name (a vertex kind on the schema side) to
265 /// production. Entries are kept in lexical order so iteration
266 /// is deterministic.
267 pub rules: BTreeMap<String, Production>,
268 /// Supertypes declared in the grammar's `supertypes` field. A
269 /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
270 /// references; tree-sitter parsers report a node's kind as one
271 /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
272 /// than the supertype name (`parameter`), so the emitter needs to
273 /// know that a child kind in a subtype set should match the
274 /// supertype name when a SYMBOL references it.
275 #[serde(default, deserialize_with = "deserialize_supertypes")]
276 pub supertypes: std::collections::HashSet<String>,
277 /// Tree-sitter `extras` rules: the named symbols (typically comments)
278 /// that tree-sitter skips at parse time but records as children of the
279 /// surrounding vertex. They appear nowhere in the production grammar,
280 /// so the rule walker cannot reconcile them against the cursor — the
281 /// emit pass therefore drains them as a side channel: at vertex entry
282 /// and between REPEAT iterations any leading extras-kind edges are
283 /// consumed and emitted directly. The set is populated at
284 /// `Grammar::from_bytes` by collecting every `SYMBOL { name }` and
285 /// named `ALIAS { value, named: true }` under the top-level `extras`
286 /// array. Pattern-only extras (e.g. `\s` whitespace) are not vertex
287 /// kinds and are excluded.
288 #[serde(default, deserialize_with = "deserialize_extras")]
289 pub extras: std::collections::HashSet<String>,
290 /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
291 /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
292 /// reference on the schema side.
293 ///
294 /// Built once at [`Grammar::from_bytes`] time by walking each
295 /// hidden rule (`_`-prefixed), declared supertype, and named
296 /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
297 /// recording the closure. This replaces the prior heuristic
298 /// `kind_satisfies_symbol` that walked the rule body on every
299 /// query: lookups are now O(1) and the relation is exactly the
300 /// transitive closure of "is reachable via hidden / supertype /
301 /// alias dispatch", with no over-expansion through non-hidden
302 /// non-supertype rule references.
303 #[serde(skip)]
304 pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
305 /// Precomputed Yield sets: `yield_sets[rule_name]` is the set of
306 /// concrete vertex kinds that can appear as the **first named
307 /// child** when that rule's production is taken.
308 ///
309 /// Defined inductively:
310 /// - `Yield(SYMBOL S)` where S is hidden/supertype = `Yield(rules[S])`
311 /// - `Yield(SYMBOL S)` where S is concrete = `{S}`
312 /// - `Yield(SEQ [M1, ...])` = `Yield(M1)` (only first member)
313 /// - `Yield(CHOICE [M1, ..., Mn])` = `⋃ Yield(Mi)`
314 /// - `Yield(OPTIONAL { c })` = `Yield(c) ∪ {ε}`
315 /// - `Yield(BLANK)` = `{ε}`
316 /// - Wrappers (PREC*, TOKEN, FIELD, REPEAT, etc.) = `Yield(content)`
317 /// - `Yield(STRING)` = `Yield(PATTERN)` = `∅`
318 /// - `Yield(ALIAS { value: V, named: true })` = `{V}`
319 ///
320 /// Epsilon is represented as the empty string `""`.
321 #[serde(skip)]
322 pub yield_sets: std::collections::HashMap<String, std::collections::HashSet<String>>,
323 /// Child kinds allowed per parent kind, derived from node-types.json.
324 /// Maps parent kind to the set of ALL named child kinds that tree-sitter's
325 /// parser can produce for that parent (from both `children.types` and
326 /// `fields.*.types`). Used by `augment_subtypes_from_node_types` to
327 /// close the grammar/parser divergence gap.
328 #[serde(skip)]
329 pub node_type_children: std::collections::HashMap<String, std::collections::HashSet<String>>,
330 /// Anonymous ALIAS values for external scanner tokens. Maps external
331 /// symbol name (e.g. `_ternary_qmark`) to the ALIAS value string
332 /// (e.g. `"?"`). Built by scanning grammar.json rule bodies for
333 /// `ALIAS { content: SYMBOL S, named: false, value: V }` where S
334 /// has no grammar rule.
335 #[serde(skip)]
336 pub external_alias_map: std::collections::HashMap<String, String>,
337 /// Rules whose `{`/`}` STRING tokens are inline delimiters (e.g.
338 /// string interpolation) rather than block scopes. Identified
339 /// structurally: a rule whose SEQ contains `{` and `}` but no
340 /// REPEAT/REPEAT1 between them.
341 #[serde(skip)]
342 pub inline_brace_rules: std::collections::HashSet<String>,
343}
344
345fn deserialize_supertypes<'de, D>(
346 deserializer: D,
347) -> Result<std::collections::HashSet<String>, D::Error>
348where
349 D: serde::Deserializer<'de>,
350{
351 let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
352 let mut out = std::collections::HashSet::new();
353 for entry in entries {
354 match entry {
355 serde_json::Value::String(s) => {
356 out.insert(s);
357 }
358 serde_json::Value::Object(map) => {
359 if let Some(serde_json::Value::String(name)) = map.get("name") {
360 out.insert(name.clone());
361 }
362 }
363 _ => {}
364 }
365 }
366 Ok(out)
367}
368
369fn deserialize_extras<'de, D>(
370 deserializer: D,
371) -> Result<std::collections::HashSet<String>, D::Error>
372where
373 D: serde::Deserializer<'de>,
374{
375 let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
376 let mut out = std::collections::HashSet::new();
377 for entry in entries {
378 if let serde_json::Value::Object(map) = entry {
379 let ty = map.get("type").and_then(serde_json::Value::as_str);
380 match ty {
381 // SYMBOL { name: K } — the extras rule is a named symbol
382 // (typically `line_comment` / `block_comment`). The kind
383 // K appears as a real child vertex on the schema side.
384 Some("SYMBOL") => {
385 if let Some(serde_json::Value::String(name)) = map.get("name") {
386 out.insert(name.clone());
387 }
388 }
389 // ALIAS { content, value: V, named: true } — the extras
390 // rule renames its content; V is the kind on the schema.
391 Some("ALIAS") => {
392 let named = map
393 .get("named")
394 .and_then(serde_json::Value::as_bool)
395 .unwrap_or(false);
396 if named {
397 if let Some(serde_json::Value::String(value)) = map.get("value") {
398 out.insert(value.clone());
399 }
400 }
401 }
402 // PATTERN / STRING / TOKEN entries describe inter-token
403 // whitespace and have no vertex-side representation.
404 _ => {}
405 }
406 }
407 }
408 Ok(out)
409}
410
411impl Grammar {
412 /// Parse a grammar's `grammar.json` bytes.
413 ///
414 /// Builds the subtyping closure as part of construction so every
415 /// downstream lookup is O(1). The closure is the least relation
416 /// containing `(K, K)` for every rule key `K` and closed under:
417 ///
418 /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
419 /// reach SYMBOL `K`, then `K ⊑ S`.
420 /// - supertype expansion: if `S` is in the grammar's supertypes
421 /// block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
422 /// - alias renaming: if a rule body contains
423 /// `ALIAS { content: SYMBOL R, value: A, named: true }` where
424 /// `R` reaches kind `K` (or `K = R` when no further hop), then
425 /// `A ⊑ R` and `K ⊑ A`.
426 ///
427 /// # Errors
428 ///
429 /// Returns [`ParseError::EmitFailed`] when the bytes are not a
430 /// valid `grammar.json` document.
431 pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
432 Self::from_bytes_with_node_types(protocol, bytes, None)
433 }
434
435 /// Parse a grammar from both `grammar.json` and optionally
436 /// `node-types.json` bytes.
437 ///
438 /// # Errors
439 ///
440 /// Returns [`ParseError::EmitFailed`] when `grammar_bytes` is
441 /// not a valid `grammar.json` document.
442 pub fn from_bytes_with_node_types(
443 protocol: &str,
444 grammar_bytes: &[u8],
445 node_types_bytes: Option<&[u8]>,
446 ) -> Result<Self, ParseError> {
447 let mut grammar: Self =
448 serde_json::from_slice(grammar_bytes).map_err(|e| ParseError::EmitFailed {
449 protocol: protocol.to_owned(),
450 reason: format!("grammar.json deserialization failed: {e}"),
451 })?;
452 grammar.subtypes = compute_subtype_closure(&grammar);
453 if let Some(nt_bytes) = node_types_bytes {
454 grammar.node_type_children = build_node_type_children(nt_bytes);
455 augment_subtypes_from_node_types(&mut grammar);
456 }
457 grammar.external_alias_map = build_external_alias_map(&grammar);
458 grammar.inline_brace_rules = identify_inline_brace_rules(&grammar);
459 grammar.yield_sets = compute_yield_sets(&grammar);
460 Ok(grammar)
461 }
462}
463
464/// Compute the subtyping relation as a forward-indexed map from a
465/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
466fn compute_subtype_closure(
467 grammar: &Grammar,
468) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
469 use std::collections::{HashMap, HashSet};
470 // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
471 // recorded whenever Y is reached by walking the grammar's
472 // ALIAS / hidden-rule / supertype dispatch from a position where
473 // K is the actual vertex kind.
474 let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
475 for name in grammar.rules.keys() {
476 subtypes
477 .entry(name.clone())
478 .or_default()
479 .insert(name.clone());
480 }
481
482 // First pass: collect the immediate "satisfies" edges from each
483 // expandable rule (hidden, supertype) to the kinds reachable by
484 // walking its body, plus alias edges.
485 fn walk<'g>(
486 grammar: &'g Grammar,
487 production: &'g Production,
488 visited: &mut HashSet<&'g str>,
489 out: &mut HashSet<String>,
490 ) {
491 match production {
492 Production::Symbol { name } => {
493 // Direct subtype.
494 out.insert(name.clone());
495 // Continue expansion through hidden / supertype rules
496 // so the closure traverses pass-through dispatch.
497 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
498 if expand && visited.insert(name.as_str()) {
499 if let Some(rule) = grammar.rules.get(name) {
500 walk(grammar, rule, visited, out);
501 }
502 }
503 }
504 Production::Choice { members } | Production::Seq { members } => {
505 for m in members {
506 walk(grammar, m, visited, out);
507 }
508 }
509 Production::Alias {
510 content,
511 named,
512 value,
513 } => {
514 if *named && !value.is_empty() {
515 out.insert(value.clone());
516 }
517 walk(grammar, content, visited, out);
518 }
519 Production::Repeat { content }
520 | Production::Repeat1 { content }
521 | Production::Optional { content }
522 | Production::Field { content, .. }
523 | Production::Token { content }
524 | Production::ImmediateToken { content }
525 | Production::Prec { content, .. }
526 | Production::PrecLeft { content, .. }
527 | Production::PrecRight { content, .. }
528 | Production::PrecDynamic { content, .. }
529 | Production::Reserved { content, .. } => {
530 walk(grammar, content, visited, out);
531 }
532 _ => {}
533 }
534 }
535
536 for (name, rule) in &grammar.rules {
537 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
538 if !expand {
539 continue;
540 }
541 let mut visited: HashSet<&str> = HashSet::new();
542 visited.insert(name.as_str());
543 let mut reachable: HashSet<String> = HashSet::new();
544 walk(grammar, rule, &mut visited, &mut reachable);
545 for kind in &reachable {
546 subtypes
547 .entry(kind.clone())
548 .or_default()
549 .insert(name.clone());
550 }
551 }
552
553 // Aliases: scan every rule body for ALIAS { content, value }
554 // declarations. The kinds reachable from `content` satisfy
555 // `value`, AND (by construction) `value` satisfies the
556 // surrounding rule. Walking the ENTIRE grammar once captures
557 // every alias site, irrespective of which rule introduces it.
558 fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
559 match production {
560 Production::Alias {
561 content,
562 named,
563 value,
564 } => {
565 if *named && !value.is_empty() {
566 out.push((value.clone(), content.as_ref()));
567 }
568 collect_aliases(content, out);
569 }
570 Production::Choice { members } | Production::Seq { members } => {
571 for m in members {
572 collect_aliases(m, out);
573 }
574 }
575 Production::Repeat { content }
576 | Production::Repeat1 { content }
577 | Production::Optional { content }
578 | Production::Field { content, .. }
579 | Production::Token { content }
580 | Production::ImmediateToken { content }
581 | Production::Prec { content, .. }
582 | Production::PrecLeft { content, .. }
583 | Production::PrecRight { content, .. }
584 | Production::PrecDynamic { content, .. }
585 | Production::Reserved { content, .. } => {
586 collect_aliases(content, out);
587 }
588 _ => {}
589 }
590 }
591 let mut aliases: Vec<(String, &Production)> = Vec::new();
592 for rule in grammar.rules.values() {
593 collect_aliases(rule, &mut aliases);
594 }
595 for (alias_value, content) in aliases {
596 let mut visited: HashSet<&str> = HashSet::new();
597 let mut reachable: HashSet<String> = HashSet::new();
598 walk(grammar, content, &mut visited, &mut reachable);
599 // Aliased value satisfies itself and is satisfied by every
600 // kind its content can reach.
601 subtypes
602 .entry(alias_value.clone())
603 .or_default()
604 .insert(alias_value.clone());
605 for kind in reachable {
606 subtypes
607 .entry(kind)
608 .or_default()
609 .insert(alias_value.clone());
610 }
611 }
612
613 // Transitive close: `K ⊑ A` and `A ⊑ B` implies `K ⊑ B`. Iterate
614 // a few rounds; the relation is small so a quick fixed-point
615 // suffices in practice.
616 for _ in 0..8 {
617 let snapshot = subtypes.clone();
618 let mut changed = false;
619 for (kind, supers) in &snapshot {
620 let extra: HashSet<String> = supers
621 .iter()
622 .flat_map(|s| snapshot.get(s).cloned().unwrap_or_default())
623 .collect();
624 let entry = subtypes.entry(kind.clone()).or_default();
625 for s in extra {
626 if entry.insert(s) {
627 changed = true;
628 }
629 }
630 }
631 if !changed {
632 break;
633 }
634 }
635
636 subtypes
637}
638
639/// Compute the Yield set for every rule in the grammar.
640///
641/// `Yield(P)` is the set of concrete vertex kinds that can appear as
642/// the first named child when production P is taken. See the
643/// `Grammar::yield_sets` doc comment for the inductive definition.
644fn compute_yield_sets(
645 grammar: &Grammar,
646) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
647 let mut cache: std::collections::HashMap<String, std::collections::HashSet<String>> =
648 std::collections::HashMap::new();
649 for (name, rule) in &grammar.rules {
650 if cache.contains_key(name) {
651 continue;
652 }
653 let mut visited = std::collections::HashSet::new();
654 let ys = yield_of_production(grammar, rule, &mut visited, &mut cache);
655 cache.insert(name.clone(), ys);
656 }
657 cache
658}
659
660/// Compute the Yield set of an arbitrary production node.
661///
662/// Uses `cache` (the partially-built `yield_sets` map) as
663/// memoization. `visited` tracks the current recursion path to
664/// detect cycles through hidden/supertype rules; a cycle returns ∅
665/// (a cycle that never passes through a concrete named symbol
666/// cannot produce a first child).
667fn yield_of_production(
668 grammar: &Grammar,
669 production: &Production,
670 visited: &mut std::collections::HashSet<String>,
671 cache: &mut std::collections::HashMap<String, std::collections::HashSet<String>>,
672) -> std::collections::HashSet<String> {
673 match production {
674 Production::Symbol { name } => {
675 if let Some(cached) = cache.get(name) {
676 return cached.clone();
677 }
678 let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
679 if expand {
680 if !visited.insert(name.clone()) {
681 return std::collections::HashSet::new();
682 }
683 let result = if let Some(rule) = grammar.rules.get(name) {
684 yield_of_production(grammar, rule, visited, cache)
685 } else {
686 std::collections::HashSet::new()
687 };
688 visited.remove(name);
689 cache.insert(name.clone(), result.clone());
690 result
691 } else {
692 let mut set = std::collections::HashSet::new();
693 set.insert(name.clone());
694 set
695 }
696 }
697 Production::Alias {
698 content,
699 named,
700 value,
701 } => {
702 if *named && !value.is_empty() {
703 let mut set = std::collections::HashSet::new();
704 set.insert(value.clone());
705 set
706 } else {
707 yield_of_production(grammar, content, visited, cache)
708 }
709 }
710 Production::Seq { members } => {
711 if members.is_empty() {
712 let mut set = std::collections::HashSet::new();
713 set.insert(String::new());
714 set
715 } else {
716 // Walk the SEQ members left-to-right, returning the
717 // Yield of the first member that can produce a named
718 // child. STRING and PATTERN yield ∅ (anonymous
719 // tokens); skip them to reach the first named-child-
720 // producing position. This handles hidden rules like
721 // `_initializer = SEQ ["=", FIELD { value, ... }]`
722 // where the leading "=" is a STRING.
723 for m in members {
724 let ys = yield_of_production(grammar, m, visited, cache);
725 if !ys.is_empty() {
726 return ys;
727 }
728 }
729 std::collections::HashSet::new()
730 }
731 }
732 Production::Choice { members } => {
733 let mut union = std::collections::HashSet::new();
734 for m in members {
735 union.extend(yield_of_production(grammar, m, visited, cache));
736 }
737 union
738 }
739 Production::Optional { content } => {
740 let mut set = yield_of_production(grammar, content, visited, cache);
741 set.insert(String::new());
742 set
743 }
744 Production::Blank => {
745 let mut set = std::collections::HashSet::new();
746 set.insert(String::new());
747 set
748 }
749 Production::String { .. } | Production::Pattern { .. } => std::collections::HashSet::new(),
750 Production::Repeat { content }
751 | Production::Repeat1 { content }
752 | Production::Field { content, .. }
753 | Production::Token { content }
754 | Production::ImmediateToken { content }
755 | Production::Prec { content, .. }
756 | Production::PrecLeft { content, .. }
757 | Production::PrecRight { content, .. }
758 | Production::PrecDynamic { content, .. }
759 | Production::Reserved { content, .. } => {
760 yield_of_production(grammar, content, visited, cache)
761 }
762 }
763}
764
765// ═══════════════════════════════════════════════════════════════════
766// node-types.json integration
767// ═══════════════════════════════════════════════════════════════════
768
769/// Parse node-types.json and build a map from parent kind to the set
770/// of all named child kinds the parser can produce for that parent.
771fn build_node_type_children(
772 nt_bytes: &[u8],
773) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
774 let node_types: Vec<crate::theory_extract::NodeType> = match serde_json::from_slice(nt_bytes) {
775 Ok(v) => v,
776 Err(_) => return std::collections::HashMap::new(),
777 };
778 let mut map: std::collections::HashMap<String, std::collections::HashSet<String>> =
779 std::collections::HashMap::new();
780 for entry in &node_types {
781 if !entry.named {
782 continue;
783 }
784 let mut child_kinds = std::collections::HashSet::new();
785 for field_value in entry.fields.values() {
786 if let Some(types) = field_value.get("types").and_then(|t| t.as_array()) {
787 for t in types {
788 if let (Some(name), Some(true)) = (
789 t.get("type").and_then(|n| n.as_str()),
790 t.get("named").and_then(serde_json::Value::as_bool),
791 ) {
792 child_kinds.insert(name.to_owned());
793 }
794 }
795 }
796 }
797 if let Some(ref children) = entry.children {
798 for t in &children.types {
799 if t.named {
800 child_kinds.insert(t.node_type.clone());
801 }
802 }
803 }
804 if !child_kinds.is_empty() {
805 map.insert(entry.node_type.clone(), child_kinds);
806 }
807 }
808 map
809}
810
811/// Augment `grammar.subtypes` with child-kind data from node-types.json.
812///
813/// For each parent kind P with node-type children, for each SYMBOL S
814/// referenced in P's grammar rule, for each child kind C in
815/// `node_type_children[P]`: if C does not already satisfy S, record
816/// C satisfies S. This closes the grammar/parser divergence where
817/// tree-sitter's parser produces child kinds not reachable from
818/// grammar.json's production rules.
819fn augment_subtypes_from_node_types(grammar: &mut Grammar) {
820 let pairs: Vec<(String, String)> = grammar
821 .node_type_children
822 .iter()
823 .flat_map(|(parent_kind, allowed_children)| {
824 let symbols: Vec<&str> = grammar
825 .rules
826 .get(parent_kind)
827 .map(|rule| referenced_symbols(rule))
828 .unwrap_or_default();
829 let mut out = Vec::new();
830 for child_kind in allowed_children {
831 // Only augment if this child kind doesn't already
832 // satisfy ANY symbol referenced by this parent's rule.
833 // If it already has a home (e.g. `integer` satisfies
834 // `_right_hand_side`), adding it as satisfying `type`
835 // would cause CHOICE dispatch to pick the wrong
836 // alternative.
837 let already_satisfies_some = symbols
838 .iter()
839 .any(|s| kind_satisfies_symbol(grammar, Some(child_kind), s));
840 if already_satisfies_some {
841 continue;
842 }
843 for sym_name in &symbols {
844 out.push((child_kind.clone(), (*sym_name).to_owned()));
845 }
846 }
847 out
848 })
849 .collect();
850 for (child_kind, sym_name) in pairs {
851 grammar
852 .subtypes
853 .entry(child_kind)
854 .or_default()
855 .insert(sym_name);
856 }
857}
858
859/// Build a map from external scanner symbol names to their anonymous
860/// ALIAS values by walking every rule body in the grammar.
861fn build_external_alias_map(grammar: &Grammar) -> std::collections::HashMap<String, String> {
862 let mut map = std::collections::HashMap::new();
863 fn walk(
864 grammar: &Grammar,
865 prod: &Production,
866 map: &mut std::collections::HashMap<String, String>,
867 ) {
868 match prod {
869 Production::Alias {
870 content,
871 named,
872 value,
873 } => {
874 if !*named && !value.is_empty() {
875 if let Production::Symbol { name } = content.as_ref() {
876 if name.starts_with('_') && !grammar.rules.contains_key(name) {
877 map.entry(name.clone()).or_insert_with(|| value.clone());
878 }
879 }
880 }
881 walk(grammar, content, map);
882 }
883 Production::Choice { members } | Production::Seq { members } => {
884 for m in members {
885 walk(grammar, m, map);
886 }
887 }
888 Production::Repeat { content }
889 | Production::Repeat1 { content }
890 | Production::Optional { content }
891 | Production::Field { content, .. }
892 | Production::Token { content }
893 | Production::ImmediateToken { content }
894 | Production::Prec { content, .. }
895 | Production::PrecLeft { content, .. }
896 | Production::PrecRight { content, .. }
897 | Production::PrecDynamic { content, .. }
898 | Production::Reserved { content, .. } => walk(grammar, content, map),
899 _ => {}
900 }
901 }
902 for rule in grammar.rules.values() {
903 walk(grammar, rule, &mut map);
904 }
905 map
906}
907
908/// Identify rules whose `{`/`}` tokens are inline delimiters (e.g.
909/// interpolation) rather than block scopes. A rule is inline-brace
910/// iff its production SEQ contains both an opening brace token and
911/// `}`, and the members between them contain no REPEAT/REPEAT1
912/// (which would indicate a statement-list block).
913fn identify_inline_brace_rules(grammar: &Grammar) -> std::collections::HashSet<String> {
914 fn is_inline_brace_body(prod: &Production) -> bool {
915 match prod {
916 Production::Seq { members } => {
917 let open_idx = members.iter().position(|m| match m {
918 Production::String { value } => value.contains('{'),
919 _ => false,
920 });
921 let close_idx = members
922 .iter()
923 .rposition(|m| matches!(m, Production::String { value } if value == "}"));
924 if let (Some(open), Some(close)) = (open_idx, close_idx) {
925 if open < close {
926 let between = &members[open + 1..close];
927 return !has_repeat(between);
928 }
929 }
930 false
931 }
932 Production::Prec { content, .. }
933 | Production::PrecLeft { content, .. }
934 | Production::PrecRight { content, .. }
935 | Production::PrecDynamic { content, .. }
936 | Production::Token { content }
937 | Production::ImmediateToken { content }
938 | Production::Reserved { content, .. } => is_inline_brace_body(content),
939 _ => false,
940 }
941 }
942 fn has_repeat(members: &[Production]) -> bool {
943 members.iter().any(|m| match m {
944 Production::Repeat { .. } | Production::Repeat1 { .. } => true,
945 Production::Prec { content, .. }
946 | Production::PrecLeft { content, .. }
947 | Production::PrecRight { content, .. }
948 | Production::PrecDynamic { content, .. } => {
949 matches!(
950 content.as_ref(),
951 Production::Repeat { .. } | Production::Repeat1 { .. }
952 )
953 }
954 _ => false,
955 })
956 }
957 let mut result = std::collections::HashSet::new();
958 for (name, rule) in &grammar.rules {
959 if is_inline_brace_body(rule) {
960 result.insert(name.clone());
961 }
962 }
963 result
964}
965
966// ═══════════════════════════════════════════════════════════════════
967// Format policy
968// ═══════════════════════════════════════════════════════════════════
969
/// Whitespace and indentation policy applied during emission.
///
/// The default policy (see the `Default` impl) uses a two-space
/// indent, a single space as the token separator, `"\n"` as the
/// newline, breaks the line after `;` / `{` / `}`, and tracks indent
/// on `{` / `}` boundaries. Per-language overrides (idiomatic indent
/// width, trailing-comma rules, blank-line conventions) can ride
/// alongside this struct in a follow-up branch; today's defaults aim
/// only for syntactic validity.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FormatPolicy {
    /// Number of spaces per indent level. Default is `2`.
    pub indent_width: usize,
    /// Separator inserted between adjacent terminals that the lexer
    /// would otherwise glue together (word ↔ word, operator ↔ operator).
    /// Default is a single space.
    pub separator: String,
    /// Newline byte sequence emitted after `line_break_after` tokens
    /// and at end-of-output. Default is `"\n"`.
    pub newline: String,
    /// Tokens after which the walker breaks to a new line.
    /// Default is `;`, `{` and `}`.
    pub line_break_after: Vec<String>,
    /// Tokens that increase indent on emission. Default is `{`.
    pub indent_open: Vec<String>,
    /// Tokens that decrease indent on emission. Default is `}`.
    pub indent_close: Vec<String>,
}
996
997impl Default for FormatPolicy {
998 fn default() -> Self {
999 Self {
1000 indent_width: 2,
1001 separator: " ".to_owned(),
1002 newline: "\n".to_owned(),
1003 line_break_after: vec![";".into(), "{".into(), "}".into()],
1004 indent_open: vec!["{".into()],
1005 indent_close: vec!["}".into()],
1006 }
1007 }
1008}
1009
1010// ═══════════════════════════════════════════════════════════════════
1011// Emitter
1012// ═══════════════════════════════════════════════════════════════════
1013
1014/// Emit a by-construction schema to source bytes.
1015///
1016/// `protocol` is the grammar / language name (used in error messages
1017/// and to label the entry point).
1018///
1019/// The walker treats `schema.entries` as the ordered list of root
1020/// vertices, falling back to a deterministic by-id ordering when
1021/// `entries` is empty. Each root is emitted using the production
1022/// associated with its kind in `grammar.rules`.
1023///
1024/// # Errors
1025///
1026/// Returns [`ParseError::EmitFailed`] when:
1027///
1028/// - the schema has no vertices
1029/// - a root vertex's kind is not a grammar rule
1030/// - a `SYMBOL` reference points at a kind with no rule and no schema
1031/// child to resolve it to
1032/// - a required `FIELD` has no corresponding edge in the schema
1033pub fn emit_pretty(
1034 protocol: &str,
1035 schema: &Schema,
1036 grammar: &Grammar,
1037 policy: &FormatPolicy,
1038) -> Result<Vec<u8>, ParseError> {
1039 let roots = collect_roots(schema);
1040 if roots.is_empty() {
1041 return Err(ParseError::EmitFailed {
1042 protocol: protocol.to_owned(),
1043 reason: "schema has no entry vertices".to_owned(),
1044 });
1045 }
1046
1047 let mut out = Output::new(policy);
1048 for (i, root) in roots.iter().enumerate() {
1049 if i > 0 {
1050 out.newline();
1051 }
1052 emit_vertex(protocol, schema, grammar, root, &mut out)?;
1053 }
1054 Ok(out.finish())
1055}
1056
1057fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
1058 if !schema.entries.is_empty() {
1059 return schema
1060 .entries
1061 .iter()
1062 .filter(|name| schema.vertices.contains_key(*name))
1063 .collect();
1064 }
1065
1066 // Fallback: every vertex that is not the target of any structural edge
1067 // (sorted by id for determinism).
1068 let mut targets: std::collections::HashSet<&panproto_gat::Name> =
1069 std::collections::HashSet::new();
1070 for edge in schema.edges.keys() {
1071 targets.insert(&edge.tgt);
1072 }
1073 let mut roots: Vec<&panproto_gat::Name> = schema
1074 .vertices
1075 .keys()
1076 .filter(|name| !targets.contains(name))
1077 .collect();
1078 roots.sort();
1079 roots
1080}
1081
1082fn emit_vertex(
1083 protocol: &str,
1084 schema: &Schema,
1085 grammar: &Grammar,
1086 vertex_id: &panproto_gat::Name,
1087 out: &mut Output<'_>,
1088) -> Result<(), ParseError> {
1089 let vertex = schema
1090 .vertices
1091 .get(vertex_id)
1092 .ok_or_else(|| ParseError::EmitFailed {
1093 protocol: protocol.to_owned(),
1094 reason: format!("vertex '{vertex_id}' not found"),
1095 })?;
1096
1097 // Leaf shortcut: a vertex carrying a `literal-value` constraint
1098 // and no outgoing structural edges is a terminal token. Emit the
1099 // captured value directly. This handles identifiers, numeric
1100 // literals, and string literals that the parser stored as
1101 // `literal-value` even on by-construction schemas.
1102 if let Some(literal) = literal_value(schema, vertex_id) {
1103 if children_for(schema, vertex_id).is_empty() {
1104 out.token(literal);
1105 return Ok(());
1106 }
1107 }
1108
1109 let kind = vertex.kind.as_ref();
1110 let edges = children_for(schema, vertex_id);
1111 if let Some(rule) = grammar.rules.get(kind) {
1112 let old_suppress = out.suppress_brace_indent;
1113 if grammar.inline_brace_rules.contains(kind) {
1114 out.suppress_brace_indent = true;
1115 }
1116 let mut cursor = ChildCursor::new(&edges);
1117 emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out)?;
1118 // Drain any extras left after the rule walk completed; tree-sitter
1119 // may record trailing comments as children of the surrounding
1120 // vertex (i.e. after the last structural child the rule matched).
1121 drain_extras(protocol, schema, grammar, &mut cursor, out)?;
1122 out.suppress_brace_indent = old_suppress;
1123 return Ok(());
1124 }
1125
1126 // No rule for this kind. The parser produced it via an ALIAS
1127 // (tree-sitter's `alias($.something, $.actual_kind)`) or via an
1128 // external scanner (e.g. YAML's `document` root). Fall back to
1129 // walking the children directly so the inner content survives;
1130 // surrounding tokens — whose only source is the missing rule —
1131 // are necessarily absent.
1132 for edge in &edges {
1133 emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
1134 }
1135 Ok(())
1136}
1137
1138/// Linear cursor over a vertex's outgoing edges, used to thread
1139/// children through a production rule without double-consuming them.
1140struct ChildCursor<'a> {
1141 edges: &'a [&'a Edge],
1142 consumed: Vec<bool>,
1143}
1144
1145impl<'a> ChildCursor<'a> {
1146 fn new(edges: &'a [&'a Edge]) -> Self {
1147 Self {
1148 edges,
1149 consumed: vec![false; edges.len()],
1150 }
1151 }
1152
1153 /// Take the next unconsumed edge whose kind equals `field_name`.
1154 fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
1155 for (i, edge) in self.edges.iter().enumerate() {
1156 if !self.consumed[i] && edge.kind.as_ref() == field_name {
1157 self.consumed[i] = true;
1158 return Some(edge);
1159 }
1160 }
1161 None
1162 }
1163
1164 /// Whether any unconsumed edge satisfies `predicate`. Used by the
1165 /// unit tests; the live emit path went through `has_matching` on
1166 /// each alternative until cursor-driven dispatch was rewritten to
1167 /// pick the first-unconsumed-edge's kind directly.
1168 #[cfg(test)]
1169 fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
1170 self.edges
1171 .iter()
1172 .enumerate()
1173 .any(|(i, edge)| !self.consumed[i] && predicate(edge))
1174 }
1175
1176 /// Take the next unconsumed edge whose target vertex satisfies
1177 /// `predicate`. Returns the edge and the underlying production
1178 /// resolution path is the caller's job.
1179 fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
1180 for (i, edge) in self.edges.iter().enumerate() {
1181 if !self.consumed[i] && predicate(edge) {
1182 self.consumed[i] = true;
1183 return Some(edge);
1184 }
1185 }
1186 None
1187 }
1188}
1189
thread_local! {
    /// Current nesting depth of `emit_production` on this thread.
    static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
    /// The `(vertex_id, rule_name)` pairs whose rule walk is currently
    /// in progress on this thread. A SYMBOL that resolves to a pair
    /// already in this set closes a μ-binder cycle: the rule unfolds
    /// at most once and the re-entry yields the empty sequence (the
    /// unit of the free token monoid). Examples that trigger this:
    /// YAML's `stream` ⊃ `_b_blk_*` mutually-recursive chain, Rust's
    /// `_expression` ⊃ `binary_expression` ⊃ `_expression`.
    static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
        std::cell::RefCell::new(std::collections::HashSet::new());
    /// Name of the FIELD whose body the walker is currently inside, or
    /// `None` outside any FIELD. It lets a SYMBOL nested arbitrarily
    /// deep in the field's content (under SEQ, CHOICE, REPEAT,
    /// OPTIONAL) claim edges from the *outer* cursor by edge kind
    /// instead of by symbol match. Without it, shapes like
    /// `field('args', commaSep1($.X))` — i.e.
    /// `FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X))))` — would emit
    /// only the first matched edge, because the inner REPEAT would
    /// look for siblings on the wrong cursor and stop after one
    /// iteration.
    static EMIT_FIELD_CONTEXT: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };
}

/// Scope guard holding the `EMIT_FIELD_CONTEXT` value that was active
/// before the guard was created; dropping the guard puts it back.
struct FieldContextGuard(Option<String>);

impl Drop for FieldContextGuard {
    fn drop(&mut self) {
        let prior = self.0.take();
        EMIT_FIELD_CONTEXT.with(|slot| drop(slot.replace(prior)));
    }
}

/// Make `name` the active field context until the returned guard is
/// dropped.
fn push_field_context(name: &str) -> FieldContextGuard {
    let incoming = Some(name.to_owned());
    FieldContextGuard(EMIT_FIELD_CONTEXT.with(|slot| slot.replace(incoming)))
}

/// Unset the field context until the returned guard is dropped.
/// Used for child-context walks: the child's own production has its
/// own FIELDs that set their own context, and the outer field hint
/// must not leak into them.
fn clear_field_context() -> FieldContextGuard {
    FieldContextGuard(EMIT_FIELD_CONTEXT.with(|slot| slot.take()))
}

/// Snapshot of the active field context, if any.
fn current_field_context() -> Option<String> {
    EMIT_FIELD_CONTEXT.with(|slot| slot.borrow().as_ref().cloned())
}
1245
1246/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
1247/// pushed before recursion and popped after, so any SYMBOL inside
1248/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
1249/// returns the empty sequence (μ X . body unfolds once).
1250fn walk_in_mu_frame(
1251 protocol: &str,
1252 schema: &Schema,
1253 grammar: &Grammar,
1254 vertex_id: &panproto_gat::Name,
1255 rule_name: &str,
1256 rule: &Production,
1257 cursor: &mut ChildCursor<'_>,
1258 out: &mut Output<'_>,
1259) -> Result<(), ParseError> {
1260 let key = (vertex_id.to_string(), rule_name.to_owned());
1261 let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
1262 if !inserted {
1263 // We are already walking this rule at this vertex deeper in
1264 // the call stack. The coinductive μ-fixed-point reading
1265 // returns the empty sequence here; the surrounding
1266 // production resumes after the SYMBOL.
1267 return Ok(());
1268 }
1269 let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
1270 EMIT_MU_FRAMES.with(|frames| {
1271 frames.borrow_mut().remove(&key);
1272 });
1273 result
1274}
1275
1276fn emit_production(
1277 protocol: &str,
1278 schema: &Schema,
1279 grammar: &Grammar,
1280 vertex_id: &panproto_gat::Name,
1281 production: &Production,
1282 cursor: &mut ChildCursor<'_>,
1283 out: &mut Output<'_>,
1284) -> Result<(), ParseError> {
1285 let depth = EMIT_DEPTH.with(|d| {
1286 let v = d.get() + 1;
1287 d.set(v);
1288 v
1289 });
1290 if depth > 500 {
1291 EMIT_DEPTH.with(|d| d.set(d.get() - 1));
1292 return Err(ParseError::EmitFailed {
1293 protocol: protocol.to_owned(),
1294 reason: format!(
1295 "emit_production recursion >500 (likely a cyclic grammar; \
1296 vertex='{vertex_id}')"
1297 ),
1298 });
1299 }
1300 drain_extras(protocol, schema, grammar, cursor, out)?;
1301 let result = emit_production_inner(
1302 protocol, schema, grammar, vertex_id, production, cursor, out,
1303 );
1304 EMIT_DEPTH.with(|d| d.set(d.get() - 1));
1305 result
1306}
1307
1308/// Consume and emit every leading edge on `cursor` whose target kind
1309/// is in `grammar.extras` (typically `line_comment` / `block_comment`).
1310/// Extras live outside the production grammar — tree-sitter skips them
1311/// at parse time and records them as children of the surrounding
1312/// vertex — so the rule walker cannot reconcile them against the
1313/// cursor. Draining them as a side channel preserves their content in
1314/// the output without confusing the structural matchers.
1315fn drain_extras(
1316 protocol: &str,
1317 schema: &Schema,
1318 grammar: &Grammar,
1319 cursor: &mut ChildCursor<'_>,
1320 out: &mut Output<'_>,
1321) -> Result<(), ParseError> {
1322 if grammar.extras.is_empty() {
1323 return Ok(());
1324 }
1325 loop {
1326 let next_extra: Option<usize> = cursor
1327 .edges
1328 .iter()
1329 .enumerate()
1330 .find(|(i, _)| !cursor.consumed[*i])
1331 .and_then(|(i, edge)| {
1332 let kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref())?;
1333 if grammar.extras.contains(kind) {
1334 Some(i)
1335 } else {
1336 None
1337 }
1338 });
1339 let Some(idx) = next_extra else {
1340 return Ok(());
1341 };
1342 cursor.consumed[idx] = true;
1343 let target = &cursor.edges[idx].tgt;
1344 emit_vertex(protocol, schema, grammar, target, out)?;
1345 }
1346}
1347
/// Walk a single production node at `vertex_id`, emitting tokens into
/// `out` and claiming child edges from `cursor`.
///
/// Dispatch by variant:
///
/// - `String`: emit the literal token.
/// - `Pattern`: emit the vertex's `literal-value` if it has one;
///   otherwise a newline for newline-like patterns, nothing for
///   whitespace-only patterns, and a placeholder token for the rest.
/// - `Blank`: emit nothing.
/// - `Symbol`: resolve against the field context, a hidden rule, an
///   external token, a matching child edge, or a self-reference, in
///   that order; emit nothing when none applies.
/// - `Seq`: walk the members in order, stopping at the first error.
/// - `Choice`: walk the alternative chosen by
///   `pick_choice_with_cursor`, or nothing when none is chosen.
/// - `Repeat` / `Repeat1`: walk the body until an iteration errors or
///   claims no edge, rolling that last iteration back.
/// - `Optional`: walk the body once, rolling it back when it errors or
///   makes no progress without a relevant constraint.
/// - `Field`: walk the body with the field name as the field context.
/// - `Alias`: see the inline comments.
/// - `Token`, `ImmediateToken`, precedence and `Reserved` wrappers:
///   walk the wrapped content unchanged.
fn emit_production_inner(
    protocol: &str,
    schema: &Schema,
    grammar: &Grammar,
    vertex_id: &panproto_gat::Name,
    production: &Production,
    cursor: &mut ChildCursor<'_>,
    out: &mut Output<'_>,
) -> Result<(), ParseError> {
    match production {
        Production::String { value } => {
            out.token(value);
            Ok(())
        }
        Production::Pattern { value } => {
            if let Some(literal) = literal_value(schema, vertex_id) {
                out.token(literal);
            } else if is_newline_like_pattern(value) {
                // Patterns like `\r?\n`, `\n`, `\r\n` are the structural
                // newline tokens grammars use to separate top-level
                // statements (csound's `_new_line`, ABC's line-end, etc.).
                // Emitting them through the placeholder fallback rendered
                // the bare `_` sentinel between siblings; route them to
                // the layout pass's line-break instead so the output
                // re-parses.
                out.newline();
            } else if is_whitespace_only_pattern(value) {
                // `\s+`, `[ \t]+` and friends are interstitial whitespace
                // tokens. Emit nothing: the layout pass inserts the
                // policy separator between adjacent Lits if needed.
            } else {
                out.token(&placeholder_for_pattern(value));
            }
            Ok(())
        }
        Production::Blank => Ok(()),
        Production::Symbol { name } => {
            // Inside a FIELD body, a SYMBOL consumes by field-name on
            // the outer cursor rather than searching by symbol-match.
            // This covers the simple `FIELD(SYMBOL X)` case as well as
            // every nesting under FIELD that contains SYMBOLs (SEQ,
            // CHOICE, REPEAT, OPTIONAL, ALIAS). Without the override,
            // shapes like `field('args', commaSep1($.X))` consume one
            // field edge in the FIELD handler and then the REPEAT
            // inside SEQ searches the consumed child's cursor — where
            // no sibling field edges sit — and breaks after one
            // iteration.
            if let Some(field) = current_field_context() {
                if let Some(edge) = cursor.take_field(&field) {
                    return emit_in_child_context(
                        protocol, schema, grammar, &edge.tgt, production, out,
                    );
                }
                // No matching field-named edge left on the outer
                // cursor. Surface nothing; the surrounding REPEAT /
                // OPTIONAL / CHOICE backtracks the literal tokens it
                // emitted on this iteration when it sees no progress.
                return Ok(());
            }
            if name.starts_with('_') {
                // Hidden rule: not a vertex kind on the schema side.
                // Inline-expand the rule body so its children take
                // edges from the current cursor, instead of trying to
                // take a single child edge that "satisfies" the
                // hidden rule and discarding the rest of the body
                // (which would drop tokens like `=` and the trailing
                // value SYMBOL inside e.g. TOML's `_inline_pair`).
                //
                // Wrapped in a μ-frame so a hidden rule that
                // references its own kind cyclically (or another
                // hidden rule that closes the cycle) unfolds once
                // and then collapses to the empty sequence at the
                // second visit, rather than blowing the stack.
                if let Some(rule) = grammar.rules.get(name) {
                    walk_in_mu_frame(
                        protocol, schema, grammar, vertex_id, name, rule, cursor, out,
                    )
                } else {
                    // External hidden rule (declared in the
                    // grammar's `externals` block, scanned by C code,
                    // not listed in `rules`). Heuristic fallback by
                    // name:
                    //
                    // - `_indent` / `*_indent`: open an indent block.
                    //   Indent-based grammars (Python, YAML, qvr)
                    //   declare an `_indent` external scanner before
                    //   the body of a block-bodied declaration; the
                    //   emitted output is unparseable without the
                    //   corresponding indentation jump.
                    // - `_dedent` / `*_dedent`: close the matching
                    //   indent block.
                    // - names containing `line_ending` or `newline`,
                    //   or ending in `_or_eof`: universally
                    //   newline-or-empty; emitting a single newline is
                    //   the right default for grammars like TOML whose
                    //   `pair` SEQ trails into `_line_ending_or_eof`.
                    // - names containing `semicolon`: emit `;`.
                    //
                    // Any other external name emits nothing.
                    //
                    // Check the precomputed alias map first: if this
                    // external token appears as the content of an
                    // anonymous ALIAS elsewhere in the grammar, emit
                    // the alias value as the token text.
                    if let Some(alias_value) = grammar.external_alias_map.get(name) {
                        out.token(alias_value);
                        return Ok(());
                    }
                    if name == "_indent" || name.ends_with("_indent") {
                        out.indent_open();
                    } else if name == "_dedent" || name.ends_with("_dedent") {
                        out.indent_close();
                    } else if name.contains("line_ending")
                        || name.contains("newline")
                        || name.ends_with("_or_eof")
                    {
                        out.newline();
                    } else if name.contains("semicolon") {
                        out.token(";");
                    }
                    Ok(())
                }
            } else if let Some(edge) = { take_symbol_match(grammar, schema, cursor, name) } {
                // For supertype / hidden-rule dispatch the child's
                // own kind names the actual production to walk
                // (`child.kind` IS the subtype). For ALIAS the
                // dependent-optic context is carried by the
                // surrounding `Production::Alias` branch, which calls
                // `emit_aliased_child` directly; we don't reach here
                // for that case. So walking `grammar.rules[child.kind]`
                // via `emit_vertex` is correct: the dependent-optic
                // path is preserved at every site where it actually
                // diverges from `child.kind`.
                emit_vertex(protocol, schema, grammar, &edge.tgt, out)
            } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
                let rule = grammar
                    .rules
                    .get(name)
                    .ok_or_else(|| ParseError::EmitFailed {
                        protocol: protocol.to_owned(),
                        reason: format!("no production for SYMBOL '{name}'"),
                    })?;
                // Self-reference (`X = ... SYMBOL X ...`): wrap in a
                // μ-frame so re-entry collapses to the empty sequence.
                walk_in_mu_frame(
                    protocol, schema, grammar, vertex_id, name, rule, cursor, out,
                )
            } else {
                // Named rule with no matching child: emit nothing and
                // let the surrounding CHOICE / OPTIONAL / REPEAT
                // resolve the absence.
                Ok(())
            }
        }
        Production::Seq { members } => {
            for member in members {
                emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
            }
            Ok(())
        }
        Production::Choice { members } => {
            if let Some(matched) =
                pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
            {
                emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
            } else {
                Ok(())
            }
        }
        Production::Repeat { content } | Production::Repeat1 { content } => {
            // Detect a "separator-leading SEQ" iteration body: SEQ whose
            // first member is a CHOICE containing BLANK (or an OPTIONAL),
            // i.e. the source-level separator between two iterations is
            // syntactically optional. When the chosen alternative for
            // that separator slot emits zero content tokens at runtime,
            // there was no source-level separator between this iteration
            // and the previous one; the layout pass must suppress its
            // policy separator to match the source's tight adjacency.
            //
            // Categorical reading: REPEAT body `B = SEQ(SEP, BODY)` is
            // the pullback of two halves. The bytes emitted in iteration
            // k+1 are a concatenation of `SEP_k+1` and `BODY_k+1`; if
            // `SEP_k+1` is the empty word, the concatenation of
            // `BODY_k` and `BODY_k+1` must remain a single contiguous
            // span. Hence the NoSpace marker.
            let separator_leading_seq: Option<&[Production]> = match content.as_ref() {
                Production::Seq { members } if members.len() >= 2 => {
                    let first = &members[0];
                    let is_separator_slot = match first {
                        Production::Choice { members } => {
                            members.iter().any(|m| matches!(m, Production::Blank))
                        }
                        Production::Optional { .. } => true,
                        _ => false,
                    };
                    if is_separator_slot {
                        Some(members.as_slice())
                    } else {
                        None
                    }
                }
                _ => None,
            };

            let mut emitted_any = false;
            loop {
                // Snapshot cursor and output so a failed or empty
                // iteration can be rolled back in full.
                let cursor_snap = cursor.consumed.clone();
                let out_snap = out.snapshot();
                let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
                let result: Result<(), ParseError> =
                    if let Some(seq_members) = separator_leading_seq {
                        // Emit the separator slot first and observe
                        // whether it contributed any Lit. If not, push
                        // a NoSpace marker before walking the remaining
                        // SEQ members. The OutputSnapshot here covers
                        // only the separator's emission window.
                        let pre_sep = out.snapshot();
                        let sep_result = emit_production(
                            protocol,
                            schema,
                            grammar,
                            vertex_id,
                            &seq_members[0],
                            cursor,
                            out,
                        );
                        match sep_result {
                            Err(e) => Err(e),
                            Ok(()) => {
                                if !out.lit_emitted_since(pre_sep) {
                                    out.no_space();
                                }
                                let mut rest_result = Ok(());
                                for member in &seq_members[1..] {
                                    rest_result = emit_production(
                                        protocol, schema, grammar, vertex_id, member, cursor, out,
                                    );
                                    if rest_result.is_err() {
                                        break;
                                    }
                                }
                                rest_result
                            }
                        }
                    } else {
                        emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
                    };
                let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
                // An iteration that errored or claimed no edge ends the
                // loop. Its effects are rolled back and its error, if
                // any, is discarded rather than propagated.
                if result.is_err() || consumed_after == consumed_before {
                    cursor.consumed = cursor_snap;
                    out.restore(out_snap);
                    break;
                }
                emitted_any = true;
            }
            // REPEAT1 with no successful iteration: walk the body once
            // without rollback; an error from this walk is propagated.
            if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
            }
            Ok(())
        }
        Production::Optional { content } => {
            let cursor_snap = cursor.consumed.clone();
            let out_snap = out.snapshot();
            let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
            let result =
                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
            // OPTIONAL is a backtracking site: if the inner production
            // errored *or* made no progress without leaving a witness
            // constraint, restore both cursor and output to their
            // pre-attempt state. Mirrors `Repeat`'s loop body, except
            // that an error is still returned to the caller after the
            // restore.
            if result.is_err() {
                cursor.consumed = cursor_snap;
                out.restore(out_snap);
                return result;
            }
            let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
            if consumed_after == consumed_before
                && !has_relevant_constraint(content, schema, vertex_id)
            {
                cursor.consumed = cursor_snap;
                out.restore(out_snap);
            }
            Ok(())
        }
        Production::Field { name, content } => {
            // Set the field context for the duration of `content`'s
            // walk and emit the content against the *outer* cursor.
            // The SYMBOL handler picks up the context and pulls
            // successive `take_field(name)` edges as it encounters
            // SYMBOLs anywhere under `content` (under SEQ, CHOICE,
            // REPEAT, OPTIONAL, ALIAS — arbitrarily nested). This
            // subsumes the prior carve-outs for FIELD(REPEAT(...)),
            // FIELD(REPEAT1(...)), and the bare FIELD(SYMBOL ...)
            // case, and adds coverage for
            // `field('xs', commaSep1($.X))` which expands to
            // FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X)))) and
            // any other shape where REPEAT/REPEAT1 sits inside SEQ /
            // CHOICE / OPTIONAL under a FIELD. A FIELD that wraps a
            // non-SYMBOL production (e.g. `field('op', '+')` or
            // `field('op', CHOICE(STRING ...))`) still works: STRING
            // handlers ignore the context and emit literals
            // directly, so the operator token survives the round
            // trip.
            //
            // The guard restores the previous context when it goes
            // out of scope at the end of this arm.
            let _guard = push_field_context(name);
            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
        }
        Production::Alias {
            content,
            named,
            value,
        } => {
            // A named ALIAS rewrites the parser-visible kind to
            // `value`. If the cursor has an unconsumed child whose
            // kind matches that alias name, take it and emit the
            // child using the alias's INNER content as the rule
            // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
            // means a `kind_x` vertex on the schema should be walked
            // through `real_rule`'s body, not through whatever rule
            // happens to be keyed under `kind_x`). This is the
            // dependent-optic shape: the rule the emitter walks at a
            // child position is determined by the parent's chosen
            // alias, not by the child kind alone — without it,
            // grammars like YAML that introduce the same kind through
            // many ALIAS sites lose the parent context the moment
            // emit_vertex is called.
            if *named && !value.is_empty() {
                if let Some(edge) = cursor.take_matching(|edge| {
                    schema
                        .vertices
                        .get(&edge.tgt)
                        .map(|v| v.kind.as_ref() == value.as_str())
                        .unwrap_or(false)
                }) {
                    return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
                }
            }
            // For anonymous aliases (named: false) whose content is an
            // external scanner token with no grammar rule (e.g.
            // JavaScript's `_ternary_qmark` aliased to `"?"`), emit the
            // alias value directly. The content's SYMBOL handler would
            // fall through the external-token heuristic and produce
            // nothing; the alias value IS the token text.
            if !*named && !value.is_empty() {
                if let Production::Symbol { name: sym } = content.as_ref() {
                    if sym.starts_with('_') && !grammar.rules.contains_key(sym) {
                        out.token(value);
                        return Ok(());
                    }
                }
            }
            // Otherwise the alias is transparent: walk its content.
            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
        }
        // Lexical and precedence wrappers carry no tokens of their
        // own; walk the wrapped content.
        Production::Token { content }
        | Production::ImmediateToken { content }
        | Production::Prec { content, .. }
        | Production::PrecLeft { content, .. }
        | Production::PrecRight { content, .. }
        | Production::PrecDynamic { content, .. }
        | Production::Reserved { content, .. } => {
            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
        }
    }
}
1708
1709/// Take the next cursor edge whose target vertex's kind matches the
1710/// SYMBOL `name` directly or via inline expansion of a hidden rule.
1711fn take_symbol_match<'a>(
1712 grammar: &Grammar,
1713 schema: &Schema,
1714 cursor: &mut ChildCursor<'a>,
1715 name: &str,
1716) -> Option<&'a Edge> {
1717 // Prefer non-field edges (`child_of`) to avoid consuming a
1718 // field-named edge that a later FIELD handler should claim.
1719 // Field-named edges (edge.kind != "child_of") are reserved for
1720 // the FIELD production that names them; consuming one here would
1721 // steal it from its intended handler (e.g. `as_pattern`'s
1722 // `alias` field edge consumed by the leading `expression`
1723 // SYMBOL instead of the trailing FIELD "alias" handler).
1724 if let Some(edge) = cursor.take_matching(|edge| {
1725 edge.kind.as_ref() == "child_of" && {
1726 let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1727 kind_satisfies_symbol(grammar, target_kind, name)
1728 }
1729 }) {
1730 return Some(edge);
1731 }
1732 cursor.take_matching(|edge| {
1733 let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1734 kind_satisfies_symbol(grammar, target_kind, name)
1735 })
1736}
1737
1738/// Decide whether a schema vertex of kind `target_kind` satisfies a
1739/// SYMBOL `name` reference in the grammar.
1740///
1741/// Operates as an O(1) lookup against the precomputed subtype
1742/// closure built at [`Grammar::from_bytes`]. The semantic content is
1743/// "K satisfies SYMBOL S iff K is reachable from S by walking the
1744/// grammar's hidden, supertype, and named-alias dispatch": this is
1745/// exactly the relation tree-sitter induces on `(parser-visible kind,
1746/// rule-position)` pairs.
1747fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
1748 let Some(target) = target_kind else {
1749 return false;
1750 };
1751 if target == name {
1752 return true;
1753 }
1754 grammar
1755 .subtypes
1756 .get(target)
1757 .is_some_and(|set| set.contains(name))
1758}
1759
1760/// Emit a child reached through an ALIAS production using the
1761/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
1762///
1763/// This carries the dependent-optic context across the ALIAS edge:
1764/// at the parent rule's site we know which underlying production the
1765/// alias wraps (typically `SYMBOL real_rule`), and that's the
1766/// production that should drive the emit walk on the child's
1767/// children. Looking up `grammar.rules.get(child.kind)` instead would
1768/// either fail (the renamed kind has no top-level rule, e.g. YAML's
1769/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
1770/// elsewhere in the grammar.
1771///
1772/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
1773/// says: the production walked at any vertex is determined by the
1774/// path from the root through the grammar, not by the vertex kind in
1775/// isolation. Two dispatch sites realise that invariant:
1776///
1777/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
1778/// it. Correct for supertype / hidden-rule dispatch: the child's
1779/// kind on the schema IS the subtype tree-sitter selected, so its
1780/// top-level rule is the right production to walk.
1781/// * `emit_aliased_child` threads the parent rule's `Production`
1782/// directly (the inner `content` of `Production::Alias`) and walks
1783/// it on the child's children. Correct for ALIAS dispatch: the
1784/// child's kind on the schema is the alias's `value` (a renamed
1785/// kind that may have no top-level rule), and the production to
1786/// walk is the alias's content body, supplied by the parent.
1787///
1788/// Together these cover every site where the rule-walked-at-child
1789/// diverges from `grammar.rules[child.kind]`; the recursion site for
1790/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
1791/// we do not need a richer `WalkContext` value passed by reference.
1792/// The grammar dependency is the thread.
1793fn emit_aliased_child(
1794 protocol: &str,
1795 schema: &Schema,
1796 grammar: &Grammar,
1797 child_id: &panproto_gat::Name,
1798 content: &Production,
1799 out: &mut Output<'_>,
1800) -> Result<(), ParseError> {
1801 // Leaf shortcut: if the child has a literal-value and no
1802 // structural children, emit the captured text. Identifiers and
1803 // similar terminals reach here when an ALIAS wraps a SYMBOL that
1804 // resolves to a PATTERN.
1805 if let Some(literal) = literal_value(schema, child_id) {
1806 if children_for(schema, child_id).is_empty() {
1807 out.token(literal);
1808 return Ok(());
1809 }
1810 }
1811
1812 // Clear the enclosing FIELD context so it does not leak into the
1813 // aliased child's production walk. Without this, a FIELD("alias")
1814 // containing an ALIAS whose content is SYMBOL "expression" would
1815 // cause the inner SYMBOL handler to pull by field name "alias"
1816 // instead of by symbol match, failing to find the child edge.
1817 let _guard = clear_field_context();
1818
1819 // Resolve `content` to a rule when it's a SYMBOL (the dominant
1820 // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
1821 if let Production::Symbol { name } = content {
1822 if let Some(rule) = grammar.rules.get(name) {
1823 let edges = children_for(schema, child_id);
1824 let mut cursor = ChildCursor::new(&edges);
1825 return emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
1826 }
1827 }
1828
1829 // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
1830 let edges = children_for(schema, child_id);
1831 let mut cursor = ChildCursor::new(&edges);
1832 emit_production(
1833 protocol,
1834 schema,
1835 grammar,
1836 child_id,
1837 content,
1838 &mut cursor,
1839 out,
1840 )
1841}
1842
1843fn emit_in_child_context(
1844 protocol: &str,
1845 schema: &Schema,
1846 grammar: &Grammar,
1847 child_id: &panproto_gat::Name,
1848 production: &Production,
1849 out: &mut Output<'_>,
1850) -> Result<(), ParseError> {
1851 // The child walks under its own production tree, with its own
1852 // FIELDs setting their own contexts. Clear the outer FIELD hint
1853 // so it does not leak through and cause sibling SYMBOLs inside
1854 // the child's body to mistakenly pull edges from the child's
1855 // cursor by the parent's field name.
1856 let _guard = clear_field_context();
1857 // If `production` is a structural wrapper (CHOICE / SEQ /
1858 // OPTIONAL / ...) whose referenced symbols cover the child's own
1859 // kind, the child IS the production's target node and the right
1860 // emit path is `emit_vertex(child)` (which honours the
1861 // literal-value leaf shortcut). Without this guard, FIELD(pattern,
1862 // CHOICE { _pattern, self }) on an identifier child walks the
1863 // CHOICE on the identifier's empty cursor, falls through to the
1864 // first non-BLANK alt, and loses the captured identifier text.
1865 if !matches!(production, Production::Symbol { .. }) {
1866 let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
1867 let symbols = referenced_symbols(production);
1868 if symbols
1869 .iter()
1870 .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
1871 {
1872 return emit_vertex(protocol, schema, grammar, child_id, out);
1873 }
1874 }
1875 match production {
1876 Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
1877 _ => {
1878 let edges = children_for(schema, child_id);
1879 let mut cursor = ChildCursor::new(&edges);
1880 emit_production(
1881 protocol,
1882 schema,
1883 grammar,
1884 child_id,
1885 production,
1886 &mut cursor,
1887 out,
1888 )
1889 }
1890 }
1891}
1892
1893fn pick_choice_with_cursor<'a>(
1894 schema: &Schema,
1895 grammar: &Grammar,
1896 vertex_id: &panproto_gat::Name,
1897 cursor: &ChildCursor<'_>,
1898 alternatives: &'a [Production],
1899) -> Option<&'a Production> {
1900 // Discriminator-driven dispatch (highest priority): when the
1901 // walker recorded a `chose-alt-fingerprint` constraint at parse
1902 // time, dispatch directly against that. This is the categorical
1903 // discriminator: it survives stripping of byte-position
1904 // constraints (so by-construction round-trips work) and is the
1905 // explicit witness of which CHOICE alternative the parser took.
1906 //
1907 // Falls back to the live `interstitial-*` substring blob when no
1908 // fingerprint is present (e.g. instances built by callers that
1909 // bypass the AstWalker). Both blobs are scored by the longest
1910 // STRING-literal token in an alternative that matches; the
1911 // length tiebreak prefers `&&` over `&`, `==` over `=`, etc.
1912 let constraint_blob = schema
1913 .constraints
1914 .get(vertex_id)
1915 .map(|cs| {
1916 let fingerprint: Option<&str> = cs
1917 .iter()
1918 .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
1919 .map(|c| c.value.as_str());
1920 if let Some(fp) = fingerprint {
1921 fp.to_owned()
1922 } else {
1923 cs.iter()
1924 .filter(|c| {
1925 let s = c.sort.as_ref();
1926 s.starts_with("interstitial-") && !s.ends_with("-start-byte")
1927 })
1928 .map(|c| c.value.as_str())
1929 .collect::<Vec<&str>>()
1930 .join(" ")
1931 }
1932 })
1933 .unwrap_or_default();
1934 let child_kinds: Vec<&str> = schema
1935 .constraints
1936 .get(vertex_id)
1937 .and_then(|cs| {
1938 cs.iter()
1939 .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
1940 .map(|c| c.value.split_whitespace().collect())
1941 })
1942 .unwrap_or_default();
1943 // Cursor-exhaustion BLANK-preference: when all cursor edges have
1944 // been consumed AND `BLANK` is one of the alternatives, the only
1945 // alt that won't introduce a non-existent child is `BLANK`.
1946 //
1947 // This gate fires before the literal-blob discriminator because
1948 // the fingerprint is shared across every CHOICE position in the
1949 // vertex's rule body: a vertex like `sample_step` that ends in
1950 // `..., REPEAT(SEQ(",", arg)), CHOICE(",", BLANK)` records all of
1951 // its `","` interstitials in a single blob, so the literal-score
1952 // matcher would otherwise prefer `","` for the trailing CHOICE
1953 // even when the source had no trailing comma. By the time the
1954 // emitter reaches the trailing CHOICE, the REPEAT has consumed
1955 // every arg edge in cursor order; the residual unconsumed multiset
1956 // is empty; and the categorical reading of a CHOICE-with-BLANK at
1957 // a position with no remaining children is the no-op alternative.
1958 let any_unconsumed = cursor
1959 .edges
1960 .iter()
1961 .enumerate()
1962 .any(|(i, _)| !cursor.consumed[i]);
1963 let blank_present = alternatives.iter().any(|a| matches!(a, Production::Blank));
1964 if !any_unconsumed && blank_present {
1965 return alternatives.iter().find(|a| matches!(a, Production::Blank));
1966 }
1967 if !any_unconsumed && !blank_present {
1968 let mut visited = std::collections::HashSet::new();
1969 let mut yield_cache = grammar.yield_sets.clone();
1970 for alt in alternatives {
1971 let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
1972 if ys.contains("") {
1973 return Some(alt);
1974 }
1975 visited.clear();
1976 }
1977 }
1978
1979 if !constraint_blob.is_empty() {
1980 // Primary score: literal-token match length. This dominates
1981 // alt selection so existing language tests that depend on
1982 // literal-only fingerprints keep working.
1983 // Secondary score (tiebreaker only): named-symbol kind match
1984 // count, read from the separate `chose-alt-child-kinds`
1985 // constraint (kept apart from the literal fingerprint so
1986 // identifiers like `:` in the kind list don't contaminate the
1987 // literal match). An alt that matches the recorded kinds is a
1988 // stronger witness than one whose only
1989 // overlap is literal punctuation.
1990 let mut best_literal: usize = 0;
1991 let mut best_symbols: usize = 0;
1992 let mut best_alt: Option<&Production> = None;
1993 let mut tied = false;
1994 for alt in alternatives {
1995 let strings = literal_strings(alt);
1996 if strings.is_empty() {
1997 continue;
1998 }
1999 let literal_score = strings
2000 .iter()
2001 .filter(|s| constraint_blob.contains(s.as_str()))
2002 .map(String::len)
2003 .sum::<usize>();
2004 if literal_score == 0 {
2005 continue;
2006 }
2007 // Symbol score is computed only as a tiebreaker among alts
2008 // whose literal-token coverage is the same; it never lifts
2009 // an alt above one with a strictly higher literal score.
2010 // Reads the `chose-alt-child-kinds` constraint (a separate
2011 // sequence the walker emits, kept apart from the literal
2012 // fingerprint to avoid cross-contamination).
2013 let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
2014 let symbols = referenced_symbols(alt);
2015 symbols
2016 .iter()
2017 .filter(|sym| {
2018 let sym_str: &str = sym;
2019 if child_kinds.contains(&sym_str) {
2020 return true;
2021 }
2022 grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
2023 sub_set
2024 .iter()
2025 .any(|sub| child_kinds.contains(&sub.as_str()))
2026 })
2027 })
2028 .count()
2029 } else {
2030 0
2031 };
2032 let better = literal_score > best_literal
2033 || (literal_score == best_literal && symbol_score > best_symbols);
2034 let same = literal_score == best_literal && symbol_score == best_symbols;
2035 if better {
2036 best_literal = literal_score;
2037 best_symbols = symbol_score;
2038 best_alt = Some(alt);
2039 tied = false;
2040 } else if same && best_alt.is_some() {
2041 tied = true;
2042 }
2043 }
2044 // Only commit to an alt when the fingerprint discriminates it
2045 // uniquely. A tie means the alts share the same literal token
2046 // set (e.g. JSON's `string = CHOICE { SEQ { '"', '"' }, SEQ {
2047 // '"', _string_content, '"' } }` — both alts contain just the
2048 // two `"` tokens). In that case fall through to cursor-based
2049 // dispatch, which uses the actual edge structure.
2050 if let Some(alt) = best_alt {
2051 if !tied {
2052 return Some(alt);
2053 }
2054 }
2055 }
2056
2057 // Cursor-driven dispatch via Yield-set preimage.
2058 //
2059 // For a CHOICE C = A1 | ... | An, Yield(Ai) is the set of vertex
2060 // kinds that can appear as the first named child when Ai is taken
2061 // (see `yield_of_production`). Given the first unconsumed cursor
2062 // edge with target kind K, select the first Ai (grammar order)
2063 // where K ∈ Yield(Ai). This is deterministic: grammar order is
2064 // the tiebreak, matching tree-sitter's own disambiguation.
2065 let first_unconsumed_kind: Option<&str> = cursor
2066 .edges
2067 .iter()
2068 .enumerate()
2069 .find(|(i, _)| !cursor.consumed[*i])
2070 .and_then(|(_, edge)| schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref()));
2071 if let Some(target_kind) = first_unconsumed_kind {
2072 // The subtype closure `subtypes[target_kind]` contains every
2073 // symbol name S such that a vertex of kind `target_kind` can
2074 // appear where the grammar says `SYMBOL S`. For a CHOICE
2075 // C = A1 | ... | An, the correct alternative is the one whose
2076 // top-level symbol is in `subtypes[target_kind]` (the target
2077 // kind IS a subtype of that symbol, so the symbol's rule body
2078 // dispatches to the target kind at parse time). This is an
2079 // O(1) set-membership check per alternative — no recursive
2080 // Yield computation needed.
2081 //
2082 // Preference order:
2083 // 1. Direct name match (target_kind == symbol name)
2084 // 2. Subtype match (symbol name ∈ subtypes[target_kind])
2085 // 3. Yield-set match (target_kind ∈ Yield(alt)) as fallback
2086 // for non-SYMBOL alternatives (ALIAS, SEQ, etc.)
2087 let target_supers = grammar.subtypes.get(target_kind);
2088
2089 // Indented-form preference: when multiple alternatives match
2090 // the target kind (e.g. Python _suite where all three alts
2091 // produce `block`), prefer the alternative containing an
2092 // `_indent` SYMBOL. Check this BEFORE the standard passes
2093 // since they would pick the first match in grammar order.
2094 {
2095 let mut match_count = 0usize;
2096 let mut indent_alt_idx: Option<usize> = None;
2097 let mut visited = std::collections::HashSet::new();
2098 let mut yield_cache = grammar.yield_sets.clone();
2099 for (i, alt) in alternatives.iter().enumerate() {
2100 let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
2101 if ys.contains(target_kind) {
2102 match_count += 1;
2103 if indent_alt_idx.is_none()
2104 && referenced_symbols(alt)
2105 .iter()
2106 .any(|s| *s == "_indent" || s.ends_with("_indent"))
2107 {
2108 indent_alt_idx = Some(i);
2109 }
2110 }
2111 visited.clear();
2112 }
2113 if match_count > 1 {
2114 if let Some(idx) = indent_alt_idx {
2115 return Some(&alternatives[idx]);
2116 }
2117 }
2118 }
2119
2120 // Pass 1: direct name match
2121 for alt in alternatives {
2122 if let Production::Symbol { name } = alt {
2123 if name.as_str() == target_kind {
2124 return Some(alt);
2125 }
2126 }
2127 if let Production::Alias {
2128 named: true, value, ..
2129 } = alt
2130 {
2131 if value.as_str() == target_kind {
2132 return Some(alt);
2133 }
2134 }
2135 }
2136
2137 // Pass 2: subtype match (the target kind's supertype set
2138 // tells us which SYMBOL names it satisfies)
2139 if let Some(supers) = target_supers {
2140 for alt in alternatives {
2141 if let Production::Symbol { name } = alt {
2142 if supers.contains(name.as_str()) {
2143 return Some(alt);
2144 }
2145 }
2146 if let Production::Alias {
2147 named: true, value, ..
2148 } = alt
2149 {
2150 if supers.contains(value.as_str()) {
2151 return Some(alt);
2152 }
2153 }
2154 }
2155 }
2156
2157 // Pass 3: Yield-set fallback for alternatives that are not
2158 // plain SYMBOLs or named ALIASes (e.g. SEQ, PREC wrappers
2159 // around SYMBOLs that the above passes don't unwrap).
2160 let mut visited = std::collections::HashSet::new();
2161 let mut yield_cache = grammar.yield_sets.clone();
2162 for alt in alternatives {
2163 let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
2164 if ys.contains(target_kind) {
2165 return Some(alt);
2166 }
2167 visited.clear();
2168 }
2169 }
2170
2171 // FIELD dispatch: pick an alternative whose FIELD name matches an
2172 // unconsumed edge kind.
2173 let edge_kinds: Vec<&str> = cursor
2174 .edges
2175 .iter()
2176 .enumerate()
2177 .filter(|(i, _)| !cursor.consumed[*i])
2178 .map(|(_, e)| e.kind.as_ref())
2179 .collect();
2180 for alt in alternatives {
2181 if has_field_in(alt, &edge_kinds) {
2182 return Some(alt);
2183 }
2184 }
2185
2186 // No dispatch tier matched. The final selection follows the
2187 // categorical semantics of CHOICE-with-BLANK: BLANK represents ε
2188 // (produce nothing at this position). It is correct if and only
2189 // if no child remains to consume at this cursor position.
2190 //
2191 // When unconsumed non-extra children remain, selecting BLANK
2192 // would silently drop them. Select the first non-BLANK
2193 // alternative instead so the production walk can attempt to
2194 // consume them (the grammar rule may reference a symbol name
2195 // that doesn't exactly match the parse output's child kind,
2196 // e.g. Julia's macrocall_expression receives `argument_list`
2197 // children when grammar.json only references
2198 // `macro_argument_list`).
2199 let _ = (schema, vertex_id);
2200 if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
2201 return alternatives.iter().find(|a| matches!(a, Production::Blank));
2202 }
2203 alternatives
2204 .iter()
2205 .find(|alt| !matches!(alt, Production::Blank))
2206}
2207
2208/// Collect every literal STRING token directly inside `production`
2209/// (without descending into SYMBOLs / hidden rules). Used to score
2210/// CHOICE alternatives against the parent vertex's interstitials so
2211/// the right operator / keyword form is picked when the schema
2212/// preserves interstitial fragments from a prior parse.
2213fn literal_strings(production: &Production) -> Vec<String> {
2214 let mut out = Vec::new();
2215 fn walk(p: &Production, out: &mut Vec<String>) {
2216 match p {
2217 Production::String { value } if !value.is_empty() => {
2218 out.push(value.clone());
2219 }
2220 Production::Choice { members } | Production::Seq { members } => {
2221 for m in members {
2222 walk(m, out);
2223 }
2224 }
2225 Production::Repeat { content }
2226 | Production::Repeat1 { content }
2227 | Production::Optional { content }
2228 | Production::Field { content, .. }
2229 | Production::Alias { content, .. }
2230 | Production::Token { content }
2231 | Production::ImmediateToken { content }
2232 | Production::Prec { content, .. }
2233 | Production::PrecLeft { content, .. }
2234 | Production::PrecRight { content, .. }
2235 | Production::PrecDynamic { content, .. }
2236 | Production::Reserved { content, .. } => walk(content, out),
2237 _ => {}
2238 }
2239 }
2240 walk(production, &mut out);
2241 out
2242}
2243
2244/// Collect every SYMBOL name reachable from `production` without
2245/// crossing into nested rules. Used by `pick_choice_with_cursor` to
2246/// rank alternatives by "any SYMBOL inside this alt matches something
2247/// on the cursor", instead of just the first SYMBOL: a leading
2248/// optional like `attribute_item` then `parameter` is otherwise
2249/// rejected when only the parameter children are present.
2250fn referenced_symbols(production: &Production) -> Vec<&str> {
2251 let mut out = Vec::new();
2252 fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
2253 match p {
2254 Production::Symbol { name } => out.push(name.as_str()),
2255 Production::Choice { members } | Production::Seq { members } => {
2256 for m in members {
2257 walk(m, out);
2258 }
2259 }
2260 Production::Alias {
2261 content,
2262 named,
2263 value,
2264 } => {
2265 // A named ALIAS produces a child vertex whose kind is
2266 // the alias `value` (e.g. `ALIAS { content: STRING "=",
2267 // value: "punctuation", named: true }` introduces a
2268 // `punctuation` child). For cursor-driven dispatch to
2269 // recognise alts that emit such children, yield the
2270 // alias value as a referenced symbol. Anonymous aliases
2271 // do not introduce a named node and only need their
2272 // inner content's symbols.
2273 if *named && !value.is_empty() {
2274 out.push(value.as_str());
2275 }
2276 walk(content, out);
2277 }
2278 Production::Repeat { content }
2279 | Production::Repeat1 { content }
2280 | Production::Optional { content }
2281 | Production::Field { content, .. }
2282 | Production::Token { content }
2283 | Production::ImmediateToken { content }
2284 | Production::Prec { content, .. }
2285 | Production::PrecLeft { content, .. }
2286 | Production::PrecRight { content, .. }
2287 | Production::PrecDynamic { content, .. }
2288 | Production::Reserved { content, .. } => walk(content, out),
2289 _ => {}
2290 }
2291 }
2292 walk(production, &mut out);
2293 out
2294}
2295
/// Test-only helper: return the name of the first SYMBOL met in a
/// left-to-right, depth-first walk of `production`, or `None` when
/// the production references no symbol at all. SYMBOL references are
/// not followed into their own rules.
#[cfg(test)]
fn first_symbol(production: &Production) -> Option<&str> {
    // Explicit work stack; members are pushed reversed so the
    // leftmost one is examined first.
    let mut pending: Vec<&Production> = vec![production];
    while let Some(node) = pending.pop() {
        match node {
            Production::Symbol { name } => return Some(name.as_str()),
            Production::Seq { members } | Production::Choice { members } => {
                pending.extend(members.iter().rev());
            }
            Production::Repeat { content }
            | Production::Repeat1 { content }
            | Production::Optional { content }
            | Production::Field { content, .. }
            | Production::Alias { content, .. }
            | Production::Token { content }
            | Production::ImmediateToken { content }
            | Production::Prec { content, .. }
            | Production::PrecLeft { content, .. }
            | Production::PrecRight { content, .. }
            | Production::PrecDynamic { content, .. }
            | Production::Reserved { content, .. } => pending.push(content),
            _ => {}
        }
    }
    None
}
2317
2318fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
2319 match production {
2320 Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
2321 Production::Seq { members } | Production::Choice { members } => {
2322 members.iter().any(|m| has_field_in(m, edge_kinds))
2323 }
2324 Production::Repeat { content }
2325 | Production::Repeat1 { content }
2326 | Production::Optional { content }
2327 | Production::Alias { content, .. }
2328 | Production::Token { content }
2329 | Production::ImmediateToken { content }
2330 | Production::Prec { content, .. }
2331 | Production::PrecLeft { content, .. }
2332 | Production::PrecRight { content, .. }
2333 | Production::PrecDynamic { content, .. }
2334 | Production::Reserved { content, .. } => has_field_in(content, edge_kinds),
2335 _ => false,
2336 }
2337}
2338
2339fn has_relevant_constraint(
2340 production: &Production,
2341 schema: &Schema,
2342 vertex_id: &panproto_gat::Name,
2343) -> bool {
2344 let constraints = match schema.constraints.get(vertex_id) {
2345 Some(c) => c,
2346 None => return false,
2347 };
2348 fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
2349 match production {
2350 Production::String { value } => constraints
2351 .iter()
2352 .any(|c| c.value == *value || c.sort.as_ref() == value),
2353 Production::Field { name, content } => {
2354 constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
2355 }
2356 Production::Seq { members } | Production::Choice { members } => {
2357 members.iter().any(|m| walk(m, constraints))
2358 }
2359 Production::Repeat { content }
2360 | Production::Repeat1 { content }
2361 | Production::Optional { content }
2362 | Production::Alias { content, .. }
2363 | Production::Token { content }
2364 | Production::ImmediateToken { content }
2365 | Production::Prec { content, .. }
2366 | Production::PrecLeft { content, .. }
2367 | Production::PrecRight { content, .. }
2368 | Production::PrecDynamic { content, .. }
2369 | Production::Reserved { content, .. } => walk(content, constraints),
2370 _ => false,
2371 }
2372 }
2373 walk(production, constraints)
2374}
2375
2376fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
2377 // Walk `outgoing` (insertion-ordered by SchemaBuilder via SmallVec
2378 // append) rather than the unordered `edges` HashMap so abstract
2379 // schemas under REPEAT(CHOICE(...)) preserve the order their edges
2380 // were inserted in. The previous implementation walked the HashMap
2381 // and sorted lexicographically by (kind, target id), which fused
2382 // interleaved children of the same kind into runs (e.g. a sequence
2383 // [symbol, punct, int, symbol, punct, int] became [symbol, symbol,
2384 // punct, punct, int, int] after the lex sort).
2385 let Some(edges) = schema.outgoing.get(vertex_id) else {
2386 return Vec::new();
2387 };
2388
2389 // Look up the canonical Edge reference (the key in `schema.edges`)
2390 // for each entry in `outgoing`. Falls back to the SmallVec entry if
2391 // the canonical key is missing, which would indicate index drift.
2392 let mut indexed: Vec<(usize, u32, &Edge)> = edges
2393 .iter()
2394 .enumerate()
2395 .map(|(i, e)| {
2396 let canonical = schema.edges.get_key_value(e).map_or(e, |(k, _)| k);
2397 let pos = schema.orderings.get(canonical).copied().unwrap_or(u32::MAX);
2398 (i, pos, canonical)
2399 })
2400 .collect();
2401
2402 // Stable sort by (explicit-ordering, insertion-index). Edges with
2403 // an explicit `orderings` entry come first in their declared order;
2404 // the remainder fall through in insertion order.
2405 indexed.sort_by_key(|(i, pos, _)| (*pos, *i));
2406 indexed.into_iter().map(|(_, _, e)| e).collect()
2407}
2408
2409fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
2410 schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
2411}
2412
2413fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
2414 schema
2415 .constraints
2416 .get(vertex_id)?
2417 .iter()
2418 .find(|c| c.sort.as_ref() == "literal-value")
2419 .map(|c| c.value.as_str())
2420}
2421
/// Report whether `pattern` is built solely from the escapes `\n` and
/// `\r`, each optionally followed by the quantifiers `?`, `*`, `+`,
/// and contains at least one such escape.
///
/// Accepted examples: `\n`, `\r\n`, `\r?\n`, `\n+`, `\r?\n+`.
/// Anything else (the empty pattern, generic whitespace such as `\s`,
/// character classes that merely include a newline) is rejected, so
/// only structural line terminators qualify.
fn is_newline_like_pattern(pattern: &str) -> bool {
    let mut rest = pattern.chars();
    let mut newline_atoms = 0usize;
    loop {
        match rest.next() {
            None => break,
            // A quantifier applies to the preceding atom; skip it.
            Some('?' | '*' | '+') => {}
            Some('\\') => {
                // Only `\n` and `\r` are allowed after a backslash; a
                // dangling backslash is rejected too.
                if !matches!(rest.next(), Some('n' | 'r')) {
                    return false;
                }
                newline_atoms += 1;
            }
            Some(_) => return false,
        }
    }
    newline_atoms > 0
}
2445
/// Report whether `pattern` describes nothing but a run of generic
/// whitespace, e.g. `\s+`, `\s*`, ` +`, `[ \t]+`.
///
/// After trailing quantifiers (`?`, `*`, `+`) are removed, the
/// remainder must be either one of the bare atoms `\s`, a space, or
/// `\t`, or a non-empty bracketed class made only of spaces, tabs,
/// and the escapes `\s`, `\t`, `\r`, `\n`. Such patterns carry
/// spacing rather than content, so the pretty emitter can drop them
/// and leave separators to the layout pass.
fn is_whitespace_only_pattern(pattern: &str) -> bool {
    let core = pattern.trim_end_matches(|c: char| matches!(c, '?' | '*' | '+'));
    match core {
        // Empty pattern, or a pattern made of quantifiers only.
        "" => false,
        "\\s" | " " | "\\t" => true,
        _ => {
            let Some(class_body) = core
                .strip_prefix('[')
                .and_then(|rest| rest.strip_suffix(']'))
            else {
                return false;
            };
            let mut atoms = 0usize;
            let mut body = class_body.chars();
            while let Some(c) = body.next() {
                let is_whitespace_atom = match c {
                    ' ' | '\t' => true,
                    '\\' => matches!(body.next(), Some('s' | 't' | 'r' | 'n')),
                    _ => false,
                };
                if !is_whitespace_atom {
                    return false;
                }
                atoms += 1;
            }
            // An empty class `[]` matches nothing useful.
            atoms > 0
        }
    }
}
2482
2483fn placeholder_for_pattern(pattern: &str) -> String {
2484 // Heuristic placeholder for unconstrained PATTERN terminals.
2485 //
2486 // First handle the "the regex IS a literal escape" cases that
2487 // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
2488 // etc.); emitting the matching character is always preferable
2489 // to a `_x` identifier-like placeholder when the surrounding
2490 // grammar expects a separator.
2491 let simple_lit = decode_simple_pattern_literal(pattern);
2492 if let Some(lit) = simple_lit {
2493 return lit;
2494 }
2495
2496 if pattern.contains("[0-9]") || pattern.contains("\\d") {
2497 "0".into()
2498 } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
2499 "_x".into()
2500 } else if pattern.contains('"') || pattern.contains('\'') {
2501 "\"\"".into()
2502 } else {
2503 "_".into()
2504 }
2505}
2506
/// Decode a tree-sitter PATTERN whose regex is a plain literal
/// (newline, semicolon, comma, etc.) into the text it matches.
///
/// Returns `None` when the regex is not a single fixed string, so the
/// caller can fall back to its heuristic placeholder:
///
/// * any grouping, class, alternation, or quantifier character
///   (`[ ] ( ) * + ? | { }`) anywhere in the pattern;
/// * an unescaped wildcard `.` or anchor `^` / `$`;
/// * an escaped ASCII letter or digit other than the control escapes
///   `\n`, `\r`, `\t`, `\f`, `\v`. Those are class shorthands (`\d`,
///   `\w`, `\s`), assertions (`\b`), back-references, or code-point
///   escapes, never the letter itself. Decoding `\d` as `"d"` would
///   emit text the pattern does not even match;
/// * a trailing lone backslash.
///
/// Escaped punctuation (`\/`, `\\`, `\.`, `\$`, ...) decodes to the
/// punctuation character. The empty pattern decodes to the empty
/// string.
fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
    // Conservative up-front rejection: these characters broaden the
    // match beyond a single literal byte sequence. Checked on the raw
    // text, so escaped occurrences are rejected as well.
    if pattern
        .chars()
        .any(|c| matches!(c, '[' | ']' | '(' | ')' | '*' | '+' | '?' | '|' | '{' | '}'))
    {
        return None;
    }

    let mut out = String::with_capacity(pattern.len());
    let mut chars = pattern.chars();
    while let Some(c) = chars.next() {
        match c {
            // `?` bails out with `None` on a dangling backslash.
            '\\' => match chars.next()? {
                'n' => out.push('\n'),
                'r' => out.push('\r'),
                't' => out.push('\t'),
                'f' => out.push('\x0c'),
                'v' => out.push('\x0b'),
                // `\d`, `\w`, `\s`, `\b`, `\1`, `\x41`, `\p{..}`, ...
                escaped if escaped.is_ascii_alphanumeric() => return None,
                // Escaped punctuation stands for itself.
                escaped => out.push(escaped),
            },
            // Unescaped wildcard / anchors are not literal text.
            '.' | '^' | '$' => return None,
            plain => out.push(plain),
        }
    }
    Some(out)
}
2539
2540// ═══════════════════════════════════════════════════════════════════
2541// Token list output with Spacing algebra
2542// ═══════════════════════════════════════════════════════════════════
2543//
2544// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
2545// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
2546// by `FormatPolicy`. Separating the structural output from the layout
2547// decision means each phase has one job: emit walks the grammar and
2548// pushes tokens; layout is a single fold, locally driven by adjacent
2549// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
2550
/// One element of the structural output stream produced by the emit
/// walk. The walk only pushes tokens; turning them into bytes
/// (spaces, newlines, indentation) is left to the layout pass.
#[derive(Clone)]
enum Token {
    /// A user-visible terminal contributed by the grammar.
    Lit(String),
    /// `indent_open` marker emitted when a `Lit` matched the policy's
    /// open list. Carried as a separate token so layout can decide to
    /// break + indent without re-scanning.
    IndentOpen,
    /// `indent_close` marker emitted before a closer-`Lit`.
    IndentClose,
    /// "Break a line here if not already at line start" — used after
    /// statements/declarations and after open braces.
    LineBreak,
    /// Suppress the next inter-Lit separator. Pushed by the REPEAT
    /// walker when an iteration's "separator slot" (a CHOICE-with-BLANK
    /// or OPTIONAL at SEQ position 0) emitted zero content tokens, so
    /// the categorical reading is "no source-level separator existed
    /// between these two sibling iterations of the body".
    NoSpace,
}
2571
/// Append-only token buffer that the emit walker writes into.
///
/// Nothing is rendered until `finish` hands the tokens to `layout`.
struct Output<'a> {
    /// Tokens pushed so far, in emission order.
    tokens: Vec<Token>,
    /// Formatting policy consulted both while classifying literals in
    /// `token` and when rendering in `finish`.
    policy: &'a FormatPolicy,
    /// While `true`, `token` pushes no `IndentOpen` / `IndentClose`
    /// markers and does not add a policy line break after a bare `{` or
    /// `}`. Starts out `false`.
    // NOTE(review): the code that flips this flag is outside this
    // section — confirm which walker paths set it.
    suppress_brace_indent: bool,
}
2577
/// Rollback point for an `Output`: records only how many tokens had
/// been pushed when the snapshot was taken.
#[derive(Clone)]
struct OutputSnapshot {
    /// Value of `Output::tokens.len()` at snapshot time.
    tokens_len: usize,
}
2582
2583impl<'a> Output<'a> {
2584 fn new(policy: &'a FormatPolicy) -> Self {
2585 Self {
2586 tokens: Vec::new(),
2587 policy,
2588 suppress_brace_indent: false,
2589 }
2590 }
2591
2592 fn token(&mut self, value: &str) {
2593 if value.is_empty() {
2594 return;
2595 }
2596
2597 // A grammar STRING whose value is a newline (e.g. abc's `_NL = "\n"`
2598 // or any rule that uses `"\n"` as a structural line terminator)
2599 // must route through the layout's `LineBreak` channel. Emitting it
2600 // as a `Lit` leaves the newline character in the byte stream but
2601 // also makes `needs_space_between` insert the configured separator
2602 // between the newline and the following token, producing leading
2603 // spaces on every line after the first.
2604 if value == "\n" || value == "\r\n" || value == "\r" {
2605 self.tokens.push(Token::LineBreak);
2606 return;
2607 }
2608
2609 // A captured literal value (typically a vertex's `literal-value`
2610 // constraint covering the full source span of a terminal-like
2611 // rule, e.g. abc's `reference_number_line` matching `"X:1\n"`)
2612 // may contain trailing newlines. Splitting the trailing newlines
2613 // off as a `LineBreak` lets the layout pass treat the next Lit
2614 // as starting a new line; otherwise the next Lit pair would
2615 // trigger `needs_space_between` against the embedded `\n` and
2616 // insert the policy separator at column 0 of the new line.
2617 let trimmed = value.trim_end_matches(['\n', '\r']);
2618 let trailing_newlines = value.len() - trimmed.len();
2619 if trailing_newlines > 0 && !trimmed.is_empty() {
2620 if !self.suppress_brace_indent && self.policy.indent_close.iter().any(|t| t == trimmed)
2621 {
2622 self.tokens.push(Token::IndentClose);
2623 }
2624 self.tokens.push(Token::Lit(trimmed.to_owned()));
2625 if !self.suppress_brace_indent && self.policy.indent_open.iter().any(|t| t == trimmed) {
2626 self.tokens.push(Token::IndentOpen);
2627 } else if self.policy.line_break_after.iter().any(|t| t == trimmed) {
2628 // already emitting a LineBreak below for the trailing \n
2629 }
2630 self.tokens.push(Token::LineBreak);
2631 return;
2632 }
2633
2634 if !self.suppress_brace_indent && self.policy.indent_close.iter().any(|t| t == value) {
2635 self.tokens.push(Token::IndentClose);
2636 }
2637
2638 self.tokens.push(Token::Lit(value.to_owned()));
2639
2640 if !self.suppress_brace_indent && self.policy.indent_open.iter().any(|t| t == value) {
2641 self.tokens.push(Token::IndentOpen);
2642 self.tokens.push(Token::LineBreak);
2643 } else if self.policy.line_break_after.iter().any(|t| t == value)
2644 && !(self.suppress_brace_indent && (value == "{" || value == "}"))
2645 {
2646 self.tokens.push(Token::LineBreak);
2647 }
2648 }
2649
2650 fn newline(&mut self) {
2651 self.tokens.push(Token::LineBreak);
2652 }
2653
2654 /// Open an indent scope: subsequent `LineBreak`s render at the
2655 /// new depth until a matching `indent_close` pops it. Used by the
2656 /// external-token fallback to render indent-based grammars'
2657 /// `_indent` scanner outputs.
2658 fn indent_open(&mut self) {
2659 self.tokens.push(Token::IndentOpen);
2660 self.tokens.push(Token::LineBreak);
2661 }
2662
2663 /// Close one indent scope opened by `indent_open`.
2664 fn indent_close(&mut self) {
2665 self.tokens.push(Token::IndentClose);
2666 }
2667
2668 fn snapshot(&self) -> OutputSnapshot {
2669 OutputSnapshot {
2670 tokens_len: self.tokens.len(),
2671 }
2672 }
2673
2674 fn restore(&mut self, snap: OutputSnapshot) {
2675 self.tokens.truncate(snap.tokens_len);
2676 }
2677
2678 /// True iff at least one `Token::Lit` was pushed since `snap`.
2679 /// Control-only emissions (`LineBreak`, `IndentOpen` / `IndentClose`,
2680 /// `NoSpace`) do not count as content. Used by the REPEAT walker
2681 /// to detect that a "separator slot" CHOICE picked its BLANK
2682 /// alternative, so the next iteration's content can be marked
2683 /// tight against the previous iteration's content.
2684 fn lit_emitted_since(&self, snap: OutputSnapshot) -> bool {
2685 self.tokens[snap.tokens_len..]
2686 .iter()
2687 .any(|t| matches!(t, Token::Lit(_)))
2688 }
2689
2690 /// Push a marker that suppresses the next inter-Lit separator the
2691 /// layout pass would otherwise insert. Used to encode "no source-
2692 /// level separator was emitted between these two Lits" without
2693 /// having to make per-grammar adjacency decisions in the layout.
2694 fn no_space(&mut self) {
2695 self.tokens.push(Token::NoSpace);
2696 }
2697
2698 fn finish(self) -> Vec<u8> {
2699 layout(&self.tokens, self.policy)
2700 }
2701}
2702
2703/// Fold a token list into bytes. The algebra:
2704/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
2705/// * `IndentOpen` / `IndentClose` adjust a depth counter,
2706/// * `LineBreak` writes `\n` if not already at line start, then the
2707/// next `Lit` writes `indent * indent_width` spaces of indent.
2708fn layout(tokens: &[Token], policy: &FormatPolicy) -> Vec<u8> {
2709 let mut bytes = Vec::new();
2710 let mut indent: usize = 0;
2711 let mut at_line_start = true;
2712 let mut last_lit: Option<&str> = None;
2713 // True iff, at the moment `last_lit` was emitted, the cursor was at a
2714 // position where the grammar expects an operand: start of stream / line,
2715 // just after an open paren / bracket / brace, just after a separator like
2716 // `,` or `;`, or just after a binary / assignment operator. Used by
2717 // `needs_space_between` to recognise `last_lit` as a tight unary prefix
2718 // (`f(-1.0)`) rather than a spaced binary operator (`a - b`).
2719 let mut last_was_in_operand_position = true;
2720 let mut expecting_operand = true;
2721 // Set when a `Token::NoSpace` marker is seen; cleared when the next
2722 // Lit consumes it. While set, suppress the policy separator that
2723 // would otherwise be inserted before the next Lit.
2724 let mut suppress_next_separator = false;
2725 let newline = policy.newline.as_bytes();
2726 let separator = policy.separator.as_bytes();
2727
2728 for tok in tokens {
2729 match tok {
2730 Token::IndentOpen => indent += 1,
2731 Token::IndentClose => {
2732 indent = indent.saturating_sub(1);
2733 if !at_line_start {
2734 bytes.extend_from_slice(newline);
2735 at_line_start = true;
2736 expecting_operand = true;
2737 }
2738 }
2739 Token::LineBreak => {
2740 if !at_line_start {
2741 bytes.extend_from_slice(newline);
2742 at_line_start = true;
2743 expecting_operand = true;
2744 }
2745 }
2746 Token::NoSpace => {
2747 suppress_next_separator = true;
2748 }
2749 Token::Lit(value) => {
2750 if at_line_start {
2751 bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
2752 } else if let Some(prev) = last_lit {
2753 if !suppress_next_separator
2754 && needs_space_between(prev, value, last_was_in_operand_position)
2755 {
2756 bytes.extend_from_slice(separator);
2757 }
2758 }
2759 suppress_next_separator = false;
2760 bytes.extend_from_slice(value.as_bytes());
2761 at_line_start = false;
2762 last_was_in_operand_position = expecting_operand;
2763 expecting_operand = leaves_operand_position(value);
2764 last_lit = Some(value.as_str());
2765 }
2766 }
2767 }
2768
2769 if !at_line_start {
2770 bytes.extend_from_slice(newline);
2771 }
2772 bytes
2773}
2774
2775/// True iff emitting `tok` leaves the cursor in a position where the
2776/// grammar expects an operand next. Operand-introducing tokens are open
2777/// punctuation, separators, and operator-like strings; operand-terminating
2778/// tokens are identifiers, literals, and closing punctuation.
2779fn leaves_operand_position(tok: &str) -> bool {
2780 if tok.is_empty() {
2781 return true;
2782 }
2783 if is_punct_open(tok) {
2784 return true;
2785 }
2786 if matches!(tok, "," | ";") {
2787 return true;
2788 }
2789 if is_punct_close(tok) {
2790 return false;
2791 }
2792 if first_is_alnum_or_underscore(tok) || last_ends_with_alnum(tok) {
2793 return false;
2794 }
2795 // Pure punctuation/operator runs (`=`, `+`, `-`, `<=`, `>>`, …) leave
2796 // the cursor expecting another operand.
2797 true
2798}
2799
2800fn needs_space_between(last: &str, next: &str, expecting_operand: bool) -> bool {
2801 if last.is_empty() || next.is_empty() {
2802 return false;
2803 }
2804 if is_punct_open(last) || is_punct_open(next) {
2805 return false;
2806 }
2807 if is_punct_close(next) {
2808 return false;
2809 }
2810 if is_punct_close(last) && is_punct_punctuation(next) {
2811 return false;
2812 }
2813 if last == "." || next == "." {
2814 return false;
2815 }
2816 // Tight unary prefix: `last` is a sign/logical-not operator emitted
2817 // where the grammar expected an operand, so it glues to `next`.
2818 // `expecting_operand` here means: just before `last` was emitted,
2819 // the cursor expected an operand, which makes `last` a unary prefix.
2820 // Examples: `f(-1.0)`, `[ -2, 3 ]`, `return -x`, `a = !flag`.
2821 if expecting_operand && is_unary_prefix_operator(last) && first_is_operand_start(next) {
2822 return false;
2823 }
2824 if last_is_word_like(last) && first_is_word_like(next) {
2825 return true;
2826 }
2827 if last_ends_with_alnum(last) && first_is_alnum_or_underscore(next) {
2828 return true;
2829 }
2830 // Adjacent operator runs: keep them apart so the lexer doesn't glue
2831 // `>` and `=` into `>=` unintentionally.
2832 true
2833}
2834
/// True iff `s` is exactly one of the operators that may act as a
/// unary prefix: `-`, `+`, `!`, `~`.
fn is_unary_prefix_operator(s: &str) -> bool {
    const UNARY_PREFIXES: [&str; 4] = ["-", "+", "!", "~"];
    UNARY_PREFIXES.contains(&s)
}
2838
/// True iff the first character of `s` can begin an operand: an
/// alphanumeric character, `_`, `.` or `(`. Empty input yields `false`.
fn first_is_operand_start(s: &str) -> bool {
    s.starts_with(|c: char| c.is_alphanumeric() || matches!(c, '_' | '.' | '('))
}
2845
/// True iff `s` behaves as opening punctuation: any token ending in
/// `{`, `(` or `[`, or exactly one of `"`, `'`, `` ` ``, `@`, `#`.
fn is_punct_open(s: &str) -> bool {
    const STANDALONE_OPENERS: [&str; 5] = ["\"", "'", "`", "@", "#"];
    s.ends_with(['{', '(', '[']) || STANDALONE_OPENERS.contains(&s)
}
2852
/// True iff `s` is exactly one of the closing / trailing punctuation
/// tokens `)`, `]`, `}`, `,`, `;`, `:`, `"`, `'`, `` ` ``.
fn is_punct_close(s: &str) -> bool {
    const CLOSERS: [&str; 9] = [")", "]", "}", ",", ";", ":", "\"", "'", "`"];
    CLOSERS.contains(&s)
}
2856
/// True iff `s` consists of exactly one character and that character
/// is one of `,`, `;`, `:`, `.`, `)`, `]`, `}`.
fn is_punct_punctuation(s: &str) -> bool {
    let mut chars = s.chars();
    match (chars.next(), chars.next()) {
        (Some(only), None) => matches!(only, ',' | ';' | ':' | '.' | ')' | ']' | '}'),
        _ => false,
    }
}
2860
/// True iff the final character of `s` is alphanumeric or `_`.
/// Empty input yields `false`.
fn last_is_word_like(s: &str) -> bool {
    s.ends_with(|c: char| c == '_' || c.is_alphanumeric())
}
2867
/// True iff the leading character of `s` is alphanumeric or `_`.
/// Empty input yields `false`.
fn first_is_word_like(s: &str) -> bool {
    s.starts_with(|c: char| c == '_' || c.is_alphanumeric())
}
2874
/// True iff the final character of `s` is alphanumeric. Unlike
/// `last_is_word_like`, a trailing `_` does not count. Empty input
/// yields `false`.
fn last_ends_with_alnum(s: &str) -> bool {
    s.ends_with(char::is_alphanumeric)
}
2881
/// True iff `s` begins with an alphanumeric character or `_`.
/// Empty input yields `false`.
fn first_is_alnum_or_underscore(s: &str) -> bool {
    match s.chars().next() {
        Some(head) => head == '_' || head.is_alphanumeric(),
        None => false,
    }
}
2888
2889#[cfg(test)]
2890#[allow(clippy::unwrap_used)]
2891mod tests {
2892 use super::*;
2893
2894 #[test]
2895 fn parses_simple_grammar_json() {
2896 let bytes = br#"{
2897 "name": "tiny",
2898 "rules": {
2899 "program": {
2900 "type": "SEQ",
2901 "members": [
2902 {"type": "STRING", "value": "hello"},
2903 {"type": "STRING", "value": ";"}
2904 ]
2905 }
2906 }
2907 }"#;
2908 let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
2909 assert!(g.rules.contains_key("program"));
2910 }
2911
2912 #[test]
2913 fn output_emits_punctuation_without_leading_space() {
2914 let policy = FormatPolicy::default();
2915 let mut out = Output::new(&policy);
2916 out.token("foo");
2917 out.token("(");
2918 out.token(")");
2919 out.token(";");
2920 let bytes = out.finish();
2921 let s = std::str::from_utf8(&bytes).expect("ascii output");
2922 assert!(s.starts_with("foo();"), "got {s:?}");
2923 }
2924
2925 #[test]
2926 fn grammar_from_bytes_rejects_malformed_input() {
2927 let result = Grammar::from_bytes("malformed", b"not json");
2928 let err = result.expect_err("malformed bytes must yield Err");
2929 let msg = err.to_string();
2930 assert!(
2931 msg.contains("malformed"),
2932 "error message should name the protocol: {msg:?}"
2933 );
2934 }
2935
2936 #[test]
2937 fn output_indents_after_open_brace() {
2938 let policy = FormatPolicy::default();
2939 let mut out = Output::new(&policy);
2940 out.token("fn");
2941 out.token("foo");
2942 out.token("(");
2943 out.token(")");
2944 out.token("{");
2945 out.token("body");
2946 out.token("}");
2947 let bytes = out.finish();
2948 let s = std::str::from_utf8(&bytes).expect("ascii output");
2949 assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
2950 assert!(s.contains("body"), "body inside block: {s:?}");
2951 assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
2952 }
2953
2954 #[test]
2955 fn output_no_space_between_word_and_dot() {
2956 let policy = FormatPolicy::default();
2957 let mut out = Output::new(&policy);
2958 out.token("foo");
2959 out.token(".");
2960 out.token("bar");
2961 let bytes = out.finish();
2962 let s = std::str::from_utf8(&bytes).expect("ascii output");
2963 assert!(s.starts_with("foo.bar"), "no space around dot: {s:?}");
2964 }
2965
2966 #[test]
2967 fn output_snapshot_restore_truncates_bytes() {
2968 let policy = FormatPolicy::default();
2969 let mut out = Output::new(&policy);
2970 out.token("keep");
2971 let snap = out.snapshot();
2972 out.token("drop");
2973 out.token("more");
2974 out.restore(snap);
2975 out.token("after");
2976 let bytes = out.finish();
2977 let s = std::str::from_utf8(&bytes).expect("ascii output");
2978 assert!(s.contains("keep"), "kept token survives: {s:?}");
2979 assert!(s.contains("after"), "post-restore token visible: {s:?}");
2980 assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
2981 assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
2982 }
2983
2984 #[test]
2985 fn child_cursor_take_field_consumes_once() {
2986 let edges_owned: Vec<Edge> = vec![Edge {
2987 src: panproto_gat::Name::from("p"),
2988 tgt: panproto_gat::Name::from("c"),
2989 kind: panproto_gat::Name::from("name"),
2990 name: None,
2991 }];
2992 let edges: Vec<&Edge> = edges_owned.iter().collect();
2993 let mut cursor = ChildCursor::new(&edges);
2994 let first = cursor.take_field("name");
2995 let second = cursor.take_field("name");
2996 assert!(first.is_some(), "first take returns the edge");
2997 assert!(
2998 second.is_none(),
2999 "second take returns None (already consumed)"
3000 );
3001 }
3002
3003 #[test]
3004 fn child_cursor_take_matching_predicate() {
3005 let edges_owned: Vec<Edge> = vec![
3006 Edge {
3007 src: "p".into(),
3008 tgt: "c1".into(),
3009 kind: "child_of".into(),
3010 name: None,
3011 },
3012 Edge {
3013 src: "p".into(),
3014 tgt: "c2".into(),
3015 kind: "key".into(),
3016 name: None,
3017 },
3018 ];
3019 let edges: Vec<&Edge> = edges_owned.iter().collect();
3020 let mut cursor = ChildCursor::new(&edges);
3021 assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
3022 let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
3023 assert!(taken.is_some());
3024 assert!(
3025 !cursor.has_matching(|e| e.kind.as_ref() == "key"),
3026 "consumed edge no longer matches"
3027 );
3028 assert!(
3029 cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
3030 "the other edge is still available"
3031 );
3032 }
3033
3034 #[test]
3035 fn kind_satisfies_symbol_direct_match() {
3036 let bytes = br#"{
3037 "name": "tiny",
3038 "rules": {
3039 "x": {"type": "STRING", "value": "x"}
3040 }
3041 }"#;
3042 let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
3043 assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
3044 assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
3045 assert!(!kind_satisfies_symbol(&g, None, "x"));
3046 }
3047
3048 #[test]
3049 fn kind_satisfies_symbol_through_hidden_rule() {
3050 let bytes = br#"{
3051 "name": "tiny",
3052 "rules": {
3053 "_value": {
3054 "type": "CHOICE",
3055 "members": [
3056 {"type": "SYMBOL", "name": "object"},
3057 {"type": "SYMBOL", "name": "number"}
3058 ]
3059 },
3060 "object": {"type": "STRING", "value": "{}"},
3061 "number": {"type": "PATTERN", "value": "[0-9]+"}
3062 }
3063 }"#;
3064 let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
3065 assert!(
3066 kind_satisfies_symbol(&g, Some("number"), "_value"),
3067 "number is reachable from _value via CHOICE"
3068 );
3069 assert!(
3070 kind_satisfies_symbol(&g, Some("object"), "_value"),
3071 "object is reachable from _value via CHOICE"
3072 );
3073 assert!(
3074 !kind_satisfies_symbol(&g, Some("string"), "_value"),
3075 "string is NOT among the alternatives"
3076 );
3077 }
3078
3079 #[test]
3080 fn first_symbol_skips_string_terminals() {
3081 let prod: Production = serde_json::from_str(
3082 r#"{
3083 "type": "SEQ",
3084 "members": [
3085 {"type": "STRING", "value": "{"},
3086 {"type": "SYMBOL", "name": "body"},
3087 {"type": "STRING", "value": "}"}
3088 ]
3089 }"#,
3090 )
3091 .expect("valid SEQ");
3092 assert_eq!(first_symbol(&prod), Some("body"));
3093 }
3094
3095 #[test]
3096 fn placeholder_for_pattern_routes_by_regex_class() {
3097 assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
3098 assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
3099 assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
3100 assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
3101 }
3102
3103 #[test]
3104 fn format_policy_default_breaks_after_semicolon() {
3105 let policy = FormatPolicy::default();
3106 assert!(policy.line_break_after.iter().any(|t| t == ";"));
3107 assert!(policy.indent_open.iter().any(|t| t == "{"));
3108 assert!(policy.indent_close.iter().any(|t| t == "}"));
3109 assert_eq!(policy.indent_width, 2);
3110 }
3111
3112 #[test]
3113 fn placeholder_decodes_literal_pattern_separators() {
3114 // PATTERN regexes that match a single literal byte sequence
3115 // (newline, semicolon, comma) emit the bytes verbatim instead
3116 // of falling through to the `_` catch-all.
3117 assert_eq!(placeholder_for_pattern("\\n"), "\n");
3118 assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
3119 assert_eq!(placeholder_for_pattern(";"), ";");
3120 // Patterns with character classes / alternation still route
3121 // through the heuristic.
3122 assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
3123 assert_eq!(placeholder_for_pattern("a|b"), "_");
3124 }
3125
3126 #[test]
3127 fn supertypes_decode_from_grammar_json_strings() {
3128 // Tree-sitter older grammars list supertypes as bare strings.
3129 let bytes = br#"{
3130 "name": "tiny",
3131 "supertypes": ["expression"],
3132 "rules": {
3133 "expression": {
3134 "type": "CHOICE",
3135 "members": [
3136 {"type": "SYMBOL", "name": "binary_expression"},
3137 {"type": "SYMBOL", "name": "identifier"}
3138 ]
3139 },
3140 "binary_expression": {"type": "STRING", "value": "x"},
3141 "identifier": {"type": "PATTERN", "value": "[a-z]+"}
3142 }
3143 }"#;
3144 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
3145 assert!(g.supertypes.contains("expression"));
3146 // identifier matches the supertype `expression`.
3147 assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
3148 // unrelated kinds do not.
3149 assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
3150 }
3151
3152 #[test]
3153 fn supertypes_decode_from_grammar_json_objects() {
3154 // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
3155 // entries instead of bare strings.
3156 let bytes = br#"{
3157 "name": "tiny",
3158 "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
3159 "rules": {
3160 "stmt": {
3161 "type": "CHOICE",
3162 "members": [
3163 {"type": "SYMBOL", "name": "while_stmt"},
3164 {"type": "SYMBOL", "name": "if_stmt"}
3165 ]
3166 },
3167 "while_stmt": {"type": "STRING", "value": "while"},
3168 "if_stmt": {"type": "STRING", "value": "if"}
3169 }
3170 }"#;
3171 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
3172 assert!(g.supertypes.contains("stmt"));
3173 assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
3174 }
3175
3176 #[test]
3177 fn alias_value_matches_kind() {
3178 // A named ALIAS rewrites the parser-visible kind to `value`;
3179 // `kind_satisfies_symbol` should accept that rewritten kind
3180 // when looking up the original SYMBOL.
3181 let bytes = br#"{
3182 "name": "tiny",
3183 "rules": {
3184 "_package_identifier": {
3185 "type": "ALIAS",
3186 "named": true,
3187 "value": "package_identifier",
3188 "content": {"type": "SYMBOL", "name": "identifier"}
3189 },
3190 "identifier": {"type": "PATTERN", "value": "[a-z]+"}
3191 }
3192 }"#;
3193 let g = Grammar::from_bytes("tiny", bytes).expect("parse");
3194 assert!(kind_satisfies_symbol(
3195 &g,
3196 Some("package_identifier"),
3197 "_package_identifier"
3198 ));
3199 }
3200
3201 #[test]
3202 fn referenced_symbols_walks_nested_seq() {
3203 let prod: Production = serde_json::from_str(
3204 r#"{
3205 "type": "SEQ",
3206 "members": [
3207 {"type": "CHOICE", "members": [
3208 {"type": "SYMBOL", "name": "attribute_item"},
3209 {"type": "BLANK"}
3210 ]},
3211 {"type": "SYMBOL", "name": "parameter"},
3212 {"type": "REPEAT", "content": {
3213 "type": "SEQ",
3214 "members": [
3215 {"type": "STRING", "value": ","},
3216 {"type": "SYMBOL", "name": "parameter"}
3217 ]
3218 }}
3219 ]
3220 }"#,
3221 )
3222 .expect("seq");
3223 let symbols = referenced_symbols(&prod);
3224 assert!(symbols.contains(&"attribute_item"));
3225 assert!(symbols.contains(&"parameter"));
3226 }
3227
3228 #[test]
3229 fn literal_strings_collects_choice_members() {
3230 let prod: Production = serde_json::from_str(
3231 r#"{
3232 "type": "CHOICE",
3233 "members": [
3234 {"type": "STRING", "value": "+"},
3235 {"type": "STRING", "value": "-"},
3236 {"type": "STRING", "value": "*"}
3237 ]
3238 }"#,
3239 )
3240 .expect("choice");
3241 let strings = literal_strings(&prod);
3242 assert_eq!(strings, vec!["+", "-", "*"]);
3243 }
3244
3245 /// The ocaml and javascript grammars (tree-sitter ≥ 0.25) emit a
3246 /// `RESERVED` rule kind that an earlier deserialiser rejected
3247 /// with `unknown variant "RESERVED"`. Verify both that the bare
3248 /// variant deserialises and that a `RESERVED`-wrapped grammar is
3249 /// loadable end-to-end via [`Grammar::from_bytes`].
3250 #[test]
3251 fn reserved_variant_deserialises() {
3252 let prod: Production = serde_json::from_str(
3253 r#"{
3254 "type": "RESERVED",
3255 "content": {"type": "SYMBOL", "name": "_lowercase_identifier"},
3256 "context_name": "attribute_id"
3257 }"#,
3258 )
3259 .expect("RESERVED parses");
3260 match prod {
3261 Production::Reserved { content, .. } => match *content {
3262 Production::Symbol { name } => assert_eq!(name, "_lowercase_identifier"),
3263 other => panic!("expected inner SYMBOL, got {other:?}"),
3264 },
3265 other => panic!("expected RESERVED, got {other:?}"),
3266 }
3267 }
3268
3269 #[test]
3270 fn reserved_grammar_loads_end_to_end() {
3271 let bytes = br#"{
3272 "name": "tiny_reserved",
3273 "rules": {
3274 "program": {
3275 "type": "RESERVED",
3276 "content": {"type": "SYMBOL", "name": "ident"},
3277 "context_name": "keywords"
3278 },
3279 "ident": {"type": "PATTERN", "value": "[a-z]+"}
3280 }
3281 }"#;
3282 let g = Grammar::from_bytes("tiny_reserved", bytes).expect("RESERVED-using grammar loads");
3283 assert!(g.rules.contains_key("program"));
3284 }
3285
3286 #[test]
3287 fn reserved_walker_helpers_recurse_into_content() {
3288 // The walker's helpers (first_symbol, has_field_in,
3289 // referenced_symbols, ...) all need to descend through
3290 // RESERVED into its content. If they bail at RESERVED, the
3291 // `pick_choice_with_cursor` heuristic ranks the alt below
3292 // alts that DO recurse, which produces wrong emit output
3293 // even when the deserialiser doesn't crash.
3294 let prod: Production = serde_json::from_str(
3295 r#"{
3296 "type": "RESERVED",
3297 "content": {
3298 "type": "FIELD",
3299 "name": "lhs",
3300 "content": {"type": "SYMBOL", "name": "expr"}
3301 },
3302 "context_name": "ctx"
3303 }"#,
3304 )
3305 .expect("nested RESERVED parses");
3306 assert_eq!(first_symbol(&prod), Some("expr"));
3307 assert!(has_field_in(&prod, &["lhs"]));
3308 let symbols = referenced_symbols(&prod);
3309 assert!(symbols.contains(&"expr"));
3310 }
3311
3312 // -- Yield-set tests --
3313
3314 fn yield_of(grammar: &Grammar, prod: &Production) -> std::collections::HashSet<String> {
3315 let mut visited = std::collections::HashSet::new();
3316 let mut cache = grammar.yield_sets.clone();
3317 yield_of_production(grammar, prod, &mut visited, &mut cache)
3318 }
3319
3320 #[test]
3321 fn yield_set_seq_only_first_member() {
3322 let prod: Production = serde_json::from_str(
3323 r#"{
3324 "type": "SEQ",
3325 "members": [
3326 {"type": "SYMBOL", "name": "identifier"},
3327 {"type": "STRING", "value": "as"},
3328 {"type": "SYMBOL", "name": "target"}
3329 ]
3330 }"#,
3331 )
3332 .expect("valid SEQ");
3333 let g = Grammar::from_bytes("test", b"{}").unwrap_or_else(|_| {
3334 serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap()
3335 });
3336 let ys = yield_of(&g, &prod);
3337 assert!(ys.contains("identifier"), "SEQ yields first member");
3338 assert!(
3339 !ys.contains("target"),
3340 "SEQ must NOT yield non-first members"
3341 );
3342 }
3343
3344 #[test]
3345 fn yield_set_choice_union() {
3346 let prod: Production = serde_json::from_str(
3347 r#"{
3348 "type": "CHOICE",
3349 "members": [
3350 {"type": "SYMBOL", "name": "a"},
3351 {"type": "SYMBOL", "name": "b"}
3352 ]
3353 }"#,
3354 )
3355 .expect("valid CHOICE");
3356 let g = serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap();
3357 let ys = yield_of(&g, &prod);
3358 assert_eq!(ys.len(), 2);
3359 assert!(ys.contains("a"));
3360 assert!(ys.contains("b"));
3361 }
3362
3363 #[test]
3364 fn yield_set_hidden_expansion() {
3365 let g = serde_json::from_str::<Grammar>(
3366 r#"{"name":"t","rules":{
3367 "_value": {
3368 "type": "CHOICE",
3369 "members": [
3370 {"type": "SYMBOL", "name": "number"},
3371 {"type": "SYMBOL", "name": "object"}
3372 ]
3373 }
3374 }}"#,
3375 )
3376 .unwrap();
3377 let mut g = g;
3378 g.subtypes = compute_subtype_closure(&g);
3379 g.yield_sets = compute_yield_sets(&g);
3380 let sym: Production =
3381 serde_json::from_str(r#"{"type": "SYMBOL", "name": "_value"}"#).unwrap();
3382 let ys = yield_of(&g, &sym);
3383 assert!(
3384 ys.contains("number"),
3385 "hidden rule expands into its CHOICE members"
3386 );
3387 assert!(ys.contains("object"));
3388 assert!(
3389 !ys.contains("_value"),
3390 "hidden rule name is not in yield set"
3391 );
3392 }
3393
3394 #[test]
3395 fn yield_set_optional_includes_epsilon() {
3396 let prod: Production = serde_json::from_str(
3397 r#"{"type": "OPTIONAL", "content": {"type": "SYMBOL", "name": "x"}}"#,
3398 )
3399 .unwrap();
3400 let g = serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap();
3401 let ys = yield_of(&g, &prod);
3402 assert!(ys.contains("x"));
3403 assert!(ys.contains(""), "OPTIONAL includes epsilon");
3404 }
3405
3406 #[test]
3407 fn yield_set_alias_uses_value() {
3408 let prod: Production = serde_json::from_str(
3409 r#"{"type": "ALIAS", "content": {"type": "SYMBOL", "name": "real"},
3410 "named": true, "value": "alias_name"}"#,
3411 )
3412 .unwrap();
3413 let g = serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap();
3414 let ys = yield_of(&g, &prod);
3415 assert_eq!(ys.len(), 1);
3416 assert!(ys.contains("alias_name"), "named ALIAS yields its value");
3417 }
3418}