Skip to main content

panproto_parse/
emit_pretty.rs

1#![allow(
2    clippy::module_name_repetitions,
3    clippy::too_many_lines,
4    clippy::too_many_arguments,
5    clippy::map_unwrap_or,
6    clippy::option_if_let_else,
7    clippy::elidable_lifetime_names,
8    clippy::items_after_statements,
9    clippy::needless_pass_by_value,
10    clippy::single_match_else,
11    clippy::manual_let_else,
12    clippy::match_same_arms,
13    clippy::missing_const_for_fn,
14    clippy::single_char_pattern,
15    clippy::naive_bytecount,
16    clippy::expect_used,
17    clippy::redundant_pub_crate,
18    clippy::used_underscore_binding,
19    clippy::redundant_field_names,
20    clippy::struct_field_names,
21    clippy::redundant_else,
22    clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//!   matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//!   actual child kind, or whose terminals appear in the rendered
46//!   prefix; falls back to the first non-`BLANK` alternative when no
47//!   alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//!   child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//!   constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//!   (identifiers, numbers, quoted strings). The walker emits a
54//!   `literal-value` constraint when present and otherwise falls
55//!   back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//!   handled transparently (the inner content is emitted; the
58//!   wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91#[non_exhaustive]
92pub enum Production {
93    /// Concatenation of productions.
94    #[serde(rename = "SEQ")]
95    Seq {
96        /// Ordered members; each is emitted in turn.
97        members: Vec<Self>,
98    },
99    /// Alternation between productions.
100    #[serde(rename = "CHOICE")]
101    Choice {
102        /// Alternatives; the walker picks one based on the schema's
103        /// children and constraints.
104        members: Vec<Self>,
105    },
106    /// Zero-or-more repetition.
107    #[serde(rename = "REPEAT")]
108    Repeat {
109        /// The repeated body.
110        content: Box<Self>,
111    },
112    /// One-or-more repetition.
113    #[serde(rename = "REPEAT1")]
114    Repeat1 {
115        /// The repeated body.
116        content: Box<Self>,
117    },
118    /// Optional inclusion (zero or one).
119    ///
120    /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
121    /// BLANK }`, but recent generator versions also emit explicit
122    /// `OPTIONAL` nodes; both shapes are accepted.
123    #[serde(rename = "OPTIONAL")]
124    Optional {
125        /// The optional body.
126        content: Box<Self>,
127    },
128    /// Reference to another rule by name.
129    #[serde(rename = "SYMBOL")]
130    Symbol {
131        /// Name of the referenced rule (matches a vertex kind on the
132        /// schema side).
133        name: String,
134    },
135    /// Literal token bytes.
136    #[serde(rename = "STRING")]
137    String {
138        /// The literal token. Emitted verbatim.
139        value: String,
140    },
141    /// Regex-matched terminal.
142    ///
143    /// At parse time this matches arbitrary bytes; at emit time the
144    /// walker substitutes a `literal-value` constraint when present
145    /// and falls back to a placeholder otherwise.
146    #[serde(rename = "PATTERN")]
147    Pattern {
148        /// The original regex.
149        value: String,
150    },
151    /// The empty production. Emits nothing.
152    #[serde(rename = "BLANK")]
153    Blank,
154    /// Named field over a content production.
155    ///
156    /// The field `name` matches an edge kind on the schema side; the
157    /// walker resolves the corresponding child vertex and recurses
158    /// into `content` with that child as context.
159    #[serde(rename = "FIELD")]
160    Field {
161        /// Field name (matches edge kind).
162        name: String,
163        /// The contents of the field.
164        content: Box<Self>,
165    },
166    /// An aliased production.
167    ///
168    /// `value` records the parser-visible kind; the walker emits
169    /// `content` and ignores the alias rename.
170    #[serde(rename = "ALIAS")]
171    Alias {
172        /// The aliased content.
173        content: Box<Self>,
174        /// Whether the alias is a named node.
175        #[serde(default)]
176        named: bool,
177        /// The alias's surface name.
178        #[serde(default)]
179        value: String,
180    },
181    /// A token wrapper.
182    ///
183    /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
184    /// lexical token; the walker emits the inner content unchanged.
185    #[serde(rename = "TOKEN")]
186    Token {
187        /// The wrapped content.
188        content: Box<Self>,
189    },
190    /// An immediate-token wrapper (no preceding whitespace).
191    ///
192    /// Treated like [`Production::Token`] for emit purposes.
193    #[serde(rename = "IMMEDIATE_TOKEN")]
194    ImmediateToken {
195        /// The wrapped content.
196        content: Box<Self>,
197    },
198    /// Precedence wrapper.
199    #[serde(rename = "PREC")]
200    Prec {
201        /// Precedence value (numeric or string). Ignored at emit time.
202        #[allow(dead_code)]
203        value: serde_json::Value,
204        /// The wrapped content.
205        content: Box<Self>,
206    },
207    /// Left-associative precedence wrapper.
208    #[serde(rename = "PREC_LEFT")]
209    PrecLeft {
210        /// Precedence value. Ignored at emit time.
211        #[allow(dead_code)]
212        value: serde_json::Value,
213        /// The wrapped content.
214        content: Box<Self>,
215    },
216    /// Right-associative precedence wrapper.
217    #[serde(rename = "PREC_RIGHT")]
218    PrecRight {
219        /// Precedence value. Ignored at emit time.
220        #[allow(dead_code)]
221        value: serde_json::Value,
222        /// The wrapped content.
223        content: Box<Self>,
224    },
225    /// Dynamic precedence wrapper.
226    #[serde(rename = "PREC_DYNAMIC")]
227    PrecDynamic {
228        /// Precedence value. Ignored at emit time.
229        #[allow(dead_code)]
230        value: serde_json::Value,
231        /// The wrapped content.
232        content: Box<Self>,
233    },
234    /// Reserved-word wrapper (tree-sitter ≥ 0.25).
235    ///
236    /// Tree-sitter's `RESERVED` rule marks an inner production as a
237    /// reserved-word context: the parser excludes the listed identifiers
238    /// from being treated as the inner symbol. The `context_name`
239    /// metadata names the reserved-word set; the emitter does not need
240    /// it (we are walking schema → bytes, not enforcing reserved-word
241    /// constraints), so we emit the inner content unchanged, the same
242    /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
243    #[serde(rename = "RESERVED")]
244    Reserved {
245        /// The wrapped content.
246        content: Box<Self>,
247        /// Name of the reserved-word context. Ignored at emit time.
248        #[allow(dead_code)]
249        #[serde(default)]
250        context_name: String,
251    },
252}
253
/// A grammar's production-rule table, deserialized from `grammar.json`.
///
/// Only the fields the emitter consumes are decoded; precedences,
/// conflicts, externals, and other parser-only metadata are ignored.
///
/// `name`, `rules`, `supertypes` and `extras` come straight from the
/// JSON document. Every `#[serde(skip)]` field starts out empty and is
/// filled in by [`Grammar::from_bytes_with_node_types`] after
/// deserialization.
#[derive(Debug, Clone, Deserialize)]
#[non_exhaustive]
pub struct Grammar {
    /// Grammar name (e.g. `"rust"`, `"typescript"`).
    #[allow(dead_code)]
    pub name: String,
    /// Map from rule name (a vertex kind on the schema side) to
    /// production. Entries are kept in lexical order so iteration
    /// is deterministic.
    pub rules: BTreeMap<String, Production>,
    /// Supertypes declared in the grammar's `supertypes` field. A
    /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
    /// references; tree-sitter parsers report a node's kind as one
    /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
    /// than the supertype name (`parameter`), so the emitter needs to
    /// know that a child kind in a subtype set should match the
    /// supertype name when a SYMBOL references it.
    ///
    /// Each entry of the JSON array may be either a bare string or an
    /// object with a string `name` key; other entries are ignored. A
    /// missing `supertypes` key yields the empty set.
    #[serde(default, deserialize_with = "deserialize_supertypes")]
    pub supertypes: std::collections::HashSet<String>,
    /// Tree-sitter `extras` rules: the named symbols (typically comments)
    /// that tree-sitter skips at parse time but records as children of the
    /// surrounding vertex. They appear nowhere in the production grammar,
    /// so the rule walker cannot reconcile them against the cursor — the
    /// emit pass therefore drains them as a side channel: at vertex entry
    /// and between REPEAT iterations any leading extras-kind edges are
    /// consumed and emitted directly.
    ///
    /// The set is populated during deserialization by collecting, from
    /// the entries of the top-level `extras` array, every
    /// `SYMBOL { name }` and every named `ALIAS { value, named: true }`.
    /// Pattern-only extras (e.g. `\s` whitespace) are not vertex kinds
    /// and are excluded. A missing `extras` key yields the empty set.
    #[serde(default, deserialize_with = "deserialize_extras")]
    pub extras: std::collections::HashSet<String>,
    /// Precomputed subtyping closure, keyed by the *satisfying* kind:
    /// `subtypes[kind]` is the set of SYMBOL names that a vertex of
    /// kind `kind` satisfies (every rule name satisfies itself). To ask
    /// "does kind `K` satisfy SYMBOL `S`?", look up `K` and test whether
    /// the resulting set contains `S`.
    ///
    /// Built once at [`Grammar::from_bytes`] time by walking each
    /// hidden rule (`_`-prefixed), declared supertype, and named
    /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs,
    /// recording the resulting edges, and then closing the relation
    /// transitively. Expansion only continues through hidden and
    /// supertype rule references, never through ordinary named rules.
    /// When `node-types.json` bytes are supplied the map is further
    /// adjusted by `augment_subtypes_from_node_types`.
    #[serde(skip)]
    pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
    /// Precomputed Yield sets: `yield_sets[rule_name]` is the set of
    /// concrete vertex kinds that can appear as the **first named
    /// child** when that rule's production is taken.
    ///
    /// Entries are computed for hidden (`_`-prefixed) rules and
    /// declared supertypes; ordinary named rules are not expanded and
    /// get no entry of their own.
    ///
    /// Defined inductively:
    /// - `Yield(SYMBOL S)` where S is hidden/supertype = `Yield(rules[S])`
    ///   (`∅` when S has no rule, or when S is already being expanded
    ///   on the current recursion path)
    /// - `Yield(SYMBOL S)` where S is concrete = `{S}`
    /// - `Yield(SEQ [])` = `{ε}`
    /// - `Yield(SEQ [M1, ...])` = `Yield(Mi)` for the first member `Mi`
    ///   whose Yield is non-empty, so leading `STRING` / `PATTERN`
    ///   members are skipped; `∅` when every member yields `∅`
    /// - `Yield(CHOICE [M1, ..., Mn])` = `⋃ Yield(Mi)`
    /// - `Yield(OPTIONAL { c })` = `Yield(c) ∪ {ε}`
    /// - `Yield(BLANK)` = `{ε}`
    /// - Wrappers (PREC*, TOKEN, FIELD, REPEAT, etc.) = `Yield(content)`
    /// - `Yield(STRING)` = `Yield(PATTERN)` = `∅`
    /// - `Yield(ALIAS { value: V, named: true })` = `{V}` (when `V` is
    ///   non-empty); any other `ALIAS` = `Yield(content)`
    ///
    /// Epsilon is represented as the empty string `""`.
    #[serde(skip)]
    pub yield_sets: std::collections::HashMap<String, std::collections::HashSet<String>>,
    /// Child kinds allowed per parent kind, derived from node-types.json.
    /// Maps parent kind to the set of ALL named child kinds that tree-sitter's
    /// parser can produce for that parent (from both `children.types` and
    /// `fields.*.types`). Used by `augment_subtypes_from_node_types` to
    /// close the grammar/parser divergence gap.
    ///
    /// Stays empty when no `node-types.json` bytes are supplied to
    /// [`Grammar::from_bytes_with_node_types`].
    #[serde(skip)]
    pub node_type_children: std::collections::HashMap<String, std::collections::HashSet<String>>,
    /// Anonymous ALIAS values for external scanner tokens. Maps external
    /// symbol name (e.g. `_ternary_qmark`) to the ALIAS value string
    /// (e.g. `"?"`). Built by scanning grammar.json rule bodies for
    /// `ALIAS { content: SYMBOL S, named: false, value: V }` where S
    /// has no grammar rule.
    #[serde(skip)]
    pub external_alias_map: std::collections::HashMap<String, String>,
    /// Rules whose `{`/`}` STRING tokens are inline delimiters (e.g.
    /// string interpolation) rather than block scopes. Identified
    /// structurally: a rule whose SEQ contains `{` and `}` but no
    /// REPEAT/REPEAT1 between them.
    #[serde(skip)]
    pub inline_brace_rules: std::collections::HashSet<String>,
}
344
345fn deserialize_supertypes<'de, D>(
346    deserializer: D,
347) -> Result<std::collections::HashSet<String>, D::Error>
348where
349    D: serde::Deserializer<'de>,
350{
351    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
352    let mut out = std::collections::HashSet::new();
353    for entry in entries {
354        match entry {
355            serde_json::Value::String(s) => {
356                out.insert(s);
357            }
358            serde_json::Value::Object(map) => {
359                if let Some(serde_json::Value::String(name)) = map.get("name") {
360                    out.insert(name.clone());
361                }
362            }
363            _ => {}
364        }
365    }
366    Ok(out)
367}
368
369fn deserialize_extras<'de, D>(
370    deserializer: D,
371) -> Result<std::collections::HashSet<String>, D::Error>
372where
373    D: serde::Deserializer<'de>,
374{
375    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
376    let mut out = std::collections::HashSet::new();
377    for entry in entries {
378        if let serde_json::Value::Object(map) = entry {
379            let ty = map.get("type").and_then(serde_json::Value::as_str);
380            match ty {
381                // SYMBOL { name: K } — the extras rule is a named symbol
382                // (typically `line_comment` / `block_comment`). The kind
383                // K appears as a real child vertex on the schema side.
384                Some("SYMBOL") => {
385                    if let Some(serde_json::Value::String(name)) = map.get("name") {
386                        out.insert(name.clone());
387                    }
388                }
389                // ALIAS { content, value: V, named: true } — the extras
390                // rule renames its content; V is the kind on the schema.
391                Some("ALIAS") => {
392                    let named = map
393                        .get("named")
394                        .and_then(serde_json::Value::as_bool)
395                        .unwrap_or(false);
396                    if named {
397                        if let Some(serde_json::Value::String(value)) = map.get("value") {
398                            out.insert(value.clone());
399                        }
400                    }
401                }
402                // PATTERN / STRING / TOKEN entries describe inter-token
403                // whitespace and have no vertex-side representation.
404                _ => {}
405            }
406        }
407    }
408    Ok(out)
409}
410
411impl Grammar {
412    /// Parse a grammar's `grammar.json` bytes.
413    ///
414    /// Builds the subtyping closure as part of construction so every
415    /// downstream lookup is O(1). The closure is the least relation
416    /// containing `(K, K)` for every rule key `K` and closed under:
417    ///
418    /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
419    ///   reach SYMBOL `K`, then `K ⊑ S`.
420    /// - supertype expansion: if `S` is in the grammar's supertypes
421    ///   block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
422    /// - alias renaming: if a rule body contains
423    ///   `ALIAS { content: SYMBOL R, value: A, named: true }` where
424    ///   `R` reaches kind `K` (or `K = R` when no further hop), then
425    ///   `A ⊑ R` and `K ⊑ A`.
426    ///
427    /// # Errors
428    ///
429    /// Returns [`ParseError::EmitFailed`] when the bytes are not a
430    /// valid `grammar.json` document.
431    pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
432        Self::from_bytes_with_node_types(protocol, bytes, None)
433    }
434
435    /// Parse a grammar from both `grammar.json` and optionally
436    /// `node-types.json` bytes.
437    ///
438    /// # Errors
439    ///
440    /// Returns [`ParseError::EmitFailed`] when `grammar_bytes` is
441    /// not a valid `grammar.json` document.
442    pub fn from_bytes_with_node_types(
443        protocol: &str,
444        grammar_bytes: &[u8],
445        node_types_bytes: Option<&[u8]>,
446    ) -> Result<Self, ParseError> {
447        let mut grammar: Self =
448            serde_json::from_slice(grammar_bytes).map_err(|e| ParseError::EmitFailed {
449                protocol: protocol.to_owned(),
450                reason: format!("grammar.json deserialization failed: {e}"),
451            })?;
452        grammar.subtypes = compute_subtype_closure(&grammar);
453        if let Some(nt_bytes) = node_types_bytes {
454            grammar.node_type_children = build_node_type_children(nt_bytes);
455            augment_subtypes_from_node_types(&mut grammar);
456        }
457        grammar.external_alias_map = build_external_alias_map(&grammar);
458        grammar.inline_brace_rules = identify_inline_brace_rules(&grammar);
459        grammar.yield_sets = compute_yield_sets(&grammar);
460        Ok(grammar)
461    }
462}
463
464/// Compute the subtyping relation as a forward-indexed map from a
465/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
466fn compute_subtype_closure(
467    grammar: &Grammar,
468) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
469    use std::collections::{HashMap, HashSet};
470    // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
471    // recorded whenever Y is reached by walking the grammar's
472    // ALIAS / hidden-rule / supertype dispatch from a position where
473    // K is the actual vertex kind.
474    let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
475    for name in grammar.rules.keys() {
476        subtypes
477            .entry(name.clone())
478            .or_default()
479            .insert(name.clone());
480    }
481
482    // First pass: collect the immediate "satisfies" edges from each
483    // expandable rule (hidden, supertype) to the kinds reachable by
484    // walking its body, plus alias edges.
485    fn walk<'g>(
486        grammar: &'g Grammar,
487        production: &'g Production,
488        visited: &mut HashSet<&'g str>,
489        out: &mut HashSet<String>,
490    ) {
491        match production {
492            Production::Symbol { name } => {
493                // Direct subtype.
494                out.insert(name.clone());
495                // Continue expansion through hidden / supertype rules
496                // so the closure traverses pass-through dispatch.
497                let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
498                if expand && visited.insert(name.as_str()) {
499                    if let Some(rule) = grammar.rules.get(name) {
500                        walk(grammar, rule, visited, out);
501                    }
502                }
503            }
504            Production::Choice { members } | Production::Seq { members } => {
505                for m in members {
506                    walk(grammar, m, visited, out);
507                }
508            }
509            Production::Alias {
510                content,
511                named,
512                value,
513            } => {
514                if *named && !value.is_empty() {
515                    out.insert(value.clone());
516                }
517                walk(grammar, content, visited, out);
518            }
519            Production::Repeat { content }
520            | Production::Repeat1 { content }
521            | Production::Optional { content }
522            | Production::Field { content, .. }
523            | Production::Token { content }
524            | Production::ImmediateToken { content }
525            | Production::Prec { content, .. }
526            | Production::PrecLeft { content, .. }
527            | Production::PrecRight { content, .. }
528            | Production::PrecDynamic { content, .. }
529            | Production::Reserved { content, .. } => {
530                walk(grammar, content, visited, out);
531            }
532            _ => {}
533        }
534    }
535
536    for (name, rule) in &grammar.rules {
537        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
538        if !expand {
539            continue;
540        }
541        let mut visited: HashSet<&str> = HashSet::new();
542        visited.insert(name.as_str());
543        let mut reachable: HashSet<String> = HashSet::new();
544        walk(grammar, rule, &mut visited, &mut reachable);
545        for kind in &reachable {
546            subtypes
547                .entry(kind.clone())
548                .or_default()
549                .insert(name.clone());
550        }
551    }
552
553    // Aliases: scan every rule body for ALIAS { content, value }
554    // declarations. The kinds reachable from `content` satisfy
555    // `value`, AND (by construction) `value` satisfies the
556    // surrounding rule. Walking the ENTIRE grammar once captures
557    // every alias site, irrespective of which rule introduces it.
558    fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
559        match production {
560            Production::Alias {
561                content,
562                named,
563                value,
564            } => {
565                if *named && !value.is_empty() {
566                    out.push((value.clone(), content.as_ref()));
567                }
568                collect_aliases(content, out);
569            }
570            Production::Choice { members } | Production::Seq { members } => {
571                for m in members {
572                    collect_aliases(m, out);
573                }
574            }
575            Production::Repeat { content }
576            | Production::Repeat1 { content }
577            | Production::Optional { content }
578            | Production::Field { content, .. }
579            | Production::Token { content }
580            | Production::ImmediateToken { content }
581            | Production::Prec { content, .. }
582            | Production::PrecLeft { content, .. }
583            | Production::PrecRight { content, .. }
584            | Production::PrecDynamic { content, .. }
585            | Production::Reserved { content, .. } => {
586                collect_aliases(content, out);
587            }
588            _ => {}
589        }
590    }
591    let mut aliases: Vec<(String, &Production)> = Vec::new();
592    for rule in grammar.rules.values() {
593        collect_aliases(rule, &mut aliases);
594    }
595    for (alias_value, content) in aliases {
596        let mut visited: HashSet<&str> = HashSet::new();
597        let mut reachable: HashSet<String> = HashSet::new();
598        walk(grammar, content, &mut visited, &mut reachable);
599        // Aliased value satisfies itself and is satisfied by every
600        // kind its content can reach.
601        subtypes
602            .entry(alias_value.clone())
603            .or_default()
604            .insert(alias_value.clone());
605        for kind in reachable {
606            subtypes
607                .entry(kind)
608                .or_default()
609                .insert(alias_value.clone());
610        }
611    }
612
613    // Transitive close: `K ⊑ A` and `A ⊑ B` implies `K ⊑ B`. Iterate
614    // a few rounds; the relation is small so a quick fixed-point
615    // suffices in practice.
616    for _ in 0..8 {
617        let snapshot = subtypes.clone();
618        let mut changed = false;
619        for (kind, supers) in &snapshot {
620            let extra: HashSet<String> = supers
621                .iter()
622                .flat_map(|s| snapshot.get(s).cloned().unwrap_or_default())
623                .collect();
624            let entry = subtypes.entry(kind.clone()).or_default();
625            for s in extra {
626                if entry.insert(s) {
627                    changed = true;
628                }
629            }
630        }
631        if !changed {
632            break;
633        }
634    }
635
636    subtypes
637}
638
639/// Compute the Yield set for every rule in the grammar.
640///
641/// `Yield(P)` is the set of concrete vertex kinds that can appear as
642/// the first named child when production P is taken. See the
643/// `Grammar::yield_sets` doc comment for the inductive definition.
644fn compute_yield_sets(
645    grammar: &Grammar,
646) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
647    let mut cache: std::collections::HashMap<String, std::collections::HashSet<String>> =
648        std::collections::HashMap::new();
649    for (name, rule) in &grammar.rules {
650        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
651        if !expand {
652            continue;
653        }
654        if cache.contains_key(name) {
655            continue;
656        }
657        let mut visited = std::collections::HashSet::new();
658        let ys = yield_of_production(grammar, rule, &mut visited, &mut cache);
659        cache.insert(name.clone(), ys);
660    }
661    cache
662}
663
664/// Compute the Yield set of an arbitrary production node.
665///
666/// Uses `cache` (the partially-built `yield_sets` map) as
667/// memoization. `visited` tracks the current recursion path to
668/// detect cycles through hidden/supertype rules; a cycle returns ∅
669/// (a cycle that never passes through a concrete named symbol
670/// cannot produce a first child).
671fn yield_of_production(
672    grammar: &Grammar,
673    production: &Production,
674    visited: &mut std::collections::HashSet<String>,
675    cache: &mut std::collections::HashMap<String, std::collections::HashSet<String>>,
676) -> std::collections::HashSet<String> {
677    match production {
678        Production::Symbol { name } => {
679            let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
680            if !expand {
681                let mut set = std::collections::HashSet::new();
682                set.insert(name.clone());
683                return set;
684            }
685            if let Some(cached) = cache.get(name) {
686                return cached.clone();
687            }
688            {
689                if !visited.insert(name.clone()) {
690                    return std::collections::HashSet::new();
691                }
692                let result = if let Some(rule) = grammar.rules.get(name) {
693                    yield_of_production(grammar, rule, visited, cache)
694                } else {
695                    std::collections::HashSet::new()
696                };
697                visited.remove(name);
698                cache.insert(name.clone(), result.clone());
699                result
700            }
701        }
702        Production::Alias {
703            content,
704            named,
705            value,
706        } => {
707            if *named && !value.is_empty() {
708                let mut set = std::collections::HashSet::new();
709                set.insert(value.clone());
710                set
711            } else {
712                yield_of_production(grammar, content, visited, cache)
713            }
714        }
715        Production::Seq { members } => {
716            if members.is_empty() {
717                let mut set = std::collections::HashSet::new();
718                set.insert(String::new());
719                set
720            } else {
721                // Walk the SEQ members left-to-right, returning the
722                // Yield of the first member that can produce a named
723                // child. STRING and PATTERN yield ∅ (anonymous
724                // tokens); skip them to reach the first named-child-
725                // producing position.  This handles hidden rules like
726                // `_initializer = SEQ ["=", FIELD { value, ... }]`
727                // where the leading "=" is a STRING.
728                for m in members {
729                    let ys = yield_of_production(grammar, m, visited, cache);
730                    if !ys.is_empty() {
731                        return ys;
732                    }
733                }
734                std::collections::HashSet::new()
735            }
736        }
737        Production::Choice { members } => {
738            let mut union = std::collections::HashSet::new();
739            for m in members {
740                union.extend(yield_of_production(grammar, m, visited, cache));
741            }
742            union
743        }
744        Production::Optional { content } => {
745            let mut set = yield_of_production(grammar, content, visited, cache);
746            set.insert(String::new());
747            set
748        }
749        Production::Blank => {
750            let mut set = std::collections::HashSet::new();
751            set.insert(String::new());
752            set
753        }
754        Production::String { .. } | Production::Pattern { .. } => std::collections::HashSet::new(),
755        Production::Repeat { content }
756        | Production::Repeat1 { content }
757        | Production::Field { content, .. }
758        | Production::Token { content }
759        | Production::ImmediateToken { content }
760        | Production::Prec { content, .. }
761        | Production::PrecLeft { content, .. }
762        | Production::PrecRight { content, .. }
763        | Production::PrecDynamic { content, .. }
764        | Production::Reserved { content, .. } => {
765            yield_of_production(grammar, content, visited, cache)
766        }
767    }
768}
769
770// ═══════════════════════════════════════════════════════════════════
771// node-types.json integration
772// ═══════════════════════════════════════════════════════════════════
773
774/// Parse node-types.json and build a map from parent kind to the set
775/// of all named child kinds the parser can produce for that parent.
776fn build_node_type_children(
777    nt_bytes: &[u8],
778) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
779    let node_types: Vec<crate::theory_extract::NodeType> = match serde_json::from_slice(nt_bytes) {
780        Ok(v) => v,
781        Err(_) => return std::collections::HashMap::new(),
782    };
783    let mut map: std::collections::HashMap<String, std::collections::HashSet<String>> =
784        std::collections::HashMap::new();
785    for entry in &node_types {
786        if !entry.named {
787            continue;
788        }
789        let mut child_kinds = std::collections::HashSet::new();
790        for field_value in entry.fields.values() {
791            if let Some(types) = field_value.get("types").and_then(|t| t.as_array()) {
792                for t in types {
793                    if let (Some(name), Some(true)) = (
794                        t.get("type").and_then(|n| n.as_str()),
795                        t.get("named").and_then(serde_json::Value::as_bool),
796                    ) {
797                        child_kinds.insert(name.to_owned());
798                    }
799                }
800            }
801        }
802        if let Some(ref children) = entry.children {
803            for t in &children.types {
804                if t.named {
805                    child_kinds.insert(t.node_type.clone());
806                }
807            }
808        }
809        if !child_kinds.is_empty() {
810            map.insert(entry.node_type.clone(), child_kinds);
811        }
812    }
813    map
814}
815
816/// Augment `grammar.subtypes` with child-kind data from node-types.json.
817///
818/// For each parent kind P with node-type children, for each SYMBOL S
819/// referenced in P's grammar rule, for each child kind C in
820/// `node_type_children[P]`: if C does not already satisfy S, record
821/// C satisfies S. This closes the grammar/parser divergence where
822/// tree-sitter's parser produces child kinds not reachable from
823/// grammar.json's production rules.
824fn augment_subtypes_from_node_types(grammar: &mut Grammar) {
825    let pairs: Vec<(String, String)> = grammar
826        .node_type_children
827        .iter()
828        .flat_map(|(parent_kind, allowed_children)| {
829            let symbols: Vec<&str> = grammar
830                .rules
831                .get(parent_kind)
832                .map(|rule| referenced_symbols(rule))
833                .unwrap_or_default();
834            let mut out = Vec::new();
835            for child_kind in allowed_children {
836                let already_satisfies_some = symbols
837                    .iter()
838                    .any(|s| kind_satisfies_symbol(grammar, Some(child_kind), s));
839                if already_satisfies_some {
840                    continue;
841                }
842                for sym_name in &symbols {
843                    out.push((child_kind.clone(), (*sym_name).to_owned()));
844                }
845            }
846            out
847        })
848        .collect();
849    for (child_kind, sym_name) in pairs {
850        grammar
851            .subtypes
852            .entry(child_kind)
853            .or_default()
854            .insert(sym_name);
855    }
856}
857
858/// Build a map from external scanner symbol names to their anonymous
859/// ALIAS values by walking every rule body in the grammar.
860fn build_external_alias_map(grammar: &Grammar) -> std::collections::HashMap<String, String> {
861    let mut map = std::collections::HashMap::new();
862    fn walk(
863        grammar: &Grammar,
864        prod: &Production,
865        map: &mut std::collections::HashMap<String, String>,
866    ) {
867        match prod {
868            Production::Alias {
869                content,
870                named,
871                value,
872            } => {
873                if !*named && !value.is_empty() {
874                    if let Production::Symbol { name } = content.as_ref() {
875                        if name.starts_with('_') && !grammar.rules.contains_key(name) {
876                            map.entry(name.clone()).or_insert_with(|| value.clone());
877                        }
878                    }
879                }
880                walk(grammar, content, map);
881            }
882            Production::Choice { members } | Production::Seq { members } => {
883                for m in members {
884                    walk(grammar, m, map);
885                }
886            }
887            Production::Repeat { content }
888            | Production::Repeat1 { content }
889            | Production::Optional { content }
890            | Production::Field { content, .. }
891            | Production::Token { content }
892            | Production::ImmediateToken { content }
893            | Production::Prec { content, .. }
894            | Production::PrecLeft { content, .. }
895            | Production::PrecRight { content, .. }
896            | Production::PrecDynamic { content, .. }
897            | Production::Reserved { content, .. } => walk(grammar, content, map),
898            _ => {}
899        }
900    }
901    for rule in grammar.rules.values() {
902        walk(grammar, rule, &mut map);
903    }
904    map
905}
906
907/// Identify rules whose `{`/`}` tokens are inline delimiters (e.g.
908/// interpolation) rather than block scopes. A rule is inline-brace
909/// iff its production SEQ contains both an opening brace token and
910/// `}`, and the members between them contain no REPEAT/REPEAT1
911/// (which would indicate a statement-list block).
912fn identify_inline_brace_rules(grammar: &Grammar) -> std::collections::HashSet<String> {
913    fn is_inline_brace_body(prod: &Production) -> bool {
914        match prod {
915            Production::Seq { members } => {
916                let open_idx = members.iter().position(|m| match m {
917                    Production::String { value } => value.contains('{'),
918                    _ => false,
919                });
920                let close_idx = members
921                    .iter()
922                    .rposition(|m| matches!(m, Production::String { value } if value == "}"));
923                if let (Some(open), Some(close)) = (open_idx, close_idx) {
924                    if open < close {
925                        let between = &members[open + 1..close];
926                        return !between.iter().any(has_repeat);
927                    }
928                }
929                false
930            }
931            Production::Prec { content, .. }
932            | Production::PrecLeft { content, .. }
933            | Production::PrecRight { content, .. }
934            | Production::PrecDynamic { content, .. }
935            | Production::Token { content }
936            | Production::ImmediateToken { content }
937            | Production::Reserved { content, .. } => is_inline_brace_body(content),
938            _ => false,
939        }
940    }
941    fn has_repeat(prod: &Production) -> bool {
942        match prod {
943            Production::Repeat { .. } | Production::Repeat1 { .. } => true,
944            Production::Choice { members } | Production::Seq { members } => {
945                members.iter().any(has_repeat)
946            }
947            Production::Prec { content, .. }
948            | Production::PrecLeft { content, .. }
949            | Production::PrecRight { content, .. }
950            | Production::PrecDynamic { content, .. }
951            | Production::Optional { content }
952            | Production::Field { content, .. }
953            | Production::Token { content }
954            | Production::ImmediateToken { content }
955            | Production::Reserved { content, .. } => has_repeat(content),
956            _ => false,
957        }
958    }
959    let mut result = std::collections::HashSet::new();
960    for (name, rule) in &grammar.rules {
961        if is_inline_brace_body(rule) {
962            result.insert(name.clone());
963        }
964    }
965    result
966}
967
968// ═══════════════════════════════════════════════════════════════════
969// Format policy
970// ═══════════════════════════════════════════════════════════════════
971
972/// Whitespace and indentation policy applied during emission.
973///
974/// The default policy inserts a single space between adjacent tokens,
975/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
976/// boundaries. Per-language overrides (idiomatic indent width,
977/// trailing-comma rules, blank-line conventions) can ride alongside
978/// this struct in a follow-up branch; today's defaults aim only for
979/// syntactic validity.
980#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
981pub struct FormatPolicy {
982    /// Number of spaces per indent level.
983    pub indent_width: usize,
984    /// Separator inserted between adjacent terminals that the lexer
985    /// would otherwise glue together (word ↔ word, operator ↔ operator).
986    /// Default is a single space.
987    pub separator: String,
988    /// Newline byte sequence emitted after `line_break_after` tokens
989    /// and at end-of-output. Default is `"\n"`.
990    pub newline: String,
991    /// Tokens after which the walker breaks to a new line.
992    pub line_break_after: Vec<String>,
993    /// Tokens that increase indent on emission.
994    pub indent_open: Vec<String>,
995    /// Tokens that decrease indent on emission.
996    pub indent_close: Vec<String>,
997}
998
999impl Default for FormatPolicy {
1000    fn default() -> Self {
1001        Self {
1002            indent_width: 2,
1003            separator: " ".to_owned(),
1004            newline: "\n".to_owned(),
1005            line_break_after: vec![";".into(), "{".into(), "}".into()],
1006            indent_open: vec!["{".into()],
1007            indent_close: vec!["}".into()],
1008        }
1009    }
1010}
1011
1012// ═══════════════════════════════════════════════════════════════════
1013// Emitter
1014// ═══════════════════════════════════════════════════════════════════
1015
1016/// Emit a by-construction schema to source bytes.
1017///
1018/// `protocol` is the grammar / language name (used in error messages
1019/// and to label the entry point).
1020///
1021/// The walker treats `schema.entries` as the ordered list of root
1022/// vertices, falling back to a deterministic by-id ordering when
1023/// `entries` is empty. Each root is emitted using the production
1024/// associated with its kind in `grammar.rules`.
1025///
1026/// # Errors
1027///
1028/// Returns [`ParseError::EmitFailed`] when:
1029///
1030/// - the schema has no vertices
1031/// - a root vertex's kind is not a grammar rule
1032/// - a `SYMBOL` reference points at a kind with no rule and no schema
1033///   child to resolve it to
1034/// - a required `FIELD` has no corresponding edge in the schema
1035pub fn emit_pretty(
1036    protocol: &str,
1037    schema: &Schema,
1038    grammar: &Grammar,
1039    policy: &FormatPolicy,
1040) -> Result<Vec<u8>, ParseError> {
1041    let roots = collect_roots(schema);
1042    if roots.is_empty() {
1043        return Err(ParseError::EmitFailed {
1044            protocol: protocol.to_owned(),
1045            reason: "schema has no entry vertices".to_owned(),
1046        });
1047    }
1048
1049    let mut out = Output::new(policy);
1050    for (i, root) in roots.iter().enumerate() {
1051        if i > 0 {
1052            out.newline();
1053        }
1054        emit_vertex(protocol, schema, grammar, root, &mut out)?;
1055    }
1056    Ok(out.finish())
1057}
1058
1059fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
1060    if !schema.entries.is_empty() {
1061        return schema
1062            .entries
1063            .iter()
1064            .filter(|name| schema.vertices.contains_key(*name))
1065            .collect();
1066    }
1067
1068    // Fallback: every vertex that is not the target of any structural edge
1069    // (sorted by id for determinism).
1070    let mut targets: std::collections::HashSet<&panproto_gat::Name> =
1071        std::collections::HashSet::new();
1072    for edge in schema.edges.keys() {
1073        targets.insert(&edge.tgt);
1074    }
1075    let mut roots: Vec<&panproto_gat::Name> = schema
1076        .vertices
1077        .keys()
1078        .filter(|name| !targets.contains(name))
1079        .collect();
1080    roots.sort();
1081    roots
1082}
1083
1084fn emit_vertex(
1085    protocol: &str,
1086    schema: &Schema,
1087    grammar: &Grammar,
1088    vertex_id: &panproto_gat::Name,
1089    out: &mut Output<'_>,
1090) -> Result<(), ParseError> {
1091    let vertex = schema
1092        .vertices
1093        .get(vertex_id)
1094        .ok_or_else(|| ParseError::EmitFailed {
1095            protocol: protocol.to_owned(),
1096            reason: format!("vertex '{vertex_id}' not found"),
1097        })?;
1098
1099    // Leaf shortcut: a vertex carrying a `literal-value` constraint
1100    // and no outgoing structural edges is a terminal token. Emit the
1101    // captured value directly. This handles identifiers, numeric
1102    // literals, and string literals that the parser stored as
1103    // `literal-value` even on by-construction schemas.
1104    if let Some(literal) = literal_value(schema, vertex_id) {
1105        if children_for(schema, vertex_id).is_empty() {
1106            out.token(literal);
1107            return Ok(());
1108        }
1109    }
1110
1111    let kind = vertex.kind.as_ref();
1112    let edges = children_for(schema, vertex_id);
1113    if let Some(rule) = grammar.rules.get(kind) {
1114        let old_suppress = out.suppress_brace_indent;
1115        if grammar.inline_brace_rules.contains(kind) {
1116            out.suppress_brace_indent = true;
1117        }
1118        let mut cursor = ChildCursor::new(&edges);
1119        emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out)?;
1120        // Drain any extras left after the rule walk completed; tree-sitter
1121        // may record trailing comments as children of the surrounding
1122        // vertex (i.e. after the last structural child the rule matched).
1123        drain_extras(protocol, schema, grammar, &mut cursor, out)?;
1124        out.suppress_brace_indent = old_suppress;
1125        return Ok(());
1126    }
1127
1128    // No rule for this kind. The parser produced it via an ALIAS
1129    // (tree-sitter's `alias($.something, $.actual_kind)`) or via an
1130    // external scanner (e.g. YAML's `document` root). Fall back to
1131    // walking the children directly so the inner content survives;
1132    // surrounding tokens — whose only source is the missing rule —
1133    // are necessarily absent.
1134    for edge in &edges {
1135        emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
1136    }
1137    Ok(())
1138}
1139
1140/// Linear cursor over a vertex's outgoing edges, used to thread
1141/// children through a production rule without double-consuming them.
1142struct ChildCursor<'a> {
1143    edges: &'a [&'a Edge],
1144    consumed: Vec<bool>,
1145}
1146
1147impl<'a> ChildCursor<'a> {
1148    fn new(edges: &'a [&'a Edge]) -> Self {
1149        Self {
1150            edges,
1151            consumed: vec![false; edges.len()],
1152        }
1153    }
1154
1155    /// Take the next unconsumed edge whose kind equals `field_name`.
1156    fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
1157        for (i, edge) in self.edges.iter().enumerate() {
1158            if !self.consumed[i] && edge.kind.as_ref() == field_name {
1159                self.consumed[i] = true;
1160                return Some(edge);
1161            }
1162        }
1163        None
1164    }
1165
1166    /// Whether any unconsumed edge satisfies `predicate`. Used by the
1167    /// unit tests; the live emit path went through `has_matching` on
1168    /// each alternative until cursor-driven dispatch was rewritten to
1169    /// pick the first-unconsumed-edge's kind directly.
1170    #[cfg(test)]
1171    fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
1172        self.edges
1173            .iter()
1174            .enumerate()
1175            .any(|(i, edge)| !self.consumed[i] && predicate(edge))
1176    }
1177
1178    /// Take the next unconsumed edge whose target vertex satisfies
1179    /// `predicate`. Returns the edge and the underlying production
1180    /// resolution path is the caller's job.
1181    fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
1182        for (i, edge) in self.edges.iter().enumerate() {
1183            if !self.consumed[i] && predicate(edge) {
1184                self.consumed[i] = true;
1185                return Some(edge);
1186            }
1187        }
1188        None
1189    }
1190}
1191
thread_local! {
    /// Number of `emit_production` calls currently active on this
    /// thread. `emit_production` bumps it on entry and reports an error
    /// once it exceeds its recursion limit.
    static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
    /// Name of the FIELD whose body is currently being walked, or
    /// `None` outside any field. While it is set, a SYMBOL anywhere
    /// inside the field's content (under SEQ, CHOICE, REPEAT, OPTIONAL)
    /// takes its edge from the *outer* cursor by edge kind instead of
    /// searching by symbol match. This matters for shapes such as
    /// `field('args', commaSep1($.X))`, which expands to
    /// `FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X))))`: without the
    /// context only the first field edge would be emitted, because the
    /// inner REPEAT would search a cursor that holds no further sibling
    /// field edges and stop after one iteration. With it, every
    /// field-named edge is picked up regardless of nesting depth.
    static EMIT_FIELD_CONTEXT: std::cell::RefCell<Option<String>> =
        const { std::cell::RefCell::new(None) };
    /// The `(vertex_id, rule_name)` pairs whose rule walk is in progress
    /// further up the call stack. A SYMBOL that resolves to a pair
    /// already in this set closes a μ-binder cycle; the rule is unfolded
    /// only once, and the repeated visit contributes the empty sequence
    /// (the unit of the free token monoid). Grammars that hit this
    /// include YAML (`stream` ⊃ the mutually recursive `_b_blk_*` chain)
    /// and Rust (`_expression` ⊃ `binary_expression` ⊃ `_expression`).
    static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
        std::cell::RefCell::default();
}
1221
1222/// RAII guard that restores the prior `EMIT_FIELD_CONTEXT` value on drop.
1223struct FieldContextGuard(Option<String>);
1224
1225impl Drop for FieldContextGuard {
1226    fn drop(&mut self) {
1227        EMIT_FIELD_CONTEXT.with(|f| *f.borrow_mut() = self.0.take());
1228    }
1229}
1230
1231fn push_field_context(name: &str) -> FieldContextGuard {
1232    let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().replace(name.to_owned()));
1233    FieldContextGuard(prev)
1234}
1235
1236/// Clear the field context for the duration of a child-context walk.
1237/// The child's own production has its own FIELDs that set their own
1238/// context; the outer field hint must not leak into them.
1239fn clear_field_context() -> FieldContextGuard {
1240    let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().take());
1241    FieldContextGuard(prev)
1242}
1243
1244fn current_field_context() -> Option<String> {
1245    EMIT_FIELD_CONTEXT.with(|f| f.borrow().clone())
1246}
1247
1248/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
1249/// pushed before recursion and popped after, so any SYMBOL inside
1250/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
1251/// returns the empty sequence (μ X . body unfolds once).
1252fn walk_in_mu_frame(
1253    protocol: &str,
1254    schema: &Schema,
1255    grammar: &Grammar,
1256    vertex_id: &panproto_gat::Name,
1257    rule_name: &str,
1258    rule: &Production,
1259    cursor: &mut ChildCursor<'_>,
1260    out: &mut Output<'_>,
1261) -> Result<(), ParseError> {
1262    let key = (vertex_id.to_string(), rule_name.to_owned());
1263    let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
1264    if !inserted {
1265        // We are already walking this rule at this vertex deeper in
1266        // the call stack. The coinductive μ-fixed-point reading
1267        // returns the empty sequence here; the surrounding
1268        // production resumes after the SYMBOL.
1269        return Ok(());
1270    }
1271    let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
1272    EMIT_MU_FRAMES.with(|frames| {
1273        frames.borrow_mut().remove(&key);
1274    });
1275    result
1276}
1277
1278fn emit_production(
1279    protocol: &str,
1280    schema: &Schema,
1281    grammar: &Grammar,
1282    vertex_id: &panproto_gat::Name,
1283    production: &Production,
1284    cursor: &mut ChildCursor<'_>,
1285    out: &mut Output<'_>,
1286) -> Result<(), ParseError> {
1287    let depth = EMIT_DEPTH.with(|d| {
1288        let v = d.get() + 1;
1289        d.set(v);
1290        v
1291    });
1292    if depth > 500 {
1293        EMIT_DEPTH.with(|d| d.set(d.get() - 1));
1294        return Err(ParseError::EmitFailed {
1295            protocol: protocol.to_owned(),
1296            reason: format!(
1297                "emit_production recursion >500 (likely a cyclic grammar; \
1298                     vertex='{vertex_id}')"
1299            ),
1300        });
1301    }
1302    drain_extras(protocol, schema, grammar, cursor, out)?;
1303    let result = emit_production_inner(
1304        protocol, schema, grammar, vertex_id, production, cursor, out,
1305    );
1306    EMIT_DEPTH.with(|d| d.set(d.get() - 1));
1307    result
1308}
1309
1310/// Consume and emit every leading edge on `cursor` whose target kind
1311/// is in `grammar.extras` (typically `line_comment` / `block_comment`).
1312/// Extras live outside the production grammar — tree-sitter skips them
1313/// at parse time and records them as children of the surrounding
1314/// vertex — so the rule walker cannot reconcile them against the
1315/// cursor. Draining them as a side channel preserves their content in
1316/// the output without confusing the structural matchers.
1317fn drain_extras(
1318    protocol: &str,
1319    schema: &Schema,
1320    grammar: &Grammar,
1321    cursor: &mut ChildCursor<'_>,
1322    out: &mut Output<'_>,
1323) -> Result<(), ParseError> {
1324    if grammar.extras.is_empty() {
1325        return Ok(());
1326    }
1327    loop {
1328        let next_extra: Option<usize> = cursor
1329            .edges
1330            .iter()
1331            .enumerate()
1332            .find(|(i, _)| !cursor.consumed[*i])
1333            .and_then(|(i, edge)| {
1334                let kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref())?;
1335                if grammar.extras.contains(kind) {
1336                    Some(i)
1337                } else {
1338                    None
1339                }
1340            });
1341        let Some(idx) = next_extra else {
1342            return Ok(());
1343        };
1344        cursor.consumed[idx] = true;
1345        let target = &cursor.edges[idx].tgt;
1346        emit_vertex(protocol, schema, grammar, target, out)?;
1347    }
1348}
1349
1350fn emit_production_inner(
1351    protocol: &str,
1352    schema: &Schema,
1353    grammar: &Grammar,
1354    vertex_id: &panproto_gat::Name,
1355    production: &Production,
1356    cursor: &mut ChildCursor<'_>,
1357    out: &mut Output<'_>,
1358) -> Result<(), ParseError> {
1359    match production {
1360        Production::String { value } => {
1361            out.token(value);
1362            Ok(())
1363        }
1364        Production::Pattern { value } => {
1365            if let Some(literal) = literal_value(schema, vertex_id) {
1366                out.token(literal);
1367            } else if is_newline_like_pattern(value) {
1368                // Patterns like `\r?\n`, `\n`, `\r\n` are the structural
1369                // newline tokens grammars use to separate top-level
1370                // statements (csound's `_new_line`, ABC's line-end, etc.).
1371                // Emitting them through the placeholder fallback rendered
1372                // the bare `_` sentinel between siblings; route them to
1373                // the layout pass's line-break instead so the output
1374                // re-parses.
1375                out.newline();
1376            } else if is_whitespace_only_pattern(value) {
1377                // `\s+`, `[ \t]+` and friends are interstitial whitespace
1378                // tokens. Emit nothing: the layout pass inserts the
1379                // policy separator between adjacent Lits if needed.
1380            } else {
1381                out.token(&placeholder_for_pattern(value));
1382            }
1383            Ok(())
1384        }
1385        Production::Blank => Ok(()),
1386        Production::Symbol { name } => {
1387            // Inside a FIELD body, a SYMBOL consumes by field-name on
1388            // the outer cursor rather than searching by symbol-match.
1389            // This covers the simple `FIELD(SYMBOL X)` case as well as
1390            // every nesting under FIELD that contains SYMBOLs (SEQ,
1391            // CHOICE, REPEAT, OPTIONAL, ALIAS). Without the override,
1392            // shapes like `field('args', commaSep1($.X))` consume one
1393            // field edge in the FIELD handler and then the REPEAT
1394            // inside SEQ searches the consumed child's cursor — where
1395            // no sibling field edges sit — and breaks after one
1396            // iteration.
1397            if let Some(field) = current_field_context() {
1398                if let Some(edge) = cursor.take_field(&field) {
1399                    return emit_in_child_context(
1400                        protocol, schema, grammar, &edge.tgt, production, out,
1401                    );
1402                }
1403                // No matching field-named edge left on the outer
1404                // cursor. Surface nothing; the surrounding REPEAT /
1405                // OPTIONAL / CHOICE backtracks the literal tokens it
1406                // emitted on this iteration when it sees no progress.
1407                return Ok(());
1408            }
1409            if name.starts_with('_') {
1410                // Hidden rule: not a vertex kind on the schema side.
1411                // Inline-expand the rule body so its children take
1412                // edges from the current cursor, instead of trying to
1413                // take a single child edge that "satisfies" the
1414                // hidden rule and discarding the rest of the body
1415                // (which would drop tokens like `=` and the trailing
1416                // value SYMBOL inside e.g. TOML's `_inline_pair`).
1417                //
1418                // Wrapped in a μ-frame so a hidden rule that
1419                // references its own kind cyclically (or another
1420                // hidden rule that closes the cycle) unfolds once
1421                // and then collapses to the empty sequence at the
1422                // second visit, rather than blowing the stack.
1423                if let Some(rule) = grammar.rules.get(name) {
1424                    walk_in_mu_frame(
1425                        protocol, schema, grammar, vertex_id, name, rule, cursor, out,
1426                    )
1427                } else {
1428                    // External hidden rule (declared in the
1429                    // grammar's `externals` block, scanned by C code,
1430                    // not listed in `rules`). Heuristic fallback by
1431                    // name:
1432                    //
1433                    // - `_indent` / `*_indent`: open an indent block.
1434                    //   Indent-based grammars (Python, YAML, qvr)
1435                    //   declare an `_indent` external scanner before
1436                    //   the body of a block-bodied declaration; the
1437                    //   emitted output is unparseable without the
1438                    //   corresponding indentation jump.
1439                    // - `_dedent` / `*_dedent`: close the matching
1440                    //   indent block.
1441                    // - `_newline` / `*_line_ending` / `*_or_eof`:
1442                    //   universally newline-or-empty; emitting a
1443                    //   single newline is the right default for
1444                    //   grammars like TOML whose `pair` SEQ trails
1445                    //   into `_line_ending_or_eof`.
1446                    //
1447                    // Check the precomputed alias map first: if this
1448                    // external token appears as the content of an
1449                    // anonymous ALIAS elsewhere in the grammar, emit
1450                    // the alias value as the token text.
1451                    if let Some(alias_value) = grammar.external_alias_map.get(name) {
1452                        out.token(alias_value);
1453                        return Ok(());
1454                    }
1455                    if name == "_indent" || name.ends_with("_indent") {
1456                        out.indent_open();
1457                    } else if name == "_dedent" || name.ends_with("_dedent") {
1458                        out.indent_close();
1459                    } else if name.contains("line_ending")
1460                        || name.contains("newline")
1461                        || name.ends_with("_or_eof")
1462                    {
1463                        out.newline();
1464                    } else if name.contains("semicolon") {
1465                        out.token(";");
1466                    }
1467                    Ok(())
1468                }
1469            } else if let Some(edge) = { take_symbol_match(grammar, schema, cursor, name) } {
1470                // For supertype / hidden-rule dispatch the child's
1471                // own kind names the actual production to walk
1472                // (`child.kind` IS the subtype). For ALIAS the
1473                // dependent-optic context is carried by the
1474                // surrounding `Production::Alias` branch, which calls
1475                // `emit_aliased_child` directly; we don't reach here
1476                // for that case. So walking `grammar.rules[child.kind]`
1477                // via `emit_vertex` is correct: the dependent-optic
1478                // path is preserved at every site where it actually
1479                // diverges from `child.kind`.
1480                emit_vertex(protocol, schema, grammar, &edge.tgt, out)
1481            } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
1482                let rule = grammar
1483                    .rules
1484                    .get(name)
1485                    .ok_or_else(|| ParseError::EmitFailed {
1486                        protocol: protocol.to_owned(),
1487                        reason: format!("no production for SYMBOL '{name}'"),
1488                    })?;
1489                // Self-reference (`X = ... SYMBOL X ...`): wrap in a
1490                // μ-frame so re-entry collapses to the empty sequence.
1491                walk_in_mu_frame(
1492                    protocol, schema, grammar, vertex_id, name, rule, cursor, out,
1493                )
1494            } else {
1495                // Named rule with no matching child: emit nothing and
1496                // let the surrounding CHOICE / OPTIONAL / REPEAT
1497                // resolve the absence.
1498                Ok(())
1499            }
1500        }
1501        Production::Seq { members } => {
1502            for member in members {
1503                emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
1504            }
1505            Ok(())
1506        }
1507        Production::Choice { members } => {
1508            if let Some(matched) =
1509                pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
1510            {
1511                emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
1512            } else {
1513                Ok(())
1514            }
1515        }
1516        Production::Repeat { content } | Production::Repeat1 { content } => {
1517            // Detect a "separator-leading SEQ" iteration body: SEQ whose
1518            // first member is a CHOICE containing BLANK (or an OPTIONAL),
1519            // i.e. the source-level separator between two iterations is
1520            // syntactically optional. When the chosen alternative for
1521            // that separator slot emits zero content tokens at runtime,
1522            // there was no source-level separator between this iteration
1523            // and the previous one; the layout pass must suppress its
1524            // policy separator to match the source's tight adjacency.
1525            //
1526            // Categorical reading: REPEAT body `B = SEQ(SEP, BODY)` is
1527            // the pullback of two halves. The bytes emitted in iteration
1528            // k+1 are a concatenation of `SEP_k+1` and `BODY_k+1`; if
1529            // `SEP_k+1` is the empty word, the concatenation of
1530            // `BODY_k` and `BODY_k+1` must remain a single contiguous
1531            // span. Hence the NoSpace marker.
1532            let separator_leading_seq: Option<&[Production]> = match content.as_ref() {
1533                Production::Seq { members } if members.len() >= 2 => {
1534                    let first = &members[0];
1535                    let is_separator_slot = match first {
1536                        Production::Choice { members } => {
1537                            members.iter().any(|m| matches!(m, Production::Blank))
1538                        }
1539                        Production::Optional { .. } => true,
1540                        _ => false,
1541                    };
1542                    if is_separator_slot {
1543                        Some(members.as_slice())
1544                    } else {
1545                        None
1546                    }
1547                }
1548                _ => None,
1549            };
1550
1551            let mut emitted_any = false;
1552            loop {
1553                let cursor_snap = cursor.consumed.clone();
1554                let out_snap = out.snapshot();
1555                let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1556                let result: Result<(), ParseError> =
1557                    if let Some(seq_members) = separator_leading_seq {
1558                        // Emit the separator slot first and observe
1559                        // whether it contributed any Lit. If not, push
1560                        // a NoSpace marker before walking the remaining
1561                        // SEQ members. The OutputSnapshot here covers
1562                        // only the separator's emission window.
1563                        let pre_sep = out.snapshot();
1564                        let sep_result = emit_production(
1565                            protocol,
1566                            schema,
1567                            grammar,
1568                            vertex_id,
1569                            &seq_members[0],
1570                            cursor,
1571                            out,
1572                        );
1573                        match sep_result {
1574                            Err(e) => Err(e),
1575                            Ok(()) => {
1576                                if !out.lit_emitted_since(pre_sep) {
1577                                    out.no_space();
1578                                }
1579                                let mut rest_result = Ok(());
1580                                for member in &seq_members[1..] {
1581                                    rest_result = emit_production(
1582                                        protocol, schema, grammar, vertex_id, member, cursor, out,
1583                                    );
1584                                    if rest_result.is_err() {
1585                                        break;
1586                                    }
1587                                }
1588                                rest_result
1589                            }
1590                        }
1591                    } else {
1592                        emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1593                    };
1594                let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1595                if result.is_err() || consumed_after == consumed_before {
1596                    cursor.consumed = cursor_snap;
1597                    out.restore(out_snap);
1598                    break;
1599                }
1600                emitted_any = true;
1601            }
1602            if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
1603                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
1604            }
1605            Ok(())
1606        }
1607        Production::Optional { content } => {
1608            let cursor_snap = cursor.consumed.clone();
1609            let out_snap = out.snapshot();
1610            let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
1611            let result =
1612                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
1613            // OPTIONAL is a backtracking site: if the inner production
1614            // errored *or* made no progress without leaving a witness
1615            // constraint, restore both cursor and output to their
1616            // pre-attempt state. Mirrors `Repeat`'s loop body.
1617            if result.is_err() {
1618                cursor.consumed = cursor_snap;
1619                out.restore(out_snap);
1620                return result;
1621            }
1622            let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
1623            if consumed_after == consumed_before
1624                && !has_relevant_constraint(content, schema, vertex_id)
1625            {
1626                cursor.consumed = cursor_snap;
1627                out.restore(out_snap);
1628            }
1629            Ok(())
1630        }
1631        Production::Field { name, content } => {
1632            // Set the field context for the duration of `content`'s
1633            // walk and emit the content against the *outer* cursor.
1634            // The SYMBOL handler picks up the context and pulls
1635            // successive `take_field(name)` edges as it encounters
1636            // SYMBOLs anywhere under `content` (under SEQ, CHOICE,
1637            // REPEAT, OPTIONAL, ALIAS — arbitrarily nested). This
1638            // subsumes the prior carve-outs for FIELD(REPEAT(...)),
1639            // FIELD(REPEAT1(...)), and the bare FIELD(SYMBOL ...)
1640            // case, and adds coverage for
1641            // `field('xs', commaSep1($.X))` which expands to
1642            // FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X)))) and
1643            // any other shape where REPEAT/REPEAT1 sits inside SEQ /
1644            // CHOICE / OPTIONAL under a FIELD. A FIELD that wraps a
1645            // non-SYMBOL production (e.g. `field('op', '+')` or
1646            // `field('op', CHOICE(STRING ...))`) still works: STRING
1647            // handlers ignore the context and emit literals
1648            // directly, so the operator token survives the round
1649            // trip.
1650            let _guard = push_field_context(name);
1651            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1652        }
1653        Production::Alias {
1654            content,
1655            named,
1656            value,
1657        } => {
1658            // A named ALIAS rewrites the parser-visible kind to
1659            // `value`. If the cursor has an unconsumed child whose
1660            // kind matches that alias name, take it and emit the
1661            // child using the alias's INNER content as the rule
1662            // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
1663            // means a `kind_x` vertex on the schema should be walked
1664            // through `real_rule`'s body, not through whatever rule
1665            // happens to be keyed under `kind_x`). This is the
1666            // dependent-optic shape: the rule the emitter walks at a
1667            // child position is determined by the parent's chosen
1668            // alias, not by the child kind alone — without it,
1669            // grammars like YAML that introduce the same kind through
1670            // many ALIAS sites lose the parent context the moment
1671            // emit_vertex is called.
1672            if *named && !value.is_empty() {
1673                if let Some(edge) = cursor.take_matching(|edge| {
1674                    schema
1675                        .vertices
1676                        .get(&edge.tgt)
1677                        .map(|v| v.kind.as_ref() == value.as_str())
1678                        .unwrap_or(false)
1679                }) {
1680                    return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
1681                }
1682            }
1683            // For anonymous aliases (named: false) whose content is an
1684            // external scanner token with no grammar rule (e.g.
1685            // JavaScript's `_ternary_qmark` aliased to `"?"`), emit the
1686            // alias value directly. The content's SYMBOL handler would
1687            // fall through the external-token heuristic and produce
1688            // nothing; the alias value IS the token text.
1689            if !*named && !value.is_empty() {
1690                if let Production::Symbol { name: sym } = content.as_ref() {
1691                    if sym.starts_with('_') && !grammar.rules.contains_key(sym) {
1692                        out.token(value);
1693                        return Ok(());
1694                    }
1695                }
1696            }
1697            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1698        }
1699        Production::Token { content }
1700        | Production::ImmediateToken { content }
1701        | Production::Prec { content, .. }
1702        | Production::PrecLeft { content, .. }
1703        | Production::PrecRight { content, .. }
1704        | Production::PrecDynamic { content, .. }
1705        | Production::Reserved { content, .. } => {
1706            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
1707        }
1708    }
1709}
1710
1711/// Take the next cursor edge whose target vertex's kind matches the
1712/// SYMBOL `name` directly or via inline expansion of a hidden rule.
1713fn take_symbol_match<'a>(
1714    grammar: &Grammar,
1715    schema: &Schema,
1716    cursor: &mut ChildCursor<'a>,
1717    name: &str,
1718) -> Option<&'a Edge> {
1719    // Prefer non-field edges (`child_of`) to avoid consuming a
1720    // field-named edge that a later FIELD handler should claim.
1721    // Field-named edges (edge.kind != "child_of") are reserved for
1722    // the FIELD production that names them; consuming one here would
1723    // steal it from its intended handler (e.g. `as_pattern`'s
1724    // `alias` field edge consumed by the leading `expression`
1725    // SYMBOL instead of the trailing FIELD "alias" handler).
1726    if let Some(edge) = cursor.take_matching(|edge| {
1727        edge.kind.as_ref() == "child_of" && {
1728            let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1729            kind_satisfies_symbol(grammar, target_kind, name)
1730        }
1731    }) {
1732        return Some(edge);
1733    }
1734    cursor.take_matching(|edge| {
1735        let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
1736        kind_satisfies_symbol(grammar, target_kind, name)
1737    })
1738}
1739
1740/// Decide whether a schema vertex of kind `target_kind` satisfies a
1741/// SYMBOL `name` reference in the grammar.
1742///
1743/// Operates as an O(1) lookup against the precomputed subtype
1744/// closure built at [`Grammar::from_bytes`]. The semantic content is
1745/// "K satisfies SYMBOL S iff K is reachable from S by walking the
1746/// grammar's hidden, supertype, and named-alias dispatch": this is
1747/// exactly the relation tree-sitter induces on `(parser-visible kind,
1748/// rule-position)` pairs.
1749fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
1750    let Some(target) = target_kind else {
1751        return false;
1752    };
1753    if target == name {
1754        return true;
1755    }
1756    grammar
1757        .subtypes
1758        .get(target)
1759        .is_some_and(|set| set.contains(name))
1760}
1761
1762/// Emit a child reached through an ALIAS production using the
1763/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
1764///
1765/// This carries the dependent-optic context across the ALIAS edge:
1766/// at the parent rule's site we know which underlying production the
1767/// alias wraps (typically `SYMBOL real_rule`), and that's the
1768/// production that should drive the emit walk on the child's
1769/// children. Looking up `grammar.rules.get(child.kind)` instead would
1770/// either fail (the renamed kind has no top-level rule, e.g. YAML's
1771/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
1772/// elsewhere in the grammar.
1773///
1774/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
1775/// says: the production walked at any vertex is determined by the
1776/// path from the root through the grammar, not by the vertex kind in
1777/// isolation. Two dispatch sites realise that invariant:
1778///
1779/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
1780///   it. Correct for supertype / hidden-rule dispatch: the child's
1781///   kind on the schema IS the subtype tree-sitter selected, so its
1782///   top-level rule is the right production to walk.
1783/// * `emit_aliased_child` threads the parent rule's `Production`
1784///   directly (the inner `content` of `Production::Alias`) and walks
1785///   it on the child's children. Correct for ALIAS dispatch: the
1786///   child's kind on the schema is the alias's `value` (a renamed
1787///   kind that may have no top-level rule), and the production to
1788///   walk is the alias's content body, supplied by the parent.
1789///
1790/// Together these cover every site where the rule-walked-at-child
1791/// diverges from `grammar.rules[child.kind]`; the recursion site for
1792/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
1793/// we do not need a richer `WalkContext` value passed by reference.
1794/// The grammar dependency is the thread.
1795fn emit_aliased_child(
1796    protocol: &str,
1797    schema: &Schema,
1798    grammar: &Grammar,
1799    child_id: &panproto_gat::Name,
1800    content: &Production,
1801    out: &mut Output<'_>,
1802) -> Result<(), ParseError> {
1803    // Leaf shortcut: if the child has a literal-value and no
1804    // structural children, emit the captured text. Identifiers and
1805    // similar terminals reach here when an ALIAS wraps a SYMBOL that
1806    // resolves to a PATTERN.
1807    if let Some(literal) = literal_value(schema, child_id) {
1808        if children_for(schema, child_id).is_empty() {
1809            out.token(literal);
1810            return Ok(());
1811        }
1812    }
1813
1814    // Clear the enclosing FIELD context so it does not leak into the
1815    // aliased child's production walk. Without this, a FIELD("alias")
1816    // containing an ALIAS whose content is SYMBOL "expression" would
1817    // cause the inner SYMBOL handler to pull by field name "alias"
1818    // instead of by symbol match, failing to find the child edge.
1819    let _guard = clear_field_context();
1820
1821    // Resolve `content` to a rule when it's a SYMBOL (the dominant
1822    // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
1823    if let Production::Symbol { name } = content {
1824        if let Some(rule) = grammar.rules.get(name) {
1825            let edges = children_for(schema, child_id);
1826            let mut cursor = ChildCursor::new(&edges);
1827            return emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
1828        }
1829    }
1830
1831    // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
1832    let edges = children_for(schema, child_id);
1833    let mut cursor = ChildCursor::new(&edges);
1834    emit_production(
1835        protocol,
1836        schema,
1837        grammar,
1838        child_id,
1839        content,
1840        &mut cursor,
1841        out,
1842    )
1843}
1844
1845fn emit_in_child_context(
1846    protocol: &str,
1847    schema: &Schema,
1848    grammar: &Grammar,
1849    child_id: &panproto_gat::Name,
1850    production: &Production,
1851    out: &mut Output<'_>,
1852) -> Result<(), ParseError> {
1853    // The child walks under its own production tree, with its own
1854    // FIELDs setting their own contexts. Clear the outer FIELD hint
1855    // so it does not leak through and cause sibling SYMBOLs inside
1856    // the child's body to mistakenly pull edges from the child's
1857    // cursor by the parent's field name.
1858    let _guard = clear_field_context();
1859    // If `production` is a structural wrapper (CHOICE / SEQ /
1860    // OPTIONAL / ...) whose referenced symbols cover the child's own
1861    // kind, the child IS the production's target node and the right
1862    // emit path is `emit_vertex(child)` (which honours the
1863    // literal-value leaf shortcut). Without this guard, FIELD(pattern,
1864    // CHOICE { _pattern, self }) on an identifier child walks the
1865    // CHOICE on the identifier's empty cursor, falls through to the
1866    // first non-BLANK alt, and loses the captured identifier text.
1867    if !matches!(production, Production::Symbol { .. }) {
1868        let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
1869        let symbols = referenced_symbols(production);
1870        if symbols
1871            .iter()
1872            .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
1873        {
1874            return emit_vertex(protocol, schema, grammar, child_id, out);
1875        }
1876    }
1877    match production {
1878        Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
1879        _ => {
1880            let edges = children_for(schema, child_id);
1881            let mut cursor = ChildCursor::new(&edges);
1882            emit_production(
1883                protocol,
1884                schema,
1885                grammar,
1886                child_id,
1887                production,
1888                &mut cursor,
1889                out,
1890            )
1891        }
1892    }
1893}
1894
1895fn pick_choice_with_cursor<'a>(
1896    schema: &Schema,
1897    grammar: &Grammar,
1898    vertex_id: &panproto_gat::Name,
1899    cursor: &ChildCursor<'_>,
1900    alternatives: &'a [Production],
1901) -> Option<&'a Production> {
1902    // Discriminator-driven dispatch (highest priority): when the
1903    // walker recorded a `chose-alt-fingerprint` constraint at parse
1904    // time, dispatch directly against that. This is the categorical
1905    // discriminator: it survives stripping of byte-position
1906    // constraints (so by-construction round-trips work) and is the
1907    // explicit witness of which CHOICE alternative the parser took.
1908    //
1909    // Falls back to the live `interstitial-*` substring blob when no
1910    // fingerprint is present (e.g. instances built by callers that
1911    // bypass the AstWalker). Both blobs are scored by the longest
1912    // STRING-literal token in an alternative that matches; the
1913    // length tiebreak prefers `&&` over `&`, `==` over `=`, etc.
1914    let constraint_blob = schema
1915        .constraints
1916        .get(vertex_id)
1917        .map(|cs| {
1918            let fingerprint: Option<&str> = cs
1919                .iter()
1920                .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
1921                .map(|c| c.value.as_str());
1922            if let Some(fp) = fingerprint {
1923                fp.to_owned()
1924            } else {
1925                cs.iter()
1926                    .filter(|c| {
1927                        let s = c.sort.as_ref();
1928                        s.starts_with("interstitial-") && !s.ends_with("-start-byte")
1929                    })
1930                    .map(|c| c.value.as_str())
1931                    .collect::<Vec<&str>>()
1932                    .join(" ")
1933            }
1934        })
1935        .unwrap_or_default();
1936    let child_kinds: Vec<&str> = schema
1937        .constraints
1938        .get(vertex_id)
1939        .and_then(|cs| {
1940            cs.iter()
1941                .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
1942                .map(|c| c.value.split_whitespace().collect())
1943        })
1944        .unwrap_or_default();
1945    // Cursor-exhaustion BLANK-preference: when all cursor edges have
1946    // been consumed AND `BLANK` is one of the alternatives, the only
1947    // alt that won't introduce a non-existent child is `BLANK`.
1948    //
1949    // This gate fires before the literal-blob discriminator because
1950    // the fingerprint is shared across every CHOICE position in the
1951    // vertex's rule body: a vertex like `sample_step` that ends in
1952    // `..., REPEAT(SEQ(",", arg)), CHOICE(",", BLANK)` records all of
1953    // its `","` interstitials in a single blob, so the literal-score
1954    // matcher would otherwise prefer `","` for the trailing CHOICE
1955    // even when the source had no trailing comma. By the time the
1956    // emitter reaches the trailing CHOICE, the REPEAT has consumed
1957    // every arg edge in cursor order; the residual unconsumed multiset
1958    // is empty; and the categorical reading of a CHOICE-with-BLANK at
1959    // a position with no remaining children is the no-op alternative.
1960    let any_unconsumed = cursor
1961        .edges
1962        .iter()
1963        .enumerate()
1964        .any(|(i, _)| !cursor.consumed[i]);
1965    let blank_present = alternatives.iter().any(|a| matches!(a, Production::Blank));
1966    if !any_unconsumed && blank_present {
1967        return alternatives.iter().find(|a| matches!(a, Production::Blank));
1968    }
1969    if !any_unconsumed && !blank_present {
1970        let mut visited = std::collections::HashSet::new();
1971        let mut yield_cache = grammar.yield_sets.clone();
1972        for alt in alternatives {
1973            let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
1974            if ys.contains("") {
1975                return Some(alt);
1976            }
1977            visited.clear();
1978        }
1979    }
1980
1981    if !constraint_blob.is_empty() {
1982        // Primary score: literal-token match length. This dominates
1983        // alt selection so existing language tests that depend on
1984        // literal-only fingerprints keep working.
1985        // Secondary score (tiebreaker only): named-symbol kind match
1986        // count, read from the separate `chose-alt-child-kinds`
1987        // constraint (kept apart from the literal fingerprint so
1988        // identifiers like `:` in the kind list don't contaminate the
1989        // literal match). An alt that matches the recorded kinds is a
1990        // stronger witness than one whose only
1991        // overlap is literal punctuation.
1992        let mut best_literal: usize = 0;
1993        let mut best_symbols: usize = 0;
1994        let mut best_alt: Option<&Production> = None;
1995        let mut tied = false;
1996        for alt in alternatives {
1997            let strings = literal_strings(alt);
1998            if strings.is_empty() {
1999                continue;
2000            }
2001            let literal_score = strings
2002                .iter()
2003                .filter(|s| constraint_blob.contains(s.as_str()))
2004                .map(String::len)
2005                .sum::<usize>();
2006            if literal_score == 0 {
2007                continue;
2008            }
2009            // Symbol score is computed only as a tiebreaker among alts
2010            // whose literal-token coverage is the same; it never lifts
2011            // an alt above one with a strictly higher literal score.
2012            // Reads the `chose-alt-child-kinds` constraint (a separate
2013            // sequence the walker emits, kept apart from the literal
2014            // fingerprint to avoid cross-contamination).
2015            let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
2016                let symbols = referenced_symbols(alt);
2017                symbols
2018                    .iter()
2019                    .filter(|sym| {
2020                        let sym_str: &str = sym;
2021                        if child_kinds.contains(&sym_str) {
2022                            return true;
2023                        }
2024                        grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
2025                            sub_set
2026                                .iter()
2027                                .any(|sub| child_kinds.contains(&sub.as_str()))
2028                        })
2029                    })
2030                    .count()
2031            } else {
2032                0
2033            };
2034            let better = literal_score > best_literal
2035                || (literal_score == best_literal && symbol_score > best_symbols);
2036            let same = literal_score == best_literal && symbol_score == best_symbols;
2037            if better {
2038                best_literal = literal_score;
2039                best_symbols = symbol_score;
2040                best_alt = Some(alt);
2041                tied = false;
2042            } else if same && best_alt.is_some() {
2043                tied = true;
2044            }
2045        }
2046        // Only commit to an alt when the fingerprint discriminates it
2047        // uniquely. A tie means the alts share the same literal token
2048        // set (e.g. JSON's `string = CHOICE { SEQ { '"', '"' }, SEQ {
2049        // '"', _string_content, '"' } }` — both alts contain just the
2050        // two `"` tokens). In that case fall through to cursor-based
2051        // dispatch, which uses the actual edge structure.
2052        if let Some(alt) = best_alt {
2053            if !tied {
2054                return Some(alt);
2055            }
2056        }
2057    }
2058
2059    // Cursor-driven dispatch via Yield-set preimage.
2060    //
2061    // For a CHOICE C = A1 | ... | An, Yield(Ai) is the set of vertex
2062    // kinds that can appear as the first named child when Ai is taken
2063    // (see `yield_of_production`). Given the first unconsumed cursor
2064    // edge with target kind K, select the first Ai (grammar order)
2065    // where K ∈ Yield(Ai). This is deterministic: grammar order is
2066    // the tiebreak, matching tree-sitter's own disambiguation.
2067    let first_unconsumed_kind: Option<&str> = cursor
2068        .edges
2069        .iter()
2070        .enumerate()
2071        .find(|(i, _)| !cursor.consumed[*i])
2072        .and_then(|(_, edge)| schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref()));
2073    if let Some(target_kind) = first_unconsumed_kind {
2074        // The subtype closure `subtypes[target_kind]` contains every
2075        // symbol name S such that a vertex of kind `target_kind` can
2076        // appear where the grammar says `SYMBOL S`. For a CHOICE
2077        // C = A1 | ... | An, the correct alternative is the one whose
2078        // top-level symbol is in `subtypes[target_kind]` (the target
2079        // kind IS a subtype of that symbol, so the symbol's rule body
2080        // dispatches to the target kind at parse time). This is an
2081        // O(1) set-membership check per alternative — no recursive
2082        // Yield computation needed.
2083        //
2084        // Preference order:
2085        //   1. Direct name match (target_kind == symbol name)
2086        //   2. Subtype match (symbol name ∈ subtypes[target_kind])
2087        //   3. Yield-set match (target_kind ∈ Yield(alt)) as fallback
2088        //      for non-SYMBOL alternatives (ALIAS, SEQ, etc.)
2089        let target_supers = grammar.subtypes.get(target_kind);
2090
2091        // Indented-form preference: when multiple alternatives match
2092        // the target kind (e.g. Python _suite where all three alts
2093        // produce `block`), prefer the alternative containing an
2094        // `_indent` SYMBOL. Check this BEFORE the standard passes
2095        // since they would pick the first match in grammar order.
2096        {
2097            let mut match_count = 0usize;
2098            let mut indent_alt_idx: Option<usize> = None;
2099            let mut visited = std::collections::HashSet::new();
2100            let mut yield_cache = grammar.yield_sets.clone();
2101            for (i, alt) in alternatives.iter().enumerate() {
2102                let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
2103                if ys.contains(target_kind) {
2104                    match_count += 1;
2105                    if indent_alt_idx.is_none()
2106                        && referenced_symbols(alt)
2107                            .iter()
2108                            .any(|s| *s == "_indent" || s.ends_with("_indent"))
2109                    {
2110                        indent_alt_idx = Some(i);
2111                    }
2112                }
2113                visited.clear();
2114            }
2115            if match_count > 1 {
2116                if let Some(idx) = indent_alt_idx {
2117                    return Some(&alternatives[idx]);
2118                }
2119            }
2120        }
2121
2122        // Pass 1: direct name match
2123        for alt in alternatives {
2124            if let Production::Symbol { name } = alt {
2125                if name.as_str() == target_kind {
2126                    return Some(alt);
2127                }
2128            }
2129            if let Production::Alias {
2130                named: true, value, ..
2131            } = alt
2132            {
2133                if value.as_str() == target_kind {
2134                    return Some(alt);
2135                }
2136            }
2137        }
2138
2139        // Pass 2: subtype match (the target kind's supertype set
2140        // tells us which SYMBOL names it satisfies)
2141        if let Some(supers) = target_supers {
2142            for alt in alternatives {
2143                if let Production::Symbol { name } = alt {
2144                    if supers.contains(name.as_str()) {
2145                        return Some(alt);
2146                    }
2147                }
2148                if let Production::Alias {
2149                    named: true, value, ..
2150                } = alt
2151                {
2152                    if supers.contains(value.as_str()) {
2153                        return Some(alt);
2154                    }
2155                }
2156            }
2157        }
2158
2159        // Pass 3: Yield-set fallback for alternatives that are not
2160        // plain SYMBOLs or named ALIASes (e.g. SEQ, PREC wrappers
2161        // around SYMBOLs that the above passes don't unwrap).
2162        let mut visited = std::collections::HashSet::new();
2163        let mut yield_cache = grammar.yield_sets.clone();
2164        for alt in alternatives {
2165            let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
2166            if ys.contains(target_kind) {
2167                return Some(alt);
2168            }
2169            visited.clear();
2170        }
2171    }
2172
2173    // FIELD dispatch: pick an alternative whose FIELD name matches an
2174    // unconsumed edge kind.
2175    let edge_kinds: Vec<&str> = cursor
2176        .edges
2177        .iter()
2178        .enumerate()
2179        .filter(|(i, _)| !cursor.consumed[*i])
2180        .map(|(_, e)| e.kind.as_ref())
2181        .collect();
2182    for alt in alternatives {
2183        if has_field_in(alt, &edge_kinds) {
2184            return Some(alt);
2185        }
2186    }
2187
2188    // No dispatch tier matched. The final selection follows the
2189    // categorical semantics of CHOICE-with-BLANK: BLANK represents ε
2190    // (produce nothing at this position). It is correct if and only
2191    // if no child remains to consume at this cursor position.
2192    //
2193    // When unconsumed non-extra children remain, selecting BLANK
2194    // would silently drop them. Select the first non-BLANK
2195    // alternative instead so the production walk can attempt to
2196    // consume them (the grammar rule may reference a symbol name
2197    // that doesn't exactly match the parse output's child kind,
2198    // e.g. Julia's macrocall_expression receives `argument_list`
2199    // children when grammar.json only references
2200    // `macro_argument_list`).
2201    let _ = (schema, vertex_id);
2202    if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
2203        return alternatives.iter().find(|a| matches!(a, Production::Blank));
2204    }
2205    alternatives
2206        .iter()
2207        .find(|alt| !matches!(alt, Production::Blank))
2208}
2209
2210/// Collect every literal STRING token directly inside `production`
2211/// (without descending into SYMBOLs / hidden rules). Used to score
2212/// CHOICE alternatives against the parent vertex's interstitials so
2213/// the right operator / keyword form is picked when the schema
2214/// preserves interstitial fragments from a prior parse.
2215fn literal_strings(production: &Production) -> Vec<String> {
2216    let mut out = Vec::new();
2217    fn walk(p: &Production, out: &mut Vec<String>) {
2218        match p {
2219            Production::String { value } if !value.is_empty() => {
2220                out.push(value.clone());
2221            }
2222            Production::Choice { members } | Production::Seq { members } => {
2223                for m in members {
2224                    walk(m, out);
2225                }
2226            }
2227            Production::Repeat { content }
2228            | Production::Repeat1 { content }
2229            | Production::Optional { content }
2230            | Production::Field { content, .. }
2231            | Production::Alias { content, .. }
2232            | Production::Token { content }
2233            | Production::ImmediateToken { content }
2234            | Production::Prec { content, .. }
2235            | Production::PrecLeft { content, .. }
2236            | Production::PrecRight { content, .. }
2237            | Production::PrecDynamic { content, .. }
2238            | Production::Reserved { content, .. } => walk(content, out),
2239            _ => {}
2240        }
2241    }
2242    walk(production, &mut out);
2243    out
2244}
2245
2246/// Collect every SYMBOL name reachable from `production` without
2247/// crossing into nested rules. Used by `pick_choice_with_cursor` to
2248/// rank alternatives by "any SYMBOL inside this alt matches something
2249/// on the cursor", instead of just the first SYMBOL: a leading
2250/// optional like `attribute_item` then `parameter` is otherwise
2251/// rejected when only the parameter children are present.
2252fn referenced_symbols(production: &Production) -> Vec<&str> {
2253    let mut out = Vec::new();
2254    fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
2255        match p {
2256            Production::Symbol { name } => out.push(name.as_str()),
2257            Production::Choice { members } | Production::Seq { members } => {
2258                for m in members {
2259                    walk(m, out);
2260                }
2261            }
2262            Production::Alias {
2263                content,
2264                named,
2265                value,
2266            } => {
2267                // A named ALIAS produces a child vertex whose kind is
2268                // the alias `value` (e.g. `ALIAS { content: STRING "=",
2269                // value: "punctuation", named: true }` introduces a
2270                // `punctuation` child). For cursor-driven dispatch to
2271                // recognise alts that emit such children, yield the
2272                // alias value as a referenced symbol. Anonymous aliases
2273                // do not introduce a named node and only need their
2274                // inner content's symbols.
2275                if *named && !value.is_empty() {
2276                    out.push(value.as_str());
2277                }
2278                walk(content, out);
2279            }
2280            Production::Repeat { content }
2281            | Production::Repeat1 { content }
2282            | Production::Optional { content }
2283            | Production::Field { content, .. }
2284            | Production::Token { content }
2285            | Production::ImmediateToken { content }
2286            | Production::Prec { content, .. }
2287            | Production::PrecLeft { content, .. }
2288            | Production::PrecRight { content, .. }
2289            | Production::PrecDynamic { content, .. }
2290            | Production::Reserved { content, .. } => walk(content, out),
2291            _ => {}
2292        }
2293    }
2294    walk(production, &mut out);
2295    out
2296}
2297
/// Test-only helper: the name of the first SYMBOL encountered in a
/// left-to-right, depth-first walk of `production`, or `None` when the
/// production references no symbol through its structural wrappers.
#[cfg(test)]
fn first_symbol(production: &Production) -> Option<&str> {
    // Members are pushed in reverse so the leftmost one is examined
    // first, matching a pre-order search.
    let mut pending: Vec<&Production> = vec![production];
    while let Some(node) = pending.pop() {
        match node {
            Production::Symbol { name } => return Some(name.as_str()),
            Production::Seq { members } | Production::Choice { members } => {
                pending.extend(members.iter().rev());
            }
            Production::Repeat { content }
            | Production::Repeat1 { content }
            | Production::Optional { content }
            | Production::Field { content, .. }
            | Production::Alias { content, .. }
            | Production::Token { content }
            | Production::ImmediateToken { content }
            | Production::Prec { content, .. }
            | Production::PrecLeft { content, .. }
            | Production::PrecRight { content, .. }
            | Production::PrecDynamic { content, .. }
            | Production::Reserved { content, .. } => pending.push(content),
            _ => {}
        }
    }
    None
}
2319
2320fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
2321    match production {
2322        Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
2323        Production::Seq { members } | Production::Choice { members } => {
2324            members.iter().any(|m| has_field_in(m, edge_kinds))
2325        }
2326        Production::Repeat { content }
2327        | Production::Repeat1 { content }
2328        | Production::Optional { content }
2329        | Production::Alias { content, .. }
2330        | Production::Token { content }
2331        | Production::ImmediateToken { content }
2332        | Production::Prec { content, .. }
2333        | Production::PrecLeft { content, .. }
2334        | Production::PrecRight { content, .. }
2335        | Production::PrecDynamic { content, .. }
2336        | Production::Reserved { content, .. } => has_field_in(content, edge_kinds),
2337        _ => false,
2338    }
2339}
2340
2341fn has_relevant_constraint(
2342    production: &Production,
2343    schema: &Schema,
2344    vertex_id: &panproto_gat::Name,
2345) -> bool {
2346    let constraints = match schema.constraints.get(vertex_id) {
2347        Some(c) => c,
2348        None => return false,
2349    };
2350    fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
2351        match production {
2352            Production::String { value } => constraints
2353                .iter()
2354                .any(|c| c.value == *value || c.sort.as_ref() == value),
2355            Production::Field { name, content } => {
2356                constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
2357            }
2358            Production::Seq { members } | Production::Choice { members } => {
2359                members.iter().any(|m| walk(m, constraints))
2360            }
2361            Production::Repeat { content }
2362            | Production::Repeat1 { content }
2363            | Production::Optional { content }
2364            | Production::Alias { content, .. }
2365            | Production::Token { content }
2366            | Production::ImmediateToken { content }
2367            | Production::Prec { content, .. }
2368            | Production::PrecLeft { content, .. }
2369            | Production::PrecRight { content, .. }
2370            | Production::PrecDynamic { content, .. }
2371            | Production::Reserved { content, .. } => walk(content, constraints),
2372            _ => false,
2373        }
2374    }
2375    walk(production, constraints)
2376}
2377
2378fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
2379    // Walk `outgoing` (insertion-ordered by SchemaBuilder via SmallVec
2380    // append) rather than the unordered `edges` HashMap so abstract
2381    // schemas under REPEAT(CHOICE(...)) preserve the order their edges
2382    // were inserted in. The previous implementation walked the HashMap
2383    // and sorted lexicographically by (kind, target id), which fused
2384    // interleaved children of the same kind into runs (e.g. a sequence
2385    // [symbol, punct, int, symbol, punct, int] became [symbol, symbol,
2386    // punct, punct, int, int] after the lex sort).
2387    let Some(edges) = schema.outgoing.get(vertex_id) else {
2388        return Vec::new();
2389    };
2390
2391    // Look up the canonical Edge reference (the key in `schema.edges`)
2392    // for each entry in `outgoing`. Falls back to the SmallVec entry if
2393    // the canonical key is missing, which would indicate index drift.
2394    let mut indexed: Vec<(usize, u32, &Edge)> = edges
2395        .iter()
2396        .enumerate()
2397        .map(|(i, e)| {
2398            let canonical = schema.edges.get_key_value(e).map_or(e, |(k, _)| k);
2399            let pos = schema.orderings.get(canonical).copied().unwrap_or(u32::MAX);
2400            (i, pos, canonical)
2401        })
2402        .collect();
2403
2404    // Stable sort by (explicit-ordering, insertion-index). Edges with
2405    // an explicit `orderings` entry come first in their declared order;
2406    // the remainder fall through in insertion order.
2407    indexed.sort_by_key(|(i, pos, _)| (*pos, *i));
2408    indexed.into_iter().map(|(_, _, e)| e).collect()
2409}
2410
2411fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
2412    schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
2413}
2414
2415fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
2416    schema
2417        .constraints
2418        .get(vertex_id)?
2419        .iter()
2420        .find(|c| c.sort.as_ref() == "literal-value")
2421        .map(|c| c.value.as_str())
2422}
2423
/// True iff `pattern` is built solely from the escapes `\n` / `\r` and
/// the quantifier characters `?`, `*`, `+`, and contains at least one
/// such escape. Examples: `\r?\n`, `\n`, `\r\n`, `\n+`, `\r?\n+`.
///
/// This separates structural newline terminals from generic
/// whitespace patterns and from larger patterns that merely contain a
/// newline escape (for instance inside a character class).
fn is_newline_like_pattern(pattern: &str) -> bool {
    let mut rest = pattern;
    let mut saw_newline_atom = false;
    while !rest.is_empty() {
        let after_atom = rest
            .strip_prefix("\\n")
            .or_else(|| rest.strip_prefix("\\r"));
        let after_quantifier = rest.strip_prefix(['?', '*', '+']);
        rest = match (after_atom, after_quantifier) {
            (Some(tail), _) => {
                saw_newline_atom = true;
                tail
            }
            // A quantifier applies to the previous atom and adds nothing.
            (None, Some(tail)) => tail,
            // Any other character or escape disqualifies the pattern.
            (None, None) => return false,
        };
    }
    saw_newline_atom
}
2447
/// True iff `pattern` is a (possibly quantified) generic-whitespace
/// matcher: a bare `\s`, space, or `\t`, or a character class whose
/// members are all whitespace atoms. Examples: `\s+`, `[ \t]+`, ` +`,
/// `\s*`.
///
/// Such patterns describe interstitial spacing rather than syntactic
/// content, so the pretty emitter can drop them and let the layout
/// pass insert the configured separator.
fn is_whitespace_only_pattern(pattern: &str) -> bool {
    // Atoms permitted inside a character class: the escapes `\s`, `\t`,
    // `\r`, `\n`, a literal space, and a literal tab.
    const CLASS_ATOMS: [&str; 6] = ["\\s", "\\t", "\\r", "\\n", " ", "\t"];

    // Drop any trailing quantifier characters. An empty remainder
    // (empty or quantifier-only pattern) matches nothing below.
    let core = pattern.trim_end_matches(['?', '*', '+']);
    if matches!(core, "\\s" | " " | "\\t") {
        return true;
    }

    let Some(class_body) = core
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
    else {
        return false;
    };

    let mut remaining = class_body;
    while !remaining.is_empty() {
        let stripped = CLASS_ATOMS
            .iter()
            .find_map(|atom| remaining.strip_prefix(*atom));
        let Some(tail) = stripped else {
            return false;
        };
        remaining = tail;
    }
    // An empty class `[]` contains no whitespace atom.
    !class_body.is_empty()
}
2484
2485fn placeholder_for_pattern(pattern: &str) -> String {
2486    // Heuristic placeholder for unconstrained PATTERN terminals.
2487    //
2488    // First handle the "the regex IS a literal escape" cases that
2489    // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
2490    // etc.); emitting the matching character is always preferable
2491    // to a `_x` identifier-like placeholder when the surrounding
2492    // grammar expects a separator.
2493    let simple_lit = decode_simple_pattern_literal(pattern);
2494    if let Some(lit) = simple_lit {
2495        return lit;
2496    }
2497
2498    if pattern.contains("[0-9]") || pattern.contains("\\d") {
2499        "0".into()
2500    } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
2501        "_x".into()
2502    } else if pattern.contains('"') || pattern.contains('\'') {
2503        "\"\"".into()
2504    } else {
2505        "_".into()
2506    }
2507}
2508
/// Decode a tree-sitter PATTERN whose regex is a simple literal
/// (newline, semicolon, comma, etc.) to the text it matches.
///
/// Returns `None` when the pattern is not a plain literal, so the
/// caller can fall back to the heuristic placeholder:
///
/// - it contains a bracket, group, alternation, or quantifier
///   character (`[ ] ( ) * + ? | { }`), escaped or not;
/// - it contains an alphanumeric escape other than `\n`, `\r`, `\t`.
///   Escapes such as `\d`, `\w`, `\s`, `\b`, `\p`, `\x`, `\u` are
///   regex classes, anchors, or code-point escapes, not the literal
///   letter that follows the backslash;
/// - it ends in a dangling backslash.
///
/// Escaped punctuation (`\/`, `\.`, `\\`, ...) decodes to the
/// punctuation character itself.
fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
    // Characters that broaden the match beyond one literal sequence.
    const BROADENING: [char; 10] = ['[', ']', '(', ')', '*', '+', '?', '|', '{', '}'];
    if pattern.contains(BROADENING) {
        return None;
    }

    let mut out = String::with_capacity(pattern.len());
    let mut chars = pattern.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }
        // `?` rejects a trailing backslash with nothing to escape.
        match chars.next()? {
            'n' => out.push('\n'),
            'r' => out.push('\r'),
            't' => out.push('\t'),
            // Any other letter or digit after a backslash is a regex
            // construct, never the literal character.
            other if other.is_ascii_alphanumeric() => return None,
            other => out.push(other),
        }
    }
    Some(out)
}
2541
2542// ═══════════════════════════════════════════════════════════════════
2543// Token list output with Spacing algebra
2544// ═══════════════════════════════════════════════════════════════════
2545//
2546// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
2547// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
2548// by `FormatPolicy`. Separating the structural output from the layout
2549// decision means each phase has one job: emit walks the grammar and
2550// pushes tokens; layout is a single fold, locally driven by adjacent
2551// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
2552
/// One element of the emitter's structural output. `Output` collects
/// these while the grammar is walked; `layout` later folds the list
/// into bytes, turning the control variants into newlines, indentation,
/// and separator decisions.
#[derive(Clone)]
enum Token {
    /// A user-visible terminal contributed by the grammar.
    Lit(String),
    /// `indent_open` marker emitted when a `Lit` matched the policy's
    /// open list. Carried as a separate token so layout can decide to
    /// break + indent without re-scanning. Layout raises the indent
    /// depth by one when it sees this.
    IndentOpen,
    /// `indent_close` marker emitted before a closer-`Lit`. Layout
    /// lowers the indent depth by one (never below zero) and ends the
    /// current line if one is in progress.
    IndentClose,
    /// "Break a line here if not already at line start" — used after
    /// statements/declarations and after open braces.
    LineBreak,
    /// Suppress the next inter-Lit separator. Pushed by the REPEAT
    /// walker when an iteration's "separator slot" (a CHOICE-with-BLANK
    /// or OPTIONAL at SEQ position 0) emitted zero content tokens, so
    /// the categorical reading is "no source-level separator existed
    /// between these two sibling iterations of the body". The marker is
    /// cleared by the next `Lit` that layout processes.
    NoSpace,
}
2573
/// Token accumulator for one emission run. The grammar walker pushes
/// tokens through the methods on this type; `finish` hands the list to
/// `layout` to produce the final bytes.
struct Output<'a> {
    /// Tokens pushed so far, in emission order.
    tokens: Vec<Token>,
    /// Formatting policy consulted when classifying literals in
    /// `token` and passed to `layout` by `finish`.
    policy: &'a FormatPolicy,
    /// When true, `token` pushes no `IndentOpen` / `IndentClose`
    /// markers for policy open/close literals and skips the
    /// `line_break_after` break for `{` and `}`. `new` starts it as
    /// `false`.
    suppress_brace_indent: bool,
}
2579
/// Rollback point for an `Output`, produced by `Output::snapshot` and
/// consumed by `Output::restore` / `Output::lit_emitted_since`.
#[derive(Clone)]
struct OutputSnapshot {
    /// Number of tokens the output held when the snapshot was taken;
    /// `restore` truncates the token list back to this length.
    tokens_len: usize,
}
2584
2585impl<'a> Output<'a> {
2586    fn new(policy: &'a FormatPolicy) -> Self {
2587        Self {
2588            tokens: Vec::new(),
2589            policy,
2590            suppress_brace_indent: false,
2591        }
2592    }
2593
2594    fn token(&mut self, value: &str) {
2595        if value.is_empty() {
2596            return;
2597        }
2598
2599        // A grammar STRING whose value is a newline (e.g. abc's `_NL = "\n"`
2600        // or any rule that uses `"\n"` as a structural line terminator)
2601        // must route through the layout's `LineBreak` channel. Emitting it
2602        // as a `Lit` leaves the newline character in the byte stream but
2603        // also makes `needs_space_between` insert the configured separator
2604        // between the newline and the following token, producing leading
2605        // spaces on every line after the first.
2606        if value == "\n" || value == "\r\n" || value == "\r" {
2607            self.tokens.push(Token::LineBreak);
2608            return;
2609        }
2610
2611        // A captured literal value (typically a vertex's `literal-value`
2612        // constraint covering the full source span of a terminal-like
2613        // rule, e.g. abc's `reference_number_line` matching `"X:1\n"`)
2614        // may contain trailing newlines. Splitting the trailing newlines
2615        // off as a `LineBreak` lets the layout pass treat the next Lit
2616        // as starting a new line; otherwise the next Lit pair would
2617        // trigger `needs_space_between` against the embedded `\n` and
2618        // insert the policy separator at column 0 of the new line.
2619        let trimmed = value.trim_end_matches(['\n', '\r']);
2620        let trailing_newlines = value.len() - trimmed.len();
2621        if trailing_newlines > 0 && !trimmed.is_empty() {
2622            if !self.suppress_brace_indent && self.policy.indent_close.iter().any(|t| t == trimmed)
2623            {
2624                self.tokens.push(Token::IndentClose);
2625            }
2626            self.tokens.push(Token::Lit(trimmed.to_owned()));
2627            if !self.suppress_brace_indent && self.policy.indent_open.iter().any(|t| t == trimmed) {
2628                self.tokens.push(Token::IndentOpen);
2629            } else if self.policy.line_break_after.iter().any(|t| t == trimmed) {
2630                // already emitting a LineBreak below for the trailing \n
2631            }
2632            self.tokens.push(Token::LineBreak);
2633            return;
2634        }
2635
2636        if !self.suppress_brace_indent && self.policy.indent_close.iter().any(|t| t == value) {
2637            self.tokens.push(Token::IndentClose);
2638        }
2639
2640        self.tokens.push(Token::Lit(value.to_owned()));
2641
2642        if !self.suppress_brace_indent && self.policy.indent_open.iter().any(|t| t == value) {
2643            self.tokens.push(Token::IndentOpen);
2644            self.tokens.push(Token::LineBreak);
2645        } else if self.policy.line_break_after.iter().any(|t| t == value)
2646            && !(self.suppress_brace_indent && (value == "{" || value == "}"))
2647        {
2648            self.tokens.push(Token::LineBreak);
2649        }
2650    }
2651
2652    fn newline(&mut self) {
2653        self.tokens.push(Token::LineBreak);
2654    }
2655
2656    /// Open an indent scope: subsequent `LineBreak`s render at the
2657    /// new depth until a matching `indent_close` pops it. Used by the
2658    /// external-token fallback to render indent-based grammars'
2659    /// `_indent` scanner outputs.
2660    fn indent_open(&mut self) {
2661        self.tokens.push(Token::IndentOpen);
2662        self.tokens.push(Token::LineBreak);
2663    }
2664
2665    /// Close one indent scope opened by `indent_open`.
2666    fn indent_close(&mut self) {
2667        self.tokens.push(Token::IndentClose);
2668    }
2669
2670    fn snapshot(&self) -> OutputSnapshot {
2671        OutputSnapshot {
2672            tokens_len: self.tokens.len(),
2673        }
2674    }
2675
2676    fn restore(&mut self, snap: OutputSnapshot) {
2677        self.tokens.truncate(snap.tokens_len);
2678    }
2679
2680    /// True iff at least one `Token::Lit` was pushed since `snap`.
2681    /// Control-only emissions (`LineBreak`, `IndentOpen` / `IndentClose`,
2682    /// `NoSpace`) do not count as content. Used by the REPEAT walker
2683    /// to detect that a "separator slot" CHOICE picked its BLANK
2684    /// alternative, so the next iteration's content can be marked
2685    /// tight against the previous iteration's content.
2686    fn lit_emitted_since(&self, snap: OutputSnapshot) -> bool {
2687        self.tokens[snap.tokens_len..]
2688            .iter()
2689            .any(|t| matches!(t, Token::Lit(_)))
2690    }
2691
2692    /// Push a marker that suppresses the next inter-Lit separator the
2693    /// layout pass would otherwise insert. Used to encode "no source-
2694    /// level separator was emitted between these two Lits" without
2695    /// having to make per-grammar adjacency decisions in the layout.
2696    fn no_space(&mut self) {
2697        self.tokens.push(Token::NoSpace);
2698    }
2699
2700    fn finish(self) -> Vec<u8> {
2701        layout(&self.tokens, self.policy)
2702    }
2703}
2704
2705/// Fold a token list into bytes. The algebra:
2706/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
2707/// * `IndentOpen` / `IndentClose` adjust a depth counter,
2708/// * `LineBreak` writes `\n` if not already at line start, then the
2709///   next `Lit` writes `indent * indent_width` spaces of indent.
2710fn layout(tokens: &[Token], policy: &FormatPolicy) -> Vec<u8> {
2711    let mut bytes = Vec::new();
2712    let mut indent: usize = 0;
2713    let mut at_line_start = true;
2714    let mut last_lit: Option<&str> = None;
2715    // True iff, at the moment `last_lit` was emitted, the cursor was at a
2716    // position where the grammar expects an operand: start of stream / line,
2717    // just after an open paren / bracket / brace, just after a separator like
2718    // `,` or `;`, or just after a binary / assignment operator. Used by
2719    // `needs_space_between` to recognise `last_lit` as a tight unary prefix
2720    // (`f(-1.0)`) rather than a spaced binary operator (`a - b`).
2721    let mut last_was_in_operand_position = true;
2722    let mut expecting_operand = true;
2723    // Set when a `Token::NoSpace` marker is seen; cleared when the next
2724    // Lit consumes it. While set, suppress the policy separator that
2725    // would otherwise be inserted before the next Lit.
2726    let mut suppress_next_separator = false;
2727    let newline = policy.newline.as_bytes();
2728    let separator = policy.separator.as_bytes();
2729
2730    for tok in tokens {
2731        match tok {
2732            Token::IndentOpen => indent += 1,
2733            Token::IndentClose => {
2734                indent = indent.saturating_sub(1);
2735                if !at_line_start {
2736                    bytes.extend_from_slice(newline);
2737                    at_line_start = true;
2738                    expecting_operand = true;
2739                }
2740            }
2741            Token::LineBreak => {
2742                if !at_line_start {
2743                    bytes.extend_from_slice(newline);
2744                    at_line_start = true;
2745                    expecting_operand = true;
2746                }
2747            }
2748            Token::NoSpace => {
2749                suppress_next_separator = true;
2750            }
2751            Token::Lit(value) => {
2752                if at_line_start {
2753                    bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
2754                } else if let Some(prev) = last_lit {
2755                    if !suppress_next_separator
2756                        && needs_space_between(prev, value, last_was_in_operand_position)
2757                    {
2758                        bytes.extend_from_slice(separator);
2759                    }
2760                }
2761                suppress_next_separator = false;
2762                bytes.extend_from_slice(value.as_bytes());
2763                at_line_start = false;
2764                last_was_in_operand_position = expecting_operand;
2765                expecting_operand = leaves_operand_position(value);
2766                last_lit = Some(value.as_str());
2767                // Line comments consume text to end-of-line but the
2768                // newline terminator is not part of their literal
2769                // value. Force a line break after any Lit that starts
2770                // with a line-comment prefix so subsequent tokens
2771                // don't appear on the comment line.
2772                if value.starts_with("//") || value.starts_with('#') {
2773                    bytes.extend_from_slice(newline);
2774                    at_line_start = true;
2775                    expecting_operand = true;
2776                }
2777            }
2778        }
2779    }
2780
2781    if !at_line_start {
2782        bytes.extend_from_slice(newline);
2783    }
2784    bytes
2785}
2786
2787/// True iff emitting `tok` leaves the cursor in a position where the
2788/// grammar expects an operand next. Operand-introducing tokens are open
2789/// punctuation, separators, and operator-like strings; operand-terminating
2790/// tokens are identifiers, literals, and closing punctuation.
2791fn leaves_operand_position(tok: &str) -> bool {
2792    if tok.is_empty() {
2793        return true;
2794    }
2795    if is_punct_open(tok) {
2796        return true;
2797    }
2798    if matches!(tok, "," | ";") {
2799        return true;
2800    }
2801    if is_punct_close(tok) {
2802        return false;
2803    }
2804    if first_is_alnum_or_underscore(tok) || last_ends_with_alnum(tok) {
2805        return false;
2806    }
2807    // Pure punctuation/operator runs (`=`, `+`, `-`, `<=`, `>>`, …) leave
2808    // the cursor expecting another operand.
2809    true
2810}
2811
2812fn needs_space_between(last: &str, next: &str, expecting_operand: bool) -> bool {
2813    if last.is_empty() || next.is_empty() {
2814        return false;
2815    }
2816    if is_punct_open(last) || is_punct_open(next) {
2817        return false;
2818    }
2819    if is_punct_close(next) {
2820        return false;
2821    }
2822    if is_punct_close(last) && is_punct_punctuation(next) {
2823        return false;
2824    }
2825    if last == "." || next == "." {
2826        return false;
2827    }
2828    // Tight unary prefix: `last` is a sign/logical-not operator emitted
2829    // where the grammar expected an operand, so it glues to `next`.
2830    // `expecting_operand` here means: just before `last` was emitted,
2831    // the cursor expected an operand, which makes `last` a unary prefix.
2832    // Examples: `f(-1.0)`, `[ -2, 3 ]`, `return -x`, `a = !flag`.
2833    if expecting_operand && is_unary_prefix_operator(last) && first_is_operand_start(next) {
2834        return false;
2835    }
2836    if last_is_word_like(last) && first_is_word_like(next) {
2837        return true;
2838    }
2839    if last_ends_with_alnum(last) && first_is_alnum_or_underscore(next) {
2840        return true;
2841    }
2842    // Adjacent operator runs: keep them apart so the lexer doesn't glue
2843    // `>` and `=` into `>=` unintentionally.
2844    true
2845}
2846
/// True iff `s` is exactly one of the operators the layout treats as a
/// possible unary prefix: `-`, `+`, `!`, `~`.
fn is_unary_prefix_operator(s: &str) -> bool {
    const UNARY_PREFIXES: [&str; 4] = ["-", "+", "!", "~"];
    UNARY_PREFIXES.contains(&s)
}
2850
/// True iff the first character of `s` can begin an operand: an
/// alphanumeric character, `_`, `.`, or `(`. False for an empty string.
fn first_is_operand_start(s: &str) -> bool {
    s.starts_with(|c: char| c.is_alphanumeric() || matches!(c, '_' | '.' | '('))
}
2857
/// True iff `s` is opening punctuation for spacing purposes: one of
/// the listed single-character openers (brackets, quote characters,
/// `@`, `#`), or any token ending in `{`, `(`, or `[`.
fn is_punct_open(s: &str) -> bool {
    const OPENERS: [&str; 8] = ["(", "[", "{", "\"", "'", "`", "@", "#"];
    OPENERS.contains(&s) || s.ends_with(['{', '(', '['])
}
2864
/// True iff `s` is exactly one of the tokens that attach to the text
/// on their left: closing brackets, `,`, `;`, `:`, or a quote
/// character.
fn is_punct_close(s: &str) -> bool {
    const CLOSERS: [&str; 9] = [")", "]", "}", ",", ";", ":", "\"", "'", "`"];
    CLOSERS.contains(&s)
}
2868
/// Reports whether `s` is exactly one of the tokens `,`, `;`, `:`, `.`,
/// `)`, `]` or `}`.
fn is_punct_punctuation(s: &str) -> bool {
    const TOKENS: [&str; 7] = [",", ";", ":", ".", ")", "]", "}"];
    TOKENS.iter().any(|candidate| *candidate == s)
}
2872
/// Reports whether the final character of `s` is alphanumeric or `_`.
/// An empty string yields `false`.
fn last_is_word_like(s: &str) -> bool {
    s.ends_with(|tail: char| tail == '_' || tail.is_alphanumeric())
}
2879
/// Reports whether the leading character of `s` is alphanumeric or `_`.
/// An empty string yields `false`.
fn first_is_word_like(s: &str) -> bool {
    s.starts_with(|head: char| head == '_' || head.is_alphanumeric())
}
2886
/// Reports whether the final character of `s` is alphanumeric. Unlike
/// the word-like check, a trailing `_` does not count. An empty string
/// yields `false`.
fn last_ends_with_alnum(s: &str) -> bool {
    match s.chars().next_back() {
        Some(tail) => tail.is_alphanumeric(),
        None => false,
    }
}
2893
/// Reports whether the leading character of `s` is alphanumeric or `_`.
/// An empty string yields `false`.
fn first_is_alnum_or_underscore(s: &str) -> bool {
    let Some(head) = s.chars().next() else {
        return false;
    };
    matches!(head, '_') || head.is_alphanumeric()
}
2900
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// A minimal two-token SEQ grammar loads and exposes its rule by name.
    #[test]
    fn parses_simple_grammar_json() {
        let bytes = br#"{
            "name": "tiny",
            "rules": {
                "program": {
                    "type": "SEQ",
                    "members": [
                        {"type": "STRING", "value": "hello"},
                        {"type": "STRING", "value": ";"}
                    ]
                }
            }
        }"#;
        let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
        assert!(g.rules.contains_key("program"));
    }

    /// Brackets and the terminating semicolon attach to the preceding
    /// token without any separating space.
    #[test]
    fn output_emits_punctuation_without_leading_space() {
        let policy = FormatPolicy::default();
        let mut out = Output::new(&policy);
        out.token("foo");
        out.token("(");
        out.token(")");
        out.token(";");
        let bytes = out.finish();
        let s = std::str::from_utf8(&bytes).expect("ascii output");
        assert!(s.starts_with("foo();"), "got {s:?}");
    }

    /// Non-JSON input is rejected, and the error text carries the
    /// protocol name passed to `from_bytes`.
    #[test]
    fn grammar_from_bytes_rejects_malformed_input() {
        let result = Grammar::from_bytes("malformed", b"not json");
        let err = result.expect_err("malformed bytes must yield Err");
        let msg = err.to_string();
        assert!(
            msg.contains("malformed"),
            "error message should name the protocol: {msg:?}"
        );
    }

    /// With the default policy, `{` and `}` are each followed by a newline.
    #[test]
    fn output_indents_after_open_brace() {
        let policy = FormatPolicy::default();
        let mut out = Output::new(&policy);
        out.token("fn");
        out.token("foo");
        out.token("(");
        out.token(")");
        out.token("{");
        out.token("body");
        out.token("}");
        let bytes = out.finish();
        let s = std::str::from_utf8(&bytes).expect("ascii output");
        assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
        assert!(s.contains("body"), "body inside block: {s:?}");
        assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
    }

    /// A `.` token binds tightly to the words on both sides.
    #[test]
    fn output_no_space_between_word_and_dot() {
        let policy = FormatPolicy::default();
        let mut out = Output::new(&policy);
        out.token("foo");
        out.token(".");
        out.token("bar");
        let bytes = out.finish();
        let s = std::str::from_utf8(&bytes).expect("ascii output");
        assert!(s.starts_with("foo.bar"), "no space around dot: {s:?}");
    }

    /// Restoring a snapshot discards every token emitted after it was
    /// taken, and emission can continue afterwards.
    #[test]
    fn output_snapshot_restore_truncates_bytes() {
        let policy = FormatPolicy::default();
        let mut out = Output::new(&policy);
        out.token("keep");
        let snap = out.snapshot();
        out.token("drop");
        out.token("more");
        out.restore(snap);
        out.token("after");
        let bytes = out.finish();
        let s = std::str::from_utf8(&bytes).expect("ascii output");
        assert!(s.contains("keep"), "kept token survives: {s:?}");
        assert!(s.contains("after"), "post-restore token visible: {s:?}");
        assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
        assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
    }

    /// `take_field` hands out a given edge at most once.
    #[test]
    fn child_cursor_take_field_consumes_once() {
        let edges_owned: Vec<Edge> = vec![Edge {
            src: panproto_gat::Name::from("p"),
            tgt: panproto_gat::Name::from("c"),
            kind: panproto_gat::Name::from("name"),
            name: None,
        }];
        let edges: Vec<&Edge> = edges_owned.iter().collect();
        let mut cursor = ChildCursor::new(&edges);
        let first = cursor.take_field("name");
        let second = cursor.take_field("name");
        assert!(first.is_some(), "first take returns the edge");
        assert!(
            second.is_none(),
            "second take returns None (already consumed)"
        );
    }

    /// `take_matching` consumes only the edge the predicate selects;
    /// other edges stay visible to `has_matching`.
    #[test]
    fn child_cursor_take_matching_predicate() {
        let edges_owned: Vec<Edge> = vec![
            Edge {
                src: "p".into(),
                tgt: "c1".into(),
                kind: "child_of".into(),
                name: None,
            },
            Edge {
                src: "p".into(),
                tgt: "c2".into(),
                kind: "key".into(),
                name: None,
            },
        ];
        let edges: Vec<&Edge> = edges_owned.iter().collect();
        let mut cursor = ChildCursor::new(&edges);
        assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
        let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
        assert!(taken.is_some());
        assert!(
            !cursor.has_matching(|e| e.kind.as_ref() == "key"),
            "consumed edge no longer matches"
        );
        assert!(
            cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
            "the other edge is still available"
        );
    }

    /// A kind satisfies the symbol of the same name; a different kind
    /// or a missing kind does not.
    #[test]
    fn kind_satisfies_symbol_direct_match() {
        let bytes = br#"{
            "name": "tiny",
            "rules": {
                "x": {"type": "STRING", "value": "x"}
            }
        }"#;
        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
        assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
        assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
        assert!(!kind_satisfies_symbol(&g, None, "x"));
    }

    /// Kinds listed as CHOICE alternatives of a hidden (`_`-prefixed)
    /// rule satisfy that rule's symbol.
    #[test]
    fn kind_satisfies_symbol_through_hidden_rule() {
        let bytes = br#"{
            "name": "tiny",
            "rules": {
                "_value": {
                    "type": "CHOICE",
                    "members": [
                        {"type": "SYMBOL", "name": "object"},
                        {"type": "SYMBOL", "name": "number"}
                    ]
                },
                "object": {"type": "STRING", "value": "{}"},
                "number": {"type": "PATTERN", "value": "[0-9]+"}
            }
        }"#;
        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
        assert!(
            kind_satisfies_symbol(&g, Some("number"), "_value"),
            "number is reachable from _value via CHOICE"
        );
        assert!(
            kind_satisfies_symbol(&g, Some("object"), "_value"),
            "object is reachable from _value via CHOICE"
        );
        assert!(
            !kind_satisfies_symbol(&g, Some("string"), "_value"),
            "string is NOT among the alternatives"
        );
    }

    /// `first_symbol` looks past leading STRING terminals in a SEQ.
    #[test]
    fn first_symbol_skips_string_terminals() {
        let prod: Production = serde_json::from_str(
            r#"{
                "type": "SEQ",
                "members": [
                    {"type": "STRING", "value": "{"},
                    {"type": "SYMBOL", "name": "body"},
                    {"type": "STRING", "value": "}"}
                ]
            }"#,
        )
        .expect("valid SEQ");
        assert_eq!(first_symbol(&prod), Some("body"));
    }

    /// Numeric, identifier and string-literal regexes each map to a
    /// placeholder of the matching shape.
    #[test]
    fn placeholder_for_pattern_routes_by_regex_class() {
        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
        assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
        assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
        assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
    }

    /// Pins the default policy: break after `;`, indent on `{`,
    /// dedent on `}`, two-column indent width.
    #[test]
    fn format_policy_default_breaks_after_semicolon() {
        let policy = FormatPolicy::default();
        assert!(policy.line_break_after.iter().any(|t| t == ";"));
        assert!(policy.indent_open.iter().any(|t| t == "{"));
        assert!(policy.indent_close.iter().any(|t| t == "}"));
        assert_eq!(policy.indent_width, 2);
    }

    #[test]
    fn placeholder_decodes_literal_pattern_separators() {
        // PATTERN regexes that match a single literal byte sequence
        // (newline, semicolon, comma) emit the bytes verbatim instead
        // of falling through to the `_` catch-all.
        assert_eq!(placeholder_for_pattern("\\n"), "\n");
        assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
        assert_eq!(placeholder_for_pattern(";"), ";");
        // Patterns with character classes / alternation still route
        // through the heuristic.
        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
        assert_eq!(placeholder_for_pattern("a|b"), "_");
    }

    #[test]
    fn supertypes_decode_from_grammar_json_strings() {
        // Tree-sitter older grammars list supertypes as bare strings.
        let bytes = br#"{
            "name": "tiny",
            "supertypes": ["expression"],
            "rules": {
                "expression": {
                    "type": "CHOICE",
                    "members": [
                        {"type": "SYMBOL", "name": "binary_expression"},
                        {"type": "SYMBOL", "name": "identifier"}
                    ]
                },
                "binary_expression": {"type": "STRING", "value": "x"},
                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
            }
        }"#;
        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
        assert!(g.supertypes.contains("expression"));
        // identifier matches the supertype `expression`.
        assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
        // unrelated kinds do not.
        assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
    }

    #[test]
    fn supertypes_decode_from_grammar_json_objects() {
        // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
        // entries instead of bare strings.
        let bytes = br#"{
            "name": "tiny",
            "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
            "rules": {
                "stmt": {
                    "type": "CHOICE",
                    "members": [
                        {"type": "SYMBOL", "name": "while_stmt"},
                        {"type": "SYMBOL", "name": "if_stmt"}
                    ]
                },
                "while_stmt": {"type": "STRING", "value": "while"},
                "if_stmt": {"type": "STRING", "value": "if"}
            }
        }"#;
        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
        assert!(g.supertypes.contains("stmt"));
        assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
    }

    #[test]
    fn alias_value_matches_kind() {
        // A named ALIAS rewrites the parser-visible kind to `value`;
        // `kind_satisfies_symbol` should accept that rewritten kind
        // when looking up the original SYMBOL.
        let bytes = br#"{
            "name": "tiny",
            "rules": {
                "_package_identifier": {
                    "type": "ALIAS",
                    "named": true,
                    "value": "package_identifier",
                    "content": {"type": "SYMBOL", "name": "identifier"}
                },
                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
            }
        }"#;
        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
        assert!(kind_satisfies_symbol(
            &g,
            Some("package_identifier"),
            "_package_identifier"
        ));
    }

    /// `referenced_symbols` finds symbols nested inside CHOICE and
    /// REPEAT members of a SEQ.
    #[test]
    fn referenced_symbols_walks_nested_seq() {
        let prod: Production = serde_json::from_str(
            r#"{
                "type": "SEQ",
                "members": [
                    {"type": "CHOICE", "members": [
                        {"type": "SYMBOL", "name": "attribute_item"},
                        {"type": "BLANK"}
                    ]},
                    {"type": "SYMBOL", "name": "parameter"},
                    {"type": "REPEAT", "content": {
                        "type": "SEQ",
                        "members": [
                            {"type": "STRING", "value": ","},
                            {"type": "SYMBOL", "name": "parameter"}
                        ]
                    }}
                ]
            }"#,
        )
        .expect("seq");
        let symbols = referenced_symbols(&prod);
        assert!(symbols.contains(&"attribute_item"));
        assert!(symbols.contains(&"parameter"));
    }

    /// `literal_strings` returns the STRING alternatives of a CHOICE in
    /// declaration order.
    #[test]
    fn literal_strings_collects_choice_members() {
        let prod: Production = serde_json::from_str(
            r#"{
                "type": "CHOICE",
                "members": [
                    {"type": "STRING", "value": "+"},
                    {"type": "STRING", "value": "-"},
                    {"type": "STRING", "value": "*"}
                ]
            }"#,
        )
        .expect("choice");
        let strings = literal_strings(&prod);
        assert_eq!(strings, vec!["+", "-", "*"]);
    }

    /// The ocaml and javascript grammars (tree-sitter ≥ 0.25) emit a
    /// `RESERVED` rule kind that an earlier deserialiser rejected
    /// with `unknown variant "RESERVED"`. Verify that the bare variant
    /// deserialises and keeps its inner `SYMBOL` content.
    #[test]
    fn reserved_variant_deserialises() {
        let prod: Production = serde_json::from_str(
            r#"{
                "type": "RESERVED",
                "content": {"type": "SYMBOL", "name": "_lowercase_identifier"},
                "context_name": "attribute_id"
            }"#,
        )
        .expect("RESERVED parses");
        match prod {
            Production::Reserved { content, .. } => match *content {
                Production::Symbol { name } => assert_eq!(name, "_lowercase_identifier"),
                other => panic!("expected inner SYMBOL, got {other:?}"),
            },
            other => panic!("expected RESERVED, got {other:?}"),
        }
    }

    /// A grammar whose rule is wrapped in `RESERVED` is loadable
    /// end-to-end via [`Grammar::from_bytes`].
    #[test]
    fn reserved_grammar_loads_end_to_end() {
        let bytes = br#"{
            "name": "tiny_reserved",
            "rules": {
                "program": {
                    "type": "RESERVED",
                    "content": {"type": "SYMBOL", "name": "ident"},
                    "context_name": "keywords"
                },
                "ident": {"type": "PATTERN", "value": "[a-z]+"}
            }
        }"#;
        let g = Grammar::from_bytes("tiny_reserved", bytes).expect("RESERVED-using grammar loads");
        assert!(g.rules.contains_key("program"));
    }

    #[test]
    fn reserved_walker_helpers_recurse_into_content() {
        // The walker's helpers (first_symbol, has_field_in,
        // referenced_symbols, ...) all need to descend through
        // RESERVED into its content. If they bail at RESERVED, the
        // `pick_choice_with_cursor` heuristic ranks the alt below
        // alts that DO recurse, which produces wrong emit output
        // even when the deserialiser doesn't crash.
        let prod: Production = serde_json::from_str(
            r#"{
                "type": "RESERVED",
                "content": {
                    "type": "FIELD",
                    "name": "lhs",
                    "content": {"type": "SYMBOL", "name": "expr"}
                },
                "context_name": "ctx"
            }"#,
        )
        .expect("nested RESERVED parses");
        assert_eq!(first_symbol(&prod), Some("expr"));
        assert!(has_field_in(&prod, &["lhs"]));
        let symbols = referenced_symbols(&prod);
        assert!(symbols.contains(&"expr"));
    }

    // -- Yield-set tests --

    /// Computes the yield set of `prod` against `grammar`, starting
    /// from a fresh visited set and a copy of the grammar's cached
    /// yield sets.
    fn yield_of(grammar: &Grammar, prod: &Production) -> std::collections::HashSet<String> {
        let mut visited = std::collections::HashSet::new();
        let mut cache = grammar.yield_sets.clone();
        yield_of_production(grammar, prod, &mut visited, &mut cache)
    }

    /// Deserialises a grammar with no rules, for tests that exercise a
    /// standalone production and need no rule lookups.
    fn empty_grammar() -> Grammar {
        serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap()
    }

    /// Only the first member of a SEQ contributes to its yield set.
    #[test]
    fn yield_set_seq_only_first_member() {
        let prod: Production = serde_json::from_str(
            r#"{
                "type": "SEQ",
                "members": [
                    {"type": "SYMBOL", "name": "identifier"},
                    {"type": "STRING", "value": "as"},
                    {"type": "SYMBOL", "name": "target"}
                ]
            }"#,
        )
        .expect("valid SEQ");
        // Built directly, like the sibling yield-set tests, so the test
        // does not depend on how `Grammar::from_bytes` treats `{}`.
        let g = empty_grammar();
        let ys = yield_of(&g, &prod);
        assert!(ys.contains("identifier"), "SEQ yields first member");
        assert!(
            !ys.contains("target"),
            "SEQ must NOT yield non-first members"
        );
    }

    /// A CHOICE yields the union of its alternatives.
    #[test]
    fn yield_set_choice_union() {
        let prod: Production = serde_json::from_str(
            r#"{
                "type": "CHOICE",
                "members": [
                    {"type": "SYMBOL", "name": "a"},
                    {"type": "SYMBOL", "name": "b"}
                ]
            }"#,
        )
        .expect("valid CHOICE");
        let g = empty_grammar();
        let ys = yield_of(&g, &prod);
        assert_eq!(ys.len(), 2);
        assert!(ys.contains("a"));
        assert!(ys.contains("b"));
    }

    /// A SYMBOL naming a hidden rule yields that rule's alternatives,
    /// never the hidden rule's own name.
    #[test]
    fn yield_set_hidden_expansion() {
        let mut g = serde_json::from_str::<Grammar>(
            r#"{"name":"t","rules":{
                "_value": {
                    "type": "CHOICE",
                    "members": [
                        {"type": "SYMBOL", "name": "number"},
                        {"type": "SYMBOL", "name": "object"}
                    ]
                }
            }}"#,
        )
        .unwrap();
        // Plain deserialisation leaves the derived tables unset here;
        // fill them in by hand before querying.
        g.subtypes = compute_subtype_closure(&g);
        g.yield_sets = compute_yield_sets(&g);
        let sym: Production =
            serde_json::from_str(r#"{"type": "SYMBOL", "name": "_value"}"#).unwrap();
        let ys = yield_of(&g, &sym);
        assert!(
            ys.contains("number"),
            "hidden rule expands into its CHOICE members"
        );
        assert!(ys.contains("object"));
        assert!(
            !ys.contains("_value"),
            "hidden rule name is not in yield set"
        );
    }

    /// OPTIONAL yields its content plus the empty string (epsilon).
    #[test]
    fn yield_set_optional_includes_epsilon() {
        let prod: Production = serde_json::from_str(
            r#"{"type": "OPTIONAL", "content": {"type": "SYMBOL", "name": "x"}}"#,
        )
        .unwrap();
        let g = empty_grammar();
        let ys = yield_of(&g, &prod);
        assert!(ys.contains("x"));
        assert!(ys.contains(""), "OPTIONAL includes epsilon");
    }

    /// A named ALIAS yields its `value`, not the aliased symbol's name.
    #[test]
    fn yield_set_alias_uses_value() {
        let prod: Production = serde_json::from_str(
            r#"{"type": "ALIAS", "content": {"type": "SYMBOL", "name": "real"},
                "named": true, "value": "alias_name"}"#,
        )
        .unwrap();
        let g = empty_grammar();
        let ys = yield_of(&g, &prod);
        assert_eq!(ys.len(), 1);
        assert!(ys.contains("alias_name"), "named ALIAS yields its value");
    }
}