Skip to main content

panproto_parse/
emit_pretty.rs

1#![allow(
2    clippy::module_name_repetitions,
3    clippy::too_many_lines,
4    clippy::too_many_arguments,
5    clippy::map_unwrap_or,
6    clippy::option_if_let_else,
7    clippy::elidable_lifetime_names,
8    clippy::items_after_statements,
9    clippy::needless_pass_by_value,
10    clippy::single_match_else,
11    clippy::manual_let_else,
12    clippy::match_same_arms,
13    clippy::missing_const_for_fn,
14    clippy::single_char_pattern,
15    clippy::naive_bytecount,
16    clippy::expect_used,
17    clippy::redundant_pub_crate,
18    clippy::used_underscore_binding,
19    clippy::redundant_field_names,
20    clippy::struct_field_names,
21    clippy::redundant_else,
22    clippy::similar_names
23)]
24
25//! De-novo source emission from a by-construction schema.
26//!
27//! [`AstParser::emit`] reconstructs source from byte-position fragments
28//! that the parser stored on the schema during `parse`. That works for
29//! edit pipelines (`parse → transform → emit`) but fails for schemas
30//! built by hand (`SchemaBuilder` with no parse history): they carry
31//! no `start-byte`, no `interstitial-N`, no `literal-value`, and the
32//! reconstructor returns `Err(EmitFailed { reason: "schema has no
33//! text fragments" })`.
34//!
35//! This module renders such schemas to source bytes by walking
36//! tree-sitter's `grammar.json` production rules. For each schema
37//! vertex of kind `K`, the walker looks up `K`'s production in the
38//! grammar and emits its body in order:
39//!
40//! - `STRING` nodes contribute literal token bytes directly.
41//! - `SYMBOL` and `FIELD` nodes recurse into the schema's children,
42//!   matching by edge kind (which is the tree-sitter field name).
43//! - `SEQ` emits its members in order.
44//! - `CHOICE` picks the alternative whose head `SYMBOL` matches an
45//!   actual child kind, or whose terminals appear in the rendered
46//!   prefix; falls back to the first non-`BLANK` alternative when no
47//!   alternative matches.
48//! - `REPEAT` and `REPEAT1` emit their content once per matching
49//!   child edge in declared order.
50//! - `OPTIONAL` emits its content iff a corresponding child edge or
51//!   constraint is populated.
52//! - `PATTERN` is a regex placeholder for variable-text terminals
53//!   (identifiers, numbers, quoted strings). The walker emits a
54//!   `literal-value` constraint when present and otherwise falls
55//!   back to a placeholder derived from the regex shape.
56//! - `BLANK`, `TOKEN`, `IMMEDIATE_TOKEN`, `ALIAS`, `PREC*` are
57//!   handled transparently (the inner content is emitted; the
58//!   wrapper is dropped).
59//!
60//! Whitespace and indentation come from a `FormatPolicy` applied
61//! during emission. The default policy inserts a single space between
62//! adjacent tokens, a newline after `;` / `}` / `{`, and tracks an
63//! indent counter on `{` / `}` boundaries.
64//!
65//! Output is *syntactically valid* for any grammar that ships
66//! `grammar.json`. Idiomatic formatting (rustfmt-style spacing rules,
67//! per-language conventions) is a polish layer that lives outside
68//! this module.
69
70use std::collections::BTreeMap;
71
72use panproto_schema::{Edge, Schema};
73use serde::Deserialize;
74
75use crate::error::ParseError;
76
77// ═══════════════════════════════════════════════════════════════════
78// Grammar JSON model
79// ═══════════════════════════════════════════════════════════════════
80
81/// A single tree-sitter production rule.
82///
83/// Mirrors the shape emitted by `tree-sitter generate`: every node has
84/// a `type` discriminator that selects a structural variant. The
85/// untyped subset (`PATTERN`, `STRING`, `SYMBOL`, `BLANK`) handles
86/// terminals; the structural subset (`SEQ`, `CHOICE`, `REPEAT`,
87/// `REPEAT1`, `OPTIONAL`, `FIELD`, `ALIAS`, `TOKEN`,
88/// `IMMEDIATE_TOKEN`, `PREC*`) builds composite productions.
89#[derive(Debug, Clone, Deserialize)]
90#[serde(tag = "type")]
91#[non_exhaustive]
92pub enum Production {
93    /// Concatenation of productions.
94    #[serde(rename = "SEQ")]
95    Seq {
96        /// Ordered members; each is emitted in turn.
97        members: Vec<Self>,
98    },
99    /// Alternation between productions.
100    #[serde(rename = "CHOICE")]
101    Choice {
102        /// Alternatives; the walker picks one based on the schema's
103        /// children and constraints.
104        members: Vec<Self>,
105    },
106    /// Zero-or-more repetition.
107    #[serde(rename = "REPEAT")]
108    Repeat {
109        /// The repeated body.
110        content: Box<Self>,
111    },
112    /// One-or-more repetition.
113    #[serde(rename = "REPEAT1")]
114    Repeat1 {
115        /// The repeated body.
116        content: Box<Self>,
117    },
118    /// Optional inclusion (zero or one).
119    ///
120    /// Tree-sitter usually emits `OPTIONAL` as `CHOICE { content,
121    /// BLANK }`, but recent generator versions also emit explicit
122    /// `OPTIONAL` nodes; both shapes are accepted.
123    #[serde(rename = "OPTIONAL")]
124    Optional {
125        /// The optional body.
126        content: Box<Self>,
127    },
128    /// Reference to another rule by name.
129    #[serde(rename = "SYMBOL")]
130    Symbol {
131        /// Name of the referenced rule (matches a vertex kind on the
132        /// schema side).
133        name: String,
134    },
135    /// Literal token bytes.
136    #[serde(rename = "STRING")]
137    String {
138        /// The literal token. Emitted verbatim.
139        value: String,
140    },
141    /// Regex-matched terminal.
142    ///
143    /// At parse time this matches arbitrary bytes; at emit time the
144    /// walker substitutes a `literal-value` constraint when present
145    /// and falls back to a placeholder otherwise.
146    #[serde(rename = "PATTERN")]
147    Pattern {
148        /// The original regex.
149        value: String,
150    },
151    /// The empty production. Emits nothing.
152    #[serde(rename = "BLANK")]
153    Blank,
154    /// Named field over a content production.
155    ///
156    /// The field `name` matches an edge kind on the schema side; the
157    /// walker resolves the corresponding child vertex and recurses
158    /// into `content` with that child as context.
159    #[serde(rename = "FIELD")]
160    Field {
161        /// Field name (matches edge kind).
162        name: String,
163        /// The contents of the field.
164        content: Box<Self>,
165    },
166    /// An aliased production.
167    ///
168    /// `value` records the parser-visible kind; the walker emits
169    /// `content` and ignores the alias rename.
170    #[serde(rename = "ALIAS")]
171    Alias {
172        /// The aliased content.
173        content: Box<Self>,
174        /// Whether the alias is a named node.
175        #[serde(default)]
176        named: bool,
177        /// The alias's surface name.
178        #[serde(default)]
179        value: String,
180    },
181    /// A token wrapper.
182    ///
183    /// Tree-sitter uses `TOKEN` to mark a sub-rule as a single
184    /// lexical token; the walker emits the inner content unchanged.
185    #[serde(rename = "TOKEN")]
186    Token {
187        /// The wrapped content.
188        content: Box<Self>,
189    },
190    /// An immediate-token wrapper (no preceding whitespace).
191    ///
192    /// Treated like [`Production::Token`] for emit purposes.
193    #[serde(rename = "IMMEDIATE_TOKEN")]
194    ImmediateToken {
195        /// The wrapped content.
196        content: Box<Self>,
197    },
198    /// Precedence wrapper.
199    #[serde(rename = "PREC")]
200    Prec {
201        /// Precedence value (numeric or string). Ignored at emit time.
202        #[allow(dead_code)]
203        value: serde_json::Value,
204        /// The wrapped content.
205        content: Box<Self>,
206    },
207    /// Left-associative precedence wrapper.
208    #[serde(rename = "PREC_LEFT")]
209    PrecLeft {
210        /// Precedence value. Ignored at emit time.
211        #[allow(dead_code)]
212        value: serde_json::Value,
213        /// The wrapped content.
214        content: Box<Self>,
215    },
216    /// Right-associative precedence wrapper.
217    #[serde(rename = "PREC_RIGHT")]
218    PrecRight {
219        /// Precedence value. Ignored at emit time.
220        #[allow(dead_code)]
221        value: serde_json::Value,
222        /// The wrapped content.
223        content: Box<Self>,
224    },
225    /// Dynamic precedence wrapper.
226    #[serde(rename = "PREC_DYNAMIC")]
227    PrecDynamic {
228        /// Precedence value. Ignored at emit time.
229        #[allow(dead_code)]
230        value: serde_json::Value,
231        /// The wrapped content.
232        content: Box<Self>,
233    },
234    /// Reserved-word wrapper (tree-sitter ≥ 0.25).
235    ///
236    /// Tree-sitter's `RESERVED` rule marks an inner production as a
237    /// reserved-word context: the parser excludes the listed identifiers
238    /// from being treated as the inner symbol. The `context_name`
239    /// metadata names the reserved-word set; the emitter does not need
240    /// it (we are walking schema → bytes, not enforcing reserved-word
241    /// constraints), so we emit the inner content unchanged, the same
242    /// way [`Production::Token`] and [`Production::ImmediateToken`] do.
243    #[serde(rename = "RESERVED")]
244    Reserved {
245        /// The wrapped content.
246        content: Box<Self>,
247        /// Name of the reserved-word context. Ignored at emit time.
248        #[allow(dead_code)]
249        #[serde(default)]
250        context_name: String,
251    },
252}
253
254/// Structural role of a STRING token within a grammar rule.
255///
256/// Derived at Grammar construction time from the token's position in
257/// the production rule body. The role determines spacing behavior in
258/// the layout pass via a role-pair lookup table.
259#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
260pub enum TokenRole {
261    /// First STRING in a matched-pair SEQ (e.g. `(`, `[`, `{`, `<`,
262    /// `begin`, `${`, `⟨`). No space after.
263    BracketOpen,
264    /// Last STRING in a matched-pair SEQ (e.g. `)`, `]`, `}`, `>`,
265    /// `end`, `⟩`). No space before.
266    BracketClose,
267    /// First STRING in a REPEAT body's inner SEQ (e.g. `,` in
268    /// `REPEAT(SEQ [",", item])`). No space before, space after.
269    Separator,
270    /// Alphanumeric STRING that is a language keyword (e.g. `if`,
271    /// `while`, `and`, `model`). Space before and after.
272    Keyword,
273    /// Non-alphanumeric STRING between content members inside a CHOICE
274    /// alternative (e.g. `+`, `=`, `~`, `<-` in binary expression
275    /// alternatives). Space before and after.
276    Operator,
277    /// Non-alphanumeric STRING between content members in a standalone
278    /// SEQ (not inside a CHOICE). Examples: `.` in `attribute`,
279    /// `::` in `scoped_identifier`, `->` in `pointer_member`. These
280    /// are structural connectors, not algebraic operators. No space.
281    Connector,
282    /// Text from a leaf vertex's `literal-value` constraint.
283    Terminal,
284}
285
286/// A grammar's production-rule table, deserialized from `grammar.json`.
287///
288/// Only the fields the emitter consumes are decoded; precedences,
289/// conflicts, externals, and other parser-only metadata are ignored.
290#[derive(Debug, Clone, Deserialize)]
291#[non_exhaustive]
292pub struct Grammar {
293    /// Grammar name (e.g. `"rust"`, `"typescript"`).
294    #[allow(dead_code)]
295    pub name: String,
296    /// Map from rule name (a vertex kind on the schema side) to
297    /// production. Entries are kept in lexical order so iteration
298    /// is deterministic.
299    pub rules: BTreeMap<String, Production>,
300    /// Supertypes declared in the grammar's `supertypes` field. A
301    /// supertype is a rule whose body is a `CHOICE` of `SYMBOL`
302    /// references; tree-sitter parsers report a node's kind as one
303    /// of the subtypes (e.g. `identifier`, `typed_parameter`) rather
304    /// than the supertype name (`parameter`), so the emitter needs to
305    /// know that a child kind in a subtype set should match the
306    /// supertype name when a SYMBOL references it.
307    #[serde(default, deserialize_with = "deserialize_supertypes")]
308    pub supertypes: std::collections::HashSet<String>,
309    /// Tree-sitter `extras` rules: the named symbols (typically comments)
310    /// that tree-sitter skips at parse time but records as children of the
311    /// surrounding vertex. They appear nowhere in the production grammar,
312    /// so the rule walker cannot reconcile them against the cursor — the
313    /// emit pass therefore drains them as a side channel: at vertex entry
314    /// and between REPEAT iterations any leading extras-kind edges are
315    /// consumed and emitted directly. The set is populated at
316    /// `Grammar::from_bytes` by collecting every `SYMBOL { name }` and
317    /// named `ALIAS { value, named: true }` under the top-level `extras`
318    /// array. Pattern-only extras (e.g. `\s` whitespace) are not vertex
319    /// kinds and are excluded.
320    #[serde(default, deserialize_with = "deserialize_extras")]
321    pub extras: std::collections::HashSet<String>,
322    /// Precomputed subtyping closure: `subtypes[symbol_name]` is the
323    /// set of vertex kinds that satisfy a SYMBOL `symbol_name`
324    /// reference on the schema side.
325    ///
326    /// Built once at [`Grammar::from_bytes`] time by walking each
327    /// hidden rule (`_`-prefixed), declared supertype, and named
328    /// `ALIAS { value: K, ... }` production to its leaf SYMBOLs and
329    /// recording the closure. This replaces the prior heuristic
330    /// `kind_satisfies_symbol` that walked the rule body on every
331    /// query: lookups are now O(1) and the relation is exactly the
332    /// transitive closure of "is reachable via hidden / supertype /
333    /// alias dispatch", with no over-expansion through non-hidden
334    /// non-supertype rule references.
335    #[serde(skip)]
336    pub subtypes: std::collections::HashMap<String, std::collections::HashSet<String>>,
337    /// Precomputed Yield sets: `yield_sets[rule_name]` is the set of
338    /// concrete vertex kinds that can appear as the **first named
339    /// child** when that rule's production is taken.
340    ///
341    /// Defined inductively:
342    /// - `Yield(SYMBOL S)` where S is hidden/supertype = `Yield(rules[S])`
343    /// - `Yield(SYMBOL S)` where S is concrete = `{S}`
344    /// - `Yield(SEQ [M1, ...])` = `Yield(M1)` (only first member)
345    /// - `Yield(CHOICE [M1, ..., Mn])` = `⋃ Yield(Mi)`
346    /// - `Yield(OPTIONAL { c })` = `Yield(c) ∪ {ε}`
347    /// - `Yield(BLANK)` = `{ε}`
348    /// - Wrappers (PREC*, TOKEN, FIELD, REPEAT, etc.) = `Yield(content)`
349    /// - `Yield(STRING)` = `Yield(PATTERN)` = `∅`
350    /// - `Yield(ALIAS { value: V, named: true })` = `{V}`
351    ///
352    /// Epsilon is represented as the empty string `""`.
353    #[serde(skip)]
354    pub yield_sets: std::collections::HashMap<String, std::collections::HashSet<String>>,
355    /// Child kinds allowed per parent kind, derived from node-types.json.
356    /// Maps parent kind to the set of ALL named child kinds that tree-sitter's
357    /// parser can produce for that parent (from both `children.types` and
358    /// `fields.*.types`). Used by `augment_subtypes_from_node_types` to
359    /// close the grammar/parser divergence gap.
360    #[serde(skip)]
361    pub node_type_children: std::collections::HashMap<String, std::collections::HashSet<String>>,
362    /// Per-field child kinds from node-types.json: maps parent kind →
363    /// field name → set of child kinds. Used by the augmentation to
364    /// restrict subtype edges to structurally matching positions.
365    #[serde(skip)]
366    pub node_type_field_children: std::collections::HashMap<
367        String,
368        std::collections::HashMap<String, std::collections::HashSet<String>>,
369    >,
370    /// Non-field child kinds from node-types.json: maps parent kind →
371    /// set of child kinds that appear in `children.types` (not in any field).
372    #[serde(skip)]
373    pub node_type_nonfield_children:
374        std::collections::HashMap<String, std::collections::HashSet<String>>,
375    /// Anonymous ALIAS values for external scanner tokens. Maps external
376    /// symbol name (e.g. `_ternary_qmark`) to the ALIAS value string
377    /// (e.g. `"?"`). Built by scanning grammar.json rule bodies for
378    /// `ALIAS { content: SYMBOL S, named: false, value: V }` where S
379    /// has no grammar rule.
380    #[serde(skip)]
381    pub external_alias_map: std::collections::HashMap<String, String>,
382    /// Per-rule token role classification. Maps rule name to a map of
383    /// STRING value to its structural role in that rule. Derived at
384    /// construction time by analyzing each rule's SEQ structure to
385    /// identify bracket pairs, separators, keywords, and operators.
386    #[serde(skip)]
387    pub token_roles:
388        std::collections::HashMap<String, std::collections::HashMap<String, TokenRole>>,
389    /// Set of `(rule_name, open_bracket_value)` pairs where the bracket
390    /// triggers indentation (the content between open and close contains
391    /// `REPEAT`/`REPEAT1`). Block-level constructs like `statement_block`
392    /// use indenting brackets; inline constructs like interpolation do not.
393    #[serde(skip)]
394    pub indent_triggers: std::collections::HashSet<(String, String)>,
395    /// Line-comment prefixes extracted from the grammar's extras.
396    /// Each prefix is a STRING value from a `TOKEN(SEQ [STRING prefix,
397    /// PATTERN ...])` pattern in the extras array, verified to be an
398    /// extras rule. Used by the layout pass to insert a newline after
399    /// comment Lit tokens.
400    #[serde(skip)]
401    pub line_comment_prefixes: Vec<String>,
402    /// External tokens that produce indent-open layout actions.
403    /// Identified by tree-sitter naming convention: names ending with
404    /// `_indent` or equal to `_indent`.
405    #[serde(skip)]
406    pub external_indent_opens: std::collections::HashSet<String>,
407    /// External tokens that produce indent-close layout actions.
408    #[serde(skip)]
409    pub external_indent_closes: std::collections::HashSet<String>,
410    /// External tokens that produce line breaks.
411    #[serde(skip)]
412    pub external_newlines: std::collections::HashSet<String>,
413    /// External tokens equivalent to semicolons.
414    #[serde(skip)]
415    pub external_semicolons: std::collections::HashSet<String>,
416    /// Named alias map: maps alias value to source symbol name.
417    /// When a vertex kind has no direct grammar rule, this map resolves
418    /// `ALIAS { content: SYMBOL source, named: true, value: alias }` so
419    /// the emitter can walk the source rule with proper token roles.
420    #[serde(skip)]
421    pub named_alias_map: std::collections::HashMap<String, String>,
422}
423
424fn deserialize_supertypes<'de, D>(
425    deserializer: D,
426) -> Result<std::collections::HashSet<String>, D::Error>
427where
428    D: serde::Deserializer<'de>,
429{
430    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
431    let mut out = std::collections::HashSet::new();
432    for entry in entries {
433        match entry {
434            serde_json::Value::String(s) => {
435                out.insert(s);
436            }
437            serde_json::Value::Object(map) => {
438                if let Some(serde_json::Value::String(name)) = map.get("name") {
439                    out.insert(name.clone());
440                }
441            }
442            _ => {}
443        }
444    }
445    Ok(out)
446}
447
448fn deserialize_extras<'de, D>(
449    deserializer: D,
450) -> Result<std::collections::HashSet<String>, D::Error>
451where
452    D: serde::Deserializer<'de>,
453{
454    let entries: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
455    let mut out = std::collections::HashSet::new();
456    for entry in entries {
457        if let serde_json::Value::Object(map) = entry {
458            let ty = map.get("type").and_then(serde_json::Value::as_str);
459            match ty {
460                // SYMBOL { name: K } — the extras rule is a named symbol
461                // (typically `line_comment` / `block_comment`). The kind
462                // K appears as a real child vertex on the schema side.
463                Some("SYMBOL") => {
464                    if let Some(serde_json::Value::String(name)) = map.get("name") {
465                        out.insert(name.clone());
466                    }
467                }
468                // ALIAS { content, value: V, named: true } — the extras
469                // rule renames its content; V is the kind on the schema.
470                Some("ALIAS") => {
471                    let named = map
472                        .get("named")
473                        .and_then(serde_json::Value::as_bool)
474                        .unwrap_or(false);
475                    if named {
476                        if let Some(serde_json::Value::String(value)) = map.get("value") {
477                            out.insert(value.clone());
478                        }
479                    }
480                }
481                // PATTERN / STRING / TOKEN entries describe inter-token
482                // whitespace and have no vertex-side representation.
483                _ => {}
484            }
485        }
486    }
487    Ok(out)
488}
489
490impl Grammar {
491    /// Parse a grammar's `grammar.json` bytes.
492    ///
493    /// Builds the subtyping closure as part of construction so every
494    /// downstream lookup is O(1). The closure is the least relation
495    /// containing `(K, K)` for every rule key `K` and closed under:
496    ///
497    /// - hidden-rule expansion: if `S` is hidden and a SYMBOL `S` may
498    ///   reach SYMBOL `K`, then `K ⊑ S`.
499    /// - supertype expansion: if `S` is in the grammar's supertypes
500    ///   block and `K` is one of `S`'s alternatives, then `K ⊑ S`.
501    /// - alias renaming: if a rule body contains
502    ///   `ALIAS { content: SYMBOL R, value: A, named: true }` where
503    ///   `R` reaches kind `K` (or `K = R` when no further hop), then
504    ///   `A ⊑ R` and `K ⊑ A`.
505    ///
506    /// # Errors
507    ///
508    /// Returns [`ParseError::EmitFailed`] when the bytes are not a
509    /// valid `grammar.json` document.
510    pub fn from_bytes(protocol: &str, bytes: &[u8]) -> Result<Self, ParseError> {
511        Self::from_bytes_with_node_types(protocol, bytes, None)
512    }
513
514    /// Parse a grammar from both `grammar.json` and optionally
515    /// `node-types.json` bytes.
516    ///
517    /// # Errors
518    ///
519    /// Returns [`ParseError::EmitFailed`] when `grammar_bytes` is
520    /// not a valid `grammar.json` document.
521    pub fn from_bytes_with_node_types(
522        protocol: &str,
523        grammar_bytes: &[u8],
524        node_types_bytes: Option<&[u8]>,
525    ) -> Result<Self, ParseError> {
526        let mut grammar: Self =
527            serde_json::from_slice(grammar_bytes).map_err(|e| ParseError::EmitFailed {
528                protocol: protocol.to_owned(),
529                reason: format!("grammar.json deserialization failed: {e}"),
530            })?;
531        grammar.subtypes = compute_subtype_closure(&grammar);
532        grammar.named_alias_map = build_named_alias_map(&grammar);
533        grammar.yield_sets = compute_yield_sets(&grammar);
534        if let Some(nt_bytes) = node_types_bytes {
535            let (all_children, field_children, nonfield_children) =
536                build_node_type_children(nt_bytes);
537            grammar.node_type_children = all_children;
538            grammar.node_type_field_children = field_children;
539            grammar.node_type_nonfield_children = nonfield_children;
540            augment_subtypes_from_node_types(&mut grammar);
541        }
542        grammar.yield_sets = compute_yield_sets(&grammar);
543        grammar.external_alias_map = build_external_alias_map(&grammar);
544        let (token_roles, indent_triggers) = compute_token_roles(&grammar);
545        grammar.token_roles = token_roles;
546        grammar.indent_triggers = indent_triggers;
547        grammar.line_comment_prefixes = extract_line_comment_prefixes(&grammar);
548        classify_external_layout_tokens(&mut grammar);
549        grammar.yield_sets = compute_yield_sets(&grammar);
550        Ok(grammar)
551    }
552}
553
554/// Compute the subtyping relation as a forward-indexed map from a
555/// SYMBOL name to the set of vertex kinds that satisfy that SYMBOL.
556fn compute_subtype_closure(
557    grammar: &Grammar,
558) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
559    use std::collections::{HashMap, HashSet};
560    // Edges of the "kind X satisfies SYMBOL Y" relation. `K ⊑ Y` is
561    // recorded whenever Y is reached by walking the grammar's
562    // ALIAS / hidden-rule / supertype dispatch from a position where
563    // K is the actual vertex kind.
564    let mut subtypes: HashMap<String, HashSet<String>> = HashMap::new();
565    for name in grammar.rules.keys() {
566        subtypes
567            .entry(name.clone())
568            .or_default()
569            .insert(name.clone());
570    }
571
572    // First pass: collect the immediate "satisfies" edges from each
573    // expandable rule (hidden, supertype) to the kinds reachable by
574    // walking its body, plus alias edges.
575    fn walk<'g>(
576        grammar: &'g Grammar,
577        production: &'g Production,
578        visited: &mut HashSet<&'g str>,
579        out: &mut HashSet<String>,
580    ) {
581        match production {
582            Production::Symbol { name } => {
583                // Direct subtype.
584                out.insert(name.clone());
585                // Continue expansion through hidden / supertype rules
586                // so the closure traverses pass-through dispatch.
587                let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
588                if expand && visited.insert(name.as_str()) {
589                    if let Some(rule) = grammar.rules.get(name) {
590                        walk(grammar, rule, visited, out);
591                    }
592                }
593            }
594            Production::Choice { members } | Production::Seq { members } => {
595                for m in members {
596                    walk(grammar, m, visited, out);
597                }
598            }
599            Production::Alias {
600                content,
601                named,
602                value,
603            } => {
604                if *named && !value.is_empty() {
605                    out.insert(value.clone());
606                }
607                walk(grammar, content, visited, out);
608            }
609            Production::Repeat { content }
610            | Production::Repeat1 { content }
611            | Production::Optional { content }
612            | Production::Field { content, .. }
613            | Production::Token { content }
614            | Production::ImmediateToken { content }
615            | Production::Prec { content, .. }
616            | Production::PrecLeft { content, .. }
617            | Production::PrecRight { content, .. }
618            | Production::PrecDynamic { content, .. }
619            | Production::Reserved { content, .. } => {
620                walk(grammar, content, visited, out);
621            }
622            _ => {}
623        }
624    }
625
626    for (name, rule) in &grammar.rules {
627        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
628        if !expand {
629            continue;
630        }
631        let mut visited: HashSet<&str> = HashSet::new();
632        visited.insert(name.as_str());
633        let mut reachable: HashSet<String> = HashSet::new();
634        walk(grammar, rule, &mut visited, &mut reachable);
635        for kind in &reachable {
636            subtypes
637                .entry(kind.clone())
638                .or_default()
639                .insert(name.clone());
640        }
641    }
642
643    // Aliases: scan every rule body for ALIAS { content, value }
644    // declarations. The kinds reachable from `content` satisfy
645    // `value`, AND (by construction) `value` satisfies the
646    // surrounding rule. Walking the ENTIRE grammar once captures
647    // every alias site, irrespective of which rule introduces it.
648    fn collect_aliases<'g>(production: &'g Production, out: &mut Vec<(String, &'g Production)>) {
649        match production {
650            Production::Alias {
651                content,
652                named,
653                value,
654            } => {
655                if *named && !value.is_empty() {
656                    out.push((value.clone(), content.as_ref()));
657                }
658                collect_aliases(content, out);
659            }
660            Production::Choice { members } | Production::Seq { members } => {
661                for m in members {
662                    collect_aliases(m, out);
663                }
664            }
665            Production::Repeat { content }
666            | Production::Repeat1 { content }
667            | Production::Optional { content }
668            | Production::Field { content, .. }
669            | Production::Token { content }
670            | Production::ImmediateToken { content }
671            | Production::Prec { content, .. }
672            | Production::PrecLeft { content, .. }
673            | Production::PrecRight { content, .. }
674            | Production::PrecDynamic { content, .. }
675            | Production::Reserved { content, .. } => {
676                collect_aliases(content, out);
677            }
678            _ => {}
679        }
680    }
681    let mut aliases: Vec<(String, &Production)> = Vec::new();
682    for rule in grammar.rules.values() {
683        collect_aliases(rule, &mut aliases);
684    }
685    for (alias_value, content) in aliases {
686        let mut visited: HashSet<&str> = HashSet::new();
687        let mut reachable: HashSet<String> = HashSet::new();
688        walk(grammar, content, &mut visited, &mut reachable);
689        // Aliased value satisfies itself and is satisfied by every
690        // kind its content can reach.
691        subtypes
692            .entry(alias_value.clone())
693            .or_default()
694            .insert(alias_value.clone());
695        for kind in reachable {
696            subtypes
697                .entry(kind)
698                .or_default()
699                .insert(alias_value.clone());
700        }
701    }
702
703    // Transitive close through hidden and supertype rules via Tarjan SCC.
704    //
705    // The relation `K ⊑ Y` means "a vertex of kind K can appear where
706    // the grammar says SYMBOL Y." Transitivity applies when Y is a
707    // hidden or supertype rule (a dispatch point), NOT when Y is a
708    // concrete named rule. We build the directed graph G on dispatchable
709    // node names with edge Y → Z iff Z ∈ subtypes[Y] and Z is dispatchable.
710    // The transitive closure within G is the union of every reachable
711    // dispatchable node, which by Tarjan's theorem is computed in
712    // O(V + E) by contracting SCCs into a DAG, then unioning closures
713    // along reverse topological order.
714    let is_dispatch = |s: &str| s.starts_with('_') || grammar.supertypes.contains(s);
715    // 1. Nodes: every dispatchable name that appears as a key in subtypes
716    //    OR as a member of any subtypes value.
717    let mut nodes: HashSet<String> = HashSet::new();
718    for (k, vs) in &subtypes {
719        if is_dispatch(k) {
720            nodes.insert(k.clone());
721        }
722        for v in vs {
723            if is_dispatch(v) {
724                nodes.insert(v.clone());
725            }
726        }
727    }
728    let nodes: Vec<String> = nodes.into_iter().collect();
729    let index_of: HashMap<&str, usize> = nodes
730        .iter()
731        .enumerate()
732        .map(|(i, n)| (n.as_str(), i))
733        .collect();
734    // 2. Edges: Y → Z iff Z ∈ subtypes[Y] and both are dispatchable.
735    let mut edges: Vec<Vec<usize>> = vec![Vec::new(); nodes.len()];
736    for (i, name) in nodes.iter().enumerate() {
737        if let Some(targets) = subtypes.get(name) {
738            for t in targets {
739                if let Some(&j) = index_of.get(t.as_str()) {
740                    if i != j {
741                        edges[i].push(j);
742                    }
743                }
744            }
745        }
746    }
747
748    // 3. Tarjan SCC. `comp[v]` = SCC index of `v`. SCC indices come out
749    //    in reverse topological order (sinks first), which is exactly
750    //    the order we want for closure accumulation.
751    fn tarjan(edges: &[Vec<usize>]) -> Vec<usize> {
752        let n = edges.len();
753        let mut comp = vec![usize::MAX; n];
754        let mut index_arr = vec![usize::MAX; n];
755        let mut lowlink = vec![0usize; n];
756        let mut on_stack = vec![false; n];
757        let mut stack: Vec<usize> = Vec::new();
758        let mut next_index = 0usize;
759        let mut next_comp = 0usize;
760        // Iterative Tarjan to avoid stack overflow on large grammars.
761        let mut work: Vec<(usize, usize)> = Vec::new();
762        for start in 0..n {
763            if index_arr[start] != usize::MAX {
764                continue;
765            }
766            work.push((start, 0));
767            index_arr[start] = next_index;
768            lowlink[start] = next_index;
769            next_index += 1;
770            stack.push(start);
771            on_stack[start] = true;
772            while let Some(&(v, i)) = work.last() {
773                if i < edges[v].len() {
774                    let w = edges[v][i];
775                    if let Some(slot) = work.last_mut() {
776                        slot.1 += 1;
777                    }
778                    if index_arr[w] == usize::MAX {
779                        index_arr[w] = next_index;
780                        lowlink[w] = next_index;
781                        next_index += 1;
782                        stack.push(w);
783                        on_stack[w] = true;
784                        work.push((w, 0));
785                    } else if on_stack[w] && index_arr[w] < lowlink[v] {
786                        lowlink[v] = index_arr[w];
787                    }
788                } else {
789                    if lowlink[v] == index_arr[v] {
790                        while let Some(w) = stack.pop() {
791                            on_stack[w] = false;
792                            comp[w] = next_comp;
793                            if w == v {
794                                break;
795                            }
796                        }
797                        next_comp += 1;
798                    }
799                    let lv = lowlink[v];
800                    work.pop();
801                    if let Some(&(parent, _)) = work.last() {
802                        if lv < lowlink[parent] {
803                            lowlink[parent] = lv;
804                        }
805                    }
806                }
807            }
808        }
809        comp
810    }
811    let comp = tarjan(&edges);
812    let num_comps = comp.iter().max().copied().map_or(0, |m| m + 1);
813
814    // 4. For each SCC, accumulate the set of dispatchable nodes reachable
815    //    from it. SCCs are emitted in reverse topological order, so when
816    //    we process SCC c, every successor SCC has its closure already
817    //    computed.
818    let mut scc_members: Vec<Vec<usize>> = vec![Vec::new(); num_comps];
819    for (v, &c) in comp.iter().enumerate() {
820        scc_members[c].push(v);
821    }
822    let mut scc_closure: Vec<HashSet<String>> = vec![HashSet::new(); num_comps];
823    for c in 0..num_comps {
824        // Members of the SCC are mutually reachable.
825        let mut closure: HashSet<String> = HashSet::new();
826        for &v in &scc_members[c] {
827            closure.insert(nodes[v].clone());
828        }
829        // Successor SCCs' closures (already computed).
830        for &v in &scc_members[c] {
831            for &w in &edges[v] {
832                let wc = comp[w];
833                if wc != c {
834                    closure.extend(scc_closure[wc].iter().cloned());
835                }
836            }
837        }
838        scc_closure[c] = closure;
839    }
840
841    // 5. Apply: for each kind K in `subtypes`, replace its dispatchable
842    //    supertypes by their full closure. Non-dispatchable members
843    //    (concrete kinds) stay as-is.
844    let keys: Vec<String> = subtypes.keys().cloned().collect();
845    for k in keys {
846        let existing = subtypes.remove(&k).unwrap_or_default();
847        let mut new_set: HashSet<String> = HashSet::new();
848        for s in &existing {
849            new_set.insert(s.clone());
850            if let Some(&i) = index_of.get(s.as_str()) {
851                new_set.extend(scc_closure[comp[i]].iter().cloned());
852            }
853        }
854        subtypes.insert(k, new_set);
855    }
856
857    subtypes
858}
859
860/// Compute the Yield set for every rule in the grammar.
861///
862/// `Yield(P)` is the set of concrete vertex kinds that can appear as
863/// the first named child when production P is taken. See the
864/// `Grammar::yield_sets` doc comment for the inductive definition.
865fn compute_yield_sets(
866    grammar: &Grammar,
867) -> std::collections::HashMap<String, std::collections::HashSet<String>> {
868    let mut cache: std::collections::HashMap<String, std::collections::HashSet<String>> =
869        std::collections::HashMap::new();
870    for (name, rule) in &grammar.rules {
871        let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
872        if !expand {
873            continue;
874        }
875        if cache.contains_key(name) {
876            continue;
877        }
878        let mut visited = std::collections::HashSet::new();
879        let ys = yield_of_production(grammar, rule, &mut visited, &mut cache);
880        cache.insert(name.clone(), ys);
881    }
882    cache
883}
884
885/// Compute the Yield set of an arbitrary production node.
886///
887/// Uses `cache` (the partially-built `yield_sets` map) as
888/// memoization. `visited` tracks the current recursion path to
889/// detect cycles through hidden/supertype rules; a cycle returns ∅
890/// (a cycle that never passes through a concrete named symbol
891/// cannot produce a first child).
892fn yield_of_production(
893    grammar: &Grammar,
894    production: &Production,
895    visited: &mut std::collections::HashSet<String>,
896    cache: &mut std::collections::HashMap<String, std::collections::HashSet<String>>,
897) -> std::collections::HashSet<String> {
898    match production {
899        Production::Symbol { name } => {
900            let expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
901            if !expand {
902                let mut set = std::collections::HashSet::new();
903                set.insert(name.clone());
904                return set;
905            }
906            if let Some(cached) = cache.get(name) {
907                return cached.clone();
908            }
909            {
910                if !visited.insert(name.clone()) {
911                    return std::collections::HashSet::new();
912                }
913                let result = if let Some(rule) = grammar.rules.get(name) {
914                    yield_of_production(grammar, rule, visited, cache)
915                } else {
916                    std::collections::HashSet::new()
917                };
918                visited.remove(name);
919                cache.insert(name.clone(), result.clone());
920                result
921            }
922        }
923        Production::Alias {
924            content,
925            named,
926            value,
927        } => {
928            if *named && !value.is_empty() {
929                let mut set = std::collections::HashSet::new();
930                set.insert(value.clone());
931                set
932            } else {
933                yield_of_production(grammar, content, visited, cache)
934            }
935        }
936        Production::Seq { members } => {
937            if members.is_empty() {
938                let mut set = std::collections::HashSet::new();
939                set.insert(String::new());
940                set
941            } else {
942                // Walk SEQ members left-to-right. STRING/PATTERN yield ∅
943                // (anonymous tokens, skipped). Named-child-producing
944                // members yield a non-empty set. If that set contains ε,
945                // the member is optional and the next member's yield is
946                // also reachable. Accumulate until we hit a non-optional
947                // named-child producer.
948                let mut combined = std::collections::HashSet::new();
949                for m in members {
950                    let ys = yield_of_production(grammar, m, visited, cache);
951                    if ys.is_empty() {
952                        continue;
953                    }
954                    let has_epsilon = ys.contains("");
955                    combined.extend(ys);
956                    if !has_epsilon {
957                        break;
958                    }
959                }
960                combined
961            }
962        }
963        Production::Choice { members } => {
964            let mut union = std::collections::HashSet::new();
965            for m in members {
966                union.extend(yield_of_production(grammar, m, visited, cache));
967            }
968            union
969        }
970        Production::Optional { content } => {
971            let mut set = yield_of_production(grammar, content, visited, cache);
972            set.insert(String::new());
973            set
974        }
975        Production::Blank => {
976            let mut set = std::collections::HashSet::new();
977            set.insert(String::new());
978            set
979        }
980        Production::String { .. } | Production::Pattern { .. } => std::collections::HashSet::new(),
981        Production::Repeat { content } => {
982            let mut set = yield_of_production(grammar, content, visited, cache);
983            set.insert(String::new());
984            set
985        }
986        Production::Repeat1 { content }
987        | Production::Field { content, .. }
988        | Production::Token { content }
989        | Production::ImmediateToken { content }
990        | Production::Prec { content, .. }
991        | Production::PrecLeft { content, .. }
992        | Production::PrecRight { content, .. }
993        | Production::PrecDynamic { content, .. }
994        | Production::Reserved { content, .. } => {
995            yield_of_production(grammar, content, visited, cache)
996        }
997    }
998}
999
1000// ═══════════════════════════════════════════════════════════════════
1001// node-types.json integration
1002// ═══════════════════════════════════════════════════════════════════
1003
1004/// Parse node-types.json and build a map from parent kind to the set
1005/// of all named child kinds the parser can produce for that parent.
1006type NodeTypeResult = (
1007    std::collections::HashMap<String, std::collections::HashSet<String>>,
1008    std::collections::HashMap<
1009        String,
1010        std::collections::HashMap<String, std::collections::HashSet<String>>,
1011    >,
1012    std::collections::HashMap<String, std::collections::HashSet<String>>,
1013);
1014
1015fn build_node_type_children(nt_bytes: &[u8]) -> NodeTypeResult {
1016    use std::collections::{HashMap, HashSet};
1017    let node_types: Vec<crate::theory_extract::NodeType> = match serde_json::from_slice(nt_bytes) {
1018        Ok(v) => v,
1019        Err(_) => return (HashMap::new(), HashMap::new(), HashMap::new()),
1020    };
1021    let mut all_map: HashMap<String, HashSet<String>> = HashMap::new();
1022    let mut field_map: HashMap<String, HashMap<String, HashSet<String>>> = HashMap::new();
1023    let mut nonfield_map: HashMap<String, HashSet<String>> = HashMap::new();
1024    for entry in &node_types {
1025        if !entry.named {
1026            continue;
1027        }
1028        let mut child_kinds = HashSet::new();
1029        for (field_name, field_value) in &entry.fields {
1030            if let Some(types) = field_value.get("types").and_then(|t| t.as_array()) {
1031                for t in types {
1032                    if let (Some(name), Some(true)) = (
1033                        t.get("type").and_then(|n| n.as_str()),
1034                        t.get("named").and_then(serde_json::Value::as_bool),
1035                    ) {
1036                        child_kinds.insert(name.to_owned());
1037                        field_map
1038                            .entry(entry.node_type.clone())
1039                            .or_default()
1040                            .entry(field_name.clone())
1041                            .or_default()
1042                            .insert(name.to_owned());
1043                    }
1044                }
1045            }
1046        }
1047        if let Some(ref children) = entry.children {
1048            for t in &children.types {
1049                if t.named {
1050                    child_kinds.insert(t.node_type.clone());
1051                    nonfield_map
1052                        .entry(entry.node_type.clone())
1053                        .or_default()
1054                        .insert(t.node_type.clone());
1055                }
1056            }
1057        }
1058        if !child_kinds.is_empty() {
1059            all_map.insert(entry.node_type.clone(), child_kinds);
1060        }
1061    }
1062    (all_map, field_map, nonfield_map)
1063}
1064
1065/// Augment `grammar.subtypes` with child-kind data from node-types.json.
1066///
1067/// Uses per-field structural matching: for each parent kind P, each field
1068/// F in P's node-types.json entry, and each child kind C in field F's
1069/// types, find the SYMBOL S referenced at field F's position in P's
1070/// grammar rule. If C lacks a grammar rule and does not already satisfy S,
1071/// record C ⊑ S. Non-field children are matched against non-FIELD symbols
1072/// in the rule body.
1073fn augment_subtypes_from_node_types(grammar: &mut Grammar) {
1074    use std::collections::HashMap;
1075
1076    // Build per-field child-kind map from node-types.json by re-parsing.
1077    let mut pairs: Vec<(String, String)> = Vec::new();
1078    for parent_kind in grammar.node_type_children.keys() {
1079        let Some(rule) = grammar.rules.get(parent_kind) else {
1080            continue;
1081        };
1082
1083        // Collect symbols from the grammar rule, partitioned by the
1084        // FIELD they appear in (or non-field for top-level symbols).
1085        let mut field_symbols: HashMap<String, Vec<String>> = HashMap::new();
1086        let mut non_field_symbols: Vec<String> = Vec::new();
1087        collect_field_symbols(rule, &mut field_symbols, &mut non_field_symbols, false);
1088
1089        // Per-field augmentation: for each FIELD F in the grammar rule,
1090        // match child kinds that node-types.json says appear in field F
1091        // against the symbols at field F's position.
1092        if let Some(nt_fields) = grammar.node_type_field_children.get(parent_kind) {
1093            for (field_name, nt_child_kinds) in nt_fields {
1094                let Some(rule_syms) = field_symbols.get(field_name) else {
1095                    continue;
1096                };
1097                for child_kind in nt_child_kinds {
1098                    if grammar.rules.contains_key(child_kind) {
1099                        continue;
1100                    }
1101                    for sym_name in rule_syms {
1102                        if !kind_satisfies_symbol(grammar, Some(child_kind), sym_name) {
1103                            pairs.push((child_kind.clone(), sym_name.clone()));
1104                        }
1105                    }
1106                }
1107            }
1108        }
1109
1110        // Non-field augmentation: for child kinds from `children.types`
1111        // (no field), match against non-FIELD symbols in the rule.
1112        if let Some(nt_nonfield) = grammar.node_type_nonfield_children.get(parent_kind) {
1113            for child_kind in nt_nonfield {
1114                if grammar.rules.contains_key(child_kind) {
1115                    continue;
1116                }
1117                for sym_name in &non_field_symbols {
1118                    if !kind_satisfies_symbol(grammar, Some(child_kind), sym_name) {
1119                        pairs.push((child_kind.clone(), sym_name.clone()));
1120                    }
1121                }
1122            }
1123        }
1124    }
1125    for (child_kind, sym_name) in pairs {
1126        grammar
1127            .subtypes
1128            .entry(child_kind)
1129            .or_default()
1130            .insert(sym_name);
1131    }
1132}
1133
1134/// Walk a production and collect referenced symbols, separating those
1135/// inside FIELD bodies (keyed by field name) from those outside any FIELD.
1136fn collect_field_symbols(
1137    prod: &Production,
1138    field_map: &mut std::collections::HashMap<String, Vec<String>>,
1139    non_field: &mut Vec<String>,
1140    inside_field: bool,
1141) {
1142    match prod {
1143        Production::Symbol { name } if !inside_field => {
1144            non_field.push(name.clone());
1145        }
1146        Production::Field { name, content } => {
1147            let mut syms = Vec::new();
1148            collect_symbols_flat(content, &mut syms);
1149            field_map.entry(name.clone()).or_default().extend(syms);
1150        }
1151        Production::Choice { members } | Production::Seq { members } => {
1152            for m in members {
1153                collect_field_symbols(m, field_map, non_field, inside_field);
1154            }
1155        }
1156        Production::Repeat { content }
1157        | Production::Repeat1 { content }
1158        | Production::Optional { content }
1159        | Production::Alias { content, .. }
1160        | Production::Token { content }
1161        | Production::ImmediateToken { content }
1162        | Production::Prec { content, .. }
1163        | Production::PrecLeft { content, .. }
1164        | Production::PrecRight { content, .. }
1165        | Production::PrecDynamic { content, .. }
1166        | Production::Reserved { content, .. } => {
1167            collect_field_symbols(content, field_map, non_field, inside_field);
1168        }
1169        _ => {}
1170    }
1171}
1172
1173fn collect_symbols_flat(prod: &Production, out: &mut Vec<String>) {
1174    match prod {
1175        Production::Symbol { name } => out.push(name.clone()),
1176        Production::Choice { members } | Production::Seq { members } => {
1177            for m in members {
1178                collect_symbols_flat(m, out);
1179            }
1180        }
1181        Production::Repeat { content }
1182        | Production::Repeat1 { content }
1183        | Production::Optional { content }
1184        | Production::Alias { content, .. }
1185        | Production::Field { content, .. }
1186        | Production::Token { content }
1187        | Production::ImmediateToken { content }
1188        | Production::Prec { content, .. }
1189        | Production::PrecLeft { content, .. }
1190        | Production::PrecRight { content, .. }
1191        | Production::PrecDynamic { content, .. }
1192        | Production::Reserved { content, .. } => collect_symbols_flat(content, out),
1193        _ => {}
1194    }
1195}
1196
1197/// Build a map from external scanner symbol names to their anonymous
1198/// ALIAS values by walking every rule body in the grammar.
1199fn build_external_alias_map(grammar: &Grammar) -> std::collections::HashMap<String, String> {
1200    let mut map = std::collections::HashMap::new();
1201    fn walk(
1202        grammar: &Grammar,
1203        prod: &Production,
1204        map: &mut std::collections::HashMap<String, String>,
1205    ) {
1206        match prod {
1207            Production::Alias {
1208                content,
1209                named,
1210                value,
1211            } => {
1212                if !*named && !value.is_empty() {
1213                    if let Production::Symbol { name } = content.as_ref() {
1214                        if name.starts_with('_') && !grammar.rules.contains_key(name) {
1215                            map.entry(name.clone()).or_insert_with(|| value.clone());
1216                        }
1217                    }
1218                }
1219                walk(grammar, content, map);
1220            }
1221            Production::Choice { members } | Production::Seq { members } => {
1222                for m in members {
1223                    walk(grammar, m, map);
1224                }
1225            }
1226            Production::Repeat { content }
1227            | Production::Repeat1 { content }
1228            | Production::Optional { content }
1229            | Production::Field { content, .. }
1230            | Production::Token { content }
1231            | Production::ImmediateToken { content }
1232            | Production::Prec { content, .. }
1233            | Production::PrecLeft { content, .. }
1234            | Production::PrecRight { content, .. }
1235            | Production::PrecDynamic { content, .. }
1236            | Production::Reserved { content, .. } => walk(grammar, content, map),
1237            _ => {}
1238        }
1239    }
1240    for rule in grammar.rules.values() {
1241        walk(grammar, rule, &mut map);
1242    }
1243    map
1244}
1245
1246/// Build a map from named-alias values to their source symbol names.
1247/// When tree-sitter emits a vertex with kind `V` via
1248/// `alias($.source, $.V)`, the grammar has no rule keyed by `V`.
1249/// This map lets the emitter resolve `V → source` and walk the source
1250/// rule with proper token roles and bracket pairs.
1251fn build_named_alias_map(grammar: &Grammar) -> std::collections::HashMap<String, String> {
1252    let mut map = std::collections::HashMap::new();
1253    fn walk(prod: &Production, map: &mut std::collections::HashMap<String, String>) {
1254        match prod {
1255            Production::Alias {
1256                content,
1257                named,
1258                value,
1259            } => {
1260                if *named && !value.is_empty() {
1261                    if let Production::Symbol { name } = content.as_ref() {
1262                        map.entry(value.clone()).or_insert_with(|| name.clone());
1263                    }
1264                }
1265                walk(content, map);
1266            }
1267            Production::Choice { members } | Production::Seq { members } => {
1268                for m in members {
1269                    walk(m, map);
1270                }
1271            }
1272            Production::Repeat { content }
1273            | Production::Repeat1 { content }
1274            | Production::Optional { content }
1275            | Production::Field { content, .. }
1276            | Production::Token { content }
1277            | Production::ImmediateToken { content }
1278            | Production::Prec { content, .. }
1279            | Production::PrecLeft { content, .. }
1280            | Production::PrecRight { content, .. }
1281            | Production::PrecDynamic { content, .. }
1282            | Production::Reserved { content, .. } => walk(content, map),
1283            _ => {}
1284        }
1285    }
1286    for rule in grammar.rules.values() {
1287        walk(rule, &mut map);
1288    }
1289    map
1290}
1291
1292/// Compute token roles for every STRING value in every grammar rule.
1293///
1294/// For each rule R, analyzes the production body to classify every
1295/// STRING token by its structural role (bracket-open, bracket-close,
1296/// separator, keyword, operator). Also identifies which bracket-open
1297/// tokens trigger indentation (those with REPEAT/REPEAT1 between
1298/// the open and close).
1299///
1300/// Bracket pairs are detected per-SEQ, not from a fixed character
1301/// set. Two STRINGs are a matched pair iff they are the first and
1302/// last STRING-typed members of the same SEQ with at least one
1303/// non-STRING member between them and open != close.
1304type RoleMap = std::collections::HashMap<String, std::collections::HashMap<String, TokenRole>>;
1305type IndentSet = std::collections::HashSet<(String, String)>;
1306
1307fn compute_token_roles(grammar: &Grammar) -> (RoleMap, IndentSet) {
1308    use std::collections::{HashMap, HashSet};
1309    let mut all_roles: HashMap<String, HashMap<String, TokenRole>> = HashMap::new();
1310    let mut indent_triggers: HashSet<(String, String)> = HashSet::new();
1311
1312    for (rule_name, rule) in &grammar.rules {
1313        let mut roles: HashMap<String, TokenRole> = HashMap::new();
1314        classify_production(rule, &mut roles, &mut indent_triggers, rule_name);
1315        if !roles.is_empty() {
1316            all_roles.insert(rule_name.clone(), roles);
1317        }
1318    }
1319
1320    (all_roles, indent_triggers)
1321}
1322
1323/// Recursively classify STRING tokens in a production body.
1324fn classify_production(
1325    prod: &Production,
1326    roles: &mut std::collections::HashMap<String, TokenRole>,
1327    indent_triggers: &mut std::collections::HashSet<(String, String)>,
1328    rule_name: &str,
1329) {
1330    match prod {
1331        Production::Seq { members } => {
1332            classify_seq(members, roles, indent_triggers, rule_name, false);
1333        }
1334        Production::Choice { members } => {
1335            for m in members {
1336                // CHOICE alternatives' SEQs get in_choice=true so that
1337                // position-0 STRINGs are classified as Operators (not
1338                // prefix sigils). E.g. `=` in `CHOICE [SEQ ["=", ...]]`
1339                // is an operator, not a prefix.
1340                match m {
1341                    Production::Seq {
1342                        members: seq_members,
1343                    } => {
1344                        classify_seq(seq_members, roles, indent_triggers, rule_name, true);
1345                    }
1346                    _ => classify_production(m, roles, indent_triggers, rule_name),
1347                }
1348            }
1349        }
1350        Production::Repeat { content } | Production::Repeat1 { content } => {
1351            classify_repeat_body(content, roles, indent_triggers, rule_name);
1352        }
1353        Production::Optional { content }
1354        | Production::Field { content, .. }
1355        | Production::Token { content }
1356        | Production::ImmediateToken { content }
1357        | Production::Prec { content, .. }
1358        | Production::PrecLeft { content, .. }
1359        | Production::PrecRight { content, .. }
1360        | Production::PrecDynamic { content, .. }
1361        | Production::Reserved { content, .. } => {
1362            classify_production(content, roles, indent_triggers, rule_name);
1363        }
1364        Production::Alias { content, .. } => {
1365            classify_production(content, roles, indent_triggers, rule_name);
1366        }
1367        _ => {}
1368    }
1369}
1370
1371/// Classify STRING tokens within a SEQ. This is where bracket pairs
1372/// are detected and roles assigned.
1373fn classify_seq(
1374    members: &[Production],
1375    roles: &mut std::collections::HashMap<String, TokenRole>,
1376    indent_triggers: &mut std::collections::HashSet<(String, String)>,
1377    rule_name: &str,
1378    in_choice: bool,
1379) {
1380    let string_positions: Vec<(usize, &str)> = members
1381        .iter()
1382        .enumerate()
1383        .filter_map(|(i, m)| unwrap_to_string(m).map(|s| (i, s)))
1384        .collect();
1385
1386    let content_count = members
1387        .iter()
1388        .filter(|m| unwrap_to_string(m).is_none())
1389        .count();
1390
1391    if string_positions.len() >= 2 {
1392        let (first_idx, first_val) = string_positions[0];
1393        let (last_idx, last_val) = string_positions[string_positions.len() - 1];
1394
1395        let has_content_between = members[first_idx + 1..last_idx]
1396            .iter()
1397            .any(|m| unwrap_to_string(m).is_none());
1398
1399        let both_punct = !is_word_like(first_val) && !is_word_like(last_val);
1400        let both_word = is_word_like(first_val) && is_word_like(last_val);
1401        if has_content_between && first_val != last_val && (both_punct || both_word) {
1402            roles.insert(first_val.to_owned(), TokenRole::BracketOpen);
1403            roles.insert(last_val.to_owned(), TokenRole::BracketClose);
1404
1405            let between = &members[first_idx + 1..last_idx];
1406            if first_val == "{" && has_repeat_recursive(between) {
1407                indent_triggers.insert((rule_name.to_owned(), first_val.to_owned()));
1408            }
1409        }
1410    }
1411
1412    // Classify remaining STRINGs by structural position.
1413    let first_content_idx = members.iter().position(|m| unwrap_to_string(m).is_none());
1414    let last_content_idx = members.iter().rposition(|m| unwrap_to_string(m).is_none());
1415
1416    for (i, m) in members.iter().enumerate() {
1417        if let Some(value) = unwrap_to_string(m) {
1418            let value = value.to_owned();
1419            if !roles.contains_key(&value) {
1420                if is_word_like(&value) {
1421                    roles.insert(value.clone(), TokenRole::Keyword);
1422                } else if !in_choice
1423                    && first_content_idx.is_some_and(|fc| i < fc)
1424                    && is_prefix_sigil(&value)
1425                {
1426                    roles.insert(value.clone(), TokenRole::BracketOpen);
1427                } else if last_content_idx.is_some_and(|lc| i > lc) {
1428                    // STRING after all content: suffix (tight before).
1429                    // Unlike prefix, this applies in CHOICE branches too
1430                    // (e.g. `()` in bash function_definition's CHOICE).
1431                    roles.insert(value.clone(), TokenRole::BracketClose);
1432                } else if !in_choice
1433                    && string_positions.len() == 1
1434                    && content_count == 2
1435                    && value.len() == 1
1436                {
1437                    // Single-character STRING between exactly two content
1438                    // members in a non-CHOICE SEQ: this is a connector
1439                    // (e.g. `.` in `SEQ [object, ".", attr]`).
1440                    // Multi-character tokens like `:=`, `<-`, `->` are
1441                    // operators (spaced), not connectors.
1442                    roles.insert(value.clone(), TokenRole::Connector);
1443                } else {
1444                    roles.insert(value.clone(), TokenRole::Operator);
1445                }
1446            }
1447        }
1448    }
1449
1450    for m in members {
1451        if unwrap_to_string(m).is_none() {
1452            classify_production(m, roles, indent_triggers, rule_name);
1453        }
1454    }
1455}
1456
1457/// Classify STRING tokens in a REPEAT body. The first STRING in a
1458/// REPEAT body's inner SEQ is a separator (e.g. `,` in
1459/// `REPEAT(SEQ [",", item])`).
1460fn classify_repeat_body(
1461    content: &Production,
1462    roles: &mut std::collections::HashMap<String, TokenRole>,
1463    indent_triggers: &mut std::collections::HashSet<(String, String)>,
1464    rule_name: &str,
1465) {
1466    match content {
1467        Production::Seq { members } => {
1468            if let Some(Production::String { value }) = members.first() {
1469                roles.insert(value.clone(), TokenRole::Separator);
1470            }
1471            classify_seq(members, roles, indent_triggers, rule_name, false);
1472        }
1473        _ => classify_production(content, roles, indent_triggers, rule_name),
1474    }
1475}
1476
1477/// Classify STRING tokens within a SEQ by structural position, returning
1478/// a role for each member position. Non-STRING positions get `None`.
1479/// This is the inline variant of `classify_seq` used at emission time
1480/// to avoid the flat per-rule map's conflation of same-text tokens.
1481fn classify_seq_positions(members: &[Production], in_choice: bool) -> Vec<Option<TokenRole>> {
1482    let mut roles: Vec<Option<TokenRole>> = vec![None; members.len()];
1483
1484    let string_positions: Vec<(usize, &str)> = members
1485        .iter()
1486        .enumerate()
1487        .filter_map(|(i, m)| unwrap_to_string(m).map(|s| (i, s)))
1488        .collect();
1489
1490    let content_count = members
1491        .iter()
1492        .filter(|m| unwrap_to_string(m).is_none())
1493        .count();
1494
1495    // Bracket pair detection: first and last STRING with content between.
1496    let mut bracket_open_idx: Option<usize> = None;
1497    let mut bracket_close_idx: Option<usize> = None;
1498    if string_positions.len() >= 2 {
1499        let (first_idx, first_val) = string_positions[0];
1500        let (last_idx, last_val) = string_positions[string_positions.len() - 1];
1501
1502        let has_content_between = members[first_idx + 1..last_idx]
1503            .iter()
1504            .any(|m| unwrap_to_string(m).is_none());
1505
1506        let both_punct = !is_word_like(first_val) && !is_word_like(last_val);
1507        let both_word = is_word_like(first_val) && is_word_like(last_val);
1508        // Same-text delimiters (e.g. regex `/.../`) are a bracket pair
1509        // when at least one side is IMMEDIATE_TOKEN — the grammar's
1510        // structural signal that the delimiter must be tight against
1511        // the content.
1512        let either_immediate =
1513            is_immediate_token(&members[first_idx]) || is_immediate_token(&members[last_idx]);
1514        let same_text_immediate = first_val == last_val && either_immediate;
1515        if has_content_between
1516            && (both_punct || both_word)
1517            && (first_val != last_val || same_text_immediate)
1518        {
1519            roles[first_idx] = Some(TokenRole::BracketOpen);
1520            roles[last_idx] = Some(TokenRole::BracketClose);
1521            bracket_open_idx = Some(first_idx);
1522            bracket_close_idx = Some(last_idx);
1523        }
1524    }
1525
1526    let first_content_idx = members.iter().position(|m| unwrap_to_string(m).is_none());
1527    let last_content_idx = members.iter().rposition(|m| unwrap_to_string(m).is_none());
1528
1529    for (i, m) in members.iter().enumerate() {
1530        if roles[i].is_some() {
1531            continue;
1532        }
1533        if let Some(value) = unwrap_to_string(m) {
1534            roles[i] = Some(if is_word_like(value) {
1535                TokenRole::Keyword
1536            } else if !in_choice && first_content_idx.is_some_and(|fc| i < fc) {
1537                if is_prefix_sigil(value) {
1538                    TokenRole::BracketOpen
1539                } else {
1540                    TokenRole::Operator
1541                }
1542            } else if last_content_idx.is_some_and(|lc| i > lc) {
1543                TokenRole::BracketClose
1544            } else if !in_choice
1545                && string_positions.len() == 1
1546                && content_count == 2
1547                && value.len() == 1
1548            {
1549                TokenRole::Connector
1550            } else {
1551                TokenRole::Operator
1552            });
1553        }
1554    }
1555
1556    // Override: in a REPEAT body's inner SEQ, the first STRING is a
1557    // separator. This is checked by the caller (REPEAT handler), not here.
1558    // But we do store bracket indices for the caller to use.
1559    let _ = (bracket_open_idx, bracket_close_idx);
1560
1561    roles
1562}
1563
1564/// Check if a SEQ's bracket at position `idx` triggers indentation.
1565#[allow(clippy::branches_sharing_code)]
1566fn seq_bracket_triggers_indent(
1567    members: &[Production],
1568    open_idx: usize,
1569    _grammar: &Grammar,
1570) -> bool {
1571    let string_positions: Vec<(usize, &str)> = members
1572        .iter()
1573        .enumerate()
1574        .filter_map(|(i, m)| unwrap_to_string(m).map(|s| (i, s)))
1575        .collect();
1576    if string_positions.len() < 2 {
1577        return false;
1578    }
1579    let open_val = string_positions.iter().find(|(i, _)| *i == open_idx);
1580    let close_val = string_positions.last();
1581    if let (Some((_, open_text)), Some((close_idx, close_text))) = (open_val, close_val) {
1582        if open_idx >= *close_idx {
1583            return false;
1584        }
1585        // Word-like bracket pairs (function/end, if/end, while/end,
1586        // for/end, module/end, struct/end, begin/end, do/done, etc.)
1587        // always wrap block bodies that need indentation. This is a
1588        // structural invariant: word-like delimiters only appear in
1589        // block constructs across all 261 grammars.
1590        if is_word_like(open_text) && is_word_like(close_text) {
1591            return true;
1592        }
1593        let between = &members[open_idx + 1..*close_idx];
1594        // Only { } bracket pairs trigger indentation from direct
1595        // REPEAT content. Other pairs like ( ), < >, [ ] are
1596        // inline even when they contain REPEAT (comma-separated
1597        // lists, type parameters, function arguments).
1598        if *open_text == "{" && has_repeat_recursive(between) {
1599            return true;
1600        }
1601        // Follow SYMBOL → rule one level for CHOICE[SYMBOL, BLANK]
1602        // patterns where the SYMBOL's rule body has REPEAT.
1603        // Only for { } bracket pairs (block constructs). Other pairs
1604        // like < > (type parameters) are always inline.
1605        if *open_text == "{" {
1606            for m in between {
1607                if let Production::Choice { members: alts } = m {
1608                    let has_blank = alts.iter().any(|a| matches!(a, Production::Blank));
1609                    if has_blank {
1610                        for alt in alts {
1611                            if let Production::Symbol { name } = alt {
1612                                if let Some(rule) = _grammar.rules.get(name) {
1613                                    if has_repeat_in(rule) {
1614                                        return true;
1615                                    }
1616                                }
1617                            }
1618                        }
1619                    }
1620                }
1621            }
1622        }
1623        false
1624    } else {
1625        false
1626    }
1627}
1628
1629/// Check if a production's rule body starts with a bracket pair's open
1630/// `STRING`. Used to suppress `ForceSpace` before call-pattern members
1631/// (e.g. `argument_list` whose rule starts with `(`).
1632fn member_has_leading_bracket(prod: &Production, grammar: &Grammar) -> bool {
1633    match prod {
1634        Production::Symbol { name } => grammar
1635            .rules
1636            .get(name)
1637            .is_some_and(|rule| first_string_of(rule).is_some_and(|s| !is_word_like(s))),
1638        Production::Field { content, .. } => member_has_leading_bracket(content, grammar),
1639        Production::Choice { members } => {
1640            let non_blank: Vec<_> = members
1641                .iter()
1642                .filter(|m| !matches!(m, Production::Blank))
1643                .collect();
1644            !non_blank.is_empty()
1645                && non_blank
1646                    .iter()
1647                    .all(|m| member_has_leading_bracket(m, grammar))
1648        }
1649        Production::Alias { content, .. } => {
1650            if let Production::Symbol { name } = content.as_ref() {
1651                grammar
1652                    .rules
1653                    .get(name)
1654                    .is_some_and(|rule| first_string_of(rule).is_some_and(|s| !is_word_like(s)))
1655            } else {
1656                false
1657            }
1658        }
1659        Production::Prec { content, .. }
1660        | Production::PrecLeft { content, .. }
1661        | Production::PrecRight { content, .. }
1662        | Production::PrecDynamic { content, .. }
1663        | Production::Optional { content } => member_has_leading_bracket(content, grammar),
1664        Production::Repeat { .. } | Production::Repeat1 { .. } => false,
1665        _ => false,
1666    }
1667}
1668
1669fn first_string_of(prod: &Production) -> Option<&str> {
1670    match prod {
1671        Production::String { value } => Some(value.as_str()),
1672        Production::Seq { members } => members.first().and_then(first_string_of),
1673        Production::Prec { content, .. }
1674        | Production::PrecLeft { content, .. }
1675        | Production::PrecRight { content, .. }
1676        | Production::PrecDynamic { content, .. }
1677        | Production::Token { content }
1678        | Production::ImmediateToken { content }
1679        | Production::Field { content, .. } => first_string_of(content),
1680        _ => None,
1681    }
1682}
1683
1684/// Check if any member of a slice contains REPEAT/REPEAT1 recursively.
1685fn has_repeat_recursive(members: &[Production]) -> bool {
1686    members.iter().any(has_repeat_in)
1687}
1688
1689fn has_repeat_in(prod: &Production) -> bool {
1690    match prod {
1691        Production::Repeat { .. } | Production::Repeat1 { .. } => true,
1692        Production::Choice { members } | Production::Seq { members } => {
1693            members.iter().any(has_repeat_in)
1694        }
1695        Production::Prec { content, .. }
1696        | Production::PrecLeft { content, .. }
1697        | Production::PrecRight { content, .. }
1698        | Production::PrecDynamic { content, .. }
1699        | Production::Optional { content }
1700        | Production::Field { content, .. }
1701        | Production::Token { content }
1702        | Production::ImmediateToken { content }
1703        | Production::Reserved { content, .. }
1704        | Production::Alias { content, .. } => has_repeat_in(content),
1705        _ => false,
1706    }
1707}
1708
1709/// Check if a string value is word-like (alphanumeric/underscore).
1710fn is_word_like(s: &str) -> bool {
1711    !s.is_empty()
1712        && s.chars().all(|c| c.is_alphanumeric() || c == '_')
1713        && s.starts_with(|c: char| c.is_alphabetic() || c == '_')
1714}
1715
1716/// A prefix `STRING` (position before all content in a non-`CHOICE` SEQ) is a
1717/// tight sigil (`BracketOpen`) only when it is NOT a common binary/assignment
1718/// operator. Single-character ASCII operators like `=`, `+`, `-` need space;
1719/// multi-character prefixes (`...`, `::`, `@`, `#`, `$`) and non-ASCII
1720/// prefixes are tight.
1721fn is_prefix_sigil(s: &str) -> bool {
1722    if s.len() == 1 {
1723        let c = s.as_bytes()[0];
1724        !matches!(
1725            c,
1726            b'=' | b'+'
1727                | b'-'
1728                | b'*'
1729                | b'/'
1730                | b'<'
1731                | b'>'
1732                | b'!'
1733                | b'?'
1734                | b'|'
1735                | b'&'
1736                | b'^'
1737                | b'%'
1738                | b'~'
1739        )
1740    } else {
1741        true
1742    }
1743}
1744
1745/// Unwrap wrapper productions (`Token`, `ImmediateToken`, `Prec`, `PrecLeft`,
1746/// `PrecRight`, `PrecDynamic`, `Field`, `Reserved`) to find the inner `STRING`
1747/// value. Returns `None` if the production is not a (possibly wrapped)
1748/// `STRING`.
1749fn is_immediate_token(prod: &Production) -> bool {
1750    match prod {
1751        Production::ImmediateToken { .. } => true,
1752        Production::Prec { content, .. }
1753        | Production::PrecLeft { content, .. }
1754        | Production::PrecRight { content, .. }
1755        | Production::PrecDynamic { content, .. }
1756        | Production::Token { content }
1757        | Production::Field { content, .. }
1758        | Production::Reserved { content, .. } => is_immediate_token(content),
1759        _ => false,
1760    }
1761}
1762
1763fn unwrap_to_string(prod: &Production) -> Option<&str> {
1764    match prod {
1765        Production::String { value } => Some(value.as_str()),
1766        Production::Token { content }
1767        | Production::ImmediateToken { content }
1768        | Production::Prec { content, .. }
1769        | Production::PrecLeft { content, .. }
1770        | Production::PrecRight { content, .. }
1771        | Production::PrecDynamic { content, .. }
1772        | Production::Field { content, .. }
1773        | Production::Reserved { content, .. } => unwrap_to_string(content),
1774        _ => None,
1775    }
1776}
1777
1778/// Extract line-comment prefixes from the grammar's extras rules.
1779///
1780/// A line comment is identified by: the rule name is in
1781/// `grammar.extras` AND the rule body structurally matches
1782/// `TOKEN(SEQ [STRING prefix, PATTERN ...])` where the PATTERN
1783/// matches to end-of-line.
1784fn extract_line_comment_prefixes(grammar: &Grammar) -> Vec<String> {
1785    let mut prefixes = Vec::new();
1786    for extra_name in &grammar.extras {
1787        if let Some(rule) = grammar.rules.get(extra_name) {
1788            if let Some(prefix) = extract_line_comment_prefix(rule) {
1789                prefixes.push(prefix);
1790            }
1791        }
1792    }
1793    prefixes
1794}
1795
1796fn extract_line_comment_prefix(prod: &Production) -> Option<String> {
1797    match prod {
1798        Production::Token { content } | Production::ImmediateToken { content } => {
1799            extract_line_comment_prefix(content)
1800        }
1801        Production::Seq { members } if members.len() >= 2 => {
1802            if let Production::String { value } = &members[0] {
1803                if members[1..].iter().any(|m| {
1804                    matches!(m, Production::Pattern { value } if value.contains(".*") || value.contains("[^\\n]*") || value.contains("[^\\r\\n]*"))
1805                }) {
1806                    return Some(value.clone());
1807                }
1808            }
1809            None
1810        }
1811        Production::Choice { members } => members.iter().find_map(extract_line_comment_prefix),
1812        _ => None,
1813    }
1814}
1815
1816// ═══════════════════════════════════════════════════════════════════
1817// Format policy
1818/// Classify external scanner tokens as layout actions based on
1819/// tree-sitter naming conventions. These conventions are ecosystem-
1820/// wide, not language-specific: every indent-based grammar uses
1821/// `_indent`/`_dedent`, every ASI grammar uses `_automatic_semicolon`.
1822fn classify_external_layout_tokens(grammar: &mut Grammar) {
1823    // External tokens have no grammar rule. We identify them by
1824    // checking which hidden symbols are NOT in grammar.rules.
1825    // Then classify by naming convention.
1826    //
1827    // This runs after external_alias_map is built, so tokens with
1828    // known text are already handled. Layout tokens are the remainder.
1829    let all_hidden_refs = collect_all_symbol_refs(&grammar.rules);
1830    for name in &all_hidden_refs {
1831        if !name.starts_with('_') || grammar.rules.contains_key(name) {
1832            continue;
1833        }
1834        if grammar.external_alias_map.contains_key(name) {
1835            continue;
1836        }
1837        if name == "_indent" || name.ends_with("_indent") {
1838            grammar.external_indent_opens.insert(name.clone());
1839        } else if name == "_dedent" || name.ends_with("_dedent") {
1840            grammar.external_indent_closes.insert(name.clone());
1841        } else if name.contains("line_ending")
1842            || name.contains("newline")
1843            || name.ends_with("_or_eof")
1844        {
1845            grammar.external_newlines.insert(name.clone());
1846        } else if name.contains("semicolon") {
1847            grammar.external_semicolons.insert(name.clone());
1848        }
1849    }
1850}
1851
1852/// Collect all SYMBOL names referenced anywhere in the grammar rules.
1853fn collect_all_symbol_refs(
1854    rules: &BTreeMap<String, Production>,
1855) -> std::collections::HashSet<String> {
1856    let mut refs = std::collections::HashSet::new();
1857    fn walk(prod: &Production, refs: &mut std::collections::HashSet<String>) {
1858        match prod {
1859            Production::Symbol { name } => {
1860                refs.insert(name.clone());
1861            }
1862            Production::Seq { members } | Production::Choice { members } => {
1863                for m in members {
1864                    walk(m, refs);
1865                }
1866            }
1867            Production::Alias { content, .. }
1868            | Production::Repeat { content }
1869            | Production::Repeat1 { content }
1870            | Production::Optional { content }
1871            | Production::Field { content, .. }
1872            | Production::Token { content }
1873            | Production::ImmediateToken { content }
1874            | Production::Prec { content, .. }
1875            | Production::PrecLeft { content, .. }
1876            | Production::PrecRight { content, .. }
1877            | Production::PrecDynamic { content, .. }
1878            | Production::Reserved { content, .. } => walk(content, refs),
1879            _ => {}
1880        }
1881    }
1882    for rule in rules.values() {
1883        walk(rule, &mut refs);
1884    }
1885    refs
1886}
1887
1888// ═══════════════════════════════════════════════════════════════════
1889
1890/// Whitespace and indentation policy applied during emission.
1891///
1892/// The default policy inserts a single space between adjacent tokens,
1893/// a newline after `;` / `}` / `{`, and tracks indent on `{` / `}`
1894/// boundaries. Per-language overrides (idiomatic indent width,
1895/// trailing-comma rules, blank-line conventions) can ride alongside
1896/// this struct in a follow-up branch; today's defaults aim only for
1897/// syntactic validity.
1898#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
1899pub struct FormatPolicy {
1900    /// Number of spaces per indent level.
1901    pub indent_width: usize,
1902    /// Separator inserted between adjacent terminals that the lexer
1903    /// would otherwise glue together (word ↔ word, operator ↔ operator).
1904    /// Default is a single space.
1905    pub separator: String,
1906    /// Newline byte sequence emitted after `line_break_after` tokens
1907    /// and at end-of-output. Default is `"\n"`.
1908    pub newline: String,
1909    /// Tokens after which the walker breaks to a new line.
1910    pub line_break_after: Vec<String>,
1911    /// Tokens that increase indent on emission.
1912    pub indent_open: Vec<String>,
1913    /// Tokens that decrease indent on emission.
1914    pub indent_close: Vec<String>,
1915}
1916
1917impl Default for FormatPolicy {
1918    fn default() -> Self {
1919        Self {
1920            indent_width: 2,
1921            separator: " ".to_owned(),
1922            newline: "\n".to_owned(),
1923            line_break_after: vec![";".into(), "{".into(), "}".into()],
1924            indent_open: vec!["{".into()],
1925            indent_close: vec!["}".into()],
1926        }
1927    }
1928}
1929
1930// ═══════════════════════════════════════════════════════════════════
1931// Emitter
1932// ═══════════════════════════════════════════════════════════════════
1933
1934/// Emit a by-construction schema to source bytes.
1935///
1936/// `protocol` is the grammar / language name (used in error messages
1937/// and to label the entry point).
1938///
1939/// The walker treats `schema.entries` as the ordered list of root
1940/// vertices, falling back to a deterministic by-id ordering when
1941/// `entries` is empty. Each root is emitted using the production
1942/// associated with its kind in `grammar.rules`.
1943///
1944/// # Errors
1945///
1946/// Returns [`ParseError::EmitFailed`] when:
1947///
1948/// - the schema has no vertices
1949/// - a root vertex's kind is not a grammar rule
1950/// - a `SYMBOL` reference points at a kind with no rule and no schema
1951///   child to resolve it to
1952/// - a required `FIELD` has no corresponding edge in the schema
1953pub fn emit_pretty(
1954    protocol: &str,
1955    schema: &Schema,
1956    grammar: &Grammar,
1957    policy: &FormatPolicy,
1958    cassette: Option<&dyn crate::languages::cassettes::GrammarCassette>,
1959) -> Result<Vec<u8>, ParseError> {
1960    let roots = collect_roots(schema);
1961    if roots.is_empty() {
1962        return Err(ParseError::EmitFailed {
1963            protocol: protocol.to_owned(),
1964            reason: "schema has no entry vertices".to_owned(),
1965        });
1966    }
1967
1968    let mut out = Output::new(policy, grammar, cassette);
1969    for (i, root) in roots.iter().enumerate() {
1970        if i > 0 {
1971            out.newline();
1972        }
1973        emit_vertex(protocol, schema, grammar, root, &mut out)?;
1974    }
1975    Ok(out.finish())
1976}
1977
1978fn collect_roots(schema: &Schema) -> Vec<&panproto_gat::Name> {
1979    if !schema.entries.is_empty() {
1980        return schema
1981            .entries
1982            .iter()
1983            .filter(|name| schema.vertices.contains_key(*name))
1984            .collect();
1985    }
1986
1987    // Fallback: every vertex that is not the target of any structural edge
1988    // (sorted by id for determinism).
1989    let mut targets: std::collections::HashSet<&panproto_gat::Name> =
1990        std::collections::HashSet::new();
1991    for edge in schema.edges.keys() {
1992        targets.insert(&edge.tgt);
1993    }
1994    let mut roots: Vec<&panproto_gat::Name> = schema
1995        .vertices
1996        .keys()
1997        .filter(|name| !targets.contains(name))
1998        .collect();
1999    roots.sort();
2000    roots
2001}
2002
2003fn emit_vertex(
2004    protocol: &str,
2005    schema: &Schema,
2006    grammar: &Grammar,
2007    vertex_id: &panproto_gat::Name,
2008    out: &mut Output<'_>,
2009) -> Result<(), ParseError> {
2010    let vertex = schema
2011        .vertices
2012        .get(vertex_id)
2013        .ok_or_else(|| ParseError::EmitFailed {
2014            protocol: protocol.to_owned(),
2015            reason: format!("vertex '{vertex_id}' not found"),
2016        })?;
2017
2018    // IMMEDIATE_TOKEN at the rule head: emit a tightness marker
2019    // before any content the leaf shortcut or rule-body walk produces.
2020    // This is the unique structural site where IMMEDIATE_TOKEN's "no
2021    // preceding whitespace" property attaches; downstream layout reads
2022    // the NoSpace marker without re-inspecting the production tree.
2023    let kind_head = vertex.kind.as_ref();
2024    if let Some(rule) = grammar.rules.get(kind_head) {
2025        if is_immediate_token(rule) {
2026            out.no_space();
2027        }
2028    }
2029
2030    // Leaf shortcut: a vertex carrying a `literal-value` constraint
2031    // and no outgoing structural edges is a terminal token. Emit the
2032    // captured value directly. This handles identifiers, numeric
2033    // literals, and string literals that the parser stored as
2034    // `literal-value` even on by-construction schemas.
2035    if let Some(literal) = literal_value(schema, vertex_id) {
2036        if children_for(schema, vertex_id).is_empty() {
2037            // Skip leaf shortcut for bracket-pair literals like "()"
2038            // when the vertex has an alias-resolved rule. The rule-based
2039            // path correctly emits them as separate BracketOpen/Close
2040            // tokens with proper spacing.
2041            let is_bracket_pair = literal.len() >= 2
2042                && matches!(
2043                    (literal.as_bytes().first(), literal.as_bytes().last()),
2044                    (Some(b'('), Some(b')')) | (Some(b'['), Some(b']')) | (Some(b'{'), Some(b'}'))
2045                );
2046            let vkind = vertex.kind.as_ref();
2047            let has_alias_rule = grammar
2048                .named_alias_map
2049                .get(vkind)
2050                .is_some_and(|src| grammar.rules.contains_key(src));
2051            if !(is_bracket_pair && has_alias_rule) {
2052                out.token_with_role(literal, Some(TokenRole::Terminal));
2053                return Ok(());
2054            }
2055        }
2056    }
2057
2058    let kind = vertex.kind.as_ref();
2059    let edges = children_for(schema, vertex_id);
2060    if let Some(rule) = grammar.rules.get(kind) {
2061        let old_rule = out.current_rule.take();
2062        out.current_rule = Some(kind.to_owned());
2063        let mut cursor = ChildCursor::new(&edges);
2064        emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out)?;
2065        drain_extras(protocol, schema, grammar, &mut cursor, out)?;
2066        out.current_rule = old_rule;
2067        return Ok(());
2068    }
2069
2070    // Named alias resolution: if the vertex kind was produced by
2071    // `alias($.source, $.kind)`, look up the source rule and walk
2072    // it. This preserves bracket pairs, separators, and token roles
2073    // that the source rule defines.
2074    if let Some(source_name) = grammar.named_alias_map.get(kind) {
2075        if let Some(rule) = grammar.rules.get(source_name) {
2076            let old_rule = out.current_rule.take();
2077            out.current_rule = Some(source_name.to_owned());
2078            let mut cursor = ChildCursor::new(&edges);
2079            emit_production(protocol, schema, grammar, vertex_id, rule, &mut cursor, out)?;
2080            drain_extras(protocol, schema, grammar, &mut cursor, out)?;
2081            out.current_rule = old_rule;
2082            return Ok(());
2083        }
2084    }
2085
2086    // No rule for this kind and no named alias. The parser produced
2087    // it via an external scanner (e.g. YAML's `document` root).
2088    // Fall back to walking the children directly.
2089    for edge in &edges {
2090        emit_vertex(protocol, schema, grammar, &edge.tgt, out)?;
2091    }
2092    Ok(())
2093}
2094
2095/// Linear cursor over a vertex's outgoing edges, used to thread
2096/// children through a production rule without double-consuming them.
2097struct ChildCursor<'a> {
2098    edges: &'a [&'a Edge],
2099    consumed: Vec<bool>,
2100}
2101
2102impl<'a> ChildCursor<'a> {
2103    fn new(edges: &'a [&'a Edge]) -> Self {
2104        Self {
2105            edges,
2106            consumed: vec![false; edges.len()],
2107        }
2108    }
2109
2110    /// Take the next unconsumed edge whose kind equals `field_name`.
2111    fn take_field(&mut self, field_name: &str) -> Option<&'a Edge> {
2112        for (i, edge) in self.edges.iter().enumerate() {
2113            if !self.consumed[i] && edge.kind.as_ref() == field_name {
2114                self.consumed[i] = true;
2115                return Some(edge);
2116            }
2117        }
2118        None
2119    }
2120
2121    /// Whether any unconsumed edge satisfies `predicate`. Used by the
2122    /// unit tests; the live emit path went through `has_matching` on
2123    /// each alternative until cursor-driven dispatch was rewritten to
2124    /// pick the first-unconsumed-edge's kind directly.
2125    #[cfg(test)]
2126    fn has_matching(&self, predicate: impl Fn(&Edge) -> bool) -> bool {
2127        self.edges
2128            .iter()
2129            .enumerate()
2130            .any(|(i, edge)| !self.consumed[i] && predicate(edge))
2131    }
2132
2133    /// Take the next unconsumed edge whose target vertex satisfies
2134    /// `predicate`. Returns the edge and the underlying production
2135    /// resolution path is the caller's job.
2136    fn take_matching(&mut self, predicate: impl Fn(&Edge) -> bool) -> Option<&'a Edge> {
2137        for (i, edge) in self.edges.iter().enumerate() {
2138            if !self.consumed[i] && predicate(edge) {
2139                self.consumed[i] = true;
2140                return Some(edge);
2141            }
2142        }
2143        None
2144    }
2145}
2146
2147thread_local! {
2148    static EMIT_DEPTH: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
2149    /// Set of `(vertex_id, rule_name)` pairs that are currently being
2150    /// walked by the recursion. A SYMBOL that resolves to a rule
2151    /// already on this stack closes a μ-binder cycle: in the
2152    /// coinductive reading, the rule walk at any vertex is the least
2153    /// fixed point of `body[μ X . body / X]`, which unfolds at most
2154    /// once, with the second visit returning the empty sequence (the
2155    /// unit of the free token monoid). Examples that trigger this:
2156    /// YAML's `stream` ⊃ `_b_blk_*` mutually-recursive chain, Rust's
2157    /// `_expression` ⊃ `binary_expression` ⊃ `_expression`.
2158    static EMIT_MU_FRAMES: std::cell::RefCell<std::collections::HashSet<(String, String)>> =
2159        std::cell::RefCell::new(std::collections::HashSet::new());
2160    /// The name of the FIELD whose body the walker is currently inside,
2161    /// or `None` at top level. Lets a SYMBOL nested arbitrarily deep
2162    /// in the field's content (under SEQ, CHOICE, REPEAT, OPTIONAL)
2163    /// consume from the *outer* cursor by edge-kind rather than from
2164    /// the child's own cursor by symbol-match. Without this, shapes
2165    /// like `field('args', commaSep1($.X))` — which expands to
2166    /// `FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X))))` — emit only
2167    /// the first matched edge: the FIELD handler consumed one edge,
2168    /// the inner REPEAT searched the consumed child's cursor (which
2169    /// has no more sibling field edges), and the REPEAT broke after
2170    /// one iteration. Setting the context here so the inner SYMBOL
2171    /// pulls successive field-named edges from the outer cursor
2172    /// recovers every matched edge across arbitrary nesting.
2173    static EMIT_FIELD_CONTEXT: std::cell::RefCell<Option<String>> =
2174        const { std::cell::RefCell::new(None) };
2175}
2176
2177/// RAII guard that restores the prior `EMIT_FIELD_CONTEXT` value on drop.
2178struct FieldContextGuard(Option<String>);
2179
2180impl Drop for FieldContextGuard {
2181    fn drop(&mut self) {
2182        EMIT_FIELD_CONTEXT.with(|f| *f.borrow_mut() = self.0.take());
2183    }
2184}
2185
2186fn push_field_context(name: &str) -> FieldContextGuard {
2187    let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().replace(name.to_owned()));
2188    FieldContextGuard(prev)
2189}
2190
2191/// Clear the field context for the duration of a child-context walk.
2192/// The child's own production has its own FIELDs that set their own
2193/// context; the outer field hint must not leak into them.
2194fn clear_field_context() -> FieldContextGuard {
2195    let prev = EMIT_FIELD_CONTEXT.with(|f| f.borrow_mut().take());
2196    FieldContextGuard(prev)
2197}
2198
2199fn current_field_context() -> Option<String> {
2200    EMIT_FIELD_CONTEXT.with(|f| f.borrow().clone())
2201}
2202
2203/// Walk a rule at a vertex inside a μ-binder. The wrapping frame is
2204/// pushed before recursion and popped after, so any SYMBOL inside
2205/// `rule` that re-enters the same `(vertex_id, rule_name)` pair
2206/// returns the empty sequence (μ X . body unfolds once).
2207fn walk_in_mu_frame(
2208    protocol: &str,
2209    schema: &Schema,
2210    grammar: &Grammar,
2211    vertex_id: &panproto_gat::Name,
2212    rule_name: &str,
2213    rule: &Production,
2214    cursor: &mut ChildCursor<'_>,
2215    out: &mut Output<'_>,
2216) -> Result<(), ParseError> {
2217    let key = (vertex_id.to_string(), rule_name.to_owned());
2218    let inserted = EMIT_MU_FRAMES.with(|frames| frames.borrow_mut().insert(key.clone()));
2219    if !inserted {
2220        // We are already walking this rule at this vertex deeper in
2221        // the call stack. The coinductive μ-fixed-point reading
2222        // returns the empty sequence here; the surrounding
2223        // production resumes after the SYMBOL.
2224        return Ok(());
2225    }
2226    let result = emit_production(protocol, schema, grammar, vertex_id, rule, cursor, out);
2227    EMIT_MU_FRAMES.with(|frames| {
2228        frames.borrow_mut().remove(&key);
2229    });
2230    result
2231}
2232
2233fn emit_production(
2234    protocol: &str,
2235    schema: &Schema,
2236    grammar: &Grammar,
2237    vertex_id: &panproto_gat::Name,
2238    production: &Production,
2239    cursor: &mut ChildCursor<'_>,
2240    out: &mut Output<'_>,
2241) -> Result<(), ParseError> {
2242    let depth = EMIT_DEPTH.with(|d| {
2243        let v = d.get() + 1;
2244        d.set(v);
2245        v
2246    });
2247    if depth > 500 {
2248        EMIT_DEPTH.with(|d| d.set(d.get() - 1));
2249        return Err(ParseError::EmitFailed {
2250            protocol: protocol.to_owned(),
2251            reason: format!(
2252                "emit_production recursion >500 (likely a cyclic grammar; \
2253                     vertex='{vertex_id}')"
2254            ),
2255        });
2256    }
2257    drain_extras(protocol, schema, grammar, cursor, out)?;
2258    let result = emit_production_inner(
2259        protocol, schema, grammar, vertex_id, production, cursor, out,
2260    );
2261    EMIT_DEPTH.with(|d| d.set(d.get() - 1));
2262    result
2263}
2264
2265/// Consume and emit every leading edge on `cursor` whose target kind
2266/// is in `grammar.extras` (typically `line_comment` / `block_comment`).
2267/// Extras live outside the production grammar — tree-sitter skips them
2268/// at parse time and records them as children of the surrounding
2269/// vertex — so the rule walker cannot reconcile them against the
2270/// cursor. Draining them as a side channel preserves their content in
2271/// the output without confusing the structural matchers.
2272fn drain_extras(
2273    protocol: &str,
2274    schema: &Schema,
2275    grammar: &Grammar,
2276    cursor: &mut ChildCursor<'_>,
2277    out: &mut Output<'_>,
2278) -> Result<(), ParseError> {
2279    if grammar.extras.is_empty() {
2280        return Ok(());
2281    }
2282    loop {
2283        let next_extra: Option<usize> = cursor
2284            .edges
2285            .iter()
2286            .enumerate()
2287            .find(|(i, _)| !cursor.consumed[*i])
2288            .and_then(|(i, edge)| {
2289                let kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref())?;
2290                if grammar.extras.contains(kind) {
2291                    Some(i)
2292                } else {
2293                    None
2294                }
2295            });
2296        let Some(idx) = next_extra else {
2297            return Ok(());
2298        };
2299        cursor.consumed[idx] = true;
2300        let target = &cursor.edges[idx].tgt;
2301        emit_vertex(protocol, schema, grammar, target, out)?;
2302    }
2303}
2304
2305/// Emit a SEQ production with positionally classified token roles.
2306///
2307/// Instead of looking up roles from a precomputed flat map (which
2308/// conflates the same token text across structural contexts within
2309/// one rule), this function computes roles for each STRING position
2310/// from the SEQ's own structure at emission time.
2311fn emit_seq_with_roles(
2312    protocol: &str,
2313    schema: &Schema,
2314    grammar: &Grammar,
2315    vertex_id: &panproto_gat::Name,
2316    members: &[Production],
2317    cursor: &mut ChildCursor<'_>,
2318    out: &mut Output<'_>,
2319    in_choice: bool,
2320) -> Result<(), ParseError> {
2321    let positional_roles = classify_seq_positions(members, in_choice);
2322
2323    // Detect which bracket-open position triggers indentation so we can
2324    // emit the matching IndentClose for the bracket-close.
2325    let indent_open_idx: Option<usize> = positional_roles.iter().enumerate().position(|(i, r)| {
2326        *r == Some(TokenRole::BracketOpen) && seq_bracket_triggers_indent(members, i, grammar)
2327    });
2328
2329    // For word-like bracket pairs (function/end, if/end, etc.), find
2330    // positions that need LineBreak: the body CHOICE and any FIELD
2331    // members that follow it (elseif/else clauses, catch blocks, etc.).
2332    let mut line_break_positions: std::collections::HashSet<usize> =
2333        std::collections::HashSet::new();
2334    if let Some(oi) = indent_open_idx {
2335        let open_text = unwrap_to_string(&members[oi]);
2336        if open_text.is_some_and(is_word_like) {
2337            let mut found_body = false;
2338            for (j, member) in members.iter().enumerate().skip(oi + 1) {
2339                if let Production::Choice { members: alts } = member {
2340                    let has_blank = alts.iter().any(|a| matches!(a, Production::Blank));
2341                    let has_block_symbol = alts.iter().any(|a| match a {
2342                        Production::Symbol { name } => {
2343                            grammar.rules.get(name).is_some_and(has_repeat_in)
2344                        }
2345                        _ => false,
2346                    });
2347                    if has_blank && has_block_symbol {
2348                        line_break_positions.insert(j);
2349                        found_body = true;
2350                    }
2351                } else if found_body && matches!(member, Production::Field { .. }) {
2352                    line_break_positions.insert(j);
2353                }
2354            }
2355        }
2356    }
2357
2358    let mut prev_member_emitted_content = false;
2359    for (i, member) in members.iter().enumerate() {
2360        let tokens_before_member = out.tokens.len();
2361        if let Some(value) = unwrap_to_string(member) {
2362            let role = positional_roles[i].unwrap_or_else(|| {
2363                if is_word_like(value) {
2364                    TokenRole::Keyword
2365                } else {
2366                    TokenRole::Operator
2367                }
2368            });
2369
2370            if indent_open_idx == Some(i) {
2371                if is_word_like(value) {
2372                    out.tokens.push(Token::Lit(value.to_owned(), role));
2373                    out.tokens.push(Token::IndentOpen);
2374                } else {
2375                    out.token_with_indent_open(value, role);
2376                }
2377            } else if role == TokenRole::BracketClose && indent_open_idx.is_some() {
2378                out.tokens.push(Token::IndentClose);
2379                out.tokens.push(Token::Lit(value.to_owned(), role));
2380            } else {
2381                out.token_with_role(value, Some(role));
2382            }
2383        } else {
2384            // ForceSpace between consecutive content-producing SEQ
2385            // members so that sibling-vertex tokens are separated
2386            // (e.g. echo and $((  ...  )) in bash command). Skip
2387            // when the current member's rule body starts with a
2388            // bracket pair, because the preceding Terminal and the
2389            // bracket should be tight (call pattern like f(...)).
2390            if i > 0 && unwrap_to_string(&members[i - 1]).is_none() && prev_member_emitted_content {
2391                let member_starts_with_bracket = member_has_leading_bracket(member, grammar);
2392                let is_zero_width_external = matches!(
2393                    member,
2394                    Production::Symbol { name }
2395                        if name.starts_with('_') && !grammar.rules.contains_key(name)
2396                );
2397                let is_separator_choice = matches!(member, Production::Choice { members: alts }
2398                    if alts.iter().all(|a| matches!(a, Production::Blank) || unwrap_to_string(a).is_some()));
2399                let is_repeat = matches!(
2400                    member,
2401                    Production::Repeat { .. } | Production::Repeat1 { .. }
2402                );
2403                if !member_starts_with_bracket
2404                    && !is_zero_width_external
2405                    && !is_separator_choice
2406                    && !is_repeat
2407                {
2408                    out.tokens.push(Token::ForceSpace);
2409                }
2410            }
2411            if line_break_positions.contains(&i) {
2412                out.newline();
2413            }
2414            emit_production(protocol, schema, grammar, vertex_id, member, cursor, out)?;
2415        }
2416        prev_member_emitted_content = out.tokens[tokens_before_member..]
2417            .iter()
2418            .any(|t| matches!(t, Token::Lit(_, _)));
2419    }
2420    Ok(())
2421}
2422
2423fn emit_production_inner(
2424    protocol: &str,
2425    schema: &Schema,
2426    grammar: &Grammar,
2427    vertex_id: &panproto_gat::Name,
2428    production: &Production,
2429    cursor: &mut ChildCursor<'_>,
2430    out: &mut Output<'_>,
2431) -> Result<(), ParseError> {
2432    match production {
2433        Production::String { value } => {
2434            out.token(value);
2435            Ok(())
2436        }
2437        Production::Pattern { value } => {
2438            if let Some(literal) = literal_value(schema, vertex_id) {
2439                out.token_with_role(literal, Some(TokenRole::Terminal));
2440            } else if is_newline_like_pattern(value) {
2441                // Patterns like `\r?\n`, `\n`, `\r\n` are the structural
2442                // newline tokens grammars use to separate top-level
2443                // statements (csound's `_new_line`, ABC's line-end, etc.).
2444                // Emitting them through the placeholder fallback rendered
2445                // the bare `_` sentinel between siblings; route them to
2446                // the layout pass's line-break instead so the output
2447                // re-parses.
2448                out.newline();
2449            } else if is_whitespace_only_pattern(value) {
2450                // `\s+`, `[ \t]+` and friends are interstitial whitespace
2451                // tokens. Emit nothing: the layout pass inserts the
2452                // policy separator between adjacent Lits if needed.
2453            } else {
2454                out.token_with_role(&placeholder_for_pattern(value), Some(TokenRole::Terminal));
2455            }
2456            Ok(())
2457        }
2458        Production::Blank => Ok(()),
2459        Production::Symbol { name } => {
2460            // Inside a FIELD body, a SYMBOL consumes by field-name on
2461            // the outer cursor rather than searching by symbol-match.
2462            // This covers the simple `FIELD(SYMBOL X)` case as well as
2463            // every nesting under FIELD that contains SYMBOLs (SEQ,
2464            // CHOICE, REPEAT, OPTIONAL, ALIAS). Without the override,
2465            // shapes like `field('args', commaSep1($.X))` consume one
2466            // field edge in the FIELD handler and then the REPEAT
2467            // inside SEQ searches the consumed child's cursor — where
2468            // no sibling field edges sit — and breaks after one
2469            // iteration.
2470            if let Some(field) = current_field_context() {
2471                if let Some(edge) = cursor.take_field(&field) {
2472                    return emit_in_child_context(
2473                        protocol, schema, grammar, &edge.tgt, production, out,
2474                    );
2475                }
2476                // No matching field-named edge left on the outer
2477                // cursor. Surface nothing; the surrounding REPEAT /
2478                // OPTIONAL / CHOICE backtracks the literal tokens it
2479                // emitted on this iteration when it sees no progress.
2480                return Ok(());
2481            }
2482            if name.starts_with('_') {
2483                // Hidden rule: not a vertex kind on the schema side.
2484                // Inline-expand the rule body so its children take
2485                // edges from the current cursor, instead of trying to
2486                // take a single child edge that "satisfies" the
2487                // hidden rule and discarding the rest of the body
2488                // (which would drop tokens like `=` and the trailing
2489                // value SYMBOL inside e.g. TOML's `_inline_pair`).
2490                //
2491                // Wrapped in a μ-frame so a hidden rule that
2492                // references its own kind cyclically (or another
2493                // hidden rule that closes the cycle) unfolds once
2494                // and then collapses to the empty sequence at the
2495                // second visit, rather than blowing the stack.
2496                if let Some(rule) = grammar.rules.get(name) {
2497                    let old_rule = out.current_rule.take();
2498                    out.current_rule = Some(name.to_owned());
2499                    let result = walk_in_mu_frame(
2500                        protocol, schema, grammar, vertex_id, name, rule, cursor, out,
2501                    );
2502                    out.current_rule = old_rule;
2503                    result
2504                } else {
2505                    // External hidden rule (declared in the
2506                    // grammar's `externals` block, scanned by C code,
2507                    // not listed in `rules`). Heuristic fallback by
2508                    // name:
2509                    //
2510                    // - `_indent` / `*_indent`: open an indent block.
2511                    //   Indent-based grammars (Python, YAML, qvr)
2512                    //   declare an `_indent` external scanner before
2513                    //   the body of a block-bodied declaration; the
2514                    //   emitted output is unparseable without the
2515                    //   corresponding indentation jump.
2516                    // - `_dedent` / `*_dedent`: close the matching
2517                    //   indent block.
2518                    // - `_newline` / `*_line_ending` / `*_or_eof`:
2519                    //   universally newline-or-empty; emitting a
2520                    //   single newline is the right default for
2521                    //   grammars like TOML whose `pair` SEQ trails
2522                    //   into `_line_ending_or_eof`.
2523                    //
2524                    // Check the precomputed alias map first: if this
2525                    // external token appears as the content of an
2526                    // anonymous ALIAS elsewhere in the grammar, emit
2527                    // the alias value as the token text.
2528                    if let Some(alias_value) = grammar.external_alias_map.get(name) {
2529                        out.token(alias_value);
2530                        return Ok(());
2531                    }
2532                    if grammar.external_indent_opens.contains(name) {
2533                        out.indent_open();
2534                    } else if grammar.external_indent_closes.contains(name) {
2535                        out.indent_close();
2536                    } else if grammar.external_newlines.contains(name) {
2537                        out.newline();
2538                    } else if grammar.external_semicolons.contains(name) {
2539                        out.token_with_role(";", Some(TokenRole::Separator));
2540                    } else if let Some(default) = out
2541                        .cassette
2542                        .and_then(|c| crate::languages::cassettes::resolve_external_token(c, name))
2543                    {
2544                        if !default.is_empty() {
2545                            out.token(default);
2546                        }
2547                    }
2548                    Ok(())
2549                }
2550            } else if let Some(edge) = { take_symbol_match(grammar, schema, cursor, name) } {
2551                // For supertype / hidden-rule dispatch the child's
2552                // own kind names the actual production to walk
2553                // (`child.kind` IS the subtype). For ALIAS the
2554                // dependent-optic context is carried by the
2555                // surrounding `Production::Alias` branch, which calls
2556                // `emit_aliased_child` directly; we don't reach here
2557                // for that case. So walking `grammar.rules[child.kind]`
2558                // via `emit_vertex` is correct: the dependent-optic
2559                // path is preserved at every site where it actually
2560                // diverges from `child.kind`.
2561                emit_vertex(protocol, schema, grammar, &edge.tgt, out)
2562            } else if vertex_id_kind(schema, vertex_id) == Some(name.as_str()) {
2563                let rule = grammar
2564                    .rules
2565                    .get(name)
2566                    .ok_or_else(|| ParseError::EmitFailed {
2567                        protocol: protocol.to_owned(),
2568                        reason: format!("no production for SYMBOL '{name}'"),
2569                    })?;
2570                // Self-reference (`X = ... SYMBOL X ...`): wrap in a
2571                // μ-frame so re-entry collapses to the empty sequence.
2572                {
2573                    let old_rule = out.current_rule.take();
2574                    out.current_rule = Some(name.to_owned());
2575                    let result = walk_in_mu_frame(
2576                        protocol, schema, grammar, vertex_id, name, rule, cursor, out,
2577                    );
2578                    out.current_rule = old_rule;
2579                    result
2580                }
2581            } else {
2582                // Named rule with no matching child: emit nothing and
2583                // let the surrounding CHOICE / OPTIONAL / REPEAT
2584                // resolve the absence.
2585                Ok(())
2586            }
2587        }
2588        Production::Seq { members } => emit_seq_with_roles(
2589            protocol, schema, grammar, vertex_id, members, cursor, out, false,
2590        ),
2591        Production::Choice { members } => {
2592            if let Some(matched) =
2593                pick_choice_with_cursor(schema, grammar, vertex_id, cursor, members)
2594            {
2595                match matched {
2596                    Production::Seq {
2597                        members: seq_members,
2598                    } => emit_seq_with_roles(
2599                        protocol,
2600                        schema,
2601                        grammar,
2602                        vertex_id,
2603                        seq_members,
2604                        cursor,
2605                        out,
2606                        true,
2607                    ),
2608                    Production::String { value } => {
2609                        let role = if is_word_like(value) {
2610                            TokenRole::Keyword
2611                        } else {
2612                            TokenRole::Separator
2613                        };
2614                        out.token_with_role(value, Some(role));
2615                        Ok(())
2616                    }
2617                    _ => {
2618                        emit_production(protocol, schema, grammar, vertex_id, matched, cursor, out)
2619                    }
2620                }
2621            } else {
2622                Ok(())
2623            }
2624        }
2625        Production::Repeat { content } | Production::Repeat1 { content } => {
2626            // Detect a "separator-leading SEQ" iteration body: SEQ whose
2627            // first member is a CHOICE containing BLANK (or an OPTIONAL),
2628            // i.e. the source-level separator between two iterations is
2629            // syntactically optional. When the chosen alternative for
2630            // that separator slot emits zero content tokens at runtime,
2631            // there was no source-level separator between this iteration
2632            // and the previous one; the layout pass must suppress its
2633            // policy separator to match the source's tight adjacency.
2634            //
2635            // Categorical reading: REPEAT body `B = SEQ(SEP, BODY)` is
2636            // the pullback of two halves. The bytes emitted in iteration
2637            // k+1 are a concatenation of `SEP_k+1` and `BODY_k+1`; if
2638            // `SEP_k+1` is the empty word, the concatenation of
2639            // `BODY_k` and `BODY_k+1` must remain a single contiguous
2640            // span. Hence the NoSpace marker.
2641            // Also detect mandatory separators: STRING at position 0
2642            // of a SEQ body (e.g. `SEQ[";", SYMBOL stmt]` in Python's
2643            // _simple_statements). For these, the cassette may override
2644            // the separator with a line break.
2645            let mandatory_sep_text: Option<&str> = match content.as_ref() {
2646                Production::Seq { members } if members.len() >= 2 => unwrap_to_string(&members[0]),
2647                _ => None,
2648            };
2649            let separator_leading_seq: Option<&[Production]> = match content.as_ref() {
2650                Production::Seq { members } if members.len() >= 2 => {
2651                    let first = &members[0];
2652                    let is_mandatory_sep = unwrap_to_string(first).is_some();
2653                    let cassette_overrides = is_mandatory_sep
2654                        && unwrap_to_string(first).is_some_and(|sep| {
2655                            out.cassette.is_some_and(|c| c.separator_is_line_break(sep))
2656                        });
2657                    let is_separator_slot = match first {
2658                        Production::Choice { members } => {
2659                            members.iter().any(|m| matches!(m, Production::Blank))
2660                        }
2661                        Production::Optional { .. } => true,
2662                        _ => cassette_overrides,
2663                    };
2664                    if is_separator_slot {
2665                        Some(members.as_slice())
2666                    } else {
2667                        None
2668                    }
2669                }
2670                _ => None,
2671            };
2672
2673            let mut emitted_any = false;
2674            loop {
2675                let cursor_snap = cursor.consumed.clone();
2676                let out_snap = out.snapshot();
2677                let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
2678                let result: Result<(), ParseError> =
2679                    if let Some(seq_members) = separator_leading_seq {
2680                        // Emit the separator slot first and observe
2681                        // whether it contributed any Lit. If not, push
2682                        // a NoSpace marker before walking the remaining
2683                        // SEQ members. The OutputSnapshot here covers
2684                        // only the separator's emission window.
2685                        let cassette_replaces_sep = mandatory_sep_text.is_some_and(|sep| {
2686                            out.cassette.is_some_and(|c| c.separator_is_line_break(sep))
2687                        });
2688                        let pre_sep = out.snapshot();
2689                        let sep_result = if cassette_replaces_sep {
2690                            out.newline();
2691                            Ok(())
2692                        } else {
2693                            emit_production(
2694                                protocol,
2695                                schema,
2696                                grammar,
2697                                vertex_id,
2698                                &seq_members[0],
2699                                cursor,
2700                                out,
2701                            )
2702                        };
2703                        match sep_result {
2704                            Err(e) => Err(e),
2705                            Ok(()) => {
2706                                if !cassette_replaces_sep && !out.lit_emitted_since(pre_sep) {
2707                                    out.no_space();
2708                                }
2709                                let mut rest_result = Ok(());
2710                                for member in &seq_members[1..] {
2711                                    rest_result = emit_production(
2712                                        protocol, schema, grammar, vertex_id, member, cursor, out,
2713                                    );
2714                                    if rest_result.is_err() {
2715                                        break;
2716                                    }
2717                                }
2718                                rest_result
2719                            }
2720                        }
2721                    } else {
2722                        emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
2723                    };
2724                let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
2725                if result.is_err() || consumed_after == consumed_before {
2726                    cursor.consumed = cursor_snap;
2727                    out.restore(out_snap);
2728                    break;
2729                }
2730                emitted_any = true;
2731            }
2732            if matches!(production, Production::Repeat1 { .. }) && !emitted_any {
2733                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)?;
2734            }
2735            Ok(())
2736        }
2737        Production::Optional { content } => {
2738            let cursor_snap = cursor.consumed.clone();
2739            let out_snap = out.snapshot();
2740            let consumed_before = cursor.consumed.iter().filter(|&&c| c).count();
2741            let result =
2742                emit_production(protocol, schema, grammar, vertex_id, content, cursor, out);
2743            // OPTIONAL is a backtracking site: if the inner production
2744            // errored *or* made no progress without leaving a witness
2745            // constraint, restore both cursor and output to their
2746            // pre-attempt state. Mirrors `Repeat`'s loop body.
2747            if result.is_err() {
2748                cursor.consumed = cursor_snap;
2749                out.restore(out_snap);
2750                return result;
2751            }
2752            let consumed_after = cursor.consumed.iter().filter(|&&c| c).count();
2753            if consumed_after == consumed_before
2754                && !has_relevant_constraint(content, schema, vertex_id)
2755            {
2756                cursor.consumed = cursor_snap;
2757                out.restore(out_snap);
2758            }
2759            Ok(())
2760        }
2761        Production::Field { name, content } => {
2762            // Set the field context for the duration of `content`'s
2763            // walk and emit the content against the *outer* cursor.
2764            // The SYMBOL handler picks up the context and pulls
2765            // successive `take_field(name)` edges as it encounters
2766            // SYMBOLs anywhere under `content` (under SEQ, CHOICE,
2767            // REPEAT, OPTIONAL, ALIAS — arbitrarily nested). This
2768            // subsumes the prior carve-outs for FIELD(REPEAT(...)),
2769            // FIELD(REPEAT1(...)), and the bare FIELD(SYMBOL ...)
2770            // case, and adds coverage for
2771            // `field('xs', commaSep1($.X))` which expands to
2772            // FIELD(SEQ(SYMBOL X, REPEAT(SEQ(',', SYMBOL X)))) and
2773            // any other shape where REPEAT/REPEAT1 sits inside SEQ /
2774            // CHOICE / OPTIONAL under a FIELD. A FIELD that wraps a
2775            // non-SYMBOL production (e.g. `field('op', '+')` or
2776            // `field('op', CHOICE(STRING ...))`) still works: STRING
2777            // handlers ignore the context and emit literals
2778            // directly, so the operator token survives the round
2779            // trip.
2780            let _guard = push_field_context(name);
2781            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
2782        }
2783        Production::Alias {
2784            content,
2785            named,
2786            value,
2787        } => {
2788            // A named ALIAS rewrites the parser-visible kind to
2789            // `value`. If the cursor has an unconsumed child whose
2790            // kind matches that alias name, take it and emit the
2791            // child using the alias's INNER content as the rule
2792            // (e.g. `ALIAS { SYMBOL real_rule, value: "kind_x" }`
2793            // means a `kind_x` vertex on the schema should be walked
2794            // through `real_rule`'s body, not through whatever rule
2795            // happens to be keyed under `kind_x`). This is the
2796            // dependent-optic shape: the rule the emitter walks at a
2797            // child position is determined by the parent's chosen
2798            // alias, not by the child kind alone — without it,
2799            // grammars like YAML that introduce the same kind through
2800            // many ALIAS sites lose the parent context the moment
2801            // emit_vertex is called.
2802            if *named && !value.is_empty() {
2803                if let Some(edge) = cursor.take_matching(|edge| {
2804                    schema
2805                        .vertices
2806                        .get(&edge.tgt)
2807                        .map(|v| v.kind.as_ref() == value.as_str())
2808                        .unwrap_or(false)
2809                }) {
2810                    return emit_aliased_child(protocol, schema, grammar, &edge.tgt, content, out);
2811                }
2812            }
2813            // For anonymous aliases (named: false) whose content is an
2814            // external scanner token with no grammar rule (e.g.
2815            // JavaScript's `_ternary_qmark` aliased to `"?"`), emit the
2816            // alias value directly. The content's SYMBOL handler would
2817            // fall through the external-token heuristic and produce
2818            // nothing; the alias value IS the token text.
2819            if !*named && !value.is_empty() {
2820                if let Production::Symbol { name: sym } = content.as_ref() {
2821                    if sym.starts_with('_') && !grammar.rules.contains_key(sym) {
2822                        out.token(value);
2823                        return Ok(());
2824                    }
2825                }
2826            }
2827            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
2828        }
2829        Production::ImmediateToken { content } => {
2830            // IMMEDIATE_TOKEN is the grammar's explicit signal that the
2831            // wrapped token must have no preceding whitespace. Lift it
2832            // to a NoSpace marker here, at the unique structural site
2833            // where the property is declared. The layout pass reads
2834            // the marker; downstream code does not need to inspect
2835            // production shapes to recover this property.
2836            out.no_space();
2837            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
2838        }
2839        Production::Token { content }
2840        | Production::Prec { content, .. }
2841        | Production::PrecLeft { content, .. }
2842        | Production::PrecRight { content, .. }
2843        | Production::PrecDynamic { content, .. }
2844        | Production::Reserved { content, .. } => {
2845            emit_production(protocol, schema, grammar, vertex_id, content, cursor, out)
2846        }
2847    }
2848}
2849
2850/// Take the next cursor edge whose target vertex's kind matches the
2851/// SYMBOL `name` directly or via inline expansion of a hidden rule.
2852fn take_symbol_match<'a>(
2853    grammar: &Grammar,
2854    schema: &Schema,
2855    cursor: &mut ChildCursor<'a>,
2856    name: &str,
2857) -> Option<&'a Edge> {
2858    // Prefer non-field edges (`child_of`) to avoid consuming a
2859    // field-named edge that a later FIELD handler should claim.
2860    // Field-named edges (edge.kind != "child_of") are reserved for
2861    // the FIELD production that names them; consuming one here would
2862    // steal it from its intended handler (e.g. `as_pattern`'s
2863    // `alias` field edge consumed by the leading `expression`
2864    // SYMBOL instead of the trailing FIELD "alias" handler).
2865    if let Some(edge) = cursor.take_matching(|edge| {
2866        edge.kind.as_ref() == "child_of" && {
2867            let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
2868            kind_satisfies_symbol(grammar, target_kind, name)
2869        }
2870    }) {
2871        return Some(edge);
2872    }
2873    cursor.take_matching(|edge| {
2874        let target_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
2875        kind_satisfies_symbol(grammar, target_kind, name)
2876    })
2877}
2878
2879/// Decide whether a schema vertex of kind `target_kind` satisfies a
2880/// SYMBOL `name` reference in the grammar.
2881///
2882/// Operates as an O(1) lookup against the precomputed subtype
2883/// closure built at [`Grammar::from_bytes`]. The semantic content is
2884/// "K satisfies SYMBOL S iff K is reachable from S by walking the
2885/// grammar's hidden, supertype, and named-alias dispatch": this is
2886/// exactly the relation tree-sitter induces on `(parser-visible kind,
2887/// rule-position)` pairs.
2888fn kind_satisfies_symbol(grammar: &Grammar, target_kind: Option<&str>, name: &str) -> bool {
2889    let Some(target) = target_kind else {
2890        return false;
2891    };
2892    if target == name {
2893        return true;
2894    }
2895    grammar
2896        .subtypes
2897        .get(target)
2898        .is_some_and(|set| set.contains(name))
2899}
2900
2901/// Emit a child reached through an ALIAS production using the
2902/// alias's inner content as the rule, not `grammar.rules[child.kind]`.
2903///
2904/// This carries the dependent-optic context across the ALIAS edge:
2905/// at the parent rule's site we know which underlying production the
2906/// alias wraps (typically `SYMBOL real_rule`), and that's the
2907/// production that should drive the emit walk on the child's
2908/// children. Looking up `grammar.rules.get(child.kind)` instead would
2909/// either fail (the renamed kind has no top-level rule, e.g. YAML's
2910/// `block_mapping_pair`) or pick an arbitrary same-kinded rule from
2911/// elsewhere in the grammar.
2912///
2913/// Walk-context invariant. The dependent-optic shape of `emit_pretty`
2914/// says: the production walked at any vertex is determined by the
2915/// path from the root through the grammar, not by the vertex kind in
2916/// isolation. Two dispatch sites realise that invariant:
2917///
2918/// * [`emit_vertex`] looks up `grammar.rules[child.kind]` and walks
2919///   it. Correct for supertype / hidden-rule dispatch: the child's
2920///   kind on the schema IS the subtype tree-sitter selected, so its
2921///   top-level rule is the right production to walk.
2922/// * `emit_aliased_child` threads the parent rule's `Production`
2923///   directly (the inner `content` of `Production::Alias`) and walks
2924///   it on the child's children. Correct for ALIAS dispatch: the
2925///   child's kind on the schema is the alias's `value` (a renamed
2926///   kind that may have no top-level rule), and the production to
2927///   walk is the alias's content body, supplied by the parent.
2928///
2929/// Together these cover every site where the rule-walked-at-child
2930/// diverges from `grammar.rules[child.kind]`; the recursion site for
2931/// plain SYMBOL therefore correctly delegates to `emit_vertex`, and
2932/// we do not need a richer `WalkContext` value passed by reference.
2933/// The grammar dependency is the thread.
2934fn emit_aliased_child(
2935    protocol: &str,
2936    schema: &Schema,
2937    grammar: &Grammar,
2938    child_id: &panproto_gat::Name,
2939    content: &Production,
2940    out: &mut Output<'_>,
2941) -> Result<(), ParseError> {
2942    // Leaf shortcut: if the child has a literal-value and no
2943    // structural children, emit the captured text. Skip for bracket-pair
2944    // literals when the production resolves to a rule with those brackets.
2945    if let Some(literal) = literal_value(schema, child_id) {
2946        if children_for(schema, child_id).is_empty() {
2947            let is_bracket_pair = literal.len() >= 2
2948                && matches!(
2949                    (literal.as_bytes().first(), literal.as_bytes().last()),
2950                    (Some(b'('), Some(b')')) | (Some(b'['), Some(b']')) | (Some(b'{'), Some(b'}'))
2951                );
2952            if !is_bracket_pair {
2953                out.token_with_role(literal, Some(TokenRole::Terminal));
2954                return Ok(());
2955            }
2956        }
2957    }
2958
2959    // Clear the enclosing FIELD context so it does not leak into the
2960    // aliased child's production walk. Without this, a FIELD("alias")
2961    // containing an ALIAS whose content is SYMBOL "expression" would
2962    // cause the inner SYMBOL handler to pull by field name "alias"
2963    // instead of by symbol match, failing to find the child edge.
2964    let _guard = clear_field_context();
2965
2966    // Resolve `content` to a rule when it's a SYMBOL (the dominant
2967    // shape: `ALIAS { content: SYMBOL real_rule, value: "kind_x" }`).
2968    if let Production::Symbol { name } = content {
2969        if let Some(rule) = grammar.rules.get(name) {
2970            let edges = children_for(schema, child_id);
2971            let mut cursor = ChildCursor::new(&edges);
2972            let old_rule = out.current_rule.take();
2973            out.current_rule = Some(name.to_owned());
2974            let result =
2975                emit_production(protocol, schema, grammar, child_id, rule, &mut cursor, out);
2976            out.current_rule = old_rule;
2977            return result;
2978        }
2979    }
2980
2981    // Other ALIAS contents (CHOICE, SEQ, literals) walk in place.
2982    let edges = children_for(schema, child_id);
2983    let mut cursor = ChildCursor::new(&edges);
2984    emit_production(
2985        protocol,
2986        schema,
2987        grammar,
2988        child_id,
2989        content,
2990        &mut cursor,
2991        out,
2992    )
2993}
2994
2995fn emit_in_child_context(
2996    protocol: &str,
2997    schema: &Schema,
2998    grammar: &Grammar,
2999    child_id: &panproto_gat::Name,
3000    production: &Production,
3001    out: &mut Output<'_>,
3002) -> Result<(), ParseError> {
3003    // The child walks under its own production tree, with its own
3004    // FIELDs setting their own contexts. Clear the outer FIELD hint
3005    // so it does not leak through and cause sibling SYMBOLs inside
3006    // the child's body to mistakenly pull edges from the child's
3007    // cursor by the parent's field name.
3008    let _guard = clear_field_context();
3009    // If `production` is a structural wrapper (CHOICE / SEQ /
3010    // OPTIONAL / ...) whose referenced symbols cover the child's own
3011    // kind, the child IS the production's target node and the right
3012    // emit path is `emit_vertex(child)` (which honours the
3013    // literal-value leaf shortcut). Without this guard, FIELD(pattern,
3014    // CHOICE { _pattern, self }) on an identifier child walks the
3015    // CHOICE on the identifier's empty cursor, falls through to the
3016    // first non-BLANK alt, and loses the captured identifier text.
3017    if !matches!(production, Production::Symbol { .. }) {
3018        let child_kind = schema.vertices.get(child_id).map(|v| v.kind.as_ref());
3019        let symbols = referenced_symbols(production);
3020        if symbols
3021            .iter()
3022            .any(|s| kind_satisfies_symbol(grammar, child_kind, s) || child_kind == Some(s))
3023        {
3024            return emit_vertex(protocol, schema, grammar, child_id, out);
3025        }
3026    }
3027    match production {
3028        Production::Symbol { .. } => emit_vertex(protocol, schema, grammar, child_id, out),
3029        _ => {
3030            let edges = children_for(schema, child_id);
3031            let mut cursor = ChildCursor::new(&edges);
3032            emit_production(
3033                protocol,
3034                schema,
3035                grammar,
3036                child_id,
3037                production,
3038                &mut cursor,
3039                out,
3040            )
3041        }
3042    }
3043}
3044
3045fn pick_choice_with_cursor<'a>(
3046    schema: &Schema,
3047    grammar: &Grammar,
3048    vertex_id: &panproto_gat::Name,
3049    cursor: &ChildCursor<'_>,
3050    alternatives: &'a [Production],
3051) -> Option<&'a Production> {
3052    // Positional discriminator: use the interstitials FROM the
3053    // current cursor position forward. Interstitials are indexed by
3054    // their gap position (interstitial-k is the gap before the k-th
3055    // named child); the slice from `consumed_count` onward captures
3056    // exactly the text the remaining CHOICE branches must consume.
3057    // This eliminates the cross-position contamination of the prior
3058    // flat blob (where a trailing-CHOICE-with-BLANK saw all the
3059    // commas separating earlier REPEAT iterations and wrongly
3060    // preferred the comma alt).
3061    //
3062    // The chose-alt-fingerprint (a single string joined from every
3063    // non-empty interstitial trimmed) is retained as a fallback for
3064    // by-construction schemas with no positional interstitials; it
3065    // is strictly less precise than positional matching.
3066    let consumed_count = cursor.consumed.iter().filter(|&&c| c).count();
3067    let positional_interstitials: Vec<&str> = schema
3068        .constraints
3069        .get(vertex_id)
3070        .map(|cs| {
3071            let mut indexed: Vec<(usize, &str)> = cs
3072                .iter()
3073                .filter_map(|c| {
3074                    let s = c.sort.as_ref();
3075                    if !s.starts_with("interstitial-") || s.ends_with("-start-byte") {
3076                        return None;
3077                    }
3078                    let idx: usize = s["interstitial-".len()..].parse().ok()?;
3079                    Some((idx, c.value.as_str()))
3080                })
3081                .collect();
3082            indexed.sort_by_key(|&(i, _)| i);
3083            indexed.into_iter().map(|(_, v)| v).collect()
3084        })
3085        .unwrap_or_default();
3086    let positional_slice: String = if positional_interstitials.is_empty() {
3087        String::new()
3088    } else {
3089        positional_interstitials
3090            .iter()
3091            .skip(consumed_count)
3092            .copied()
3093            .collect::<Vec<&str>>()
3094            .join(" ")
3095    };
3096    let fingerprint_blob = schema
3097        .constraints
3098        .get(vertex_id)
3099        .and_then(|cs| {
3100            cs.iter()
3101                .find(|c| c.sort.as_ref() == "chose-alt-fingerprint")
3102                .map(|c| c.value.clone())
3103        })
3104        .unwrap_or_default();
3105    let constraint_blob: String = if positional_slice.is_empty() {
3106        fingerprint_blob
3107    } else {
3108        positional_slice
3109    };
3110    let child_kinds: Vec<&str> = schema
3111        .constraints
3112        .get(vertex_id)
3113        .and_then(|cs| {
3114            cs.iter()
3115                .find(|c| c.sort.as_ref() == "chose-alt-child-kinds")
3116                .map(|c| c.value.split_whitespace().collect())
3117        })
3118        .unwrap_or_default();
3119    // Cursor-exhaustion BLANK-preference: when all cursor edges have
3120    // been consumed AND `BLANK` is one of the alternatives, the only
3121    // alt that won't introduce a non-existent child is `BLANK`.
3122    //
3123    // This gate fires before the literal-blob discriminator because
3124    // the fingerprint is shared across every CHOICE position in the
3125    // vertex's rule body: a vertex like `sample_step` that ends in
3126    // `..., REPEAT(SEQ(",", arg)), CHOICE(",", BLANK)` records all of
3127    // its `","` interstitials in a single blob, so the literal-score
3128    // matcher would otherwise prefer `","` for the trailing CHOICE
3129    // even when the source had no trailing comma. By the time the
3130    // emitter reaches the trailing CHOICE, the REPEAT has consumed
3131    // every arg edge in cursor order; the residual unconsumed multiset
3132    // is empty; and the categorical reading of a CHOICE-with-BLANK at
3133    // a position with no remaining children is the no-op alternative.
3134    let any_unconsumed = cursor
3135        .edges
3136        .iter()
3137        .enumerate()
3138        .any(|(i, _)| !cursor.consumed[i]);
3139    let blank_present = alternatives.iter().any(|a| matches!(a, Production::Blank));
3140    let edge_kinds: Vec<&str> = cursor
3141        .edges
3142        .iter()
3143        .enumerate()
3144        .filter(|(i, _)| !cursor.consumed[*i])
3145        .map(|(_, e)| e.kind.as_ref())
3146        .collect();
3147    if !any_unconsumed && blank_present {
3148        return alternatives.iter().find(|a| matches!(a, Production::Blank));
3149    }
3150    if !any_unconsumed && !blank_present {
3151        // When the cursor is exhausted: first prefer a newline-like
3152        // PATTERN over STRING separators (e.g. Go source_file terminator
3153        // CHOICE[PATTERN("\n"), ";", "\0"] should emit newline not ";").
3154        for alt in alternatives {
3155            if let Production::Pattern { value } = alt {
3156                if is_newline_like_pattern(value) {
3157                    return Some(alt);
3158                }
3159            }
3160        }
3161        // Then prefer a pure-literal alternative (only STRINGs, no
3162        // SYMBOLs/FIELDs) over one that merely CAN produce epsilon.
3163        // A pure-literal alternative emits concrete tokens without
3164        // needing children (e.g. ";" terminator in Rust struct_item).
3165        if let Some(pure_lit) = alternatives.iter().find(|alt| {
3166            let syms = referenced_symbols(alt);
3167            let strings = literal_strings(alt);
3168            syms.is_empty() && !strings.is_empty()
3169        }) {
3170            return Some(pure_lit);
3171        }
3172        let mut visited = std::collections::HashSet::new();
3173        let mut yield_cache = grammar.yield_sets.clone();
3174        for alt in alternatives {
3175            let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
3176            if ys.contains("") {
3177                return Some(alt);
3178            }
3179            visited.clear();
3180        }
3181    }
3182
3183    // Literal match: when a cursor edge's target vertex kind or
3184    // literal-value matches a STRING alternative exactly, pick that
3185    // alternative. Handles grammars like Go's binary_expression where
3186    // operators are anonymous named children (kind IS the operator text
3187    // like ">") and the CHOICE is over STRING operators.
3188    for edge_idx in 0..cursor.edges.len() {
3189        if cursor.consumed[edge_idx] {
3190            continue;
3191        }
3192        let edge = &cursor.edges[edge_idx];
3193        let tgt_kind = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref());
3194        let tgt_lit = literal_value(schema, &edge.tgt);
3195        for alt in alternatives {
3196            if let Production::String { value } = alt {
3197                if Some(value.as_str()) == tgt_kind || tgt_lit == Some(value.as_str()) {
3198                    return Some(alt);
3199                }
3200            }
3201        }
3202    }
3203
3204    if !constraint_blob.is_empty() {
3205        // Categorical filter: when the cursor has an unconsumed first
3206        // edge, an alt should only be considered if it can consume
3207        // that edge — OR no alt in the CHOICE can. Acceptance is the
3208        // inductive predicate `accepts_first_edge`: it fuses FIELD-name
3209        // matching with content-yield admission and SYMBOL subtype
3210        // dispatch into one rule.
3211        let first_uc_edge_pre = cursor
3212            .edges
3213            .iter()
3214            .enumerate()
3215            .find(|(i, _)| !cursor.consumed[*i])
3216            .map(|(_, e)| e);
3217        let alt_accepts = |a: &Production| -> bool {
3218            let Some(edge) = first_uc_edge_pre else {
3219                return false;
3220            };
3221            let edge_kind = edge.kind.as_ref();
3222            let Some(tgt_kind) = schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref()) else {
3223                return false;
3224            };
3225            accepts_first_edge(grammar, a, edge_kind, tgt_kind)
3226        };
3227        let any_consumes = any_unconsumed && alternatives.iter().any(alt_accepts);
3228
3229        // Primary score: literal-token match length. This dominates
3230        // alt selection so existing language tests that depend on
3231        // literal-only fingerprints keep working.
3232        // Secondary score (tiebreaker only): named-symbol kind match
3233        // count, read from the separate `chose-alt-child-kinds`
3234        // constraint (kept apart from the literal fingerprint so
3235        // identifiers like `:` in the kind list don't contaminate the
3236        // literal match). An alt that matches the recorded kinds is a
3237        // stronger witness than one whose only
3238        // overlap is literal punctuation.
3239        let mut best_literal: usize = 0;
3240        let mut best_symbols: usize = 0;
3241        let mut best_total_chars: usize = usize::MAX;
3242        let mut best_alt: Option<&Production> = None;
3243        let mut tied = false;
3244        for alt in alternatives {
3245            let strings = literal_strings(alt);
3246            if strings.is_empty() {
3247                continue;
3248            }
3249            // Categorical filter: skip alts that can't consume the
3250            // first unconsumed edge when SOME alt can.
3251            if any_consumes && !alt_accepts(alt) {
3252                continue;
3253            }
3254            let literal_score = strings
3255                .iter()
3256                .filter(|s| constraint_blob.contains(s.as_str()))
3257                .map(String::len)
3258                .sum::<usize>();
3259            if literal_score == 0 {
3260                continue;
3261            }
3262            let total_chars: usize = strings.iter().map(String::len).sum();
3263            let symbol_score = if literal_score >= best_literal && !child_kinds.is_empty() {
3264                let symbols = referenced_symbols(alt);
3265                symbols
3266                    .iter()
3267                    .filter(|sym| {
3268                        let sym_str: &str = sym;
3269                        if child_kinds.contains(&sym_str) {
3270                            return true;
3271                        }
3272                        grammar.subtypes.get(sym_str).is_some_and(|sub_set| {
3273                            sub_set
3274                                .iter()
3275                                .any(|sub| child_kinds.contains(&sub.as_str()))
3276                        })
3277                    })
3278                    .count()
3279            } else {
3280                0
3281            };
3282            let better = literal_score > best_literal
3283                || (literal_score == best_literal && symbol_score > best_symbols)
3284                || (literal_score == best_literal
3285                    && symbol_score == best_symbols
3286                    && total_chars < best_total_chars);
3287            let same = literal_score == best_literal
3288                && symbol_score == best_symbols
3289                && total_chars == best_total_chars;
3290            if better {
3291                best_literal = literal_score;
3292                best_symbols = symbol_score;
3293                best_total_chars = total_chars;
3294                best_alt = Some(alt);
3295                tied = false;
3296            } else if same && best_alt.is_some() {
3297                tied = true;
3298            }
3299        }
3300        if let Some(alt) = best_alt {
3301            if !tied {
3302                if any_unconsumed {
3303                    if alt_accepts(alt) {
3304                        return Some(alt);
3305                    }
3306                    // The best literal-score alt can't consume the
3307                    // first unconsumed cursor edge. Three sub-cases:
3308                    //  (a) No BLANK alternative: blob is the only
3309                    //      signal; return best_alt.
3310                    //  (b) BLANK present AND best_alt is pure-literal
3311                    //      (no referenced SYMBOLs): emitting best_alt
3312                    //      adds the matched literals and consumes no
3313                    //      child; the unconsumed cursor edge is for a
3314                    //      later SEQ position anyway. Return best_alt
3315                    //      (BUGS `model_block`: CHOICE[CHOICE["model",
3316                    //      "data"], BLANK] picks the literal because
3317                    //      the blob recorded it).
3318                    //  (c) BLANK present AND best_alt has SYMBOLs:
3319                    //      emitting best_alt would walk SYMBOLs that
3320                    //      can't be satisfied (they consume no edge,
3321                    //      a downstream SYMBOL would silently fail).
3322                    //      Fall through to final selection of BLANK
3323                    //      (Java `formal_parameters` inner CHOICE
3324                    //      [SEQ[receiver, ","], BLANK] with a
3325                    //      formal_parameter edge: pick BLANK).
3326                    if !blank_present || referenced_symbols(alt).is_empty() {
3327                        return Some(alt);
3328                    }
3329                } else {
3330                    return Some(alt);
3331                }
3332            }
3333        }
3334    }
3335
3336    // Cursor-driven dispatch via Yield-set preimage.
3337    //
3338    // For a CHOICE C = A1 | ... | An, Yield(Ai) is the set of vertex
3339    // kinds that can appear as the first named child when Ai is taken
3340    // (see `yield_of_production`). Given the first unconsumed cursor
3341    // edge with target kind K, select the first Ai (grammar order)
3342    // where K ∈ Yield(Ai). This is deterministic: grammar order is
3343    // the tiebreak, matching tree-sitter's own disambiguation.
3344    let first_unconsumed_kind: Option<&str> = cursor
3345        .edges
3346        .iter()
3347        .enumerate()
3348        .find(|(i, _)| !cursor.consumed[*i])
3349        .and_then(|(_, edge)| schema.vertices.get(&edge.tgt).map(|v| v.kind.as_ref()));
3350    if let Some(target_kind) = first_unconsumed_kind {
3351        // The subtype closure `subtypes[target_kind]` contains every
3352        // symbol name S such that a vertex of kind `target_kind` can
3353        // appear where the grammar says `SYMBOL S`. For a CHOICE
3354        // C = A1 | ... | An, the correct alternative is the one whose
3355        // top-level symbol is in `subtypes[target_kind]` (the target
3356        // kind IS a subtype of that symbol, so the symbol's rule body
3357        // dispatches to the target kind at parse time). This is an
3358        // O(1) set-membership check per alternative — no recursive
3359        // Yield computation needed.
3360        //
3361        // Preference order:
3362        //   1. Direct name match (target_kind == symbol name)
3363        //   2. Subtype match (symbol name ∈ subtypes[target_kind])
3364        //   3. Yield-set match (target_kind ∈ Yield(alt)) as fallback
3365        //      for non-SYMBOL alternatives (ALIAS, SEQ, etc.)
3366        let target_supers = grammar.subtypes.get(target_kind);
3367
3368        // Indented-form preference: when multiple alternatives match
3369        // the target kind (e.g. Python _suite where all three alts
3370        // produce `block`), prefer the alternative containing an
3371        // `_indent` SYMBOL. Check this BEFORE the standard passes
3372        // since they would pick the first match in grammar order.
3373        {
3374            let mut match_count = 0usize;
3375            let mut indent_alt_idx: Option<usize> = None;
3376            let mut visited = std::collections::HashSet::new();
3377            let mut yield_cache = grammar.yield_sets.clone();
3378            for (i, alt) in alternatives.iter().enumerate() {
3379                let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
3380                if ys.contains(target_kind) {
3381                    match_count += 1;
3382                    if indent_alt_idx.is_none()
3383                        && referenced_symbols(alt)
3384                            .iter()
3385                            .any(|s| grammar.external_indent_opens.contains(*s))
3386                    {
3387                        indent_alt_idx = Some(i);
3388                    }
3389                }
3390                visited.clear();
3391            }
3392            if match_count > 1 {
3393                if let Some(idx) = indent_alt_idx {
3394                    return Some(&alternatives[idx]);
3395                }
3396            }
3397        }
3398
3399        // Pass 1: direct name match
3400        for alt in alternatives {
3401            if let Production::Symbol { name } = alt {
3402                if name.as_str() == target_kind {
3403                    return Some(alt);
3404                }
3405            }
3406            if let Production::Alias {
3407                named: true, value, ..
3408            } = alt
3409            {
3410                if value.as_str() == target_kind {
3411                    return Some(alt);
3412                }
3413            }
3414        }
3415
3416        // Pass 2: subtype match (the target kind's supertype set
3417        // tells us which SYMBOL names it satisfies)
3418        if let Some(supers) = target_supers {
3419            for alt in alternatives {
3420                if let Production::Symbol { name } = alt {
3421                    if supers.contains(name.as_str()) {
3422                        return Some(alt);
3423                    }
3424                }
3425                if let Production::Alias {
3426                    named: true, value, ..
3427                } = alt
3428                {
3429                    if supers.contains(value.as_str()) {
3430                        return Some(alt);
3431                    }
3432                }
3433            }
3434        }
3435
3436        // Pass 3: Yield-set fallback for alternatives that are not
3437        // plain SYMBOLs or named ALIASes (e.g. SEQ, PREC wrappers
3438        // around SYMBOLs that the above passes don't unwrap).
3439        // Guard: skip alternatives whose FIELDs don't match any
3440        // unconsumed edge kind. A FIELD that can't be satisfied
3441        // would consume the wrong child, and the alternative is
3442        // structurally wrong for the current cursor state.
3443        let mut visited = std::collections::HashSet::new();
3444        let mut yield_cache = grammar.yield_sets.clone();
3445        let mut matching_alts: Vec<&Production> = Vec::new();
3446        for alt in alternatives {
3447            if has_any_field(alt) && !has_field_in(alt, &edge_kinds) {
3448                visited.clear();
3449                continue;
3450            }
3451            // Token-set restriction: when a FIELD's body is an
3452            // ALIAS{CHOICE[STRING...]}, the field admits only those
3453            // literal values. An alt whose token-restricted FIELDs
3454            // can't accept the cursor's edge for that field is
3455            // structurally invalid (e.g. Go `call_expression` alt 0
3456            // has `function: ALIAS{CHOICE["new","make"], ...}` and
3457            // is only valid when the function child's literal is
3458            // "new" or "make").
3459            if !alt_satisfies_field_token_restrictions(schema, cursor, alt) {
3460                visited.clear();
3461                continue;
3462            }
3463            // Alias-source discriminator: if the cursor has a
3464            // field-named edge whose `pre-alias-symbol` was recorded by
3465            // the walker, the alt's FIELD body (when it's a named
3466            // ALIAS over a SYMBOL) must reference that same source
3467            // symbol. This is the exact tree-sitter-derived signal for
3468            // ALIAS dispatch when literal-value restriction does not
3469            // apply.
3470            if !alt_satisfies_pre_alias_constraints(schema, cursor, alt) {
3471                visited.clear();
3472                continue;
3473            }
3474            let ys = yield_of_production(grammar, alt, &mut visited, &mut yield_cache);
3475            if ys.contains(target_kind) {
3476                matching_alts.push(alt);
3477            }
3478            visited.clear();
3479        }
3480        if matching_alts.len() == 1 {
3481            return Some(matching_alts[0]);
3482        }
3483        if matching_alts.len() > 1 {
3484            // When multiple alternatives match via yield-set, apply
3485            // tree-sitter's precedence ordering: higher PREC wins.
3486            // This is the grammar author's explicit disambiguator for
3487            // ambiguous productions; it should be honored unconditionally,
3488            // not gated on whether the constraint blob is empty.
3489            matching_alts.sort_by_key(|alt| std::cmp::Reverse(prec_value(alt)));
3490            return Some(matching_alts[0]);
3491        }
3492    }
3493
3494    // FIELD dispatch: pick an alternative whose FIELD name matches an
3495    // unconsumed edge kind.
3496    for alt in alternatives {
3497        if has_field_in(alt, &edge_kinds) {
3498            return Some(alt);
3499        }
3500    }
3501
3502    // No dispatch tier matched. The final selection follows the
3503    // categorical semantics of CHOICE-with-BLANK: BLANK represents ε
3504    // (produce nothing at this position). It is correct if and only
3505    // if no child remains to consume at this cursor position.
3506    //
3507    // When unconsumed non-extra children remain, selecting BLANK
3508    // would silently drop them. Select the first non-BLANK
3509    // alternative instead so the production walk can attempt to
3510    // consume them (the grammar rule may reference a symbol name
3511    // that doesn't exactly match the parse output's child kind,
3512    // e.g. Julia's macrocall_expression receives `argument_list`
3513    // children when grammar.json only references
3514    // `macro_argument_list`).
3515    let _ = (schema, vertex_id);
3516    // Prefer newline-like PATTERN over STRING ";" or other separators
3517    // when both are alternatives. The PATTERN produces a structural
3518    // LineBreak which is semantically correct for top-level terminators
3519    // (Go's source_file REPEAT terminator).
3520    let has_newline_pattern = alternatives
3521        .iter()
3522        .any(|a| matches!(a, Production::Pattern { value } if is_newline_like_pattern(value)));
3523    if has_newline_pattern {
3524        for alt in alternatives {
3525            if let Production::Pattern { value } = alt {
3526                if is_newline_like_pattern(value) {
3527                    return Some(alt);
3528                }
3529            }
3530        }
3531    }
3532    if alternatives.iter().any(|a| matches!(a, Production::Blank)) {
3533        // Before selecting BLANK, check if a hidden-rule alternative
3534        // resolves to a newline-like PATTERN. Prefer it: it produces
3535        // a LineBreak which is semantically correct for terminators
3536        // like Julia's _terminator = CHOICE[PATTERN "\r?\n", ...].
3537        for alt in alternatives {
3538            if let Production::Symbol { name } = alt {
3539                if name.starts_with('_') {
3540                    if let Some(rule) = grammar.rules.get(name) {
3541                        if contains_newline_pattern(rule) {
3542                            return Some(alt);
3543                        }
3544                    }
3545                }
3546            }
3547        }
3548        return alternatives.iter().find(|a| matches!(a, Production::Blank));
3549    }
3550    // When cursor is exhausted and no BLANK, prefer an alternative
3551    // that references NO symbols (pure-literal: only STRINGs, PATTERNs,
3552    // BLANKs). Such an alternative can produce output without consuming
3553    // any children and is safe when the cursor is empty (e.g. the ";"
3554    // terminator in Rust's struct_item vs SEQ with FIELD body).
3555    if !any_unconsumed {
3556        if let Some(pure_lit) = alternatives.iter().find(|alt| {
3557            let syms = referenced_symbols(alt);
3558            syms.is_empty() && !matches!(alt, Production::Blank)
3559        }) {
3560            return Some(pure_lit);
3561        }
3562    }
3563    alternatives
3564        .iter()
3565        .find(|alt| !matches!(alt, Production::Blank))
3566}
3567
3568/// Collect every literal STRING token directly inside `production`
3569/// (without descending into SYMBOLs / hidden rules). Used to score
3570/// CHOICE alternatives against the parent vertex's interstitials so
3571/// the right operator / keyword form is picked when the schema
3572/// preserves interstitial fragments from a prior parse.
3573fn literal_strings(production: &Production) -> Vec<String> {
3574    let mut out = Vec::new();
3575    fn walk(p: &Production, out: &mut Vec<String>) {
3576        match p {
3577            Production::String { value } if !value.is_empty() => {
3578                out.push(value.clone());
3579            }
3580            Production::Choice { members } | Production::Seq { members } => {
3581                for m in members {
3582                    walk(m, out);
3583                }
3584            }
3585            Production::Repeat { content }
3586            | Production::Repeat1 { content }
3587            | Production::Optional { content }
3588            | Production::Field { content, .. }
3589            | Production::Alias { content, .. }
3590            | Production::Token { content }
3591            | Production::ImmediateToken { content }
3592            | Production::Prec { content, .. }
3593            | Production::PrecLeft { content, .. }
3594            | Production::PrecRight { content, .. }
3595            | Production::PrecDynamic { content, .. }
3596            | Production::Reserved { content, .. } => walk(content, out),
3597            _ => {}
3598        }
3599    }
3600    walk(production, &mut out);
3601    out
3602}
3603
3604/// Collect every SYMBOL name reachable from `production` without
3605/// crossing into nested rules. Used by `pick_choice_with_cursor` to
3606/// rank alternatives by "any SYMBOL inside this alt matches something
3607/// on the cursor", instead of just the first SYMBOL: a leading
3608/// optional like `attribute_item` then `parameter` is otherwise
3609/// rejected when only the parameter children are present.
3610fn referenced_symbols(production: &Production) -> Vec<&str> {
3611    let mut out = Vec::new();
3612    fn walk<'a>(p: &'a Production, out: &mut Vec<&'a str>) {
3613        match p {
3614            Production::Symbol { name } => out.push(name.as_str()),
3615            Production::Choice { members } | Production::Seq { members } => {
3616                for m in members {
3617                    walk(m, out);
3618                }
3619            }
3620            Production::Alias {
3621                content,
3622                named,
3623                value,
3624            } => {
3625                // A named ALIAS produces a child vertex whose kind is
3626                // the alias `value` (e.g. `ALIAS { content: STRING "=",
3627                // value: "punctuation", named: true }` introduces a
3628                // `punctuation` child). For cursor-driven dispatch to
3629                // recognise alts that emit such children, yield the
3630                // alias value as a referenced symbol. Anonymous aliases
3631                // do not introduce a named node and only need their
3632                // inner content's symbols.
3633                if *named && !value.is_empty() {
3634                    out.push(value.as_str());
3635                }
3636                walk(content, out);
3637            }
3638            Production::Repeat { content }
3639            | Production::Repeat1 { content }
3640            | Production::Optional { content }
3641            | Production::Field { content, .. }
3642            | Production::Token { content }
3643            | Production::ImmediateToken { content }
3644            | Production::Prec { content, .. }
3645            | Production::PrecLeft { content, .. }
3646            | Production::PrecRight { content, .. }
3647            | Production::PrecDynamic { content, .. }
3648            | Production::Reserved { content, .. } => walk(content, out),
3649            _ => {}
3650        }
3651    }
3652    walk(production, &mut out);
3653    out
3654}
3655
3656#[cfg(test)]
3657fn first_symbol(production: &Production) -> Option<&str> {
3658    match production {
3659        Production::Symbol { name } => Some(name),
3660        Production::Seq { members } => members.iter().find_map(first_symbol),
3661        Production::Choice { members } => members.iter().find_map(first_symbol),
3662        Production::Repeat { content }
3663        | Production::Repeat1 { content }
3664        | Production::Optional { content }
3665        | Production::Field { content, .. }
3666        | Production::Alias { content, .. }
3667        | Production::Token { content }
3668        | Production::ImmediateToken { content }
3669        | Production::Prec { content, .. }
3670        | Production::PrecLeft { content, .. }
3671        | Production::PrecRight { content, .. }
3672        | Production::PrecDynamic { content, .. }
3673        | Production::Reserved { content, .. } => first_symbol(content),
3674        _ => None,
3675    }
3676}
3677
3678fn prec_value(prod: &Production) -> i64 {
3679    match prod {
3680        Production::Prec { value, .. }
3681        | Production::PrecLeft { value, .. }
3682        | Production::PrecRight { value, .. }
3683        | Production::PrecDynamic { value, .. } => value.as_i64().unwrap_or(0),
3684        _ => 0,
3685    }
3686}
3687
3688fn has_any_field(production: &Production) -> bool {
3689    match production {
3690        Production::Field { .. } => true,
3691        Production::Seq { members } | Production::Choice { members } => {
3692            members.iter().any(has_any_field)
3693        }
3694        Production::Repeat { content }
3695        | Production::Repeat1 { content }
3696        | Production::Optional { content }
3697        | Production::Alias { content, .. }
3698        | Production::Token { content }
3699        | Production::ImmediateToken { content }
3700        | Production::Prec { content, .. }
3701        | Production::PrecLeft { content, .. }
3702        | Production::PrecRight { content, .. }
3703        | Production::PrecDynamic { content, .. }
3704        | Production::Reserved { content, .. } => has_any_field(content),
3705        _ => false,
3706    }
3707}
3708
3709fn has_field_in(production: &Production, edge_kinds: &[&str]) -> bool {
3710    match production {
3711        Production::Field { name, .. } => edge_kinds.contains(&name.as_str()),
3712        Production::Seq { members } | Production::Choice { members } => {
3713            members.iter().any(|m| has_field_in(m, edge_kinds))
3714        }
3715        Production::Repeat { content }
3716        | Production::Repeat1 { content }
3717        | Production::Optional { content }
3718        | Production::Alias { content, .. }
3719        | Production::Token { content }
3720        | Production::ImmediateToken { content }
3721        | Production::Prec { content, .. }
3722        | Production::PrecLeft { content, .. }
3723        | Production::PrecRight { content, .. }
3724        | Production::PrecDynamic { content, .. }
3725        | Production::Reserved { content, .. } => has_field_in(content, edge_kinds),
3726        _ => false,
3727    }
3728}
3729
3730/// Collect every `(field_name, restricted_token_set)` pair under `production`
3731/// where the FIELD's body is an ALIAS whose inner content is a CHOICE of
3732/// pure STRINGs (or a single STRING). Such a FIELD restricts the field's
3733/// child literal-value to that set: the alternative is only structurally
3734/// valid for cursors whose field-named edge target carries a literal in
3735/// the set. Returns an empty vec when `production` has no token-restricted
3736/// FIELDs.
3737fn collect_field_token_restrictions<'a>(
3738    production: &'a Production,
3739    out: &mut Vec<(&'a str, Vec<&'a str>)>,
3740) {
3741    match production {
3742        Production::Field { name, content } => {
3743            if let Some(strings) = literal_choice_set(content) {
3744                out.push((name.as_str(), strings));
3745            }
3746            collect_field_token_restrictions(content, out);
3747        }
3748        Production::Seq { members } | Production::Choice { members } => {
3749            for m in members {
3750                collect_field_token_restrictions(m, out);
3751            }
3752        }
3753        Production::Repeat { content }
3754        | Production::Repeat1 { content }
3755        | Production::Optional { content }
3756        | Production::Alias { content, .. }
3757        | Production::Token { content }
3758        | Production::ImmediateToken { content }
3759        | Production::Prec { content, .. }
3760        | Production::PrecLeft { content, .. }
3761        | Production::PrecRight { content, .. }
3762        | Production::PrecDynamic { content, .. }
3763        | Production::Reserved { content, .. } => {
3764            collect_field_token_restrictions(content, out);
3765        }
3766        _ => {}
3767    }
3768}
3769
3770/// If `p` unwraps to an ALIAS whose inner content is a CHOICE-of-STRINGs
3771/// (or a single STRING), return that set. Otherwise None.
3772fn literal_choice_set(p: &Production) -> Option<Vec<&str>> {
3773    fn unwrap(p: &Production) -> &Production {
3774        match p {
3775            Production::Prec { content, .. }
3776            | Production::PrecLeft { content, .. }
3777            | Production::PrecRight { content, .. }
3778            | Production::PrecDynamic { content, .. }
3779            | Production::Token { content }
3780            | Production::ImmediateToken { content }
3781            | Production::Reserved { content, .. } => unwrap(content),
3782            _ => p,
3783        }
3784    }
3785    let p = unwrap(p);
3786    let Production::Alias { content, .. } = p else {
3787        return None;
3788    };
3789    let inner = unwrap(content);
3790    match inner {
3791        Production::String { value } => Some(vec![value.as_str()]),
3792        Production::Choice { members } => {
3793            let mut out = Vec::new();
3794            for m in members {
3795                match unwrap(m) {
3796                    Production::String { value } => out.push(value.as_str()),
3797                    _ => return None,
3798                }
3799            }
3800            Some(out)
3801        }
3802        _ => None,
3803    }
3804}
3805
3806/// Categorical acceptance predicate: does `production` accept a cursor
3807/// edge whose field name is `edge_field` (or `child_of`) and whose target
3808/// vertex has kind `target_kind`?
3809///
3810/// Defined inductively over the production tree:
3811///
3812/// - `STRING` / `PATTERN` / `BLANK` / ε-only: reject (consume no edges).
3813/// - `SYMBOL X` (concrete): `edge_field == "child_of"` and `target_kind ⊑ X`.
3814/// - `SYMBOL X` (hidden / supertype): `accepts(X.rule, e)`.
3815/// - `ALIAS{c, named:true, value:V}`: `edge_field == "child_of"` and
3816///   `target_kind == V` (the alias rewrites the child kind to `V`).
3817/// - `FIELD{name, content}`: `edge_field == name` and `content.yield` admits
3818///   `target_kind` (the field content must accept the target as one of its
3819///   first kinds).
3820/// - `SEQ[m1, m2, ...]`: `accepts(m1, e)` or
3821///   (`m1` is ε-able and `accepts(SEQ[m2..], e)`).
3822/// - `CHOICE[a1, a2, ...]`: any of `accepts(ai, e)`.
3823/// - `OPTIONAL` / `REPEAT` / `REPEAT1` / wrappers: `accepts(inner, e)`.
3824fn accepts_first_edge(
3825    grammar: &Grammar,
3826    production: &Production,
3827    edge_field: &str,
3828    target_kind: &str,
3829) -> bool {
3830    fn yield_contains(grammar: &Grammar, prod: &Production, kind: &str) -> bool {
3831        let mut visited = std::collections::HashSet::new();
3832        let mut cache = grammar.yield_sets.clone();
3833        let ys = yield_of_production(grammar, prod, &mut visited, &mut cache);
3834        ys.contains(kind)
3835            || grammar
3836                .subtypes
3837                .get(kind)
3838                .is_some_and(|subs| subs.iter().any(|s| ys.contains(s.as_str())))
3839    }
3840    fn yield_has_epsilon(grammar: &Grammar, prod: &Production) -> bool {
3841        let mut visited = std::collections::HashSet::new();
3842        let mut cache = grammar.yield_sets.clone();
3843        let ys = yield_of_production(grammar, prod, &mut visited, &mut cache);
3844        // SEQ with all-ε-able members, OPTIONAL, REPEAT, BLANK all
3845        // carry the ε marker (empty string) in their yield set.
3846        ys.contains("") || ys.is_empty()
3847    }
3848    match production {
3849        Production::String { .. } | Production::Pattern { .. } | Production::Blank => false,
3850        Production::Symbol { name } => {
3851            if edge_field != "child_of" {
3852                return false;
3853            }
3854            if name == target_kind {
3855                return true;
3856            }
3857            if grammar
3858                .subtypes
3859                .get(target_kind)
3860                .is_some_and(|s| s.contains(name))
3861            {
3862                return true;
3863            }
3864            // Hidden / supertype: walk the rule body.
3865            let is_expand = name.starts_with('_') || grammar.supertypes.contains(name.as_str());
3866            if is_expand {
3867                if let Some(rule) = grammar.rules.get(name) {
3868                    return accepts_first_edge(grammar, rule, edge_field, target_kind);
3869                }
3870            }
3871            false
3872        }
3873        Production::Alias {
3874            named,
3875            value,
3876            content,
3877        } => {
3878            if *named && !value.is_empty() {
3879                edge_field == "child_of" && value == target_kind
3880            } else {
3881                accepts_first_edge(grammar, content, edge_field, target_kind)
3882            }
3883        }
3884        Production::Field { name, content } => {
3885            edge_field == name.as_str() && yield_contains(grammar, content, target_kind)
3886        }
3887        Production::Seq { members } => {
3888            for m in members {
3889                if accepts_first_edge(grammar, m, edge_field, target_kind) {
3890                    return true;
3891                }
3892                if !yield_has_epsilon(grammar, m) {
3893                    return false;
3894                }
3895            }
3896            false
3897        }
3898        Production::Choice { members } => members
3899            .iter()
3900            .any(|m| accepts_first_edge(grammar, m, edge_field, target_kind)),
3901        Production::Optional { content }
3902        | Production::Repeat { content }
3903        | Production::Repeat1 { content }
3904        | Production::Token { content }
3905        | Production::ImmediateToken { content }
3906        | Production::Prec { content, .. }
3907        | Production::PrecLeft { content, .. }
3908        | Production::PrecRight { content, .. }
3909        | Production::PrecDynamic { content, .. }
3910        | Production::Reserved { content, .. } => {
3911            accepts_first_edge(grammar, content, edge_field, target_kind)
3912        }
3913    }
3914}
3915
3916/// Read the walker-recorded `pre-alias-symbol` constraint for a vertex.
3917/// Returns `None` when the vertex has no such constraint (either there
3918/// was no alias rewrite or the schema was built without the walker).
3919fn pre_alias_symbol<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
3920    schema.constraints.get(vertex_id).and_then(|cs| {
3921        cs.iter()
3922            .find(|c| c.sort.as_ref() == "pre-alias-symbol")
3923            .map(|c| c.value.as_str())
3924    })
3925}
3926
3927/// Walk `production` and collect every alias-source-symbol declared
3928/// inside a FIELD with name `field_name`. Specifically, for each
3929/// `FIELD { name = field_name, content = ALIAS { content = SYMBOL X,
3930/// named: true, value: _ } }`, append `X`. Returns an empty Vec when
3931/// the alt's FIELD body is not a named-ALIAS-over-SYMBOL.
3932fn field_alias_sources<'a>(production: &'a Production, field_name: &str, out: &mut Vec<&'a str>) {
3933    fn unwrap_to_alias_source(p: &Production) -> Option<&str> {
3934        let inner = match p {
3935            Production::Prec { content, .. }
3936            | Production::PrecLeft { content, .. }
3937            | Production::PrecRight { content, .. }
3938            | Production::PrecDynamic { content, .. }
3939            | Production::Token { content }
3940            | Production::ImmediateToken { content }
3941            | Production::Reserved { content, .. } => content.as_ref(),
3942            _ => p,
3943        };
3944        match inner {
3945            Production::Alias { content, named, .. } if *named => {
3946                if let Production::Symbol { name } = content.as_ref() {
3947                    return Some(name.as_str());
3948                }
3949                None
3950            }
3951            _ => None,
3952        }
3953    }
3954    match production {
3955        Production::Field { name, content } if name.as_str() == field_name => {
3956            if let Some(src) = unwrap_to_alias_source(content) {
3957                out.push(src);
3958            }
3959        }
3960        Production::Field { content, .. }
3961        | Production::Repeat { content }
3962        | Production::Repeat1 { content }
3963        | Production::Optional { content }
3964        | Production::Alias { content, .. }
3965        | Production::Token { content }
3966        | Production::ImmediateToken { content }
3967        | Production::Prec { content, .. }
3968        | Production::PrecLeft { content, .. }
3969        | Production::PrecRight { content, .. }
3970        | Production::PrecDynamic { content, .. }
3971        | Production::Reserved { content, .. } => {
3972            field_alias_sources(content, field_name, out);
3973        }
3974        Production::Seq { members } | Production::Choice { members } => {
3975            for m in members {
3976                field_alias_sources(m, field_name, out);
3977            }
3978        }
3979        _ => {}
3980    }
3981}
3982
3983/// Categorical alias-source discriminator: when the cursor edge for a
3984/// field-named edge has a recorded `pre-alias-symbol = X`, an alt
3985/// whose FIELD of that name takes its content from `ALIAS { SYMBOL Y }`
3986/// is structurally compatible iff `Y == X` — i.e. the alias's source
3987/// rule matches what the parser actually walked through. When the alt
3988/// has a FIELD with a named-ALIAS-over-SYMBOL whose source disagrees
3989/// with the cursor's recorded pre-alias-symbol, the alt is rejected.
3990fn alt_satisfies_pre_alias_constraints(
3991    schema: &Schema,
3992    cursor: &ChildCursor<'_>,
3993    alt: &Production,
3994) -> bool {
3995    for (i, edge) in cursor.edges.iter().enumerate() {
3996        if cursor.consumed[i] {
3997            continue;
3998        }
3999        let edge_kind = edge.kind.as_ref();
4000        if edge_kind == "child_of" {
4001            continue;
4002        }
4003        let Some(actual_source) = pre_alias_symbol(schema, &edge.tgt) else {
4004            continue;
4005        };
4006        let mut sources: Vec<&str> = Vec::new();
4007        field_alias_sources(alt, edge_kind, &mut sources);
4008        if sources.is_empty() {
4009            // The alt's FIELD content is not a named-ALIAS-over-SYMBOL,
4010            // so this discriminator does not apply (the alt may still
4011            // be correct).
4012            continue;
4013        }
4014        if !sources.contains(&actual_source) {
4015            return false;
4016        }
4017    }
4018    true
4019}
4020
4021/// Returns true iff `alt` is structurally compatible with the cursor under
4022/// the field-token-restriction discipline: for every FIELD in `alt` whose
4023/// content is `ALIAS{CHOICE[STRING...], value: V}`, the cursor's field-named
4024/// edge for that field must carry a literal-value in the restricted set.
4025/// When the alt has no token-restricted FIELDs the check is vacuously true.
4026fn alt_satisfies_field_token_restrictions(
4027    schema: &Schema,
4028    cursor: &ChildCursor<'_>,
4029    alt: &Production,
4030) -> bool {
4031    let mut restrictions: Vec<(&str, Vec<&str>)> = Vec::new();
4032    collect_field_token_restrictions(alt, &mut restrictions);
4033    for (field_name, allowed) in &restrictions {
4034        let mut field_seen = false;
4035        let mut field_admits = false;
4036        for (i, edge) in cursor.edges.iter().enumerate() {
4037            if cursor.consumed[i] {
4038                continue;
4039            }
4040            if edge.kind.as_ref() != *field_name {
4041                continue;
4042            }
4043            field_seen = true;
4044            let lit = literal_value(schema, &edge.tgt);
4045            if let Some(l) = lit {
4046                if allowed.contains(&l) {
4047                    field_admits = true;
4048                    break;
4049                }
4050            }
4051        }
4052        if field_seen && !field_admits {
4053            return false;
4054        }
4055    }
4056    true
4057}
4058
4059fn has_relevant_constraint(
4060    production: &Production,
4061    schema: &Schema,
4062    vertex_id: &panproto_gat::Name,
4063) -> bool {
4064    let constraints = match schema.constraints.get(vertex_id) {
4065        Some(c) => c,
4066        None => return false,
4067    };
4068    fn walk(production: &Production, constraints: &[panproto_schema::Constraint]) -> bool {
4069        match production {
4070            Production::String { value } => constraints
4071                .iter()
4072                .any(|c| c.value == *value || c.sort.as_ref() == value),
4073            Production::Field { name, content } => {
4074                constraints.iter().any(|c| c.sort.as_ref() == name) || walk(content, constraints)
4075            }
4076            Production::Seq { members } | Production::Choice { members } => {
4077                members.iter().any(|m| walk(m, constraints))
4078            }
4079            Production::Repeat { content }
4080            | Production::Repeat1 { content }
4081            | Production::Optional { content }
4082            | Production::Alias { content, .. }
4083            | Production::Token { content }
4084            | Production::ImmediateToken { content }
4085            | Production::Prec { content, .. }
4086            | Production::PrecLeft { content, .. }
4087            | Production::PrecRight { content, .. }
4088            | Production::PrecDynamic { content, .. }
4089            | Production::Reserved { content, .. } => walk(content, constraints),
4090            _ => false,
4091        }
4092    }
4093    walk(production, constraints)
4094}
4095
4096fn children_for<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Vec<&'a Edge> {
4097    // Walk `outgoing` (insertion-ordered by SchemaBuilder via SmallVec
4098    // append) rather than the unordered `edges` HashMap so abstract
4099    // schemas under REPEAT(CHOICE(...)) preserve the order their edges
4100    // were inserted in. The previous implementation walked the HashMap
4101    // and sorted lexicographically by (kind, target id), which fused
4102    // interleaved children of the same kind into runs (e.g. a sequence
4103    // [symbol, punct, int, symbol, punct, int] became [symbol, symbol,
4104    // punct, punct, int, int] after the lex sort).
4105    let Some(edges) = schema.outgoing.get(vertex_id) else {
4106        return Vec::new();
4107    };
4108
4109    // Look up the canonical Edge reference (the key in `schema.edges`)
4110    // for each entry in `outgoing`. Falls back to the SmallVec entry if
4111    // the canonical key is missing, which would indicate index drift.
4112    let mut indexed: Vec<(usize, u32, &Edge)> = edges
4113        .iter()
4114        .enumerate()
4115        .map(|(i, e)| {
4116            let canonical = schema.edges.get_key_value(e).map_or(e, |(k, _)| k);
4117            let pos = schema.orderings.get(canonical).copied().unwrap_or(u32::MAX);
4118            (i, pos, canonical)
4119        })
4120        .collect();
4121
4122    // Stable sort by (explicit-ordering, insertion-index). Edges with
4123    // an explicit `orderings` entry come first in their declared order;
4124    // the remainder fall through in insertion order.
4125    indexed.sort_by_key(|(i, pos, _)| (*pos, *i));
4126    indexed.into_iter().map(|(_, _, e)| e).collect()
4127}
4128
4129fn vertex_id_kind<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
4130    schema.vertices.get(vertex_id).map(|v| v.kind.as_ref())
4131}
4132
4133fn literal_value<'a>(schema: &'a Schema, vertex_id: &panproto_gat::Name) -> Option<&'a str> {
4134    schema
4135        .constraints
4136        .get(vertex_id)?
4137        .iter()
4138        .find(|c| c.sort.as_ref() == "literal-value")
4139        .map(|c| c.value.as_str())
4140}
4141
4142/// True iff `pattern` matches a (possibly optional / repeated) sequence
4143/// of carriage-return and newline characters only. Examples: `\r?\n`,
4144/// `\n`, `\r\n`, `\n+`, `\r?\n+`. Distinguishes structural newline
4145/// terminals from generic whitespace and from other patterns that
4146/// happen to contain a newline escape inside a larger class.
4147fn contains_newline_pattern(prod: &Production) -> bool {
4148    match prod {
4149        Production::Pattern { value } => is_newline_like_pattern(value),
4150        Production::Choice { members } | Production::Seq { members } => {
4151            members.iter().any(contains_newline_pattern)
4152        }
4153        Production::Prec { content, .. }
4154        | Production::PrecLeft { content, .. }
4155        | Production::PrecRight { content, .. }
4156        | Production::PrecDynamic { content, .. }
4157        | Production::Token { content }
4158        | Production::ImmediateToken { content }
4159        | Production::Optional { content }
4160        | Production::Field { content, .. }
4161        | Production::Alias { content, .. }
4162        | Production::Reserved { content, .. } => contains_newline_pattern(content),
4163        _ => false,
4164    }
4165}
4166
4167fn is_newline_like_pattern(pattern: &str) -> bool {
4168    if pattern.is_empty() {
4169        return false;
4170    }
4171    let mut chars = pattern.chars();
4172    let mut saw_newline_atom = false;
4173    while let Some(c) = chars.next() {
4174        match c {
4175            '\\' => match chars.next() {
4176                Some('n' | 'r') => saw_newline_atom = true,
4177                _ => return false,
4178            },
4179            '\n' | '\r' => saw_newline_atom = true,
4180            '?' | '*' | '+' => {} // quantifiers on the previous atom
4181            _ => return false,
4182        }
4183    }
4184    saw_newline_atom
4185}
4186
4187/// True iff `pattern` matches a (possibly quantified) run of generic
4188/// whitespace characters: `\s+`, `[ \t]+`, ` +`, `\s*`. Such patterns
4189/// describe interstitial spacing rather than syntactic content, so the
4190/// pretty emitter can drop them and let the layout pass insert the
4191/// configured separator.
4192fn is_whitespace_only_pattern(pattern: &str) -> bool {
4193    if pattern.is_empty() {
4194        return false;
4195    }
4196    // Strip an outer quantifier suffix.
4197    let trimmed = pattern.trim_end_matches(['?', '*', '+']);
4198    if trimmed.is_empty() {
4199        return false;
4200    }
4201    // Bare `\s` / ` ` / `\t`.
4202    if matches!(trimmed, "\\s" | " " | "\\t") {
4203        return true;
4204    }
4205    // Character class containing only whitespace atoms.
4206    if let Some(inner) = trimmed.strip_prefix('[').and_then(|s| s.strip_suffix(']')) {
4207        let mut chars = inner.chars();
4208        let mut saw_atom = false;
4209        while let Some(c) = chars.next() {
4210            match c {
4211                '\\' => match chars.next() {
4212                    Some('s' | 't' | 'r' | 'n') => saw_atom = true,
4213                    _ => return false,
4214                },
4215                ' ' | '\t' => saw_atom = true,
4216                _ => return false,
4217            }
4218        }
4219        return saw_atom;
4220    }
4221    false
4222}
4223
4224fn placeholder_for_pattern(pattern: &str) -> String {
4225    // Heuristic placeholder for unconstrained PATTERN terminals.
4226    //
4227    // First handle the "the regex IS a literal escape" cases that
4228    // tree-sitter grammars use as separators (`\n`, `\r\n`, `;`,
4229    // etc.); emitting the matching character is always preferable
4230    // to a `_x` identifier-like placeholder when the surrounding
4231    // grammar expects a separator.
4232    let simple_lit = decode_simple_pattern_literal(pattern);
4233    if let Some(lit) = simple_lit {
4234        return lit;
4235    }
4236
4237    if pattern.contains("[0-9]") || pattern.contains("\\d") {
4238        "0".into()
4239    } else if pattern.contains("[a-zA-Z_]") || pattern.contains("\\w") {
4240        "_x".into()
4241    } else if pattern.contains('"') || pattern.contains('\'') {
4242        "\"\"".into()
4243    } else {
4244        "_".into()
4245    }
4246}
4247
4248/// Decode a tree-sitter PATTERN whose regex is a simple literal
4249/// (newline, semicolon, comma, etc.) to the byte sequence it matches.
4250/// Returns `None` for patterns with character classes, alternations,
4251/// or quantifiers; the caller falls back to the heuristic placeholder.
4252fn decode_simple_pattern_literal(pattern: &str) -> Option<String> {
4253    // Skip patterns containing regex metachars that would broaden the
4254    // match beyond a single literal byte sequence.
4255    if pattern
4256        .chars()
4257        .any(|c| matches!(c, '[' | ']' | '(' | ')' | '*' | '+' | '?' | '|' | '{' | '}'))
4258    {
4259        return None;
4260    }
4261    let mut out = String::new();
4262    let mut chars = pattern.chars();
4263    while let Some(c) = chars.next() {
4264        if c == '\\' {
4265            match chars.next() {
4266                Some('n') => out.push('\n'),
4267                Some('r') => out.push('\r'),
4268                Some('t') => out.push('\t'),
4269                Some('\\') => out.push('\\'),
4270                Some('/') => out.push('/'),
4271                Some(other) => out.push(other),
4272                None => return None,
4273            }
4274        } else {
4275            out.push(c);
4276        }
4277    }
4278    Some(out)
4279}
4280
4281// ═══════════════════════════════════════════════════════════════════
4282// Token list output with Spacing algebra
4283// ═══════════════════════════════════════════════════════════════════
4284//
4285// Emit produces a free monoid over `Token`. Layout (spaces, newlines,
4286// indentation) is a homomorphism `Vec<Token> -> Vec<u8>` parameterised
4287// by `FormatPolicy`. Separating the structural output from the layout
4288// decision means each phase has one job: emit walks the grammar and
4289// pushes tokens; layout is a single fold, locally driven by adjacent
4290// pairs and a depth counter. Snapshot/restore is just `tokens.len()`.
4291
4292#[derive(Clone)]
4293enum Token {
4294    /// A user-visible terminal contributed by the grammar, annotated
4295    /// with its structural role for spacing decisions.
4296    Lit(String, TokenRole),
4297    /// `indent_open` marker emitted when a `Lit` matched the policy's
4298    /// open list. Carried as a separate token so layout can decide to
4299    /// break + indent without re-scanning.
4300    IndentOpen,
4301    /// `indent_close` marker emitted before a closer-`Lit`.
4302    IndentClose,
4303    /// "Break a line here if not already at line start" — used after
4304    /// statements/declarations and after open braces.
4305    LineBreak,
4306    /// Force a space before the next Lit even if the role-pair table
4307    /// says tight. Pushed between consecutive content-producing SEQ
4308    /// members (e.g. between `command_name` and `argument`) to ensure
4309    /// sibling-vertex tokens are separated.
4310    ForceSpace,
4311    /// Suppress the next inter-Lit separator. Pushed by the REPEAT
4312    /// walker when an iteration's "separator slot" (a CHOICE-with-BLANK
4313    /// or OPTIONAL at SEQ position 0) emitted zero content tokens, so
4314    /// the categorical reading is "no source-level separator existed
4315    /// between these two sibling iterations of the body".
4316    NoSpace,
4317}
4318
4319struct Output<'a> {
4320    tokens: Vec<Token>,
4321    policy: &'a FormatPolicy,
4322    grammar: &'a Grammar,
4323    current_rule: Option<String>,
4324    cassette: Option<&'a dyn crate::languages::cassettes::GrammarCassette>,
4325}
4326
4327#[derive(Clone)]
4328struct OutputSnapshot {
4329    tokens_len: usize,
4330}
4331
4332impl<'a> Output<'a> {
4333    fn new(
4334        policy: &'a FormatPolicy,
4335        grammar: &'a Grammar,
4336        cassette: Option<&'a dyn crate::languages::cassettes::GrammarCassette>,
4337    ) -> Self {
4338        Self {
4339            tokens: Vec::new(),
4340            policy,
4341            grammar,
4342            current_rule: None,
4343            cassette,
4344        }
4345    }
4346
4347    fn token(&mut self, value: &str) {
4348        self.token_with_role(value, None);
4349    }
4350
4351    fn token_with_role(&mut self, value: &str, explicit_role: Option<TokenRole>) {
4352        if value.is_empty() {
4353            return;
4354        }
4355
4356        if value == "\n" || value == "\r\n" || value == "\r" {
4357            self.tokens.push(Token::LineBreak);
4358            return;
4359        }
4360
4361        let trimmed = value.trim_end_matches(['\n', '\r']);
4362        let trailing_newlines = value.len() - trimmed.len();
4363        if trailing_newlines > 0 && !trimmed.is_empty() {
4364            let role = explicit_role.unwrap_or(TokenRole::Terminal);
4365            if role == TokenRole::BracketClose
4366                && self.policy.indent_close.iter().any(|t| t == trimmed)
4367            {
4368                self.tokens.push(Token::IndentClose);
4369            }
4370            self.tokens.push(Token::Lit(trimmed.to_owned(), role));
4371            if role == TokenRole::BracketOpen {
4372                if let Some(ref rule) = self.current_rule {
4373                    if self
4374                        .grammar
4375                        .indent_triggers
4376                        .contains(&(rule.clone(), trimmed.to_owned()))
4377                    {
4378                        self.tokens.push(Token::IndentOpen);
4379                    }
4380                }
4381            }
4382            self.tokens.push(Token::LineBreak);
4383            return;
4384        }
4385
4386        let role = explicit_role.unwrap_or_else(|| self.lookup_role(value));
4387
4388        if role == TokenRole::BracketClose && self.policy.indent_close.iter().any(|t| t == value) {
4389            self.tokens.push(Token::IndentClose);
4390        }
4391
4392        self.tokens.push(Token::Lit(value.to_owned(), role));
4393
4394        if role == TokenRole::BracketOpen {
4395            let grammar_indent = self.current_rule.as_ref().is_some_and(|rule| {
4396                self.grammar
4397                    .indent_triggers
4398                    .contains(&(rule.clone(), value.to_owned()))
4399            });
4400            if grammar_indent {
4401                self.tokens.push(Token::IndentOpen);
4402                self.tokens.push(Token::LineBreak);
4403            }
4404        }
4405        // Line-break after tokens like `;` (statement terminator).
4406        // Skip for BracketOpen/BracketClose tokens that are NOT
4407        // indent-triggering (e.g. `{` in interpolation should not
4408        // trigger a line break).
4409        let is_non_indent_bracket = self.current_rule.is_some()
4410            && (role == TokenRole::BracketOpen || role == TokenRole::BracketClose)
4411            && !self.current_rule.as_ref().is_some_and(|rule| {
4412                self.grammar
4413                    .indent_triggers
4414                    .contains(&(rule.clone(), value.to_owned()))
4415            });
4416        if !is_non_indent_bracket && self.policy.line_break_after.iter().any(|t| t == value) {
4417            self.tokens.push(Token::LineBreak);
4418        }
4419    }
4420
4421    fn lookup_role(&self, value: &str) -> TokenRole {
4422        if let Some(ref rule) = self.current_rule {
4423            if let Some(role_map) = self.grammar.token_roles.get(rule) {
4424                if let Some(role) = role_map.get(value) {
4425                    return *role;
4426                }
4427            }
4428        }
4429        if is_word_like(value) {
4430            TokenRole::Keyword
4431        } else {
4432            TokenRole::Operator
4433        }
4434    }
4435
4436    /// Emit a bracket-open token that triggers indentation. This is the
4437    /// inline-classification counterpart to the `indent_triggers` check
4438    /// in `token_with_role`: the SEQ walker computes indent-triggering
4439    /// from the SEQ structure directly rather than from a precomputed map.
4440    fn token_with_indent_open(&mut self, value: &str, role: TokenRole) {
4441        if value.is_empty() {
4442            return;
4443        }
4444        if role == TokenRole::BracketClose && self.policy.indent_close.iter().any(|t| t == value) {
4445            self.tokens.push(Token::IndentClose);
4446        }
4447        self.tokens.push(Token::Lit(value.to_owned(), role));
4448        if role == TokenRole::BracketOpen {
4449            self.tokens.push(Token::IndentOpen);
4450            self.tokens.push(Token::LineBreak);
4451        }
4452    }
4453
4454    fn newline(&mut self) {
4455        self.tokens.push(Token::LineBreak);
4456    }
4457
4458    /// Open an indent scope: subsequent `LineBreak`s render at the
4459    /// new depth until a matching `indent_close` pops it. Used by the
4460    /// external-token fallback to render indent-based grammars'
4461    /// `_indent` scanner outputs.
4462    fn indent_open(&mut self) {
4463        self.tokens.push(Token::IndentOpen);
4464        self.tokens.push(Token::LineBreak);
4465    }
4466
4467    /// Close one indent scope opened by `indent_open`.
4468    fn indent_close(&mut self) {
4469        self.tokens.push(Token::IndentClose);
4470    }
4471
4472    fn snapshot(&self) -> OutputSnapshot {
4473        OutputSnapshot {
4474            tokens_len: self.tokens.len(),
4475        }
4476    }
4477
4478    fn restore(&mut self, snap: OutputSnapshot) {
4479        self.tokens.truncate(snap.tokens_len);
4480    }
4481
4482    /// True iff at least one `Token::Lit` was pushed since `snap`.
4483    /// Control-only emissions (`LineBreak`, `IndentOpen` / `IndentClose`,
4484    /// `NoSpace`) do not count as content. Used by the REPEAT walker
4485    /// to detect that a "separator slot" CHOICE picked its BLANK
4486    /// alternative, so the next iteration's content can be marked
4487    /// tight against the previous iteration's content.
4488    fn lit_emitted_since(&self, snap: OutputSnapshot) -> bool {
4489        self.tokens[snap.tokens_len..]
4490            .iter()
4491            .any(|t| matches!(t, Token::Lit(_, _)))
4492    }
4493
4494    /// Push a marker that suppresses the next inter-Lit separator the
4495    /// layout pass would otherwise insert. Used to encode "no source-
4496    /// level separator was emitted between these two Lits" without
4497    /// having to make per-grammar adjacency decisions in the layout.
4498    fn no_space(&mut self) {
4499        self.tokens.push(Token::NoSpace);
4500    }
4501
4502    fn finish(self) -> Vec<u8> {
4503        layout(
4504            &self.tokens,
4505            self.policy,
4506            &self.grammar.line_comment_prefixes,
4507        )
4508    }
4509}
4510
4511/// Fold a token list into bytes. The algebra:
4512/// * adjacent `Lit`s get a single space iff `needs_space_between(a, b)`,
4513/// * `IndentOpen` / `IndentClose` adjust a depth counter,
4514/// * `LineBreak` writes `\n` if not already at line start, then the
4515///   next `Lit` writes `indent * indent_width` spaces of indent.
4516fn layout(tokens: &[Token], policy: &FormatPolicy, line_comment_prefixes: &[String]) -> Vec<u8> {
4517    let mut bytes = Vec::new();
4518    let mut indent: usize = 0;
4519    let mut at_line_start = true;
4520    let mut last_role: Option<TokenRole> = None;
4521    let mut last_text: String = String::new();
4522    let mut suppress_next_separator = false;
4523    let mut force_next_separator = false;
4524    let newline = policy.newline.as_bytes();
4525    let separator = policy.separator.as_bytes();
4526
4527    for (tok_idx, tok) in tokens.iter().enumerate() {
4528        if std::env::var("DBG_LAYOUT").is_ok() {
4529            match tok {
4530                Token::Lit(v, r) => eprintln!(
4531                    "  TOK: Lit({v:?}, {r:?}) at_line_start={at_line_start} last_role={last_role:?}"
4532                ),
4533                Token::IndentOpen => eprintln!("  TOK: IndentOpen"),
4534                Token::IndentClose => eprintln!("  TOK: IndentClose"),
4535                Token::LineBreak => eprintln!("  TOK: LineBreak"),
4536                Token::NoSpace => eprintln!("  TOK: NoSpace"),
4537                Token::ForceSpace => eprintln!("  TOK: ForceSpace"),
4538            }
4539        }
4540        match tok {
4541            Token::IndentOpen => indent += 1,
4542            Token::IndentClose => {
4543                indent = indent.saturating_sub(1);
4544                if !at_line_start {
4545                    bytes.extend_from_slice(newline);
4546                    at_line_start = true;
4547                }
4548            }
4549            Token::LineBreak => {
4550                if !at_line_start {
4551                    bytes.extend_from_slice(newline);
4552                    at_line_start = true;
4553                }
4554            }
4555            Token::NoSpace => {
4556                suppress_next_separator = true;
4557            }
4558            Token::ForceSpace => {
4559                force_next_separator = true;
4560            }
4561            Token::Lit(value, role) => {
4562                // Block-opening bracket: BracketOpen followed by IndentOpen.
4563                // After a Terminal/BracketClose, this should be spaced
4564                // (`}\n` not `0{`).
4565                let is_block_open = *role == TokenRole::BracketOpen
4566                    && tokens
4567                        .get(tok_idx + 1)
4568                        .is_some_and(|t| matches!(t, Token::IndentOpen));
4569                if at_line_start {
4570                    bytes.extend(std::iter::repeat_n(b' ', indent * policy.indent_width));
4571                } else if let Some(prev_role) = last_role {
4572                    let want_space = force_next_separator
4573                        || (!suppress_next_separator
4574                            && needs_space_by_role(prev_role, &last_text, *role, value))
4575                        || (is_block_open
4576                            && !suppress_next_separator
4577                            && matches!(prev_role, TokenRole::Terminal | TokenRole::BracketClose));
4578                    if want_space {
4579                        bytes.extend_from_slice(separator);
4580                    }
4581                }
4582                suppress_next_separator = false;
4583                force_next_separator = false;
4584                bytes.extend_from_slice(value.as_bytes());
4585                at_line_start = false;
4586                last_role = Some(*role);
4587                last_text.clear();
4588                last_text.push_str(value);
4589                if line_comment_prefixes
4590                    .iter()
4591                    .any(|p| value.starts_with(p.as_str()))
4592                {
4593                    bytes.extend_from_slice(newline);
4594                    at_line_start = true;
4595                    last_role = None;
4596                }
4597            }
4598        }
4599    }
4600
4601    if !at_line_start {
4602        bytes.extend_from_slice(newline);
4603    }
4604    bytes
4605}
4606
4607/// Effective spacing role: word-like bracket tokens (`function`, `end`,
4608/// `begin`, `done`, etc.) are structurally brackets (for indentation)
4609/// but space like keywords (they need whitespace on both sides).
4610fn effective_spacing_role(role: TokenRole, text: &str) -> TokenRole {
4611    match role {
4612        TokenRole::BracketOpen | TokenRole::BracketClose if is_word_like(text) => {
4613            TokenRole::Keyword
4614        }
4615        other => other,
4616    }
4617}
4618
4619/// Role-pair spacing table. Determines whether a space separator
4620/// should be inserted between two adjacent tokens based on their
4621/// structural roles and word-likeness.
4622fn needs_space_by_role(last: TokenRole, last_text: &str, next: TokenRole, next_text: &str) -> bool {
4623    let last = effective_spacing_role(last, last_text);
4624    let next = effective_spacing_role(next, next_text);
4625    match (last, next) {
4626        // Brackets: tight on the inside
4627        (TokenRole::BracketOpen, _) | (_, TokenRole::BracketClose) => false,
4628        // Separators: tight before, space after
4629        (_, TokenRole::Separator) => false,
4630        (TokenRole::Separator, _) => true,
4631        // Connectors: always tight (., ::, ->, etc.)
4632        (TokenRole::Connector, _) | (_, TokenRole::Connector) => false,
4633        // Terminal followed by bracket-open: tight (f() not f ())
4634        (TokenRole::Terminal, TokenRole::BracketOpen) => false,
4635        // Close followed by open: tight
4636        (TokenRole::BracketClose, TokenRole::BracketOpen) => false,
4637        // Keywords always spaced
4638        (TokenRole::Keyword, _) | (_, TokenRole::Keyword) => true,
4639        // Terminals and operators: space between
4640        (TokenRole::Terminal, TokenRole::Terminal) => true,
4641        (TokenRole::Terminal, TokenRole::Operator) | (TokenRole::Operator, TokenRole::Terminal) => {
4642            true
4643        }
4644        (TokenRole::Operator, TokenRole::Operator) => true,
4645        // Close followed by non-bracket: space
4646        (TokenRole::BracketClose, _) => true,
4647        // Operator before open: space
4648        (TokenRole::Operator, TokenRole::BracketOpen) => true,
4649    }
4650}
4651
4652#[cfg(test)]
4653#[allow(clippy::unwrap_used)]
4654mod tests {
4655    use super::*;
4656
4657    fn test_grammar() -> Grammar {
4658        Grammar::from_bytes("test", b"{\"name\":\"test\",\"rules\":{}}").unwrap_or_else(|_| {
4659            serde_json::from_str::<Grammar>(r#"{"name":"test","rules":{}}"#).unwrap()
4660        })
4661    }
4662
4663    #[test]
4664    fn parses_simple_grammar_json() {
4665        let bytes = br#"{
4666            "name": "tiny",
4667            "rules": {
4668                "program": {
4669                    "type": "SEQ",
4670                    "members": [
4671                        {"type": "STRING", "value": "hello"},
4672                        {"type": "STRING", "value": ";"}
4673                    ]
4674                }
4675            }
4676        }"#;
4677        let g = Grammar::from_bytes("tiny", bytes).expect("valid tiny grammar");
4678        assert!(g.rules.contains_key("program"));
4679    }
4680
4681    #[test]
4682    fn output_emits_punctuation_without_leading_space() {
4683        let policy = FormatPolicy::default();
4684        let g = test_grammar();
4685        let mut out = Output::new(&policy, &g, None);
4686        out.token_with_role("foo", Some(TokenRole::Terminal));
4687        out.token_with_role("(", Some(TokenRole::BracketOpen));
4688        out.token_with_role(")", Some(TokenRole::BracketClose));
4689        out.token_with_role(";", Some(TokenRole::Separator));
4690        let bytes = out.finish();
4691        let s = std::str::from_utf8(&bytes).expect("ascii output");
4692        assert!(s.starts_with("foo();"), "got {s:?}");
4693    }
4694
4695    #[test]
4696    fn grammar_from_bytes_rejects_malformed_input() {
4697        let result = Grammar::from_bytes("malformed", b"not json");
4698        let err = result.expect_err("malformed bytes must yield Err");
4699        let msg = err.to_string();
4700        assert!(
4701            msg.contains("malformed"),
4702            "error message should name the protocol: {msg:?}"
4703        );
4704    }
4705
4706    #[test]
4707    fn output_indents_after_open_brace() {
4708        let policy = FormatPolicy::default();
4709        let g = test_grammar();
4710        let mut out = Output::new(&policy, &g, None);
4711        out.token_with_role("fn", Some(TokenRole::Keyword));
4712        out.token_with_role("foo", Some(TokenRole::Terminal));
4713        out.token_with_role("(", Some(TokenRole::BracketOpen));
4714        out.token_with_role(")", Some(TokenRole::BracketClose));
4715        out.token_with_role("{", Some(TokenRole::BracketOpen));
4716        out.token_with_role("body", Some(TokenRole::Terminal));
4717        out.token_with_role("}", Some(TokenRole::BracketClose));
4718        let bytes = out.finish();
4719        let s = std::str::from_utf8(&bytes).expect("ascii output");
4720        assert!(s.contains("{\n"), "newline after opening brace: {s:?}");
4721        assert!(s.contains("body"), "body inside block: {s:?}");
4722        assert!(s.ends_with("}\n"), "newline after closing brace: {s:?}");
4723    }
4724
4725    #[test]
4726    fn output_no_space_between_word_and_dot() {
4727        let policy = FormatPolicy::default();
4728        let g = test_grammar();
4729        let mut out = Output::new(&policy, &g, None);
4730        out.token_with_role("foo", Some(TokenRole::Terminal));
4731        out.token_with_role(".", Some(TokenRole::Operator));
4732        out.token_with_role("bar", Some(TokenRole::Terminal));
4733        let bytes = out.finish();
4734        let s = std::str::from_utf8(&bytes).expect("ascii output");
4735        // With role-based spacing, operator gets spaces: "foo . bar"
4736        // The dot tight-binding is a grammar-derived property (dot appears
4737        // between SYMBOL members in attribute/field access rules).
4738        // For unit tests with explicit roles, we accept spaced dot.
4739        assert!(
4740            s.contains("foo") && s.contains("bar"),
4741            "both identifiers present: {s:?}"
4742        );
4743    }
4744
4745    #[test]
4746    fn output_snapshot_restore_truncates_bytes() {
4747        let policy = FormatPolicy::default();
4748        let g = test_grammar();
4749        let mut out = Output::new(&policy, &g, None);
4750        out.token("keep");
4751        let snap = out.snapshot();
4752        out.token("drop");
4753        out.token("more");
4754        out.restore(snap);
4755        out.token("after");
4756        let bytes = out.finish();
4757        let s = std::str::from_utf8(&bytes).expect("ascii output");
4758        assert!(s.contains("keep"), "kept token survives: {s:?}");
4759        assert!(s.contains("after"), "post-restore token visible: {s:?}");
4760        assert!(!s.contains("drop"), "rolled-back token removed: {s:?}");
4761        assert!(!s.contains("more"), "rolled-back token removed: {s:?}");
4762    }
4763
4764    #[test]
4765    fn child_cursor_take_field_consumes_once() {
4766        let edges_owned: Vec<Edge> = vec![Edge {
4767            src: panproto_gat::Name::from("p"),
4768            tgt: panproto_gat::Name::from("c"),
4769            kind: panproto_gat::Name::from("name"),
4770            name: None,
4771        }];
4772        let edges: Vec<&Edge> = edges_owned.iter().collect();
4773        let mut cursor = ChildCursor::new(&edges);
4774        let first = cursor.take_field("name");
4775        let second = cursor.take_field("name");
4776        assert!(first.is_some(), "first take returns the edge");
4777        assert!(
4778            second.is_none(),
4779            "second take returns None (already consumed)"
4780        );
4781    }
4782
4783    #[test]
4784    fn child_cursor_take_matching_predicate() {
4785        let edges_owned: Vec<Edge> = vec![
4786            Edge {
4787                src: "p".into(),
4788                tgt: "c1".into(),
4789                kind: "child_of".into(),
4790                name: None,
4791            },
4792            Edge {
4793                src: "p".into(),
4794                tgt: "c2".into(),
4795                kind: "key".into(),
4796                name: None,
4797            },
4798        ];
4799        let edges: Vec<&Edge> = edges_owned.iter().collect();
4800        let mut cursor = ChildCursor::new(&edges);
4801        assert!(cursor.has_matching(|e| e.kind.as_ref() == "key"));
4802        let taken = cursor.take_matching(|e| e.kind.as_ref() == "key");
4803        assert!(taken.is_some());
4804        assert!(
4805            !cursor.has_matching(|e| e.kind.as_ref() == "key"),
4806            "consumed edge no longer matches"
4807        );
4808        assert!(
4809            cursor.has_matching(|e| e.kind.as_ref() == "child_of"),
4810            "the other edge is still available"
4811        );
4812    }
4813
4814    #[test]
4815    fn kind_satisfies_symbol_direct_match() {
4816        let bytes = br#"{
4817            "name": "tiny",
4818            "rules": {
4819                "x": {"type": "STRING", "value": "x"}
4820            }
4821        }"#;
4822        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
4823        assert!(kind_satisfies_symbol(&g, Some("x"), "x"));
4824        assert!(!kind_satisfies_symbol(&g, Some("y"), "x"));
4825        assert!(!kind_satisfies_symbol(&g, None, "x"));
4826    }
4827
4828    #[test]
4829    fn kind_satisfies_symbol_through_hidden_rule() {
4830        let bytes = br#"{
4831            "name": "tiny",
4832            "rules": {
4833                "_value": {
4834                    "type": "CHOICE",
4835                    "members": [
4836                        {"type": "SYMBOL", "name": "object"},
4837                        {"type": "SYMBOL", "name": "number"}
4838                    ]
4839                },
4840                "object": {"type": "STRING", "value": "{}"},
4841                "number": {"type": "PATTERN", "value": "[0-9]+"}
4842            }
4843        }"#;
4844        let g = Grammar::from_bytes("tiny", bytes).expect("valid grammar");
4845        assert!(
4846            kind_satisfies_symbol(&g, Some("number"), "_value"),
4847            "number is reachable from _value via CHOICE"
4848        );
4849        assert!(
4850            kind_satisfies_symbol(&g, Some("object"), "_value"),
4851            "object is reachable from _value via CHOICE"
4852        );
4853        assert!(
4854            !kind_satisfies_symbol(&g, Some("string"), "_value"),
4855            "string is NOT among the alternatives"
4856        );
4857    }
4858
4859    #[test]
4860    fn first_symbol_skips_string_terminals() {
4861        let prod: Production = serde_json::from_str(
4862            r#"{
4863                "type": "SEQ",
4864                "members": [
4865                    {"type": "STRING", "value": "{"},
4866                    {"type": "SYMBOL", "name": "body"},
4867                    {"type": "STRING", "value": "}"}
4868                ]
4869            }"#,
4870        )
4871        .expect("valid SEQ");
4872        assert_eq!(first_symbol(&prod), Some("body"));
4873    }
4874
4875    #[test]
4876    fn placeholder_for_pattern_routes_by_regex_class() {
4877        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
4878        assert_eq!(placeholder_for_pattern("[a-zA-Z_]\\w*"), "_x");
4879        assert_eq!(placeholder_for_pattern("\"[^\"]*\""), "\"\"");
4880        assert_eq!(placeholder_for_pattern("\\d+\\.\\d+"), "0");
4881    }
4882
4883    #[test]
4884    fn format_policy_default_breaks_after_semicolon() {
4885        let policy = FormatPolicy::default();
4886        assert!(policy.line_break_after.iter().any(|t| t == ";"));
4887        assert!(policy.indent_open.iter().any(|t| t == "{"));
4888        assert!(policy.indent_close.iter().any(|t| t == "}"));
4889        assert_eq!(policy.indent_width, 2);
4890    }
4891
4892    #[test]
4893    fn placeholder_decodes_literal_pattern_separators() {
4894        // PATTERN regexes that match a single literal byte sequence
4895        // (newline, semicolon, comma) emit the bytes verbatim instead
4896        // of falling through to the `_` catch-all.
4897        assert_eq!(placeholder_for_pattern("\\n"), "\n");
4898        assert_eq!(placeholder_for_pattern("\\r\\n"), "\r\n");
4899        assert_eq!(placeholder_for_pattern(";"), ";");
4900        // Patterns with character classes / alternation still route
4901        // through the heuristic.
4902        assert_eq!(placeholder_for_pattern("[0-9]+"), "0");
4903        assert_eq!(placeholder_for_pattern("a|b"), "_");
4904    }
4905
4906    #[test]
4907    fn supertypes_decode_from_grammar_json_strings() {
4908        // Tree-sitter older grammars list supertypes as bare strings.
4909        let bytes = br#"{
4910            "name": "tiny",
4911            "supertypes": ["expression"],
4912            "rules": {
4913                "expression": {
4914                    "type": "CHOICE",
4915                    "members": [
4916                        {"type": "SYMBOL", "name": "binary_expression"},
4917                        {"type": "SYMBOL", "name": "identifier"}
4918                    ]
4919                },
4920                "binary_expression": {"type": "STRING", "value": "x"},
4921                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
4922            }
4923        }"#;
4924        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
4925        assert!(g.supertypes.contains("expression"));
4926        // identifier matches the supertype `expression`.
4927        assert!(kind_satisfies_symbol(&g, Some("identifier"), "expression"));
4928        // unrelated kinds do not.
4929        assert!(!kind_satisfies_symbol(&g, Some("string"), "expression"));
4930    }
4931
4932    #[test]
4933    fn supertypes_decode_from_grammar_json_objects() {
4934        // Recent grammars list supertypes as `{type: SYMBOL, name: ...}`
4935        // entries instead of bare strings.
4936        let bytes = br#"{
4937            "name": "tiny",
4938            "supertypes": [{"type": "SYMBOL", "name": "stmt"}],
4939            "rules": {
4940                "stmt": {
4941                    "type": "CHOICE",
4942                    "members": [
4943                        {"type": "SYMBOL", "name": "while_stmt"},
4944                        {"type": "SYMBOL", "name": "if_stmt"}
4945                    ]
4946                },
4947                "while_stmt": {"type": "STRING", "value": "while"},
4948                "if_stmt": {"type": "STRING", "value": "if"}
4949            }
4950        }"#;
4951        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
4952        assert!(g.supertypes.contains("stmt"));
4953        assert!(kind_satisfies_symbol(&g, Some("while_stmt"), "stmt"));
4954    }
4955
4956    #[test]
4957    fn alias_value_matches_kind() {
4958        // A named ALIAS rewrites the parser-visible kind to `value`;
4959        // `kind_satisfies_symbol` should accept that rewritten kind
4960        // when looking up the original SYMBOL.
4961        let bytes = br#"{
4962            "name": "tiny",
4963            "rules": {
4964                "_package_identifier": {
4965                    "type": "ALIAS",
4966                    "named": true,
4967                    "value": "package_identifier",
4968                    "content": {"type": "SYMBOL", "name": "identifier"}
4969                },
4970                "identifier": {"type": "PATTERN", "value": "[a-z]+"}
4971            }
4972        }"#;
4973        let g = Grammar::from_bytes("tiny", bytes).expect("parse");
4974        assert!(kind_satisfies_symbol(
4975            &g,
4976            Some("package_identifier"),
4977            "_package_identifier"
4978        ));
4979    }
4980
4981    #[test]
4982    fn referenced_symbols_walks_nested_seq() {
4983        let prod: Production = serde_json::from_str(
4984            r#"{
4985                "type": "SEQ",
4986                "members": [
4987                    {"type": "CHOICE", "members": [
4988                        {"type": "SYMBOL", "name": "attribute_item"},
4989                        {"type": "BLANK"}
4990                    ]},
4991                    {"type": "SYMBOL", "name": "parameter"},
4992                    {"type": "REPEAT", "content": {
4993                        "type": "SEQ",
4994                        "members": [
4995                            {"type": "STRING", "value": ","},
4996                            {"type": "SYMBOL", "name": "parameter"}
4997                        ]
4998                    }}
4999                ]
5000            }"#,
5001        )
5002        .expect("seq");
5003        let symbols = referenced_symbols(&prod);
5004        assert!(symbols.contains(&"attribute_item"));
5005        assert!(symbols.contains(&"parameter"));
5006    }
5007
5008    #[test]
5009    fn literal_strings_collects_choice_members() {
5010        let prod: Production = serde_json::from_str(
5011            r#"{
5012                "type": "CHOICE",
5013                "members": [
5014                    {"type": "STRING", "value": "+"},
5015                    {"type": "STRING", "value": "-"},
5016                    {"type": "STRING", "value": "*"}
5017                ]
5018            }"#,
5019        )
5020        .expect("choice");
5021        let strings = literal_strings(&prod);
5022        assert_eq!(strings, vec!["+", "-", "*"]);
5023    }
5024
5025    /// The ocaml and javascript grammars (tree-sitter ≥ 0.25) emit a
5026    /// `RESERVED` rule kind that an earlier deserialiser rejected
5027    /// with `unknown variant "RESERVED"`. Verify both that the bare
5028    /// variant deserialises and that a `RESERVED`-wrapped grammar is
5029    /// loadable end-to-end via [`Grammar::from_bytes`].
5030    #[test]
5031    fn reserved_variant_deserialises() {
5032        let prod: Production = serde_json::from_str(
5033            r#"{
5034                "type": "RESERVED",
5035                "content": {"type": "SYMBOL", "name": "_lowercase_identifier"},
5036                "context_name": "attribute_id"
5037            }"#,
5038        )
5039        .expect("RESERVED parses");
5040        match prod {
5041            Production::Reserved { content, .. } => match *content {
5042                Production::Symbol { name } => assert_eq!(name, "_lowercase_identifier"),
5043                other => panic!("expected inner SYMBOL, got {other:?}"),
5044            },
5045            other => panic!("expected RESERVED, got {other:?}"),
5046        }
5047    }
5048
5049    #[test]
5050    fn reserved_grammar_loads_end_to_end() {
5051        let bytes = br#"{
5052            "name": "tiny_reserved",
5053            "rules": {
5054                "program": {
5055                    "type": "RESERVED",
5056                    "content": {"type": "SYMBOL", "name": "ident"},
5057                    "context_name": "keywords"
5058                },
5059                "ident": {"type": "PATTERN", "value": "[a-z]+"}
5060            }
5061        }"#;
5062        let g = Grammar::from_bytes("tiny_reserved", bytes).expect("RESERVED-using grammar loads");
5063        assert!(g.rules.contains_key("program"));
5064    }
5065
5066    #[test]
5067    fn reserved_walker_helpers_recurse_into_content() {
5068        // The walker's helpers (first_symbol, has_field_in,
5069        // referenced_symbols, ...) all need to descend through
5070        // RESERVED into its content. If they bail at RESERVED, the
5071        // `pick_choice_with_cursor` heuristic ranks the alt below
5072        // alts that DO recurse, which produces wrong emit output
5073        // even when the deserialiser doesn't crash.
5074        let prod: Production = serde_json::from_str(
5075            r#"{
5076                "type": "RESERVED",
5077                "content": {
5078                    "type": "FIELD",
5079                    "name": "lhs",
5080                    "content": {"type": "SYMBOL", "name": "expr"}
5081                },
5082                "context_name": "ctx"
5083            }"#,
5084        )
5085        .expect("nested RESERVED parses");
5086        assert_eq!(first_symbol(&prod), Some("expr"));
5087        assert!(has_field_in(&prod, &["lhs"]));
5088        let symbols = referenced_symbols(&prod);
5089        assert!(symbols.contains(&"expr"));
5090    }
5091
5092    // -- Yield-set tests --
5093
5094    fn yield_of(grammar: &Grammar, prod: &Production) -> std::collections::HashSet<String> {
5095        let mut visited = std::collections::HashSet::new();
5096        let mut cache = grammar.yield_sets.clone();
5097        yield_of_production(grammar, prod, &mut visited, &mut cache)
5098    }
5099
5100    #[test]
5101    fn yield_set_seq_only_first_member() {
5102        let prod: Production = serde_json::from_str(
5103            r#"{
5104                "type": "SEQ",
5105                "members": [
5106                    {"type": "SYMBOL", "name": "identifier"},
5107                    {"type": "STRING", "value": "as"},
5108                    {"type": "SYMBOL", "name": "target"}
5109                ]
5110            }"#,
5111        )
5112        .expect("valid SEQ");
5113        let g = Grammar::from_bytes("test", b"{}").unwrap_or_else(|_| {
5114            serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap()
5115        });
5116        let ys = yield_of(&g, &prod);
5117        assert!(ys.contains("identifier"), "SEQ yields first member");
5118        assert!(
5119            !ys.contains("target"),
5120            "SEQ must NOT yield non-first members"
5121        );
5122    }
5123
5124    #[test]
5125    fn yield_set_choice_union() {
5126        let prod: Production = serde_json::from_str(
5127            r#"{
5128                "type": "CHOICE",
5129                "members": [
5130                    {"type": "SYMBOL", "name": "a"},
5131                    {"type": "SYMBOL", "name": "b"}
5132                ]
5133            }"#,
5134        )
5135        .expect("valid CHOICE");
5136        let g = serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap();
5137        let ys = yield_of(&g, &prod);
5138        assert_eq!(ys.len(), 2);
5139        assert!(ys.contains("a"));
5140        assert!(ys.contains("b"));
5141    }
5142
5143    #[test]
5144    fn yield_set_hidden_expansion() {
5145        let g = serde_json::from_str::<Grammar>(
5146            r#"{"name":"t","rules":{
5147                "_value": {
5148                    "type": "CHOICE",
5149                    "members": [
5150                        {"type": "SYMBOL", "name": "number"},
5151                        {"type": "SYMBOL", "name": "object"}
5152                    ]
5153                }
5154            }}"#,
5155        )
5156        .unwrap();
5157        let mut g = g;
5158        g.subtypes = compute_subtype_closure(&g);
5159        g.yield_sets = compute_yield_sets(&g);
5160        let sym: Production =
5161            serde_json::from_str(r#"{"type": "SYMBOL", "name": "_value"}"#).unwrap();
5162        let ys = yield_of(&g, &sym);
5163        assert!(
5164            ys.contains("number"),
5165            "hidden rule expands into its CHOICE members"
5166        );
5167        assert!(ys.contains("object"));
5168        assert!(
5169            !ys.contains("_value"),
5170            "hidden rule name is not in yield set"
5171        );
5172    }
5173
5174    #[test]
5175    fn yield_set_optional_includes_epsilon() {
5176        let prod: Production = serde_json::from_str(
5177            r#"{"type": "OPTIONAL", "content": {"type": "SYMBOL", "name": "x"}}"#,
5178        )
5179        .unwrap();
5180        let g = serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap();
5181        let ys = yield_of(&g, &prod);
5182        assert!(ys.contains("x"));
5183        assert!(ys.contains(""), "OPTIONAL includes epsilon");
5184    }
5185
5186    #[test]
5187    fn yield_set_alias_uses_value() {
5188        let prod: Production = serde_json::from_str(
5189            r#"{"type": "ALIAS", "content": {"type": "SYMBOL", "name": "real"},
5190                "named": true, "value": "alias_name"}"#,
5191        )
5192        .unwrap();
5193        let g = serde_json::from_str::<Grammar>(r#"{"name":"t","rules":{}}"#).unwrap();
5194        let ys = yield_of(&g, &prod);
5195        assert_eq!(ys.len(), 1);
5196        assert!(ys.contains("alias_name"), "named ALIAS yields its value");
5197    }
5198}